/*
 * UTF-8 support routines
 *
 * Copyright 2000 Alexandre Julliard
 */
6 |
|
---|
7 | #include <string.h>
|
---|
8 |
|
---|
9 | #include "winnls.h"
|
---|
10 | #include "wine/unicode.h"
|
---|
11 |
|
---|
/* Number of bytes that follow the first byte of a UTF-8 sequence.
 * The table only covers first bytes above 0x7f and is indexed by
 * (first byte - 0x80). */
static const char utf8_length[128] =
{
    /* 0x80-0xbf */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xc0-0xdf */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xe0-0xef */
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    /* 0xf0-0xf7 */
    3,3,3,3,3,3,3,3,
    /* 0xf8-0xfb */
    4,4,4,4,
    /* 0xfc-0xfd */
    5,5,
    /* 0xfe-0xff */
    0,0
};
24 |
|
---|
/* Mask applied to the first byte of a UTF-8 sequence, indexed by the
 * number of bytes that follow it. */
static const unsigned char utf8_mask[6] =
{
    0x7f,  /* no following byte */
    0x1f,  /* 1 following byte */
    0x0f,  /* 2 following bytes */
    0x07,  /* 3 following bytes */
    0x03,  /* 4 following bytes */
    0x01   /* 5 following bytes */
};
27 |
|
---|
/* Smallest Unicode value allowed for a UTF-8 sequence, indexed by the
 * number of bytes that follow the first byte. */
static const unsigned int utf8_minval[6] =
{
    0x0,       /* no following byte */
    0x80,      /* 1 following byte */
    0x800,     /* 2 following bytes */
    0x10000,   /* 3 following bytes */
    0x200000,  /* 4 following bytes */
    0x4000000  /* 5 following bytes */
};
30 |
|
---|
31 |
|
---|
32 | /* query necessary dst length for src string */
|
---|
33 | inline static int get_length_wcs_utf8( const WCHAR *src, unsigned int srclen )
|
---|
34 | {
|
---|
35 | int len;
|
---|
36 | for (len = 0; srclen; srclen--, src++, len++)
|
---|
37 | {
|
---|
38 | if (*src >= 0x80)
|
---|
39 | {
|
---|
40 | len++;
|
---|
41 | if (*src >= 0x800) len++;
|
---|
42 | }
|
---|
43 | }
|
---|
44 | return len;
|
---|
45 | }
|
---|
46 |
|
---|
47 | /* wide char to UTF-8 string conversion */
|
---|
48 | /* return -1 on dst buffer overflow */
|
---|
49 | int utf8_wcstombs( const WCHAR *src, int srclen, char *dst, int dstlen )
|
---|
50 | {
|
---|
51 | int ret = srclen;
|
---|
52 |
|
---|
53 | if (!dstlen) return get_length_wcs_utf8( src, srclen );
|
---|
54 |
|
---|
55 | for (ret = srclen; srclen; srclen--, src++)
|
---|
56 | {
|
---|
57 | WCHAR ch = *src;
|
---|
58 |
|
---|
59 | if (ch < 0x80) /* 0x00-0x7f: 1 byte */
|
---|
60 | {
|
---|
61 | if (!dstlen--) return -1; /* overflow */
|
---|
62 | *dst++ = ch;
|
---|
63 | continue;
|
---|
64 | }
|
---|
65 |
|
---|
66 | if (ch < 0x800) /* 0x80-0x7ff: 2 bytes */
|
---|
67 | {
|
---|
68 | if ((dstlen -= 2) < 0) return -1; /* overflow */
|
---|
69 | dst[1] = 0x80 | (ch & 0x3f);
|
---|
70 | ch >>= 6;
|
---|
71 | dst[0] = 0xc0 | ch;
|
---|
72 | dst += 2;
|
---|
73 | continue;
|
---|
74 | }
|
---|
75 |
|
---|
76 | /* 0x800-0xffff: 3 bytes */
|
---|
77 |
|
---|
78 | if ((dstlen -= 3) < 0) return -1; /* overflow */
|
---|
79 | dst[2] = 0x80 | (ch & 0x3f);
|
---|
80 | ch >>= 6;
|
---|
81 | dst[1] = 0x80 | (ch & 0x3f);
|
---|
82 | ch >>= 6;
|
---|
83 | dst[0] = 0xe0 | ch;
|
---|
84 | dst += 3;
|
---|
85 | }
|
---|
86 | return ret;
|
---|
87 | }
|
---|
88 |
|
---|
89 | /* query necessary dst length for src string */
|
---|
90 | inline static int get_length_mbs_utf8( const unsigned char *src, int srclen )
|
---|
91 | {
|
---|
92 | int ret;
|
---|
93 | const unsigned char *srcend = src + srclen;
|
---|
94 |
|
---|
95 | for (ret = 0; src < srcend; ret++)
|
---|
96 | {
|
---|
97 | unsigned char ch = *src++;
|
---|
98 | if (ch < 0xc0) continue;
|
---|
99 |
|
---|
100 | switch(utf8_length[ch-0x80])
|
---|
101 | {
|
---|
102 | case 5:
|
---|
103 | if (src >= srcend) return ret; /* ignore partial char */
|
---|
104 | if ((ch = *src ^ 0x80) >= 0x40) continue;
|
---|
105 | src++;
|
---|
106 | case 4:
|
---|
107 | if (src >= srcend) return ret; /* ignore partial char */
|
---|
108 | if ((ch = *src ^ 0x80) >= 0x40) continue;
|
---|
109 | src++;
|
---|
110 | case 3:
|
---|
111 | if (src >= srcend) return ret; /* ignore partial char */
|
---|
112 | if ((ch = *src ^ 0x80) >= 0x40) continue;
|
---|
113 | src++;
|
---|
114 | case 2:
|
---|
115 | if (src >= srcend) return ret; /* ignore partial char */
|
---|
116 | if ((ch = *src ^ 0x80) >= 0x40) continue;
|
---|
117 | src++;
|
---|
118 | case 1:
|
---|
119 | if (src >= srcend) return ret; /* ignore partial char */
|
---|
120 | if ((ch = *src ^ 0x80) >= 0x40) continue;
|
---|
121 | src++;
|
---|
122 | }
|
---|
123 | }
|
---|
124 | return ret;
|
---|
125 | }
|
---|
126 |
|
---|
127 | /* UTF-8 to wide char string conversion */
|
---|
128 | /* return -1 on dst buffer overflow, -2 on invalid input char */
|
---|
129 | int utf8_mbstowcs( int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
|
---|
130 | {
|
---|
131 | int len, count;
|
---|
132 | unsigned int res;
|
---|
133 | const char *srcend = src + srclen;
|
---|
134 |
|
---|
135 | if (!dstlen) return get_length_mbs_utf8( (const unsigned char *)src, srclen );
|
---|
136 |
|
---|
137 | for (count = dstlen; count && (src < srcend); count--, dst++)
|
---|
138 | {
|
---|
139 | unsigned char ch = *src++;
|
---|
140 | if (ch < 0x80) /* special fast case for 7-bit ASCII */
|
---|
141 | {
|
---|
142 | *dst = ch;
|
---|
143 | continue;
|
---|
144 | }
|
---|
145 | len = utf8_length[ch-0x80];
|
---|
146 | res = ch & utf8_mask[len];
|
---|
147 |
|
---|
148 | switch(len)
|
---|
149 | {
|
---|
150 | case 5:
|
---|
151 | if (src >= srcend) goto done; /* ignore partial char */
|
---|
152 | if ((ch = *src ^ 0x80) >= 0x40) goto bad;
|
---|
153 | res = (res << 6) | ch;
|
---|
154 | src++;
|
---|
155 | case 4:
|
---|
156 | if (src >= srcend) goto done; /* ignore partial char */
|
---|
157 | if ((ch = *src ^ 0x80) >= 0x40) goto bad;
|
---|
158 | res = (res << 6) | ch;
|
---|
159 | src++;
|
---|
160 | case 3:
|
---|
161 | if (src >= srcend) goto done; /* ignore partial char */
|
---|
162 | if ((ch = *src ^ 0x80) >= 0x40) goto bad;
|
---|
163 | res = (res << 6) | ch;
|
---|
164 | src++;
|
---|
165 | case 2:
|
---|
166 | if (src >= srcend) goto done; /* ignore partial char */
|
---|
167 | if ((ch = *src ^ 0x80) >= 0x40) goto bad;
|
---|
168 | res = (res << 6) | ch;
|
---|
169 | src++;
|
---|
170 | case 1:
|
---|
171 | if (src >= srcend) goto done; /* ignore partial char */
|
---|
172 | if ((ch = *src ^ 0x80) >= 0x40) goto bad;
|
---|
173 | res = (res << 6) | ch;
|
---|
174 | src++;
|
---|
175 | if (res < utf8_minval[len]) goto bad;
|
---|
176 | if (res >= 0x10000) goto bad; /* FIXME: maybe we should do surrogates here */
|
---|
177 | *dst = res;
|
---|
178 | continue;
|
---|
179 | }
|
---|
180 | bad:
|
---|
181 | if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
|
---|
182 | *dst = (WCHAR)'?';
|
---|
183 | }
|
---|
184 | if (src < srcend) return -1; /* overflow */
|
---|
185 | done:
|
---|
186 | return dstlen - count;
|
---|
187 | }
|
---|