source: heimdal/trunk/lib/wind/normalize.c

Last change on this file was 1, checked in by Paul Smedley, 10 years ago

Initial commit of Heimdal 1.5.3

File size: 7.3 KB
Line 
1/*
2 * Copyright (c) 2004 Kungliga Tekniska Högskolan
3 * (Royal Institute of Technology, Stockholm, Sweden).
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the Institute nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34#ifdef HAVE_CONFIG_H
35#include <config.h>
36#endif
37#include "windlocl.h"
38
39#include <assert.h>
40#include <stdlib.h>
41#include <errno.h>
42#include <stdio.h>
43
44#include "roken.h"
45
46#include "normalize_table.h"
47
48static int
49translation_cmp(const void *key, const void *data)
50{
51 const struct translation *t1 = (const struct translation *)key;
52 const struct translation *t2 = (const struct translation *)data;
53
54 return t1->key - t2->key;
55}
56
57enum { s_base = 0xAC00};
58enum { s_count = 11172};
59enum { l_base = 0x1100};
60enum { l_count = 19};
61enum { v_base = 0x1161};
62enum { v_count = 21};
63enum { t_base = 0x11A7};
64enum { t_count = 28};
65enum { n_count = v_count * t_count};
66
67static int
68hangul_decomp(const uint32_t *in, size_t in_len,
69 uint32_t *out, size_t *out_len)
70{
71 uint32_t u = *in;
72 unsigned s_index;
73 unsigned l, v, t;
74 unsigned o;
75
76 if (u < s_base || u >= s_base + s_count)
77 return 0;
78 s_index = u - s_base;
79 l = l_base + s_index / n_count;
80 v = v_base + (s_index % n_count) / t_count;
81 t = t_base + s_index % t_count;
82 o = 2;
83 if (t != t_base)
84 ++o;
85 if (*out_len < o)
86 return WIND_ERR_OVERRUN;
87 out[0] = l;
88 out[1] = v;
89 if (t != t_base)
90 out[2] = t;
91 *out_len = o;
92 return 1;
93}
94
95static uint32_t
96hangul_composition(const uint32_t *in, size_t in_len)
97{
98 if (in_len < 2)
99 return 0;
100 if (in[0] >= l_base && in[0] < l_base + l_count) {
101 unsigned l_index = in[0] - l_base;
102 unsigned v_index;
103
104 if (in[1] < v_base || in[1] >= v_base + v_count)
105 return 0;
106 v_index = in[1] - v_base;
107 return (l_index * v_count + v_index) * t_count + s_base;
108 } else if (in[0] >= s_base && in[0] < s_base + s_count) {
109 unsigned s_index = in[0] - s_base;
110 unsigned t_index;
111
112 if (s_index % t_count != 0)
113 return 0;
114 if (in[1] < t_base || in[1] >= t_base + t_count)
115 return 0;
116 t_index = in[1] - t_base;
117 return in[0] + t_index;
118 }
119 return 0;
120}
121
122static int
123compat_decomp(const uint32_t *in, size_t in_len,
124 uint32_t *out, size_t *out_len)
125{
126 unsigned i;
127 unsigned o = 0;
128
129 for (i = 0; i < in_len; ++i) {
130 struct translation ts = {in[i]};
131 size_t sub_len = *out_len - o;
132 int ret;
133
134 ret = hangul_decomp(in + i, in_len - i,
135 out + o, &sub_len);
136 if (ret) {
137 if (ret == WIND_ERR_OVERRUN)
138 return ret;
139 o += sub_len;
140 } else {
141 void *s = bsearch(&ts,
142 _wind_normalize_table,
143 _wind_normalize_table_size,
144 sizeof(_wind_normalize_table[0]),
145 translation_cmp);
146 if (s != NULL) {
147 const struct translation *t = (const struct translation *)s;
148
149 ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
150 t->val_len,
151 out + o, &sub_len);
152 if (ret)
153 return ret;
154 o += sub_len;
155 } else {
156 if (o >= *out_len)
157 return WIND_ERR_OVERRUN;
158 out[o++] = in[i];
159
160 }
161 }
162 }
163 *out_len = o;
164 return 0;
165}
166
167static void
168swap_char(uint32_t * a, uint32_t * b)
169{
170 uint32_t t;
171 t = *a;
172 *a = *b;
173 *b = t;
174}
175
176/* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
177 * that all have Canonical_Combining_Class > 0 */
178static void
179canonical_reorder_sequence(uint32_t * a, size_t len)
180{
181 size_t i, j;
182
183 if (len <= 1)
184 return;
185
186 for (i = 1; i < len; i++) {
187 for (j = i;
188 j > 0 &&
189 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
190 j--)
191 swap_char(&a[j], &a[j-1]);
192 }
193}
194
195static void
196canonical_reorder(uint32_t *tmp, size_t tmp_len)
197{
198 size_t i;
199
200 for (i = 0; i < tmp_len; ++i) {
201 int cc = _wind_combining_class(tmp[i]);
202 if (cc) {
203 size_t j;
204 for (j = i + 1;
205 j < tmp_len && _wind_combining_class(tmp[j]);
206 ++j)
207 ;
208 canonical_reorder_sequence(&tmp[i], j - i);
209 i = j;
210 }
211 }
212}
213
214static uint32_t
215find_composition(const uint32_t *in, unsigned in_len)
216{
217 unsigned short canon_index = 0;
218 uint32_t cur;
219 unsigned n = 0;
220
221 cur = hangul_composition(in, in_len);
222 if (cur)
223 return cur;
224
225 do {
226 const struct canon_node *c = &_wind_canon_table[canon_index];
227 unsigned i;
228
229 if (n % 5 == 0) {
230 cur = *in++;
231 if (in_len-- == 0)
232 return c->val;
233 }
234
235 i = cur >> 16;
236 if (i < c->next_start || i >= c->next_end)
237 canon_index = 0;
238 else
239 canon_index =
240 _wind_canon_next_table[c->next_offset + i - c->next_start];
241 if (canon_index != 0) {
242 cur = (cur << 4) & 0xFFFFF;
243 ++n;
244 }
245 } while (canon_index != 0);
246 return 0;
247}
248
249static int
250combine(const uint32_t *in, size_t in_len,
251 uint32_t *out, size_t *out_len)
252{
253 unsigned i;
254 int ostarter;
255 unsigned o = 0;
256 int old_cc;
257
258 for (i = 0; i < in_len;) {
259 while (i < in_len && _wind_combining_class(in[i]) != 0) {
260 out[o++] = in[i++];
261 }
262 if (i < in_len) {
263 if (o >= *out_len)
264 return WIND_ERR_OVERRUN;
265 ostarter = o;
266 out[o++] = in[i++];
267 old_cc = -1;
268
269 while (i < in_len) {
270 uint32_t comb;
271 uint32_t v[2];
272 int cc;
273
274 v[0] = out[ostarter];
275 v[1] = in[i];
276
277 cc = _wind_combining_class(in[i]);
278 if (old_cc != cc && (comb = find_composition(v, 2))) {
279 out[ostarter] = comb;
280 } else if (cc == 0) {
281 break;
282 } else {
283 if (o >= *out_len)
284 return WIND_ERR_OVERRUN;
285 out[o++] = in[i];
286 old_cc = cc;
287 }
288 ++i;
289 }
290 }
291 }
292 *out_len = o;
293 return 0;
294}
295
296int
297_wind_stringprep_normalize(const uint32_t *in, size_t in_len,
298 uint32_t *out, size_t *out_len)
299{
300 size_t tmp_len;
301 uint32_t *tmp;
302 int ret;
303
304 if (in_len == 0) {
305 *out_len = 0;
306 return 0;
307 }
308
309 tmp_len = in_len * 4;
310 if (tmp_len < MAX_LENGTH_CANON)
311 tmp_len = MAX_LENGTH_CANON;
312 tmp = malloc(tmp_len * sizeof(uint32_t));
313 if (tmp == NULL)
314 return ENOMEM;
315
316 ret = compat_decomp(in, in_len, tmp, &tmp_len);
317 if (ret) {
318 free(tmp);
319 return ret;
320 }
321 canonical_reorder(tmp, tmp_len);
322 ret = combine(tmp, tmp_len, out, out_len);
323 free(tmp);
324 return ret;
325}
Note: See TracBrowser for help on using the repository browser.