1 | /*
|
---|
2 | * cjkcodecs.h: common header for cjkcodecs
|
---|
3 | *
|
---|
4 | * Written by Hye-Shik Chang <perky@FreeBSD.org>
|
---|
5 | */
|
---|
6 |
|
---|
7 | #ifndef _CJKCODECS_H_
|
---|
8 | #define _CJKCODECS_H_
|
---|
9 |
|
---|
10 | #define PY_SSIZE_T_CLEAN
|
---|
11 | #include "Python.h"
|
---|
12 | #include "multibytecodec.h"
|
---|
13 |
|
---|
14 |
|
---|
15 | /* a unicode "undefined" codepoint */
|
---|
16 | #define UNIINV 0xFFFE
|
---|
17 |
|
---|
18 | /* internal-use DBCS codepoints which aren't used by any charsets */
|
---|
19 | #define NOCHAR 0xFFFF
|
---|
20 | #define MULTIC 0xFFFE
|
---|
21 | #define DBCINV 0xFFFD
|
---|
22 |
|
---|
23 | /* shorter macros to save source size of mapping tables */
|
---|
24 | #define U UNIINV
|
---|
25 | #define N NOCHAR
|
---|
26 | #define M MULTIC
|
---|
27 | #define D DBCINV
|
---|
28 |
|
---|
29 | struct dbcs_index {
|
---|
30 | const ucs2_t *map;
|
---|
31 | unsigned char bottom, top;
|
---|
32 | };
|
---|
33 | typedef struct dbcs_index decode_map;
|
---|
34 |
|
---|
35 | struct widedbcs_index {
|
---|
36 | const ucs4_t *map;
|
---|
37 | unsigned char bottom, top;
|
---|
38 | };
|
---|
39 | typedef struct widedbcs_index widedecode_map;
|
---|
40 |
|
---|
41 | struct unim_index {
|
---|
42 | const DBCHAR *map;
|
---|
43 | unsigned char bottom, top;
|
---|
44 | };
|
---|
45 | typedef struct unim_index encode_map;
|
---|
46 |
|
---|
/*
 * Encoding-table row whose entries are raw bytes rather than DBCHARs.
 *
 * NOTE(review): presumably the row type consumed by _TRYMAP_ENC_MPLANE,
 * which reads three consecutive bytes per entry starting at
 * map[(v - bottom) * 3] -- confirm against the mapping tables.
 */
struct unim_index_bytebased {
    const unsigned char *map;   /* row data, indexed from `bottom` */
    unsigned char bottom;       /* lowest value covered by `map` */
    unsigned char top;          /* highest value covered by `map` */
};
|
---|
51 |
|
---|
/*
 * Named pair of mapping tables for one charset.
 *
 * Entries are built by the MAPPING_ENCONLY / MAPPING_DECONLY /
 * MAPPING_ENCDEC macros, which leave the missing direction NULL, and the
 * list is terminated by an entry whose charset is the empty string.
 * Field order matters: those macros use positional initializers.
 */
struct dbcs_map {
    const char *charset;                /* charset name, "" for the sentinel */
    const struct unim_index *encmap;    /* encoding table, or NULL */
    const struct dbcs_index *decmap;    /* decoding table, or NULL */
};
|
---|
57 |
|
---|
58 | struct pair_encodemap {
|
---|
59 | ucs4_t uniseq;
|
---|
60 | DBCHAR code;
|
---|
61 | };
|
---|
62 |
|
---|
63 | static const MultibyteCodec *codec_list;
|
---|
64 | static const struct dbcs_map *mapping_list;
|
---|
65 |
|
---|
66 | #define CODEC_INIT(encoding) \
|
---|
67 | static int encoding##_codec_init(const void *config)
|
---|
68 |
|
---|
69 | #define ENCODER_INIT(encoding) \
|
---|
70 | static int encoding##_encode_init( \
|
---|
71 | MultibyteCodec_State *state, const void *config)
|
---|
72 | #define ENCODER(encoding) \
|
---|
73 | static Py_ssize_t encoding##_encode( \
|
---|
74 | MultibyteCodec_State *state, const void *config, \
|
---|
75 | const Py_UNICODE **inbuf, Py_ssize_t inleft, \
|
---|
76 | unsigned char **outbuf, Py_ssize_t outleft, int flags)
|
---|
77 | #define ENCODER_RESET(encoding) \
|
---|
78 | static Py_ssize_t encoding##_encode_reset( \
|
---|
79 | MultibyteCodec_State *state, const void *config, \
|
---|
80 | unsigned char **outbuf, Py_ssize_t outleft)
|
---|
81 |
|
---|
82 | #define DECODER_INIT(encoding) \
|
---|
83 | static int encoding##_decode_init( \
|
---|
84 | MultibyteCodec_State *state, const void *config)
|
---|
85 | #define DECODER(encoding) \
|
---|
86 | static Py_ssize_t encoding##_decode( \
|
---|
87 | MultibyteCodec_State *state, const void *config, \
|
---|
88 | const unsigned char **inbuf, Py_ssize_t inleft, \
|
---|
89 | Py_UNICODE **outbuf, Py_ssize_t outleft)
|
---|
90 | #define DECODER_RESET(encoding) \
|
---|
91 | static Py_ssize_t encoding##_decode_reset( \
|
---|
92 | MultibyteCodec_State *state, const void *config)
|
---|
93 |
|
---|
94 | #if Py_UNICODE_SIZE == 4
|
---|
95 | #define UCS4INVALID(code) \
|
---|
96 | if ((code) > 0xFFFF) \
|
---|
97 | return 1;
|
---|
98 | #else
|
---|
99 | #define UCS4INVALID(code) \
|
---|
100 | if (0) ;
|
---|
101 | #endif
|
---|
102 |
|
---|
103 | #define NEXT_IN(i) \
|
---|
104 | (*inbuf) += (i); \
|
---|
105 | (inleft) -= (i);
|
---|
106 | #define NEXT_OUT(o) \
|
---|
107 | (*outbuf) += (o); \
|
---|
108 | (outleft) -= (o);
|
---|
109 | #define NEXT(i, o) \
|
---|
110 | NEXT_IN(i) NEXT_OUT(o)
|
---|
111 |
|
---|
112 | #define REQUIRE_INBUF(n) \
|
---|
113 | if (inleft < (n)) \
|
---|
114 | return MBERR_TOOFEW;
|
---|
115 | #define REQUIRE_OUTBUF(n) \
|
---|
116 | if (outleft < (n)) \
|
---|
117 | return MBERR_TOOSMALL;
|
---|
118 |
|
---|
119 | #define IN1 ((*inbuf)[0])
|
---|
120 | #define IN2 ((*inbuf)[1])
|
---|
121 | #define IN3 ((*inbuf)[2])
|
---|
122 | #define IN4 ((*inbuf)[3])
|
---|
123 |
|
---|
124 | #define OUT1(c) ((*outbuf)[0]) = (c);
|
---|
125 | #define OUT2(c) ((*outbuf)[1]) = (c);
|
---|
126 | #define OUT3(c) ((*outbuf)[2]) = (c);
|
---|
127 | #define OUT4(c) ((*outbuf)[3]) = (c);
|
---|
128 |
|
---|
129 | #define WRITE1(c1) \
|
---|
130 | REQUIRE_OUTBUF(1) \
|
---|
131 | (*outbuf)[0] = (c1);
|
---|
132 | #define WRITE2(c1, c2) \
|
---|
133 | REQUIRE_OUTBUF(2) \
|
---|
134 | (*outbuf)[0] = (c1); \
|
---|
135 | (*outbuf)[1] = (c2);
|
---|
136 | #define WRITE3(c1, c2, c3) \
|
---|
137 | REQUIRE_OUTBUF(3) \
|
---|
138 | (*outbuf)[0] = (c1); \
|
---|
139 | (*outbuf)[1] = (c2); \
|
---|
140 | (*outbuf)[2] = (c3);
|
---|
141 | #define WRITE4(c1, c2, c3, c4) \
|
---|
142 | REQUIRE_OUTBUF(4) \
|
---|
143 | (*outbuf)[0] = (c1); \
|
---|
144 | (*outbuf)[1] = (c2); \
|
---|
145 | (*outbuf)[2] = (c3); \
|
---|
146 | (*outbuf)[3] = (c4);
|
---|
147 |
|
---|
148 | #if Py_UNICODE_SIZE == 2
|
---|
149 | # define WRITEUCS4(c) \
|
---|
150 | REQUIRE_OUTBUF(2) \
|
---|
151 | (*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10); \
|
---|
152 | (*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff); \
|
---|
153 | NEXT_OUT(2)
|
---|
154 | #else
|
---|
155 | # define WRITEUCS4(c) \
|
---|
156 | REQUIRE_OUTBUF(1) \
|
---|
157 | **outbuf = (Py_UNICODE)(c); \
|
---|
158 | NEXT_OUT(1)
|
---|
159 | #endif
|
---|
160 |
|
---|
161 | #define _TRYMAP_ENC(m, assi, val) \
|
---|
162 | ((m)->map != NULL && (val) >= (m)->bottom && \
|
---|
163 | (val)<= (m)->top && ((assi) = (m)->map[(val) - \
|
---|
164 | (m)->bottom]) != NOCHAR)
|
---|
165 | #define TRYMAP_ENC_COND(charset, assi, uni) \
|
---|
166 | _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)
|
---|
167 | #define TRYMAP_ENC(charset, assi, uni) \
|
---|
168 | if TRYMAP_ENC_COND(charset, assi, uni)
|
---|
169 |
|
---|
170 | #define _TRYMAP_DEC(m, assi, val) \
|
---|
171 | ((m)->map != NULL && (val) >= (m)->bottom && \
|
---|
172 | (val)<= (m)->top && ((assi) = (m)->map[(val) - \
|
---|
173 | (m)->bottom]) != UNIINV)
|
---|
174 | #define TRYMAP_DEC(charset, assi, c1, c2) \
|
---|
175 | if _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)
|
---|
176 |
|
---|
177 | #define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val) \
|
---|
178 | ((m)->map != NULL && (val) >= (m)->bottom && \
|
---|
179 | (val)<= (m)->top && \
|
---|
180 | ((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \
|
---|
181 | (((asshi) = (m)->map[((val) - (m)->bottom)*3 + 1]), 1) && \
|
---|
182 | (((asslo) = (m)->map[((val) - (m)->bottom)*3 + 2]), 1))
|
---|
183 | #define TRYMAP_ENC_MPLANE(charset, assplane, asshi, asslo, uni) \
|
---|
184 | if _TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \
|
---|
185 | assplane, asshi, asslo, (uni) & 0xff)
|
---|
186 | #define TRYMAP_DEC_MPLANE(charset, assi, plane, c1, c2) \
|
---|
187 | if _TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2)
|
---|
188 |
|
---|
189 | #if Py_UNICODE_SIZE == 2
|
---|
190 | #define DECODE_SURROGATE(c) \
|
---|
191 | if (c >> 10 == 0xd800 >> 10) { /* high surrogate */ \
|
---|
192 | REQUIRE_INBUF(2) \
|
---|
193 | if (IN2 >> 10 == 0xdc00 >> 10) { /* low surrogate */ \
|
---|
194 | c = 0x10000 + ((ucs4_t)(c - 0xd800) << 10) + \
|
---|
195 | ((ucs4_t)(IN2) - 0xdc00); \
|
---|
196 | } \
|
---|
197 | }
|
---|
198 | #define GET_INSIZE(c) ((c) > 0xffff ? 2 : 1)
|
---|
199 | #else
|
---|
200 | #define DECODE_SURROGATE(c) {;}
|
---|
201 | #define GET_INSIZE(c) 1
|
---|
202 | #endif
|
---|
203 |
|
---|
204 | #define BEGIN_MAPPINGS_LIST static const struct dbcs_map _mapping_list[] = {
|
---|
205 | #define MAPPING_ENCONLY(enc) {#enc, (void*)enc##_encmap, NULL},
|
---|
206 | #define MAPPING_DECONLY(enc) {#enc, NULL, (void*)enc##_decmap},
|
---|
207 | #define MAPPING_ENCDEC(enc) {#enc, (void*)enc##_encmap, (void*)enc##_decmap},
|
---|
208 | #define END_MAPPINGS_LIST \
|
---|
209 | {"", NULL, NULL} }; \
|
---|
210 | static const struct dbcs_map *mapping_list = \
|
---|
211 | (const struct dbcs_map *)_mapping_list;
|
---|
212 |
|
---|
213 | #define BEGIN_CODECS_LIST static const MultibyteCodec _codec_list[] = {
|
---|
214 | #define _STATEFUL_METHODS(enc) \
|
---|
215 | enc##_encode, \
|
---|
216 | enc##_encode_init, \
|
---|
217 | enc##_encode_reset, \
|
---|
218 | enc##_decode, \
|
---|
219 | enc##_decode_init, \
|
---|
220 | enc##_decode_reset,
|
---|
221 | #define _STATELESS_METHODS(enc) \
|
---|
222 | enc##_encode, NULL, NULL, \
|
---|
223 | enc##_decode, NULL, NULL,
|
---|
224 | #define CODEC_STATEFUL(enc) { \
|
---|
225 | #enc, NULL, NULL, \
|
---|
226 | _STATEFUL_METHODS(enc) \
|
---|
227 | },
|
---|
228 | #define CODEC_STATELESS(enc) { \
|
---|
229 | #enc, NULL, NULL, \
|
---|
230 | _STATELESS_METHODS(enc) \
|
---|
231 | },
|
---|
232 | #define CODEC_STATELESS_WINIT(enc) { \
|
---|
233 | #enc, NULL, \
|
---|
234 | enc##_codec_init, \
|
---|
235 | _STATELESS_METHODS(enc) \
|
---|
236 | },
|
---|
237 | #define END_CODECS_LIST \
|
---|
238 | {"", NULL,} }; \
|
---|
239 | static const MultibyteCodec *codec_list = \
|
---|
240 | (const MultibyteCodec *)_codec_list;
|
---|
241 |
|
---|
242 | static PyObject *
|
---|
243 | getmultibytecodec(void)
|
---|
244 | {
|
---|
245 | static PyObject *cofunc = NULL;
|
---|
246 |
|
---|
247 | if (cofunc == NULL) {
|
---|
248 | PyObject *mod = PyImport_ImportModuleNoBlock("_multibytecodec");
|
---|
249 | if (mod == NULL)
|
---|
250 | return NULL;
|
---|
251 | cofunc = PyObject_GetAttrString(mod, "__create_codec");
|
---|
252 | Py_DECREF(mod);
|
---|
253 | }
|
---|
254 | return cofunc;
|
---|
255 | }
|
---|
256 |
|
---|
257 | static PyObject *
|
---|
258 | getcodec(PyObject *self, PyObject *encoding)
|
---|
259 | {
|
---|
260 | PyObject *codecobj, *r, *cofunc;
|
---|
261 | const MultibyteCodec *codec;
|
---|
262 | const char *enc;
|
---|
263 |
|
---|
264 | if (!PyString_Check(encoding)) {
|
---|
265 | PyErr_SetString(PyExc_TypeError,
|
---|
266 | "encoding name must be a string.");
|
---|
267 | return NULL;
|
---|
268 | }
|
---|
269 |
|
---|
270 | cofunc = getmultibytecodec();
|
---|
271 | if (cofunc == NULL)
|
---|
272 | return NULL;
|
---|
273 |
|
---|
274 | enc = PyString_AS_STRING(encoding);
|
---|
275 | for (codec = codec_list; codec->encoding[0]; codec++)
|
---|
276 | if (strcmp(codec->encoding, enc) == 0)
|
---|
277 | break;
|
---|
278 |
|
---|
279 | if (codec->encoding[0] == '\0') {
|
---|
280 | PyErr_SetString(PyExc_LookupError,
|
---|
281 | "no such codec is supported.");
|
---|
282 | return NULL;
|
---|
283 | }
|
---|
284 |
|
---|
285 | codecobj = PyCapsule_New((void *)codec, PyMultibyteCodec_CAPSULE_NAME, NULL);
|
---|
286 | if (codecobj == NULL)
|
---|
287 | return NULL;
|
---|
288 |
|
---|
289 | r = PyObject_CallFunctionObjArgs(cofunc, codecobj, NULL);
|
---|
290 | Py_DECREF(codecobj);
|
---|
291 |
|
---|
292 | return r;
|
---|
293 | }
|
---|
294 |
|
---|
295 | static struct PyMethodDef __methods[] = {
|
---|
296 | {"getcodec", (PyCFunction)getcodec, METH_O, ""},
|
---|
297 | {NULL, NULL},
|
---|
298 | };
|
---|
299 |
|
---|
300 | static int
|
---|
301 | register_maps(PyObject *module)
|
---|
302 | {
|
---|
303 | const struct dbcs_map *h;
|
---|
304 |
|
---|
305 | for (h = mapping_list; h->charset[0] != '\0'; h++) {
|
---|
306 | char mhname[256] = "__map_";
|
---|
307 | int r;
|
---|
308 | strcpy(mhname + sizeof("__map_") - 1, h->charset);
|
---|
309 | r = PyModule_AddObject(module, mhname,
|
---|
310 | PyCapsule_New((void *)h, PyMultibyteCodec_CAPSULE_NAME, NULL));
|
---|
311 | if (r == -1)
|
---|
312 | return -1;
|
---|
313 | }
|
---|
314 | return 0;
|
---|
315 | }
|
---|
316 |
|
---|
317 | #ifdef USING_BINARY_PAIR_SEARCH
|
---|
318 | static DBCHAR
|
---|
319 | find_pairencmap(ucs2_t body, ucs2_t modifier,
|
---|
320 | const struct pair_encodemap *haystack, int haystacksize)
|
---|
321 | {
|
---|
322 | int pos, min, max;
|
---|
323 | ucs4_t value = body << 16 | modifier;
|
---|
324 |
|
---|
325 | min = 0;
|
---|
326 | max = haystacksize;
|
---|
327 |
|
---|
328 | for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1)
|
---|
329 | if (value < haystack[pos].uniseq) {
|
---|
330 | if (max == pos) break;
|
---|
331 | else max = pos;
|
---|
332 | }
|
---|
333 | else if (value > haystack[pos].uniseq) {
|
---|
334 | if (min == pos) break;
|
---|
335 | else min = pos;
|
---|
336 | }
|
---|
337 | else
|
---|
338 | break;
|
---|
339 |
|
---|
340 | if (value == haystack[pos].uniseq)
|
---|
341 | return haystack[pos].code;
|
---|
342 | else
|
---|
343 | return DBCINV;
|
---|
344 | }
|
---|
345 | #endif
|
---|
346 |
|
---|
347 | #ifdef USING_IMPORTED_MAPS
|
---|
348 | #define IMPORT_MAP(locale, charset, encmap, decmap) \
|
---|
349 | importmap("_codecs_" #locale, "__map_" #charset, \
|
---|
350 | (const void**)encmap, (const void**)decmap)
|
---|
351 |
|
---|
352 | static int
|
---|
353 | importmap(const char *modname, const char *symbol,
|
---|
354 | const void **encmap, const void **decmap)
|
---|
355 | {
|
---|
356 | PyObject *o, *mod;
|
---|
357 |
|
---|
358 | mod = PyImport_ImportModule((char *)modname);
|
---|
359 | if (mod == NULL)
|
---|
360 | return -1;
|
---|
361 |
|
---|
362 | o = PyObject_GetAttrString(mod, (char*)symbol);
|
---|
363 | if (o == NULL)
|
---|
364 | goto errorexit;
|
---|
365 | else if (!PyCapsule_IsValid(o, PyMultibyteCodec_CAPSULE_NAME)) {
|
---|
366 | PyErr_SetString(PyExc_ValueError,
|
---|
367 | "map data must be a Capsule.");
|
---|
368 | goto errorexit;
|
---|
369 | }
|
---|
370 | else {
|
---|
371 | struct dbcs_map *map;
|
---|
372 | map = PyCapsule_GetPointer(o, PyMultibyteCodec_CAPSULE_NAME);
|
---|
373 | if (encmap != NULL)
|
---|
374 | *encmap = map->encmap;
|
---|
375 | if (decmap != NULL)
|
---|
376 | *decmap = map->decmap;
|
---|
377 | Py_DECREF(o);
|
---|
378 | }
|
---|
379 |
|
---|
380 | Py_DECREF(mod);
|
---|
381 | return 0;
|
---|
382 |
|
---|
383 | errorexit:
|
---|
384 | Py_DECREF(mod);
|
---|
385 | return -1;
|
---|
386 | }
|
---|
387 | #endif
|
---|
388 |
|
---|
389 | #define I_AM_A_MODULE_FOR(loc) \
|
---|
390 | void \
|
---|
391 | init_codecs_##loc(void) \
|
---|
392 | { \
|
---|
393 | PyObject *m = Py_InitModule("_codecs_" #loc, __methods);\
|
---|
394 | if (m != NULL) \
|
---|
395 | (void)register_maps(m); \
|
---|
396 | }
|
---|
397 |
|
---|
398 | #endif
|
---|