""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

Table generation:
(c) Copyright Marc-Andre Lemburg, 2005.
    Licensed to PSF under a Contributor Agreement.

"""#"

import re, os, time, marshal, codecs

# Interpreters without a unichr() builtin offer the same functionality
# as chr(); where unichr() exists this block changes nothing.
try:
    unichr
except NameError:
    unichr = chr

# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = unichr(0xFFFE)

# Matches one line of a mapping file. Groups:
#   1. the encoded value: one or more 0x... hex codes joined by '+'
#   2. the Unicode value: zero or more hex codes or <meta> codes
#      joined by '+' (may be empty)
#   3. an optional trailing '#' comment (including the '#')
# Raw strings are used so that the backslash escapes reach the re
# module unchanged.
# NOTE(review): the hex digits in group 2 are matched with [0-9a-fA-Z]
# rather than [0-9a-fA-F]; left as found -- confirm whether intended.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')

def parsecodes(codes,
               len=len, filter=filter,range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal numbers joined by '+'
        (e.g. '0x41' or '0x0041+0x0301').  The filter and range
        parameters are unused and only kept so that the signature
        stays unchanged.

        In a combination, parts which are not valid hexadecimal
        numbers, such as meta-codes (in angular brackets, e.g. <LR>
        and <RL>), are ignored.  If exactly one number remains it is
        returned as integer, otherwise a (possibly empty) tuple of
        the remaining numbers is returned.

        Empty codes are returned as None.  A single code which is not
        a valid hexadecimal number raises a ValueError.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        # A lone code is converted directly; errors are passed on
        return int(parts[0], 16)
    # Build a real list, so that len() works regardless of whether
    # filter() would have returned a list or an iterator
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code or empty part: drop it
            pass
    if len(values) == 1:
        return values[0]
    return tuple(values)

def readmap(filename):

    """ Reads the mapping file filename and returns the decoding map.

        The result is a dictionary mapping encoded values (as
        returned by parsecodes(): an integer or a tuple of integers)
        to (unicode, comment) tuples.  unicode is the parsecodes()
        result for the Unicode column of the line (possibly None),
        comment is the text following the '#' on that line or ''.

        Code points 0x00 - 0x1F and 0x7F are pre-set to identity
        mappings with the comment 'CONTROL CHARACTER'; entries found
        in the file override them.

        If there are at least as many identity mappings as there are
        unmapped codes below 256, the unmapped codes are added as
        (None, '') and the special key 'IDENTITY' is set to 256.

        Lines which are empty, start with '#' or are not matched by
        mapRE are skipped.  Errors raised by parsecodes() are passed
        on to the caller.

    """
    # The with statement closes the file even if reading fails
    with open(filename, 'r') as f:
        lines = f.readlines()

    enc2uni = {}
    # Only the number of identity mappings found is needed below
    identity_count = 0
    unmapped = set(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for code in list(range(32)) + [127]:
        identity_count += 1
        unmapped.discard(code)
        enc2uni[code] = (code, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Strip the leading '#'
            comment = comment[1:].strip()
        # Multi-code values (tuples) never count as single byte codes
        if not isinstance(enc, tuple) and enc < 256:
            unmapped.discard(enc)
            if enc == uni:
                identity_count += 1
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if identity_count >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni

def hexrepr(t, precision=4):

    """ Returns t formatted as hexadecimal Python source text.

        None gives 'None'.  An object without a length is formatted
        as a single number: '0x' followed by at least precision
        upper-case hex digits.  Anything else is treated as a sequence
        of numbers and formatted as '(0x..., 0x...)'.

        If a sequence item cannot be formatted, a message is printed
        and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: a single number
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise

def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source code lines which define varname as
        dictionary holding the entries of map.

        Keys and values of map may be plain values or 2-tuples; a
        tuple is split into the actual value and a comment.  Entries
        whose key is None are left out.  precisions gives the minimum
        number of hex digits used for keys and values.  Comments are
        only added if comments is true.

        If map has the key 'IDENTITY', the generated code starts with
        an identity dictionary of that size and identity mappings
        below 256 are left out.  Note that the 'IDENTITY' key is
        removed from map in that case, i.e. the argument is modified.

        After every 4096 entries the definition is continued in a
        new varname.update({...}) call.

    """
    l = []
    append = l.append
    identity = "IDENTITY" in map
    if identity:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        # Remove the marker from the caller's dictionary, so that it
        # is not processed as a mapping
        del map["IDENTITY"]
    else:
        append("%s = {" % varname)
        splits = 0

    key_precision, value_precision = precisions
    count = 0
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        # NOTE(review): entries are emitted with a single leading
        # space -- confirm that this is the intended indent.
        if mapcomment and comments:
            append(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append(' %s: %s,' % (key, value))
        count += 1
        if count == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            count = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l

def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source code lines which define varname as
        decoding table for map, or None if no table can be built.

        The table has one repr()'d character per line, for every key
        from 0 up to the largest key in map.  Keys without entry and
        entries mapping to None get the UNI_UNDEFINED character.
        key_precision gives the minimum number of hex digits used for
        the key in the comments, which are only added if comments is
        true.

        Keys and values of map may be plain values or 2-tuples; a
        tuple is split into the actual value and a comment.  Entries
        whose key is None are left out.

        If map has the key 'IDENTITY', the first 256 entries default
        to identity mappings.  Note that the 'IDENTITY' key is removed
        from map in that case, i.e. the argument is modified.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if a value within the table is a tuple (1-n mapping).

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    # Fetch the entries only now that the 'IDENTITY' marker is gone;
    # otherwise the marker itself would be processed as a mapping
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = unichr(mapvalue)
        # NOTE(review): entries are emitted with a single leading
        # space -- confirm that this is the intended indent.
        if mapcomment and comments:
            append(' %r\t# %s -> %s' % (mapchar,
                                        hexrepr(key, key_precision),
                                        mapcomment))
        else:
            append(' %r' % mapchar)

    append(')')
    return l

def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is mentioned in the docstring of the generated module as
        the source the codec was generated from, encodingname is the
        name of the codec; it is registered with underscores replaced
        by hyphens.

        The generated module uses a decoding table together with an
        encoding table built by codecs.charmap_build() if
        python_tabledef_code() can create a table for map.  Otherwise
        a decoding map and an encoding map are written.

        Comments are included in the source, if comments is true (default).

        Note that map is passed to python_mapdef_code(), which removes
        an 'IDENTITY' key from it.

    """
    # Generate code.  The order of these calls matters: the first one
    # removes the 'IDENTITY' key from map before map is used again.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    # The templates below are written to the generated module as they
    # are; their indentation is part of the generated code.
    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()

def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec source generated by codegen() for map to the
        file pyfile.

        name, map, encodingname and comments are passed on to
        codegen().  An existing file is overwritten.

    """
    # Generate first, so that the file is only opened (and truncated)
    # if code generation succeeded
    code = codegen(name,map,encodingname,comments)
    # The with statement closes the file even if writing fails
    with open(pyfile,'w') as f:
        f.write(code)

def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a pair (unicode, comment); the
        pairs are stored as tuples.  name is not used.  An existing
        file is overwritten.

    """
    d = {}
    # Unpacking makes sure that all values are pairs before anything
    # is written
    for e,(u,c) in map.items():
        d[e] = (u,c)
    # The with statement closes the file even if dumping fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)

def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir.

        For every regular file in dir a codec module (.py) and a
        marshalled mapping (.mapping) are written.  The base name of
        the output files is nameprefix plus the part of the file name
        before the first dot, in lowercase and with hyphens replaced
        by underscores; dirprefix is prepended to the resulting file
        names as is.  comments is passed on to pymap().

        A ValueError raised during the conversion is reported and
        re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        # 'A-B.c.txt' -> 'a_b'
        stem = os.path.split(mapname)[1].replace('-','_')
        stem = stem.split('.')[0].lower()
        name = nameprefix + stem
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            encmap = readmap(mappathname)
            if encmap:
                pymap(mappathname, encmap, dirprefix + codefile,
                      name, comments)
                marshalmap(mappathname, encmap, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise

def rewritepythondir(dir, dirprefix='', comments=1):

    """ Recreates the codec modules from the marshalled mappings
        found in the directory dir.

        For every file in dir ending in '.mapping' the mapping is
        loaded and passed to pymap(), which writes the module to
        dirprefix plus the file name with '.py' in place of
        '.mapping'.  comments is passed on to pymap().

        A ValueError raised during the conversion is reported; the
        remaining files are still processed.

    """
    for mapname in os.listdir(dir):
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # The with statement makes sure the mapping file is closed
            # again, even if loading fails
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)

if __name__ == '__main__':

    import sys
    # The command line arguments are passed on as positional
    # arguments.  Change the condition to run rewritepythondir()
    # instead of convertdir().
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])