Context Navigation

gencodec.py

Visit:

Last change on this file was 3225, checked in by bird, 18 years ago
Python 2.5
File size: 11.8 KB

Line
1	""" Unicode Mapping Parser and Codec Generator.
2
3	This script parses Unicode mapping files as available from the Unicode
4	site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5	modules from them. The codecs use the standard character mapping codec
6	to actually apply the mapping.
7
8	Synopsis: gencodec.py dir codec_prefix
9
10	All files in dir are scanned and those producing non-empty mappings
11	will be written to <codec_prefix><mapname>.py with <mapname> being the
12	first part of the map's filename ('a' in a.b.c.txt) converted to
13	lowercase with hyphens replaced by underscores.
14
15	The tool also writes marshalled versions of the mapping tables to the
16	same location (with .mapping extension).
17
18	Written by Marc-Andre Lemburg (mal@lemburg.com).
19
20	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21	(c) Copyright Guido van Rossum, 2000.
22
23	Table generation:
24	(c) Copyright Marc-Andre Lemburg, 2005.
25	Licensed to PSF under a Contributor Agreement.
26
27	"""#"
28
29	import re, os, time, marshal, codecs
30
# Maximum allowed size of charmap tables: python_tabledef_code() returns
# None (no table) when the highest mapped code exceeds this value.
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point; emitted as the placeholder
# character for unmapped positions in generated decoding tables.
# Written as a literal (same value as unichr(0xFFFE)).
UNI_UNDEFINED = u'\ufffe'

# Matches one entry line of a mapping file:
#   group 1: one or more encoded codes ("0xNN", joined by "+")
#   group 2: zero or more Unicode codes and/or meta-codes in angular
#            brackets (e.g. "<LR>"), joined by "+"
#   group 3: optional trailing comment, including the leading "#"
# Raw strings keep the backslash escapes intact for the regex engine.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
42
def parsecodes(codes,
               len=len, filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+',
        e.g. '0x41' or '0x41+0x0301'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) are
        ignored, as are empty parts left by a trailing '+'.

        Empty codes or illegal ones are returned as None; this
        includes strings that consist of nothing but meta-codes.

        The len, filter and range parameters are kept only so that the
        signature stays compatible with existing callers; filter and
        range are no longer used.

    """
    if not codes:
        return None
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code or empty part: skip it.
            continue
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
70
def readmap(filename):

    """ Reads the mapping file filename and returns the decoding map.

        The result is a dictionary mapping each encoded code (an integer,
        or a tuple of integers for code combinations) to a tuple
        (unicode_code, comment), where unicode_code is an integer, a
        tuple of integers or None for undefined codes.

        If at least as many single-byte codes are identity-mapped as are
        left unmapped, the unmapped codes are entered explicitly as
        (None, "") and the extra key 'IDENTITY' is set to 256 to tell
        the code generators to start from an identity dictionary.

    """
    f = open(filename, 'r')
    try:
        lines = f.readlines()
    finally:
        # Close the file even if reading fails.
        f.close()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            # Lines that don't look like a mapping entry are ignored.
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Drop the leading '#'.
            comment = comment[1:].strip()
        # Only single codes below 256 take part in the identity/unmapped
        # bookkeeping; code combinations (tuples) are stored as-is.
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
121
def hexrepr(t, precision=4):

    """ Returns the source-code representation of a code value.

        t may be None (returned as 'None'), a single integer (returned
        as '0x...' zero-padded to precision hex digits) or a sequence
        of integers (returned as a parenthesised, comma separated list
        of such hex literals).

        Raises TypeError if a sequence item can't be formatted as hex;
        a diagnostic line is printed before the error is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except (TypeError, AttributeError):
        # No length: treat t as a single code.  Only the "has no len()"
        # errors are caught, so unrelated exceptions are not masked.
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError:
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so that this parses on Python 2.5 and 3.x alike.
        import sys
        why = sys.exc_info()[1]
        print('* failed to convert %r: %s' % (t, why))
        raise
136
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source lines defining the dictionary varname
        with the contents of map.

        Keys and values of map may be plain codes or tuples of
        (code, comment); the comment is emitted as a trailing source
        comment if comments is true.  precisions gives the number of
        hex digits used for keys and values.

        If map has an 'IDENTITY' entry, the generated code starts from
        codecs.make_identity_dict() and only non-identity mappings are
        listed.  NOTE: the 'IDENTITY' entry is deleted from map in
        place.  convertdir() passes the same dictionary on to
        marshalmap(), which can only handle (code, comment) values, so
        this side effect must be kept.

    """
    l = []
    append = l.append
    identity = "IDENTITY" in map
    if identity:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
    else:
        append("%s = {" % varname)
        splits = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append(' %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    # Close the part that is still open: the initial literal is closed
    # with '}', every update() call with '})'.
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
194
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source lines defining the decoding table
        varname for map, or None if no table can be generated.

        The lines contain one character literal per code from 0 up to
        the highest mapped code, without separators, inside a pair of
        parentheses.  Codes without a mapping are represented by
        UNI_UNDEFINED.

        None is returned if the highest code exceeds MAX_TABLE_SIZE or
        if a code maps to more than one Unicode code point.

        If map has an 'IDENTITY' entry, the first 256 codes default to
        identity mappings and the entry is deleted from map in place.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    # Take the snapshot only after the 'IDENTITY' marker is gone;
    # otherwise the marker itself would be processed as a table entry
    # and make the size check below fail.
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append(' %r\t# %s -> %s' % (mapchar,
                                        hexrepr(key, key_precision),
                                        mapcomment))
        else:
            append(' %r' % mapchar)

    append(')')
    return l
250
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is the name of the mapping source and encodingname the
        name of the encoding; both are mentioned in the docstring of
        the generated module.  encodingname, with underscores replaced
        by hyphens, is also used as the codec name in getregentry().

        The generated module uses a decoding table if one can be built
        for map (see python_tabledef_code), otherwise a decoding map
        and an encoding map.

        Comments are included in the source, if comments is true (default).

    """
    # Generate code.  The decoding map has to be generated first: it
    # removes the 'IDENTITY' marker from map, which must be gone before
    # codecs.make_encoding_map() inverts the dictionary.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    # The templates below are emitted verbatim, so their indentation is
    # the indentation of the generated module.
    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()
355
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module source generated for map to pyfile.

        name, map, encodingname and comments are passed on to codegen().

    """
    code = codegen(name,map,encodingname,comments)
    f = open(pyfile,'w')
    try:
        f.write(code)
    finally:
        # Close the file even if writing fails.
        f.close()
362
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to marshalfile.

        Every value of map has to be a (code, comment) pair; any other
        value (e.g. a left-over 'IDENTITY' marker) makes the unpacking
        below fail.  name is not used.

    """
    d = dict((e, (u, c)) for e, (u, c) in map.items())
    f = open(marshalfile,'wb')
    try:
        marshal.dump(d,f)
    finally:
        # Close the file even if marshalling fails.
        f.close()
371
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files in dir to codec modules.

        For every regular file in dir a codec module (.py) and a
        marshalled mapping (.mapping) are written.  The output names
        are dirprefix + nameprefix + <mapname> + extension, with
        <mapname> being the part of the file name before the first
        dot, lowercased and with hyphens replaced by underscores.

        A ValueError raised during a conversion is reported and then
        re-raised, which stops the run.

    """
    # Imported locally; sys is only needed for sys.exc_info() below.
    import sys

    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        name = os.path.split(mapname)[1]
        name = nameprefix + name.replace('-','_').split('.')[0].lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(mappathname)
            if not map:
                print('* map is empty; skipping')
            else:
                # pymap() has to run first: generating the code strips
                # the 'IDENTITY' marker that marshalmap() can't handle.
                pymap(mappathname, map, dirprefix + codefile,name,comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
        except ValueError:
            # sys.exc_info() instead of binding the exception in the
            # except clause keeps this valid on Python 2.5 and 3.x.
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)
            raise
399
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates codec modules from the .mapping files in dir.

        For every <name>.mapping file in dir the marshalled map is
        loaded and the module dirprefix + <name>.py is written.

        A ValueError raised during a conversion is reported and the
        run continues with the next file.

    """
    # Imported locally; sys is only needed for sys.exc_info() below.
    import sys

    for mapname in os.listdir(dir):
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            f = open(os.path.join(dir,mapname), 'rb')
            try:
                map = marshal.load(f)
            finally:
                # Don't leave the mapping file open.
                f.close()
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError:
            # sys.exc_info() instead of binding the exception in the
            # except clause keeps this valid on Python 2.5 and 3.x.
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)
419
if __name__ == '__main__':

    import sys
    # The command line arguments are passed positionally to the
    # selected function.  Change the condition to 0 to regenerate the
    # modules from existing .mapping files instead.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/python/2.5/Tools/unicode/gencodec.py

Download in other formats: