Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

gencodec.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 11.7 KB

Rev	Line
[2]	1	""" Unicode Mapping Parser and Codec Generator.
	2
	3	This script parses Unicode mapping files as available from the Unicode
	4	site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
	5	modules from them. The codecs use the standard character mapping codec
	6	to actually apply the mapping.
	7
	8	Synopsis: gencodec.py dir codec_prefix
	9
	10	All files in dir are scanned and those producing non-empty mappings
	11	will be written to <codec_prefix><mapname>.py with <mapname> being the
	12	first part of the map's filename ('a' in a.b.c.txt) converted to
	13	lowercase with hyphens replaced by underscores.
	14
	15	The tool also writes marshalled versions of the mapping tables to the
	16	same location (with .mapping extension).
	17
	18	Written by Marc-Andre Lemburg (mal@lemburg.com).
	19
	20	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
	21	(c) Copyright Guido van Rossum, 2000.
	22
	23	Table generation:
	24	(c) Copyright Marc-Andre Lemburg, 2005.
	25	Licensed to PSF under a Contributor Agreement.
	26
	27	"""#"
	28
	29	import re, os, marshal, codecs
	30
	31	# Maximum allowed size of charmap tables
	32	MAX_TABLE_SIZE = 8192
	33
	34	# Standard undefined Unicode code point
	35	UNI_UNDEFINED = unichr(0xFFFE)
	36
	37	mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
	38	'\s+'
	39	'((?:(?:0x[0-9a-fA-Z]+\|<[A-Za-z]+>)\+?)*)'
	40	'\s*'
	41	'(#.+)?')
	42
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x41' or '0x0041+0x0301'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) are
        ignored.

        Empty codes or illegal ones are returned as None. This
        includes a single illegal code and a combination in which
        no valid code is left after dropping the illegal parts.

        The len and range parameters are only kept so that the
        signature stays compatible with existing callers; range is
        not used.

    """
    if not codes:
        return None
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code or otherwise unparsable part: ignore it
            continue
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
	69
	70	def readmap(filename):
	71
	72	f = open(filename,'r')
	73	lines = f.readlines()
	74	f.close()
	75	enc2uni = {}
	76	identity = []
	77	unmapped = range(256)
	78
	79	# UTC mapping tables per convention don't include the identity
	80	# mappings for code points 0x00 - 0x1F and 0x7F, unless these are
	81	# explicitly mapped to different characters or undefined
	82	for i in range(32) + [127]:
	83	identity.append(i)
	84	unmapped.remove(i)
	85	enc2uni[i] = (i, 'CONTROL CHARACTER')
	86
	87	for line in lines:
	88	line = line.strip()
	89	if not line or line[0] == '#':
	90	continue
	91	m = mapRE.match(line)
	92	if not m:
	93	#print '* not matched: %s' % repr(line)
	94	continue
	95	enc,uni,comment = m.groups()
	96	enc = parsecodes(enc)
	97	uni = parsecodes(uni)
	98	if comment is None:
	99	comment = ''
	100	else:
	101	comment = comment[1:].strip()
	102	if enc < 256:
	103	if enc in unmapped:
	104	unmapped.remove(enc)
	105	if enc == uni:
	106	identity.append(enc)
	107	enc2uni[enc] = (uni,comment)
	108	else:
	109	enc2uni[enc] = (uni,comment)
	110
	111	# If there are more identity-mapped entries than unmapped entries,
	112	# it pays to generate an identity dictionary first, and add explicit
	113	# mappings to None for the rest
	114	if len(identity) >= len(unmapped):
	115	for enc in unmapped:
	116	enc2uni[enc] = (None, "")
	117	enc2uni['IDENTITY'] = 256
	118
	119	return enc2uni
	120
	121	def hexrepr(t, precision=4):
	122
	123	if t is None:
	124	return 'None'
	125	try:
	126	len(t)
	127	except:
	128	return '0x%0*X' % (precision, t)
	129	try:
	130	return '(' + ', '.join(['0x%0*X' % (precision, item)
	131	for item in t]) + ')'
	132	except TypeError, why:
	133	print '* failed to convert %r: %s' % (t, why)
	134	raise
	135
	136	def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):
	137
	138	l = []
	139	append = l.append
[391]	140	if "IDENTITY" in map:
[2]	141	append("%s = codecs.make_identity_dict(range(%d))" %
	142	(varname, map["IDENTITY"]))
	143	append("%s.update({" % varname)
	144	splits = 1
	145	del map["IDENTITY"]
	146	identity = 1
	147	else:
	148	append("%s = {" % varname)
	149	splits = 0
	150	identity = 0
	151
[391]	152	mappings = sorted(map.items())
[2]	153	i = 0
	154	key_precision, value_precision = precisions
	155	for mapkey, mapvalue in mappings:
	156	mapcomment = ''
	157	if isinstance(mapkey, tuple):
	158	(mapkey, mapcomment) = mapkey
	159	if isinstance(mapvalue, tuple):
	160	(mapvalue, mapcomment) = mapvalue
	161	if mapkey is None:
	162	continue
	163	if (identity and
	164	mapkey == mapvalue and
	165	mapkey < 256):
	166	# No need to include identity mappings, since these
	167	# are already set for the first 256 code points.
	168	continue
	169	key = hexrepr(mapkey, key_precision)
	170	value = hexrepr(mapvalue, value_precision)
	171	if mapcomment and comments:
	172	append(' %s: %s,\t# %s' % (key, value, mapcomment))
	173	else:
	174	append(' %s: %s,' % (key, value))
	175	i += 1
	176	if i == 4096:
	177	# Split the definition into parts to that the Python
	178	# parser doesn't dump core
	179	if splits == 0:
	180	append('}')
	181	else:
	182	append('})')
	183	append('%s.update({' % varname)
	184	i = 0
	185	splits = splits + 1
	186	if splits == 0:
	187	append('}')
	188	else:
	189	append('})')
	190
	191	return l
	192
	193	def python_tabledef_code(varname, map, comments=1, key_precision=2):
	194
	195	l = []
	196	append = l.append
	197	append('%s = (' % varname)
	198
	199	# Analyze map and create table dict
[391]	200	mappings = sorted(map.items())
[2]	201	table = {}
	202	maxkey = 0
[391]	203	if 'IDENTITY' in map:
[2]	204	for key in range(256):
	205	table[key] = (key, '')
	206	maxkey = 255
	207	del map['IDENTITY']
	208	for mapkey, mapvalue in mappings:
	209	mapcomment = ''
	210	if isinstance(mapkey, tuple):
	211	(mapkey, mapcomment) = mapkey
	212	if isinstance(mapvalue, tuple):
	213	(mapvalue, mapcomment) = mapvalue
	214	if mapkey is None:
	215	continue
	216	table[mapkey] = (mapvalue, mapcomment)
	217	if mapkey > maxkey:
	218	maxkey = mapkey
	219	if maxkey > MAX_TABLE_SIZE:
	220	# Table too large
	221	return None
	222
	223	# Create table code
	224	for key in range(maxkey + 1):
	225	if key not in table:
	226	mapvalue = None
	227	mapcomment = 'UNDEFINED'
	228	else:
	229	mapvalue, mapcomment = table[key]
	230	if mapvalue is None:
	231	mapchar = UNI_UNDEFINED
	232	else:
	233	if isinstance(mapvalue, tuple):
	234	# 1-n mappings not supported
	235	return None
	236	else:
	237	mapchar = unichr(mapvalue)
	238	if mapcomment and comments:
	239	append(' %r\t# %s -> %s' % (mapchar,
	240	hexrepr(key, key_precision),
	241	mapcomment))
	242	else:
	243	append(' %r' % mapchar)
	244
	245	append(')')
	246	return l
	247
	248	def codegen(name, map, encodingname, comments=1):
	249
	250	""" Returns Python source for the given map.
	251
	252	Comments are included in the source, if comments is true (default).
	253
	254	"""
	255	# Generate code
	256	decoding_map_code = python_mapdef_code(
	257	'decoding_map',
	258	map,
	259	comments=comments)
	260	decoding_table_code = python_tabledef_code(
	261	'decoding_table',
	262	map,
	263	comments=comments)
	264	encoding_map_code = python_mapdef_code(
	265	'encoding_map',
	266	codecs.make_encoding_map(map),
	267	comments=comments,
	268	precisions=(4, 2))
	269
	270	if decoding_table_code:
	271	suffix = 'table'
	272	else:
	273	suffix = 'map'
	274
	275	l = [
	276	'''\
	277	""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.
	278
	279	"""#"
	280
	281	import codecs
	282
	283	### Codec APIs
	284
	285	class Codec(codecs.Codec):
	286
	287	def encode(self,input,errors='strict'):
	288	return codecs.charmap_encode(input,errors,encoding_%s)
	289
	290	def decode(self,input,errors='strict'):
	291	return codecs.charmap_decode(input,errors,decoding_%s)
	292	''' % (encodingname, name, suffix, suffix)]
	293	l.append('''\
	294	class IncrementalEncoder(codecs.IncrementalEncoder):
	295	def encode(self, input, final=False):
	296	return codecs.charmap_encode(input,self.errors,encoding_%s)[0]
	297
	298	class IncrementalDecoder(codecs.IncrementalDecoder):
	299	def decode(self, input, final=False):
	300	return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
	301	(suffix, suffix))
	302
	303	l.append('''
	304	class StreamWriter(Codec,codecs.StreamWriter):
	305	pass
	306
	307	class StreamReader(Codec,codecs.StreamReader):
	308	pass
	309
	310	### encodings module API
	311
	312	def getregentry():
	313	return codecs.CodecInfo(
	314	name=%r,
	315	encode=Codec().encode,
	316	decode=Codec().decode,
	317	incrementalencoder=IncrementalEncoder,
	318	incrementaldecoder=IncrementalDecoder,
	319	streamreader=StreamReader,
	320	streamwriter=StreamWriter,
	321	)
	322	''' % encodingname.replace('_', '-'))
	323
	324	# Add decoding table or map (with preference to the table)
	325	if not decoding_table_code:
	326	l.append('''
	327	### Decoding Map
	328	''')
	329	l.extend(decoding_map_code)
	330	else:
	331	l.append('''
	332	### Decoding Table
	333	''')
	334	l.extend(decoding_table_code)
	335
	336	# Add encoding map
	337	if decoding_table_code:
	338	l.append('''
	339	### Encoding table
	340	encoding_table=codecs.charmap_build(decoding_table)
	341	''')
	342	else:
	343	l.append('''
	344	### Encoding Map
	345	''')
	346	l.extend(encoding_map_code)
	347
	348	# Final new-line
	349	l.append('')
	350
	351	return '\n'.join(l).expandtabs()
	352
	353	def pymap(name,map,pyfile,encodingname,comments=1):
	354
	355	code = codegen(name,map,encodingname,comments)
	356	f = open(pyfile,'w')
	357	f.write(code)
	358	f.close()
	359
	360	def marshalmap(name,map,marshalfile):
	361
	362	d = {}
	363	for e,(u,c) in map.items():
	364	d[e] = (u,c)
	365	f = open(marshalfile,'wb')
	366	marshal.dump(d,f)
	367	f.close()
	368
	369	def convertdir(dir, dirprefix='', nameprefix='', comments=1):
	370
	371	mapnames = os.listdir(dir)
	372	for mapname in mapnames:
	373	mappathname = os.path.join(dir, mapname)
	374	if not os.path.isfile(mappathname):
	375	continue
	376	name = os.path.split(mapname)[1]
	377	name = name.replace('-','_')
	378	name = name.split('.')[0]
	379	name = name.lower()
	380	name = nameprefix + name
	381	codefile = name + '.py'
	382	marshalfile = name + '.mapping'
	383	print 'converting %s to %s and %s' % (mapname,
	384	dirprefix + codefile,
	385	dirprefix + marshalfile)
	386	try:
	387	map = readmap(os.path.join(dir,mapname))
	388	if not map:
	389	print '* map is empty; skipping'
	390	else:
	391	pymap(mappathname, map, dirprefix + codefile,name,comments)
	392	marshalmap(mappathname, map, dirprefix + marshalfile)
	393	except ValueError, why:
	394	print '* conversion failed: %s' % why
	395	raise
	396
	397	def rewritepythondir(dir, dirprefix='', comments=1):
	398
	399	mapnames = os.listdir(dir)
	400	for mapname in mapnames:
	401	if not mapname.endswith('.mapping'):
	402	continue
	403	name = mapname[:-len('.mapping')]
	404	codefile = name + '.py'
	405	print 'converting %s to %s' % (mapname,
	406	dirprefix + codefile)
	407	try:
	408	map = marshal.load(open(os.path.join(dir,mapname),
	409	'rb'))
	410	if not map:
	411	print '* map is empty; skipping'
	412	else:
	413	pymap(mapname, map, dirprefix + codefile,name,comments)
	414	except ValueError, why:
	415	print '* conversion failed: %s' % why
	416
	417	if __name__ == '__main__':
	418
	419	import sys
	420	if 1:
[391]	421	convertdir(*sys.argv[1:])
[2]	422	else:
[391]	423	rewritepythondir(*sys.argv[1:])

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Tools/unicode/gencodec.py

Download in other formats: