Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

makeunicodedata.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 38.0 KB

Rev	Line
[2]	1	#
	2	# (re)generate unicode property and type databases
	3	#
	4	# this script converts a unicode 3.2 database file to
	5	# Modules/unicodedata_db.h, Modules/unicodename_db.h,
	6	# and Objects/unicodetype_db.h
	7	#
	8	# history:
	9	# 2000-09-24 fl created (based on bits and pieces from unidb)
	10	# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
	11	# 2000-09-25 fl added character type table
	12	# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
	13	# 2000-11-03 fl expand first/last ranges
	14	# 2001-01-19 fl added character name tables (2.1)
	15	# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
	16	# 2002-09-11 wd use string methods
	17	# 2002-10-18 mvl update to Unicode 3.2
	18	# 2002-10-22 mvl generate NFC tables
	19	# 2002-11-24 mvl expand all ranges, sort names version-independently
	20	# 2002-11-25 mvl add UNIDATA_VERSION
	21	# 2004-05-29 perky add east asian width information
	22	# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
	23	#
	24	# written by Fredrik Lundh (fredrik@pythonware.com)
	25	#
	26
	27	import sys
	28
	29	SCRIPT = sys.argv[0]
	30	VERSION = "2.6"
	31
	32	# The Unicode Database
[391]	33	UNIDATA_VERSION = "5.2.0"
[2]	34	UNICODE_DATA = "UnicodeData%s.txt"
	35	COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
	36	EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
[391]	37	UNIHAN = "Unihan%s.txt"
	38	DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
	39	LINE_BREAK = "LineBreak%s.txt"
[2]	40
	41	old_versions = ["3.2.0"]
	42
	43	CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
	44	"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
	45	"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
	46	"So" ]
	47
	48	BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
	49	"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
	50	"ON" ]
	51
	52	EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
	53
[391]	54	MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
	55
[2]	56	# note: should match definitions in Objects/unicodectype.c
	57	ALPHA_MASK = 0x01
	58	DECIMAL_MASK = 0x02
	59	DIGIT_MASK = 0x04
	60	LOWER_MASK = 0x08
	61	LINEBREAK_MASK = 0x10
	62	SPACE_MASK = 0x20
	63	TITLE_MASK = 0x40
	64	UPPER_MASK = 0x80
	65	NODELTA_MASK = 0x100
[391]	66	NUMERIC_MASK = 0x200
[2]	67
	68	def maketables(trace=0):
	69
	70	print "--- Reading", UNICODE_DATA % "", "..."
	71
	72	version = ""
	73	unicode = UnicodeData(UNICODE_DATA % version,
	74	COMPOSITION_EXCLUSIONS % version,
[391]	75	EASTASIAN_WIDTH % version,
	76	UNIHAN % version,
	77	DERIVEDNORMALIZATION_PROPS % version,
	78	LINE_BREAK % version)
[2]	79
	80	print len(filter(None, unicode.table)), "characters"
	81
	82	for version in old_versions:
	83	print "--- Reading", UNICODE_DATA % ("-"+version), "..."
	84	old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
	85	COMPOSITION_EXCLUSIONS % ("-"+version),
[391]	86	EASTASIAN_WIDTH % ("-"+version),
	87	UNIHAN % ("-"+version))
[2]	88	print len(filter(None, old_unicode.table)), "characters"
	89	merge_old_version(version, unicode, old_unicode)
	90
	91	makeunicodename(unicode, trace)
	92	makeunicodedata(unicode, trace)
	93	makeunicodetype(unicode, trace)
	94
	95	# --------------------------------------------------------------------
	96	# unicode character properties
	97
	98	def makeunicodedata(unicode, trace):
	99
[391]	100	dummy = (0, 0, 0, 0, 0, 0)
[2]	101	table = [dummy]
	102	cache = {0: dummy}
	103	index = [0] * len(unicode.chars)
	104
	105	FILE = "Modules/unicodedata_db.h"
	106
	107	print "--- Preparing", FILE, "..."
	108
	109	# 1) database properties
	110
	111	for char in unicode.chars:
	112	record = unicode.table[char]
	113	if record:
	114	# extract database properties
	115	category = CATEGORY_NAMES.index(record[2])
	116	combining = int(record[3])
	117	bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
	118	mirrored = record[9] == "Y"
	119	eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
[391]	120	normalizationquickcheck = record[17]
[2]	121	item = (
[391]	122	category, combining, bidirectional, mirrored, eastasianwidth,
	123	normalizationquickcheck
[2]	124	)
	125	# add entry to index and item tables
	126	i = cache.get(item)
	127	if i is None:
	128	cache[item] = i = len(table)
	129	table.append(item)
	130	index[char] = i
	131
	132	# 2) decomposition data
	133
	134	decomp_data = [0]
	135	decomp_prefix = [""]
	136	decomp_index = [0] * len(unicode.chars)
	137	decomp_size = 0
	138
	139	comp_pairs = []
	140	comp_first = [None] * len(unicode.chars)
	141	comp_last = [None] * len(unicode.chars)
	142
	143	for char in unicode.chars:
	144	record = unicode.table[char]
	145	if record:
	146	if record[5]:
	147	decomp = record[5].split()
	148	if len(decomp) > 19:
	149	raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
	150	# prefix
	151	if decomp[0][0] == "<":
	152	prefix = decomp.pop(0)
	153	else:
	154	prefix = ""
	155	try:
	156	i = decomp_prefix.index(prefix)
	157	except ValueError:
	158	i = len(decomp_prefix)
	159	decomp_prefix.append(prefix)
	160	prefix = i
	161	assert prefix < 256
	162	# content
[391]	163	decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
[2]	164	# Collect NFC pairs
	165	if not prefix and len(decomp) == 3 and \
	166	char not in unicode.exclusions and \
	167	unicode.table[decomp[1]][3] == "0":
	168	p, l, r = decomp
	169	comp_first[l] = 1
	170	comp_last[r] = 1
	171	comp_pairs.append((l,r,char))
	172	try:
	173	i = decomp_data.index(decomp)
	174	except ValueError:
	175	i = len(decomp_data)
	176	decomp_data.extend(decomp)
	177	decomp_size = decomp_size + len(decomp) * 2
	178	else:
	179	i = 0
	180	decomp_index[char] = i
	181
	182	f = l = 0
	183	comp_first_ranges = []
	184	comp_last_ranges = []
	185	prev_f = prev_l = None
	186	for i in unicode.chars:
	187	if comp_first[i] is not None:
	188	comp_first[i] = f
	189	f += 1
	190	if prev_f is None:
	191	prev_f = (i,i)
	192	elif prev_f[1]+1 == i:
	193	prev_f = prev_f[0],i
	194	else:
	195	comp_first_ranges.append(prev_f)
	196	prev_f = (i,i)
	197	if comp_last[i] is not None:
	198	comp_last[i] = l
	199	l += 1
	200	if prev_l is None:
	201	prev_l = (i,i)
	202	elif prev_l[1]+1 == i:
	203	prev_l = prev_l[0],i
	204	else:
	205	comp_last_ranges.append(prev_l)
	206	prev_l = (i,i)
	207	comp_first_ranges.append(prev_f)
	208	comp_last_ranges.append(prev_l)
	209	total_first = f
	210	total_last = l
	211
	212	comp_data = [0](total_firsttotal_last)
	213	for f,l,char in comp_pairs:
	214	f = comp_first[f]
	215	l = comp_last[l]
	216	comp_data[f*total_last+l] = char
	217
	218	print len(table), "unique properties"
	219	print len(decomp_prefix), "unique decomposition prefixes"
	220	print len(decomp_data), "unique decomposition entries:",
	221	print decomp_size, "bytes"
	222	print total_first, "first characters in NFC"
	223	print total_last, "last characters in NFC"
	224	print len(comp_pairs), "NFC pairs"
	225
	226	print "--- Writing", FILE, "..."
	227
	228	fp = open(FILE, "w")
	229	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
	230	print >>fp
	231	print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
	232	print >>fp, "/* a list of unique database records */"
	233	print >>fp, \
	234	"const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
	235	for item in table:
[391]	236	print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
[2]	237	print >>fp, "};"
	238	print >>fp
	239
	240	print >>fp, "/* Reindexing of NFC first characters. */"
	241	print >>fp, "#define TOTAL_FIRST",total_first
	242	print >>fp, "#define TOTAL_LAST",total_last
	243	print >>fp, "struct reindex{int start;short count,index;};"
	244	print >>fp, "static struct reindex nfc_first[] = {"
	245	for start,end in comp_first_ranges:
	246	print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
	247	print >>fp," {0,0,0}"
	248	print >>fp,"};\n"
	249	print >>fp, "static struct reindex nfc_last[] = {"
	250	for start,end in comp_last_ranges:
	251	print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
	252	print >>fp," {0,0,0}"
	253	print >>fp,"};\n"
	254
	255	# FIXME: <fl> the following tables could be made static, and
	256	# the support code moved into unicodedatabase.c
	257
	258	print >>fp, "/* string literals */"
	259	print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
	260	for name in CATEGORY_NAMES:
	261	print >>fp, " \"%s\"," % name
	262	print >>fp, " NULL"
	263	print >>fp, "};"
	264
	265	print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
	266	for name in BIDIRECTIONAL_NAMES:
	267	print >>fp, " \"%s\"," % name
	268	print >>fp, " NULL"
	269	print >>fp, "};"
	270
	271	print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
	272	for name in EASTASIANWIDTH_NAMES:
	273	print >>fp, " \"%s\"," % name
	274	print >>fp, " NULL"
	275	print >>fp, "};"
	276
	277	print >>fp, "static const char *decomp_prefix[] = {"
	278	for name in decomp_prefix:
	279	print >>fp, " \"%s\"," % name
	280	print >>fp, " NULL"
	281	print >>fp, "};"
	282
	283	# split record index table
	284	index1, index2, shift = splitbins(index, trace)
	285
	286	print >>fp, "/* index tables for the database records */"
	287	print >>fp, "#define SHIFT", shift
	288	Array("index1", index1).dump(fp, trace)
	289	Array("index2", index2).dump(fp, trace)
	290
	291	# split decomposition index table
	292	index1, index2, shift = splitbins(decomp_index, trace)
	293
	294	print >>fp, "/* decomposition data */"
	295	Array("decomp_data", decomp_data).dump(fp, trace)
	296
	297	print >>fp, "/* index tables for the decomposition data */"
	298	print >>fp, "#define DECOMP_SHIFT", shift
	299	Array("decomp_index1", index1).dump(fp, trace)
	300	Array("decomp_index2", index2).dump(fp, trace)
	301
	302	index, index2, shift = splitbins(comp_data, trace)
	303	print >>fp, "/* NFC pairs */"
	304	print >>fp, "#define COMP_SHIFT", shift
	305	Array("comp_index", index).dump(fp, trace)
	306	Array("comp_data", index2).dump(fp, trace)
	307
	308	# Generate delta tables for old versions
	309	for version, table, normalization in unicode.changed:
	310	cversion = version.replace(".","_")
	311	records = [table[0]]
	312	cache = {table[0]:0}
	313	index = [0] * len(table)
	314	for i, record in enumerate(table):
	315	try:
	316	index[i] = cache[record]
	317	except KeyError:
	318	index[i] = cache[record] = len(records)
	319	records.append(record)
	320	index1, index2, shift = splitbins(index, trace)
	321	print >>fp, "static const change_record change_records_%s[] = {" % cversion
	322	for record in records:
	323	print >>fp, "\t{ %s }," % ", ".join(map(str,record))
	324	print >>fp, "};"
	325	Array("changes_%s_index" % cversion, index1).dump(fp, trace)
	326	Array("changes_%s_data" % cversion, index2).dump(fp, trace)
	327	print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
	328	print >>fp, "{"
	329	print >>fp, "\tint index;"
	330	print >>fp, "\tif (n >= 0x110000) index = 0;"
	331	print >>fp, "\telse {"
	332	print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
	333	print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
	334	(cversion, shift, ((1<<shift)-1))
	335	print >>fp, "\t}"
	336	print >>fp, "\treturn change_records_%s+index;" % cversion
	337	print >>fp, "}\n"
	338	print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
	339	print >>fp, "{"
	340	print >>fp, "\tswitch(n) {"
	341	for k, v in normalization:
	342	print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
	343	print >>fp, "\tdefault: return 0;"
	344	print >>fp, "\t}\n}\n"
	345
	346	fp.close()
	347
	348	# --------------------------------------------------------------------
	349	# unicode character type tables
	350
	351	def makeunicodetype(unicode, trace):
	352
	353	FILE = "Objects/unicodetype_db.h"
	354
	355	print "--- Preparing", FILE, "..."
	356
	357	# extract unicode types
	358	dummy = (0, 0, 0, 0, 0, 0)
	359	table = [dummy]
	360	cache = {0: dummy}
	361	index = [0] * len(unicode.chars)
[391]	362	numeric = {}
	363	spaces = []
	364	linebreaks = []
[2]	365
	366	for char in unicode.chars:
	367	record = unicode.table[char]
	368	if record:
	369	# extract database properties
	370	category = record[2]
	371	bidirectional = record[4]
[391]	372	properties = record[16]
[2]	373	flags = 0
	374	delta = True
	375	if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
	376	flags \|= ALPHA_MASK
	377	if category == "Ll":
	378	flags \|= LOWER_MASK
[391]	379	if 'Line_Break' in properties or bidirectional == "B":
[2]	380	flags \|= LINEBREAK_MASK
[391]	381	linebreaks.append(char)
[2]	382	if category == "Zs" or bidirectional in ("WS", "B", "S"):
	383	flags \|= SPACE_MASK
[391]	384	spaces.append(char)
[2]	385	if category == "Lt":
	386	flags \|= TITLE_MASK
	387	if category == "Lu":
	388	flags \|= UPPER_MASK
	389	# use delta predictor for upper/lower/title if it fits
	390	if record[12]:
	391	upper = int(record[12], 16)
	392	else:
	393	upper = char
	394	if record[13]:
	395	lower = int(record[13], 16)
	396	else:
	397	lower = char
	398	if record[14]:
	399	title = int(record[14], 16)
	400	else:
	401	# UCD.html says that a missing title char means that
	402	# it defaults to the uppercase character, not to the
	403	# character itself. Apparently, in the current UCD (5.x)
	404	# this feature is never used
	405	title = upper
	406	upper_d = upper - char
	407	lower_d = lower - char
	408	title_d = title - char
	409	if -32768 <= upper_d <= 32767 and \
	410	-32768 <= lower_d <= 32767 and \
	411	-32768 <= title_d <= 32767:
	412	# use deltas
	413	upper = upper_d & 0xffff
	414	lower = lower_d & 0xffff
	415	title = title_d & 0xffff
	416	else:
	417	flags \|= NODELTA_MASK
	418	# decimal digit, integer digit
	419	decimal = 0
	420	if record[6]:
	421	flags \|= DECIMAL_MASK
	422	decimal = int(record[6])
	423	digit = 0
	424	if record[7]:
	425	flags \|= DIGIT_MASK
	426	digit = int(record[7])
[391]	427	if record[8]:
	428	flags \|= NUMERIC_MASK
	429	numeric.setdefault(record[8], []).append(char)
[2]	430	item = (
	431	upper, lower, title, decimal, digit, flags
	432	)
	433	# add entry to index and item tables
	434	i = cache.get(item)
	435	if i is None:
	436	cache[item] = i = len(table)
	437	table.append(item)
	438	index[char] = i
	439
	440	print len(table), "unique character type entries"
[391]	441	print sum(map(len, numeric.values())), "numeric code points"
	442	print len(spaces), "whitespace code points"
	443	print len(linebreaks), "linebreak code points"
[2]	444
	445	print "--- Writing", FILE, "..."
	446
	447	fp = open(FILE, "w")
	448	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
	449	print >>fp
	450	print >>fp, "/* a list of unique character type descriptors */"
	451	print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
	452	for item in table:
	453	print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
	454	print >>fp, "};"
	455	print >>fp
	456
	457	# split decomposition index table
	458	index1, index2, shift = splitbins(index, trace)
	459
	460	print >>fp, "/* type indexes */"
	461	print >>fp, "#define SHIFT", shift
	462	Array("index1", index1).dump(fp, trace)
	463	Array("index2", index2).dump(fp, trace)
	464
[391]	465	# Generate code for _PyUnicode_ToNumeric()
	466	numeric_items = sorted(numeric.items())
	467	print >>fp, '/* Returns the numeric value as double for Unicode characters'
	468	print >>fp, ' * having this property, -1.0 otherwise.'
	469	print >>fp, ' */'
	470	print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
	471	print >>fp, '{'
	472	print >>fp, ' switch (ch) {'
	473	for value, codepoints in numeric_items:
	474	# Turn text into float literals
	475	parts = value.split('/')
	476	parts = [repr(float(part)) for part in parts]
	477	value = '/'.join(parts)
	478
	479	haswide = False
	480	hasnonewide = False
	481	codepoints.sort()
	482	for codepoint in codepoints:
	483	if codepoint < 0x10000:
	484	hasnonewide = True
	485	if codepoint >= 0x10000 and not haswide:
	486	print >>fp, '#ifdef Py_UNICODE_WIDE'
	487	haswide = True
	488	print >>fp, ' case 0x%04X:' % (codepoint,)
	489	if haswide and hasnonewide:
	490	print >>fp, '#endif'
	491	print >>fp, ' return (double) %s;' % (value,)
	492	if haswide and not hasnonewide:
	493	print >>fp, '#endif'
	494	print >>fp,' }'
	495	print >>fp,' return -1.0;'
	496	print >>fp,'}'
	497	print >>fp
	498
	499	# Generate code for _PyUnicode_IsWhitespace()
	500	print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
	501	print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
	502	print >>fp, " */"
	503	print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
	504	print >>fp, '{'
	505	print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
	506	print >>fp, ' return iswspace(ch);'
	507	print >>fp, '#else'
	508	print >>fp, ' switch (ch) {'
	509
	510	haswide = False
	511	hasnonewide = False
	512	for codepoint in sorted(spaces):
	513	if codepoint < 0x10000:
	514	hasnonewide = True
	515	if codepoint >= 0x10000 and not haswide:
	516	print >>fp, '#ifdef Py_UNICODE_WIDE'
	517	haswide = True
	518	print >>fp, ' case 0x%04X:' % (codepoint,)
	519	if haswide and hasnonewide:
	520	print >>fp, '#endif'
	521	print >>fp, ' return 1;'
	522	if haswide and not hasnonewide:
	523	print >>fp, '#endif'
	524
	525	print >>fp,' }'
	526	print >>fp,' return 0;'
	527	print >>fp, '#endif'
	528	print >>fp,'}'
	529	print >>fp
	530
	531	# Generate code for _PyUnicode_IsLinebreak()
	532	print >>fp, "/* Returns 1 for Unicode characters having the line break"
	533	print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
	534	print >>fp, " * type 'B', 0 otherwise."
	535	print >>fp, " */"
	536	print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
	537	print >>fp, '{'
	538	print >>fp, ' switch (ch) {'
	539	haswide = False
	540	hasnonewide = False
	541	for codepoint in sorted(linebreaks):
	542	if codepoint < 0x10000:
	543	hasnonewide = True
	544	if codepoint >= 0x10000 and not haswide:
	545	print >>fp, '#ifdef Py_UNICODE_WIDE'
	546	haswide = True
	547	print >>fp, ' case 0x%04X:' % (codepoint,)
	548	if haswide and hasnonewide:
	549	print >>fp, '#endif'
	550	print >>fp, ' return 1;'
	551	if haswide and not hasnonewide:
	552	print >>fp, '#endif'
	553
	554	print >>fp,' }'
	555	print >>fp,' return 0;'
	556	print >>fp,'}'
	557	print >>fp
	558
[2]	559	fp.close()
	560
	561	# --------------------------------------------------------------------
	562	# unicode name database
	563
	564	def makeunicodename(unicode, trace):
	565
	566	FILE = "Modules/unicodename_db.h"
	567
	568	print "--- Preparing", FILE, "..."
	569
	570	# collect names
	571	names = [None] * len(unicode.chars)
	572
	573	for char in unicode.chars:
	574	record = unicode.table[char]
	575	if record:
	576	name = record[1].strip()
	577	if name and name[0] != "<":
	578	names[char] = name + chr(0)
	579
	580	print len(filter(lambda n: n is not None, names)), "distinct names"
	581
	582	# collect unique words from names (note that we differ between
	583	# words inside a sentence, and words ending a sentence. the
	584	# latter includes the trailing null byte.
	585
	586	words = {}
	587	n = b = 0
	588	for char in unicode.chars:
	589	name = names[char]
	590	if name:
	591	w = name.split()
	592	b = b + len(name)
	593	n = n + len(w)
	594	for w in w:
	595	l = words.get(w)
	596	if l:
	597	l.append(None)
	598	else:
	599	words[w] = [len(words)]
	600
	601	print n, "words in text;", b, "bytes"
	602
	603	wordlist = words.items()
	604
	605	# sort on falling frequency, then by name
[391]	606	def word_key(a):
	607	aword, alist = a
	608	return -len(alist), aword
	609	wordlist.sort(key=word_key)
[2]	610
	611	# figure out how many phrasebook escapes we need
	612	escapes = 0
	613	while escapes * 256 < len(wordlist):
	614	escapes = escapes + 1
	615	print escapes, "escapes"
	616
	617	short = 256 - escapes
	618
	619	assert short > 0
	620
	621	print short, "short indexes in lexicon"
	622
	623	# statistics
	624	n = 0
	625	for i in range(short):
	626	n = n + len(wordlist[i][1])
	627	print n, "short indexes in phrasebook"
	628
	629	# pick the most commonly used words, and sort the rest on falling
	630	# length (to maximize overlap)
	631
	632	wordlist, wordtail = wordlist[:short], wordlist[short:]
[391]	633	wordtail.sort(key=lambda a: a[0], reverse=True)
[2]	634	wordlist.extend(wordtail)
	635
	636	# generate lexicon from words
	637
	638	lexicon_offset = [0]
	639	lexicon = ""
	640	words = {}
	641
	642	# build a lexicon string
	643	offset = 0
	644	for w, x in wordlist:
	645	# encoding: bit 7 indicates last character in word (chr(128)
	646	# indicates the last character in an entire string)
	647	ww = w[:-1] + chr(ord(w[-1])+128)
	648	# reuse string tails, when possible
	649	o = lexicon.find(ww)
	650	if o < 0:
	651	o = offset
	652	lexicon = lexicon + ww
	653	offset = offset + len(w)
	654	words[w] = len(lexicon_offset)
	655	lexicon_offset.append(o)
	656
	657	lexicon = map(ord, lexicon)
	658
	659	# generate phrasebook from names and lexicon
	660	phrasebook = [0]
	661	phrasebook_offset = [0] * len(unicode.chars)
	662	for char in unicode.chars:
	663	name = names[char]
	664	if name:
	665	w = name.split()
	666	phrasebook_offset[char] = len(phrasebook)
	667	for w in w:
	668	i = words[w]
	669	if i < short:
	670	phrasebook.append(i)
	671	else:
	672	# store as two bytes
	673	phrasebook.append((i>>8) + short)
	674	phrasebook.append(i&255)
	675
	676	assert getsize(phrasebook) == 1
	677
	678	#
	679	# unicode name hash table
	680
	681	# extract names
	682	data = []
	683	for char in unicode.chars:
	684	record = unicode.table[char]
	685	if record:
	686	name = record[1].strip()
	687	if name and name[0] != "<":
	688	data.append((name, char))
	689
	690	# the magic number 47 was chosen to minimize the number of
	691	# collisions on the current data set. if you like, change it
	692	# and see what happens...
	693
	694	codehash = Hash("code", data, 47)
	695
	696	print "--- Writing", FILE, "..."
	697
	698	fp = open(FILE, "w")
	699	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
	700	print >>fp
	701	print >>fp, "#define NAME_MAXLEN", 256
	702	print >>fp
	703	print >>fp, "/* lexicon */"
	704	Array("lexicon", lexicon).dump(fp, trace)
	705	Array("lexicon_offset", lexicon_offset).dump(fp, trace)
	706
	707	# split decomposition index table
	708	offset1, offset2, shift = splitbins(phrasebook_offset, trace)
	709
	710	print >>fp, "/* code->name phrasebook */"
	711	print >>fp, "#define phrasebook_shift", shift
	712	print >>fp, "#define phrasebook_short", short
	713
	714	Array("phrasebook", phrasebook).dump(fp, trace)
	715	Array("phrasebook_offset1", offset1).dump(fp, trace)
	716	Array("phrasebook_offset2", offset2).dump(fp, trace)
	717
	718	print >>fp, "/* name->code dictionary */"
	719	codehash.dump(fp, trace)
	720
	721	fp.close()
	722
	723
	724	def merge_old_version(version, new, old):
	725	# Changes to exclusion file not implemented yet
	726	if old.exclusions != new.exclusions:
	727	raise NotImplementedError, "exclusions differ"
	728
	729	# In these change records, 0xFF means "no change"
	730	bidir_changes = [0xFF]*0x110000
	731	category_changes = [0xFF]*0x110000
	732	decimal_changes = [0xFF]*0x110000
	733	mirrored_changes = [0xFF]*0x110000
	734	# In numeric data, 0 means "no change",
	735	# -1 means "did not have a numeric value
	736	numeric_changes = [0] * 0x110000
	737	# normalization_changes is a list of key-value pairs
	738	normalization_changes = []
	739	for i in range(0x110000):
	740	if new.table[i] is None:
	741	# Characters unassigned in the new version ought to
	742	# be unassigned in the old one
	743	assert old.table[i] is None
	744	continue
	745	# check characters unassigned in the old version
	746	if old.table[i] is None:
	747	# category 0 is "unassigned"
	748	category_changes[i] = 0
	749	continue
	750	# check characters that differ
	751	if old.table[i] != new.table[i]:
	752	for k in range(len(old.table[i])):
	753	if old.table[i][k] != new.table[i][k]:
	754	value = old.table[i][k]
	755	if k == 2:
	756	#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
	757	category_changes[i] = CATEGORY_NAMES.index(value)
	758	elif k == 4:
	759	#print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
	760	bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
	761	elif k == 5:
	762	#print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
	763	# We assume that all normalization changes are in 1:1 mappings
	764	assert " " not in value
	765	normalization_changes.append((i, value))
	766	elif k == 6:
	767	#print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
	768	# we only support changes where the old value is a single digit
	769	assert value in "0123456789"
	770	decimal_changes[i] = int(value)
	771	elif k == 8:
	772	# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
	773	# Since 0 encodes "no change", the old value is better not 0
	774	if not value:
	775	numeric_changes[i] = -1
	776	else:
[391]	777	numeric_changes[i] = float(value)
	778	assert numeric_changes[i] not in (0, -1)
[2]	779	elif k == 9:
	780	if value == 'Y':
	781	mirrored_changes[i] = '1'
	782	else:
	783	mirrored_changes[i] = '0'
	784	elif k == 11:
	785	# change to ISO comment, ignore
	786	pass
	787	elif k == 12:
	788	# change to simple uppercase mapping; ignore
	789	pass
	790	elif k == 13:
	791	# change to simple lowercase mapping; ignore
	792	pass
	793	elif k == 14:
	794	# change to simple titlecase mapping; ignore
	795	pass
[391]	796	elif k == 16:
	797	# change to properties; not yet
	798	pass
[2]	799	else:
	800	class Difference(Exception):pass
	801	raise Difference, (hex(i), k, old.table[i], new.table[i])
	802	new.changed.append((version, zip(bidir_changes, category_changes,
	803	decimal_changes, mirrored_changes,
	804	numeric_changes),
	805	normalization_changes))
	806
	807
	808	# --------------------------------------------------------------------
	809	# the following support code is taken from the unidb utilities
	810	# Copyright (c) 1999-2000 by Secret Labs AB
	811
	812	# load a unicode-data file from disk
	813
	814	class UnicodeData:
[391]	815	# Record structure:
	816	# [ID, name, category, combining, bidi, decomp, (6)
	817	# decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
	818	# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
	819	# properties] (17)
[2]	820
[391]	821	def __init__(self, filename, exclusions, eastasianwidth, unihan,
	822	derivednormalizationprops=None, linebreakprops=None,
	823	expand=1):
[2]	824	self.changed = []
	825	file = open(filename)
	826	table = [None] * 0x110000
	827	while 1:
	828	s = file.readline()
	829	if not s:
	830	break
	831	s = s.strip().split(";")
	832	char = int(s[0], 16)
	833	table[char] = s
	834
	835	# expand first-last ranges
	836	if expand:
	837	field = None
	838	for i in range(0, 0x110000):
	839	s = table[i]
	840	if s:
	841	if s[1][-6:] == "First>":
	842	s[1] = ""
	843	field = s
	844	elif s[1][-5:] == "Last>":
	845	s[1] = ""
	846	field = None
	847	elif field:
	848	f2 = field[:]
	849	f2[0] = "%X" % i
	850	table[i] = f2
	851
	852	# public attributes
	853	self.filename = filename
	854	self.table = table
	855	self.chars = range(0x110000) # unicode 3.2
	856
	857	file = open(exclusions)
	858	self.exclusions = {}
	859	for s in file:
	860	s = s.strip()
	861	if not s:
	862	continue
	863	if s[0] == '#':
	864	continue
	865	char = int(s.split()[0],16)
	866	self.exclusions[char] = 1
	867
	868	widths = [None] * 0x110000
	869	for s in open(eastasianwidth):
	870	s = s.strip()
	871	if not s:
	872	continue
	873	if s[0] == '#':
	874	continue
	875	s = s.split()[0].split(';')
	876	if '..' in s[0]:
	877	first, last = [int(c, 16) for c in s[0].split('..')]
	878	chars = range(first, last+1)
	879	else:
	880	chars = [int(s[0], 16)]
	881	for char in chars:
	882	widths[char] = s[1]
	883	for i in range(0, 0x110000):
	884	if table[i] is not None:
	885	table[i].append(widths[i])
	886
[391]	887	for i in range(0, 0x110000):
	888	if table[i] is not None:
	889	table[i].append(set())
	890	if linebreakprops:
	891	for s in open(linebreakprops):
	892	s = s.partition('#')[0]
	893	s = [i.strip() for i in s.split(';')]
	894	if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
	895	continue
	896	if '..' not in s[0]:
	897	first = last = int(s[0], 16)
	898	else:
	899	first, last = [int(c, 16) for c in s[0].split('..')]
	900	for char in range(first, last+1):
	901	table[char][-1].add('Line_Break')
	902
	903	if derivednormalizationprops:
	904	quickchecks = [0] * 0x110000 # default is Yes
	905	qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
	906	for s in open(derivednormalizationprops):
	907	if '#' in s:
	908	s = s[:s.index('#')]
	909	s = [i.strip() for i in s.split(';')]
	910	if len(s) < 2 or s[1] not in qc_order:
	911	continue
	912	quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
	913	quickcheck_shift = qc_order.index(s[1])*2
	914	quickcheck <<= quickcheck_shift
	915	if '..' not in s[0]:
	916	first = last = int(s[0], 16)
	917	else:
	918	first, last = [int(c, 16) for c in s[0].split('..')]
	919	for char in range(first, last+1):
	920	assert not (quickchecks[char]>>quickcheck_shift)&3
	921	quickchecks[char] \|= quickcheck
	922	for i in range(0, 0x110000):
	923	if table[i] is not None:
	924	table[i].append(quickchecks[i])
	925
	926	for line in open(unihan):
	927	if not line.startswith('U+'):
	928	continue
	929	code, tag, value = line.split(None, 3)[:3]
	930	if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
	931	'kOtherNumeric'):
	932	continue
	933	value = value.strip().replace(',', '')
	934	i = int(code[2:], 16)
	935	# Patch the numeric field
	936	if table[i] is not None:
	937	table[i][8] = value
	938
[2]	939	def uselatin1(self):
	940	# restrict character range to ISO Latin 1
	941	self.chars = range(256)
	942
	943	# hash table tools
	944
	945	# this is a straight-forward reimplementation of Python's built-in
	946	# dictionary type, using a static data structure, and a custom string
	947	# hash algorithm.
	948
	949	def myhash(s, magic):
	950	h = 0
	951	for c in map(ord, s.upper()):
	952	h = (h * magic) + c
	953	ix = h & 0xff000000L
	954	if ix:
	955	h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
	956	return h
	957
	958	SIZES = [
	959	(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
	960	(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
	961	(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
	962	(2097152,5), (4194304,3), (8388608,33), (16777216,27)
	963	]
	964
	965	class Hash:
	966	def __init__(self, name, data, magic):
	967	# turn a (key, value) list into a static hash table structure
	968
	969	# determine table size
	970	for size, poly in SIZES:
	971	if size > len(data):
	972	poly = size + poly
	973	break
	974	else:
	975	raise AssertionError, "ran out of polynominals"
	976
	977	print size, "slots in hash table"
	978
	979	table = [None] * size
	980
	981	mask = size-1
	982
	983	n = 0
	984
	985	hash = myhash
	986
	987	# initialize hash table
	988	for key, value in data:
	989	h = hash(key, magic)
	990	i = (~h) & mask
	991	v = table[i]
	992	if v is None:
	993	table[i] = value
	994	continue
	995	incr = (h ^ (h >> 3)) & mask;
	996	if not incr:
	997	incr = mask
	998	while 1:
	999	n = n + 1
	1000	i = (i + incr) & mask
	1001	v = table[i]
	1002	if v is None:
	1003	table[i] = value
	1004	break
	1005	incr = incr << 1
	1006	if incr > mask:
	1007	incr = incr ^ poly
	1008
	1009	print n, "collisions"
	1010	self.collisions = n
	1011
	1012	for i in range(len(table)):
	1013	if table[i] is None:
	1014	table[i] = 0
	1015
	1016	self.data = Array(name + "_hash", table)
	1017	self.magic = magic
	1018	self.name = name
	1019	self.size = size
	1020	self.poly = poly
	1021
	1022	def dump(self, file, trace):
	1023	# write data to file, as a C array
	1024	self.data.dump(file, trace)
	1025	file.write("#define %s_magic %d\n" % (self.name, self.magic))
	1026	file.write("#define %s_size %d\n" % (self.name, self.size))
	1027	file.write("#define %s_poly %d\n" % (self.name, self.poly))
	1028
	1029	# stuff to deal with arrays of unsigned integers
	1030
	1031	class Array:
	1032
	1033	def __init__(self, name, data):
	1034	self.name = name
	1035	self.data = data
	1036
	1037	def dump(self, file, trace=0):
	1038	# write data to file, as a C array
	1039	size = getsize(self.data)
	1040	if trace:
	1041	print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
	1042	file.write("static ")
	1043	if size == 1:
	1044	file.write("unsigned char")
	1045	elif size == 2:
	1046	file.write("unsigned short")
	1047	else:
	1048	file.write("unsigned int")
	1049	file.write(" " + self.name + "[] = {\n")
	1050	if self.data:
	1051	s = " "
	1052	for item in self.data:
	1053	i = str(item) + ", "
	1054	if len(s) + len(i) > 78:
	1055	file.write(s + "\n")
	1056	s = " " + i
	1057	else:
	1058	s = s + i
	1059	if s.strip():
	1060	file.write(s + "\n")
	1061	file.write("};\n\n")
	1062
	1063	def getsize(data):
	1064	# return smallest possible integer size for the given array
	1065	maxdata = max(data)
	1066	if maxdata < 256:
	1067	return 1
	1068	elif maxdata < 65536:
	1069	return 2
	1070	else:
	1071	return 4
	1072
	1073	def splitbins(t, trace=0):
	1074	"""t, trace=0 -> (t1, t2, shift). Split a table to save space.
	1075
	1076	t is a sequence of ints. This function can be useful to save space if
	1077	many of the ints are the same. t1 and t2 are lists of ints, and shift
	1078	is an int, chosen to minimize the combined size of t1 and t2 (in C
	1079	code), and where for each i in range(len(t)),
	1080	t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
	1081	where mask is a bitmask isolating the last "shift" bits.
	1082
	1083	If optional arg trace is non-zero (default zero), progress info
	1084	is printed to sys.stderr. The higher the value, the more info
	1085	you'll get.
	1086	"""
	1087
	1088	if trace:
	1089	def dump(t1, t2, shift, bytes):
	1090	print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
	1091	len(t1), len(t2), shift, bytes)
	1092	print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
	1093	"bytes"
	1094	n = len(t)-1 # last valid index
	1095	maxshift = 0 # the most we can shift n and still have something left
	1096	if n > 0:
	1097	while n >> 1:
	1098	n >>= 1
	1099	maxshift += 1
	1100	del n
	1101	bytes = sys.maxint # smallest total size so far
	1102	t = tuple(t) # so slices can be dict keys
	1103	for shift in range(maxshift + 1):
	1104	t1 = []
	1105	t2 = []
	1106	size = 2**shift
	1107	bincache = {}
	1108	for i in range(0, len(t), size):
	1109	bin = t[i:i+size]
	1110	index = bincache.get(bin)
	1111	if index is None:
	1112	index = len(t2)
	1113	bincache[bin] = index
	1114	t2.extend(bin)
	1115	t1.append(index >> shift)
	1116	# determine memory size
	1117	b = len(t1)getsize(t1) + len(t2)getsize(t2)
	1118	if trace > 1:
	1119	dump(t1, t2, shift, b)
	1120	if b < bytes:
	1121	best = t1, t2, shift
	1122	bytes = b
	1123	t1, t2, shift = best
	1124	if trace:
	1125	print >>sys.stderr, "Best:",
	1126	dump(t1, t2, shift, bytes)
	1127	if __debug__:
	1128	# exhaustively verify that the decomposition is correct
	1129	mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
	1130	for i in xrange(len(t)):
	1131	assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
	1132	return best
	1133
	1134	if __name__ == "__main__":
	1135	maketables(1)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Tools/unicode/makeunicodedata.py

Download in other formats: