Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

makeunicodedata.py@ 389

Last change on this file since 389 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 31.4 KB

Line
1	#
2	# (re)generate unicode property and type databases
3	#
4	# this script converts a unicode 3.2 database file to
5	# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6	# and Objects/unicodetype_db.h
7	#
8	# history:
9	# 2000-09-24 fl created (based on bits and pieces from unidb)
10	# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
11	# 2000-09-25 fl added character type table
12	# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
13	# 2000-11-03 fl expand first/last ranges
14	# 2001-01-19 fl added character name tables (2.1)
15	# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
16	# 2002-09-11 wd use string methods
17	# 2002-10-18 mvl update to Unicode 3.2
18	# 2002-10-22 mvl generate NFC tables
19	# 2002-11-24 mvl expand all ranges, sort names version-independently
20	# 2002-11-25 mvl add UNIDATA_VERSION
21	# 2004-05-29 perky add east asian width information
22	# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
23	#
24	# written by Fredrik Lundh (fredrik@pythonware.com)
25	#
26
27	import sys
28
29	SCRIPT = sys.argv[0]
30	VERSION = "2.6"
31
32	# The Unicode Database
33	UNIDATA_VERSION = "5.1.0"
34	UNICODE_DATA = "UnicodeData%s.txt"
35	COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
36	EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
37
38	old_versions = ["3.2.0"]
39
40	CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
41	"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
42	"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
43	"So" ]
44
45	BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
46	"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
47	"ON" ]
48
49	EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
50
51	# note: should match definitions in Objects/unicodectype.c
52	ALPHA_MASK = 0x01
53	DECIMAL_MASK = 0x02
54	DIGIT_MASK = 0x04
55	LOWER_MASK = 0x08
56	LINEBREAK_MASK = 0x10
57	SPACE_MASK = 0x20
58	TITLE_MASK = 0x40
59	UPPER_MASK = 0x80
60	NODELTA_MASK = 0x100
61
62	def maketables(trace=0):
63
64	print "--- Reading", UNICODE_DATA % "", "..."
65
66	version = ""
67	unicode = UnicodeData(UNICODE_DATA % version,
68	COMPOSITION_EXCLUSIONS % version,
69	EASTASIAN_WIDTH % version)
70
71	print len(filter(None, unicode.table)), "characters"
72
73	for version in old_versions:
74	print "--- Reading", UNICODE_DATA % ("-"+version), "..."
75	old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
76	COMPOSITION_EXCLUSIONS % ("-"+version),
77	EASTASIAN_WIDTH % ("-"+version))
78	print len(filter(None, old_unicode.table)), "characters"
79	merge_old_version(version, unicode, old_unicode)
80
81	makeunicodename(unicode, trace)
82	makeunicodedata(unicode, trace)
83	makeunicodetype(unicode, trace)
84
85	# --------------------------------------------------------------------
86	# unicode character properties
87
88	def makeunicodedata(unicode, trace):
89
90	dummy = (0, 0, 0, 0, 0)
91	table = [dummy]
92	cache = {0: dummy}
93	index = [0] * len(unicode.chars)
94
95	FILE = "Modules/unicodedata_db.h"
96
97	print "--- Preparing", FILE, "..."
98
99	# 1) database properties
100
101	for char in unicode.chars:
102	record = unicode.table[char]
103	if record:
104	# extract database properties
105	category = CATEGORY_NAMES.index(record[2])
106	combining = int(record[3])
107	bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
108	mirrored = record[9] == "Y"
109	eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
110	item = (
111	category, combining, bidirectional, mirrored, eastasianwidth
112	)
113	# add entry to index and item tables
114	i = cache.get(item)
115	if i is None:
116	cache[item] = i = len(table)
117	table.append(item)
118	index[char] = i
119
120	# 2) decomposition data
121
122	decomp_data = [0]
123	decomp_prefix = [""]
124	decomp_index = [0] * len(unicode.chars)
125	decomp_size = 0
126
127	comp_pairs = []
128	comp_first = [None] * len(unicode.chars)
129	comp_last = [None] * len(unicode.chars)
130
131	for char in unicode.chars:
132	record = unicode.table[char]
133	if record:
134	if record[5]:
135	decomp = record[5].split()
136	if len(decomp) > 19:
137	raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
138	# prefix
139	if decomp[0][0] == "<":
140	prefix = decomp.pop(0)
141	else:
142	prefix = ""
143	try:
144	i = decomp_prefix.index(prefix)
145	except ValueError:
146	i = len(decomp_prefix)
147	decomp_prefix.append(prefix)
148	prefix = i
149	assert prefix < 256
150	# content
151	decomp = [prefix + (len(decomp)<<8)] +\
152	map(lambda s: int(s, 16), decomp)
153	# Collect NFC pairs
154	if not prefix and len(decomp) == 3 and \
155	char not in unicode.exclusions and \
156	unicode.table[decomp[1]][3] == "0":
157	p, l, r = decomp
158	comp_first[l] = 1
159	comp_last[r] = 1
160	comp_pairs.append((l,r,char))
161	try:
162	i = decomp_data.index(decomp)
163	except ValueError:
164	i = len(decomp_data)
165	decomp_data.extend(decomp)
166	decomp_size = decomp_size + len(decomp) * 2
167	else:
168	i = 0
169	decomp_index[char] = i
170
171	f = l = 0
172	comp_first_ranges = []
173	comp_last_ranges = []
174	prev_f = prev_l = None
175	for i in unicode.chars:
176	if comp_first[i] is not None:
177	comp_first[i] = f
178	f += 1
179	if prev_f is None:
180	prev_f = (i,i)
181	elif prev_f[1]+1 == i:
182	prev_f = prev_f[0],i
183	else:
184	comp_first_ranges.append(prev_f)
185	prev_f = (i,i)
186	if comp_last[i] is not None:
187	comp_last[i] = l
188	l += 1
189	if prev_l is None:
190	prev_l = (i,i)
191	elif prev_l[1]+1 == i:
192	prev_l = prev_l[0],i
193	else:
194	comp_last_ranges.append(prev_l)
195	prev_l = (i,i)
196	comp_first_ranges.append(prev_f)
197	comp_last_ranges.append(prev_l)
198	total_first = f
199	total_last = l
200
201	comp_data = [0](total_firsttotal_last)
202	for f,l,char in comp_pairs:
203	f = comp_first[f]
204	l = comp_last[l]
205	comp_data[f*total_last+l] = char
206
207	print len(table), "unique properties"
208	print len(decomp_prefix), "unique decomposition prefixes"
209	print len(decomp_data), "unique decomposition entries:",
210	print decomp_size, "bytes"
211	print total_first, "first characters in NFC"
212	print total_last, "last characters in NFC"
213	print len(comp_pairs), "NFC pairs"
214
215	print "--- Writing", FILE, "..."
216
217	fp = open(FILE, "w")
218	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
219	print >>fp
220	print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
221	print >>fp, "/* a list of unique database records */"
222	print >>fp, \
223	"const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
224	for item in table:
225	print >>fp, " {%d, %d, %d, %d, %d}," % item
226	print >>fp, "};"
227	print >>fp
228
229	print >>fp, "/* Reindexing of NFC first characters. */"
230	print >>fp, "#define TOTAL_FIRST",total_first
231	print >>fp, "#define TOTAL_LAST",total_last
232	print >>fp, "struct reindex{int start;short count,index;};"
233	print >>fp, "static struct reindex nfc_first[] = {"
234	for start,end in comp_first_ranges:
235	print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
236	print >>fp," {0,0,0}"
237	print >>fp,"};\n"
238	print >>fp, "static struct reindex nfc_last[] = {"
239	for start,end in comp_last_ranges:
240	print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
241	print >>fp," {0,0,0}"
242	print >>fp,"};\n"
243
244	# FIXME: <fl> the following tables could be made static, and
245	# the support code moved into unicodedatabase.c
246
247	print >>fp, "/* string literals */"
248	print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
249	for name in CATEGORY_NAMES:
250	print >>fp, " \"%s\"," % name
251	print >>fp, " NULL"
252	print >>fp, "};"
253
254	print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
255	for name in BIDIRECTIONAL_NAMES:
256	print >>fp, " \"%s\"," % name
257	print >>fp, " NULL"
258	print >>fp, "};"
259
260	print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
261	for name in EASTASIANWIDTH_NAMES:
262	print >>fp, " \"%s\"," % name
263	print >>fp, " NULL"
264	print >>fp, "};"
265
266	print >>fp, "static const char *decomp_prefix[] = {"
267	for name in decomp_prefix:
268	print >>fp, " \"%s\"," % name
269	print >>fp, " NULL"
270	print >>fp, "};"
271
272	# split record index table
273	index1, index2, shift = splitbins(index, trace)
274
275	print >>fp, "/* index tables for the database records */"
276	print >>fp, "#define SHIFT", shift
277	Array("index1", index1).dump(fp, trace)
278	Array("index2", index2).dump(fp, trace)
279
280	# split decomposition index table
281	index1, index2, shift = splitbins(decomp_index, trace)
282
283	print >>fp, "/* decomposition data */"
284	Array("decomp_data", decomp_data).dump(fp, trace)
285
286	print >>fp, "/* index tables for the decomposition data */"
287	print >>fp, "#define DECOMP_SHIFT", shift
288	Array("decomp_index1", index1).dump(fp, trace)
289	Array("decomp_index2", index2).dump(fp, trace)
290
291	index, index2, shift = splitbins(comp_data, trace)
292	print >>fp, "/* NFC pairs */"
293	print >>fp, "#define COMP_SHIFT", shift
294	Array("comp_index", index).dump(fp, trace)
295	Array("comp_data", index2).dump(fp, trace)
296
297	# Generate delta tables for old versions
298	for version, table, normalization in unicode.changed:
299	cversion = version.replace(".","_")
300	records = [table[0]]
301	cache = {table[0]:0}
302	index = [0] * len(table)
303	for i, record in enumerate(table):
304	try:
305	index[i] = cache[record]
306	except KeyError:
307	index[i] = cache[record] = len(records)
308	records.append(record)
309	index1, index2, shift = splitbins(index, trace)
310	print >>fp, "static const change_record change_records_%s[] = {" % cversion
311	for record in records:
312	print >>fp, "\t{ %s }," % ", ".join(map(str,record))
313	print >>fp, "};"
314	Array("changes_%s_index" % cversion, index1).dump(fp, trace)
315	Array("changes_%s_data" % cversion, index2).dump(fp, trace)
316	print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
317	print >>fp, "{"
318	print >>fp, "\tint index;"
319	print >>fp, "\tif (n >= 0x110000) index = 0;"
320	print >>fp, "\telse {"
321	print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
322	print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
323	(cversion, shift, ((1<<shift)-1))
324	print >>fp, "\t}"
325	print >>fp, "\treturn change_records_%s+index;" % cversion
326	print >>fp, "}\n"
327	print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
328	print >>fp, "{"
329	print >>fp, "\tswitch(n) {"
330	for k, v in normalization:
331	print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
332	print >>fp, "\tdefault: return 0;"
333	print >>fp, "\t}\n}\n"
334
335	fp.close()
336
337	# --------------------------------------------------------------------
338	# unicode character type tables
339
340	def makeunicodetype(unicode, trace):
341
342	FILE = "Objects/unicodetype_db.h"
343
344	print "--- Preparing", FILE, "..."
345
346	# extract unicode types
347	dummy = (0, 0, 0, 0, 0, 0)
348	table = [dummy]
349	cache = {0: dummy}
350	index = [0] * len(unicode.chars)
351
352	for char in unicode.chars:
353	record = unicode.table[char]
354	if record:
355	# extract database properties
356	category = record[2]
357	bidirectional = record[4]
358	flags = 0
359	delta = True
360	if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
361	flags \|= ALPHA_MASK
362	if category == "Ll":
363	flags \|= LOWER_MASK
364	if category == "Zl" or bidirectional == "B":
365	flags \|= LINEBREAK_MASK
366	if category == "Zs" or bidirectional in ("WS", "B", "S"):
367	flags \|= SPACE_MASK
368	if category == "Lt":
369	flags \|= TITLE_MASK
370	if category == "Lu":
371	flags \|= UPPER_MASK
372	# use delta predictor for upper/lower/title if it fits
373	if record[12]:
374	upper = int(record[12], 16)
375	else:
376	upper = char
377	if record[13]:
378	lower = int(record[13], 16)
379	else:
380	lower = char
381	if record[14]:
382	title = int(record[14], 16)
383	else:
384	# UCD.html says that a missing title char means that
385	# it defaults to the uppercase character, not to the
386	# character itself. Apparently, in the current UCD (5.x)
387	# this feature is never used
388	title = upper
389	upper_d = upper - char
390	lower_d = lower - char
391	title_d = title - char
392	if -32768 <= upper_d <= 32767 and \
393	-32768 <= lower_d <= 32767 and \
394	-32768 <= title_d <= 32767:
395	# use deltas
396	upper = upper_d & 0xffff
397	lower = lower_d & 0xffff
398	title = title_d & 0xffff
399	else:
400	flags \|= NODELTA_MASK
401	# decimal digit, integer digit
402	decimal = 0
403	if record[6]:
404	flags \|= DECIMAL_MASK
405	decimal = int(record[6])
406	digit = 0
407	if record[7]:
408	flags \|= DIGIT_MASK
409	digit = int(record[7])
410	item = (
411	upper, lower, title, decimal, digit, flags
412	)
413	# add entry to index and item tables
414	i = cache.get(item)
415	if i is None:
416	cache[item] = i = len(table)
417	table.append(item)
418	index[char] = i
419
420	print len(table), "unique character type entries"
421
422	print "--- Writing", FILE, "..."
423
424	fp = open(FILE, "w")
425	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
426	print >>fp
427	print >>fp, "/* a list of unique character type descriptors */"
428	print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
429	for item in table:
430	print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
431	print >>fp, "};"
432	print >>fp
433
434	# split decomposition index table
435	index1, index2, shift = splitbins(index, trace)
436
437	print >>fp, "/* type indexes */"
438	print >>fp, "#define SHIFT", shift
439	Array("index1", index1).dump(fp, trace)
440	Array("index2", index2).dump(fp, trace)
441
442	fp.close()
443
444	# --------------------------------------------------------------------
445	# unicode name database
446
447	def makeunicodename(unicode, trace):
448
449	FILE = "Modules/unicodename_db.h"
450
451	print "--- Preparing", FILE, "..."
452
453	# collect names
454	names = [None] * len(unicode.chars)
455
456	for char in unicode.chars:
457	record = unicode.table[char]
458	if record:
459	name = record[1].strip()
460	if name and name[0] != "<":
461	names[char] = name + chr(0)
462
463	print len(filter(lambda n: n is not None, names)), "distinct names"
464
465	# collect unique words from names (note that we differ between
466	# words inside a sentence, and words ending a sentence. the
467	# latter includes the trailing null byte.
468
469	words = {}
470	n = b = 0
471	for char in unicode.chars:
472	name = names[char]
473	if name:
474	w = name.split()
475	b = b + len(name)
476	n = n + len(w)
477	for w in w:
478	l = words.get(w)
479	if l:
480	l.append(None)
481	else:
482	words[w] = [len(words)]
483
484	print n, "words in text;", b, "bytes"
485
486	wordlist = words.items()
487
488	# sort on falling frequency, then by name
489	def cmpwords((aword, alist),(bword, blist)):
490	r = -cmp(len(alist),len(blist))
491	if r:
492	return r
493	return cmp(aword, bword)
494	wordlist.sort(cmpwords)
495
496	# figure out how many phrasebook escapes we need
497	escapes = 0
498	while escapes * 256 < len(wordlist):
499	escapes = escapes + 1
500	print escapes, "escapes"
501
502	short = 256 - escapes
503
504	assert short > 0
505
506	print short, "short indexes in lexicon"
507
508	# statistics
509	n = 0
510	for i in range(short):
511	n = n + len(wordlist[i][1])
512	print n, "short indexes in phrasebook"
513
514	# pick the most commonly used words, and sort the rest on falling
515	# length (to maximize overlap)
516
517	wordlist, wordtail = wordlist[:short], wordlist[short:]
518	wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
519	wordlist.extend(wordtail)
520
521	# generate lexicon from words
522
523	lexicon_offset = [0]
524	lexicon = ""
525	words = {}
526
527	# build a lexicon string
528	offset = 0
529	for w, x in wordlist:
530	# encoding: bit 7 indicates last character in word (chr(128)
531	# indicates the last character in an entire string)
532	ww = w[:-1] + chr(ord(w[-1])+128)
533	# reuse string tails, when possible
534	o = lexicon.find(ww)
535	if o < 0:
536	o = offset
537	lexicon = lexicon + ww
538	offset = offset + len(w)
539	words[w] = len(lexicon_offset)
540	lexicon_offset.append(o)
541
542	lexicon = map(ord, lexicon)
543
544	# generate phrasebook from names and lexicon
545	phrasebook = [0]
546	phrasebook_offset = [0] * len(unicode.chars)
547	for char in unicode.chars:
548	name = names[char]
549	if name:
550	w = name.split()
551	phrasebook_offset[char] = len(phrasebook)
552	for w in w:
553	i = words[w]
554	if i < short:
555	phrasebook.append(i)
556	else:
557	# store as two bytes
558	phrasebook.append((i>>8) + short)
559	phrasebook.append(i&255)
560
561	assert getsize(phrasebook) == 1
562
563	#
564	# unicode name hash table
565
566	# extract names
567	data = []
568	for char in unicode.chars:
569	record = unicode.table[char]
570	if record:
571	name = record[1].strip()
572	if name and name[0] != "<":
573	data.append((name, char))
574
575	# the magic number 47 was chosen to minimize the number of
576	# collisions on the current data set. if you like, change it
577	# and see what happens...
578
579	codehash = Hash("code", data, 47)
580
581	print "--- Writing", FILE, "..."
582
583	fp = open(FILE, "w")
584	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
585	print >>fp
586	print >>fp, "#define NAME_MAXLEN", 256
587	print >>fp
588	print >>fp, "/* lexicon */"
589	Array("lexicon", lexicon).dump(fp, trace)
590	Array("lexicon_offset", lexicon_offset).dump(fp, trace)
591
592	# split decomposition index table
593	offset1, offset2, shift = splitbins(phrasebook_offset, trace)
594
595	print >>fp, "/* code->name phrasebook */"
596	print >>fp, "#define phrasebook_shift", shift
597	print >>fp, "#define phrasebook_short", short
598
599	Array("phrasebook", phrasebook).dump(fp, trace)
600	Array("phrasebook_offset1", offset1).dump(fp, trace)
601	Array("phrasebook_offset2", offset2).dump(fp, trace)
602
603	print >>fp, "/* name->code dictionary */"
604	codehash.dump(fp, trace)
605
606	fp.close()
607
608
609	def merge_old_version(version, new, old):
610	# Changes to exclusion file not implemented yet
611	if old.exclusions != new.exclusions:
612	raise NotImplementedError, "exclusions differ"
613
614	# In these change records, 0xFF means "no change"
615	bidir_changes = [0xFF]*0x110000
616	category_changes = [0xFF]*0x110000
617	decimal_changes = [0xFF]*0x110000
618	mirrored_changes = [0xFF]*0x110000
619	# In numeric data, 0 means "no change",
620	# -1 means "did not have a numeric value
621	numeric_changes = [0] * 0x110000
622	# normalization_changes is a list of key-value pairs
623	normalization_changes = []
624	for i in range(0x110000):
625	if new.table[i] is None:
626	# Characters unassigned in the new version ought to
627	# be unassigned in the old one
628	assert old.table[i] is None
629	continue
630	# check characters unassigned in the old version
631	if old.table[i] is None:
632	# category 0 is "unassigned"
633	category_changes[i] = 0
634	continue
635	# check characters that differ
636	if old.table[i] != new.table[i]:
637	for k in range(len(old.table[i])):
638	if old.table[i][k] != new.table[i][k]:
639	value = old.table[i][k]
640	if k == 2:
641	#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
642	category_changes[i] = CATEGORY_NAMES.index(value)
643	elif k == 4:
644	#print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
645	bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
646	elif k == 5:
647	#print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
648	# We assume that all normalization changes are in 1:1 mappings
649	assert " " not in value
650	normalization_changes.append((i, value))
651	elif k == 6:
652	#print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
653	# we only support changes where the old value is a single digit
654	assert value in "0123456789"
655	decimal_changes[i] = int(value)
656	elif k == 8:
657	# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
658	# Since 0 encodes "no change", the old value is better not 0
659	assert value != "0" and value != "-1"
660	if not value:
661	numeric_changes[i] = -1
662	else:
663	assert re.match("^[0-9]+$", value)
664	numeric_changes[i] = int(value)
665	elif k == 9:
666	if value == 'Y':
667	mirrored_changes[i] = '1'
668	else:
669	mirrored_changes[i] = '0'
670	elif k == 11:
671	# change to ISO comment, ignore
672	pass
673	elif k == 12:
674	# change to simple uppercase mapping; ignore
675	pass
676	elif k == 13:
677	# change to simple lowercase mapping; ignore
678	pass
679	elif k == 14:
680	# change to simple titlecase mapping; ignore
681	pass
682	else:
683	class Difference(Exception):pass
684	raise Difference, (hex(i), k, old.table[i], new.table[i])
685	new.changed.append((version, zip(bidir_changes, category_changes,
686	decimal_changes, mirrored_changes,
687	numeric_changes),
688	normalization_changes))
689
690
691	# --------------------------------------------------------------------
692	# the following support code is taken from the unidb utilities
693	# Copyright (c) 1999-2000 by Secret Labs AB
694
695	# load a unicode-data file from disk
696
697	import sys
698
699	class UnicodeData:
700
701	def __init__(self, filename, exclusions, eastasianwidth, expand=1):
702	self.changed = []
703	file = open(filename)
704	table = [None] * 0x110000
705	while 1:
706	s = file.readline()
707	if not s:
708	break
709	s = s.strip().split(";")
710	char = int(s[0], 16)
711	table[char] = s
712
713	# expand first-last ranges
714	if expand:
715	field = None
716	for i in range(0, 0x110000):
717	s = table[i]
718	if s:
719	if s[1][-6:] == "First>":
720	s[1] = ""
721	field = s
722	elif s[1][-5:] == "Last>":
723	s[1] = ""
724	field = None
725	elif field:
726	f2 = field[:]
727	f2[0] = "%X" % i
728	table[i] = f2
729
730	# public attributes
731	self.filename = filename
732	self.table = table
733	self.chars = range(0x110000) # unicode 3.2
734
735	file = open(exclusions)
736	self.exclusions = {}
737	for s in file:
738	s = s.strip()
739	if not s:
740	continue
741	if s[0] == '#':
742	continue
743	char = int(s.split()[0],16)
744	self.exclusions[char] = 1
745
746	widths = [None] * 0x110000
747	for s in open(eastasianwidth):
748	s = s.strip()
749	if not s:
750	continue
751	if s[0] == '#':
752	continue
753	s = s.split()[0].split(';')
754	if '..' in s[0]:
755	first, last = [int(c, 16) for c in s[0].split('..')]
756	chars = range(first, last+1)
757	else:
758	chars = [int(s[0], 16)]
759	for char in chars:
760	widths[char] = s[1]
761	for i in range(0, 0x110000):
762	if table[i] is not None:
763	table[i].append(widths[i])
764
765	def uselatin1(self):
766	# restrict character range to ISO Latin 1
767	self.chars = range(256)
768
769	# hash table tools
770
771	# this is a straight-forward reimplementation of Python's built-in
772	# dictionary type, using a static data structure, and a custom string
773	# hash algorithm.
774
775	def myhash(s, magic):
776	h = 0
777	for c in map(ord, s.upper()):
778	h = (h * magic) + c
779	ix = h & 0xff000000L
780	if ix:
781	h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
782	return h
783
784	SIZES = [
785	(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
786	(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
787	(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
788	(2097152,5), (4194304,3), (8388608,33), (16777216,27)
789	]
790
791	class Hash:
792	def __init__(self, name, data, magic):
793	# turn a (key, value) list into a static hash table structure
794
795	# determine table size
796	for size, poly in SIZES:
797	if size > len(data):
798	poly = size + poly
799	break
800	else:
801	raise AssertionError, "ran out of polynominals"
802
803	print size, "slots in hash table"
804
805	table = [None] * size
806
807	mask = size-1
808
809	n = 0
810
811	hash = myhash
812
813	# initialize hash table
814	for key, value in data:
815	h = hash(key, magic)
816	i = (~h) & mask
817	v = table[i]
818	if v is None:
819	table[i] = value
820	continue
821	incr = (h ^ (h >> 3)) & mask;
822	if not incr:
823	incr = mask
824	while 1:
825	n = n + 1
826	i = (i + incr) & mask
827	v = table[i]
828	if v is None:
829	table[i] = value
830	break
831	incr = incr << 1
832	if incr > mask:
833	incr = incr ^ poly
834
835	print n, "collisions"
836	self.collisions = n
837
838	for i in range(len(table)):
839	if table[i] is None:
840	table[i] = 0
841
842	self.data = Array(name + "_hash", table)
843	self.magic = magic
844	self.name = name
845	self.size = size
846	self.poly = poly
847
848	def dump(self, file, trace):
849	# write data to file, as a C array
850	self.data.dump(file, trace)
851	file.write("#define %s_magic %d\n" % (self.name, self.magic))
852	file.write("#define %s_size %d\n" % (self.name, self.size))
853	file.write("#define %s_poly %d\n" % (self.name, self.poly))
854
855	# stuff to deal with arrays of unsigned integers
856
857	class Array:
858
859	def __init__(self, name, data):
860	self.name = name
861	self.data = data
862
863	def dump(self, file, trace=0):
864	# write data to file, as a C array
865	size = getsize(self.data)
866	if trace:
867	print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
868	file.write("static ")
869	if size == 1:
870	file.write("unsigned char")
871	elif size == 2:
872	file.write("unsigned short")
873	else:
874	file.write("unsigned int")
875	file.write(" " + self.name + "[] = {\n")
876	if self.data:
877	s = " "
878	for item in self.data:
879	i = str(item) + ", "
880	if len(s) + len(i) > 78:
881	file.write(s + "\n")
882	s = " " + i
883	else:
884	s = s + i
885	if s.strip():
886	file.write(s + "\n")
887	file.write("};\n\n")
888
889	def getsize(data):
890	# return smallest possible integer size for the given array
891	maxdata = max(data)
892	if maxdata < 256:
893	return 1
894	elif maxdata < 65536:
895	return 2
896	else:
897	return 4
898
899	def splitbins(t, trace=0):
900	"""t, trace=0 -> (t1, t2, shift). Split a table to save space.
901
902	t is a sequence of ints. This function can be useful to save space if
903	many of the ints are the same. t1 and t2 are lists of ints, and shift
904	is an int, chosen to minimize the combined size of t1 and t2 (in C
905	code), and where for each i in range(len(t)),
906	t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
907	where mask is a bitmask isolating the last "shift" bits.
908
909	If optional arg trace is non-zero (default zero), progress info
910	is printed to sys.stderr. The higher the value, the more info
911	you'll get.
912	"""
913
914	import sys
915	if trace:
916	def dump(t1, t2, shift, bytes):
917	print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
918	len(t1), len(t2), shift, bytes)
919	print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
920	"bytes"
921	n = len(t)-1 # last valid index
922	maxshift = 0 # the most we can shift n and still have something left
923	if n > 0:
924	while n >> 1:
925	n >>= 1
926	maxshift += 1
927	del n
928	bytes = sys.maxint # smallest total size so far
929	t = tuple(t) # so slices can be dict keys
930	for shift in range(maxshift + 1):
931	t1 = []
932	t2 = []
933	size = 2**shift
934	bincache = {}
935	for i in range(0, len(t), size):
936	bin = t[i:i+size]
937	index = bincache.get(bin)
938	if index is None:
939	index = len(t2)
940	bincache[bin] = index
941	t2.extend(bin)
942	t1.append(index >> shift)
943	# determine memory size
944	b = len(t1)getsize(t1) + len(t2)getsize(t2)
945	if trace > 1:
946	dump(t1, t2, shift, b)
947	if b < bytes:
948	best = t1, t2, shift
949	bytes = b
950	t1, t2, shift = best
951	if trace:
952	print >>sys.stderr, "Best:",
953	dump(t1, t2, shift, bytes)
954	if __debug__:
955	# exhaustively verify that the decomposition is correct
956	mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
957	for i in xrange(len(t)):
958	assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
959	return best
960
961	if __name__ == "__main__":
962	maketables(1)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Tools/unicode/makeunicodedata.py@ 389

Download in other formats: