Context Navigation

makeunicodedata.py@ 1251

Last change on this file since 1251 was 391, checked in by dmik, 12 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 38.0 KB

Line
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
26
import sys
28
# Name of this script and version of the generator; both are written into
# the header comment of every generated file.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
# File name templates; "%s" receives "" for the current database or
# "-<version>" for an old one (see maketables).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

# Old database versions for which delta tables are generated.
old_versions = ["3.2.0"]

# The position of a name in these lists is the numeric code stored in the
# generated tables, so the order must not change.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that UnicodeData tags with the 'Line_Break' property.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
67
def maketables(trace=0):
    """Read the Unicode database files and regenerate all three headers.

    Loads the current database, loads every version listed in
    old_versions and merges its differences into the current one, then
    calls makeunicodename, makeunicodedata and makeunicodetype.

    trace is passed through unchanged to the three generator functions.
    """
    print("--- Reading %s ..." % (UNICODE_DATA % ""))

    version = ""
    current = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print("%s characters" % len([rec for rec in current.table if rec]))

    for version in old_versions:
        suffix = "-" + version
        print("--- Reading %s ..." % (UNICODE_DATA % suffix))
        # Old versions are loaded without normalization and line-break
        # property files.
        previous = UnicodeData(UNICODE_DATA % suffix,
                               COMPOSITION_EXCLUSIONS % suffix,
                               EASTASIAN_WIDTH % suffix,
                               UNIHAN % suffix)
        print("%s characters" % len([rec for rec in previous.table if rec]))
        merge_old_version(version, current, previous)

    makeunicodename(current, trace)
    makeunicodedata(current, trace)
    makeunicodetype(current, trace)
94
# --------------------------------------------------------------------
# unicode character properties
97
def _number_marked(marks, chars):
    """Number the marked code points and group them into contiguous runs.

    marks is a list indexed by code point that holds None for unmarked
    entries.  Every non-None entry is overwritten in place with its
    running number (0, 1, 2, ...) in the iteration order of chars.

    Returns (ranges, count): ranges is a list of (first, last) pairs, one
    per run of consecutive marked code points, and count is the number of
    marked code points.  ranges is empty when nothing is marked.
    """
    ranges = []
    count = 0
    run = None
    for i in chars:
        if marks[i] is None:
            continue
        marks[i] = count
        count += 1
        if run is None:
            run = (i, i)
        elif run[1] + 1 == i:
            run = (run[0], i)
        else:
            ranges.append(run)
            run = (i, i)
    if run is not None:
        ranges.append(run)
    return ranges, count


def _write_string_array(fp, declaration, names):
    """Write names to fp as a NULL-terminated C array of string literals.

    declaration is the complete opening line, including the trailing "{".
    """
    fp.write(declaration + "\n")
    for name in names:
        fp.write(" \"%s\",\n" % name)
    fp.write(" NULL\n")
    fp.write("};\n")


def _write_reindex_table(fp, name, ranges, numbering):
    """Write one "struct reindex" array terminated by a {0,0,0} entry.

    Each entry holds the first code point of a run, the run length minus
    one, and the number assigned to that first code point.
    """
    fp.write("static struct reindex %s[] = {\n" % name)
    for start, end in ranges:
        fp.write(" { %d, %d, %d},\n" % (start, end - start, numbering[start]))
    fp.write(" {0,0,0}\n")
    fp.write("};\n\n")


def _write_change_tables(fp, version, table, normalization, trace):
    """Write the delta tables and lookup functions for one old version.

    table is the per-code-point list of change records and normalization
    the list of (code point, old decomposition) pairs, both as stored in
    UnicodeData.changed by merge_old_version.
    """
    cversion = version.replace(".", "_")
    records = [table[0]]
    cache = {table[0]: 0}
    index = [0] * len(table)
    for i, record in enumerate(table):
        try:
            index[i] = cache[record]
        except KeyError:
            index[i] = cache[record] = len(records)
            records.append(record)
    index1, index2, shift = splitbins(index, trace)
    fp.write("static const change_record change_records_%s[] = {\n" % cversion)
    for record in records:
        fp.write("\t{ %s },\n" % ", ".join(map(str, record)))
    fp.write("};\n")
    Array("changes_%s_index" % cversion, index1).dump(fp, trace)
    Array("changes_%s_data" % cversion, index2).dump(fp, trace)
    fp.write("static const change_record* get_change_%s(Py_UCS4 n)\n" % cversion)
    fp.write("{\n")
    fp.write("\tint index;\n")
    fp.write("\tif (n >= 0x110000) index = 0;\n")
    fp.write("\telse {\n")
    fp.write("\t\tindex = changes_%s_index[n>>%d];\n" % (cversion, shift))
    fp.write("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];\n" %
             (cversion, shift, ((1 << shift) - 1)))
    fp.write("\t}\n")
    fp.write("\treturn change_records_%s+index;\n" % cversion)
    fp.write("}\n\n")
    fp.write("static Py_UCS4 normalization_%s(Py_UCS4 n)\n" % cversion)
    fp.write("{\n")
    fp.write("\tswitch(n) {\n")
    for k, v in normalization:
        fp.write("\tcase %s: return 0x%s;\n" % (hex(k), v))
    fp.write("\tdefault: return 0;\n")
    fp.write("\t}\n}\n\n")


def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h from a loaded database.

    unicode is a UnicodeData instance; its chars, table, exclusions and
    changed attributes are read.  trace is forwarded to splitbins and
    Array.dump.

    The header receives the unique property records and their index
    tables, the decomposition data, the NFC composition tables and, for
    every entry of unicode.changed, the delta tables of an old version.

    Raises Exception if a character has more than 19 decomposition fields.
    """
    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing %s ..." % FILE)

    # 1) database properties

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps record -> position in table.  The dummy record is registered
    # under index 0 (the original had key and value swapped here).
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        item = (
            CATEGORY_NAMES.index(record[2]),        # category
            int(record[3]),                         # combining
            BIDIRECTIONAL_NAMES.index(record[4]),   # bidirectional
            record[9] == "Y",                       # mirrored
            EASTASIANWIDTH_NAMES.index(record[15]), # east asian width
            record[17],                             # normalization quick check
            )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record or not record[5]:
            continue  # no decomposition: decomp_index[char] stays 0
        decomp = record[5].split()
        if len(decomp) > 19:
            raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
        # prefix
        if decomp[0][0] == "<":
            prefix = decomp.pop(0)
        else:
            prefix = ""
        try:
            prefix = decomp_prefix.index(prefix)
        except ValueError:
            decomp_prefix.append(prefix)
            prefix = len(decomp_prefix) - 1
        assert prefix < 256
        # content: a header word (prefix index + length<<8), then code points
        decomp = [prefix + (len(decomp) << 8)] + [int(s, 16) for s in decomp]
        # Collect NFC pairs: canonical (no prefix) two-character
        # decompositions that are not excluded and whose first character
        # has combining class 0.
        if not prefix and len(decomp) == 3 and \
           char not in unicode.exclusions and \
           unicode.table[decomp[1]][3] == "0":
            p, l, r = decomp
            comp_first[l] = 1
            comp_last[r] = 1
            comp_pairs.append((l, r, char))
        # The original looked decomp up with decomp_data.index(decomp);
        # a list never equals one of the ints stored there, so the lookup
        # always failed after a full scan.  Appending directly produces
        # the same table without the scan.
        decomp_index[char] = len(decomp_data)
        decomp_data.extend(decomp)
        decomp_size = decomp_size + len(decomp) * 2

    # Replace the marks by consecutive numbers and collect the runs.
    comp_first_ranges, total_first = _number_marked(comp_first, unicode.chars)
    comp_last_ranges, total_last = _number_marked(comp_last, unicode.chars)

    # total_first x total_last matrix, stored row by row
    comp_data = [0] * (total_first * total_last)
    for first, last, char in comp_pairs:
        comp_data[comp_first[first] * total_last + comp_last[last]] = char

    print("%s unique properties" % len(table))
    print("%s unique decomposition prefixes" % len(decomp_prefix))
    print("%s unique decomposition entries: %s bytes"
          % (len(decomp_data), decomp_size))
    print("%s first characters in NFC" % total_first)
    print("%s last characters in NFC" % total_last)
    print("%s NFC pairs" % len(comp_pairs))

    print("--- Writing %s ..." % FILE)

    with open(FILE, "w") as fp:
        fp.write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
        fp.write("\n")
        fp.write('#define UNIDATA_VERSION "%s"\n' % UNIDATA_VERSION)
        fp.write("/* a list of unique database records */\n")
        fp.write("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {\n")
        for item in table:
            fp.write(" {%d, %d, %d, %d, %d, %d},\n" % item)
        fp.write("};\n")
        fp.write("\n")

        fp.write("/* Reindexing of NFC first characters. */\n")
        fp.write("#define TOTAL_FIRST %s\n" % total_first)
        fp.write("#define TOTAL_LAST %s\n" % total_last)
        fp.write("struct reindex{int start;short count,index;};\n")
        _write_reindex_table(fp, "nfc_first", comp_first_ranges, comp_first)
        _write_reindex_table(fp, "nfc_last", comp_last_ranges, comp_last)

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fp.write("/* string literals */\n")
        _write_string_array(fp, "const char *_PyUnicode_CategoryNames[] = {",
                            CATEGORY_NAMES)
        _write_string_array(fp, "const char *_PyUnicode_BidirectionalNames[] = {",
                            BIDIRECTIONAL_NAMES)
        _write_string_array(fp, "const char *_PyUnicode_EastAsianWidthNames[] = {",
                            EASTASIANWIDTH_NAMES)
        _write_string_array(fp, "static const char *decomp_prefix[] = {",
                            decomp_prefix)

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fp.write("/* index tables for the database records */\n")
        fp.write("#define SHIFT %s\n" % shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fp.write("/* decomposition data */\n")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fp.write("/* index tables for the decomposition data */\n")
        fp.write("#define DECOMP_SHIFT %s\n" % shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index1, index2, shift = splitbins(comp_data, trace)
        fp.write("/* NFC pairs */\n")
        fp.write("#define COMP_SHIFT %s\n" % shift)
        Array("comp_index", index1).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, changes, normalization in unicode.changed:
            _write_change_tables(fp, version, changes, normalization, trace)
347
# --------------------------------------------------------------------
# unicode character type tables
350
def _write_switch_cases(fp, codepoints, return_line):
    """Write the case labels for codepoints followed by return_line.

    codepoints must be in ascending order.  Labels for code points at or
    above 0x10000 are wrapped in "#ifdef Py_UNICODE_WIDE".  When every
    label is wide the return statement is placed inside the #ifdef as
    well; otherwise the #endif is written before it.
    """
    haswide = False
    hasnonewide = False
    for codepoint in codepoints:
        if codepoint < 0x10000:
            hasnonewide = True
        elif not haswide:
            fp.write('#ifdef Py_UNICODE_WIDE\n')
            haswide = True
        fp.write(' case 0x%04X:\n' % (codepoint,))
    if haswide and hasnonewide:
        fp.write('#endif\n')
    fp.write(return_line + '\n')
    if haswide and not hasnonewide:
        fp.write('#endif\n')


def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from a loaded database.

    unicode is a UnicodeData instance; its chars and table attributes are
    read.  trace is forwarded to splitbins and Array.dump.

    The header receives the unique type records (case mappings, decimal
    and digit values, flag bits), the index tables that select a record
    per code point, and C switch statements implementing
    _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace and
    _PyUnicode_IsLinebreak.
    """
    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing %s ..." % FILE)

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps record -> position in table.  The original had key and value
    # swapped, so the first character whose record is all zero added a
    # second copy of the dummy record; registering it properly lets such
    # characters share entry 0.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric value text -> list of code points
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        category = record[2]
        bidirectional = record[4]
        properties = record[16]
        flags = 0
        if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
            flags |= ALPHA_MASK
        if category == "Ll":
            flags |= LOWER_MASK
        if 'Line_Break' in properties or bidirectional == "B":
            flags |= LINEBREAK_MASK
            linebreaks.append(char)
        if category == "Zs" or bidirectional in ("WS", "B", "S"):
            flags |= SPACE_MASK
            spaces.append(char)
        if category == "Lt":
            flags |= TITLE_MASK
        if category == "Lu":
            flags |= UPPER_MASK
        # use delta predictor for upper/lower/title if it fits
        upper = int(record[12], 16) if record[12] else char
        lower = int(record[13], 16) if record[13] else char
        if record[14]:
            title = int(record[14], 16)
        else:
            # UCD.html says that a missing title char means that
            # it defaults to the uppercase character, not to the
            # character itself. Apparently, in the current UCD (5.x)
            # this feature is never used
            title = upper
        upper_d = upper - char
        lower_d = lower - char
        title_d = title - char
        if -32768 <= upper_d <= 32767 and \
           -32768 <= lower_d <= 32767 and \
           -32768 <= title_d <= 32767:
            # use deltas, stored as unsigned 16-bit values
            upper = upper_d & 0xffff
            lower = lower_d & 0xffff
            title = title_d & 0xffff
        else:
            # the mappings are stored as absolute code points
            flags |= NODELTA_MASK
        # decimal digit, integer digit
        decimal = 0
        if record[6]:
            flags |= DECIMAL_MASK
            decimal = int(record[6])
        digit = 0
        if record[7]:
            flags |= DIGIT_MASK
            digit = int(record[7])
        if record[8]:
            flags |= NUMERIC_MASK
            numeric.setdefault(record[8], []).append(char)
        item = (
            upper, lower, title, decimal, digit, flags
            )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    print("%s unique character type entries" % len(table))
    print("%s numeric code points" % sum(map(len, numeric.values())))
    print("%s whitespace code points" % len(spaces))
    print("%s linebreak code points" % len(linebreaks))

    print("--- Writing %s ..." % FILE)

    with open(FILE, "w") as fp:
        fp.write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
        fp.write("\n")
        fp.write("/* a list of unique character type descriptors */\n")
        fp.write("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {\n")
        for item in table:
            fp.write(" {%d, %d, %d, %d, %d, %d},\n" % item)
        fp.write("};\n")
        fp.write("\n")

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        fp.write("/* type indexes */\n")
        fp.write("#define SHIFT %s\n" % shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        fp.write('/* Returns the numeric value as double for Unicode characters\n')
        fp.write(' * having this property, -1.0 otherwise.\n')
        fp.write(' */\n')
        fp.write('double _PyUnicode_ToNumeric(Py_UNICODE ch)\n')
        fp.write('{\n')
        fp.write(' switch (ch) {\n')
        for value, codepoints in sorted(numeric.items()):
            # Turn text into float literals ("1/2" becomes "1.0/2.0")
            value = '/'.join([repr(float(part)) for part in value.split('/')])
            _write_switch_cases(fp, sorted(codepoints),
                                ' return (double) %s;' % (value,))
        fp.write(' }\n')
        fp.write(' return -1.0;\n')
        fp.write('}\n')
        fp.write('\n')

        # Generate code for _PyUnicode_IsWhitespace()
        fp.write("/* Returns 1 for Unicode characters having the bidirectional\n")
        fp.write(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.\n")
        fp.write(" */\n")
        fp.write('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)\n')
        fp.write('{\n')
        fp.write('#ifdef WANT_WCTYPE_FUNCTIONS\n')
        fp.write(' return iswspace(ch);\n')
        fp.write('#else\n')
        fp.write(' switch (ch) {\n')
        _write_switch_cases(fp, sorted(spaces), ' return 1;')
        fp.write(' }\n')
        fp.write(' return 0;\n')
        fp.write('#endif\n')
        fp.write('}\n')
        fp.write('\n')

        # Generate code for _PyUnicode_IsLinebreak()
        fp.write("/* Returns 1 for Unicode characters having the line break\n")
        fp.write(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional\n")
        fp.write(" * type 'B', 0 otherwise.\n")
        fp.write(" */\n")
        fp.write('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)\n')
        fp.write('{\n')
        fp.write(' switch (ch) {\n')
        _write_switch_cases(fp, sorted(linebreaks), ' return 1;')
        fp.write(' }\n')
        fp.write(' return 0;\n')
        fp.write('}\n')
        fp.write('\n')
560
# --------------------------------------------------------------------
# unicode name database
563
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h from a loaded database.

    unicode is a UnicodeData instance; its chars and table attributes are
    read.  trace is forwarded to splitbins, Array.dump and Hash.dump.

    Character names are split into words.  The header receives a lexicon
    of those words, a phrasebook that spells every name as a sequence of
    lexicon indexes (code -> name), and a hash table over the names
    (name -> code).
    """
    FILE = "Modules/unicodename_db.h"

    print("--- Preparing %s ..." % FILE)

    # collect names; records whose name is empty or starts with "<" are
    # skipped
    names = [None] * len(unicode.chars)
    data = []       # (name, code point) pairs for the name hash table
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)
                data.append((name, char))

    print("%s distinct names" % len(data))

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence. the
    # latter includes the trailing null byte.

    words = {}      # word -> list with one element per occurrence
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            parts = name.split()
            b = b + len(name)
            n = n + len(parts)
            for word in parts:
                occurrences = words.get(word)
                if occurrences:
                    occurrences.append(None)
                else:
                    words[word] = [len(words)]

    print("%s words in text; %s bytes" % (n, b))

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist = sorted(words.items(), key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print("%s escapes" % escapes)

    short = 256 - escapes

    assert short > 0

    print("%s short indexes in lexicon" % short)

    # statistics; slicing keeps this working when there are fewer words
    # than short indexes (indexing each position raised IndexError)
    n = sum([len(occurrences) for word, occurrences in wordlist[:short]])
    print("%s short indexes in phrasebook" % n)

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)
    # NOTE(review): the sort key below is the word itself, i.e. reverse
    # alphabetical order rather than length; kept as is so the generated
    # tables do not change.

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}      # from here on: word -> lexicon index

    # build a lexicon string
    offset = 0
    for word, occurrences in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        encoded = word[:-1] + chr(ord(word[-1]) + 128)
        # reuse string tails, when possible
        o = lexicon.find(encoded)
        if o < 0:
            o = offset
            lexicon = lexicon + encoded
            offset = offset + len(word)
        words[word] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = [ord(c) for c in lexicon]

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            phrasebook_offset[char] = len(phrasebook)
            for word in name.split():
                i = words[word]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i >> 8) + short)
                    phrasebook.append(i & 255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing %s ..." % FILE)

    with open(FILE, "w") as fp:
        fp.write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
        fp.write("\n")
        fp.write("#define NAME_MAXLEN %s\n" % 256)
        fp.write("\n")
        fp.write("/* lexicon */\n")
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split phrasebook offset table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        fp.write("/* code->name phrasebook */\n")
        fp.write("#define phrasebook_shift %s\n" % shift)
        fp.write("#define phrasebook_short %s\n" % short)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        fp.write("/* name->code dictionary */\n")
        codehash.dump(fp, trace)
722
723
def merge_old_version(version, new, old):
    """Record in new how the old database version differs from it.

    new and old are UnicodeData instances.  For every code point the
    fields of the old record that differ from the new record are stored
    in per-field change lists; the result is appended to new.changed as
    (version, change records, normalization changes), where the change
    records are (bidir, category, decimal, mirrored, numeric) tuples, one
    per code point.

    Raises NotImplementedError if the composition exclusions differ, and
    a local Difference exception if a field changed for which no delta
    encoding exists.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    class Difference(Exception):
        pass

    # Fields whose changes are deliberately not recorded:
    # 11 ISO comment, 12-14 simple upper/lower/title case mappings,
    # 16 properties (not yet supported).
    ignored_fields = (11, 12, 13, 14, 16)

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    for i in range(0x110000):
        new_record = new.table[i]
        old_record = old.table[i]
        if new_record is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old_record is None
            continue
        # check characters unassigned in the old version
        if old_record is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old_record == new_record:
            continue
        for k in range(len(old_record)):
            value = old_record[k]
            if value == new_record[k]:
                continue
            if k == 2:
                category_changes[i] = CATEGORY_NAMES.index(value)
            elif k == 4:
                bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # We assume that all normalization changes are in 1:1 mappings
                assert " " not in value
                normalization_changes.append((i, value))
            elif k == 6:
                # we only support changes where the old value is a single digit
                assert value in "0123456789"
                decimal_changes[i] = int(value)
            elif k == 8:
                # Since 0 encodes "no change", the old value is better not 0
                if not value:
                    numeric_changes[i] = -1
                else:
                    numeric_changes[i] = float(value)
                    assert numeric_changes[i] not in (0, -1)
            elif k == 9:
                # stored as text; the records are later written with str()
                if value == 'Y':
                    mirrored_changes[i] = '1'
                else:
                    mirrored_changes[i] = '0'
            elif k in ignored_fields:
                pass
            else:
                raise Difference(hex(i), k, old_record, new_record)

    new.changed.append((version,
                        list(zip(bidir_changes, category_changes,
                                 decimal_changes, mirrored_changes,
                                 numeric_changes)),
                        normalization_changes))
806
807
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
813
def _parse_codepoint_range(spec):
    """Parse "XXXX" or "XXXX..YYYY" (hex) into an inclusive (first, last)."""
    if '..' in spec:
        first, last = [int(c, 16) for c in spec.split('..')]
    else:
        first = last = int(spec, 16)
    return first, last


class UnicodeData:
    """One version of the Unicode database, loaded from its text files.

    Attributes:
      filename    path of the UnicodeData file that was read
      table       list with 0x110000 entries; None for code points without
                  a record, else the record as a list of fields
      chars       the code points the generators iterate over
      exclusions  dict whose keys are the code points listed in the
                  composition exclusions file
      changed     list filled by merge_old_version
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  properties] (17)
    # When derivednormalizationprops is given, an 18th field holding the
    # normalization quick-check bits is appended.

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        """Load the database.

        filename, exclusions, eastasianwidth and unihan are paths of the
        UnicodeData, CompositionExclusions, EastAsianWidth and Unihan
        files.  derivednormalizationprops and linebreakprops are optional
        paths; when omitted the corresponding data is not loaded.  If
        expand is true, code points between "..., First>" and "..., Last>"
        records get a copy of the First record.
        """
        self.changed = []
        table = [None] * 0x110000
        with open(filename) as datafile:
            for line in datafile:
                fields = line.strip().split(";")
                table[int(fields[0], 16)] = fields

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        self.exclusions = {}
        with open(exclusions) as exclusionfile:
            for line in exclusionfile:
                line = line.strip()
                if not line or line[0] == '#':
                    continue
                self.exclusions[int(line.split()[0], 16)] = 1

        widths = [None] * 0x110000
        with open(eastasianwidth) as widthfile:
            for line in widthfile:
                line = line.strip()
                if not line or line[0] == '#':
                    continue
                fields = line.split()[0].split(';')
                first, last = _parse_codepoint_range(fields[0])
                for char in range(first, last + 1):
                    widths[char] = fields[1]

        # The remaining per-record passes only need the assigned code
        # points, so collect them once.
        assigned = [(i, record) for i, record in enumerate(table)
                    if record is not None]

        for i, record in assigned:
            record.append(widths[i])    # field 15: east asian width
            record.append(set())        # field 16: properties

        if linebreakprops:
            with open(linebreakprops) as linebreakfile:
                for line in linebreakfile:
                    line = line.partition('#')[0]
                    fields = [part.strip() for part in line.split(';')]
                    if len(fields) < 2 or fields[1] not in MANDATORY_LINE_BREAKS:
                        continue
                    first, last = _parse_codepoint_range(fields[0])
                    for char in range(first, last + 1):
                        table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            with open(derivednormalizationprops) as propsfile:
                for line in propsfile:
                    if '#' in line:
                        line = line[:line.index('#')]
                    fields = [part.strip() for part in line.split(';')]
                    if len(fields) < 2 or fields[1] not in qc_order:
                        continue
                    # two bits per normalization form: 1 = Maybe, 2 = No
                    quickcheck = 'MN'.index(fields[2]) + 1
                    quickcheck_shift = qc_order.index(fields[1]) * 2
                    quickcheck <<= quickcheck_shift
                    first, last = _parse_codepoint_range(fields[0])
                    for char in range(first, last + 1):
                        assert not (quickchecks[char] >> quickcheck_shift) & 3
                        quickchecks[char] |= quickcheck
            for i, record in assigned:
                record.append(quickchecks[i])   # field 17

        with open(unihan) as unihanfile:
            for line in unihanfile:
                if not line.startswith('U+'):
                    continue
                code, tag, value = line.split(None, 3)[:3]
                if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                               'kOtherNumeric'):
                    continue
                value = value.strip().replace(',', '')
                i = int(code[2:], 16)
                # Patch the numeric field
                if table[i] is not None:
                    table[i][8] = value

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1."""
        self.chars = range(256)
942
# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
948
def myhash(s, magic):
    """Return the 24-bit, case-insensitive hash of s used by the name table.

    s is upper-cased first.  For every character the running value is
    multiplied by magic and the character code is added; whenever bits
    24-31 of the running value become non-zero they are xor-ed into the
    low byte and the value is cut back to 24 bits.
    """
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff
    return h
957
# (table size, polynomial) pairs for Hash.  Hash picks the first entry
# whose size exceeds the number of keys and uses size + polynomial to
# fold the probe increment back into the table range.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
964
class Hash:
    """Static open-addressing hash table built from (key, value) pairs.

    Only the values are stored; a slot that stays empty is written as 0.
    Keys are hashed with myhash.
    """

    def __init__(self, name, data, magic):
        """Build the table.

        name is the prefix of the generated C identifiers, data a list of
        (key, value) pairs and magic the multiplier passed to myhash.

        Raises AssertionError if data has more entries than the largest
        size in SIZES can hold.
        """
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynominals")

        print("%s slots in hash table" % size)

        table = [None] * size

        mask = size-1

        n = 0   # number of probe steps taken after a collision

        # initialize hash table
        for key, value in data:
            h = myhash(key, magic)
            i = (~h) & mask
            if table[i] is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                if table[i] is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print("%s collisions" % n)
        self.collisions = n

        # unused slots are emitted as 0
        table = [0 if slot is None else slot for slot in table]

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        """Write the table and its magic/size/poly #defines to file."""
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1028
# stuff to deal with arrays of unsigned integers
1030
class Array:
    """A named list of non-negative integers that can be written as C."""

    def __init__(self, name, data):
        """Remember the C identifier (name) and the items (data)."""
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the data to file as a static C array.

        The element type (unsigned char, short or int) is chosen from the
        size reported by getsize.  Items are packed into lines of at most
        78 characters.  If trace is true, the array's size in bytes is
        reported on sys.stderr.
        """
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            sys.stderr.write("%s: %s bytes\n"
                             % (self.name, size * len(self.data)))
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) > 78:
                    # current line is full: flush it and start a new one
                    file.write(line + "\n")
                    line = " " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1062
def getsize(data):
    """Return the smallest C integer size in bytes (1, 2 or 4) for data.

    The size is chosen from the largest item.  Empty data has no largest
    item and is reported as 1 instead of raising ValueError, so that
    Array.dump can emit an empty array.
    """
    # return smallest possible integer size for the given array
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1072
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    def dump(t1, t2, shift, nbytes):
        sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
            len(t1), len(t2), shift, nbytes))

    if trace:
        sys.stderr.write("Size of original table: %s bytes\n"
                         % (len(t) * getsize(t)))
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    smallest = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            index = bincache.get(chunk)
            if index is None:
                # first time this chunk is seen: store it in t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < smallest:
            best = t1, t2, shift
            smallest = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        dump(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i, value in enumerate(t):
            assert value == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
1133
# Regenerate all tables, with tracing enabled, when run as a script.
if __name__ == "__main__":
    maketables(1)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Tools/unicode/makeunicodedata.py@ 1251

Download in other formats: