source: python/vendor/Python-2.6.5/Tools/unicode/makeunicodedata.py

Last change on this file was 2, checked in by Yuri Dario, 15 years ago

Initial import for vendor code.

  • Property svn:eol-style set to native
File size: 31.4 KB
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import re
import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + \
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
            (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

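# Illustrative sketch (not part of the original script): how a consumer
# of the emitted SHIFT/index1/index2 tables recovers a database record.
# splitbins() guarantees t[i] == t2[(t1[i >> shift] << shift) + (i & mask)],
# so the generated C code can find a record with two small table probes.
# The function name and flat-argument calling convention are hypothetical.

def _lookup_record_sketch(index1, index2, records, shift, char):
    # two-level trie lookup: index1 picks the bin, index2 holds the
    # per-character record indices inside that bin
    mask = (1 << shift) - 1
    i = index2[(index1[char >> shift] << shift) + (char & mask)]
    return records[i]
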
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp
    # split character type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

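# Illustrative sketch (not part of the original script): decoding the
# packed case-mapping fields emitted above. Unless NODELTA_MASK is set,
# upper/lower/title were stored as 16-bit two's-complement deltas
# relative to the character itself; otherwise they are absolute code
# points. The helper name is hypothetical.

def _decode_case_field_sketch(char, field, flags):
    if flags & NODELTA_MASK:
        return field                # stored as an absolute code point
    if field >= 0x8000:
        field = field - 0x10000     # sign-extend the 16-bit delta
    return char + field
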
# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish
    # between words inside a sentence and words ending a sentence;
    # the latter include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()

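# Illustrative sketch (not part of the original script): reconstructing
# a character name from the phrasebook and lexicon built above, using
# the flat phrasebook_offset list (before it is split by splitbins).
# Word indexes >= short are stored as two-byte escapes; in the lexicon,
# bit 7 marks the last byte of a word, and a masked value of 0 (the
# encoded trailing NUL) ends the whole name. The function name and
# argument list are hypothetical.

def _expand_name_sketch(phrasebook, phrasebook_offset, lexicon,
                        lexicon_offset, short, char):
    pos = phrasebook_offset[char]
    out = []
    while 1:
        i = phrasebook[pos]
        pos = pos + 1
        if i >= short:
            # two-byte escape: high bits in the first byte, low in the second
            i = ((i - short) << 8) + phrasebook[pos]
            pos = pos + 1
        o = lexicon_offset[i]
        while 1:
            c = lexicon[o]
            o = o + 1
            if c & 0x80:
                c = c & 0x7f        # bit 7: last byte of this word
                if c:
                    out.append(chr(c))
                break
            out.append(chr(c))
        if not c:
            return "".join(out)     # hit the encoded NUL: name complete
        out.append(" ")
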

def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value must not be 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))

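# Illustrative sketch (not part of the original script): how a field of
# the change_record tuples built above would be read back. 0xFF in the
# bidi/category/decimal/mirrored slots means "same as the current
# database"; anything else is the old value (for category, an index
# into CATEGORY_NAMES). The helper name is hypothetical.

def _old_category_sketch(change_record, current_category):
    # change_record layout: (bidir, category, decimal, mirrored, numeric)
    if change_record[1] == 0xFF:
        return current_category
    return CATEGORY_NAMES[change_record[1]]
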

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

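# Illustrative sketch (not part of the original script): the probe
# sequence a reader of the generated hash table has to follow, mirroring
# the insertion loop in Hash.__init__ above. The table stores code
# points only, so each probed slot must be verified by comparing the
# candidate character's name against the key (as the C lookup code
# does); that check is left out here. The generator name is hypothetical.

def _hash_probe_sketch(key, magic, size, poly):
    mask = size - 1
    h = myhash(key, magic)
    i = (~h) & mask
    yield i                         # first slot tried
    incr = (h ^ (h >> 3)) & mask
    if not incr:
        incr = mask
    while 1:
        i = (i + incr) & mask
        yield i                     # next slot in the probe sequence
        incr = incr << 1
        if incr > mask:
            incr = incr ^ poly
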
# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)    # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

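# Illustrative sketch (not part of the original script): a tiny
# splitbins round trip. With a repetitive input most bins are shared,
# so t1 and t2 together are much smaller than t, and the documented
# invariant reconstructs every original entry.

def _splitbins_demo():
    t = [17] * 1024 + [42] + [17] * 1023
    t1, t2, shift = splitbins(t)
    mask = (1 << shift) - 1
    for i in range(len(t)):
        assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
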
if __name__ == "__main__":
    maketables(1)