Changeset 391 for python/trunk/Tools/unicode/makeunicodedata.py
- Timestamp: Mar 19, 2014, 11:31:01 PM (11 years ago)
- Location: python/trunk
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
python/trunk
-
Property svn:mergeinfo
set to
/python/vendor/Python-2.7.6 merged eligible /python/vendor/current merged eligible
-
Property svn:mergeinfo
set to
-
python/trunk/Tools/unicode/makeunicodedata.py
r2 r391 31 31 32 32 # The Unicode Database 33 UNIDATA_VERSION = "5. 1.0"33 UNIDATA_VERSION = "5.2.0" 34 34 UNICODE_DATA = "UnicodeData%s.txt" 35 35 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" 36 36 EASTASIAN_WIDTH = "EastAsianWidth%s.txt" 37 UNIHAN = "Unihan%s.txt" 38 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" 39 LINE_BREAK = "LineBreak%s.txt" 37 40 38 41 old_versions = ["3.2.0"] … … 48 51 49 52 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ] 53 54 MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] 50 55 51 56 # note: should match definitions in Objects/unicodectype.c … … 59 64 UPPER_MASK = 0x80 60 65 NODELTA_MASK = 0x100 66 NUMERIC_MASK = 0x200 61 67 62 68 def maketables(trace=0): … … 67 73 unicode = UnicodeData(UNICODE_DATA % version, 68 74 COMPOSITION_EXCLUSIONS % version, 69 EASTASIAN_WIDTH % version) 75 EASTASIAN_WIDTH % version, 76 UNIHAN % version, 77 DERIVEDNORMALIZATION_PROPS % version, 78 LINE_BREAK % version) 70 79 71 80 print len(filter(None, unicode.table)), "characters" … … 75 84 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version), 76 85 COMPOSITION_EXCLUSIONS % ("-"+version), 77 EASTASIAN_WIDTH % ("-"+version)) 86 EASTASIAN_WIDTH % ("-"+version), 87 UNIHAN % ("-"+version)) 78 88 print len(filter(None, old_unicode.table)), "characters" 79 89 merge_old_version(version, unicode, old_unicode) … … 88 98 def makeunicodedata(unicode, trace): 89 99 90 dummy = (0, 0, 0, 0, 0 )100 dummy = (0, 0, 0, 0, 0, 0) 91 101 table = [dummy] 92 102 cache = {0: dummy} … … 108 118 mirrored = record[9] == "Y" 109 119 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15]) 120 normalizationquickcheck = record[17] 110 121 item = ( 111 category, combining, bidirectional, mirrored, eastasianwidth 122 category, combining, bidirectional, mirrored, eastasianwidth, 123 normalizationquickcheck 112 124 ) 113 125 # add entry to index and item tables … … 149 161 assert prefix < 256 150 162 # content 151 decomp = [prefix + (len(decomp)<<8)] +\ 
152 map(lambda s: int(s, 16), decomp) 163 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp] 153 164 # Collect NFC pairs 154 165 if not prefix and len(decomp) == 3 and \ … … 223 234 "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {" 224 235 for item in table: 225 print >>fp, " {%d, %d, %d, %d, %d }," % item236 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item 226 237 print >>fp, "};" 227 238 print >>fp … … 349 360 cache = {0: dummy} 350 361 index = [0] * len(unicode.chars) 362 numeric = {} 363 spaces = [] 364 linebreaks = [] 351 365 352 366 for char in unicode.chars: … … 356 370 category = record[2] 357 371 bidirectional = record[4] 372 properties = record[16] 358 373 flags = 0 359 374 delta = True … … 362 377 if category == "Ll": 363 378 flags |= LOWER_MASK 364 if category == "Zl"or bidirectional == "B":379 if 'Line_Break' in properties or bidirectional == "B": 365 380 flags |= LINEBREAK_MASK 381 linebreaks.append(char) 366 382 if category == "Zs" or bidirectional in ("WS", "B", "S"): 367 383 flags |= SPACE_MASK 384 spaces.append(char) 368 385 if category == "Lt": 369 386 flags |= TITLE_MASK … … 408 425 flags |= DIGIT_MASK 409 426 digit = int(record[7]) 427 if record[8]: 428 flags |= NUMERIC_MASK 429 numeric.setdefault(record[8], []).append(char) 410 430 item = ( 411 431 upper, lower, title, decimal, digit, flags … … 419 439 420 440 print len(table), "unique character type entries" 441 print sum(map(len, numeric.values())), "numeric code points" 442 print len(spaces), "whitespace code points" 443 print len(linebreaks), "linebreak code points" 421 444 422 445 print "--- Writing", FILE, "..." … … 439 462 Array("index1", index1).dump(fp, trace) 440 463 Array("index2", index2).dump(fp, trace) 464 465 # Generate code for _PyUnicode_ToNumeric() 466 numeric_items = sorted(numeric.items()) 467 print >>fp, '/* Returns the numeric value as double for Unicode characters' 468 print >>fp, ' * having this property, -1.0 otherwise.' 
469 print >>fp, ' */' 470 print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)' 471 print >>fp, '{' 472 print >>fp, ' switch (ch) {' 473 for value, codepoints in numeric_items: 474 # Turn text into float literals 475 parts = value.split('/') 476 parts = [repr(float(part)) for part in parts] 477 value = '/'.join(parts) 478 479 haswide = False 480 hasnonewide = False 481 codepoints.sort() 482 for codepoint in codepoints: 483 if codepoint < 0x10000: 484 hasnonewide = True 485 if codepoint >= 0x10000 and not haswide: 486 print >>fp, '#ifdef Py_UNICODE_WIDE' 487 haswide = True 488 print >>fp, ' case 0x%04X:' % (codepoint,) 489 if haswide and hasnonewide: 490 print >>fp, '#endif' 491 print >>fp, ' return (double) %s;' % (value,) 492 if haswide and not hasnonewide: 493 print >>fp, '#endif' 494 print >>fp,' }' 495 print >>fp,' return -1.0;' 496 print >>fp,'}' 497 print >>fp 498 499 # Generate code for _PyUnicode_IsWhitespace() 500 print >>fp, "/* Returns 1 for Unicode characters having the bidirectional" 501 print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise." 
502 print >>fp, " */" 503 print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)' 504 print >>fp, '{' 505 print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS' 506 print >>fp, ' return iswspace(ch);' 507 print >>fp, '#else' 508 print >>fp, ' switch (ch) {' 509 510 haswide = False 511 hasnonewide = False 512 for codepoint in sorted(spaces): 513 if codepoint < 0x10000: 514 hasnonewide = True 515 if codepoint >= 0x10000 and not haswide: 516 print >>fp, '#ifdef Py_UNICODE_WIDE' 517 haswide = True 518 print >>fp, ' case 0x%04X:' % (codepoint,) 519 if haswide and hasnonewide: 520 print >>fp, '#endif' 521 print >>fp, ' return 1;' 522 if haswide and not hasnonewide: 523 print >>fp, '#endif' 524 525 print >>fp,' }' 526 print >>fp,' return 0;' 527 print >>fp, '#endif' 528 print >>fp,'}' 529 print >>fp 530 531 # Generate code for _PyUnicode_IsLinebreak() 532 print >>fp, "/* Returns 1 for Unicode characters having the line break" 533 print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional" 534 print >>fp, " * type 'B', 0 otherwise." 
535 print >>fp, " */" 536 print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)' 537 print >>fp, '{' 538 print >>fp, ' switch (ch) {' 539 haswide = False 540 hasnonewide = False 541 for codepoint in sorted(linebreaks): 542 if codepoint < 0x10000: 543 hasnonewide = True 544 if codepoint >= 0x10000 and not haswide: 545 print >>fp, '#ifdef Py_UNICODE_WIDE' 546 haswide = True 547 print >>fp, ' case 0x%04X:' % (codepoint,) 548 if haswide and hasnonewide: 549 print >>fp, '#endif' 550 print >>fp, ' return 1;' 551 if haswide and not hasnonewide: 552 print >>fp, '#endif' 553 554 print >>fp,' }' 555 print >>fp,' return 0;' 556 print >>fp,'}' 557 print >>fp 441 558 442 559 fp.close() … … 487 604 488 605 # sort on falling frequency, then by name 489 def cmpwords((aword, alist),(bword, blist)): 490 r = -cmp(len(alist),len(blist)) 491 if r: 492 return r 493 return cmp(aword, bword) 494 wordlist.sort(cmpwords) 606 def word_key(a): 607 aword, alist = a 608 return -len(alist), aword 609 wordlist.sort(key=word_key) 495 610 496 611 # figure out how many phrasebook escapes we need … … 516 631 517 632 wordlist, wordtail = wordlist[:short], wordlist[short:] 518 wordtail.sort( lambda a, b: len(b[0])-len(a[0]))633 wordtail.sort(key=lambda a: a[0], reverse=True) 519 634 wordlist.extend(wordtail) 520 635 … … 657 772 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k] 658 773 # Since 0 encodes "no change", the old value is better not 0 659 assert value != "0" and value != "-1"660 774 if not value: 661 775 numeric_changes[i] = -1 662 776 else: 663 assert re.match("^[0-9]+$",value)664 numeric_changes[i] = int(value)777 numeric_changes[i] = float(value) 778 assert numeric_changes[i] not in (0, -1) 665 779 elif k == 9: 666 780 if value == 'Y': … … 680 794 # change to simple titlecase mapping; ignore 681 795 pass 796 elif k == 16: 797 # change to properties; not yet 798 pass 682 799 else: 683 800 class Difference(Exception):pass … … 695 812 # load a unicode-data file from 
disk 696 813 697 import sys698 699 814 class UnicodeData: 700 701 def __init__(self, filename, exclusions, eastasianwidth, expand=1): 815 # Record structure: 816 # [ID, name, category, combining, bidi, decomp, (6) 817 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11) 818 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16) 819 # properties] (17) 820 821 def __init__(self, filename, exclusions, eastasianwidth, unihan, 822 derivednormalizationprops=None, linebreakprops=None, 823 expand=1): 702 824 self.changed = [] 703 825 file = open(filename) … … 762 884 if table[i] is not None: 763 885 table[i].append(widths[i]) 886 887 for i in range(0, 0x110000): 888 if table[i] is not None: 889 table[i].append(set()) 890 if linebreakprops: 891 for s in open(linebreakprops): 892 s = s.partition('#')[0] 893 s = [i.strip() for i in s.split(';')] 894 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS: 895 continue 896 if '..' not in s[0]: 897 first = last = int(s[0], 16) 898 else: 899 first, last = [int(c, 16) for c in s[0].split('..')] 900 for char in range(first, last+1): 901 table[char][-1].add('Line_Break') 902 903 if derivednormalizationprops: 904 quickchecks = [0] * 0x110000 # default is Yes 905 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split() 906 for s in open(derivednormalizationprops): 907 if '#' in s: 908 s = s[:s.index('#')] 909 s = [i.strip() for i in s.split(';')] 910 if len(s) < 2 or s[1] not in qc_order: 911 continue 912 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No 913 quickcheck_shift = qc_order.index(s[1])*2 914 quickcheck <<= quickcheck_shift 915 if '..' 
not in s[0]: 916 first = last = int(s[0], 16) 917 else: 918 first, last = [int(c, 16) for c in s[0].split('..')] 919 for char in range(first, last+1): 920 assert not (quickchecks[char]>>quickcheck_shift)&3 921 quickchecks[char] |= quickcheck 922 for i in range(0, 0x110000): 923 if table[i] is not None: 924 table[i].append(quickchecks[i]) 925 926 for line in open(unihan): 927 if not line.startswith('U+'): 928 continue 929 code, tag, value = line.split(None, 3)[:3] 930 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric', 931 'kOtherNumeric'): 932 continue 933 value = value.strip().replace(',', '') 934 i = int(code[2:], 16) 935 # Patch the numeric field 936 if table[i] is not None: 937 table[i][8] = value 764 938 765 939 def uselatin1(self): … … 912 1086 """ 913 1087 914 import sys915 1088 if trace: 916 1089 def dump(t1, t2, shift, bytes):
Note: See TracChangeset for help on using the changeset viewer.