#!/usr/bin/perl
#
# Generate code page .c files from ftp.unicode.org descriptions
#
# Copyright 2000 Alexandre Julliard
#
| 7 | 
 | 
|---|
# Root of the local copy of the ftp.unicode.org public tree; the
# mapping prefix and the UnicodeData path below are built from it.
$BASEDIR = "ftp.unicode.org/Public/";

# Directory prefix of the code page mapping files (presumably prepended
# to the relative paths listed in @allfiles -- confirm in HANDLE_FILE).
$MAPPREFIX = "${BASEDIR}MAPPINGS/";

# UnicodeData file (parsed by READ_DEFAULTS)
$UNICODEDATA = "${BASEDIR}UNIDATA/UnicodeData.txt";

# Defaults mapping file (parsed by READ_DEFAULTS)
$DEFAULTS = "./defaults";

# Character code written into the tables for undefined mappings
$DEF_CHAR = ord("?");
| 20 | 
 | 
|---|
# List of the code pages to generate.  Each entry is an array ref:
#   [ code page number, mapping file path, human-readable description ]
# The whole entry is passed, flattened, to HANDLE_FILE by the main routine.
# (The paths look relative to $MAPPREFIX -- confirm in HANDLE_FILE.)
@allfiles =
(
    # IBM / OEM / vendor code pages
    [ 37,    "VENDORS/MICSFT/EBCDIC/CP037.TXT",   "IBM EBCDIC US Canada" ],
    [ 42,    "VENDORS/ADOBE/symbol.txt",          "Symbol" ],
    [ 424,   "VENDORS/MISC/CP424.TXT",            "IBM EBCDIC Hebrew" ],
    [ 437,   "VENDORS/MICSFT/PC/CP437.TXT",       "OEM United States" ],
    [ 500,   "VENDORS/MICSFT/EBCDIC/CP500.TXT",   "IBM EBCDIC International" ],
    [ 737,   "VENDORS/MICSFT/PC/CP737.TXT",       "OEM Greek 437G" ],
    [ 775,   "VENDORS/MICSFT/PC/CP775.TXT",       "OEM Baltic" ],
    [ 850,   "VENDORS/MICSFT/PC/CP850.TXT",       "OEM Multilingual Latin 1" ],
    [ 852,   "VENDORS/MICSFT/PC/CP852.TXT",       "OEM Slovak Latin 2" ],
    [ 855,   "VENDORS/MICSFT/PC/CP855.TXT",       "OEM Cyrillic" ],
    [ 856,   "VENDORS/MISC/CP856.TXT",            "Hebrew PC" ],
    [ 857,   "VENDORS/MICSFT/PC/CP857.TXT",       "OEM Turkish" ],
    [ 860,   "VENDORS/MICSFT/PC/CP860.TXT",       "OEM Portuguese" ],
    [ 861,   "VENDORS/MICSFT/PC/CP861.TXT",       "OEM Icelandic" ],
    [ 862,   "VENDORS/MICSFT/PC/CP862.TXT",       "OEM Hebrew" ],
    [ 863,   "VENDORS/MICSFT/PC/CP863.TXT",       "OEM Canadian French" ],
    [ 864,   "VENDORS/MICSFT/PC/CP864.TXT",       "OEM Arabic" ],
    [ 865,   "VENDORS/MICSFT/PC/CP865.TXT",       "OEM Nordic" ],
    [ 866,   "VENDORS/MICSFT/PC/CP866.TXT",       "OEM Russian" ],
    [ 869,   "VENDORS/MICSFT/PC/CP869.TXT",       "OEM Greek" ],
    [ 874,   "VENDORS/MICSFT/PC/CP874.TXT",       "ANSI/OEM Thai" ],
    [ 875,   "VENDORS/MICSFT/EBCDIC/CP875.TXT",   "IBM EBCDIC Greek" ],
    [ 878,   "VENDORS/MISC/KOI8-R.TXT",           "Russian KOI8" ],

    # double-byte code pages
    [ 932,   "VENDORS/MICSFT/WINDOWS/CP932.TXT",  "ANSI/OEM Japanese Shift-JIS" ],
    [ 936,   "VENDORS/MICSFT/WINDOWS/CP936.TXT",  "ANSI/OEM Simplified Chinese GBK" ],
    [ 949,   "VENDORS/MICSFT/WINDOWS/CP949.TXT",  "ANSI/OEM Korean Unified Hangul" ],
    [ 950,   "VENDORS/MICSFT/WINDOWS/CP950.TXT",  "ANSI/OEM Traditional Chinese Big5" ],

    [ 1006,  "VENDORS/MISC/CP1006.TXT",           "IBM Arabic" ],
    [ 1026,  "VENDORS/MICSFT/EBCDIC/CP1026.TXT",  "IBM EBCDIC Latin 5 Turkish" ],

    # Windows ANSI code pages
    [ 1250,  "VENDORS/MICSFT/WINDOWS/CP1250.TXT", "ANSI Eastern Europe" ],
    [ 1251,  "VENDORS/MICSFT/WINDOWS/CP1251.TXT", "ANSI Cyrillic" ],
    [ 1252,  "VENDORS/MICSFT/WINDOWS/CP1252.TXT", "ANSI Latin 1" ],
    [ 1253,  "VENDORS/MICSFT/WINDOWS/CP1253.TXT", "ANSI Greek" ],
    [ 1254,  "VENDORS/MICSFT/WINDOWS/CP1254.TXT", "ANSI Turkish" ],
    [ 1255,  "VENDORS/MICSFT/WINDOWS/CP1255.TXT", "ANSI Hebrew" ],
    [ 1256,  "VENDORS/MICSFT/WINDOWS/CP1256.TXT", "ANSI Arabic" ],
    [ 1257,  "VENDORS/MICSFT/WINDOWS/CP1257.TXT", "ANSI Baltic" ],
    [ 1258,  "VENDORS/MICSFT/WINDOWS/CP1258.TXT", "ANSI/OEM Viet Nam" ],

    # Macintosh code pages
    [ 10000, "VENDORS/MICSFT/MAC/ROMAN.TXT",      "Mac Roman" ],
    [ 10006, "VENDORS/MICSFT/MAC/GREEK.TXT",      "Mac Greek" ],
    [ 10007, "VENDORS/MICSFT/MAC/CYRILLIC.TXT",   "Mac Cyrillic" ],
    [ 10029, "VENDORS/MICSFT/MAC/LATIN2.TXT",     "Mac Latin 2" ],
    [ 10079, "VENDORS/MICSFT/MAC/ICELAND.TXT",    "Mac Icelandic" ],
    [ 10081, "VENDORS/MICSFT/MAC/TURKISH.TXT",    "Mac Turkish" ],

    [ 20866, "VENDORS/MISC/KOI8-R.TXT",           "Russian KOI8" ],

    # ISO 8859 family
    [ 28591, "ISO8859/8859-1.TXT",                "ISO 8859-1 Latin 1" ],
    [ 28592, "ISO8859/8859-2.TXT",                "ISO 8859-2 Latin 2 (East European)" ],
    [ 28593, "ISO8859/8859-3.TXT",                "ISO 8859-3 Latin 3 (South European)" ],
    [ 28594, "ISO8859/8859-4.TXT",                "ISO 8859-4 Latin 4 (Baltic old)" ],
    [ 28595, "ISO8859/8859-5.TXT",                "ISO 8859-5 Cyrillic" ],
    [ 28596, "ISO8859/8859-6.TXT",                "ISO 8859-6 Arabic" ],
    [ 28597, "ISO8859/8859-7.TXT",                "ISO 8859-7 Greek" ],
    [ 28598, "ISO8859/8859-8.TXT",                "ISO 8859-8 Hebrew" ],
    [ 28599, "ISO8859/8859-9.TXT",                "ISO 8859-9 Latin 5 (Turkish)" ],
    [ 28600, "ISO8859/8859-10.TXT",               "ISO 8859-10 Latin 6 (Nordic)" ],
    [ 28603, "ISO8859/8859-13.TXT",               "ISO 8859-13 Latin 7 (Baltic)" ],
    [ 28604, "ISO8859/8859-14.TXT",               "ISO 8859-14 Latin 8 (Celtic)" ],
    [ 28605, "ISO8859/8859-15.TXT",               "ISO 8859-15 Latin 9 (Euro)" ],
);
| 82 | 
 | 
|---|
| 83 | 
 | 
|---|
# Character type flag bits, one bit per class.  They are OR-ed together
# to build the values of %categories and of @category_table.
%ctype =
(
    "upper"  => 1 << 0,    # 0x0001
    "lower"  => 1 << 1,    # 0x0002
    "digit"  => 1 << 2,    # 0x0004
    "space"  => 1 << 3,    # 0x0008
    "punct"  => 1 << 4,    # 0x0010
    "cntrl"  => 1 << 5,    # 0x0020
    "blank"  => 1 << 6,    # 0x0040
    "xdigit" => 1 << 7,    # 0x0080
    "alpha"  => 1 << 8,    # 0x0100
);
| 96 | 
 | 
|---|
# Map from the general category field of UnicodeData.txt to the ctype
# flag bits (see %ctype) initially given to a character of that category.
# READ_DEFAULTS dies on any category that has no entry here.
%categories =
(
    # cased letters: alpha plus the matching case bit
    "Lu" => $ctype{"alpha"} | $ctype{"upper"},   # Letter, Uppercase
    "Ll" => $ctype{"alpha"} | $ctype{"lower"},   # Letter, Lowercase

    # Lt = Letter, Titlecase; Lo = Letter, Other
    ( map { $_ => $ctype{"alpha"} } qw(Lt Lo) ),

    "Nd" => $ctype{"digit"},                     # Number, Decimal Digit
    "Zs" => $ctype{"space"},                     # Separator, Space
    "Cc" => $ctype{"cntrl"},                     # Other, Control

    # everything treated as punctuation:
    #   Mn/Mc/Me  Mark (Non-Spacing, Spacing Combining, Enclosing)
    #   Nl/No     Number (Letter, Other)
    #   Lm        Letter, Modifier
    #   Pc/Pd/Ps/Pe/Pi/Pf/Po  Punctuation (Connector, Dash, Open, Close,
    #                         Initial quote, Final quote, Other)
    #   Sm/Sc/Sk/So  Symbol (Math, Currency, Modifier, Other)
    ( map { $_ => $ctype{"punct"} }
          qw(Mn Mc Me Nl No Lm Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So) ),

    # categories that get no flag at all:
    #   Zl/Zp  Separator (Line, Paragraph)
    #   Cf/Cs/Co/Cn  Other (Format, Surrogate, Private Use, Not Assigned)
    ( map { $_ => 0 } qw(Zl Zp Cf Cs Co Cn) ),
);
| 130 | 
 | 
|---|
# Extra ctype flags that cannot be derived from UnicodeData.txt: for each
# flag name (a key of %ctype) the list of character codes that must also
# carry it.  READ_DEFAULTS ORs these into @category_table at the end.
%special_categories =
(
    # ASCII 0-9, A-F, a-f followed by the same characters in the
    # 0xffXX block
    "xdigit" => [ 0x30 .. 0x39, 0x41 .. 0x46, 0x61 .. 0x66,
                  0xff10 .. 0xff19, 0xff21 .. 0xff26, 0xff41 .. 0xff46 ],
    "space"  => [ 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0xfeff ],
    "blank"  => [ 0x09, 0x20, 0xa0, 0xfeff ],
);
| 139 | 
 | 
|---|
# Map from the bidirectional category field of UnicodeData.txt to the
# numeric value stored in @direction_table.  READ_DEFAULTS dies on any
# bidi category that has no entry here.
%directions =
(
    # strong types
    "L"   => 1,    # Left-to-Right
    "R"   => 2,    # Right-to-Left
    "AL"  => 2,    # Right-to-Left Arabic

    # numbers and their separators
    "EN"  => 3,    # European Number
    "ES"  => 4,    # European Number Separator
    "ET"  => 5,    # European Number Terminator
    "AN"  => 6,    # Arabic Number
    "CS"  => 7,    # Common Number Separator

    # separators and whitespace
    "B"   => 8,    # Paragraph Separator
    "S"   => 9,    # Segment Separator
    "WS"  => 10,   # Whitespace

    # neutrals, including the explicit embedding/override controls
    "ON"  => 11,   # Other Neutrals
    "LRE" => 11,   # Left-to-Right Embedding
    "LRO" => 11,   # Left-to-Right Override
    "RLE" => 11,   # Right-to-Left Embedding
    "RLO" => 11,   # Right-to-Left Override
    "PDF" => 11,   # Pop Directional Format

    # types stored as 0
    "NSM" => 0,    # Non-Spacing Mark
    "BN"  => 0,    # Boundary Neutral
);
| 162 | 
 | 
|---|
| 163 | 
 | 
|---|
################################################################
# main routine
#
# Runs the generator steps in order and then exits with status 0.
# READ_DEFAULTS must come first: it fills the global tables from the
# defaults file and UnicodeData.txt.

READ_DEFAULTS();
DUMP_CASE_MAPPINGS();
DUMP_COMPOSE_TABLES();
DUMP_CTYPE_TABLES();

# every @allfiles entry is handed over as a flat argument list
foreach $file (@allfiles)
{
    HANDLE_FILE( @{$file} );
}

OUTPUT_CPTABLE();

exit 0;
| 177 | 
 | 
|---|
| 178 | 
 | 
|---|
################################################################
# read in the defaults file and the Unicode character database
#
# Takes no arguments.  Reads the files named by the globals $DEFAULTS
# and $UNICODEDATA and (re)builds these global tables, all indexed by
# Unicode character code unless noted otherwise:
#   @unicode_defaults  fallback character for a character
#   @unicode_aliases   list of array refs, one per defaults line that
#                      names several source characters
#   @tolower_table     lower case mapping
#   @toupper_table     upper case mapping
#   @category_table    ctype flag bits (see %ctype, %categories)
#   @direction_table   direction value (see %directions)
#   @decomp_table      [ first, second ] for two-character decompositions
#   @compose_table     list of [ first, second, composed ] triples
#   @uniname           character name (filled here but not reset)
#
# Dies if a file cannot be opened, on an unparsable or duplicate line of
# the defaults file, on an unknown category or directionality, on a
# "Last>" range entry without a preceding "First>" entry, and when a
# decomposition-derived default would create a mapping loop.
sub READ_DEFAULTS
{
    @unicode_defaults = ();
    @unicode_aliases = ();
    @tolower_table = ();
    @toupper_table = ();
    @category_table = ();
    @direction_table = ();
    @decomp_table = ();
    @compose_table = ();

    # first setup a few default mappings

    open my $defaults_fh, '<', $DEFAULTS or die "Cannot open $DEFAULTS: $!";
    print "Loading $DEFAULTS\n";
    while (my $line = <$defaults_fh>)
    {
        next if $line =~ /^\#/;  # skip comments
        next if $line =~ /^$/;  # skip empty lines
        if ($line =~ /^(([0-9a-fA-F]+)(,[0-9a-fA-F]+)*)\s+([0-9a-fA-F]+|'.'|none)\s+(\#.*)?/)
        {
            my @src = map hex, split /,/,$1;
            my $dst = $4;
            # several sources on one line form an alias set
            if ($#src > 0) { push @unicode_aliases, \@src; }
            next if ($dst eq "none");
            # the target is either a quoted character or a hex code
            $dst = ($dst =~ /\'.\'/) ? ord substr($dst,1,1) : hex $dst;
            foreach my $src (@src)
            {
                die "Duplicate value" if defined($unicode_defaults[$src]);
                $unicode_defaults[$src] = $dst;
            }
            next;
        }
        die "Unrecognized line $line\n";
    }
    close $defaults_fh;

    # now build mappings from the decomposition field of the Unicode database

    # decomposition tags for which "<tag> XXXX" yields a usable fallback char
    my %usable_tag = map { $_ => 1 }
                     qw(font noBreak circle super sub wide narrow compat small);

    open my $unidata_fh, '<', $UNICODEDATA or die "Cannot open $UNICODEDATA: $!";
    print "Loading $UNICODEDATA\n";
    my $start;  # code of the pending "<..., First>" entry, if any
    while (my $line = <$unidata_fh>)
    {
        # Decode the fields we need: code, name, general category,
        # bidi category, decomposition, decimal digit value, upper, lower
        my ($code, $name, $cat, $bidi, $decomp, $dec, $upper, $lower) =
            (split /;/, $line)[0, 1, 2, 4, 5, 6, 12, 13];

        my $src = hex $code;

        die "unknown category $cat" unless defined $categories{$cat};
        die "unknown directionality $bidi" unless defined $directions{$bidi};

        $uniname[$src] = $name;
        $category_table[$src] = $categories{$cat};
        $direction_table[$src] = $directions{$bidi};

        # a char that has a lower case mapping is an upper case letter,
        # and the other way around
        if ($lower ne "")
        {
            $tolower_table[$src] = hex $lower;
            $category_table[$src] |= $ctype{"upper"}|$ctype{"alpha"};
        }
        if ($upper ne "")
        {
            $toupper_table[$src] = hex $upper;
            $category_table[$src] |= $ctype{"lower"}|$ctype{"alpha"};
        }
        if ($dec ne "")
        {
            $category_table[$src] |= $ctype{"digit"};
        }

        # copy the category and direction for everything between First/Last pairs
        if ($name =~ /, First>/) { $start = $src; }
        if ($name =~ /, Last>/)
        {
            die sprintf("range end %04x without a First entry", $src)
                unless defined $start;
            foreach my $code_in_range ($start .. $src - 1)
            {
                $category_table[$code_in_range] = $category_table[$src];
                $direction_table[$code_in_range] = $direction_table[$src];
            }
            $start = undef;
        }

        next if $decomp eq "";  # no decomposition, skip it

        my $dst;
        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
        {
            # decomposition of the form "<foo> 1234" -> use char if type is known
            next unless $usable_tag{$1};
            $dst = hex $2;
        }
        elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
        {
            # decomposition "<compat> 0020 1234" -> combining accent
            $dst = hex $1;
        }
        elsif ($decomp =~ /^([0-9a-fA-F]+)/)
        {
            # decomposition contains only char values without prefix -> use first char
            $dst = hex $1;
            $category_table[$src] |= $category_table[$dst];
            # store decomposition if it contains two chars
            if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
            {
                $decomp_table[$src] = [ hex $1, hex $2 ];
                push @compose_table, [ hex $1, hex $2, $src ];
            }
        }
        else
        {
            next;
        }

        next if defined($unicode_defaults[$src]);  # may have been set in the defaults file

        # check for loops: follow the existing chain from the new target
        # and make sure it never leads back to the source
        for (my $i = $dst; ; $i = $unicode_defaults[$i])
        {
            die sprintf("loop detected for %04x -> %04x",$src,$dst) if $i == $src;
            last unless defined($unicode_defaults[$i]);
        }
        $unicode_defaults[$src] = $dst;
    }
    close $unidata_fh;

    # patch the category of some special characters

    foreach my $flag_name (keys %special_categories)
    {
        my $flag = $ctype{$flag_name};
        foreach my $char (@{$special_categories{$flag_name}}) { $category_table[$char] |= $flag; }
    }
}
| 322 | 
 | 
|---|
| 323 | 
 | 
|---|
################################################################
# parse the input file
#
# Argument: the path of a mapping file made of
#   "0xCP <whitespace> 0xUNI <whitespace> [# comment]"  lines.
# Resets and fills the globals @cp2uni (code page -> Unicode), @uni2cp
# (Unicode -> code page) and @lead_bytes (DBCS lead bytes, which also
# get a 0 entry in @cp2uni).  When a value is mapped more than once the
# first mapping wins.  Comment lines, empty lines, lines containing ^Z
# and "#UNDEFINED" entries are skipped.
# Dies if the file cannot be opened or a line is not recognized.
sub READ_FILE
{
    my $name = shift;
    # three-argument open: the name is never interpreted as a mode or a pipe
    open my $input, '<', $name or die "Cannot open $name: $!";
    @cp2uni = ();
    @lead_bytes = ();
    @uni2cp = ();

    while (my $line = <$input>)
    {
        next if $line =~ /^\#/;  # skip comments
        next if $line =~ /^$/;  # skip empty lines
        next if $line =~ /\x1a/;  # skip ^Z
        next if $line =~ /^0x([0-9a-fA-F]+)\s+\#UNDEFINED/;  # undefined char

        if ($line =~ /^0x([0-9a-fA-F]+)\s+\#DBCS LEAD BYTE/)
        {
            my $cp = hex $1;
            push @lead_bytes,$cp;
            $cp2uni[$cp] = 0;
            next;
        }
        if ($line =~ /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            my ($cp, $uni) = (hex $1, hex $2);
            # keep the first mapping seen in either direction
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            next;
        }
        die "$name: Unrecognized line $line\n";
    }
    close $input;
}
| 359 | 
 | 
|---|
| 360 | 
 | 
|---|
################################################################
# parse the symbol.txt file, since its syntax is different from the other ones
#
# Argument: the path of a file made of
#   "UNI <whitespace> CP <whitespace> [# comment]"  lines, both values in
# hex without a 0x prefix and with the Unicode value first.
# Resets the globals @cp2uni, @uni2cp and @lead_bytes and fills the first
# two; when a value is mapped more than once the first mapping wins.
# Comment lines, empty lines and lines containing ^Z are skipped.
# Dies if the file cannot be opened or a line is not recognized.
sub READ_SYMBOL_FILE
{
    my $name = shift;
    # three-argument open: the name is never interpreted as a mode or a pipe
    open my $input, '<', $name or die "Cannot open $name: $!";
    @cp2uni = ();
    @lead_bytes = ();
    @uni2cp = ();

    while (my $line = <$input>)
    {
        next if $line =~ /^\#/;  # skip comments
        next if $line =~ /^$/;  # skip empty lines
        next if $line =~ /\x1a/;  # skip ^Z
        if ($line =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            my ($uni, $cp) = (hex $1, hex $2);
            # keep the first mapping seen in either direction
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            next;
        }
        die "$name: Unrecognized line $line\n";
    }
    close $input;
}
| 387 | 
 | 
|---|
| 388 | 
 | 
|---|
################################################################
# add default mappings once the file had been read
#
# Takes no arguments.  Completes the global @uni2cp (and, for the
# identity step, @cp2uni) of the code page currently loaded, using the
# @unicode_aliases and @unicode_defaults tables:
#   1. within each alias set, chars without a mapping get the mapping of
#      the first char of the set that has one;
#   2. each unmapped char below 0x10000 that has a default gets the
#      mapping of the first mapped char found along its chain of defaults;
#   3. each value below 256 that is unmapped in both directions is
#      mapped to itself.
# Existing mappings are never overwritten.
sub ADD_DEFAULT_MAPPINGS
{
    # Apply aliases

    foreach my $alias (@unicode_aliases)
    {
        my ($mapped) = grep { defined($uni2cp[$_]) } @$alias;
        next unless defined($mapped);

        # At least one char of the alias set is defined, set the others to the same value
        my $target = $uni2cp[$mapped];
        foreach my $src (@$alias)
        {
            $uni2cp[$src] = $target unless defined($uni2cp[$src]);
        }
    }

    # For every src -> target mapping in the defaults table,
    # make uni2cp[src] = uni2cp[target] if uni2cp[target] is defined

    foreach my $src (0 .. 65535)
    {
        next if defined($uni2cp[$src]);  # source has a definition already
        next unless defined($unicode_defaults[$src]);  # no default for this char
        my $target = $unicode_defaults[$src];

        # follow the chain of defaults until we find a target char that is
        # defined; remember the chars already visited so that a cycle in
        # the defaults cannot keep us here forever
        my %visited = ($src => 1);
        while (!defined($uni2cp[$target]) && defined($unicode_defaults[$target]))
        {
            last if $visited{$target}++;
            $target = $unicode_defaults[$target];
        }

        if (defined($uni2cp[$target])) { $uni2cp[$src] = $uni2cp[$target]; }
    }

    # Add an identity mapping for all undefined chars

    foreach my $char (0 .. 255)
    {
        next if defined($cp2uni[$char]) || defined($uni2cp[$char]);
        $cp2uni[$char] = $uni2cp[$char] = $char;
    }
}
| 440 | 
 | 
|---|
################################################################
# dump an array of integers
#
# Arguments:
#   $format   sprintf format applied to each value
#   $default  value printed in place of undefined elements
#   @array    the values
# Returns the formatted values as a string indented by four spaces,
# separated by ", " with a line break (and a new indent) after every
# eighth value; there is no separator after the last value.
# An empty @array yields a single $default entry, as it always did.
sub DUMP_ARRAY
{
    my ($format, $default, @array) = @_;
    my $last = $#array > 0 ? $#array : 0;
    my $ret = "    ";
    foreach my $i (0 .. $last)
    {
        $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
        next if $i == $last;
        # eight values per output line
        $ret .= (($i % 8) != 7) ? ", " : ",\n    ";
    }
    return $ret;
}
| 455 | 
 | 
|---|
################################################################
# dump an SBCS mapping table
#
# Arguments: the code page number and its description string.
# Writes to the OUTPUT file handle, from the globals @cp2uni and @uni2cp:
#   cp2uni[256]      code page -> Unicode
#   uni2cp_low[]     one 256-entry subtable per Unicode page (high byte)
#                    that contains a mapping, plus a last subtable filled
#                    with the default char
#   uni2cp_high[256] for each Unicode page, the offset of its subtable in
#                    uni2cp_low (the default subtable for unused pages)
#   cptable_NNN      the struct sbcs_table descriptor
# Undefined entries are written as $DEF_CHAR.
sub DUMP_SBCS_TABLE
{
    my ($codepage, $name) = @_;

    # output the ascii->unicode table

    printf OUTPUT "static const WCHAR cp2uni[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );

    # find the unicode pages that contain at least one mapping; the
    # subtable count includes the trailing defaults subtable

    my @used_pages = grep
    {
        my $base = $_ << 8;
        scalar grep { defined $uni2cp[$_] } $base .. $base + 255;
    } 0 .. 255;
    my $subtables = 1 + scalar(@used_pages);

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned char uni2cp_low[%d] =\n{\n", $subtables*256;
    foreach my $page (@used_pages)
    {
        my $base = $page << 8;
        printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $page, $page;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $DEF_CHAR, @uni2cp[$base .. $base+255] );
    }
    printf OUTPUT "    /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($DEF_CHAR) x 256 );

    # output a table of the offsets of the subtables in the previous array:
    # used pages are laid out in order, everything else points at the
    # defaults subtable, which comes last

    my %offset_of;
    @offset_of{@used_pages} = map { $_ * 256 } 0 .. $#used_pages;
    my $defaults_offset = ($subtables-1) * 256;
    my @offsets = map { exists $offset_of{$_} ? $offset_of{$_} : $defaults_offset } 0 .. 255;

    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );

    # output the code page descriptor

    printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT "    { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $DEF_CHAR, $DEF_CHAR, $name;
    printf OUTPUT "    cp2uni,\n";
    printf OUTPUT "    uni2cp_low,\n";
    printf OUTPUT "    uni2cp_high\n};\n";
}
| 513 | 
 | 
|---|
| 514 | 
 | 
|---|
################################################################
# dump a DBCS mapping table
#
# Arguments: the code page number and its description string.
# Writes to the OUTPUT file handle, from the globals @cp2uni, @uni2cp
# and @lead_bytes:
#   cp2uni[]               256 entries for the single byte chars, then
#                          (only if some lead byte has no mapping at all)
#                          a 256-entry table of default chars, then 256
#                          entries for each lead byte that is in use
#   cp2uni_leadbytes[256]  for each byte the index of its table in
#                          cp2uni (0 for bytes that are not lead bytes)
#   uni2cp_low[] / uni2cp_high[256]
#                          Unicode -> code page subtables and their
#                          offsets, one subtable per Unicode page that
#                          contains a mapping plus a defaults subtable
#   cptable_NNN            the struct dbcs_table descriptor, ending with
#                          the lead byte ranges from DUMP_LB_RANGES
# Undefined entries are written as $DEF_CHAR.
sub DUMP_DBCS_TABLE
{
    my ($codepage, $name) = @_;

    # build a list of lead bytes that are actually used

    my @lblist = ();
    foreach my $lead (@lead_bytes)
    {
        my $base = $lead << 8;
        push @lblist, $lead if grep { defined $cp2uni[$_] } $base .. $base + 255;
    }
    my $unused = ($#lead_bytes > $#lblist) ? 1 : 0;

    # output the ascii->unicode table for the single byte chars

    printf OUTPUT "static const WCHAR cp2uni[%d] =\n", 256 * (scalar(@lblist) + 1 + $unused);
    printf OUTPUT "{\n%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );

    # output the default table for unused lead bytes

    if ($unused)
    {
        printf OUTPUT ",\n    /* unused lead bytes */\n";
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
    }

    # output the ascii->unicode table for each DBCS lead byte

    foreach my $lead (@lblist)
    {
        my $base = $lead << 8;
        printf OUTPUT ",\n    /* lead byte %02x */\n", $lead;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[$base .. $base+255] );
    }

    # close the initializer here so that it is terminated even when no
    # lead byte is in use
    printf OUTPUT "\n};\n\n";

    # output the lead byte subtables offsets: table 0 is the single byte
    # table, used lead bytes follow in order

    my @lb_offsets = (0) x 256;
    foreach my $n (0 .. $#lblist) { $lb_offsets[$lblist[$n]] = $n + 1; }
    if ($unused)
    {
        # increment all lead bytes offset to take into account the unused table
        foreach my $lead (@lead_bytes) { $lb_offsets[$lead]++; }
    }
    printf OUTPUT "static const unsigned char cp2uni_leadbytes[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, @lb_offsets );

    # find the unicode pages that contain at least one mapping; the
    # subtable count includes the trailing defaults subtable

    my @used_pages = grep
    {
        my $base = $_ << 8;
        scalar grep { defined $uni2cp[$_] } $base .. $base + 255;
    } 0 .. 255;
    my $subtables = 1 + scalar(@used_pages);

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned short uni2cp_low[%d] =\n{\n", $subtables*256;
    foreach my $page (@used_pages)
    {
        my $base = $page << 8;
        printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $page, $page;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @uni2cp[$base .. $base+255] );
    }
    printf OUTPUT "    /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );

    # output a table of the offsets of the subtables in the previous array:
    # used pages are laid out in order, everything else points at the
    # defaults subtable, which comes last

    my %offset_of;
    @offset_of{@used_pages} = map { $_ * 256 } 0 .. $#used_pages;
    my $defaults_offset = ($subtables-1) * 256;
    my @page_offsets = map { exists $offset_of{$_} ? $offset_of{$_} : $defaults_offset } 0 .. 255;

    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @page_offsets );

    # output the code page descriptor

    printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT "    { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $DEF_CHAR, $DEF_CHAR, $name;
    printf OUTPUT "    cp2uni,\n";
    printf OUTPUT "    cp2uni_leadbytes,\n";
    printf OUTPUT "    uni2cp_low,\n";
    printf OUTPUT "    uni2cp_high,\n";
    DUMP_LB_RANGES();
    printf OUTPUT "};\n";
}
| 623 | 
 | 
|---|
| 624 | 
 | 
|---|
################################################################
# dump the list of defined lead byte ranges
#
# Writes one C initializer line to OUTPUT of the form
#     { first, last, first, last, ..., 0x00, 0x00 }
# where every first/last pair bounds a run of consecutive byte values
# listed in @lead_bytes, and the final 0x00, 0x00 pair ends the list.
sub DUMP_LB_RANGES
{
    # flag every byte value that is a lead byte
    my @is_lead = ();
    $is_lead[$_] = 1 foreach @lead_bytes;

    # walk all byte values and record a bound each time we enter or
    # leave a run of lead bytes
    my @bounds = ();
    my $in_range = 0;
    foreach my $byte (0 .. 255)
    {
        my $lead = $is_lead[$byte] ? 1 : 0;
        next if $lead == $in_range;
        push @bounds, ($lead ? $byte : $byte - 1);
        $in_range = $lead;
    }

    # a run that is still open at the end extends up to 0xff
    push @bounds, 0xff if $in_range;

    printf OUTPUT "    { %s0x00, 0x00 }\n",
                  join( "", map { sprintf "0x%02x, ", $_ } @bounds );
}
 | 
|---|
| 648 | 
 | 
|---|
| 649 | 
 | 
|---|
################################################################
# dump the case mapping tables
#
# Creates casemap.c in the current directory, prints a progress line,
# and writes the "casemap_lower" and "casemap_upper" tables from
# @tolower_table and @toupper_table through the shared OUTPUT handle.
# Dies if the file cannot be created or if flushing it on close fails.
sub DUMP_CASE_MAPPINGS
{
    open(OUTPUT, ">", "casemap.c") or die "Cannot create casemap.c: $!";
    printf "Building casemap.c\n";
    printf OUTPUT "/* Unicode case mappings */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    DUMP_CASE_TABLE( "casemap_lower", @tolower_table );
    DUMP_CASE_TABLE( "casemap_upper", @toupper_table );

    # buffered write errors only show up when the handle is closed
    close(OUTPUT) or die "Cannot write casemap.c: $!";
}
 | 
|---|
| 664 | 
 | 
|---|
| 665 | 
 | 
|---|
################################################################
# dump a case mapping table
#
# Arguments: the C identifier of the table, followed by the mapping
# table itself (indexed by source char, undefined where no mapping
# exists).  The table is passed by value, so the caller's copy is left
# untouched.
#
# Layout of the generated WCHAR array:
#     0 .. 255      index: offset of the row for each high byte
#     256 .. 511    defaults row (all zero), shared by all empty rows
#     512 ..        one 256-entry row per high byte that has a mapping
# Every row entry is the 16-bit difference between the mapped char and
# the source char.
sub DUMP_CASE_TABLE
{
    my ($name,@table) = @_;

    # assign an offset to each 256-char row that contains something

    my @filled = ();
    my $pos = 512;
    foreach my $row (0 .. 255)
    {
        my $base = $row << 8;
        next unless grep { defined $_ } @table[$base .. $base + 255];
        $filled[$row] = $pos;
        $pos += 256;
    }

    # turn each mapping into an offset from the source char

    foreach my $char (0 .. 65535)
    {
        next unless defined $table[$char];
        $table[$char] = ($table[$char] - $char) & 0xffff;
    }

    # dump the table

    printf OUTPUT "const WCHAR %s[%d] =\n", $name, $pos;
    printf OUTPUT "{\n    /* index */\n";
    # Always pass exactly 256 index entries: @filled itself stops at the
    # last filled row, which would shorten the index and shift every
    # offset that follows.  Rows without an entry are undefined here;
    # DUMP_ARRAY (defined elsewhere in this file) is expected to replace
    # them with its second argument, the offset of the defaults row.
    printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 256, @filled[0 .. 255] );
    printf OUTPUT "    /* defaults */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 256 );
    foreach my $row (0 .. 255)
    {
        next unless $filled[$row];
        my $base = $row << 8;
        printf OUTPUT ",\n    /* 0x%02x00 .. 0x%02xff */\n", $row, $row;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table[$base .. $base + 255] );
    }
    printf OUTPUT "\n};\n";
}
 | 
|---|
| 704 | 
 | 
|---|
| 705 | 
 | 
|---|
################################################################
# dump the ctype tables
#
# Creates wctype.c in the current directory and writes wctype_table:
# 256 row offsets followed by the 256-entry rows they point to.  Rows
# with identical contents are stored only once.
#
# Side effect: the direction from @direction_table is merged into the
# high 4 bits of the matching @category_table entry, in place.
# Dies if wctype.c cannot be created or fully written.
sub DUMP_CTYPE_TABLES
{
    open(OUTPUT, ">", "wctype.c") or die "Cannot create wctype.c: $!";
    printf "Building wctype.c\n";
    printf OUTPUT "/* Unicode ctype tables */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    # the first 256 entries are the row offsets, filled in below
    my @array = (0) x 256;

    # maps the text form of a row to the offset where it is stored; kept
    # local so that every run starts from an empty set of rows
    my %sequences = ();

    # add the direction in the high 4 bits of the category
    for (my $i = 0; $i < 65536; $i++)
    {
        $category_table[$i] |= $direction_table[$i] << 12;
    }

    # try to merge table rows
    for (my $row = 0; $row < 256; $row++)
    {
        my @values = @category_table[($row<<8)..($row<<8)+255];
        my $rowtxt = sprintf "%04x" x 256, @values;
        if (defined($sequences{$rowtxt}))
        {
            # reuse an existing row
            $array[$row] = $sequences{$rowtxt};
        }
        else
        {
            # create a new row at the current end of the array
            $sequences{$rowtxt} = $array[$row] = scalar @array;
            push @array, @values;
        }
    }

    printf OUTPUT "const unsigned short wctype_table[%d] =\n{\n", scalar @array;
    printf OUTPUT "    /* offsets */\n%s,\n", DUMP_ARRAY( "0x%04x", 0, @array[0..255] );
    printf OUTPUT "    /* values */\n%s\n};\n", DUMP_ARRAY( "0x%04x", 0, @array[256..$#array] );

    # buffered write errors only show up when the handle is closed
    close(OUTPUT) or die "Cannot write wctype.c: $!";
}
 | 
|---|
| 748 | 
 | 
|---|
| 749 | 
 | 
|---|
################################################################
# dump the char composition tables
#
# Creates compose.c in the current directory and writes two tables:
#
# unicode_compose_table, built from @compose_table, whose entries are
# used here as [ first char, second char, composed char ].  The table
# starts with one (second char, offset) pair per distinct second char
# plus a terminating pair, followed for each second char by its
# (first char, composed char) pairs sorted by first char.  Offsets are
# counted in pairs from the start of the table.
#
# unicode_decompose_table, built from @decomp_table: a 256-entry main
# index, 16-entry sub-indexes, then 32-entry subsets that hold two
# values for each of 16 consecutive chars.  Unused slots point at an
# all-zero null sub-index or null subset.
#
# Dies if compose.c cannot be created or fully written.
sub DUMP_COMPOSE_TABLES
{
    open(OUTPUT, ">", "compose.c") or die "Cannot create compose.c: $!";
    printf "Building compose.c\n";
    printf OUTPUT "/* Unicode char composition */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    ######### composition table

    # group the [ first char, composed char ] pairs by second char

    my @by_second = ();
    foreach my $entry (@compose_table)
    {
        my @comp = @$entry;
        push @{$by_second[$comp[1]]}, [ $comp[0], $comp[2] ];
    }

    # count how many different second chars we have

    my @seconds = grep { defined $by_second[$_] } 0 .. 65535;
    my $count = scalar @seconds;

    # build the table of second chars and offsets; the first pairs list
    # starts right after the index and its terminator

    my @index = ();
    my $pos = $count + 1;
    foreach my $second (@seconds)
    {
        push @index, $second, $pos;
        $pos += @{$by_second[$second]};
    }
    # terminator with last position
    push @index, 0, $pos;
    printf OUTPUT "const WCHAR unicode_compose_table[0x%x] =\n{\n", 2*$pos;
    printf OUTPUT "    /* second chars + offsets */\n%s", DUMP_ARRAY( "0x%04x", 0, @index );

    # build the table of first chars and mappings

    foreach my $second (@seconds)
    {
        my @pairs = map { ( $_->[0], $_->[1] ) }
                    sort { $a->[0] <=> $b->[0] } @{$by_second[$second]};
        printf OUTPUT ",\n    /* 0x%04x */\n%s", $second, DUMP_ARRAY( "0x%04x", 0, @pairs );
    }
    printf OUTPUT "\n};\n\nconst unsigned int unicode_compose_table_size = %d;\n\n", $count;

    ######### decomposition table

    # first determine all the 16-char subsets that contain something

    my @subset_pos = (0) x 4096;
    my $data_pos = 16*2;  # for the null subset
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $decomp_table[$i];
        $subset_pos[$i >> 4] = $data_pos;
        $data_pos += 16*2;
        $i |= 15;  # skip the rest of this subset
    }
    my $total = $data_pos;

    # now count the 256-char subsets that contain something

    my @filled_idx = (256) x 256;  # 256 is the offset of the null sub-index
    my $index_pos = 256 + 16;
    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        $filled_idx[$i >> 4] = $index_pos;
        $index_pos += 16;
        $i |= 15;  # skip the rest of this sub-index
    }
    my $null_offset = $index_pos;  # null mapping
    $total += $null_offset;

    # add the index offsets to the subsets positions

    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        $subset_pos[$i] += $null_offset;
    }

    # dump the main index

    printf OUTPUT "const WCHAR unicode_decompose_table[%d] =\n", $total;
    printf OUTPUT "{\n    /* index */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx );
    printf OUTPUT ",\n    /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 );

    # dump the second-level indexes

    for (my $i = 0; $i < 256; $i++)
    {
        next unless ($filled_idx[$i] > 256);
        # empty subsets are redirected to the null mapping
        my @subindex = map { $_ || $null_offset } @subset_pos[($i<<4)..($i<<4)+15];
        printf OUTPUT ",\n    /* sub-index %02x */\n", $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @subindex );
    }

    # dump the 16-char subsets

    printf OUTPUT ",\n    /* null mapping */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 );

    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        my @subset = (0) x 32;
        for (my $j = 0; $j < 16; $j++)
        {
            my $decomp = $decomp_table[($i << 4) + $j];
            next unless defined $decomp;
            $subset[2 * $j] = $decomp->[0];
            $subset[2 * $j + 1] = $decomp->[1];
        }
        printf OUTPUT ",\n    /* 0x%03x0 .. 0x%03xf */\n", $i, $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @subset );
    }

    printf OUTPUT "\n};\n";

    # buffered write errors only show up when the handle is closed
    close(OUTPUT) or die "Cannot write compose.c: $!";
}
 | 
|---|
| 885 | 
 | 
|---|
| 886 | 
 | 
|---|
################################################################
# read an input file and generate the corresponding .c file
#
# Arguments: the code page number, the mapping file name relative to
# $MAPPREFIX, and a descriptive comment for the code page.
#
# The output file is named c_NNN.c (code page number padded to three
# digits) and is written through the shared OUTPUT handle.  A DBCS
# table is generated when @lead_bytes is non-empty after reading the
# mapping file, otherwise an SBCS table.
# Dies if the output file cannot be created or fully written.
sub HANDLE_FILE
{
    my ($codepage,$filename,$comment) = @_;
    my $source = $MAPPREFIX . $filename;

    # symbol codepage file is special
    if ($codepage == 42) { READ_SYMBOL_FILE($source); }
    else { READ_FILE($source); }

    ADD_DEFAULT_MAPPINGS();

    my $output = sprintf "c_%03d.c", $codepage;
    open(OUTPUT, ">", $output) or die "Cannot create $output: $!";

    printf "Building %s from %s (%s)\n", $output, $filename, $comment;

    # dump all tables

    printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
    printf OUTPUT "/* generated from %s */\n", $source;
    printf OUTPUT "/* DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    if (@lead_bytes) { DUMP_DBCS_TABLE( $codepage, $comment ); }
    else { DUMP_SBCS_TABLE( $codepage, $comment ); }

    # buffered write errors only show up when the handle is closed
    close(OUTPUT) or die "Cannot write $output: $!";
}
 | 
|---|
| 915 | 
 | 
|---|
| 916 | 
 | 
|---|
################################################################
# output the list of codepage tables into the cptable.c file
#
# For every entry of @allfiles this builds an extern declaration of its
# cptable_NNN table, then the cptables[] array holding the address of
# each of them in the same order, and hands the resulting text to
# REPLACE_IN_FILE to be stored in cptable.c.
sub OUTPUT_CPTABLE
{
    # only the code page number of each entry is needed here
    my @codepages = map { $_->[0] } @allfiles;

    @tables_decl =
    (
        ( map { sprintf("extern union cptable cptable_%03d;\n", $_) } @codepages ),
        sprintf("\nstatic const union cptable * const cptables[%d] =\n{\n", scalar @allfiles),
        ( map { sprintf("    &cptable_%03d,\n", $_) } @codepages ),
        "};",
    );
    REPLACE_IN_FILE( "cptable.c", @tables_decl );
}
 | 
|---|
| 938 | 
 | 
|---|
################################################################
# replace the contents of a file between ### cpmap ### marks
#
# Arguments: the file name, followed by the new text to store between
# the "### cpmap begin ###" and "### cpmap end ###" lines.
#
# Everything up to and including the begin line is kept, then comes
# the new text, an empty line, the end line, and the rest of the file.
# The old text between the two marks is discarded.
#
# The file is only rewritten once both marks have been found; dies
# without touching it if a mark is missing, and dies if the file
# cannot be read or written.
sub REPLACE_IN_FILE
{
    my $name = shift;
    my @data = @_;
    my @lines = ();

    open(my $in, "<", $name) or die "Can't open $name: $!";

    # keep everything up to and including the begin mark
    my $found_begin = 0;
    while (my $line = <$in>)
    {
        push @lines, $line;
        if ($line =~ /\#\#\# cpmap begin \#\#\#/) { $found_begin = 1; last; }
    }
    die "Can't find the cpmap begin mark in $name" unless $found_begin;

    push @lines, @data;

    # skip the old contents up to the end mark
    my $found_end = 0;
    while (my $line = <$in>)
    {
        next unless $line =~ /\#\#\# cpmap end \#\#\#/;
        push @lines, "\n", $line;
        $found_end = 1;
        last;
    }
    die "Can't find the cpmap end mark in $name" unless $found_end;

    # keep whatever follows the end mark
    push @lines, <$in>;
    close($in);

    open(my $out, ">", $name) or die "Can't modify $name: $!";
    print {$out} @lines;
    # buffered write errors only show up when the handle is closed
    close($out) or die "Can't write $name: $!";
}
 | 
|---|