source: trunk/gcc/libjava/scripts/unicode-muncher.pl@ 3773

Last change on this file since 3773 was 2, checked in by bird, 23 years ago

Initial revision

  • Property cvs2svn:cvs-rev set to 1.1
  • Property svn:eol-style set to native
  • Property svn:executable set to *
File size: 18.5 KB
Line 
1#!/usr/bin/perl -w
2# unicode-muncher.pl -- generate Unicode database for java.lang.Character
3# Copyright (C) 1998, 2002 Free Software Foundation, Inc.
4#
5# This file is part of GNU Classpath.
6#
7# GNU Classpath is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2, or (at your option)
10# any later version.
11#
12# GNU Classpath is distributed in the hope that it will be useful, but
13# WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15# General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with GNU Classpath; see the file COPYING. If not, write to the
19# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20# 02111-1307 USA.
21#
22# Linking this library statically or dynamically with other modules is
23# making a combined work based on this library. Thus, the terms and
24# conditions of the GNU General Public License cover the whole
25# combination.
26#
27# As a special exception, the copyright holders of this library give you
28# permission to link this library with independent modules to produce an
29# executable, regardless of the license terms of these independent
30# modules, and to copy and distribute the resulting executable under
31# terms of your choice, provided that you also meet, for each linked
32# independent module, the terms and conditions of the license of that
33# module. An independent module is a module which is not derived from
34# or based on this library. If you modify this library, you may extend
35# this exception to your version of the library, but you are not
36# obligated to do so. If you do not wish to do so, delete this
37# exception statement from your version.
38
39# Code for reading UnicodeData.txt and generating the code for
40# gnu.java.lang.CharData. For now, the relevant Unicode definition files
41# are found in libjava/gnu/gcj/convert/.
42#
43# Inspired by code from Jochen Hoenicke.
44# author Eric Blake <ebb9@email.byu.edu>
45#
46# Usage: ./unicode-muncher <UnicodeData.txt> <CharData.java>
47# where <UnicodeData.txt> is obtained from www.unicode.org (named
48# UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <CharData.java>
49# is the final location for the Java interface gnu.java.lang.CharData.
50# As of JDK 1.4, use Unicode version 3.0.0 for best results.
51
52##
53## Convert a 16-bit integer to a Java source code String literal character
54##
55sub javaChar($) {
56 my ($char) = @_;
57 die "Out of range: $char\n" if $char < -0x8000 or $char > 0xffff;
58 $char += 0x10000 if $char < 0;
59 # Special case characters that must be escaped, or are shorter as ASCII
60 return sprintf("\\%03o", $char) if $char < 0x20;
61 return "\\\"" if $char == 0x22;
62 return "\\\\" if $char == 0x5c;
63 return pack("C", $char) if $char < 0x7f;
64 return sprintf("\\u%04x", $char);
65}
66
67##
68## Convert the text UnicodeData file from www.unicode.org into a Java
69## interface with string constants holding the compressed information.
70##
71my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
72 SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
73my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);
74
75my $NOBREAK_FLAG = 32;
76my $MIRRORED_FLAG = 64;
77
78my @info = ();
79my $titlecase = "";
80my $count = 0;
81my $range = 0;
82
83die "Usage: $0 <UnicodeData.txt> <CharData.java>" unless @ARGV == 2;
84open (UNICODE, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
85
86# Stage 1: Parse the attribute file
87$| = 1;
88print "GNU Classpath Unicode Attribute Database Generator 2.0\n";
89print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
90print "Parsing attributes file";
91while(<UNICODE>) {
92 print "." unless $count++ % 1000;
93 chomp;
94 s/\r//g;
95 my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
96 $mirrored, undef, undef, $upcase, $lowcase, $title) = split ';';
97 $ch = hex($ch);
98 next if $ch > 0xffff; # Ignore surrogate pairs, since Java does
99
100 my ($type, $numValue, $upperchar, $lowerchar, $direction);
101
102 $type = 0;
103 while ($category !~ /^$TYPECODES[$type]$/) {
104 if (++$type == @TYPECODES) {
105 die "$ch: Unknown type: $category";
106 }
107 }
108 $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
109 $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);
110
111 if ($numeric =~ /^[0-9]+$/) {
112 $numValue = $numeric;
113 die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
114 } elsif ($numeric eq "") {
115 # Special case sequences of 'a'-'z'
116 if ($ch >= 0x0041 && $ch <= 0x005a) {
117 $numValue = $ch - 0x0037;
118 } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
119 $numValue = $ch - 0x0057;
120 } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
121 $numValue = $ch - 0xff17;
122 } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
123 $numValue = $ch - 0xff37;
124 } else {
125 $numValue = -1;
126 }
127 } else {
128 $numValue = -2;
129 }
130
131 $upperchar = $upcase ? hex($upcase) - $ch : 0;
132 $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
133 if ($title ne $upcase) {
134 my $titlechar = $title ? hex($title) : $ch;
135 $titlecase .= pack("n2", $ch, $titlechar);
136 }
137
138 $direction = 0;
139 while ($bidir !~ /^$DIRCODES[$direction]$/) {
140 if (++$direction == @DIRCODES) {
141 $direction = -1;
142 last;
143 }
144 }
145
146 if ($range) {
147 die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
148 for ($range + 1 .. $ch - 1) {
149 $info[$_] = pack("n5", $type, $numValue, $upperchar,
150 $lowerchar, $direction);
151 }
152 $range = 0;
153 } elsif ($name =~ /First>$/) {
154 $range = $ch;
155 }
156 $info[$ch] = pack("n5", $type, $numValue, $upperchar, $lowerchar,
157 $direction);
158}
159close UNICODE;
160
161# Stage 2: Compress the data structures
162printf "\nCompressing data structures";
163$count = 0;
164my $info = ();
165my %charhash = ();
166my @charinfo = ();
167
168for my $ch (0 .. 0xffff) {
169 print "." unless $count++ % 0x1000;
170 if (! defined $info[$ch]) {
171 $info[$ch] = pack("n5", 0, -1, 0, 0, -1);
172 }
173
174 my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
175 if (! exists $charhash{$info[$ch]}) {
176 push @charinfo, [ $numVal, $upper, $lower, $direction ];
177 $charhash{$info[$ch]} = $#charinfo;
178 }
179 $info .= pack("n", ($charhash{$info[$ch]} << 7) | $type);
180}
181
182my $charlen = @charinfo;
183my $bestshift;
184my $bestest = 1000000;
185my $bestblkstr;
186die "Too many unique character entries: $charlen\n" if $charlen > 512;
187print "\nUnique character entries: $charlen\n";
188
189for my $i (3 .. 8) {
190 my $blksize = 1 << $i;
191 my %blocks = ();
192 my @blkarray = ();
193 my ($j, $k);
194 print "shift: $i";
195
196 for ($j = 0; $j < 0x10000; $j += $blksize) {
197 my $blkkey = substr $info, 2 * $j, 2 * $blksize;
198 if (! exists $blocks{$blkkey}) {
199 push @blkarray, $blkkey;
200 $blocks{$blkkey} = $#blkarray;
201 }
202 }
203 my $blknum = @blkarray;
204 my $blocklen = $blknum * $blksize;
205 printf " before %5d", $blocklen;
206
207 # Now we try to pack the blkarray as tight as possible by finding matching
208 # heads and tails.
209 for ($j = $blksize - 1; $j > 0; $j--) {
210 my %tails = ();
211 for $k (0 .. $#blkarray) {
212 next if ! defined $blkarray[$k];
213 my $len = length $blkarray[$k];
214 my $tail = substr $blkarray[$k], $len - $j * 2;
215 if (exists $tails{$tail}) {
216 push @{$tails{$tail}}, $k;
217 } else {
218 $tails{$tail} = [ $k ];
219 }
220 }
221
222 # tails are calculated, now calculate the heads and merge.
223 BLOCK:
224 for $k (0 .. $#blkarray) {
225 next if ! defined $blkarray[$k];
226 my $tomerge = $k;
227 while (1) {
228 my $head = substr($blkarray[$tomerge], 0, $j * 2);
229 my $entry = $tails{$head};
230 next BLOCK if ! defined $entry;
231
232 my $other = shift @{$entry};
233 if ($other == $tomerge) {
234 if (@{$entry}) {
235 push @{$entry}, $other;
236 $other = shift @{$entry};
237 } else {
238 push @{$entry}, $other;
239 next BLOCK;
240 }
241 }
242 if (@{$entry} == 0) {
243 delete $tails{$head};
244 }
245
246 # a match was found
247 my $merge = $blkarray[$other]
248 . substr($blkarray[$tomerge], $j * 2);
249 $blocklen -= $j;
250 $blknum--;
251
252 if ($other < $tomerge) {
253 $blkarray[$tomerge] = undef;
254 $blkarray[$other] = $merge;
255 my $len = length $merge;
256 my $tail = substr $merge, $len - $j * 2;
257 $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
258 @{$tails{$tail}} ];
259 next BLOCK;
260 }
261 $blkarray[$tomerge] = $merge;
262 $blkarray[$other] = undef;
263 }
264 }
265 }
266 my $blockstr;
267 for $k (0 .. $#blkarray) {
268 $blockstr .= $blkarray[$k] if defined $blkarray[$k];
269 }
270
271 die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
272 my $estimate = 2 * $blocklen + (0x20000 >> $i);
273
274 printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
275 if ($estimate < $bestest) {
276 $bestest = $estimate;
277 $bestshift = $i;
278 $bestblkstr = $blockstr;
279 }
280}
281
282my @blocks;
283my $blksize = 1 << $bestshift;
284for (my $j = 0; $j < 0x10000; $j += $blksize) {
285 my $blkkey = substr $info, 2 * $j, 2 * $blksize;
286 my $index = index $bestblkstr, $blkkey;
287 while ($index & 1) {
288 die "not found: $j" if $index == -1;
289 $index = index $bestblkstr, $blkkey, $index + 1;
290 }
291 push @blocks, ($index / 2 - $j) & 0xffff;
292}
293
294# Phase 3: Generate the file
295die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
296 if @blocks > 0xffff / 3;
297die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
298 if length($bestblkstr) > 0xffff / 3;
299{
300 print "Generating $ARGV[1] with shift of $bestshift";
301 my ($i, $j);
302
303 open OUTPUT, "> $ARGV[1]" or die "Failed creating output file: $!\n";
304 print OUTPUT <<EOF;
305/* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
306 Copyright (C) 2002 Free Software Foundation, Inc.
307 *** This file is generated by scripts/unicode-muncher.pl ***
308
309This file is part of GNU Classpath.
310
311GNU Classpath is free software; you can redistribute it and/or modify
312it under the terms of the GNU General Public License as published by
313the Free Software Foundation; either version 2, or (at your option)
314any later version.
315
316GNU Classpath is distributed in the hope that it will be useful, but
317WITHOUT ANY WARRANTY; without even the implied warranty of
318MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
319General Public License for more details.
320
321You should have received a copy of the GNU General Public License
322along with GNU Classpath; see the file COPYING. If not, write to the
323Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
32402111-1307 USA.
325
326Linking this library statically or dynamically with other modules is
327making a combined work based on this library. Thus, the terms and
328conditions of the GNU General Public License cover the whole
329combination.
330
331As a special exception, the copyright holders of this library give you
332permission to link this library with independent modules to produce an
333executable, regardless of the license terms of these independent
334modules, and to copy and distribute the resulting executable under
335terms of your choice, provided that you also meet, for each linked
336independent module, the terms and conditions of the license of that
337module. An independent module is a module which is not derived from
338or based on this library. If you modify this library, you may extend
339this exception to your version of the library, but you are not
340obligated to do so. If you do not wish to do so, delete this
341exception statement from your version. */
342
343package gnu.java.lang;
344
345/**
346 * This contains the info about the unicode characters, that
347 * java.lang.Character needs. It is generated automatically from
348 * <code>$ARGV[0]</code>, by some
349 * perl scripts. This Unicode definition file can be found on the
350 * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
351 * JDK 1.4 uses Unicode version 3.0.0.
352 *
353 * The data is stored as string constants, but Character will convert these
354 * Strings to their respective <code>char[]</code> components. The field
355 * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
356 * characters within <code>DATA</code>. The DATA field, in turn, stores
357 * information about each character in the low order bits, and an offset
358 * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
359 * <code>NUM_VALUE</code>, and <code>DIRECTION</code>. Notice that the
360 * attribute tables are much smaller than 0xffff entries; as many characters
361 * in Unicode share common attributes. Finally, there is a listing for
362 * <code>TITLE</code> exceptions (most characters just have the same
363 * title case as upper case).
364 *
365 * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
366 * Eric Blake)
367 * \@see Character
368 */
369public interface CharData
370{
371 /**
372 * The Unicode definition file that was parsed to build this database.
373 */
374 String SOURCE = \"$ARGV[0]\";
375
376 /**
377 * The character shift amount to look up the block offset. In other words,
378 * <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
379 * <code>ch</code> is described in <code>DATA</code>.
380 */
381 int SHIFT = $bestshift;
382
383 /**
384 * The mapping of character blocks to their location in <code>DATA</code>.
385 * Each entry has been adjusted so that the 16-bit sum with the desired
386 * character gives the actual index into <code>DATA</code>.
387 */
388 String BLOCKS
389EOF
390
391 for ($i = 0; $i < @blocks / 11; $i++) {
392 print OUTPUT $i ? "\n + \"" : " = \"";
393 for $j (0 .. 10) {
394 last if @blocks <= $i * 11 + $j;
395 my $val = $blocks[$i * 11 + $j];
396 print OUTPUT javaChar($val);
397 }
398 print OUTPUT "\"";
399 }
400
401 print OUTPUT <<EOF;
402;
403
404 /**
405 * Information about each character. The low order 5 bits form the
406 * character type, the next bit is a flag for non-breaking spaces, and the
407 * next bit is a flag for mirrored directionality. The high order 9 bits
408 * form the offset into the attribute tables. Note that this limits the
409 * number of unique character attributes to 512, which is not a problem
410 * as of Unicode version 3.2.0, but may soon become one.
411 */
412 String DATA
413EOF
414
415 my $len = length($bestblkstr) / 2;
416 for ($i = 0; $i < $len / 11; $i++) {
417 print OUTPUT $i ? "\n + \"" : " = \"";
418 for $j (0 .. 10) {
419 last if $len <= $i * 11 + $j;
420 my $val = unpack "n", substr($bestblkstr, 2 * ($i*11 + $j), 2);
421 print OUTPUT javaChar($val);
422 }
423 print OUTPUT "\"";
424 }
425
426 print OUTPUT <<EOF;
427;
428
429 /**
430 * This is the attribute table for computing the numeric value of a
431 * character. The value is -1 if Unicode does not define a value, -2
432 * if the value is not a positive integer, otherwise it is the value.
433 * Note that this is a signed value, but stored as an unsigned char
434 * since this is a String literal.
435 */
436 String NUM_VALUE
437EOF
438
439 $len = @charinfo;
440 for ($i = 0; $i < $len / 11; $i++) {
441 print OUTPUT $i ? "\n + \"" : " = \"";
442 for $j (0 .. 10) {
443 last if $len <= $i * 11 + $j;
444 my $val = $charinfo[$i * 11 + $j][0];
445 print OUTPUT javaChar($val);
446 }
447 print OUTPUT "\"";
448 }
449
450 print OUTPUT <<EOF;
451;
452
453 /**
454 * This is the attribute table for computing the uppercase representation
455 * of a character. The value is the signed difference between the
456 * character and its uppercase version. Note that this is stored as an
457 * unsigned char since this is a String literal.
458 */
459 String UPPER
460EOF
461
462 $len = @charinfo;
463 for ($i = 0; $i < $len / 11; $i++) {
464 print OUTPUT $i ? "\n + \"" : " = \"";
465 for $j (0 .. 10) {
466 last if $len <= $i * 11 + $j;
467 my $val = $charinfo[$i * 11 + $j][1];
468 print OUTPUT javaChar($val);
469 }
470 print OUTPUT "\"";
471 }
472
473 print OUTPUT <<EOF;
474;
475
476 /**
477 * This is the attribute table for computing the lowercase representation
478 * of a character. The value is the signed difference between the
479 * character and its lowercase version. Note that this is stored as an
480 * unsigned char since this is a String literal.
481 */
482 String LOWER
483EOF
484
485 $len = @charinfo;
486 for ($i = 0; $i < $len / 11; $i++) {
487 print OUTPUT $i ? "\n + \"" : " = \"";
488 for $j (0 .. 10) {
489 last if $len <= $i * 11 + $j;
490 my $val = $charinfo[$i * 11 + $j][2];
491 print OUTPUT javaChar($val);
492 }
493 print OUTPUT "\"";
494 }
495
496 print OUTPUT <<EOF;
497;
498
499 /**
500 * This is the attribute table for computing the directionality class
501 * of a character. At present, the value is in the range 0 - 18 if the
502 * character has a direction, otherwise it is -1. Note that this is
503 * stored as an unsigned char since this is a String literal.
504 */
505 String DIRECTION
506EOF
507
508 $len = @charinfo;
509 for ($i = 0; $i < $len / 11; $i++) {
510 print OUTPUT $i ? "\n + \"" : " = \"";
511 for $j (0 .. 10) {
512 last if $len <= $i * 11 + $j;
513 my $val = $charinfo[$i * 11 + $j][3];
514 print OUTPUT javaChar($val);
515 }
516 print OUTPUT "\"";
517 }
518
519 print OUTPUT <<EOF;
520;
521
522 /**
523 * This is the listing of titlecase special cases (all other character
524 * can use <code>UPPER</code> to determine their titlecase). The listing
525 * is a sequence of character pairs; converting the first character of the
526 * pair to titlecase produces the second character.
527 */
528 String TITLE
529EOF
530
531 $len = length($titlecase) / 2;
532 for ($i = 0; $i < $len / 11; $i++) {
533 print OUTPUT $i ? "\n + \"" : " = \"";
534 for $j (0 .. 10) {
535 last if $len <= $i * 11 + $j;
536 my $val = unpack "n", substr($titlecase, 2 * ($i*11 + $j), 2);
537 print OUTPUT javaChar($val);
538 }
539 print OUTPUT "\"";
540 }
541
542 print OUTPUT ";\n}\n";
543 close OUTPUT;
544}
545print "\nDone.\n";
Note: See TracBrowser for help on using the repository browser.