Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

unicode-muncher.pl@ 2828

Visit:

Last change on this file since 2828 was 2, checked in by bird, 23 years ago
Initial revision
Property cvs2svn:cvs-rev set to `1.1` Property svn:eol-style set to `native` Property svn:executable set to ``*
File size: 18.5 KB

Line
1	#!/usr/bin/perl -w
2	# unicode-muncher.pl -- generate Unicode database for java.lang.Character
3	# Copyright (C) 1998, 2002 Free Software Foundation, Inc.
4	#
5	# This file is part of GNU Classpath.
6	#
7	# GNU Classpath is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2, or (at your option)
10	# any later version.
11	#
12	# GNU Classpath is distributed in the hope that it will be useful, but
13	# WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15	# General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with GNU Classpath; see the file COPYING. If not, write to the
19	# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20	# 02111-1307 USA.
21	#
22	# Linking this library statically or dynamically with other modules is
23	# making a combined work based on this library. Thus, the terms and
24	# conditions of the GNU General Public License cover the whole
25	# combination.
26	#
27	# As a special exception, the copyright holders of this library give you
28	# permission to link this library with independent modules to produce an
29	# executable, regardless of the license terms of these independent
30	# modules, and to copy and distribute the resulting executable under
31	# terms of your choice, provided that you also meet, for each linked
32	# independent module, the terms and conditions of the license of that
33	# module. An independent module is a module which is not derived from
34	# or based on this library. If you modify this library, you may extend
35	# this exception to your version of the library, but you are not
36	# obligated to do so. If you do not wish to do so, delete this
37	# exception statement from your version.
38
39	# Code for reading UnicodeData.txt and generating the code for
40	# gnu.java.lang.CharData. For now, the relevant Unicode definition files
41	# are found in libjava/gnu/gcj/convert/.
42	#
43	# Inspired by code from Jochen Hoenicke.
44	# author Eric Blake <ebb9@email.byu.edu>
45	#
46	# Usage: ./unicode-muncher <UnicodeData.txt> <CharData.java>
47	# where <UnicodeData.txt> is obtained from www.unicode.org (named
48	# UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <CharData.java>
49	# is the final location for the Java interface gnu.java.lang.CharData.
50	# As of JDK 1.4, use Unicode version 3.0.0 for best results.
51
##
## Render one 16-bit value as the text of a single character inside a
## Java source code String literal.
##
## Argument: an integer in the range -0x8000 .. 0xffff.  Negative values
## are treated as signed 16-bit quantities and folded into 0 .. 0xffff.
## Returns: the escaped (or plain ASCII) text for that character.
## Dies with "Out of range: ..." for any value outside the accepted range.
##
sub javaChar($) {
    my ($code) = @_;
    if ($code < -0x8000 or $code > 0xffff) {
        die "Out of range: $code\n";
    }

    # Map the signed view onto the unsigned 16-bit code unit.
    my $unit = $code < 0 ? $code + 0x10000 : $code;

    if ($unit < 0x20) {
        # Control characters: three-digit octal escape.
        return sprintf("\\%03o", $unit);
    }
    elsif ($unit == 0x22) {
        # Double quote must be escaped inside a String literal.
        return '\\"';
    }
    elsif ($unit == 0x5c) {
        # Backslash must be escaped as well.
        return '\\\\';
    }
    elsif ($unit < 0x7f) {
        # Remaining printable ASCII is emitted as itself.
        return chr($unit);
    }

    # Everything else (0x7f and above) becomes a \uXXXX escape.
    return sprintf("\\u%04x", $unit);
}
66
##
## Convert the text UnicodeData file from www.unicode.org into a Java
## interface with string constants holding the compressed information.
##

# General category abbreviations.  The position of an abbreviation in this
# list is the numeric type code stored for a character (Cn = 0, Lu = 1, ...).
# NOTE(review): SKIPPED presumably reserves an unused code so that the codes
# after it line up with the constants in java.lang.Character -- confirm.
my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
                   SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);

# Bidirectional class abbreviations; again the list position is the code.
my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);

# Flag bits OR-ed into the type code (all type codes are below 32).
my $NOBREAK_FLAG = 32;
my $MIRRORED_FLAG = 64;

# Abbreviation => code lookup tables built from the lists above.
my %type_of;
@type_of{@TYPECODES} = (0 .. $#TYPECODES);
my %direction_of;
@direction_of{@DIRCODES} = (0 .. $#DIRCODES);

# @info holds one packed "n5" record (type, numeric value, uppercase
# delta, lowercase delta, direction) per character, indexed by code point.
my @info = ();
# Packed "n2" pairs (character, titlecase) for characters whose titlecase
# field differs from their uppercase field.
my $titlecase = "";
my $count = 0;
# Code point of a pending "..., First>" entry, or 0 when no range is open.
my $range = 0;

die "Usage: $0 <UnicodeData.txt> <CharData.java>" unless @ARGV == 2;
open(my $unicode_fh, '<', $ARGV[0])
    or die "Can't open Unicode attribute file: $!\n";

# Stage 1: Parse the attribute file
$| = 1;     # unbuffered STDOUT so the progress dots appear immediately
print "GNU Classpath Unicode Attribute Database Generator 2.0\n";
print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
print "Parsing attributes file";
while (my $line = <$unicode_fh>) {
    print "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;
    next unless $line =~ /\S/;          # tolerate blank lines

    # Keep trailing empty fields (limit -1) and pad short lines, so that
    # the case-mapping fields are empty strings rather than undef.
    my @fields = split /;/, $line, -1;
    push @fields, "" while @fields < 15;
    my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
        $mirrored, undef, undef, $upcase, $lowcase, $title) = @fields;
    $ch = hex($ch);
    next if $ch > 0xffff; # Ignore surrogate pairs, since Java does

    my ($type, $numValue, $upperchar, $lowerchar, $direction);

    # An unknown general category is fatal.
    die "$ch: Unknown type: $category" unless exists $type_of{$category};
    $type = $type_of{$category};
    $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
    $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);

    if ($numeric =~ /^[0-9]+$/) {
        $numValue = $numeric;
        die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
    } elsif ($numeric eq "") {
        # Special case sequences of 'a'-'z': the ASCII and fullwidth Latin
        # letters get the values 10 .. 35.
        if ($ch >= 0x0041 && $ch <= 0x005a) {
            $numValue = $ch - 0x0037;
        } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
            $numValue = $ch - 0x0057;
        } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
            $numValue = $ch - 0xff17;
        } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
            $numValue = $ch - 0xff37;
        } else {
            $numValue = -1;             # no numeric value
        }
    } else {
        $numValue = -2;                 # numeric field is not a plain integer
    }

    # Case mappings are stored as deltas from the character itself.
    $upperchar = $upcase ? hex($upcase) - $ch : 0;
    $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
    if ($title ne $upcase) {
        my $titlechar = $title ? hex($title) : $ch;
        $titlecase .= pack("n2", $ch, $titlechar);
    }

    # An unknown bidirectional class is recorded as -1 rather than fatal.
    $direction = exists $direction_of{$bidir} ? $direction_of{$bidir} : -1;

    my $record = pack("n5", $type, $numValue, $upperchar, $lowerchar,
                      $direction);
    if ($range) {
        # The previous entry opened a range; this one must close it.  The
        # characters in between receive the attributes of this entry.
        die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
        for my $member ($range + 1 .. $ch - 1) {
            $info[$member] = $record;
        }
        $range = 0;
    } elsif ($name =~ /First>$/) {
        $range = $ch;
    }
    $info[$ch] = $record;
}
close $unicode_fh;
die "Range starting at $range was never closed\n" if $range;
160
# Stage 2: Compress the data structures
#
# First every character is reduced to one 16-bit word: the low 7 bits are
# the type code plus flags, the bits above are an index into @charinfo,
# the table of distinct attribute records.  Then, for each candidate block
# size, the 64K words are cut into blocks, duplicate blocks are dropped and
# the remaining blocks are overlapped wherever the tail of one equals the
# head of another.  The block size with the smallest estimate wins.
print "\nCompressing data structures";
$count = 0;
my $info = "";          # packed "n" words, one per character 0 .. 0xffff
my %charhash = ();      # packed record => index into @charinfo
my @charinfo = ();      # [ numValue, upper, lower, direction ] per index

for my $ch (0 .. 0xffff) {
    print "." unless $count++ % 0x1000;
    if (! defined $info[$ch]) {
        # Characters absent from the input: type 0, no numeric value, no
        # case mapping, no direction.
        $info[$ch] = pack("n5", 0, -1, 0, 0, -1);
    }

    my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
    # The hash key is the whole record, type included.
    if (! exists $charhash{$info[$ch]}) {
        push @charinfo, [ $numVal, $upper, $lower, $direction ];
        $charhash{$info[$ch]} = $#charinfo;
    }
    $info .= pack("n", ($charhash{$info[$ch]} << 7) | $type);
}

my $charlen = @charinfo;
my $bestshift;
my $bestest = 1000000;
my $bestblkstr;
# Only 9 bits are available for the attribute index.
die "Too many unique character entries: $charlen\n" if $charlen > 512;
print "\nUnique character entries: $charlen\n";

for my $i (3 .. 8) {
    my $blksize = 1 << $i;
    my %blocks = ();
    my @blkarray = ();
    my ($j, $k);
    print "shift: $i";

    # Collect the distinct blocks, in order of first appearance.
    for ($j = 0; $j < 0x10000; $j += $blksize) {
        my $blkkey = substr $info, 2 * $j, 2 * $blksize;
        if (! exists $blocks{$blkkey}) {
            push @blkarray, $blkkey;
            $blocks{$blkkey} = $#blkarray;
        }
    }
    my $blknum = @blkarray;
    my $blocklen = $blknum * $blksize;
    printf " before %5d", $blocklen;

    # Now we try to pack the blkarray as tight as possible by finding matching
    # heads and tails.  $j is the overlap length in 16-bit words, tried from
    # the longest possible overlap down to a single word.
    for ($j = $blksize - 1; $j > 0; $j--) {
        # Map each $j-word tail to the list of live blocks ending in it.
        my %tails = ();
        for $k (0 .. $#blkarray) {
            next if ! defined $blkarray[$k];
            my $len = length $blkarray[$k];
            my $tail = substr $blkarray[$k], $len - $j * 2;
            if (exists $tails{$tail}) {
                push @{$tails{$tail}}, $k;
            } else {
                $tails{$tail} = [ $k ];
            }
        }

        # tails are calculated, now calculate the heads and merge.
      BLOCK:
        for $k (0 .. $#blkarray) {
            next if ! defined $blkarray[$k];
            my $tomerge = $k;
            while (1) {
                my $head = substr($blkarray[$tomerge], 0, $j * 2);
                my $entry = $tails{$head};
                next BLOCK if ! defined $entry;

                my $other = shift @{$entry};
                if ($other == $tomerge) {
                    # A block cannot be merged with itself: take the next
                    # candidate if there is one, otherwise give up on it.
                    if (@{$entry}) {
                        push @{$entry}, $other;
                        $other = shift @{$entry};
                    } else {
                        push @{$entry}, $other;
                        next BLOCK;
                    }
                }
                if (@{$entry} == 0) {
                    delete $tails{$head};
                }

                # a match was found: $other followed by the part of
                # $tomerge that lies beyond the shared $j words.
                my $merge = $blkarray[$other]
                    . substr($blkarray[$tomerge], $j * 2);
                $blocklen -= $j;
                $blknum--;

                if ($other < $tomerge) {
                    # Keep the result in the lower slot and repoint the
                    # tail entry of the merged block at that slot.
                    $blkarray[$tomerge] = undef;
                    $blkarray[$other] = $merge;
                    my $len = length $merge;
                    my $tail = substr $merge, $len - $j * 2;
                    $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
                                      @{$tails{$tail}} ];
                    next BLOCK;
                }
                # Result stays in $tomerge; loop again, since its new head
                # may match yet another tail.
                $blkarray[$tomerge] = $merge;
                $blkarray[$other] = undef;
            }
        }
    }
    my $blockstr = "";
    for $k (0 .. $#blkarray) {
        $blockstr .= $blkarray[$k] if defined $blkarray[$k];
    }

    # Sanity check of the bookkeeping above.
    die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
    # Size estimate: two bytes per data word plus two bytes per entry of
    # the block index (0x10000 >> $i entries).
    my $estimate = 2 * $blocklen + (0x20000 >> $i);

    printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
    if ($estimate < $bestest) {
        $bestest = $estimate;
        $bestshift = $i;
        $bestblkstr = $blockstr;
    }
}

# Locate every original block inside the winning data string.  Each entry
# of @blocks is (word offset in the data - first character of the block)
# modulo 2**16, so that adding the character gives the data index.
my @blocks;
my $blksize = 1 << $bestshift;
for (my $j = 0; $j < 0x10000; $j += $blksize) {
    my $blkkey = substr $info, 2 * $j, 2 * $blksize;
    my $index = index $bestblkstr, $blkkey;
    # Only matches at even byte offsets are word aligned; -1 (not found)
    # is odd as well, so it enters the loop and is reported.
    while ($index & 1) {
        die "not found: $j" if $index == -1;
        $index = index $bestblkstr, $blkkey, $index + 1;
    }
    push @blocks, ($index / 2 - $j) & 0xffff;
}
293
# Stage 3: Generate the file

##
## Write the initializer of a Java String constant to a filehandle.
##
## Arguments: the output filehandle, followed by the list of 16-bit values
## to emit.  The values are written through javaChar, eleven per line, as
## ' = "..."' followed by '\n + "..."' continuation lines.  The terminating
## ';' is left to the caller.  An empty list is written as ' = ""' so that
## the generated declaration is still valid Java.
##
sub printJavaString {
    my ($fh, @values) = @_;
    my $per_line = 11;

    if (!@values) {
        print {$fh} " = \"\"";
        return;
    }

    my $lead = " = \"";
    for (my $start = 0; $start < @values; $start += $per_line) {
        my $end = $start + $per_line - 1;
        $end = $#values if $end > $#values;
        my $text = join "", map { javaChar($_) } @values[$start .. $end];
        print {$fh} $lead . $text . "\"";
        $lead = "\n + \"";
    }
    return;
}

# NOTE(review): length($bestblkstr) counts bytes, two per DATA character, so
# the second check is stricter than the first -- confirm whether intended.
die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
    if @blocks > 0xffff / 3;
die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
    if length($bestblkstr) > 0xffff / 3;

print "Generating $ARGV[1] with shift of $bestshift";

open(my $out, '>', $ARGV[1]) or die "Failed creating output file: $!\n";
print {$out} <<EOF;
/* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
Copyright (C) 2002 Free Software Foundation, Inc.
* This file is generated by scripts/unicode-muncher.pl *

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING. If not, write to the
Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library. Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module. An independent module is a module which is not derived from
or based on this library. If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so. If you do not wish to do so, delete this
exception statement from your version. */

package gnu.java.lang;

/**
* This contains the info about the unicode characters, that
* java.lang.Character needs. It is generated automatically from
* <code>$ARGV[0]</code>, by some
* perl scripts. This Unicode definition file can be found on the
* <a href="http://www.unicode.org">http://www.unicode.org</a> website.
* JDK 1.4 uses Unicode version 3.0.0.
*
* The data is stored as string constants, but Character will convert these
* Strings to their respective <code>char[]</code> components. The field
* <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
* characters within <code>DATA</code>. The DATA field, in turn, stores
* information about each character in the low order bits, and an offset
* into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
* <code>NUM_VALUE</code>, and <code>DIRECTION</code>. Notice that the
* attribute tables are much smaller than 0xffff entries; as many characters
* in Unicode share common attributes. Finally, there is a listing for
* <code>TITLE</code> exceptions (most characters just have the same
* title case as upper case).
*
* \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
* Eric Blake)
* \@see Character
*/
public interface CharData
{
/**
* The Unicode definition file that was parsed to build this database.
*/
String SOURCE = \"$ARGV[0]\";

/**
* The character shift amount to look up the block offset. In other words,
* <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
* <code>ch</code> is described in <code>DATA</code>.
*/
int SHIFT = $bestshift;

/**
* The mapping of character blocks to their location in <code>DATA</code>.
* Each entry has been adjusted so that the 16-bit sum with the desired
* character gives the actual index into <code>DATA</code>.
*/
String BLOCKS
EOF

printJavaString($out, @blocks);

print {$out} <<EOF;
;

/**
* Information about each character. The low order 5 bits form the
* character type, the next bit is a flag for non-breaking spaces, and the
* next bit is a flag for mirrored directionality. The high order 9 bits
* form the offset into the attribute tables. Note that this limits the
* number of unique character attributes to 512, which is not a problem
* as of Unicode version 3.2.0, but may soon become one.
*/
String DATA
EOF

# The data string is a sequence of big-endian 16-bit words.
printJavaString($out, unpack("n*", $bestblkstr));

print {$out} <<EOF;
;

/**
* This is the attribute table for computing the numeric value of a
* character. The value is -1 if Unicode does not define a value, -2
* if the value is not a positive integer, otherwise it is the value.
* Note that this is a signed value, but stored as an unsigned char
* since this is a String literal.
*/
String NUM_VALUE
EOF

printJavaString($out, map { $_->[0] } @charinfo);

print {$out} <<EOF;
;

/**
* This is the attribute table for computing the uppercase representation
* of a character. The value is the signed difference between the
* character and its uppercase version. Note that this is stored as an
* unsigned char since this is a String literal.
*/
String UPPER
EOF

printJavaString($out, map { $_->[1] } @charinfo);

print {$out} <<EOF;
;

/**
* This is the attribute table for computing the lowercase representation
* of a character. The value is the signed difference between the
* character and its lowercase version. Note that this is stored as an
* unsigned char since this is a String literal.
*/
String LOWER
EOF

printJavaString($out, map { $_->[2] } @charinfo);

print {$out} <<EOF;
;

/**
* This is the attribute table for computing the directionality class
* of a character. At present, the value is in the range 0 - 18 if the
* character has a direction, otherwise it is -1. Note that this is
* stored as an unsigned char since this is a String literal.
*/
String DIRECTION
EOF

printJavaString($out, map { $_->[3] } @charinfo);

print {$out} <<EOF;
;

/**
* This is the listing of titlecase special cases (all other character
* can use <code>UPPER</code> to determine their titlecase). The listing
* is a sequence of character pairs; converting the first character of the
* pair to titlecase produces the second character.
*/
String TITLE
EOF

# Pairs of (character, titlecase character) as 16-bit words.
printJavaString($out, unpack("n*", $titlecase));

print {$out} ";\n}\n";
# Buffered write errors only surface at close, so check it.
close $out or die "Failed writing output file: $!\n";
print "\nDone.\n";

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/gcc/libjava/scripts/unicode-muncher.pl@ 2828

Download in other formats: