Context Navigation

Collate.pm

Visit:

Last change on this file was 3181, checked in by bird, 18 years ago
perl 5.8.8
File size: 51.5 KB

Line
1	package Unicode::Collate;
2
# Compile-time guard: refuse to load the module when pack('U', ...) does not
# turn code point 0x41 into the one-character string "A".
BEGIN {
    if ("A" ne pack('U', 0x41)) {
        die "Unicode::Collate cannot stringify a Unicode code point\n";
    }
}
8
9	use 5.006;
10	use strict;
11	use warnings;
12	use Carp;
13	use File::Spec;
14
15	no warnings 'utf8';
16
our $VERSION = '0.52';
our $PACKAGE = __PACKAGE__;

# Path components (below each @INC directory) and the default file name of
# the collation element table; both are used by read_table() and new().
my @Path = qw(Unicode Collate);
my $KeyFile = "allkeys.txt";

# Perl's boolean
use constant TRUE => 1;
use constant FALSE => "";
# Returned by index() in scalar context when nothing matches.
use constant NOMATCHPOS => -1;

# A coderef to get combining class imported from Unicode::Normalize
# (i.e. \&Unicode::Normalize::getCombinClass).
# This is also used as a HAS_UNICODE_NORMALIZE flag.
my $CVgetCombinClass;

# Supported Levels
use constant MinLevel => 1;
use constant MaxLevel => 4;

# Minimum weights at level 2 and 3, respectively
use constant Min2Wt => 0x20;
use constant Min3Wt => 0x02;

# Shifted weight at 4th level
use constant Shift4Wt => 0xFFFF;

# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
# (pack template: one unsigned char followed by four 16-bit big-endian values)
# PROBLEM: The Default Unicode Collation Element Table
# has weights over 0xFFFF at the 4th level.
# The tie-breaking in the variable weights
# other than "shift" (as well as "shift-trimmed") is unreliable.
use constant VCE_TEMPLATE => 'Cn4';

# A sort key: 16-bit weights
# See also the PROBLEM on VCE_TEMPLATE above.
use constant KEY_TEMPLATE => 'n*';

# Level separator in a sort key:
# i.e. pack(KEY_TEMPLATE, 0)
use constant LEVEL_SEP => "\0\0";

# As Unicode code point separator for hash keys.
# A joined code point string (denoted by JCPS below)
# like "65;768" is used for internal processing
# instead of Perl's Unicode string like "\x41\x{300}",
# as the native code point is different from the Unicode code point
# on EBCDIC platform.
# This character must not be included in any stringified
# representation of an integer.
use constant CODE_SEP => ';';

# boolean values of variable weights
use constant NON_VAR => 0; # Non-Variable character
use constant VAR => 1; # Variable character

# specific code points
# (Hangul_*Ini/Fin bound the ranges tested by getHST(); Hangul_*Base and
# the counts are the arithmetic inputs of _decompHangul().)
use constant Hangul_LBase => 0x1100;
use constant Hangul_LIni => 0x1100;
use constant Hangul_LFin => 0x1159;
use constant Hangul_LFill => 0x115F;
use constant Hangul_VBase => 0x1161;
use constant Hangul_VIni => 0x1160; # from Vowel Filler
use constant Hangul_VFin => 0x11A2;
use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
use constant Hangul_TIni => 0x11A8;
use constant Hangul_TFin => 0x11F9;
use constant Hangul_TCount => 28;
use constant Hangul_NCount => 588;
use constant Hangul_SBase => 0xAC00;
use constant Hangul_SIni => 0xAC00;
use constant Hangul_SFin => 0xD7A3;
# CJK_UidF41 is the upper bound used instead of CJK_UidFin when the UCA
# version is 14 or later (see _isUIdeo() and _derivCE_14()).
use constant CJK_UidIni => 0x4E00;
use constant CJK_UidFin => 0x9FA5;
use constant CJK_UidF41 => 0x9FBB;
use constant CJK_ExtAIni => 0x3400;
use constant CJK_ExtAFin => 0x4DB5;
use constant CJK_ExtBIni => 0x20000;
use constant CJK_ExtBFin => 0x2A6D6;
use constant BMP_Max => 0xFFFF;

# Logical_Order_Exception in PropList.txt
# (new() uses this list as the default 'rearrange' when UCA_Version <= 11.)
my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
100
# UCA tracking version that new() falls back to when none is given.
sub UCA_Version {
    return "14";
}
102
# Version string reported for the Unicode Standard this module is based on.
sub Base_Unicode_Version {
    return "4.1.0";
}
104
105	######
106
# Build a string from a list of Unicode code points (pack template 'U*').
sub pack_U {
    my @code_points = @_;
    return pack('U*', @code_points);
}
110
# Return the list of Unicode code points of a string.
#
# Both templates must be 'U*' (the '*' was lost in this copy of the file):
# with a bare 'U' only the first code point would be returned, which breaks
# splitEnt() for every string longer than one character.
# The string is concatenated with an empty pack('U*') result before
# unpacking, as in the original code
# (NOTE(review): presumably to force character semantics - confirm).
sub unpack_U {
    my $str = shift;
    return unpack('U*', $str . pack('U*'));
}
114
115	######
116
# Set of accepted names for the 'variable' parameter;
# checkCollator() croaks on any other (lowercased) name.
my (%VariableOK);
@VariableOK{ qw/
    blanked non-ignorable shifted shift-trimmed
/ } = (); # keys lowercased

# Attributes that change() is allowed to modify.
our @ChangeOK = qw/
    alternate backwards level normalization rearrange
    katakana_before_hiragana upper_before_lower
    overrideHangul overrideCJK preprocess UCA_Version
    hangul_terminator variable
/;

# Attributes that change() refuses to modify (it croaks).
our @ChangeNG = qw/
    entry mapping table maxlength
    ignoreChar ignoreName undefChar undefName variableTable
    versionTable alternateTable backwardsTable forwardsTable rearrangeTable
    derivCode normCode rearrangeHash
    backwardsFlag
/;
# The hash key 'ignored' is deleted at v 0.21.
# The hash key 'isShift' is deleted at v 0.23.
# The hash key 'combining' is deleted at v 0.24.
# The hash key 'entries' is deleted at v 0.30.
# The hash key 'L3_ignorable' is deleted at v 0.40.
141
# Return the version string read from the table's "@version" line,
# or 'unknown' when none was recorded.
# (Restores the '||' operator, which was backslash-escaped in this copy.)
sub version {
    my $self = shift;
    return $self->{versionTable} || 'unknown';
}
146
# Lookup sets built from @ChangeOK / @ChangeNG; only key existence is
# tested (see change()), so the values are left undefined.
my (%ChangeOK, %ChangeNG);
@ChangeOK{ @ChangeOK } = ();
@ChangeNG{ @ChangeNG } = ();
150
# Change attributes of an existing collator.
#   $collator->change(%attributes)
# 'alternate' is treated as the old name of 'variable': it is dropped when
# both are given, and copied to 'variable' when given alone.
# Keys listed in @ChangeOK are applied, keys listed in @ChangeNG make the
# call croak, any other key is silently ignored.  The collator is then
# re-validated through checkCollator().
# Returns the previous values of the changed keys in list context,
# the collator itself otherwise.
sub change {
    my ($self, %new) = @_;
    my %previous;

    if (exists $new{alternate}) {
        if (exists $new{variable}) {
            delete $new{alternate};
        }
        else {
            $new{variable} = $new{alternate};
        }
    }

    for my $key (keys %new) {
        if (exists $ChangeOK{$key}) {
            $previous{$key} = $self->{$key};
            $self->{$key}   = $new{$key};
            next;
        }
        # unknown keys fall through and are ignored
        croak "change of $key via change() is not allowed!"
            if exists $ChangeNG{$key};
    }

    $self->checkCollator();
    return wantarray ? %previous : $self;
}
174
# Croak unless $level lies within MinLevel..MaxLevel.
#   $level - the level number to validate
#   $key   - name of the parameter being validated ('level' or 'backwards'),
#            used only in the error message
sub _checkLevel {
    my ($level, $key) = @_;

    unless (MinLevel <= $level) {
        croak sprintf
            "Illegal level %d (in value for key '%s') lower than %d.",
            $level, $key, MinLevel;
    }
    unless ($level <= MaxLevel) {
        croak sprintf
            "Unsupported level %d (in value for key '%s') higher than %d.",
            $level, $key, MaxLevel;
    }
    return 1;
}
185
# Dispatch table: UCA version => routine that derives collation elements
# for code points missing from the table.  checkCollator() looks the
# collator's UCA_Version up here and croaks when there is no entry.
my %DerivCode = (
    8 => \&_derivCE_8,
    9 => \&_derivCE_9,
    11 => \&_derivCE_9, # 11 == 9
    14 => \&_derivCE_14,
);
192
# Validate the collator's settings and (re)compute the internal fields
# derived from them: derivCode, variable/alternate, backwardsFlag,
# rearrangeHash and normCode.  Called by new() and change().
# Croaks on an unsupported level, UCA version, variable-weighting name,
# non-reference 'rearrange' value, or unknown normalization form, and when
# normalization is requested but Unicode::Normalize cannot be loaded.
# (Restores the '||' / '||=' operators, which were backslash-escaped in
# this copy of the file.)
sub checkCollator {
    my $self = shift;
    _checkLevel($self->{level}, "level");

    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
        or croak "Illegal UCA version (passed $self->{UCA_Version}).";

    # Precedence: explicit 'variable', old name 'alternate', then the
    # values read from the table, finally the default 'shifted'.
    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
        $self->{alternateTable} || 'shifted';
    $self->{variable} = $self->{alternate} = lc($self->{variable});
    exists $VariableOK{ $self->{variable} }
        or croak "$PACKAGE unknown variable parameter name: $self->{variable}";

    # backwardsFlag is a bit mask with bit N set for each backwards level N.
    if (! defined $self->{backwards}) {
        $self->{backwardsFlag} = 0;
    }
    elsif (! ref $self->{backwards}) {
        _checkLevel($self->{backwards}, "backwards");
        $self->{backwardsFlag} = 1 << $self->{backwards};
    }
    else {
        my %level;    # de-duplicates the requested levels
        $self->{backwardsFlag} = 0;
        for my $lv (@{ $self->{backwards} }) {
            _checkLevel($lv, "backwards");
            $level{$lv} = 1;
        }
        for my $v (sort keys %level) {
            $self->{backwardsFlag} += 1 << $v;
        }
    }

    defined $self->{rearrange} or $self->{rearrange} = [];
    ref $self->{rearrange}
        or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";

    # keys of $self->{rearrangeHash} are $self->{rearrange}.
    $self->{rearrangeHash} = undef;

    if (@{ $self->{rearrange} }) {
        @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
    }

    $self->{normCode} = undef;

    if (defined $self->{normalization}) {
        eval { require Unicode::Normalize };
        $@ and croak "Unicode::Normalize is required to normalize strings";

        $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;

        if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
            $self->{normCode} = \&Unicode::Normalize::NFD;
        }
        elsif ($self->{normalization} ne 'prenormalized') {
            my $norm = $self->{normalization};
            $self->{normCode} = sub {
                Unicode::Normalize::normalize($norm, shift);
            };
            eval { $self->{normCode}->("") }; # try
            $@ and croak "$PACKAGE unknown normalization form name: $norm";
        }
    }
    return;
}
258
# Constructor.
#   Unicode::Collate->new(%tailoring)
# Reads the collation element table named by 'table' (default: $KeyFile;
# no file is read when 'table' is explicitly undef), applies the lines of
# 'entry', fills in defaults for the remaining attributes, and validates
# everything through checkCollator(), which croaks on bad settings.
# Returns the new collator object.
# (Restores the '||' / '||=' operators, which were backslash-escaped in
# this copy of the file.)
sub new
{
    my $class = shift;
    my $self = bless { @_ }, $class;

    # If undef is passed explicitly, no file is read.
    $self->{table} = $KeyFile if ! exists $self->{table};
    $self->read_table() if defined $self->{table};

    if ($self->{entry}) {
        while ($self->{entry} =~ /([^\n]+)/g) {
            $self->parseEntry($1);
        }
    }

    $self->{level} ||= MaxLevel;
    $self->{UCA_Version} ||= UCA_Version();

    $self->{overrideHangul} = FALSE
        if ! exists $self->{overrideHangul};
    $self->{overrideCJK} = FALSE
        if ! exists $self->{overrideCJK};
    $self->{normalization} = 'NFD'
        if ! exists $self->{normalization};
    # Default rearrangement: the table's list if any, otherwise the
    # built-in list for UCA versions up to 11 and nothing after that.
    $self->{rearrange} = $self->{rearrangeTable} ||
        ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
        if ! exists $self->{rearrange};
    $self->{backwards} = $self->{backwardsTable}
        if ! exists $self->{backwards};

    $self->checkCollator();

    return $self;
}
293
# Locate the table file $self->{table} under <dir>/Unicode/Collate for each
# <dir> in @INC, and load it: ordinary lines go to parseEntry(), lines
# starting with '@' set versionTable, variableTable, alternateTable,
# backwardsTable, forwardsTable or rearrangeTable.  Lines starting with '#'
# are skipped.  Croaks when the file cannot be opened in any @INC directory.
#
# Changes against the damaged copy of this routine:
#  * '||=' operators restored (they were backslash-escaped);
#  * the '@version' and '@rearrange' patterns capture the whole value again
#    (their '*' quantifiers were lost, leaving a single captured character);
#  * the file is opened with three-argument open, so characters in the
#    table name can no longer be interpreted as an open mode.
sub read_table {
    my $self = shift;

    my($f, $fh);
    foreach my $d (@INC) {
        $f = File::Spec->catfile($d, @Path, $self->{table});
        last if open($fh, '<', $f);
        $f = undef;
    }
    if (!defined $f) {
        $f = File::Spec->catfile(@Path, $self->{table});
        croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
    }

    while (my $line = <$fh>) {
        next if $line =~ /^\s*#/;
        unless ($line =~ s/^\s*\@//) {
            $self->parseEntry($line);
            next;
        }

        # matched ^\s*\@
        if ($line =~ /^version\s*(\S*)/) {
            $self->{versionTable} ||= $1;
        }
        elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
            $self->{variableTable} ||= $1;
        }
        elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
            $self->{alternateTable} ||= $1;
        }
        elsif ($line =~ /^backwards\s+(\S*)/) {
            push @{ $self->{backwardsTable} }, $1;
        }
        elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
            push @{ $self->{forwardsTable} }, $1;
        }
        elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
            push @{ $self->{rearrangeTable} }, _getHexArray($1);
        }
    }
    close $fh;
}
337
338
##
## get $line, parse it, and write an entry in $self
##
## A line has the form "<charList> ; <collElement>... # name".
## The parsed entry is stored in $self->{mapping} under the JCPS of the
## code points, and $self->{maxlength} records, per first code point, the
## length of the longest multi-code-point entry starting with it.
## Lines not starting with a hex digit are ignored; entries matching
## undefName / undefChar are dropped; entries matching ignoreName /
## ignoreChar are made completely ignorable.
## Croaks when no ';'-separated collation element part is present.
##
## Changes against the damaged copy of this routine: '||' operators
## restored (they were backslash-escaped), and the '*' characters lost
## from the comment-stripping and variable-marker patterns put back.
##
sub parseEntry
{
    my $self = shift;
    my $line = shift;
    my($name, $entry, @uv, @key);

    return if $line !~ /^\s*[0-9A-Fa-f]/;

    # removes comment and gets name
    $name = $1
        if $line =~ s/[#%]\s*(.*)//;
    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

    # gets element
    my($e, $k) = split /;/, $line;
    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
        if ! $k;

    @uv = _getHexArray($e);
    return if !@uv;

    $entry = join(CODE_SEP, @uv); # in JCPS

    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
        my $ele = pack_U(@uv);

        # regarded as if it were not entried in the table
        return
            if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

        # replaced as completely ignorable
        $k = '[.0000.0000.0000.0000]'
            if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
    }

    # replaced as completely ignorable
    $k = '[.0000.0000.0000.0000]'
        if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;

    my $is_L3_ignorable = TRUE;

    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
        my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
        my @wt = _getHexArray($arr);
        push @key, pack(VCE_TEMPLATE, $var, @wt);
        $is_L3_ignorable = FALSE
            if $wt[0] || $wt[1] || $wt[2];
        # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
        # is completely ignorable.
        # For expansion, an entry $is_L3_ignorable
        # if and only if "all" CEs are [.0000.0000.0000].
    }

    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;

    if (@uv > 1) {
        (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
            and $self->{maxlength}{$uv[0]} = @uv;
    }
}
402
403
##
## VCE = _varCE(variable term, VCE)
##
## Re-weights one packed collation element according to the variable
## weighting name ('non-ignorable', 'blanked', 'shifted', 'shift-trimmed').
##
sub _varCE
{
    my ($vbl, $vce) = @_;

    # 'non-ignorable' leaves every element untouched
    return $vce if $vbl eq 'non-ignorable';

    my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
    my $blanked = $vbl eq 'blanked';

    if ($var) {
        # variable element: levels 1-3 are zeroed; the 4th weight is kept
        # for 'blanked', otherwise it becomes the old primary weight
        my $fourth = $blanked ? $wt[3] : $wt[0];
        return pack(VCE_TEMPLATE, $var, 0, 0, 0, $fourth);
    }

    # non-variable element under 'blanked' is unchanged
    return $vce if $blanked;

    # non-variable element under 'shifted' / 'shift-trimmed'
    my $fourth = ($vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2]) ? Shift4Wt : 0;
    return pack(VCE_TEMPLATE, $var, @wt[0..2], $fourth);
}
428
# Return the printable form (see visualizeSortKey()) of the sort key that
# getSortKey() computes for the given string.
sub viewSortKey
{
    my ($self, @args) = @_;
    my $key = $self->getSortKey(@args);
    return $self->visualizeSortKey($key);
}
434
# Return a printable form of a binary sort key: the 16-bit weights as
# 4-digit upper-case hex numbers separated by spaces, with each zero weight
# (the level separator) shown as '|', all enclosed in square brackets.
# For UCA_Version <= 8 the spaces around the separator are removed too.
# (Restores the '|' replacement text, which was backslash-escaped in this
# copy of the file.)
sub visualizeSortKey
{
    my $self = shift;
    my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);

    if ($self->{UCA_Version} <= 8) {
        $view =~ s/ ?0000 ?/|/g;
    } else {
        $view =~ s/\b0000\b/|/g;
    }
    return "[$view]";
}
447
448
##
## arrayref of JCPS = splitEnt(string to be collated)
## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
##
## Splits a string into collation entries (single code points or
## contractions found in the table).  With a true second argument the
## character positions of each entry are returned as well; in that mode
## preprocess and normalization are not allowed (croaks) and no
## rearrangement is done, because they would invalidate the positions.
##
## (Restores the '||' operator, which was backslash-escaped in this copy.)
##
sub splitEnt
{
    my $self = shift;
    my $wLen = $_[1];

    my $code = $self->{preprocess};
    my $norm = $self->{normCode};
    my $map = $self->{mapping};
    my $max = $self->{maxlength};
    my $reH = $self->{rearrangeHash};
    my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;

    my ($str, @buf);

    if ($wLen) {
        $code and croak "Preprocess breaks character positions. "
            . "Don't use with index(), match(), etc.";
        $norm and croak "Normalization breaks character positions. "
            . "Don't use with index(), match(), etc.";
        $str = $_[0];
    }
    else {
        $str = $_[0];
        $str = $code->($str) if ref $code;
        $str = $norm->($str) if ref $norm;
    }

    # get array of Unicode code point of string.
    my @src = unpack_U($str);

    # rearrangement:
    # Character positions are not kept if rearranged,
    # then neglected if $wLen is true.
    if ($reH && ! $wLen) {
        for (my $i = 0; $i < @src; $i++) {
            if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
                ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
                $i++;
            }
        }
    }

    # remove a code point marked as a completely ignorable.
    for (my $i = 0; $i < @src; $i++) {
        $src[$i] = undef
            if _isIllegal($src[$i]) || ($ver9 &&
                $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
    }

    for (my $i = 0; $i < @src; $i++) {
        my $jcps = $src[$i];

        # skip removed code point
        if (! defined $jcps) {
            if ($wLen && @buf) {
                $buf[-1][2] = $i + 1;
            }
            next;
        }

        my $i_orig = $i;

        # find contraction
        if ($max->{$jcps}) {
            my $temp_jcps = $jcps;
            my $jcpsLen = 1;
            my $maxLen = $max->{$jcps};

            # longest contiguous match wins: $i advances to its last element
            for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
                next if ! defined $src[$p];
                $temp_jcps .= CODE_SEP . $src[$p];
                $jcpsLen++;
                if ($map->{$temp_jcps}) {
                    $jcps = $temp_jcps;
                    $i = $p;
                }
            }

            # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
            # This process requires Unicode::Normalize.
            # If "normalization" is undef, here should be skipped always
            # (in spite of bool value of $CVgetCombinClass),
            # since canonical ordering cannot be expected.
            # Blocked combining character should not be contracted.

            if ($self->{normalization})
            # $self->{normCode} is false in the case of "prenormalized".
            {
                my $preCC = 0;
                my $curCC = 0;

                for (my $p = $i + 1; $p < @src; $p++) {
                    next if ! defined $src[$p];
                    $curCC = $CVgetCombinClass->($src[$p]);
                    last unless $curCC;
                    my $tail = CODE_SEP . $src[$p];
                    if ($preCC != $curCC && $map->{$jcps.$tail}) {
                        $jcps .= $tail;
                        $src[$p] = undef;    # consumed by the contraction
                    } else {
                        $preCC = $curCC;
                    }
                }
            }
        }

        # skip completely ignorable
        if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
            if ($wLen && @buf) {
                $buf[-1][2] = $i + 1;
            }
            next;
        }

        push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
    }
    return \@buf;
}
571
572
##
## list of VCE = getWt(JCPS)
##
## Returns the packed collation elements for one entry, already adjusted
## for the collator's variable weighting by _varCE().  Entries found in
## the table are used as they are; otherwise weights are produced for
## Hangul syllables, CJK unified ideographs and all other code points,
## honouring overrideHangul / overrideCJK.
## Returns an empty list for an undefined argument.
##
sub getWt
{
    my $self = shift;
    my $u = shift;
    my $vbl = $self->{variable};
    my $map = $self->{mapping};
    my $der = $self->{derivCode};

    return if !defined $u;
    return map(_varCE($vbl, $_), @{ $map->{$u} })
        if $map->{$u};

    # JCPS must not be a contraction, then it's a code point.
    if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
        my $hang = $self->{overrideHangul};
        my @hangulCE;
        if ($hang) {
            # true value: the user's coderef supplies the weights
            @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
        }
        elsif (!defined $hang) {
            # undef: use the derived collation elements
            @hangulCE = $der->($u);
        }
        else {
            # defined but false: decompose into jamo and look those up,
            # preferring contractions of the jamo listed in the table
            my $max = $self->{maxlength};
            my @decH = _decompHangul($u);

            if (@decH == 2) {
                my $contract = join(CODE_SEP, @decH);
                @decH = ($contract) if $map->{$contract};
            } else { # must be <@decH == 3>
                if ($max->{$decH[0]}) {
                    my $contract = join(CODE_SEP, @decH);
                    if ($map->{$contract}) {
                        @decH = ($contract);
                    } else {
                        $contract = join(CODE_SEP, @decH[0,1]);
                        $map->{$contract} and @decH = ($contract, $decH[2]);
                    }
                    # even if V's ignorable, LT contraction is not supported.
                    # If such a situation were required, NFD should be used.
                }
                if (@decH == 3 && $max->{$decH[1]}) {
                    my $contract = join(CODE_SEP, @decH[1,2]);
                    $map->{$contract} and @decH = ($decH[0], $contract);
                }
            }

            @hangulCE = map({
                $map->{$_} ? @{ $map->{$_} } : $der->($_);
            } @decH);
        }
        return map _varCE($vbl, $_), @hangulCE;
    }
    elsif (_isUIdeo($u, $self->{UCA_Version})) {
        my $cjk = $self->{overrideCJK};
        # true value: user's coderef; defined-but-false with UCA <= 8 and a
        # BMP code point: _uideoCE_8(); otherwise the derived elements
        return map _varCE($vbl, $_),
            $cjk
                ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
                : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
                    ? _uideoCE_8($u)
                    : $der->($u);
    }
    else {
        return map _varCE($vbl, $_), $der->($u);
    }
}
642
643
##
## string sortkey = getSortKey(string arg)
##
## Builds the binary sort key: the non-zero weights of each level up to
## $self->{level}, packed with KEY_TEMPLATE, with the four levels joined
## by LEVEL_SEP.  Applies hangul_terminator, upper_before_lower,
## katakana_before_hiragana and backwards settings.
##
## (Restores the '||' operators of the Hangul terminator test, which were
## backslash-escaped in this copy of the file.)
##
sub getSortKey
{
    my $self = shift;
    my $lev = $self->{level};
    my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
    my $v2i = $self->{UCA_Version} >= 9 &&
        $self->{variable} ne 'non-ignorable';

    my @buf; # weight arrays
    if ($self->{hangul_terminator}) {
        my $preHST = '';
        foreach my $jcps (@$rEnt) {
            # weird things like VL, TL-contraction are not considered!
            my $curHST = '';
            foreach my $u (split /;/, $jcps) {
                $curHST .= getHST($u);
            }
            if ($preHST && !$curHST || # hangul before non-hangul
                $preHST =~ /L\z/ && $curHST =~ /^T/ ||
                $preHST =~ /V\z/ && $curHST =~ /^L/ ||
                $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {

                push @buf, $self->getWtHangulTerm();
            }
            $preHST = $curHST;

            push @buf, $self->getWt($jcps);
        }
        $preHST # end at hangul
            and push @buf, $self->getWtHangulTerm();
    }
    else {
        foreach my $jcps (@$rEnt) {
            push @buf, $self->getWt($jcps);
        }
    }

    # make sort key
    my @ret = ([],[],[],[]);
    my $last_is_variable;

    foreach my $vwt (@buf) {
        my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);

        # "Ignorable (L1, L2) after Variable" since track. v. 9
        if ($v2i) {
            if ($var) {
                $last_is_variable = TRUE;
            }
            elsif (!$wt[0]) { # ignorable
                next if $last_is_variable;
            }
            else {
                $last_is_variable = FALSE;
            }
        }
        foreach my $v (0..$lev-1) {
            0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
        }
    }

    # modification of tertiary weights
    # ($w aliases the array element, so the assignments change @ret in place)
    if ($self->{upper_before_lower}) {
        foreach my $w (@{ $ret[2] }) {
            if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
            elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
            elsif ($w == 0x1C) { $w += 1 } # square upper
            elsif ($w == 0x1D) { $w -= 1 } # square lower
        }
    }
    if ($self->{katakana_before_hiragana}) {
        foreach my $w (@{ $ret[2] }) {
            if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
            elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
        }
    }

    if ($self->{backwardsFlag}) {
        for (my $v = MinLevel; $v <= MaxLevel; $v++) {
            if ($self->{backwardsFlag} & (1 << $v)) {
                @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
            }
        }
    }

    join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
}
734
735
##
## int compare = cmp(string a, string b)
## bool result = eq/ne/lt/le/gt/ge(string a, string b)
##
## Each method applies the string operator of the same name to the
## sort keys of its two arguments.
##
sub cmp {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) cmp $self->getSortKey($rhs);
}
sub eq {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) eq $self->getSortKey($rhs);
}
sub ne {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) ne $self->getSortKey($rhs);
}
sub lt {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) lt $self->getSortKey($rhs);
}
sub le {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) le $self->getSortKey($rhs);
}
sub gt {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) gt $self->getSortKey($rhs);
}
sub ge {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) ge $self->getSortKey($rhs);
}
746
##
## list[strings] sorted = sort(list[strings] arg)
##
## Computes the sort key of every string once, orders the strings by
## those keys, and returns the strings.
##
sub sort {
    my ($obj, @strings) = @_;
    my @keyed   = map { [ $obj->getSortKey($_), $_ ] } @strings;
    my @ordered = sort { $a->[0] cmp $b->[0] } @keyed;
    return map { $_->[1] } @ordered;
}
757
758
# Derive the two packed collation elements of a code point that has no
# table entry (used for UCA_Version 14, see %DerivCode).
# The primary weight of the first element is a base chosen by range
# (0xFB40 for CJK_UidIni..CJK_UidF41, 0xFB80 for CJK extension A and B,
# 0xFBC0 otherwise) plus the code point's bits above bit 14; the primary
# weight of the second element is the low 15 bits with bit 15 set.
# Both elements carry the code point as 4th weight.
# (Restores the '||' and '|' operators, which were backslash-escaped in
# this copy of the file.)
sub _derivCE_14 {
    my $u = shift;
    my $base =
        (CJK_UidIni <= $u && $u <= CJK_UidF41)
            ? 0xFB40 : # CJK
        (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
         CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
            ? 0xFB80 # CJK ext.
            : 0xFBC0; # others

    my $aaaa = $base + ($u >> 15);
    my $bbbb = ($u & 0x7FFF) | 0x8000;
    return
        pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
        pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
}
775
# Derive the two packed collation elements of a code point that has no
# table entry (used for UCA_Version 9 and 11, see %DerivCode).
# The primary weight of the first element is a base chosen by range
# (0xFB40 for CJK_UidIni..CJK_UidFin, 0xFB80 for CJK extension A and B,
# 0xFBC0 otherwise) plus the code point's bits above bit 14; the primary
# weight of the second element is the low 15 bits with bit 15 set.
# Both elements carry the code point as 4th weight.
# (Restores the '||' and '|' operators, which were backslash-escaped in
# this copy of the file.)
sub _derivCE_9 {
    my $u = shift;
    my $base =
        (CJK_UidIni <= $u && $u <= CJK_UidFin)
            ? 0xFB40 : # CJK
        (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
         CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
            ? 0xFB80 # CJK ext.
            : 0xFBC0; # others

    my $aaaa = $base + ($u >> 15);
    my $bbbb = ($u & 0x7FFF) | 0x8000;
    return
        pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
        pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
}
792
# Derive the two packed collation elements of a code point that has no
# table entry (used for UCA_Version 8, see %DerivCode).
# First element: primary 0xFF80 plus the code point's bits above bit 14,
# secondary 2, tertiary 1.  Second element: primary is the low 15 bits
# with bit 15 set, secondary and tertiary 0.  Both carry the code point
# as 4th weight.
# (Restores the '|' operator, which was backslash-escaped in this copy.)
sub _derivCE_8 {
    my $code = shift;
    my $aaaa = 0xFF80 + ($code >> 15);
    my $bbbb = ($code & 0x7FFF) | 0x8000;
    return
        pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
        pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
}
801
# Single packed collation element for an ideograph under UCA_Version 8
# (see getWt()): the code point is used both as primary and as 4th
# weight, with the minimum secondary and tertiary weights.
sub _uideoCE_8 {
    my ($code_point) = @_;
    my @weights = ($code_point, Min2Wt, Min3Wt, $code_point);
    return pack(VCE_TEMPLATE, NON_VAR, @weights);
}
806
# True when $u lies in one of the CJK unified ideograph ranges:
# CJK_UidIni up to CJK_UidF41 (UCA version >= 14) or CJK_UidFin (older
# versions), CJK extension A, or CJK extension B.
#   $u        - code point
#   $uca_vers - UCA version of the collator
# (Restores the '||' operators, which were backslash-escaped in this copy.)
sub _isUIdeo {
    my ($u, $uca_vers) = @_;
    return(
        (CJK_UidIni <= $u &&
            ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
                ||
        (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
                ||
        (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
    );
}
818
819
# Packed collation element for the Hangul terminator: a non-variable
# element whose primary weight is $self->{hangul_terminator} and whose
# other weights are zero, adjusted by _varCE() for the collator's
# variable weighting.
sub getWtHangulTerm {
    my $self = shift;
    my $terminator =
        pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0);
    return _varCE($self->{variable}, $terminator);
}
825
826
##
## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
##
## Every maximal run of hex digits in the argument is converted to a
## number; any other characters act as separators.
##
sub _getHexArray {
    my @digits = $_[0] =~ /([0-9a-fA-F]+)/g;
    return map { hex($_) } @digits;
}
831
#
# Decompose a Hangul syllable into its jamo code points.
# $code must be in Hangul syllable.
# Check it before you enter here.
# Returns (L, V) or, when the syllable has a trailing consonant, (L, V, T).
#
sub _decompHangul {
    my ($code) = @_;
    my $s_index = $code - Hangul_SBase;
    my $t_index = $s_index % Hangul_TCount;

    my @jamo = (
        Hangul_LBase + int($s_index / Hangul_NCount),
        Hangul_VBase + int(($s_index % Hangul_NCount) / Hangul_TCount),
    );
    push @jamo, Hangul_TBase + $t_index if $t_index;
    return @jamo;
}
848
# True when $code must be removed before collation: it is undefined
# (already removed), outside 0..0x10FFFF, a code point whose low 16 bits
# are FFFE or FFFF, in the surrogate range D800..DFFF, or in FDD0..FDEF.
# (Restores the '||' operators, which were backslash-escaped in this copy.)
sub _isIllegal {
    my $code = shift;
    return ! defined $code # removed
        || ($code < 0 || 0x10FFFF < $code) # out of range
        || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
        || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
        || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
    ;
}
858
# Hangul Syllable Type
# Returns "L", "V" or "T" for a jamo in the corresponding range (the
# leading filler Hangul_LFill counts as "L"), "LVT" or "LV" for a
# precomposed syllable with or without a trailing consonant, and ""
# for any other code point.
# (Restores the '||' operator, which was backslash-escaped in this copy.)
sub getHST {
    my $u = shift;
    return
        Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
        Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
        Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
        Hangul_SIni <= $u && $u <= Hangul_SFin ?
            ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
}
869
870
##
## bool _nonIgnorAtLevel(arrayref weights, int level)
##
## TRUE when any weight from level MinLevel up to the given level is
## non-zero, FALSE otherwise; returns nothing when weights is undefined.
##
sub _nonIgnorAtLevel($$)
{
    my ($weights, $level) = @_;
    return unless defined $weights;

    my @nonzero = grep { $weights->[$_ - 1] != 0 } MinLevel .. $level;
    return @nonzero ? TRUE : FALSE;
}
881
##
## bool _eqArray(
## arrayref of arrayref[weights] source,
## arrayref of arrayref[weights] substr,
## int level)
## * comparison of graphemes vs graphemes.
## @$source >= @$substr must be true (check it before call this);
##
## Returns 1 when, for every grapheme of substr, the grapheme at the same
## index of source has the same number of weight arrays and identical
## weights on the first <level> levels; returns nothing otherwise.
##
sub _eqArray($$$)
{
    my ($source, $substr, $lev) = @_;

    for my $g (0 .. $#{$substr}) {
        my $src_wts = $source->[$g];
        my $sub_wts = $substr->[$g];

        # graphemes must consist of the same number of weight arrays
        return unless @$src_wts == @$sub_wts;

        for my $w (0 .. $#{$sub_wts}) {
            for my $v (0 .. $lev - 1) {
                return unless $src_wts->[$w][$v] == $sub_wts->[$w][$v];
            }
        }
    }
    return 1;
}
908
##
## (int position, int length)
## int position = index(string, substring, position, [undoc'ed grobal])
##
## With "grobal" (only for the list context),
## returns list of arrayref[position, length].
##
## Searches string, starting at position, for a part that collates equal
## to substring at the collator's level.  When nothing matches, returns
## NOMATCHPOS in scalar context and an empty list in list context.
## splitEnt() croaks if preprocess or normalization is active.
##
## (Restores the '||' operator in the match loop, which was
## backslash-escaped in this copy of the file.)
##
sub index
{
    my $self = shift;
    my $str = shift;
    my $len = length($str);
    my $subE = $self->splitEnt(shift);
    my $pos = @_ ? shift : 0;
    $pos = 0 if $pos < 0;
    my $grob = shift;

    my $lev = $self->{level};
    my $v2i = $self->{UCA_Version} >= 9 &&
        $self->{variable} ne 'non-ignorable';

    # an empty (or entirely ignorable) substring matches with length 0
    if (! @$subE) {
        my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
        return $grob
            ? map([$_, 0], $temp..$len)
            : wantarray ? ($temp,0) : $temp;
    }
    $len < $pos
        and return wantarray ? () : NOMATCHPOS;
    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
    @$strE
        or return wantarray ? () : NOMATCHPOS;

    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);

    # weights of the substring, grouped into graphemes
    my $last_is_variable;
    for my $vwt (map $self->getWt($_), @$subE) {
        my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
        my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

        # "Ignorable (L1, L2) after Variable" since track. v. 9
        if ($v2i) {
            if ($var) {
                $last_is_variable = TRUE;
            }
            elsif (!$wt[0]) { # ignorable
                $to_be_pushed = FALSE if $last_is_variable;
            }
            else {
                $last_is_variable = FALSE;
            }
        }

        if (@subWt && !$var && !$wt[0]) {
            push @{ $subWt[-1] }, \@wt if $to_be_pushed;
        } else {
            push @subWt, [ \@wt ];
        }
    }

    my $count = 0;
    my $end = @$strE - 1;

    $last_is_variable = FALSE; # reuse
    for (my $i = 0; $i <= $end; ) { # no $i++
        my $found_base = 0;

        # fetch a grapheme
        while ($i <= $end && $found_base == 0) {
            for my $vwt ($self->getWt($strE->[$i][0])) {
                my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
                my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

                # "Ignorable (L1, L2) after Variable" since track. v. 9
                if ($v2i) {
                    if ($var) {
                        $last_is_variable = TRUE;
                    }
                    elsif (!$wt[0]) { # ignorable
                        $to_be_pushed = FALSE if $last_is_variable;
                    }
                    else {
                        $last_is_variable = FALSE;
                    }
                }

                if (@strWt && !$var && !$wt[0]) {
                    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
                    $finPos[-1] = $strE->[$i][2];
                } elsif ($to_be_pushed) {
                    push @strWt, [ \@wt ];
                    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
                    $finPos[-1] = NOMATCHPOS if $found_base;
                    push @finPos, $strE->[$i][2];
                    $found_base++;
                }
                # else ===> no-op
            }
            $i++;
        }

        # try to match
        while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
            if ($iniPos[0] != NOMATCHPOS &&
                $finPos[$#subWt] != NOMATCHPOS &&
                _eqArray(\@strWt, \@subWt, $lev)) {
                my $temp = $iniPos[0] + $pos;

                if ($grob) {
                    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
                    splice @strWt, 0, $#subWt;
                    splice @iniPos, 0, $#subWt;
                    splice @finPos, 0, $#subWt;
                }
                else {
                    return wantarray
                        ? ($temp, $finPos[$#subWt] - $iniPos[0])
                        : $temp;
                }
            }
            shift @strWt;
            shift @iniPos;
            shift @finPos;
        }
    }

    return $grob
        ? @g_ret
        : wantarray ? () : NOMATCHPOS;
}
1039
##
## scalarref to matching part = match(string, substring)
##
## In list context the matching part itself is returned instead of a
## reference to it.  Returns nothing when there is no match.
##
sub match
{
    my $self = shift;
    my @hit = $self->index($_[0], $_[1]);
    return unless @hit;

    my ($pos, $len) = @hit;
    # A copy is taken on purpose: an lvalue ref \substr should be avoided,
    # since its value is affected by modification of its referent.
    my $matched = substr($_[0], $pos, $len);
    return wantarray ? $matched : \$matched;
}
1056
##
## arrayref matching parts = gmatch(string, substring)
##
## Returns the list of all parts of string found by index() in its
## global mode.
##
sub gmatch
{
    my ($self, $str, $sub) = @_;
    my @spans = $self->index($str, $sub, 0, 'g');
    return map { substr($str, $_->[0], $_->[1]) } @spans;
}
1068
##
## bool subst'ed = subst(string, substring, replace)
##
## Replaces the first matching part of string in place (the caller's
## variable is modified through @_).  When replace is a code reference
## it is called with the matching part and its result is used as the
## replacement.  Returns TRUE when a replacement was made, else FALSE.
##
sub subst
{
    my $self = shift;
    my $use_code = ref $_[2] eq 'CODE';

    my @hit = $self->index($_[0], $_[1]);
    return FALSE unless @hit;

    my ($pos, $len) = @hit;
    my $replacement;
    if ($use_code) {
        my $mat = substr($_[0], $pos, $len);    # pass a copy to the callback
        $replacement = $_[2]->($mat);
    }
    else {
        $replacement = $_[2];
    }
    substr($_[0], $pos, $len, $replacement);
    return TRUE;
}
1090
##
## int count = gsubst(string, substring, replace)
##
## Replaces every matching part of string in place (the caller's variable
## is modified through @_) and returns the number of replacements.
## When replace is a code reference it is called with each matching part
## and its result is used as the replacement.
##
sub gsubst
{
    my $self = shift;
    my $use_code = ref $_[2] eq 'CODE';

    # Replacement is carried out from the end, so that the positions of
    # the matches not yet processed stay valid.
    my @spans = reverse $self->index($_[0], $_[1], 0, 'g');

    my $cnt = 0;
    for my $span (@spans) {
        my ($pos, $len) = @$span;
        my $replacement;
        if ($use_code) {
            my $mat = substr($_[0], $pos, $len);    # pass a copy to the callback
            $replacement = $_[2]->($mat);
        }
        else {
            $replacement = $_[2];
        }
        substr($_[0], $pos, $len, $replacement);
        $cnt++;
    }
    return $cnt;
}
1112
1113	1;
1114	__END__
1115
1116	=head1 NAME
1117
1118	Unicode::Collate - Unicode Collation Algorithm
1119
1120	=head1 SYNOPSIS
1121
1122	use Unicode::Collate;
1123
1124	#construct
1125	$Collator = Unicode::Collate->new(%tailoring);
1126
1127	#sort
1128	@sorted = $Collator->sort(@not_sorted);
1129
1130	#compare
1131	$result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1132
1133	# If %tailoring is false (i.e. empty),
1134	# $Collator should do the default collation.
1135
1136	=head1 DESCRIPTION
1137
1138	This module is an implementation of Unicode Technical Standard #10
1139	(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1140
1141	=head2 Constructor and Tailoring
1142
1143	The C<new> method returns a collator object.
1144
1145	$Collator = Unicode::Collate->new(
1146	UCA_Version => $UCA_Version,
1147	alternate => $alternate, # deprecated: use of 'variable' is recommended.
1148	backwards => $levelNumber, # or \@levelNumbers
1149	entry => $element,
1150	hangul_terminator => $term_primary_weight,
1151	ignoreName => qr/$ignoreName/,
1152	ignoreChar => qr/$ignoreChar/,
1153	katakana_before_hiragana => $bool,
1154	level => $collationLevel,
1155	normalization => $normalization_form,
1156	overrideCJK => \&overrideCJK,
1157	overrideHangul => \&overrideHangul,
1158	preprocess => \&preprocess,
1159	rearrange => \@charList,
1160	table => $filename,
1161	undefName => qr/$undefName/,
1162	undefChar => qr/$undefChar/,
1163	upper_before_lower => $bool,
1164	variable => $variable,
1165	);
1166
1167	=over 4
1168
1169	=item UCA_Version
1170
1171	If the tracking version number of UCA is given,
1172	behavior of that tracking version is emulated on collating.
1173	If omitted, the return value of C<UCA_Version()> is used.
1174	C<UCA_Version()> should return the latest tracking version supported.
1175
1176	The supported tracking version: 8, 9, 11, or 14.
1177
1178	UCA Unicode Standard DUCET (@version)
1179	---------------------------------------------------
1180	8 3.1 3.0.1 (3.0.1d9)
1181	9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1182	11 4.0 4.0.0 (4.0.0)
1183	14 4.1.0 4.1.0 (4.1.0)
1184
1185	Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1186
1187	=item alternate
1188
1189	-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1190
1191	For backward compatibility, C<alternate> (old name) can be used
1192	as an alias for C<variable>.
1193
1194	=item backwards
1195
1196	-- see 3.1.2 French Accents, UTS #10.
1197
1198	backwards => $levelNumber or \@levelNumbers
1199
1200	Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1201	If omitted, forwards at all the levels.
1202
1203	=item entry
1204
1205	-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1206
1207	If the same character (or a sequence of characters) exists
1208	in the collation element table through C<table>,
1209	mapping to collation elements is overridden.
1210	If it does not exist, the mapping is defined additionally.
1211
1212	entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1213	0063 0068 ; [.0E6A.0020.0002.0063] # ch
1214	0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1215	0043 0048 ; [.0E6A.0020.0008.0043] # CH
1216	006C 006C ; [.0F4C.0020.0002.006C] # ll
1217	004C 006C ; [.0F4C.0020.0007.004C] # Ll
1218	004C 004C ; [.0F4C.0020.0008.004C] # LL
1219	00F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1220	006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1221	00D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1222	004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1223	ENTRY
1224
1225	entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1226	00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1227	00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1228	ENTRY
1229
1230	B<NOTE:> The code point in the UCA file format (before C<';'>)
1231	B<must> be a Unicode code point (defined as hexadecimal),
1232	but not a native code point.
1233	So C<0063> must always denote C<U+0063>,
1234	but not a character of C<"\x63">.
1235
1236	Weighting may vary depending on collation element table.
1237	So ensure the weights defined in C<entry> will be consistent with
1238	those in the collation element table loaded via C<table>.
1239
1240	In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1241	and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1242	(as a value between C<0E60> and C<0E6D>)
1243	makes ordering as C<C E<lt> CH E<lt> D>.
1244	Exactly speaking DUCET already has some characters between C<C> and C<D>:
1245	C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1246	C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1247	and C<c-curl> (C<U+0255>) with C<0E69>.
1248	Then primary weight C<0E6A> for C<CH> makes C<CH>
1249	ordered between C<c-curl> and C<D>.
1250
1251	=item hangul_terminator
1252
1253	-- see 7.1.4 Trailing Weights, UTS #10.
1254
1255	If a true value is given (non-zero but should be positive),
1256	it will be added as a terminator primary weight to the end of
1257	every standard Hangul syllable. Secondary and any higher weights
1258	for terminator are set to zero.
1259	If the value is false or C<hangul_terminator> key does not exist,
1260	insertion of terminator weights will not be performed.
1261
1262	Boundaries of Hangul syllables are determined
1263	according to conjoining Jamo behavior in F<the Unicode Standard>
1264	and F<HangulSyllableType.txt>.
1265
1266	B<Implementation Note:>
1267	(1) For expansion mapping (Unicode character mapped
1268	to a sequence of collation elements), a terminator will not be added
1269	between collation elements, even if Hangul syllable boundary exists there.
1270	Addition of terminator is restricted to the next position
1271	to the last collation element.
1272
1273	(2) Non-conjoining Hangul letters
1274	(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1275	automatically terminated with a terminator primary weight.
1276	These characters may need terminator included in a collation element
1277	table beforehand.
1278
1279	=item ignoreChar
1280
1281	=item ignoreName
1282
1283	-- see 3.2.2 Variable Weighting, UTS #10.
1284
1285	Makes the entry in the table completely ignorable;
1286	i.e. as if the weights were zero at all levels.
1287
1288	Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1289	will be ignored. Through C<ignoreName>, any character whose name
1290	(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1291	will be ignored.
1292
1293	E.g. when 'a' and 'e' are ignorable,
1294	'element' is equal to 'lament' (or 'lmnt').
1295
1296	=item katakana_before_hiragana
1297
1298	-- see 7.3.1 Tertiary Weight Table, UTS #10.
1299
1300	By default, hiragana is before katakana.
1301	If the parameter is made true, this is reversed.
1302
1303	B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1304	distinctions must occur in level 3, and their weights at level 3 must be
1305	same as those mentioned in 7.3.1, UTS #10.
1306	If you define your collation elements which violate this requirement,
1307	this parameter does not work validly.
1308
1309	=item level
1310
1311	-- see 4.3 Form Sort Key, UTS #10.
1312
1313	Set the maximum level.
1314	Any higher levels than the specified one are ignored.
1315
1316	Level 1: alphabetic ordering
1317	Level 2: diacritic ordering
1318	Level 3: case ordering
1319	Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1320
1321	ex.level => 2,
1322
1323	If omitted, the maximum is the 4th.
1324
1325	=item normalization
1326
1327	-- see 4.1 Normalize, UTS #10.
1328
1329	If specified, strings are normalized before preparation of sort keys
1330	(the normalization is executed after preprocess).
1331
1332	A form name C<Unicode::Normalize::normalize()> accepts will be applied
1333	as C<$normalization_form>.
1334	Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1335	See C<Unicode::Normalize::normalize()> for detail.
1336	If omitted, C<'NFD'> is used.
1337
1338	C<normalization> is performed after C<preprocess> (if defined).
1339
1340	Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1341	though they are not concerned with C<Unicode::Normalize::normalize()>.
1342
1343	If C<undef> (not a string C<"undef">) is passed explicitly
1344	as the value for this key,
1345	no normalization is carried out (this may make tailoring easier
1346	if normalization is not desired). Under C<(normalization =E<gt> undef)>,
1347	only contiguous contractions are resolved;
1348	e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1349	C<A-cedilla-ring> would be primary equal to C<A>.
1350	In this point,
1351	C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1352	B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1353
1354	In the case of C<(normalization =E<gt> "prenormalized")>,
1355	no normalization is performed, but
1356	non-contiguous contractions with combining characters are performed.
1357	Therefore
1358	C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1359	B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1360	If source strings are finely prenormalized,
1361	C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1362
1363	Except C<(normalization =E<gt> undef)>,
1364	B<Unicode::Normalize> is required (see also B<CAVEAT>).
1365
1366	=item overrideCJK
1367
1368	-- see 7.1 Derived Collation Elements, UTS #10.
1369
1370	By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1371	but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1372	C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1373	are less than C<CJK Unified Ideographs Extension> (its range is
1374	C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
1375
1376	Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overridden.
1377
1378	ex. CJK Unified Ideographs in the JIS code point order.
1379
1380	overrideCJK => sub {
1381	my $u = shift; # get a Unicode codepoint
1382	my $b = pack('n', $u); # to UTF-16BE
1383	my $s = your_unicode_to_sjis_converter($b); # convert
1384	my $n = unpack('n', $s); # convert sjis to short
1385	[ $n, 0x20, 0x2, $u ]; # return the collation element
1386	},
1387
1388	ex. ignores all CJK Unified Ideographs.
1389
1390	overrideCJK => sub {()}, # CODEREF returning empty list
1391
1392	# where ->eq("Pe\x{4E00}rl", "Perl") is true
1393	# as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1394
1395	If C<undef> is passed explicitly as the value for this key,
1396	weights for CJK Unified Ideographs are treated as undefined.
1397	But assignment of weight for CJK Unified Ideographs
1398	in table or C<entry> is still valid.
1399
1400	=item overrideHangul
1401
1402	-- see 7.1 Derived Collation Elements, UTS #10.
1403
1404	By default, Hangul Syllables are decomposed into Hangul Jamo,
1405	even if C<(normalization =E<gt> undef)>.
1406	But the mapping of Hangul Syllables may be overridden.
1407
1408	This parameter works like C<overrideCJK>, so see there for examples.
1409
1410	If you want to override the mapping of Hangul Syllables,
1411	NFD, NFKD, and FCD are not appropriate,
1412	since they will decompose Hangul Syllables before overriding.
1413
1414	If C<undef> is passed explicitly as the value for this key,
1415	weight for Hangul Syllables is treated as undefined
1416	without decomposition into Hangul Jamo.
1417	But definition of weight for Hangul Syllables
1418	in table or C<entry> is still valid.
1419
1420	=item preprocess
1421
1422	-- see 5.1 Preprocessing, UTS #10.
1423
1424	If specified, the coderef is used to preprocess
1425	before the formation of sort keys.
1426
1427	ex. dropping English articles, such as "a" or "the".
1428	Then, "the pen" is before "a pencil".
1429
1430	preprocess => sub {
1431	my $str = shift;
1432	$str =~ s/\b(?:an?|the)\s+//gi;
1433	return $str;
1434	},
1435
1436	C<preprocess> is performed before C<normalization> (if defined).
1437
1438	=item rearrange
1439
1440	-- see 3.1.3 Rearrangement, UTS #10.
1441
1442	Characters that are not coded in logical order and to be rearranged.
1443	If C<UCA_Version> is less than or equal to 11, default is:
1444
1445	rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1446
1447	If you want to disallow any rearrangement, pass C<undef> or C<[]>
1448	(a reference to empty list) as the value for this key.
1449
1450	If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
1451
1452	B<According to the version 9 of UCA, this parameter shall not be used;
1453	but it is not warned at present.>
1454
1455	=item table
1456
1457	-- see 3.2 Default Unicode Collation Element Table, UTS #10.
1458
1459	You can use another collation element table if desired.
1460
1461	The table file should be located in the F<Unicode/Collate> directory
1462	on C<@INC>. Say, if the filename is F<Foo.txt>,
1463	the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1464
1465	By default, F<allkeys.txt> (as the filename of DUCET) is used.
1466	If you will prepare your own table file, any name other than F<allkeys.txt>
1467	may be better to avoid namespace conflict.
1468
1469	If C<undef> is passed explicitly as the value for this key,
1470	no file is read (but you can define collation elements via C<entry>).
1471
1472	A typical way to define a collation element table
1473	without any file of table:
1474
1475	$onlyABC = Unicode::Collate->new(
1476	table => undef,
1477	entry => << 'ENTRIES',
1478	0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1479	0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1480	0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1481	0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1482	0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1483	0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1484	ENTRIES
1485	);
1486
1487	If C<ignoreName> or C<undefName> is used, character names should be
1488	specified as a comment (following C<#>) on each line.
1489
1490	=item undefChar
1491
1492	=item undefName
1493
1494	-- see 6.3.4 Reducing the Repertoire, UTS #10.
1495
1496	Undefines the collation element as if it were unassigned in the table.
1497	This reduces the size of the table.
1498	If an unassigned character appears in the string to be collated,
1499	the sort key is made from its codepoint
1500	as a single-character collation element,
1501	as it is greater than any other assigned collation elements
1502	(in the codepoint order among the unassigned characters).
1503	But, it'd be better to ignore characters
1504	unfamiliar to you and maybe never used.
1505
1506	Through C<undefChar>, any character matching C<qr/$undefChar/>
1507	will be undefined. Through C<undefName>, any character whose name
1508	(given in the C<table> file as a comment) matches C<qr/$undefName/>
1509	will be undefined.
1510
1511	ex. Collation weights for beyond-BMP characters are not stored in object:
1512
1513	undefChar => qr/[^\0-\x{fffd}]/,
1514
1515	=item upper_before_lower
1516
1517	-- see 6.6 Case Comparisons, UTS #10.
1518
1519	By default, lowercase is before uppercase.
1520	If the parameter is made true, this is reversed.
1521
1522	B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1523	distinctions must occur in level 3, and their weights at level 3 must be
1524	same as those mentioned in 7.3.1, UTS #10.
1525	If you define your collation elements which differ from this requirement,
1526	this parameter doesn't work validly.
1527
1528	=item variable
1529
1530	-- see 3.2.2 Variable Weighting, UTS #10.
1531
1532	This key allows variable weighting for variable collation elements,
1533	which are marked with an ASTERISK in the table
1534	(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1535
1536	variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1537
1538	These names are case-insensitive.
1539	By default (if specification is omitted), 'shifted' is adopted.
1540
1541	'Blanked' Variable elements are made ignorable at levels 1 through 3;
1542	considered at the 4th level.
1543
1544	'Non-Ignorable' Variable elements are not reset to ignorable.
1545
1546	'Shifted' Variable elements are made ignorable at levels 1 through 3;
1547	their level 4 weight is replaced by the old level 1 weight.
1548	Level 4 weight for Non-Variable elements is 0xFFFF.
1549
1550	'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1551	are trimmed.
1552
1553	=back
1554
1555	=head2 Methods for Collation
1556
1557	=over 4
1558
1559	=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1560
1561	Sorts a list of strings.
1562
1563	=item C<$result = $Collator-E<gt>cmp($a, $b)>
1564
1565	Returns 1 (when C<$a> is greater than C<$b>)
1566	or 0 (when C<$a> is equal to C<$b>)
1567	or -1 (when C<$a> is less than C<$b>).
1568
1569	=item C<$result = $Collator-E<gt>eq($a, $b)>
1570
1571	=item C<$result = $Collator-E<gt>ne($a, $b)>
1572
1573	=item C<$result = $Collator-E<gt>lt($a, $b)>
1574
1575	=item C<$result = $Collator-E<gt>le($a, $b)>
1576
1577	=item C<$result = $Collator-E<gt>gt($a, $b)>
1578
1579	=item C<$result = $Collator-E<gt>ge($a, $b)>
1580
1581	They work like the operators of the same name.
1582
1583	eq : whether $a is equal to $b.
1584	ne : whether $a is not equal to $b.
1585	lt : whether $a is less than $b.
1586	le : whether $a is less than $b or equal to $b.
1587	gt : whether $a is greater than $b.
1588	ge : whether $a is greater than $b or equal to $b.
1589
1590	=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1591
1592	-- see 4.3 Form Sort Key, UTS #10.
1593
1594	Returns a sort key.
1595
1596	You compare the sort keys using a binary comparison
1597	and get the result of the comparison of the strings using UCA.
1598
1599	$Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1600
1601	is equivalent to
1602
1603	$Collator->cmp($a, $b)
1604
1605	=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1606
1607	Converts a sorting key into its representation form.
1608	If C<UCA_Version> is 8, the output is slightly different.
1609
1610	use Unicode::Collate;
1611	my $c = Unicode::Collate->new();
1612	print $c->viewSortKey("Perl"),"\n";
1613
1614	# output:
1615	# [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1616	# Level 1 Level 2 Level 3 Level 4
1617
1618	=back
1619
1620	=head2 Methods for Searching
1621
1622	B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1623	for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1624	C<subst>, C<gsubst>) croaks,
1625	as the position and the length might differ
1626	from those on the specified string.
1627	(And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1628
1629	The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1630	like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1631	but they are not aware of any pattern, but only a literal substring.
1632
1633	=over 4
1634
1635	=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1636
1637	=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1638
1639	If C<$substring> matches a part of C<$string>, returns
1640	the position of the first occurrence of the matching part in scalar context;
1641	in list context, returns a two-element list of
1642	the position and the length of the matching part.
1643
1644	If C<$substring> does not match any part of C<$string>,
1645	returns C<-1> in scalar context and
1646	an empty list in list context.
1647
1648	e.g. you say
1649
1650	my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1651	# (normalization => undef) is REQUIRED.
1652	my $str = "Ich muß studieren Perl.";
1653	my $sub = "MÜSS";
1654	my $match;
1655	if (my($pos,$len) = $Collator->index($str, $sub)) {
1656	$match = substr($str, $pos, $len);
1657	}
1658
1659	and get C<"muß"> in C<$match> since C<"muß">
1660	is primary equal to C<"MÜSS">.
1661
1662	=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1663
1664	=item C<($match) = $Collator-E<gt>match($string, $substring)>
1665
1666	If C<$substring> matches a part of C<$string>, in scalar context, returns
1667	B<a reference to> the first occurrence of the matching part
1668	(C<$match_ref> is always true if matches,
1669	since every reference is B<true>);
1670	in list context, returns the first occurrence of the matching part.
1671
1672	If C<$substring> does not match any part of C<$string>,
1673	returns C<undef> in scalar context and
1674	an empty list in list context.
1675
1676	e.g.
1677
1678	if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1679	print "matches [$$match_ref].\n";
1680	} else {
1681	print "doesn't match.\n";
1682	}
1683
1684	or
1685
1686	if (($match) = $Collator->match($str, $sub)) { # list context
1687	print "matches [$match].\n";
1688	} else {
1689	print "doesn't match.\n";
1690	}
1691
1692	=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1693
1694	If C<$substring> matches a part of C<$string>, returns
1695	all the matching parts (or matching count in scalar context).
1696
1697	If C<$substring> does not match any part of C<$string>,
1698	returns an empty list.
1699
1700	=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1701
1702	If C<$substring> matches a part of C<$string>,
1703	the first occurrence of the matching part is replaced by C<$replacement>
1704	(C<$string> is modified) and C<$count> (always equal to C<1>) is returned.
1705
1706	C<$replacement> can be a C<CODEREF>,
1707	taking the matching part as an argument,
1708	and returning a string to replace the matching part
1709	(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1710
1711	=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1712
1713	If C<$substring> matches a part of C<$string>,
1714	all the occurrences of the matching part are replaced by C<$replacement>
1715	(C<$string> is modified) and C<$count> is returned.
1716
1717	C<$replacement> can be a C<CODEREF>,
1718	taking the matching part as an argument,
1719	and returning a string to replace the matching part
1720	(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1721
1722	e.g.
1723
1724	my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1725	# (normalization => undef) is REQUIRED.
1726	my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1727	$Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1728
1729	# now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1730	# i.e., all the camels are made bold-faced.
1731
1732	=back
1733
1734	=head2 Other Methods
1735
1736	=over 4
1737
1738	=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1739
1740	Changes the values of the specified keys and returns the changed part.
1741
1742	$Collator = Unicode::Collate->new(level => 4);
1743
1744	$Collator->eq("perl", "PERL"); # false
1745
1746	%old = $Collator->change(level => 2); # returns (level => 4).
1747
1748	$Collator->eq("perl", "PERL"); # true
1749
1750	$Collator->change(%old); # returns (level => 2).
1751
1752	$Collator->eq("perl", "PERL"); # false
1753
1754	Not all C<(key,value)>s are allowed to be changed.
1755	See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1756
1757	In the scalar context, returns the modified collator
1758	(but it is B<not> a clone from the original).
1759
1760	$Collator->change(level => 2)->eq("perl", "PERL"); # true
1761
1762	$Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1763
1764	$Collator->change(level => 4)->eq("perl", "PERL"); # false
1765
1766	=item C<$version = $Collator-E<gt>version()>
1767
1768	Returns the version number (a string) of the Unicode Standard
1769	which the C<table> file used by the collator object is based on.
1770	If the table does not include a version line (starting with C<@version>),
1771	returns C<"unknown">.
1772
1773	=item C<UCA_Version()>
1774
1775	Returns the tracking version number of UTS #10 this module consults.
1776
1777	=item C<Base_Unicode_Version()>
1778
1779	Returns the version number of UTS #10 this module consults.
1780
1781	=back
1782
1783	=head1 EXPORT
1784
1785	No method will be exported.
1786
1787	=head1 INSTALL
1788
1789	Though this module can be used without any C<table> file,
1790	to use this module easily, it is recommended to install a table file
1791	in the UCA format, by copying it under the directory
1792	<a place in @INC>/Unicode/Collate.
1793
1794	The most preferable one is "The Default Unicode Collation Element Table"
1795	(aka DUCET), available from the Unicode Consortium's website:
1796
1797	http://www.unicode.org/Public/UCA/
1798
1799	http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1800
1801	If DUCET is not installed, it is recommended to copy the file
1802	from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1803	to <a place in @INC>/Unicode/Collate/allkeys.txt
1804	manually.
1805
1806	=head1 CAVEATS
1807
1808	=over 4
1809
1810	=item Normalization
1811
1812	Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1813	module (see L<Unicode::Normalize>).
1814
1815	If you do not need it (say, in the case when you need not
1816	handle any combining characters),
1817	assign C<normalization =E<gt> undef> explicitly.
1818
1819	-- see 6.5 Avoiding Normalization, UTS #10.
1820
1821	=item Conformance Test
1822
1823	The Conformance Test for the UCA is available
1824	under L<http://www.unicode.org/Public/UCA/>.
1825
1826	For F<CollationTest_SHIFTED.txt>,
1827	a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1828	for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1829	C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1830
1831	B<Unicode::Normalize is required to try The Conformance Test.>
1832
1833	=back
1834
1835	=head1 AUTHOR, COPYRIGHT AND LICENSE
1836
1837	The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1838	<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2005,
1839	SADAHIRO Tomoyuki. Japan. All rights reserved.
1840
1841	This module is free software; you can redistribute it and/or
1842	modify it under the same terms as Perl itself.
1843
1844	The file Unicode/Collate/allkeys.txt was copied directly
1845	from L<http://www.unicode.org/Public/UCA/4.1.0/allkeys.txt>.
1846	This file is Copyright (c) 1991-2005 Unicode, Inc. All rights reserved.
1847	Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
1848
1849	=head1 SEE ALSO
1850
1851	=over 4
1852
1853	=item Unicode Collation Algorithm - UTS #10
1854
1855	L<http://www.unicode.org/reports/tr10/>
1856
1857	=item The Default Unicode Collation Element Table (DUCET)
1858
1859	L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1860
1861	=item The conformance test for the UCA
1862
1863	L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1864
1865	L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1866
1867	=item Hangul Syllable Type
1868
1869	L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1870
1871	=item Unicode Normalization Forms - UAX #15
1872
1873	L<http://www.unicode.org/reports/tr15/>
1874
1875	=back
1876
1877	=cut

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/perl/5.8.8/lib/Unicode/Collate.pm

Download in other formats: