source: trunk/essentials/dev-lang/perl/utf8.c@ 3697

Last change on this file since 3697 was 3181, checked in by bird, 19 years ago

perl 5.8.8

File size: 53.0 KB
Line 
1/* utf8.c
2 *
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 * by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
25#define PERL_IN_UTF8_C
26#include "perl.h"
27
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
30
31/*
32=head1 Unicode Support
33
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
36Unicode characters as a variable number of bytes, in such a way that
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
39
40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
41
42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
44bytes available. The return value is the pointer to the byte after the
45end of the new character. In other words,
46
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
51 d = uvuni_to_utf8(d, uv);
52
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
64U8 *
65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
66{
67 if (ckWARN(WARN_UTF8)) {
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
78 * FFFEs and FFFFs beyond 0x10FFFF. */
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
85 if (UNI_IS_INVARIANT(uv)) {
86 *d++ = (U8)UTF_TO_NATIVE(uv);
87 return d;
88 }
89#if defined(EBCDIC)
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
98 return d+len;
99 }
100#else /* Non loop style */
101 if (uv < 0x800) {
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
104 return d;
105 }
106 if (uv < 0x10000) {
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
110 return d;
111 }
112 if (uv < 0x200000) {
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
117 return d;
118 }
119 if (uv < 0x4000000) {
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
125 return d;
126 }
127 if (uv < 0x80000000) {
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
134 return d;
135 }
136#ifdef HAS_QUAD
137 if (uv < UTF8_QUAD_MAX)
138#endif
139 {
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
147 return d;
148 }
149#ifdef HAS_QUAD
150 {
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
164 return d;
165 }
166#endif
167#endif /* Loop style */
168}
169
170U8 *
171Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172{
173 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174}
175
176/*
177
178Tests if some arbitrary number of bytes begins in a valid UTF-8
179character. Note that an INVARIANT (i.e. ASCII) character is a valid
180UTF-8 character. The actual number of bytes in the UTF-8 character
181will be returned if it is valid, otherwise 0.
182
183This is the "slow" version as opposed to the "fast" version which is
184the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
185difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
186or less you should use the IS_UTF8_CHAR(), for lengths of five or more
187you should use the _slow(). In practice this means that the _slow()
188will be used very rarely, since the maximum Unicode code point (as of
189Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
190the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
191five bytes or more.
192
193=cut */
194STATIC STRLEN
195S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
196{
197 U8 u = *s;
198 STRLEN slen;
199 UV uv, ouv;
200
201 if (UTF8_IS_INVARIANT(u))
202 return 1;
203
204 if (!UTF8_IS_START(u))
205 return 0;
206
207 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
208 return 0;
209
210 slen = len - 1;
211 s++;
212#ifdef EBCDIC
213 u = NATIVE_TO_UTF(u);
214#endif
215 u &= UTF_START_MASK(len);
216 uv = u;
217 ouv = uv;
218 while (slen--) {
219 if (!UTF8_IS_CONTINUATION(*s))
220 return 0;
221 uv = UTF8_ACCUMULATE(uv, *s);
222 if (uv < ouv)
223 return 0;
224 ouv = uv;
225 s++;
226 }
227
228 if ((STRLEN)UNISKIP(uv) < len)
229 return 0;
230
231 return len;
232}
233
234/*
235=for apidoc A|STRLEN|is_utf8_char|U8 *s
236
237Tests if some arbitrary number of bytes begins in a valid UTF-8
238character. Note that an INVARIANT (i.e. ASCII) character is a valid
239UTF-8 character. The actual number of bytes in the UTF-8 character
240will be returned if it is valid, otherwise 0.
241
242=cut */
243STRLEN
244Perl_is_utf8_char(pTHX_ U8 *s)
245{
246 const STRLEN len = UTF8SKIP(s);
247#ifdef IS_UTF8_CHAR
248 if (IS_UTF8_CHAR_FAST(len))
249 return IS_UTF8_CHAR(s, len) ? len : 0;
250#endif /* #ifdef IS_UTF8_CHAR */
251 return is_utf8_char_slow(s, len);
252}
253
254/*
255=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
256
257Returns true if first C<len> bytes of the given string form a valid
258UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
259not mean 'a string that contains code points above 0x7F encoded in UTF-8'
260because a valid ASCII string is a valid UTF-8 string.
261
262See also is_utf8_string_loclen() and is_utf8_string_loc().
263
264=cut
265*/
266
267bool
268Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
269{
270 const U8* x = s;
271 const U8* send;
272
273 if (!len)
274 len = strlen((const char *)s);
275 send = s + len;
276
277 while (x < send) {
278 STRLEN c;
279 /* Inline the easy bits of is_utf8_char() here for speed... */
280 if (UTF8_IS_INVARIANT(*x))
281 c = 1;
282 else if (!UTF8_IS_START(*x))
283 goto out;
284 else {
285 /* ... and call is_utf8_char() only if really needed. */
286#ifdef IS_UTF8_CHAR
287 c = UTF8SKIP(x);
288 if (IS_UTF8_CHAR_FAST(c)) {
289 if (!IS_UTF8_CHAR(x, c))
290 goto out;
291 } else if (!is_utf8_char_slow(x, c))
292 goto out;
293#else
294 c = is_utf8_char(x);
295#endif /* #ifdef IS_UTF8_CHAR */
296 if (!c)
297 goto out;
298 }
299 x += c;
300 }
301
302 out:
303 if (x != send)
304 return FALSE;
305
306 return TRUE;
307}
308
309/*
310=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
311
312Like is_utf8_string() but stores the location of the failure (in the
313case of "utf8ness failure") or the location s+len (in the case of
314"utf8ness success") in the C<ep>, and the number of UTF-8
315encoded characters in the C<el>.
316
317See also is_utf8_string_loc() and is_utf8_string().
318
319=cut
320*/
321
322bool
323Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
324{
325 const U8* x = s;
326 const U8* send;
327 STRLEN c;
328
329 if (!len)
330 len = strlen((const char *)s);
331 send = s + len;
332 if (el)
333 *el = 0;
334
335 while (x < send) {
336 /* Inline the easy bits of is_utf8_char() here for speed... */
337 if (UTF8_IS_INVARIANT(*x))
338 c = 1;
339 else if (!UTF8_IS_START(*x))
340 goto out;
341 else {
342 /* ... and call is_utf8_char() only if really needed. */
343#ifdef IS_UTF8_CHAR
344 c = UTF8SKIP(x);
345 if (IS_UTF8_CHAR_FAST(c)) {
346 if (!IS_UTF8_CHAR(x, c))
347 c = 0;
348 } else
349 c = is_utf8_char_slow(x, c);
350#else
351 c = is_utf8_char(x);
352#endif /* #ifdef IS_UTF8_CHAR */
353 if (!c)
354 goto out;
355 }
356 x += c;
357 if (el)
358 (*el)++;
359 }
360
361 out:
362 if (ep)
363 *ep = x;
364 if (x != send)
365 return FALSE;
366
367 return TRUE;
368}
369
370/*
371=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
372
373Like is_utf8_string() but stores the location of the failure (in the
374case of "utf8ness failure") or the location s+len (in the case of
375"utf8ness success") in the C<ep>.
376
377See also is_utf8_string_loclen() and is_utf8_string().
378
379=cut
380*/
381
382bool
383Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **ep)
384{
385 return is_utf8_string_loclen(s, len, (const U8 **)ep, 0);
386}
387
388/*
389=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
390
391Bottom level UTF-8 decode routine.
392Returns the unicode code point value of the first character in the string C<s>
393which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
394C<retlen> will be set to the length, in bytes, of that character.
395
396If C<s> does not point to a well-formed UTF-8 character, the behaviour
397is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
398it is assumed that the caller will raise a warning, and this function
399will silently just set C<retlen> to C<-1> and return zero. If the
400C<flags> does not contain UTF8_CHECK_ONLY, warnings about
401malformations will be given, C<retlen> will be set to the expected
402length of the UTF-8 character in bytes, and zero will be returned.
403
404The C<flags> can also contain various flags to allow deviations from
405the strict UTF-8 encoding (see F<utf8.h>).
406
407Most code should use utf8_to_uvchr() rather than call this directly.
408
409=cut
410*/
411
412UV
413Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
414{
415 const U8 *s0 = s;
416 UV uv = *s, ouv = 0;
417 STRLEN len = 1;
418 const bool dowarn = ckWARN_d(WARN_UTF8);
419 const UV startbyte = *s;
420 STRLEN expectlen = 0;
421 U32 warning = 0;
422
423/* This list is a superset of the UTF8_ALLOW_XXX. */
424
425#define UTF8_WARN_EMPTY 1
426#define UTF8_WARN_CONTINUATION 2
427#define UTF8_WARN_NON_CONTINUATION 3
428#define UTF8_WARN_FE_FF 4
429#define UTF8_WARN_SHORT 5
430#define UTF8_WARN_OVERFLOW 6
431#define UTF8_WARN_SURROGATE 7
432#define UTF8_WARN_LONG 8
433#define UTF8_WARN_FFFF 9 /* Also FFFE. */
434
435 if (curlen == 0 &&
436 !(flags & UTF8_ALLOW_EMPTY)) {
437 warning = UTF8_WARN_EMPTY;
438 goto malformed;
439 }
440
441 if (UTF8_IS_INVARIANT(uv)) {
442 if (retlen)
443 *retlen = 1;
444 return (UV) (NATIVE_TO_UTF(*s));
445 }
446
447 if (UTF8_IS_CONTINUATION(uv) &&
448 !(flags & UTF8_ALLOW_CONTINUATION)) {
449 warning = UTF8_WARN_CONTINUATION;
450 goto malformed;
451 }
452
453 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
454 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
455 warning = UTF8_WARN_NON_CONTINUATION;
456 goto malformed;
457 }
458
459#ifdef EBCDIC
460 uv = NATIVE_TO_UTF(uv);
461#else
462 if ((uv == 0xfe || uv == 0xff) &&
463 !(flags & UTF8_ALLOW_FE_FF)) {
464 warning = UTF8_WARN_FE_FF;
465 goto malformed;
466 }
467#endif
468
469 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
470 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
471 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
472 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
473#ifdef EBCDIC
474 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
475 else { len = 7; uv &= 0x01; }
476#else
477 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
478 else if (!(uv & 0x01)) { len = 7; uv = 0; }
479 else { len = 13; uv = 0; } /* whoa! */
480#endif
481
482 if (retlen)
483 *retlen = len;
484
485 expectlen = len;
486
487 if ((curlen < expectlen) &&
488 !(flags & UTF8_ALLOW_SHORT)) {
489 warning = UTF8_WARN_SHORT;
490 goto malformed;
491 }
492
493 len--;
494 s++;
495 ouv = uv;
496
497 while (len--) {
498 if (!UTF8_IS_CONTINUATION(*s) &&
499 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
500 s--;
501 warning = UTF8_WARN_NON_CONTINUATION;
502 goto malformed;
503 }
504 else
505 uv = UTF8_ACCUMULATE(uv, *s);
506 if (!(uv > ouv)) {
507 /* These cannot be allowed. */
508 if (uv == ouv) {
509 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
510 warning = UTF8_WARN_LONG;
511 goto malformed;
512 }
513 }
514 else { /* uv < ouv */
515 /* This cannot be allowed. */
516 warning = UTF8_WARN_OVERFLOW;
517 goto malformed;
518 }
519 }
520 s++;
521 ouv = uv;
522 }
523
524 if (UNICODE_IS_SURROGATE(uv) &&
525 !(flags & UTF8_ALLOW_SURROGATE)) {
526 warning = UTF8_WARN_SURROGATE;
527 goto malformed;
528 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
529 !(flags & UTF8_ALLOW_LONG)) {
530 warning = UTF8_WARN_LONG;
531 goto malformed;
532 } else if (UNICODE_IS_ILLEGAL(uv) &&
533 !(flags & UTF8_ALLOW_FFFF)) {
534 warning = UTF8_WARN_FFFF;
535 goto malformed;
536 }
537
538 return uv;
539
540malformed:
541
542 if (flags & UTF8_CHECK_ONLY) {
543 if (retlen)
544 *retlen = -1;
545 return 0;
546 }
547
548 if (dowarn) {
549 SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
550
551 switch (warning) {
552 case 0: /* Intentionally empty. */ break;
553 case UTF8_WARN_EMPTY:
554 Perl_sv_catpv(aTHX_ sv, "(empty string)");
555 break;
556 case UTF8_WARN_CONTINUATION:
557 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
558 break;
559 case UTF8_WARN_NON_CONTINUATION:
560 if (s == s0)
561 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
562 (UV)s[1], startbyte);
563 else {
564 const int len = (int)(s-s0);
565 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
566 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
567 }
568
569 break;
570 case UTF8_WARN_FE_FF:
571 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
572 break;
573 case UTF8_WARN_SHORT:
574 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
575 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
576 expectlen = curlen; /* distance for caller to skip */
577 break;
578 case UTF8_WARN_OVERFLOW:
579 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
580 ouv, *s, startbyte);
581 break;
582 case UTF8_WARN_SURROGATE:
583 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
584 break;
585 case UTF8_WARN_LONG:
586 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
587 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
588 break;
589 case UTF8_WARN_FFFF:
590 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
591 break;
592 default:
593 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
594 break;
595 }
596
597 if (warning) {
598 const char * const s = SvPVX_const(sv);
599
600 if (PL_op)
601 Perl_warner(aTHX_ packWARN(WARN_UTF8),
602 "%s in %s", s, OP_DESC(PL_op));
603 else
604 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
605 }
606 }
607
608 if (retlen)
609 *retlen = expectlen ? expectlen : len;
610
611 return 0;
612}
613
614/*
615=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
616
617Returns the native character value of the first character in the string C<s>
618which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
619length, in bytes, of that character.
620
621If C<s> does not point to a well-formed UTF-8 character, zero is
622returned and retlen is set, if possible, to -1.
623
624=cut
625*/
626
627UV
628Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
629{
630 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
631 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
632}
633
634/*
635=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
636
637Returns the Unicode code point of the first character in the string C<s>
638which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
639length, in bytes, of that character.
640
641This function should only be used when returned UV is considered
642an index into the Unicode semantic tables (e.g. swashes).
643
644If C<s> does not point to a well-formed UTF-8 character, zero is
645returned and retlen is set, if possible, to -1.
646
647=cut
648*/
649
650UV
651Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
652{
653 /* Call the low level routine asking for checks */
654 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
655 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
656}
657
658/*
659=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
660
661Return the length of the UTF-8 char encoded string C<s> in characters.
662Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
663up past C<e>, croaks.
664
665=cut
666*/
667
668STRLEN
669Perl_utf8_length(pTHX_ U8 *s, U8 *e)
670{
671 STRLEN len = 0;
672
673 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
674 * the bitops (especially ~) can create illegal UTF-8.
675 * In other words: in Perl UTF-8 is not just for Unicode. */
676
677 if (e < s)
678 goto warn_and_return;
679 while (s < e) {
680 const U8 t = UTF8SKIP(s);
681 if (e - s < t) {
682 warn_and_return:
683 if (ckWARN_d(WARN_UTF8)) {
684 if (PL_op)
685 Perl_warner(aTHX_ packWARN(WARN_UTF8),
686 "%s in %s", unees, OP_DESC(PL_op));
687 else
688 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
689 }
690 return len;
691 }
692 s += t;
693 len++;
694 }
695
696 return len;
697}
698
699/*
700=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
701
702Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
703and C<b>.
704
705WARNING: use only if you *know* that the pointers point inside the
706same UTF-8 buffer.
707
708=cut
709*/
710
711IV
712Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
713{
714 IV off = 0;
715
716 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
717 * the bitops (especially ~) can create illegal UTF-8.
718 * In other words: in Perl UTF-8 is not just for Unicode. */
719
720 if (a < b) {
721 while (a < b) {
722 const U8 c = UTF8SKIP(a);
723 if (b - a < c)
724 goto warn_and_return;
725 a += c;
726 off--;
727 }
728 }
729 else {
730 while (b < a) {
731 const U8 c = UTF8SKIP(b);
732
733 if (a - b < c) {
734 warn_and_return:
735 if (ckWARN_d(WARN_UTF8)) {
736 if (PL_op)
737 Perl_warner(aTHX_ packWARN(WARN_UTF8),
738 "%s in %s", unees, OP_DESC(PL_op));
739 else
740 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
741 }
742 return off;
743 }
744 b += c;
745 off++;
746 }
747 }
748
749 return off;
750}
751
752/*
753=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
754
755Return the UTF-8 pointer C<s> displaced by C<off> characters, either
756forward or backward.
757
758WARNING: do not use the following unless you *know* C<off> is within
759the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
760on the first byte of character or just after the last byte of a character.
761
762=cut
763*/
764
765U8 *
766Perl_utf8_hop(pTHX_ U8 *s, I32 off)
767{
768 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
769 * the bitops (especially ~) can create illegal UTF-8.
770 * In other words: in Perl UTF-8 is not just for Unicode. */
771
772 if (off >= 0) {
773 while (off--)
774 s += UTF8SKIP(s);
775 }
776 else {
777 while (off++) {
778 s--;
779 while (UTF8_IS_CONTINUATION(*s))
780 s--;
781 }
782 }
783 return (U8 *)s;
784}
785
786/*
787=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
788
789Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
790Unlike C<bytes_to_utf8>, this over-writes the original string, and
791updates len to contain the new length.
792Returns zero on failure, setting C<len> to -1.
793
794=cut
795*/
796
797U8 *
798Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
799{
800 U8 *send;
801 U8 *d;
802 U8 *save = s;
803
804 /* ensure valid UTF-8 and chars < 256 before updating string */
805 for (send = s + *len; s < send; ) {
806 U8 c = *s++;
807
808 if (!UTF8_IS_INVARIANT(c) &&
809 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
810 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
811 *len = -1;
812 return 0;
813 }
814 }
815
816 d = s = save;
817 while (s < send) {
818 STRLEN ulen;
819 *d++ = (U8)utf8_to_uvchr(s, &ulen);
820 s += ulen;
821 }
822 *d = '\0';
823 *len = d - save;
824 return save;
825}
826
827/*
828=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
829
830Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
831Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
832the newly-created string, and updates C<len> to contain the new
833length. Returns the original string if no conversion occurs, C<len>
834is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8350 if C<s> is converted or contains all 7bit characters.
836
837=cut
838*/
839
840U8 *
841Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
842{
843 U8 *d;
844 const U8 *start = s;
845 const U8 *send;
846 I32 count = 0;
847 const U8 *s2;
848
849 if (!*is_utf8)
850 return (U8 *)start;
851
852 /* ensure valid UTF-8 and chars < 256 before converting string */
853 for (send = s + *len; s < send;) {
854 U8 c = *s++;
855 if (!UTF8_IS_INVARIANT(c)) {
856 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
857 (c = *s++) && UTF8_IS_CONTINUATION(c))
858 count++;
859 else
860 return (U8 *)start;
861 }
862 }
863
864 *is_utf8 = 0;
865
866 Newxz(d, (*len) - count + 1, U8);
867 s2 = start; start = d;
868 while (s2 < send) {
869 U8 c = *s2++;
870 if (!UTF8_IS_INVARIANT(c)) {
871 /* Then it is two-byte encoded */
872 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s2++);
873 c = ASCII_TO_NATIVE(c);
874 }
875 *d++ = c;
876 }
877 *d = '\0';
878 *len = d - start;
879 return (U8 *)start;
880}
881
882/*
883=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
884
885Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
886Returns a pointer to the newly-created string, and sets C<len> to
887reflect the new length.
888
889If you want to convert to UTF-8 from other encodings than ASCII,
890see sv_recode_to_utf8().
891
892=cut
893*/
894
895U8*
896Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
897{
898 const U8 * const send = s + (*len);
899 U8 *d;
900 U8 *dst;
901
902 Newxz(d, (*len) * 2 + 1, U8);
903 dst = d;
904
905 while (s < send) {
906 const UV uv = NATIVE_TO_ASCII(*s++);
907 if (UNI_IS_INVARIANT(uv))
908 *d++ = (U8)UTF_TO_NATIVE(uv);
909 else {
910 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
911 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
912 }
913 }
914 *d = '\0';
915 *len = d-dst;
916 return dst;
917}
918
919/*
920 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
921 *
922 * Destination must be pre-extended to 3/2 source. Do not use in-place.
923 * We optimize for native, for obvious reasons. */
924
925U8*
926Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
927{
928 U8* pend;
929 U8* dstart = d;
930
931 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
932 d[0] = 0;
933 *newlen = 1;
934 return d;
935 }
936
937 if (bytelen & 1)
938 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
939
940 pend = p + bytelen;
941
942 while (p < pend) {
943 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
944 p += 2;
945 if (uv < 0x80) {
946 *d++ = (U8)uv;
947 continue;
948 }
949 if (uv < 0x800) {
950 *d++ = (U8)(( uv >> 6) | 0xc0);
951 *d++ = (U8)(( uv & 0x3f) | 0x80);
952 continue;
953 }
954 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
955 UV low = (p[0] << 8) + p[1];
956 p += 2;
957 if (low < 0xdc00 || low >= 0xdfff)
958 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
959 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
960 }
961 if (uv < 0x10000) {
962 *d++ = (U8)(( uv >> 12) | 0xe0);
963 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
964 *d++ = (U8)(( uv & 0x3f) | 0x80);
965 continue;
966 }
967 else {
968 *d++ = (U8)(( uv >> 18) | 0xf0);
969 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
970 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
971 *d++ = (U8)(( uv & 0x3f) | 0x80);
972 continue;
973 }
974 }
975 *newlen = d - dstart;
976 return d;
977}
978
979/* Note: this one is slightly destructive of the source. */
980
981U8*
982Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
983{
984 U8* s = (U8*)p;
985 U8* send = s + bytelen;
986 while (s < send) {
987 U8 tmp = s[0];
988 s[0] = s[1];
989 s[1] = tmp;
990 s += 2;
991 }
992 return utf16_to_utf8(p, d, bytelen, newlen);
993}
994
995/* for now these are all defined (inefficiently) in terms of the utf8 versions */
996
997bool
998Perl_is_uni_alnum(pTHX_ UV c)
999{
1000 U8 tmpbuf[UTF8_MAXBYTES+1];
1001 uvchr_to_utf8(tmpbuf, c);
1002 return is_utf8_alnum(tmpbuf);
1003}
1004
1005bool
1006Perl_is_uni_alnumc(pTHX_ UV c)
1007{
1008 U8 tmpbuf[UTF8_MAXBYTES+1];
1009 uvchr_to_utf8(tmpbuf, c);
1010 return is_utf8_alnumc(tmpbuf);
1011}
1012
1013bool
1014Perl_is_uni_idfirst(pTHX_ UV c)
1015{
1016 U8 tmpbuf[UTF8_MAXBYTES+1];
1017 uvchr_to_utf8(tmpbuf, c);
1018 return is_utf8_idfirst(tmpbuf);
1019}
1020
1021bool
1022Perl_is_uni_alpha(pTHX_ UV c)
1023{
1024 U8 tmpbuf[UTF8_MAXBYTES+1];
1025 uvchr_to_utf8(tmpbuf, c);
1026 return is_utf8_alpha(tmpbuf);
1027}
1028
1029bool
1030Perl_is_uni_ascii(pTHX_ UV c)
1031{
1032 U8 tmpbuf[UTF8_MAXBYTES+1];
1033 uvchr_to_utf8(tmpbuf, c);
1034 return is_utf8_ascii(tmpbuf);
1035}
1036
1037bool
1038Perl_is_uni_space(pTHX_ UV c)
1039{
1040 U8 tmpbuf[UTF8_MAXBYTES+1];
1041 uvchr_to_utf8(tmpbuf, c);
1042 return is_utf8_space(tmpbuf);
1043}
1044
1045bool
1046Perl_is_uni_digit(pTHX_ UV c)
1047{
1048 U8 tmpbuf[UTF8_MAXBYTES+1];
1049 uvchr_to_utf8(tmpbuf, c);
1050 return is_utf8_digit(tmpbuf);
1051}
1052
1053bool
1054Perl_is_uni_upper(pTHX_ UV c)
1055{
1056 U8 tmpbuf[UTF8_MAXBYTES+1];
1057 uvchr_to_utf8(tmpbuf, c);
1058 return is_utf8_upper(tmpbuf);
1059}
1060
1061bool
1062Perl_is_uni_lower(pTHX_ UV c)
1063{
1064 U8 tmpbuf[UTF8_MAXBYTES+1];
1065 uvchr_to_utf8(tmpbuf, c);
1066 return is_utf8_lower(tmpbuf);
1067}
1068
1069bool
1070Perl_is_uni_cntrl(pTHX_ UV c)
1071{
1072 U8 tmpbuf[UTF8_MAXBYTES+1];
1073 uvchr_to_utf8(tmpbuf, c);
1074 return is_utf8_cntrl(tmpbuf);
1075}
1076
1077bool
1078Perl_is_uni_graph(pTHX_ UV c)
1079{
1080 U8 tmpbuf[UTF8_MAXBYTES+1];
1081 uvchr_to_utf8(tmpbuf, c);
1082 return is_utf8_graph(tmpbuf);
1083}
1084
1085bool
1086Perl_is_uni_print(pTHX_ UV c)
1087{
1088 U8 tmpbuf[UTF8_MAXBYTES+1];
1089 uvchr_to_utf8(tmpbuf, c);
1090 return is_utf8_print(tmpbuf);
1091}
1092
1093bool
1094Perl_is_uni_punct(pTHX_ UV c)
1095{
1096 U8 tmpbuf[UTF8_MAXBYTES+1];
1097 uvchr_to_utf8(tmpbuf, c);
1098 return is_utf8_punct(tmpbuf);
1099}
1100
1101bool
1102Perl_is_uni_xdigit(pTHX_ UV c)
1103{
1104 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1105 uvchr_to_utf8(tmpbuf, c);
1106 return is_utf8_xdigit(tmpbuf);
1107}
1108
1109UV
1110Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1111{
1112 uvchr_to_utf8(p, c);
1113 return to_utf8_upper(p, p, lenp);
1114}
1115
1116UV
1117Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1118{
1119 uvchr_to_utf8(p, c);
1120 return to_utf8_title(p, p, lenp);
1121}
1122
1123UV
1124Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1125{
1126 uvchr_to_utf8(p, c);
1127 return to_utf8_lower(p, p, lenp);
1128}
1129
1130UV
1131Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1132{
1133 uvchr_to_utf8(p, c);
1134 return to_utf8_fold(p, p, lenp);
1135}
1136
1137/* for now these all assume no locale info available for Unicode > 255 */
1138
1139bool
1140Perl_is_uni_alnum_lc(pTHX_ UV c)
1141{
1142 return is_uni_alnum(c); /* XXX no locale support yet */
1143}
1144
1145bool
1146Perl_is_uni_alnumc_lc(pTHX_ UV c)
1147{
1148 return is_uni_alnumc(c); /* XXX no locale support yet */
1149}
1150
1151bool
1152Perl_is_uni_idfirst_lc(pTHX_ UV c)
1153{
1154 return is_uni_idfirst(c); /* XXX no locale support yet */
1155}
1156
1157bool
1158Perl_is_uni_alpha_lc(pTHX_ UV c)
1159{
1160 return is_uni_alpha(c); /* XXX no locale support yet */
1161}
1162
1163bool
1164Perl_is_uni_ascii_lc(pTHX_ UV c)
1165{
1166 return is_uni_ascii(c); /* XXX no locale support yet */
1167}
1168
1169bool
1170Perl_is_uni_space_lc(pTHX_ UV c)
1171{
1172 return is_uni_space(c); /* XXX no locale support yet */
1173}
1174
1175bool
1176Perl_is_uni_digit_lc(pTHX_ UV c)
1177{
1178 return is_uni_digit(c); /* XXX no locale support yet */
1179}
1180
1181bool
1182Perl_is_uni_upper_lc(pTHX_ UV c)
1183{
1184 return is_uni_upper(c); /* XXX no locale support yet */
1185}
1186
1187bool
1188Perl_is_uni_lower_lc(pTHX_ UV c)
1189{
1190 return is_uni_lower(c); /* XXX no locale support yet */
1191}
1192
1193bool
1194Perl_is_uni_cntrl_lc(pTHX_ UV c)
1195{
1196 return is_uni_cntrl(c); /* XXX no locale support yet */
1197}
1198
1199bool
1200Perl_is_uni_graph_lc(pTHX_ UV c)
1201{
1202 return is_uni_graph(c); /* XXX no locale support yet */
1203}
1204
1205bool
1206Perl_is_uni_print_lc(pTHX_ UV c)
1207{
1208 return is_uni_print(c); /* XXX no locale support yet */
1209}
1210
1211bool
1212Perl_is_uni_punct_lc(pTHX_ UV c)
1213{
1214 return is_uni_punct(c); /* XXX no locale support yet */
1215}
1216
1217bool
1218Perl_is_uni_xdigit_lc(pTHX_ UV c)
1219{
1220 return is_uni_xdigit(c); /* XXX no locale support yet */
1221}
1222
1223U32
1224Perl_to_uni_upper_lc(pTHX_ U32 c)
1225{
1226 /* XXX returns only the first character -- do not use XXX */
1227 /* XXX no locale support yet */
1228 STRLEN len;
1229 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1230 return (U32)to_uni_upper(c, tmpbuf, &len);
1231}
1232
1233U32
1234Perl_to_uni_title_lc(pTHX_ U32 c)
1235{
1236 /* XXX returns only the first character XXX -- do not use XXX */
1237 /* XXX no locale support yet */
1238 STRLEN len;
1239 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1240 return (U32)to_uni_title(c, tmpbuf, &len);
1241}
1242
1243U32
1244Perl_to_uni_lower_lc(pTHX_ U32 c)
1245{
1246 /* XXX returns only the first character -- do not use XXX */
1247 /* XXX no locale support yet */
1248 STRLEN len;
1249 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1250 return (U32)to_uni_lower(c, tmpbuf, &len);
1251}
1252
1253bool
1254Perl_is_utf8_alnum(pTHX_ U8 *p)
1255{
1256 if (!is_utf8_char(p))
1257 return FALSE;
1258 if (!PL_utf8_alnum)
1259 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1260 * descendant of isalnum(3), in other words, it doesn't
1261 * contain the '_'. --jhi */
1262 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1263 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1264/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1265#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1266 if (!PL_utf8_alnum)
1267 PL_utf8_alnum = swash_init("utf8", "",
1268 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1269 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1270#endif
1271}
1272
1273bool
1274Perl_is_utf8_alnumc(pTHX_ U8 *p)
1275{
1276 if (!is_utf8_char(p))
1277 return FALSE;
1278 if (!PL_utf8_alnumc)
1279 PL_utf8_alnumc = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1280 return swash_fetch(PL_utf8_alnumc, p, TRUE) != 0;
1281/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1282#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1283 if (!PL_utf8_alnum)
1284 PL_utf8_alnum = swash_init("utf8", "",
1285 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1286 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1287#endif
1288}
1289
1290bool
1291Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1292{
1293 if (*p == '_')
1294 return TRUE;
1295 if (!is_utf8_char(p))
1296 return FALSE;
1297 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1298 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1299 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1300}
1301
1302bool
1303Perl_is_utf8_idcont(pTHX_ U8 *p)
1304{
1305 if (*p == '_')
1306 return TRUE;
1307 if (!is_utf8_char(p))
1308 return FALSE;
1309 if (!PL_utf8_idcont)
1310 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1311 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1312}
1313
1314bool
1315Perl_is_utf8_alpha(pTHX_ U8 *p)
1316{
1317 if (!is_utf8_char(p))
1318 return FALSE;
1319 if (!PL_utf8_alpha)
1320 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1321 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1322}
1323
1324bool
1325Perl_is_utf8_ascii(pTHX_ U8 *p)
1326{
1327 if (!is_utf8_char(p))
1328 return FALSE;
1329 if (!PL_utf8_ascii)
1330 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1331 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1332}
1333
1334bool
1335Perl_is_utf8_space(pTHX_ U8 *p)
1336{
1337 if (!is_utf8_char(p))
1338 return FALSE;
1339 if (!PL_utf8_space)
1340 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1341 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1342}
1343
1344bool
1345Perl_is_utf8_digit(pTHX_ U8 *p)
1346{
1347 if (!is_utf8_char(p))
1348 return FALSE;
1349 if (!PL_utf8_digit)
1350 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1351 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1352}
1353
1354bool
1355Perl_is_utf8_upper(pTHX_ U8 *p)
1356{
1357 if (!is_utf8_char(p))
1358 return FALSE;
1359 if (!PL_utf8_upper)
1360 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1361 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1362}
1363
1364bool
1365Perl_is_utf8_lower(pTHX_ U8 *p)
1366{
1367 if (!is_utf8_char(p))
1368 return FALSE;
1369 if (!PL_utf8_lower)
1370 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1371 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1372}
1373
1374bool
1375Perl_is_utf8_cntrl(pTHX_ U8 *p)
1376{
1377 if (!is_utf8_char(p))
1378 return FALSE;
1379 if (!PL_utf8_cntrl)
1380 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1381 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1382}
1383
1384bool
1385Perl_is_utf8_graph(pTHX_ U8 *p)
1386{
1387 if (!is_utf8_char(p))
1388 return FALSE;
1389 if (!PL_utf8_graph)
1390 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1391 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1392}
1393
1394bool
1395Perl_is_utf8_print(pTHX_ U8 *p)
1396{
1397 if (!is_utf8_char(p))
1398 return FALSE;
1399 if (!PL_utf8_print)
1400 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1401 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1402}
1403
1404bool
1405Perl_is_utf8_punct(pTHX_ U8 *p)
1406{
1407 if (!is_utf8_char(p))
1408 return FALSE;
1409 if (!PL_utf8_punct)
1410 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1411 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1412}
1413
1414bool
1415Perl_is_utf8_xdigit(pTHX_ U8 *p)
1416{
1417 if (!is_utf8_char(p))
1418 return FALSE;
1419 if (!PL_utf8_xdigit)
1420 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1421 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1422}
1423
1424bool
1425Perl_is_utf8_mark(pTHX_ U8 *p)
1426{
1427 if (!is_utf8_char(p))
1428 return FALSE;
1429 if (!PL_utf8_mark)
1430 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1431 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1432}
1433
1434/*
1435=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1436
1437The "p" contains the pointer to the UTF-8 string encoding
1438the character that is being converted.
1439
1440The "ustrp" is a pointer to the character buffer to put the
1441conversion result to. The "lenp" is a pointer to the length
1442of the result.
1443
1444The "swashp" is a pointer to the swash to use.
1445
1446Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1447and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1448but not always, a multicharacter mapping), is tried first.
1449
1450The "special" is a string like "utf8::ToSpecLower", which means the
1451hash %utf8::ToSpecLower. The access to the hash is through
1452Perl_to_utf8_case().
1453
1454The "normal" is a string like "ToLower" which means the swash
1455%utf8::ToLower.
1456
1457=cut */
1458
1459UV
1460Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1461{
1462 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1463 STRLEN len = 0;
1464
1465 const UV uv0 = utf8_to_uvchr(p, NULL);
1466 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1467 * are necessary in EBCDIC, they are redundant no-ops
1468 * in ASCII-ish platforms, and hopefully optimized away. */
1469 const UV uv1 = NATIVE_TO_UNI(uv0);
1470 uvuni_to_utf8(tmpbuf, uv1);
1471
1472 if (!*swashp) /* load on-demand */
1473 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1474
1475 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1476 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1477 /* It might be "special" (sometimes, but not always,
1478 * a multicharacter mapping) */
1479 HV *hv;
1480 SV **svp;
1481
1482 if ((hv = get_hv(special, FALSE)) &&
1483 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1484 (*svp)) {
1485 const char *s;
1486
1487 s = SvPV_const(*svp, len);
1488 if (len == 1)
1489 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1490 else {
1491#ifdef EBCDIC
1492 /* If we have EBCDIC we need to remap the characters
1493 * since any characters in the low 256 are Unicode
1494 * code points, not EBCDIC. */
1495 U8 *t = (U8*)s, *tend = t + len, *d;
1496
1497 d = tmpbuf;
1498 if (SvUTF8(*svp)) {
1499 STRLEN tlen = 0;
1500
1501 while (t < tend) {
1502 UV c = utf8_to_uvchr(t, &tlen);
1503 if (tlen > 0) {
1504 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1505 t += tlen;
1506 }
1507 else
1508 break;
1509 }
1510 }
1511 else {
1512 while (t < tend) {
1513 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1514 t++;
1515 }
1516 }
1517 len = d - tmpbuf;
1518 Copy(tmpbuf, ustrp, len, U8);
1519#else
1520 Copy(s, ustrp, len, U8);
1521#endif
1522 }
1523 }
1524 }
1525
1526 if (!len && *swashp) {
1527 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1528
1529 if (uv2) {
1530 /* It was "normal" (a single character mapping). */
1531 UV uv3 = UNI_TO_NATIVE(uv2);
1532
1533 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1534 }
1535 }
1536
1537 if (!len) /* Neither: just copy. */
1538 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1539
1540 if (lenp)
1541 *lenp = len;
1542
1543 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1544}
1545
1546/*
1547=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1548
1549Convert the UTF-8 encoded character at p to its uppercase version and
1550store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1551that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1552the uppercase version may be longer than the original character.
1553
1554The first character of the uppercased version is returned
1555(but note, as explained above, that there may be more.)
1556
1557=cut */
1558
1559UV
1560Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1561{
1562 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1563 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1564}
1565
1566/*
1567=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1568
1569Convert the UTF-8 encoded character at p to its titlecase version and
1570store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1571that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1572titlecase version may be longer than the original character.
1573
1574The first character of the titlecased version is returned
1575(but note, as explained above, that there may be more.)
1576
1577=cut */
1578
1579UV
1580Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1581{
1582 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1583 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1584}
1585
1586/*
1587=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1588
1589Convert the UTF-8 encoded character at p to its lowercase version and
1590store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1591that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1592lowercase version may be longer than the original character.
1593
1594The first character of the lowercased version is returned
1595(but note, as explained above, that there may be more.)
1596
1597=cut */
1598
1599UV
1600Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1601{
1602 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1603 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1604}
1605
1606/*
1607=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1608
1609Convert the UTF-8 encoded character at p to its foldcase version and
1610store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1611that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1612foldcase version may be longer than the original character (up to
1613three characters).
1614
1615The first character of the foldcased version is returned
1616(but note, as explained above, that there may be more.)
1617
1618=cut */
1619
1620UV
1621Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1622{
1623 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1624 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1625}
1626
1627/* a "swash" is a swatch hash */
1628
1629SV*
1630Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1631{
1632 SV* retval;
1633 SV* const tokenbufsv = sv_newmortal();
1634 dSP;
1635 const size_t pkg_len = strlen(pkg);
1636 const size_t name_len = strlen(name);
1637 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1638 SV* errsv_save;
1639
1640 PUSHSTACKi(PERLSI_MAGIC);
1641 ENTER;
1642 SAVEI32(PL_hints);
1643 PL_hints = 0;
1644 save_re_context();
1645 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1646 ENTER;
1647 errsv_save = newSVsv(ERRSV);
1648 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1649 Nullsv);
1650 if (!SvTRUE(ERRSV))
1651 sv_setsv(ERRSV, errsv_save);
1652 SvREFCNT_dec(errsv_save);
1653 LEAVE;
1654 }
1655 SPAGAIN;
1656 PUSHMARK(SP);
1657 EXTEND(SP,5);
1658 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1659 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1660 PUSHs(listsv);
1661 PUSHs(sv_2mortal(newSViv(minbits)));
1662 PUSHs(sv_2mortal(newSViv(none)));
1663 PUTBACK;
1664 if (IN_PERL_COMPILETIME) {
1665 /* XXX ought to be handled by lex_start */
1666 SAVEI32(PL_in_my);
1667 PL_in_my = 0;
1668 sv_setpv(tokenbufsv, PL_tokenbuf);
1669 }
1670 errsv_save = newSVsv(ERRSV);
1671 if (call_method("SWASHNEW", G_SCALAR))
1672 retval = newSVsv(*PL_stack_sp--);
1673 else
1674 retval = &PL_sv_undef;
1675 if (!SvTRUE(ERRSV))
1676 sv_setsv(ERRSV, errsv_save);
1677 SvREFCNT_dec(errsv_save);
1678 LEAVE;
1679 POPSTACK;
1680 if (IN_PERL_COMPILETIME) {
1681 STRLEN len;
1682 const char* const pv = SvPV_const(tokenbufsv, len);
1683
1684 Copy(pv, PL_tokenbuf, len+1, char);
1685 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1686 }
1687 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1688 if (SvPOK(retval))
1689 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1690 retval);
1691 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1692 }
1693 return retval;
1694}
1695
1696
1697/* This API is wrong for special case conversions since we may need to
1698 * return several Unicode characters for a single Unicode character
1699 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1700 * the lower-level routine, and it is similarly broken for returning
1701 * multiple values. --jhi */
1702UV
1703Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1704{
1705 HV* const hv = (HV*)SvRV(sv);
1706 U32 klen;
1707 U32 off;
1708 STRLEN slen;
1709 STRLEN needents;
1710 const U8 *tmps = NULL;
1711 U32 bit;
1712 SV *retval;
1713 U8 tmputf8[2];
1714 UV c = NATIVE_TO_ASCII(*ptr);
1715
1716 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1717 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1718 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1719 ptr = tmputf8;
1720 }
1721 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1722 * then the "swatch" is a vec() for al the chars which start
1723 * with 0xAA..0xYY
1724 * So the key in the hash (klen) is length of encoded char -1
1725 */
1726 klen = UTF8SKIP(ptr) - 1;
1727 off = ptr[klen];
1728
1729 if (klen == 0)
1730 {
1731 /* If char in invariant then swatch is for all the invariant chars
1732 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1733 */
1734 needents = UTF_CONTINUATION_MARK;
1735 off = NATIVE_TO_UTF(ptr[klen]);
1736 }
1737 else
1738 {
1739 /* If char is encoded then swatch is for the prefix */
1740 needents = (1 << UTF_ACCUMULATION_SHIFT);
1741 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1742 }
1743
1744 /*
1745 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1746 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1747 * it's nothing to sniff at.) Pity we usually come through at least
1748 * two function calls to get here...
1749 *
1750 * NB: this code assumes that swatches are never modified, once generated!
1751 */
1752
1753 if (hv == PL_last_swash_hv &&
1754 klen == PL_last_swash_klen &&
1755 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1756 {
1757 tmps = PL_last_swash_tmps;
1758 slen = PL_last_swash_slen;
1759 }
1760 else {
1761 /* Try our second-level swatch cache, kept in a hash. */
1762 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1763
1764 /* If not cached, generate it via utf8::SWASHGET */
1765 if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1766 dSP;
1767 /* We use utf8n_to_uvuni() as we want an index into
1768 Unicode tables, not a native character number.
1769 */
1770 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1771 ckWARN(WARN_UTF8) ?
1772 0 : UTF8_ALLOW_ANY);
1773 SV *errsv_save;
1774 ENTER;
1775 SAVETMPS;
1776 save_re_context();
1777 PUSHSTACKi(PERLSI_MAGIC);
1778 PUSHMARK(SP);
1779 EXTEND(SP,3);
1780 PUSHs((SV*)sv);
1781 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1782 PUSHs(sv_2mortal(newSViv((klen) ?
1783 (code_point & ~(needents - 1)) : 0)));
1784 PUSHs(sv_2mortal(newSViv(needents)));
1785 PUTBACK;
1786 errsv_save = newSVsv(ERRSV);
1787 if (call_method("SWASHGET", G_SCALAR))
1788 retval = newSVsv(*PL_stack_sp--);
1789 else
1790 retval = &PL_sv_undef;
1791 if (!SvTRUE(ERRSV))
1792 sv_setsv(ERRSV, errsv_save);
1793 SvREFCNT_dec(errsv_save);
1794 POPSTACK;
1795 FREETMPS;
1796 LEAVE;
1797 if (IN_PERL_COMPILETIME)
1798 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1799
1800 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
1801
1802 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1803 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1804 }
1805
1806 PL_last_swash_hv = hv;
1807 PL_last_swash_klen = klen;
1808 /* FIXME change interpvar.h? */
1809 PL_last_swash_tmps = (U8 *) tmps;
1810 PL_last_swash_slen = slen;
1811 if (klen)
1812 Copy(ptr, PL_last_swash_key, klen, U8);
1813 }
1814
1815 switch ((int)((slen << 3) / needents)) {
1816 case 1:
1817 bit = 1 << (off & 7);
1818 off >>= 3;
1819 return (tmps[off] & bit) != 0;
1820 case 8:
1821 return tmps[off];
1822 case 16:
1823 off <<= 1;
1824 return (tmps[off] << 8) + tmps[off + 1] ;
1825 case 32:
1826 off <<= 2;
1827 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1828 }
1829 Perl_croak(aTHX_ "panic: swash_fetch");
1830 return 0;
1831}
1832
1833
1834/*
1835=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1836
1837Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1838of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
1839bytes available. The return value is the pointer to the byte after the
1840end of the new character. In other words,
1841
1842 d = uvchr_to_utf8(d, uv);
1843
1844is the recommended wide native character-aware way of saying
1845
1846 *(d++) = uv;
1847
1848=cut
1849*/
1850
1851/* On ASCII machines this is normally a macro but we want a
1852 real function in case XS code wants it
1853*/
1854#undef Perl_uvchr_to_utf8
1855U8 *
1856Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1857{
1858 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1859}
1860
1861U8 *
1862Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1863{
1864 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1865}
1866
1867/*
1868=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1869
1870Returns the native character value of the first character in the string C<s>
1871which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1872length, in bytes, of that character.
1873
1874Allows length and flags to be passed to low level routine.
1875
1876=cut
1877*/
1878/* On ASCII machines this is normally a macro but we want
1879 a real function in case XS code wants it
1880*/
1881#undef Perl_utf8n_to_uvchr
1882UV
1883Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1884{
1885 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1886 return UNI_TO_NATIVE(uv);
1887}
1888
1889/*
1890=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1891
1892Build to the scalar dsv a displayable version of the string spv,
1893length len, the displayable version being at most pvlim bytes long
1894(if longer, the rest is truncated and "..." will be appended).
1895
1896The flags argument can have UNI_DISPLAY_ISPRINT set to display
1897isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1898to display the \\[nrfta\\] as the backslashed versions (like '\n')
1899(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1900UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1901UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1902
1903The pointer to the PV of the dsv is returned.
1904
1905=cut */
1906char *
1907Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1908{
1909 int truncated = 0;
1910 const char *s, *e;
1911
1912 sv_setpvn(dsv, "", 0);
1913 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1914 UV u;
1915 /* This serves double duty as a flag and a character to print after
1916 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1917 */
1918 char ok = 0;
1919
1920 if (pvlim && SvCUR(dsv) >= pvlim) {
1921 truncated++;
1922 break;
1923 }
1924 u = utf8_to_uvchr((U8*)s, 0);
1925 if (u < 256) {
1926 const unsigned char c = (unsigned char)u & 0xFF;
1927 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1928 switch (c) {
1929 case '\n':
1930 ok = 'n'; break;
1931 case '\r':
1932 ok = 'r'; break;
1933 case '\t':
1934 ok = 't'; break;
1935 case '\f':
1936 ok = 'f'; break;
1937 case '\a':
1938 ok = 'a'; break;
1939 case '\\':
1940 ok = '\\'; break;
1941 default: break;
1942 }
1943 if (ok) {
1944 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1945 }
1946 }
1947 /* isPRINT() is the locale-blind version. */
1948 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1949 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1950 ok = 1;
1951 }
1952 }
1953 if (!ok)
1954 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1955 }
1956 if (truncated)
1957 sv_catpvn(dsv, "...", 3);
1958
1959 return SvPVX(dsv);
1960}
1961
1962/*
1963=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1964
1965Build to the scalar dsv a displayable version of the scalar sv,
1966the displayable version being at most pvlim bytes long
1967(if longer, the rest is truncated and "..." will be appended).
1968
1969The flags argument is as in pv_uni_display().
1970
1971The pointer to the PV of the dsv is returned.
1972
1973=cut */
1974char *
1975Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1976{
1977 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX_const(ssv),
1978 SvCUR(ssv), pvlim, flags);
1979}
1980
1981/*
1982=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1983
1984Return true if the strings s1 and s2 differ case-insensitively, false
1985if not (if they are equal case-insensitively). If u1 is true, the
1986string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1987the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1988are false, the respective string is assumed to be in native 8-bit
1989encoding.
1990
1991If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1992in there (they will point at the beginning of the I<next> character).
1993If the pointers behind pe1 or pe2 are non-NULL, they are the end
1994pointers beyond which scanning will not continue under any
1995circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1996s2+l2 will be used as goal end pointers that will also stop the scan,
1997and which qualify towards defining a successful match: all the scans
1998that define an explicit length must reach their goal pointers for
1999a match to succeed).
2000
2001For case-insensitiveness, the "casefolding" of Unicode is used
2002instead of upper/lowercasing both the characters, see
2003http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2004
2005=cut */
2006I32
2007Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2008{
2009 register const U8 *p1 = (const U8*)s1;
2010 register const U8 *p2 = (const U8*)s2;
2011 register const U8 *f1 = 0, *f2 = 0;
2012 register U8 *e1 = 0, *q1 = 0;
2013 register U8 *e2 = 0, *q2 = 0;
2014 STRLEN n1 = 0, n2 = 0;
2015 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2016 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2017 U8 natbuf[1+1];
2018 STRLEN foldlen1, foldlen2;
2019 bool match;
2020
2021 if (pe1)
2022 e1 = *(U8**)pe1;
2023 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2024 f1 = (const U8*)s1 + l1;
2025 if (pe2)
2026 e2 = *(U8**)pe2;
2027 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2028 f2 = (const U8*)s2 + l2;
2029
2030 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2031 return 1; /* mismatch; possible infinite loop or false positive */
2032
2033 if (!u1 || !u2)
2034 natbuf[1] = 0; /* Need to terminate the buffer. */
2035
2036 while ((e1 == 0 || p1 < e1) &&
2037 (f1 == 0 || p1 < f1) &&
2038 (e2 == 0 || p2 < e2) &&
2039 (f2 == 0 || p2 < f2)) {
2040 if (n1 == 0) {
2041 if (u1)
2042 to_utf8_fold((U8 *)p1, foldbuf1, &foldlen1);
2043 else {
2044 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2045 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2046 }
2047 q1 = foldbuf1;
2048 n1 = foldlen1;
2049 }
2050 if (n2 == 0) {
2051 if (u2)
2052 to_utf8_fold((U8 *)p2, foldbuf2, &foldlen2);
2053 else {
2054 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2055 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2056 }
2057 q2 = foldbuf2;
2058 n2 = foldlen2;
2059 }
2060 while (n1 && n2) {
2061 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2062 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2063 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2064 return 1; /* mismatch */
2065 n1 -= UTF8SKIP(q1);
2066 q1 += UTF8SKIP(q1);
2067 n2 -= UTF8SKIP(q2);
2068 q2 += UTF8SKIP(q2);
2069 }
2070 if (n1 == 0)
2071 p1 += u1 ? UTF8SKIP(p1) : 1;
2072 if (n2 == 0)
2073 p2 += u2 ? UTF8SKIP(p2) : 1;
2074
2075 }
2076
2077 /* A match is defined by all the scans that specified
2078 * an explicit length reaching their final goals. */
2079 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2080
2081 if (match) {
2082 if (pe1)
2083 *pe1 = (char*)p1;
2084 if (pe2)
2085 *pe2 = (char*)p2;
2086 }
2087
2088 return match ? 0 : 1; /* 0 match, 1 mismatch */
2089}
2090
2091/*
2092 * Local variables:
2093 * c-indentation-style: bsd
2094 * c-basic-offset: 4
2095 * indent-tabs-mode: t
2096 * End:
2097 *
2098 * ex: set ts=8 sts=4 sw=4 noet:
2099 */
Note: See TracBrowser for help on using the repository browser.