Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

utf8.c@ 3662

Visit:

Last change on this file since 3662 was 3181, checked in by bird, 18 years ago
perl 5.8.8
File size: 53.0 KB

Line
1	/* utf8.c
2	*
3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4	* by Larry Wall and others
5	*
6	* You may distribute under the terms of either the GNU General Public
7	* License or the Artistic License, as specified in the README file.
8	*
9	*/
10
11	/*
12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13	* heard of that we don't want to see any closer; and that's the one place
14	* we're trying to get to! And that's just where we can't get, nohow.'
15	*
16	* 'Well do I understand your speech,' he answered in the same language;
17	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18	* as is the custom in the West, if you wish to be answered?'
19	*
20	* ...the travellers perceived that the floor was paved with stones of many
21	* hues; branching runes and strange devices intertwined beneath their feet.
22	*/
23
24	#include "EXTERN.h"
25	#define PERL_IN_UTF8_C
26	#include "perl.h"
27
28	static const char unees[] =
29	"Malformed UTF-8 character (unexpected end of string)";
30
31	/*
32	=head1 Unicode Support
33
34	This file contains various utility functions for manipulating UTF8-encoded
35	strings. For the uninitiated, this is a method of representing arbitrary
36	Unicode characters as a variable number of bytes, in such a way that
37	characters in the ASCII range are unmodified, and a zero byte never appears
38	within non-zero characters.
39
40	=for apidoc A\|U8 \|uvuni_to_utf8_flags\|U8 d\|UV uv\|UV flags
41
42	Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
43	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
44	bytes available. The return value is the pointer to the byte after the
45	end of the new character. In other words,
46
47	d = uvuni_to_utf8_flags(d, uv, flags);
48
49	or, in most cases,
50
51	d = uvuni_to_utf8(d, uv);
52
53	(which is equivalent to)
54
55	d = uvuni_to_utf8_flags(d, uv, 0);
56
57	is the recommended Unicode-aware way of saying
58
59	*(d++) = uv;
60
61	=cut
62	*/
63
64	U8 *
65	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
66	{
67	if (ckWARN(WARN_UTF8)) {
68	if (UNICODE_IS_SURROGATE(uv) &&
69	!(flags & UNICODE_ALLOW_SURROGATE))
70	Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
71	else if (
72	((uv >= 0xFDD0 && uv <= 0xFDEF &&
73	!(flags & UNICODE_ALLOW_FDD0))
74	\|\|
75	((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
76	!(flags & UNICODE_ALLOW_FFFF))) &&
77	/* UNICODE_ALLOW_SUPER includes
78	* FFFEs and FFFFs beyond 0x10FFFF. */
79	((uv <= PERL_UNICODE_MAX) \|\|
80	!(flags & UNICODE_ALLOW_SUPER))
81	)
82	Perl_warner(aTHX_ packWARN(WARN_UTF8),
83	"Unicode character 0x%04"UVxf" is illegal", uv);
84	}
85	if (UNI_IS_INVARIANT(uv)) {
86	*d++ = (U8)UTF_TO_NATIVE(uv);
87	return d;
88	}
89	#if defined(EBCDIC)
90	else {
91	STRLEN len = UNISKIP(uv);
92	U8 *p = d+len-1;
93	while (p > d) {
94	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
95	uv >>= UTF_ACCUMULATION_SHIFT;
96	}
97	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
98	return d+len;
99	}
100	#else /* Non loop style */
101	if (uv < 0x800) {
102	*d++ = (U8)(( uv >> 6) \| 0xc0);
103	*d++ = (U8)(( uv & 0x3f) \| 0x80);
104	return d;
105	}
106	if (uv < 0x10000) {
107	*d++ = (U8)(( uv >> 12) \| 0xe0);
108	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
109	*d++ = (U8)(( uv & 0x3f) \| 0x80);
110	return d;
111	}
112	if (uv < 0x200000) {
113	*d++ = (U8)(( uv >> 18) \| 0xf0);
114	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
115	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
116	*d++ = (U8)(( uv & 0x3f) \| 0x80);
117	return d;
118	}
119	if (uv < 0x4000000) {
120	*d++ = (U8)(( uv >> 24) \| 0xf8);
121	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
122	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
123	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
124	*d++ = (U8)(( uv & 0x3f) \| 0x80);
125	return d;
126	}
127	if (uv < 0x80000000) {
128	*d++ = (U8)(( uv >> 30) \| 0xfc);
129	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
130	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
131	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
132	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
133	*d++ = (U8)(( uv & 0x3f) \| 0x80);
134	return d;
135	}
136	#ifdef HAS_QUAD
137	if (uv < UTF8_QUAD_MAX)
138	#endif
139	{
140	d++ = 0xfe; / Can't match U+FEFF! */
141	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
142	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
143	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
144	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
145	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
146	*d++ = (U8)(( uv & 0x3f) \| 0x80);
147	return d;
148	}
149	#ifdef HAS_QUAD
150	{
151	d++ = 0xff; / Can't match U+FFFE! */
152	d++ = 0x80; / 6 Reserved bits */
153	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
154	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
155	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
156	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
157	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
158	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
159	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
160	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
161	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
162	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
163	*d++ = (U8)(( uv & 0x3f) \| 0x80);
164	return d;
165	}
166	#endif
167	#endif /* Loop style */
168	}
169
170	U8 *
171	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172	{
173	return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174	}
175
176	/*
177
178	Tests if some arbitrary number of bytes begins in a valid UTF-8
179	character. Note that an INVARIANT (i.e. ASCII) character is a valid
180	UTF-8 character. The actual number of bytes in the UTF-8 character
181	will be returned if it is valid, otherwise 0.
182
183	This is the "slow" version as opposed to the "fast" version which is
184	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
185	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
186	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
187	you should use the _slow(). In practice this means that the _slow()
188	will be used very rarely, since the maximum Unicode code point (as of
189	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
190	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
191	five bytes or more.
192
193	=cut */
194	STATIC STRLEN
195	S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
196	{
197	U8 u = *s;
198	STRLEN slen;
199	UV uv, ouv;
200
201	if (UTF8_IS_INVARIANT(u))
202	return 1;
203
204	if (!UTF8_IS_START(u))
205	return 0;
206
207	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
208	return 0;
209
210	slen = len - 1;
211	s++;
212	#ifdef EBCDIC
213	u = NATIVE_TO_UTF(u);
214	#endif
215	u &= UTF_START_MASK(len);
216	uv = u;
217	ouv = uv;
218	while (slen--) {
219	if (!UTF8_IS_CONTINUATION(*s))
220	return 0;
221	uv = UTF8_ACCUMULATE(uv, *s);
222	if (uv < ouv)
223	return 0;
224	ouv = uv;
225	s++;
226	}
227
228	if ((STRLEN)UNISKIP(uv) < len)
229	return 0;
230
231	return len;
232	}
233
234	/*
235	=for apidoc A\|STRLEN\|is_utf8_char\|U8 *s
236
237	Tests if some arbitrary number of bytes begins in a valid UTF-8
238	character. Note that an INVARIANT (i.e. ASCII) character is a valid
239	UTF-8 character. The actual number of bytes in the UTF-8 character
240	will be returned if it is valid, otherwise 0.
241
242	=cut */
243	STRLEN
244	Perl_is_utf8_char(pTHX_ U8 *s)
245	{
246	const STRLEN len = UTF8SKIP(s);
247	#ifdef IS_UTF8_CHAR
248	if (IS_UTF8_CHAR_FAST(len))
249	return IS_UTF8_CHAR(s, len) ? len : 0;
250	#endif /* #ifdef IS_UTF8_CHAR */
251	return is_utf8_char_slow(s, len);
252	}
253
254	/*
255	=for apidoc A\|bool\|is_utf8_string\|U8 *s\|STRLEN len
256
257	Returns true if first C<len> bytes of the given string form a valid
258	UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
259	not mean 'a string that contains code points above 0x7F encoded in UTF-8'
260	because a valid ASCII string is a valid UTF-8 string.
261
262	See also is_utf8_string_loclen() and is_utf8_string_loc().
263
264	=cut
265	*/
266
267	bool
268	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
269	{
270	const U8* x = s;
271	const U8* send;
272
273	if (!len)
274	len = strlen((const char *)s);
275	send = s + len;
276
277	while (x < send) {
278	STRLEN c;
279	/* Inline the easy bits of is_utf8_char() here for speed... */
280	if (UTF8_IS_INVARIANT(*x))
281	c = 1;
282	else if (!UTF8_IS_START(*x))
283	goto out;
284	else {
285	/* ... and call is_utf8_char() only if really needed. */
286	#ifdef IS_UTF8_CHAR
287	c = UTF8SKIP(x);
288	if (IS_UTF8_CHAR_FAST(c)) {
289	if (!IS_UTF8_CHAR(x, c))
290	goto out;
291	} else if (!is_utf8_char_slow(x, c))
292	goto out;
293	#else
294	c = is_utf8_char(x);
295	#endif /* #ifdef IS_UTF8_CHAR */
296	if (!c)
297	goto out;
298	}
299	x += c;
300	}
301
302	out:
303	if (x != send)
304	return FALSE;
305
306	return TRUE;
307	}
308
309	/*
310	=for apidoc A\|bool\|is_utf8_string_loclen\|const U8 s\|STRLEN len\|const U8 ep\|const STRLEN el
311
312	Like is_utf8_string() but stores the location of the failure (in the
313	case of "utf8ness failure") or the location s+len (in the case of
314	"utf8ness success") in the C<ep>, and the number of UTF-8
315	encoded characters in the C<el>.
316
317	See also is_utf8_string_loc() and is_utf8_string().
318
319	=cut
320	*/
321
322	bool
323	Perl_is_utf8_string_loclen(pTHX_ const U8 s, STRLEN len, const U8 ep, STRLEN el)
324	{
325	const U8* x = s;
326	const U8* send;
327	STRLEN c;
328
329	if (!len)
330	len = strlen((const char *)s);
331	send = s + len;
332	if (el)
333	*el = 0;
334
335	while (x < send) {
336	/* Inline the easy bits of is_utf8_char() here for speed... */
337	if (UTF8_IS_INVARIANT(*x))
338	c = 1;
339	else if (!UTF8_IS_START(*x))
340	goto out;
341	else {
342	/* ... and call is_utf8_char() only if really needed. */
343	#ifdef IS_UTF8_CHAR
344	c = UTF8SKIP(x);
345	if (IS_UTF8_CHAR_FAST(c)) {
346	if (!IS_UTF8_CHAR(x, c))
347	c = 0;
348	} else
349	c = is_utf8_char_slow(x, c);
350	#else
351	c = is_utf8_char(x);
352	#endif /* #ifdef IS_UTF8_CHAR */
353	if (!c)
354	goto out;
355	}
356	x += c;
357	if (el)
358	(*el)++;
359	}
360
361	out:
362	if (ep)
363	*ep = x;
364	if (x != send)
365	return FALSE;
366
367	return TRUE;
368	}
369
370	/*
371	=for apidoc A\|bool\|is_utf8_string_loc\|const U8 s\|STRLEN len\|const U8 ep\|const STRLEN el
372
373	Like is_utf8_string() but stores the location of the failure (in the
374	case of "utf8ness failure") or the location s+len (in the case of
375	"utf8ness success") in the C<ep>.
376
377	See also is_utf8_string_loclen() and is_utf8_string().
378
379	=cut
380	*/
381
382	bool
383	Perl_is_utf8_string_loc(pTHX_ U8 s, STRLEN len, U8 *ep)
384	{
385	return is_utf8_string_loclen(s, len, (const U8 **)ep, 0);
386	}
387
388	/*
389	=for apidoc A\|UV\|utf8n_to_uvuni\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
390
391	Bottom level UTF-8 decode routine.
392	Returns the unicode code point value of the first character in the string C<s>
393	which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
394	C<retlen> will be set to the length, in bytes, of that character.
395
396	If C<s> does not point to a well-formed UTF-8 character, the behaviour
397	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
398	it is assumed that the caller will raise a warning, and this function
399	will silently just set C<retlen> to C<-1> and return zero. If the
400	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
401	malformations will be given, C<retlen> will be set to the expected
402	length of the UTF-8 character in bytes, and zero will be returned.
403
404	The C<flags> can also contain various flags to allow deviations from
405	the strict UTF-8 encoding (see F<utf8.h>).
406
407	Most code should use utf8_to_uvchr() rather than call this directly.
408
409	=cut
410	*/
411
412	UV
413	Perl_utf8n_to_uvuni(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
414	{
415	const U8 *s0 = s;
416	UV uv = *s, ouv = 0;
417	STRLEN len = 1;
418	const bool dowarn = ckWARN_d(WARN_UTF8);
419	const UV startbyte = *s;
420	STRLEN expectlen = 0;
421	U32 warning = 0;
422
423	/* This list is a superset of the UTF8_ALLOW_XXX. */
424
425	#define UTF8_WARN_EMPTY 1
426	#define UTF8_WARN_CONTINUATION 2
427	#define UTF8_WARN_NON_CONTINUATION 3
428	#define UTF8_WARN_FE_FF 4
429	#define UTF8_WARN_SHORT 5
430	#define UTF8_WARN_OVERFLOW 6
431	#define UTF8_WARN_SURROGATE 7
432	#define UTF8_WARN_LONG 8
433	#define UTF8_WARN_FFFF 9 /* Also FFFE. */
434
435	if (curlen == 0 &&
436	!(flags & UTF8_ALLOW_EMPTY)) {
437	warning = UTF8_WARN_EMPTY;
438	goto malformed;
439	}
440
441	if (UTF8_IS_INVARIANT(uv)) {
442	if (retlen)
443	*retlen = 1;
444	return (UV) (NATIVE_TO_UTF(*s));
445	}
446
447	if (UTF8_IS_CONTINUATION(uv) &&
448	!(flags & UTF8_ALLOW_CONTINUATION)) {
449	warning = UTF8_WARN_CONTINUATION;
450	goto malformed;
451	}
452
453	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
454	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
455	warning = UTF8_WARN_NON_CONTINUATION;
456	goto malformed;
457	}
458
459	#ifdef EBCDIC
460	uv = NATIVE_TO_UTF(uv);
461	#else
462	if ((uv == 0xfe \|\| uv == 0xff) &&
463	!(flags & UTF8_ALLOW_FE_FF)) {
464	warning = UTF8_WARN_FE_FF;
465	goto malformed;
466	}
467	#endif
468
469	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
470	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
471	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
472	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
473	#ifdef EBCDIC
474	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
475	else { len = 7; uv &= 0x01; }
476	#else
477	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
478	else if (!(uv & 0x01)) { len = 7; uv = 0; }
479	else { len = 13; uv = 0; } /* whoa! */
480	#endif
481
482	if (retlen)
483	*retlen = len;
484
485	expectlen = len;
486
487	if ((curlen < expectlen) &&
488	!(flags & UTF8_ALLOW_SHORT)) {
489	warning = UTF8_WARN_SHORT;
490	goto malformed;
491	}
492
493	len--;
494	s++;
495	ouv = uv;
496
497	while (len--) {
498	if (!UTF8_IS_CONTINUATION(*s) &&
499	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
500	s--;
501	warning = UTF8_WARN_NON_CONTINUATION;
502	goto malformed;
503	}
504	else
505	uv = UTF8_ACCUMULATE(uv, *s);
506	if (!(uv > ouv)) {
507	/* These cannot be allowed. */
508	if (uv == ouv) {
509	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
510	warning = UTF8_WARN_LONG;
511	goto malformed;
512	}
513	}
514	else { /* uv < ouv */
515	/* This cannot be allowed. */
516	warning = UTF8_WARN_OVERFLOW;
517	goto malformed;
518	}
519	}
520	s++;
521	ouv = uv;
522	}
523
524	if (UNICODE_IS_SURROGATE(uv) &&
525	!(flags & UTF8_ALLOW_SURROGATE)) {
526	warning = UTF8_WARN_SURROGATE;
527	goto malformed;
528	} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
529	!(flags & UTF8_ALLOW_LONG)) {
530	warning = UTF8_WARN_LONG;
531	goto malformed;
532	} else if (UNICODE_IS_ILLEGAL(uv) &&
533	!(flags & UTF8_ALLOW_FFFF)) {
534	warning = UTF8_WARN_FFFF;
535	goto malformed;
536	}
537
538	return uv;
539
540	malformed:
541
542	if (flags & UTF8_CHECK_ONLY) {
543	if (retlen)
544	*retlen = -1;
545	return 0;
546	}
547
548	if (dowarn) {
549	SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
550
551	switch (warning) {
552	case 0: /* Intentionally empty. */ break;
553	case UTF8_WARN_EMPTY:
554	Perl_sv_catpv(aTHX_ sv, "(empty string)");
555	break;
556	case UTF8_WARN_CONTINUATION:
557	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
558	break;
559	case UTF8_WARN_NON_CONTINUATION:
560	if (s == s0)
561	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
562	(UV)s[1], startbyte);
563	else {
564	const int len = (int)(s-s0);
565	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
566	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
567	}
568
569	break;
570	case UTF8_WARN_FE_FF:
571	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
572	break;
573	case UTF8_WARN_SHORT:
574	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
575	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
576	expectlen = curlen; /* distance for caller to skip */
577	break;
578	case UTF8_WARN_OVERFLOW:
579	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
580	ouv, *s, startbyte);
581	break;
582	case UTF8_WARN_SURROGATE:
583	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
584	break;
585	case UTF8_WARN_LONG:
586	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
587	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
588	break;
589	case UTF8_WARN_FFFF:
590	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
591	break;
592	default:
593	Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
594	break;
595	}
596
597	if (warning) {
598	const char * const s = SvPVX_const(sv);
599
600	if (PL_op)
601	Perl_warner(aTHX_ packWARN(WARN_UTF8),
602	"%s in %s", s, OP_DESC(PL_op));
603	else
604	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
605	}
606	}
607
608	if (retlen)
609	*retlen = expectlen ? expectlen : len;
610
611	return 0;
612	}
613
614	/*
615	=for apidoc A\|UV\|utf8_to_uvchr\|U8 s\|STRLEN retlen
616
617	Returns the native character value of the first character in the string C<s>
618	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
619	length, in bytes, of that character.
620
621	If C<s> does not point to a well-formed UTF-8 character, zero is
622	returned and retlen is set, if possible, to -1.
623
624	=cut
625	*/
626
627	UV
628	Perl_utf8_to_uvchr(pTHX_ U8 s, STRLEN retlen)
629	{
630	return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
631	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
632	}
633
634	/*
635	=for apidoc A\|UV\|utf8_to_uvuni\|U8 s\|STRLEN retlen
636
637	Returns the Unicode code point of the first character in the string C<s>
638	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
639	length, in bytes, of that character.
640
641	This function should only be used when returned UV is considered
642	an index into the Unicode semantic tables (e.g. swashes).
643
644	If C<s> does not point to a well-formed UTF-8 character, zero is
645	returned and retlen is set, if possible, to -1.
646
647	=cut
648	*/
649
650	UV
651	Perl_utf8_to_uvuni(pTHX_ U8 s, STRLEN retlen)
652	{
653	/* Call the low level routine asking for checks */
654	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
655	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
656	}
657
658	/*
659	=for apidoc A\|STRLEN\|utf8_length\|U8 s\|U8 e
660
661	Return the length of the UTF-8 char encoded string C<s> in characters.
662	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
663	up past C<e>, croaks.
664
665	=cut
666	*/
667
668	STRLEN
669	Perl_utf8_length(pTHX_ U8 s, U8 e)
670	{
671	STRLEN len = 0;
672
673	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
674	* the bitops (especially ~) can create illegal UTF-8.
675	* In other words: in Perl UTF-8 is not just for Unicode. */
676
677	if (e < s)
678	goto warn_and_return;
679	while (s < e) {
680	const U8 t = UTF8SKIP(s);
681	if (e - s < t) {
682	warn_and_return:
683	if (ckWARN_d(WARN_UTF8)) {
684	if (PL_op)
685	Perl_warner(aTHX_ packWARN(WARN_UTF8),
686	"%s in %s", unees, OP_DESC(PL_op));
687	else
688	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
689	}
690	return len;
691	}
692	s += t;
693	len++;
694	}
695
696	return len;
697	}
698
699	/*
700	=for apidoc A\|IV\|utf8_distance\|U8 a\|U8 b
701
702	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
703	and C<b>.
704
705	WARNING: use only if you know that the pointers point inside the
706	same UTF-8 buffer.
707
708	=cut
709	*/
710
711	IV
712	Perl_utf8_distance(pTHX_ U8 a, U8 b)
713	{
714	IV off = 0;
715
716	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
717	* the bitops (especially ~) can create illegal UTF-8.
718	* In other words: in Perl UTF-8 is not just for Unicode. */
719
720	if (a < b) {
721	while (a < b) {
722	const U8 c = UTF8SKIP(a);
723	if (b - a < c)
724	goto warn_and_return;
725	a += c;
726	off--;
727	}
728	}
729	else {
730	while (b < a) {
731	const U8 c = UTF8SKIP(b);
732
733	if (a - b < c) {
734	warn_and_return:
735	if (ckWARN_d(WARN_UTF8)) {
736	if (PL_op)
737	Perl_warner(aTHX_ packWARN(WARN_UTF8),
738	"%s in %s", unees, OP_DESC(PL_op));
739	else
740	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
741	}
742	return off;
743	}
744	b += c;
745	off++;
746	}
747	}
748
749	return off;
750	}
751
752	/*
753	=for apidoc A\|U8 \|utf8_hop\|U8 s\|I32 off
754
755	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
756	forward or backward.
757
758	WARNING: do not use the following unless you know C<off> is within
759	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
760	on the first byte of character or just after the last byte of a character.
761
762	=cut
763	*/
764
765	U8 *
766	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
767	{
768	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
769	* the bitops (especially ~) can create illegal UTF-8.
770	* In other words: in Perl UTF-8 is not just for Unicode. */
771
772	if (off >= 0) {
773	while (off--)
774	s += UTF8SKIP(s);
775	}
776	else {
777	while (off++) {
778	s--;
779	while (UTF8_IS_CONTINUATION(*s))
780	s--;
781	}
782	}
783	return (U8 *)s;
784	}
785
786	/*
787	=for apidoc A\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
788
789	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
790	Unlike C<bytes_to_utf8>, this over-writes the original string, and
791	updates len to contain the new length.
792	Returns zero on failure, setting C<len> to -1.
793
794	=cut
795	*/
796
797	U8 *
798	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
799	{
800	U8 *send;
801	U8 *d;
802	U8 *save = s;
803
804	/* ensure valid UTF-8 and chars < 256 before updating string */
805	for (send = s + *len; s < send; ) {
806	U8 c = *s++;
807
808	if (!UTF8_IS_INVARIANT(c) &&
809	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
810	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
811	*len = -1;
812	return 0;
813	}
814	}
815
816	d = s = save;
817	while (s < send) {
818	STRLEN ulen;
819	*d++ = (U8)utf8_to_uvchr(s, &ulen);
820	s += ulen;
821	}
822	*d = '\0';
823	*len = d - save;
824	return save;
825	}
826
827	/*
828	=for apidoc A\|U8 \|bytes_from_utf8\|const U8 s\|STRLEN len\|bool is_utf8
829
830	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
831	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
832	the newly-created string, and updates C<len> to contain the new
833	length. Returns the original string if no conversion occurs, C<len>
834	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
835	0 if C<s> is converted or contains all 7bit characters.
836
837	=cut
838	*/
839
840	U8 *
841	Perl_bytes_from_utf8(pTHX_ U8 s, STRLEN len, bool *is_utf8)
842	{
843	U8 *d;
844	const U8 *start = s;
845	const U8 *send;
846	I32 count = 0;
847	const U8 *s2;
848
849	if (!*is_utf8)
850	return (U8 *)start;
851
852	/* ensure valid UTF-8 and chars < 256 before converting string */
853	for (send = s + *len; s < send;) {
854	U8 c = *s++;
855	if (!UTF8_IS_INVARIANT(c)) {
856	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
857	(c = *s++) && UTF8_IS_CONTINUATION(c))
858	count++;
859	else
860	return (U8 *)start;
861	}
862	}
863
864	*is_utf8 = 0;
865
866	Newxz(d, (*len) - count + 1, U8);
867	s2 = start; start = d;
868	while (s2 < send) {
869	U8 c = *s2++;
870	if (!UTF8_IS_INVARIANT(c)) {
871	/* Then it is two-byte encoded */
872	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s2++);
873	c = ASCII_TO_NATIVE(c);
874	}
875	*d++ = c;
876	}
877	*d = '\0';
878	*len = d - start;
879	return (U8 *)start;
880	}
881
882	/*
883	=for apidoc A\|U8 \|bytes_to_utf8\|U8 s\|STRLEN *len
884
885	Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
886	Returns a pointer to the newly-created string, and sets C<len> to
887	reflect the new length.
888
889	If you want to convert to UTF-8 from other encodings than ASCII,
890	see sv_recode_to_utf8().
891
892	=cut
893	*/
894
895	U8*
896	Perl_bytes_to_utf8(pTHX_ U8 s, STRLEN len)
897	{
898	const U8 * const send = s + (*len);
899	U8 *d;
900	U8 *dst;
901
902	Newxz(d, (len) 2 + 1, U8);
903	dst = d;
904
905	while (s < send) {
906	const UV uv = NATIVE_TO_ASCII(*s++);
907	if (UNI_IS_INVARIANT(uv))
908	*d++ = (U8)UTF_TO_NATIVE(uv);
909	else {
910	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
911	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
912	}
913	}
914	*d = '\0';
915	*len = d-dst;
916	return dst;
917	}
918
919	/*
920	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
921	*
922	* Destination must be pre-extended to 3/2 source. Do not use in-place.
923	* We optimize for native, for obvious reasons. */
924
925	U8*
926	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
927	{
928	U8* pend;
929	U8* dstart = d;
930
931	if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
932	d[0] = 0;
933	*newlen = 1;
934	return d;
935	}
936
937	if (bytelen & 1)
938	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
939
940	pend = p + bytelen;
941
942	while (p < pend) {
943	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
944	p += 2;
945	if (uv < 0x80) {
946	*d++ = (U8)uv;
947	continue;
948	}
949	if (uv < 0x800) {
950	*d++ = (U8)(( uv >> 6) \| 0xc0);
951	*d++ = (U8)(( uv & 0x3f) \| 0x80);
952	continue;
953	}
954	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
955	UV low = (p[0] << 8) + p[1];
956	p += 2;
957	if (low < 0xdc00 \|\| low >= 0xdfff)
958	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
959	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
960	}
961	if (uv < 0x10000) {
962	*d++ = (U8)(( uv >> 12) \| 0xe0);
963	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
964	*d++ = (U8)(( uv & 0x3f) \| 0x80);
965	continue;
966	}
967	else {
968	*d++ = (U8)(( uv >> 18) \| 0xf0);
969	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
970	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
971	*d++ = (U8)(( uv & 0x3f) \| 0x80);
972	continue;
973	}
974	}
975	*newlen = d - dstart;
976	return d;
977	}
978
979	/* Note: this one is slightly destructive of the source. */
980
981	U8*
982	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
983	{
984	U8* s = (U8*)p;
985	U8* send = s + bytelen;
986	while (s < send) {
987	U8 tmp = s[0];
988	s[0] = s[1];
989	s[1] = tmp;
990	s += 2;
991	}
992	return utf16_to_utf8(p, d, bytelen, newlen);
993	}
994
995	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
996
997	bool
998	Perl_is_uni_alnum(pTHX_ UV c)
999	{
1000	U8 tmpbuf[UTF8_MAXBYTES+1];
1001	uvchr_to_utf8(tmpbuf, c);
1002	return is_utf8_alnum(tmpbuf);
1003	}
1004
1005	bool
1006	Perl_is_uni_alnumc(pTHX_ UV c)
1007	{
1008	U8 tmpbuf[UTF8_MAXBYTES+1];
1009	uvchr_to_utf8(tmpbuf, c);
1010	return is_utf8_alnumc(tmpbuf);
1011	}
1012
1013	bool
1014	Perl_is_uni_idfirst(pTHX_ UV c)
1015	{
1016	U8 tmpbuf[UTF8_MAXBYTES+1];
1017	uvchr_to_utf8(tmpbuf, c);
1018	return is_utf8_idfirst(tmpbuf);
1019	}
1020
1021	bool
1022	Perl_is_uni_alpha(pTHX_ UV c)
1023	{
1024	U8 tmpbuf[UTF8_MAXBYTES+1];
1025	uvchr_to_utf8(tmpbuf, c);
1026	return is_utf8_alpha(tmpbuf);
1027	}
1028
1029	bool
1030	Perl_is_uni_ascii(pTHX_ UV c)
1031	{
1032	U8 tmpbuf[UTF8_MAXBYTES+1];
1033	uvchr_to_utf8(tmpbuf, c);
1034	return is_utf8_ascii(tmpbuf);
1035	}
1036
1037	bool
1038	Perl_is_uni_space(pTHX_ UV c)
1039	{
1040	U8 tmpbuf[UTF8_MAXBYTES+1];
1041	uvchr_to_utf8(tmpbuf, c);
1042	return is_utf8_space(tmpbuf);
1043	}
1044
1045	bool
1046	Perl_is_uni_digit(pTHX_ UV c)
1047	{
1048	U8 tmpbuf[UTF8_MAXBYTES+1];
1049	uvchr_to_utf8(tmpbuf, c);
1050	return is_utf8_digit(tmpbuf);
1051	}
1052
1053	bool
1054	Perl_is_uni_upper(pTHX_ UV c)
1055	{
1056	U8 tmpbuf[UTF8_MAXBYTES+1];
1057	uvchr_to_utf8(tmpbuf, c);
1058	return is_utf8_upper(tmpbuf);
1059	}
1060
1061	bool
1062	Perl_is_uni_lower(pTHX_ UV c)
1063	{
1064	U8 tmpbuf[UTF8_MAXBYTES+1];
1065	uvchr_to_utf8(tmpbuf, c);
1066	return is_utf8_lower(tmpbuf);
1067	}
1068
1069	bool
1070	Perl_is_uni_cntrl(pTHX_ UV c)
1071	{
1072	U8 tmpbuf[UTF8_MAXBYTES+1];
1073	uvchr_to_utf8(tmpbuf, c);
1074	return is_utf8_cntrl(tmpbuf);
1075	}
1076
1077	bool
1078	Perl_is_uni_graph(pTHX_ UV c)
1079	{
1080	U8 tmpbuf[UTF8_MAXBYTES+1];
1081	uvchr_to_utf8(tmpbuf, c);
1082	return is_utf8_graph(tmpbuf);
1083	}
1084
1085	bool
1086	Perl_is_uni_print(pTHX_ UV c)
1087	{
1088	U8 tmpbuf[UTF8_MAXBYTES+1];
1089	uvchr_to_utf8(tmpbuf, c);
1090	return is_utf8_print(tmpbuf);
1091	}
1092
1093	bool
1094	Perl_is_uni_punct(pTHX_ UV c)
1095	{
1096	U8 tmpbuf[UTF8_MAXBYTES+1];
1097	uvchr_to_utf8(tmpbuf, c);
1098	return is_utf8_punct(tmpbuf);
1099	}
1100
1101	bool
1102	Perl_is_uni_xdigit(pTHX_ UV c)
1103	{
1104	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1105	uvchr_to_utf8(tmpbuf, c);
1106	return is_utf8_xdigit(tmpbuf);
1107	}
1108
1109	UV
1110	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1111	{
1112	uvchr_to_utf8(p, c);
1113	return to_utf8_upper(p, p, lenp);
1114	}
1115
1116	UV
1117	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1118	{
1119	uvchr_to_utf8(p, c);
1120	return to_utf8_title(p, p, lenp);
1121	}
1122
1123	UV
1124	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1125	{
1126	uvchr_to_utf8(p, c);
1127	return to_utf8_lower(p, p, lenp);
1128	}
1129
1130	UV
1131	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1132	{
1133	uvchr_to_utf8(p, c);
1134	return to_utf8_fold(p, p, lenp);
1135	}
1136
1137	/* for now these all assume no locale info available for Unicode > 255 */
1138
1139	bool
1140	Perl_is_uni_alnum_lc(pTHX_ UV c)
1141	{
1142	return is_uni_alnum(c); /* XXX no locale support yet */
1143	}
1144
1145	bool
1146	Perl_is_uni_alnumc_lc(pTHX_ UV c)
1147	{
1148	return is_uni_alnumc(c); /* XXX no locale support yet */
1149	}
1150
1151	bool
1152	Perl_is_uni_idfirst_lc(pTHX_ UV c)
1153	{
1154	return is_uni_idfirst(c); /* XXX no locale support yet */
1155	}
1156
1157	bool
1158	Perl_is_uni_alpha_lc(pTHX_ UV c)
1159	{
1160	return is_uni_alpha(c); /* XXX no locale support yet */
1161	}
1162
1163	bool
1164	Perl_is_uni_ascii_lc(pTHX_ UV c)
1165	{
1166	return is_uni_ascii(c); /* XXX no locale support yet */
1167	}
1168
1169	bool
1170	Perl_is_uni_space_lc(pTHX_ UV c)
1171	{
1172	return is_uni_space(c); /* XXX no locale support yet */
1173	}
1174
1175	bool
1176	Perl_is_uni_digit_lc(pTHX_ UV c)
1177	{
1178	return is_uni_digit(c); /* XXX no locale support yet */
1179	}
1180
1181	bool
1182	Perl_is_uni_upper_lc(pTHX_ UV c)
1183	{
1184	return is_uni_upper(c); /* XXX no locale support yet */
1185	}
1186
1187	bool
1188	Perl_is_uni_lower_lc(pTHX_ UV c)
1189	{
1190	return is_uni_lower(c); /* XXX no locale support yet */
1191	}
1192
1193	bool
1194	Perl_is_uni_cntrl_lc(pTHX_ UV c)
1195	{
1196	return is_uni_cntrl(c); /* XXX no locale support yet */
1197	}
1198
1199	bool
1200	Perl_is_uni_graph_lc(pTHX_ UV c)
1201	{
1202	return is_uni_graph(c); /* XXX no locale support yet */
1203	}
1204
1205	bool
1206	Perl_is_uni_print_lc(pTHX_ UV c)
1207	{
1208	return is_uni_print(c); /* XXX no locale support yet */
1209	}
1210
1211	bool
1212	Perl_is_uni_punct_lc(pTHX_ UV c)
1213	{
1214	return is_uni_punct(c); /* XXX no locale support yet */
1215	}
1216
1217	bool
1218	Perl_is_uni_xdigit_lc(pTHX_ UV c)
1219	{
1220	return is_uni_xdigit(c); /* XXX no locale support yet */
1221	}
1222
1223	U32
1224	Perl_to_uni_upper_lc(pTHX_ U32 c)
1225	{
1226	/* XXX returns only the first character -- do not use XXX */
1227	/* XXX no locale support yet */
1228	STRLEN len;
1229	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1230	return (U32)to_uni_upper(c, tmpbuf, &len);
1231	}
1232
1233	U32
1234	Perl_to_uni_title_lc(pTHX_ U32 c)
1235	{
1236	/* XXX returns only the first character XXX -- do not use XXX */
1237	/* XXX no locale support yet */
1238	STRLEN len;
1239	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1240	return (U32)to_uni_title(c, tmpbuf, &len);
1241	}
1242
1243	U32
1244	Perl_to_uni_lower_lc(pTHX_ U32 c)
1245	{
1246	/* XXX returns only the first character -- do not use XXX */
1247	/* XXX no locale support yet */
1248	STRLEN len;
1249	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1250	return (U32)to_uni_lower(c, tmpbuf, &len);
1251	}
1252
1253	bool
1254	Perl_is_utf8_alnum(pTHX_ U8 *p)
1255	{
1256	if (!is_utf8_char(p))
1257	return FALSE;
1258	if (!PL_utf8_alnum)
1259	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1260	* descendant of isalnum(3), in other words, it doesn't
1261	* contain the '_'. --jhi */
1262	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1263	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1264	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
1265	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1266	if (!PL_utf8_alnum)
1267	PL_utf8_alnum = swash_init("utf8", "",
1268	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1269	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1270	#endif
1271	}
1272
1273	bool
1274	Perl_is_utf8_alnumc(pTHX_ U8 *p)
1275	{
1276	if (!is_utf8_char(p))
1277	return FALSE;
1278	if (!PL_utf8_alnumc)
1279	PL_utf8_alnumc = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1280	return swash_fetch(PL_utf8_alnumc, p, TRUE) != 0;
1281	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
1282	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1283	if (!PL_utf8_alnum)
1284	PL_utf8_alnum = swash_init("utf8", "",
1285	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1286	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1287	#endif
1288	}
1289
1290	bool
1291	Perl_is_utf8_idfirst(pTHX_ U8 p) / The naming is historical. */
1292	{
1293	if (*p == '_')
1294	return TRUE;
1295	if (!is_utf8_char(p))
1296	return FALSE;
1297	if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1298	PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1299	return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1300	}
1301
1302	bool
1303	Perl_is_utf8_idcont(pTHX_ U8 *p)
1304	{
1305	if (*p == '_')
1306	return TRUE;
1307	if (!is_utf8_char(p))
1308	return FALSE;
1309	if (!PL_utf8_idcont)
1310	PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1311	return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1312	}
1313
1314	bool
1315	Perl_is_utf8_alpha(pTHX_ U8 *p)
1316	{
1317	if (!is_utf8_char(p))
1318	return FALSE;
1319	if (!PL_utf8_alpha)
1320	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1321	return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1322	}
1323
1324	bool
1325	Perl_is_utf8_ascii(pTHX_ U8 *p)
1326	{
1327	if (!is_utf8_char(p))
1328	return FALSE;
1329	if (!PL_utf8_ascii)
1330	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1331	return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1332	}
1333
1334	bool
1335	Perl_is_utf8_space(pTHX_ U8 *p)
1336	{
1337	if (!is_utf8_char(p))
1338	return FALSE;
1339	if (!PL_utf8_space)
1340	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1341	return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1342	}
1343
1344	bool
1345	Perl_is_utf8_digit(pTHX_ U8 *p)
1346	{
1347	if (!is_utf8_char(p))
1348	return FALSE;
1349	if (!PL_utf8_digit)
1350	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1351	return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1352	}
1353
1354	bool
1355	Perl_is_utf8_upper(pTHX_ U8 *p)
1356	{
1357	if (!is_utf8_char(p))
1358	return FALSE;
1359	if (!PL_utf8_upper)
1360	PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1361	return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1362	}
1363
1364	bool
1365	Perl_is_utf8_lower(pTHX_ U8 *p)
1366	{
1367	if (!is_utf8_char(p))
1368	return FALSE;
1369	if (!PL_utf8_lower)
1370	PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1371	return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1372	}
1373
1374	bool
1375	Perl_is_utf8_cntrl(pTHX_ U8 *p)
1376	{
1377	if (!is_utf8_char(p))
1378	return FALSE;
1379	if (!PL_utf8_cntrl)
1380	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1381	return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1382	}
1383
1384	bool
1385	Perl_is_utf8_graph(pTHX_ U8 *p)
1386	{
1387	if (!is_utf8_char(p))
1388	return FALSE;
1389	if (!PL_utf8_graph)
1390	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1391	return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1392	}
1393
1394	bool
1395	Perl_is_utf8_print(pTHX_ U8 *p)
1396	{
1397	if (!is_utf8_char(p))
1398	return FALSE;
1399	if (!PL_utf8_print)
1400	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1401	return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1402	}
1403
1404	bool
1405	Perl_is_utf8_punct(pTHX_ U8 *p)
1406	{
1407	if (!is_utf8_char(p))
1408	return FALSE;
1409	if (!PL_utf8_punct)
1410	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1411	return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1412	}
1413
1414	bool
1415	Perl_is_utf8_xdigit(pTHX_ U8 *p)
1416	{
1417	if (!is_utf8_char(p))
1418	return FALSE;
1419	if (!PL_utf8_xdigit)
1420	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1421	return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1422	}
1423
1424	bool
1425	Perl_is_utf8_mark(pTHX_ U8 *p)
1426	{
1427	if (!is_utf8_char(p))
1428	return FALSE;
1429	if (!PL_utf8_mark)
1430	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1431	return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1432	}
1433
1434	/*
1435	=for apidoc A\|UV\|to_utf8_case\|U8 p\|U8 ustrp\|STRLEN lenp\|SV swash\|char normal\|char *special
1436
1437	The "p" contains the pointer to the UTF-8 string encoding
1438	the character that is being converted.
1439
1440	The "ustrp" is a pointer to the character buffer to put the
1441	conversion result to. The "lenp" is a pointer to the length
1442	of the result.
1443
1444	The "swashp" is a pointer to the swash to use.
1445
1446	Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1447	and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1448	but not always, a multicharacter mapping), is tried first.
1449
1450	The "special" is a string like "utf8::ToSpecLower", which means the
1451	hash %utf8::ToSpecLower. The access to the hash is through
1452	Perl_to_utf8_case().
1453
1454	The "normal" is a string like "ToLower" which means the swash
1455	%utf8::ToLower.
1456
1457	=cut */
1458
1459	UV
1460	Perl_to_utf8_case(pTHX_ U8 p, U8 ustrp, STRLEN lenp, SV swashp, char normal, char *special)
1461	{
1462	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1463	STRLEN len = 0;
1464
1465	const UV uv0 = utf8_to_uvchr(p, NULL);
1466	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1467	* are necessary in EBCDIC, they are redundant no-ops
1468	* in ASCII-ish platforms, and hopefully optimized away. */
1469	const UV uv1 = NATIVE_TO_UNI(uv0);
1470	uvuni_to_utf8(tmpbuf, uv1);
1471
1472	if (!swashp) / load on-demand */
1473	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1474
1475	/* The 0xDF is the only special casing Unicode code point below 0x100. */
1476	if (special && (uv1 == 0xDF \|\| uv1 > 0xFF)) {
1477	/* It might be "special" (sometimes, but not always,
1478	* a multicharacter mapping) */
1479	HV *hv;
1480	SV **svp;
1481
1482	if ((hv = get_hv(special, FALSE)) &&
1483	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1484	(*svp)) {
1485	const char *s;
1486
1487	s = SvPV_const(*svp, len);
1488	if (len == 1)
1489	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
1490	else {
1491	#ifdef EBCDIC
1492	/* If we have EBCDIC we need to remap the characters
1493	* since any characters in the low 256 are Unicode
1494	* code points, not EBCDIC. */
1495	U8 t = (U8)s, tend = t + len, d;
1496
1497	d = tmpbuf;
1498	if (SvUTF8(*svp)) {
1499	STRLEN tlen = 0;
1500
1501	while (t < tend) {
1502	UV c = utf8_to_uvchr(t, &tlen);
1503	if (tlen > 0) {
1504	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1505	t += tlen;
1506	}
1507	else
1508	break;
1509	}
1510	}
1511	else {
1512	while (t < tend) {
1513	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1514	t++;
1515	}
1516	}
1517	len = d - tmpbuf;
1518	Copy(tmpbuf, ustrp, len, U8);
1519	#else
1520	Copy(s, ustrp, len, U8);
1521	#endif
1522	}
1523	}
1524	}
1525
1526	if (!len && *swashp) {
1527	UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1528
1529	if (uv2) {
1530	/* It was "normal" (a single character mapping). */
1531	UV uv3 = UNI_TO_NATIVE(uv2);
1532
1533	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1534	}
1535	}
1536
1537	if (!len) /* Neither: just copy. */
1538	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1539
1540	if (lenp)
1541	*lenp = len;
1542
1543	return len ? utf8_to_uvchr(ustrp, 0) : 0;
1544	}
1545
1546	/*
1547	=for apidoc A\|UV\|to_utf8_upper\|U8 p\|U8 ustrp\|STRLEN *lenp
1548
1549	Convert the UTF-8 encoded character at p to its uppercase version and
1550	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1551	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1552	the uppercase version may be longer than the original character.
1553
1554	The first character of the uppercased version is returned
1555	(but note, as explained above, that there may be more.)
1556
1557	=cut */
1558
1559	UV
1560	Perl_to_utf8_upper(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
1561	{
1562	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1563	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1564	}
1565
1566	/*
1567	=for apidoc A\|UV\|to_utf8_title\|U8 p\|U8 ustrp\|STRLEN *lenp
1568
1569	Convert the UTF-8 encoded character at p to its titlecase version and
1570	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1571	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1572	titlecase version may be longer than the original character.
1573
1574	The first character of the titlecased version is returned
1575	(but note, as explained above, that there may be more.)
1576
1577	=cut */
1578
1579	UV
1580	Perl_to_utf8_title(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
1581	{
1582	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1583	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1584	}
1585
1586	/*
1587	=for apidoc A\|UV\|to_utf8_lower\|U8 p\|U8 ustrp\|STRLEN *lenp
1588
1589	Convert the UTF-8 encoded character at p to its lowercase version and
1590	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1591	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1592	lowercase version may be longer than the original character.
1593
1594	The first character of the lowercased version is returned
1595	(but note, as explained above, that there may be more.)
1596
1597	=cut */
1598
1599	UV
1600	Perl_to_utf8_lower(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
1601	{
1602	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1603	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1604	}
1605
1606	/*
1607	=for apidoc A\|UV\|to_utf8_fold\|U8 p\|U8 ustrp\|STRLEN *lenp
1608
1609	Convert the UTF-8 encoded character at p to its foldcase version and
1610	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1611	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1612	foldcase version may be longer than the original character (up to
1613	three characters).
1614
1615	The first character of the foldcased version is returned
1616	(but note, as explained above, that there may be more.)
1617
1618	=cut */
1619
1620	UV
1621	Perl_to_utf8_fold(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
1622	{
1623	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1624	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1625	}
1626
1627	/* a "swash" is a swatch hash */
1628
1629	SV*
1630	Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1631	{
1632	SV* retval;
1633	SV* const tokenbufsv = sv_newmortal();
1634	dSP;
1635	const size_t pkg_len = strlen(pkg);
1636	const size_t name_len = strlen(name);
1637	HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1638	SV* errsv_save;
1639
1640	PUSHSTACKi(PERLSI_MAGIC);
1641	ENTER;
1642	SAVEI32(PL_hints);
1643	PL_hints = 0;
1644	save_re_context();
1645	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1646	ENTER;
1647	errsv_save = newSVsv(ERRSV);
1648	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1649	Nullsv);
1650	if (!SvTRUE(ERRSV))
1651	sv_setsv(ERRSV, errsv_save);
1652	SvREFCNT_dec(errsv_save);
1653	LEAVE;
1654	}
1655	SPAGAIN;
1656	PUSHMARK(SP);
1657	EXTEND(SP,5);
1658	PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1659	PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1660	PUSHs(listsv);
1661	PUSHs(sv_2mortal(newSViv(minbits)));
1662	PUSHs(sv_2mortal(newSViv(none)));
1663	PUTBACK;
1664	if (IN_PERL_COMPILETIME) {
1665	/* XXX ought to be handled by lex_start */
1666	SAVEI32(PL_in_my);
1667	PL_in_my = 0;
1668	sv_setpv(tokenbufsv, PL_tokenbuf);
1669	}
1670	errsv_save = newSVsv(ERRSV);
1671	if (call_method("SWASHNEW", G_SCALAR))
1672	retval = newSVsv(*PL_stack_sp--);
1673	else
1674	retval = &PL_sv_undef;
1675	if (!SvTRUE(ERRSV))
1676	sv_setsv(ERRSV, errsv_save);
1677	SvREFCNT_dec(errsv_save);
1678	LEAVE;
1679	POPSTACK;
1680	if (IN_PERL_COMPILETIME) {
1681	STRLEN len;
1682	const char* const pv = SvPV_const(tokenbufsv, len);
1683
1684	Copy(pv, PL_tokenbuf, len+1, char);
1685	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1686	}
1687	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
1688	if (SvPOK(retval))
1689	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1690	retval);
1691	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1692	}
1693	return retval;
1694	}
1695
1696
1697	/* This API is wrong for special case conversions since we may need to
1698	* return several Unicode characters for a single Unicode character
1699	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1700	* the lower-level routine, and it is similarly broken for returning
1701	* multiple values. --jhi */
1702	UV
1703	Perl_swash_fetch(pTHX_ SV sv, U8 ptr, bool do_utf8)
1704	{
1705	HV* const hv = (HV*)SvRV(sv);
1706	U32 klen;
1707	U32 off;
1708	STRLEN slen;
1709	STRLEN needents;
1710	const U8 *tmps = NULL;
1711	U32 bit;
1712	SV *retval;
1713	U8 tmputf8[2];
1714	UV c = NATIVE_TO_ASCII(*ptr);
1715
1716	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1717	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1718	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1719	ptr = tmputf8;
1720	}
1721	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1722	* then the "swatch" is a vec() for al the chars which start
1723	* with 0xAA..0xYY
1724	* So the key in the hash (klen) is length of encoded char -1
1725	*/
1726	klen = UTF8SKIP(ptr) - 1;
1727	off = ptr[klen];
1728
1729	if (klen == 0)
1730	{
1731	/* If char in invariant then swatch is for all the invariant chars
1732	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1733	*/
1734	needents = UTF_CONTINUATION_MARK;
1735	off = NATIVE_TO_UTF(ptr[klen]);
1736	}
1737	else
1738	{
1739	/* If char is encoded then swatch is for the prefix */
1740	needents = (1 << UTF_ACCUMULATION_SHIFT);
1741	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1742	}
1743
1744	/*
1745	* This single-entry cache saves about 1/3 of the utf8 overhead in test
1746	* suite. (That is, only 7-8% overall over just a hash cache. Still,
1747	* it's nothing to sniff at.) Pity we usually come through at least
1748	* two function calls to get here...
1749	*
1750	* NB: this code assumes that swatches are never modified, once generated!
1751	*/
1752
1753	if (hv == PL_last_swash_hv &&
1754	klen == PL_last_swash_klen &&
1755	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
1756	{
1757	tmps = PL_last_swash_tmps;
1758	slen = PL_last_swash_slen;
1759	}
1760	else {
1761	/* Try our second-level swatch cache, kept in a hash. */
1762	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1763
1764	/* If not cached, generate it via utf8::SWASHGET */
1765	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (const U8)SvPV_const(*svp, slen))) {
1766	dSP;
1767	/* We use utf8n_to_uvuni() as we want an index into
1768	Unicode tables, not a native character number.
1769	*/
1770	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1771	ckWARN(WARN_UTF8) ?
1772	0 : UTF8_ALLOW_ANY);
1773	SV *errsv_save;
1774	ENTER;
1775	SAVETMPS;
1776	save_re_context();
1777	PUSHSTACKi(PERLSI_MAGIC);
1778	PUSHMARK(SP);
1779	EXTEND(SP,3);
1780	PUSHs((SV*)sv);
1781	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1782	PUSHs(sv_2mortal(newSViv((klen) ?
1783	(code_point & ~(needents - 1)) : 0)));
1784	PUSHs(sv_2mortal(newSViv(needents)));
1785	PUTBACK;
1786	errsv_save = newSVsv(ERRSV);
1787	if (call_method("SWASHGET", G_SCALAR))
1788	retval = newSVsv(*PL_stack_sp--);
1789	else
1790	retval = &PL_sv_undef;
1791	if (!SvTRUE(ERRSV))
1792	sv_setsv(ERRSV, errsv_save);
1793	SvREFCNT_dec(errsv_save);
1794	POPSTACK;
1795	FREETMPS;
1796	LEAVE;
1797	if (IN_PERL_COMPILETIME)
1798	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1799
1800	svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
1801
1802	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| (slen << 3) < needents)
1803	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1804	}
1805
1806	PL_last_swash_hv = hv;
1807	PL_last_swash_klen = klen;
1808	/* FIXME change interpvar.h? */
1809	PL_last_swash_tmps = (U8 *) tmps;
1810	PL_last_swash_slen = slen;
1811	if (klen)
1812	Copy(ptr, PL_last_swash_key, klen, U8);
1813	}
1814
1815	switch ((int)((slen << 3) / needents)) {
1816	case 1:
1817	bit = 1 << (off & 7);
1818	off >>= 3;
1819	return (tmps[off] & bit) != 0;
1820	case 8:
1821	return tmps[off];
1822	case 16:
1823	off <<= 1;
1824	return (tmps[off] << 8) + tmps[off + 1] ;
1825	case 32:
1826	off <<= 2;
1827	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1828	}
1829	Perl_croak(aTHX_ "panic: swash_fetch");
1830	return 0;
1831	}
1832
1833
1834	/*
1835	=for apidoc A\|U8 \|uvchr_to_utf8\|U8 d\|UV uv
1836
1837	Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1838	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
1839	bytes available. The return value is the pointer to the byte after the
1840	end of the new character. In other words,
1841
1842	d = uvchr_to_utf8(d, uv);
1843
1844	is the recommended wide native character-aware way of saying
1845
1846	*(d++) = uv;
1847
1848	=cut
1849	*/
1850
1851	/* On ASCII machines this is normally a macro but we want a
1852	real function in case XS code wants it
1853	*/
1854	#undef Perl_uvchr_to_utf8
1855	U8 *
1856	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1857	{
1858	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1859	}
1860
1861	U8 *
1862	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1863	{
1864	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1865	}
1866
1867	/*
1868	=for apidoc A\|UV\|utf8n_to_uvchr\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
1869
1870	Returns the native character value of the first character in the string C<s>
1871	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1872	length, in bytes, of that character.
1873
1874	Allows length and flags to be passed to low level routine.
1875
1876	=cut
1877	*/
1878	/* On ASCII machines this is normally a macro but we want
1879	a real function in case XS code wants it
1880	*/
1881	#undef Perl_utf8n_to_uvchr
1882	UV
1883	Perl_utf8n_to_uvchr(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
1884	{
1885	const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1886	return UNI_TO_NATIVE(uv);
1887	}
1888
1889	/*
1890	=for apidoc A\|char \|pv_uni_display\|SV dsv\|U8 *spv\|STRLEN len\|STRLEN pvlim\|UV flags
1891
1892	Build to the scalar dsv a displayable version of the string spv,
1893	length len, the displayable version being at most pvlim bytes long
1894	(if longer, the rest is truncated and "..." will be appended).
1895
1896	The flags argument can have UNI_DISPLAY_ISPRINT set to display
1897	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1898	to display the \\[nrfta\\] as the backslashed versions (like '\n')
1899	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1900	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1901	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1902
1903	The pointer to the PV of the dsv is returned.
1904
1905	=cut */
1906	char *
1907	Perl_pv_uni_display(pTHX_ SV dsv, U8 spv, STRLEN len, STRLEN pvlim, UV flags)
1908	{
1909	int truncated = 0;
1910	const char s, e;
1911
1912	sv_setpvn(dsv, "", 0);
1913	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1914	UV u;
1915	/* This serves double duty as a flag and a character to print after
1916	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1917	*/
1918	char ok = 0;
1919
1920	if (pvlim && SvCUR(dsv) >= pvlim) {
1921	truncated++;
1922	break;
1923	}
1924	u = utf8_to_uvchr((U8*)s, 0);
1925	if (u < 256) {
1926	const unsigned char c = (unsigned char)u & 0xFF;
1927	if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1928	switch (c) {
1929	case '\n':
1930	ok = 'n'; break;
1931	case '\r':
1932	ok = 'r'; break;
1933	case '\t':
1934	ok = 't'; break;
1935	case '\f':
1936	ok = 'f'; break;
1937	case '\a':
1938	ok = 'a'; break;
1939	case '\\':
1940	ok = '\\'; break;
1941	default: break;
1942	}
1943	if (ok) {
1944	Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1945	}
1946	}
1947	/* isPRINT() is the locale-blind version. */
1948	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1949	Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1950	ok = 1;
1951	}
1952	}
1953	if (!ok)
1954	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1955	}
1956	if (truncated)
1957	sv_catpvn(dsv, "...", 3);
1958
1959	return SvPVX(dsv);
1960	}
1961
1962	/*
1963	=for apidoc A\|char \|sv_uni_display\|SV dsv\|SV *ssv\|STRLEN pvlim\|UV flags
1964
1965	Build to the scalar dsv a displayable version of the scalar sv,
1966	the displayable version being at most pvlim bytes long
1967	(if longer, the rest is truncated and "..." will be appended).
1968
1969	The flags argument is as in pv_uni_display().
1970
1971	The pointer to the PV of the dsv is returned.
1972
1973	=cut */
1974	char *
1975	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
1976	{
1977	return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX_const(ssv),
1978	SvCUR(ssv), pvlim, flags);
1979	}
1980
1981	/*
1982	=for apidoc A\|I32\|ibcmp_utf8\|const char s1\|char pe1\|register UV l1\|bool u1\|const char s2\|char **pe2\|register UV l2\|bool u2
1983
1984	Return true if the strings s1 and s2 differ case-insensitively, false
1985	if not (if they are equal case-insensitively). If u1 is true, the
1986	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1987	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1988	are false, the respective string is assumed to be in native 8-bit
1989	encoding.
1990
1991	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1992	in there (they will point at the beginning of the I<next> character).
1993	If the pointers behind pe1 or pe2 are non-NULL, they are the end
1994	pointers beyond which scanning will not continue under any
1995	circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1996	s2+l2 will be used as goal end pointers that will also stop the scan,
1997	and which qualify towards defining a successful match: all the scans
1998	that define an explicit length must reach their goal pointers for
1999	a match to succeed).
2000
2001	For case-insensitiveness, the "casefolding" of Unicode is used
2002	instead of upper/lowercasing both the characters, see
2003	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2004
2005	=cut */
2006	I32
2007	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
2008	{
2009	register const U8 p1 = (const U8)s1;
2010	register const U8 p2 = (const U8)s2;
2011	register const U8 f1 = 0, f2 = 0;
2012	register U8 e1 = 0, q1 = 0;
2013	register U8 e2 = 0, q2 = 0;
2014	STRLEN n1 = 0, n2 = 0;
2015	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2016	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2017	U8 natbuf[1+1];
2018	STRLEN foldlen1, foldlen2;
2019	bool match;
2020
2021	if (pe1)
2022	e1 = (U8*)pe1;
2023	if (e1 == 0 \|\| (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2024	f1 = (const U8*)s1 + l1;
2025	if (pe2)
2026	e2 = (U8*)pe2;
2027	if (e2 == 0 \|\| (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2028	f2 = (const U8*)s2 + l2;
2029
2030	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
2031	return 1; /* mismatch; possible infinite loop or false positive */
2032
2033	if (!u1 \|\| !u2)
2034	natbuf[1] = 0; /* Need to terminate the buffer. */
2035
2036	while ((e1 == 0 \|\| p1 < e1) &&
2037	(f1 == 0 \|\| p1 < f1) &&
2038	(e2 == 0 \|\| p2 < e2) &&
2039	(f2 == 0 \|\| p2 < f2)) {
2040	if (n1 == 0) {
2041	if (u1)
2042	to_utf8_fold((U8 *)p1, foldbuf1, &foldlen1);
2043	else {
2044	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2045	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2046	}
2047	q1 = foldbuf1;
2048	n1 = foldlen1;
2049	}
2050	if (n2 == 0) {
2051	if (u2)
2052	to_utf8_fold((U8 *)p2, foldbuf2, &foldlen2);
2053	else {
2054	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2055	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2056	}
2057	q2 = foldbuf2;
2058	n2 = foldlen2;
2059	}
2060	while (n1 && n2) {
2061	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
2062	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
2063	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
2064	return 1; /* mismatch */
2065	n1 -= UTF8SKIP(q1);
2066	q1 += UTF8SKIP(q1);
2067	n2 -= UTF8SKIP(q2);
2068	q2 += UTF8SKIP(q2);
2069	}
2070	if (n1 == 0)
2071	p1 += u1 ? UTF8SKIP(p1) : 1;
2072	if (n2 == 0)
2073	p2 += u2 ? UTF8SKIP(p2) : 1;
2074
2075	}
2076
2077	/* A match is defined by all the scans that specified
2078	* an explicit length reaching their final goals. */
2079	match = (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2);
2080
2081	if (match) {
2082	if (pe1)
2083	pe1 = (char)p1;
2084	if (pe2)
2085	pe2 = (char)p2;
2086	}
2087
2088	return match ? 0 : 1; /* 0 match, 1 mismatch */
2089	}
2090
2091	/*
2092	* Local variables:
2093	* c-indentation-style: bsd
2094	* c-basic-offset: 4
2095	* indent-tabs-mode: t
2096	* End:
2097	*
2098	* ex: set ts=8 sts=4 sw=4 noet:
2099	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/perl/5.8.8/utf8.c@ 3662

Download in other formats: