Context Navigation

utf8.c

Visit:

Last change on this file was 3181, checked in by bird, 18 years ago
perl 5.8.8
File size: 53.0 KB

Rev	Line
[3181]	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* 'Well do I understand your speech,' he answered in the same language;
	17	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	18	* as is the custom in the West, if you wish to be answered?'
	19	*
	20	* ...the travellers perceived that the floor was paved with stones of many
	21	* hues; branching runes and strange devices intertwined beneath their feet.
	22	*/
	23
	24	#include "EXTERN.h"
	25	#define PERL_IN_UTF8_C
	26	#include "perl.h"
	27
	28	static const char unees[] =
	29	"Malformed UTF-8 character (unexpected end of string)";
	30
	31	/*
	32	=head1 Unicode Support
	33
	34	This file contains various utility functions for manipulating UTF8-encoded
	35	strings. For the uninitiated, this is a method of representing arbitrary
	36	Unicode characters as a variable number of bytes, in such a way that
	37	characters in the ASCII range are unmodified, and a zero byte never appears
	38	within non-zero characters.
	39
	40	=for apidoc A\|U8 \|uvuni_to_utf8_flags\|U8 d\|UV uv\|UV flags
	41
	42	Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
	43	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
	44	bytes available. The return value is the pointer to the byte after the
	45	end of the new character. In other words,
	46
	47	d = uvuni_to_utf8_flags(d, uv, flags);
	48
	49	or, in most cases,
	50
	51	d = uvuni_to_utf8(d, uv);
	52
	53	(which is equivalent to)
	54
	55	d = uvuni_to_utf8_flags(d, uv, 0);
	56
	57	is the recommended Unicode-aware way of saying
	58
	59	*(d++) = uv;
	60
	61	=cut
	62	*/
	63
	64	U8 *
	65	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	66	{
	67	if (ckWARN(WARN_UTF8)) {
	68	if (UNICODE_IS_SURROGATE(uv) &&
	69	!(flags & UNICODE_ALLOW_SURROGATE))
	70	Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
	71	else if (
	72	((uv >= 0xFDD0 && uv <= 0xFDEF &&
	73	!(flags & UNICODE_ALLOW_FDD0))
	74	\|\|
	75	((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
	76	!(flags & UNICODE_ALLOW_FFFF))) &&
	77	/* UNICODE_ALLOW_SUPER includes
	78	* FFFEs and FFFFs beyond 0x10FFFF. */
	79	((uv <= PERL_UNICODE_MAX) \|\|
	80	!(flags & UNICODE_ALLOW_SUPER))
	81	)
	82	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	83	"Unicode character 0x%04"UVxf" is illegal", uv);
	84	}
	85	if (UNI_IS_INVARIANT(uv)) {
	86	*d++ = (U8)UTF_TO_NATIVE(uv);
	87	return d;
	88	}
	89	#if defined(EBCDIC)
	90	else {
	91	STRLEN len = UNISKIP(uv);
	92	U8 *p = d+len-1;
	93	while (p > d) {
	94	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	95	uv >>= UTF_ACCUMULATION_SHIFT;
	96	}
	97	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	98	return d+len;
	99	}
	100	#else /* Non loop style */
	101	if (uv < 0x800) {
	102	*d++ = (U8)(( uv >> 6) \| 0xc0);
	103	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	104	return d;
	105	}
	106	if (uv < 0x10000) {
	107	*d++ = (U8)(( uv >> 12) \| 0xe0);
	108	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	109	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	110	return d;
	111	}
	112	if (uv < 0x200000) {
	113	*d++ = (U8)(( uv >> 18) \| 0xf0);
	114	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	115	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	116	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	117	return d;
	118	}
	119	if (uv < 0x4000000) {
	120	*d++ = (U8)(( uv >> 24) \| 0xf8);
	121	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	122	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	123	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	124	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	125	return d;
	126	}
	127	if (uv < 0x80000000) {
	128	*d++ = (U8)(( uv >> 30) \| 0xfc);
	129	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	130	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	131	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	132	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	133	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	134	return d;
	135	}
	136	#ifdef HAS_QUAD
	137	if (uv < UTF8_QUAD_MAX)
	138	#endif
	139	{
	140	d++ = 0xfe; / Can't match U+FEFF! */
	141	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	142	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	143	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	144	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	145	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	146	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	147	return d;
	148	}
	149	#ifdef HAS_QUAD
	150	{
	151	d++ = 0xff; / Can't match U+FFFE! */
	152	d++ = 0x80; / 6 Reserved bits */
	153	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	154	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	155	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	156	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	157	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	158	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	159	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	160	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	161	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	162	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	163	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	164	return d;
	165	}
	166	#endif
	167	#endif /* Loop style */
	168	}
	169
	170	U8 *
	171	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	172	{
	173	return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
	174	}
	175
	176	/*
	177
	178	Tests if some arbitrary number of bytes begins in a valid UTF-8
	179	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	180	UTF-8 character. The actual number of bytes in the UTF-8 character
	181	will be returned if it is valid, otherwise 0.
	182
	183	This is the "slow" version as opposed to the "fast" version which is
	184	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
	185	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
	186	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
	187	you should use the _slow(). In practice this means that the _slow()
	188	will be used very rarely, since the maximum Unicode code point (as of
	189	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
	190	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
	191	five bytes or more.
	192
	193	=cut */
	194	STATIC STRLEN
	195	S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
	196	{
	197	U8 u = *s;
	198	STRLEN slen;
	199	UV uv, ouv;
	200
	201	if (UTF8_IS_INVARIANT(u))
	202	return 1;
	203
	204	if (!UTF8_IS_START(u))
	205	return 0;
	206
	207	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	208	return 0;
	209
	210	slen = len - 1;
	211	s++;
	212	#ifdef EBCDIC
	213	u = NATIVE_TO_UTF(u);
	214	#endif
	215	u &= UTF_START_MASK(len);
	216	uv = u;
	217	ouv = uv;
	218	while (slen--) {
	219	if (!UTF8_IS_CONTINUATION(*s))
	220	return 0;
	221	uv = UTF8_ACCUMULATE(uv, *s);
	222	if (uv < ouv)
	223	return 0;
	224	ouv = uv;
	225	s++;
	226	}
	227
	228	if ((STRLEN)UNISKIP(uv) < len)
	229	return 0;
	230
	231	return len;
	232	}
	233
	234	/*
	235	=for apidoc A\|STRLEN\|is_utf8_char\|U8 *s
	236
	237	Tests if some arbitrary number of bytes begins in a valid UTF-8
	238	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	239	UTF-8 character. The actual number of bytes in the UTF-8 character
	240	will be returned if it is valid, otherwise 0.
	241
	242	=cut */
	243	STRLEN
	244	Perl_is_utf8_char(pTHX_ U8 *s)
	245	{
	246	const STRLEN len = UTF8SKIP(s);
	247	#ifdef IS_UTF8_CHAR
	248	if (IS_UTF8_CHAR_FAST(len))
	249	return IS_UTF8_CHAR(s, len) ? len : 0;
	250	#endif /* #ifdef IS_UTF8_CHAR */
	251	return is_utf8_char_slow(s, len);
	252	}
	253
	254	/*
	255	=for apidoc A\|bool\|is_utf8_string\|U8 *s\|STRLEN len
	256
	257	Returns true if first C<len> bytes of the given string form a valid
	258	UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
	259	not mean 'a string that contains code points above 0x7F encoded in UTF-8'
	260	because a valid ASCII string is a valid UTF-8 string.
	261
	262	See also is_utf8_string_loclen() and is_utf8_string_loc().
	263
	264	=cut
	265	*/
	266
	267	bool
	268	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
	269	{
	270	const U8* x = s;
	271	const U8* send;
	272
	273	if (!len)
	274	len = strlen((const char *)s);
	275	send = s + len;
	276
	277	while (x < send) {
	278	STRLEN c;
	279	/* Inline the easy bits of is_utf8_char() here for speed... */
	280	if (UTF8_IS_INVARIANT(*x))
	281	c = 1;
	282	else if (!UTF8_IS_START(*x))
	283	goto out;
	284	else {
	285	/* ... and call is_utf8_char() only if really needed. */
	286	#ifdef IS_UTF8_CHAR
	287	c = UTF8SKIP(x);
	288	if (IS_UTF8_CHAR_FAST(c)) {
	289	if (!IS_UTF8_CHAR(x, c))
	290	goto out;
	291	} else if (!is_utf8_char_slow(x, c))
	292	goto out;
	293	#else
	294	c = is_utf8_char(x);
	295	#endif /* #ifdef IS_UTF8_CHAR */
	296	if (!c)
	297	goto out;
	298	}
	299	x += c;
	300	}
	301
	302	out:
	303	if (x != send)
	304	return FALSE;
	305
	306	return TRUE;
	307	}
	308
	309	/*
	310	=for apidoc A\|bool\|is_utf8_string_loclen\|const U8 s\|STRLEN len\|const U8 ep\|const STRLEN el
	311
	312	Like is_utf8_string() but stores the location of the failure (in the
	313	case of "utf8ness failure") or the location s+len (in the case of
	314	"utf8ness success") in the C<ep>, and the number of UTF-8
	315	encoded characters in the C<el>.
	316
	317	See also is_utf8_string_loc() and is_utf8_string().
	318
	319	=cut
	320	*/
	321
	322	bool
	323	Perl_is_utf8_string_loclen(pTHX_ const U8 s, STRLEN len, const U8 ep, STRLEN el)
	324	{
	325	const U8* x = s;
	326	const U8* send;
	327	STRLEN c;
	328
	329	if (!len)
	330	len = strlen((const char *)s);
	331	send = s + len;
	332	if (el)
	333	*el = 0;
	334
	335	while (x < send) {
	336	/* Inline the easy bits of is_utf8_char() here for speed... */
	337	if (UTF8_IS_INVARIANT(*x))
	338	c = 1;
	339	else if (!UTF8_IS_START(*x))
	340	goto out;
	341	else {
	342	/* ... and call is_utf8_char() only if really needed. */
	343	#ifdef IS_UTF8_CHAR
	344	c = UTF8SKIP(x);
	345	if (IS_UTF8_CHAR_FAST(c)) {
	346	if (!IS_UTF8_CHAR(x, c))
	347	c = 0;
	348	} else
	349	c = is_utf8_char_slow(x, c);
	350	#else
	351	c = is_utf8_char(x);
	352	#endif /* #ifdef IS_UTF8_CHAR */
	353	if (!c)
	354	goto out;
	355	}
	356	x += c;
	357	if (el)
	358	(*el)++;
	359	}
	360
	361	out:
	362	if (ep)
	363	*ep = x;
	364	if (x != send)
	365	return FALSE;
	366
	367	return TRUE;
	368	}
	369
	370	/*
	371	=for apidoc A\|bool\|is_utf8_string_loc\|const U8 s\|STRLEN len\|const U8 ep\|const STRLEN el
	372
	373	Like is_utf8_string() but stores the location of the failure (in the
	374	case of "utf8ness failure") or the location s+len (in the case of
	375	"utf8ness success") in the C<ep>.
	376
	377	See also is_utf8_string_loclen() and is_utf8_string().
	378
	379	=cut
	380	*/
	381
	382	bool
	383	Perl_is_utf8_string_loc(pTHX_ U8 s, STRLEN len, U8 *ep)
	384	{
	385	return is_utf8_string_loclen(s, len, (const U8 **)ep, 0);
	386	}
	387
	388	/*
	389	=for apidoc A\|UV\|utf8n_to_uvuni\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	390
	391	Bottom level UTF-8 decode routine.
	392	Returns the unicode code point value of the first character in the string C<s>
	393	which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
	394	C<retlen> will be set to the length, in bytes, of that character.
	395
	396	If C<s> does not point to a well-formed UTF-8 character, the behaviour
	397	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	398	it is assumed that the caller will raise a warning, and this function
	399	will silently just set C<retlen> to C<-1> and return zero. If the
	400	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	401	malformations will be given, C<retlen> will be set to the expected
	402	length of the UTF-8 character in bytes, and zero will be returned.
	403
	404	The C<flags> can also contain various flags to allow deviations from
	405	the strict UTF-8 encoding (see F<utf8.h>).
	406
	407	Most code should use utf8_to_uvchr() rather than call this directly.
	408
	409	=cut
	410	*/
	411
	412	UV
	413	Perl_utf8n_to_uvuni(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	414	{
	415	const U8 *s0 = s;
	416	UV uv = *s, ouv = 0;
	417	STRLEN len = 1;
	418	const bool dowarn = ckWARN_d(WARN_UTF8);
	419	const UV startbyte = *s;
	420	STRLEN expectlen = 0;
	421	U32 warning = 0;
	422
	423	/* This list is a superset of the UTF8_ALLOW_XXX. */
	424
	425	#define UTF8_WARN_EMPTY 1
	426	#define UTF8_WARN_CONTINUATION 2
	427	#define UTF8_WARN_NON_CONTINUATION 3
	428	#define UTF8_WARN_FE_FF 4
	429	#define UTF8_WARN_SHORT 5
	430	#define UTF8_WARN_OVERFLOW 6
	431	#define UTF8_WARN_SURROGATE 7
	432	#define UTF8_WARN_LONG 8
	433	#define UTF8_WARN_FFFF 9 /* Also FFFE. */
	434
	435	if (curlen == 0 &&
	436	!(flags & UTF8_ALLOW_EMPTY)) {
	437	warning = UTF8_WARN_EMPTY;
	438	goto malformed;
	439	}
	440
	441	if (UTF8_IS_INVARIANT(uv)) {
	442	if (retlen)
	443	*retlen = 1;
	444	return (UV) (NATIVE_TO_UTF(*s));
	445	}
	446
	447	if (UTF8_IS_CONTINUATION(uv) &&
	448	!(flags & UTF8_ALLOW_CONTINUATION)) {
	449	warning = UTF8_WARN_CONTINUATION;
	450	goto malformed;
	451	}
	452
	453	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	454	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	455	warning = UTF8_WARN_NON_CONTINUATION;
	456	goto malformed;
	457	}
	458
	459	#ifdef EBCDIC
	460	uv = NATIVE_TO_UTF(uv);
	461	#else
	462	if ((uv == 0xfe \|\| uv == 0xff) &&
	463	!(flags & UTF8_ALLOW_FE_FF)) {
	464	warning = UTF8_WARN_FE_FF;
	465	goto malformed;
	466	}
	467	#endif
	468
	469	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	470	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	471	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	472	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	473	#ifdef EBCDIC
	474	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	475	else { len = 7; uv &= 0x01; }
	476	#else
	477	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	478	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	479	else { len = 13; uv = 0; } /* whoa! */
	480	#endif
	481
	482	if (retlen)
	483	*retlen = len;
	484
	485	expectlen = len;
	486
	487	if ((curlen < expectlen) &&
	488	!(flags & UTF8_ALLOW_SHORT)) {
	489	warning = UTF8_WARN_SHORT;
	490	goto malformed;
	491	}
	492
	493	len--;
	494	s++;
	495	ouv = uv;
	496
	497	while (len--) {
	498	if (!UTF8_IS_CONTINUATION(*s) &&
	499	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	500	s--;
	501	warning = UTF8_WARN_NON_CONTINUATION;
	502	goto malformed;
	503	}
	504	else
	505	uv = UTF8_ACCUMULATE(uv, *s);
	506	if (!(uv > ouv)) {
	507	/* These cannot be allowed. */
	508	if (uv == ouv) {
	509	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	510	warning = UTF8_WARN_LONG;
	511	goto malformed;
	512	}
	513	}
	514	else { /* uv < ouv */
	515	/* This cannot be allowed. */
	516	warning = UTF8_WARN_OVERFLOW;
	517	goto malformed;
	518	}
	519	}
	520	s++;
	521	ouv = uv;
	522	}
	523
	524	if (UNICODE_IS_SURROGATE(uv) &&
	525	!(flags & UTF8_ALLOW_SURROGATE)) {
	526	warning = UTF8_WARN_SURROGATE;
	527	goto malformed;
	528	} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
	529	!(flags & UTF8_ALLOW_LONG)) {
	530	warning = UTF8_WARN_LONG;
	531	goto malformed;
	532	} else if (UNICODE_IS_ILLEGAL(uv) &&
	533	!(flags & UTF8_ALLOW_FFFF)) {
	534	warning = UTF8_WARN_FFFF;
	535	goto malformed;
	536	}
	537
	538	return uv;
	539
	540	malformed:
	541
	542	if (flags & UTF8_CHECK_ONLY) {
	543	if (retlen)
	544	*retlen = -1;
	545	return 0;
	546	}
	547
	548	if (dowarn) {
	549	SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
	550
	551	switch (warning) {
	552	case 0: /* Intentionally empty. */ break;
	553	case UTF8_WARN_EMPTY:
	554	Perl_sv_catpv(aTHX_ sv, "(empty string)");
	555	break;
	556	case UTF8_WARN_CONTINUATION:
	557	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	558	break;
	559	case UTF8_WARN_NON_CONTINUATION:
	560	if (s == s0)
	561	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	562	(UV)s[1], startbyte);
	563	else {
	564	const int len = (int)(s-s0);
	565	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	566	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
	567	}
	568
	569	break;
	570	case UTF8_WARN_FE_FF:
	571	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	572	break;
	573	case UTF8_WARN_SHORT:
	574	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	575	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
	576	expectlen = curlen; /* distance for caller to skip */
	577	break;
	578	case UTF8_WARN_OVERFLOW:
	579	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	580	ouv, *s, startbyte);
	581	break;
	582	case UTF8_WARN_SURROGATE:
	583	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	584	break;
	585	case UTF8_WARN_LONG:
	586	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	587	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	588	break;
	589	case UTF8_WARN_FFFF:
	590	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
	591	break;
	592	default:
	593	Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
	594	break;
	595	}
	596
	597	if (warning) {
	598	const char * const s = SvPVX_const(sv);
	599
	600	if (PL_op)
	601	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	602	"%s in %s", s, OP_DESC(PL_op));
	603	else
	604	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	605	}
	606	}
	607
	608	if (retlen)
	609	*retlen = expectlen ? expectlen : len;
	610
	611	return 0;
	612	}
	613
	614	/*
	615	=for apidoc A\|UV\|utf8_to_uvchr\|U8 s\|STRLEN retlen
	616
	617	Returns the native character value of the first character in the string C<s>
	618	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	619	length, in bytes, of that character.
	620
	621	If C<s> does not point to a well-formed UTF-8 character, zero is
	622	returned and retlen is set, if possible, to -1.
	623
	624	=cut
	625	*/
	626
	627	UV
	628	Perl_utf8_to_uvchr(pTHX_ U8 s, STRLEN retlen)
	629	{
	630	return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
	631	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	632	}
	633
	634	/*
	635	=for apidoc A\|UV\|utf8_to_uvuni\|U8 s\|STRLEN retlen
	636
	637	Returns the Unicode code point of the first character in the string C<s>
	638	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	639	length, in bytes, of that character.
	640
	641	This function should only be used when returned UV is considered
	642	an index into the Unicode semantic tables (e.g. swashes).
	643
	644	If C<s> does not point to a well-formed UTF-8 character, zero is
	645	returned and retlen is set, if possible, to -1.
	646
	647	=cut
	648	*/
	649
	650	UV
	651	Perl_utf8_to_uvuni(pTHX_ U8 s, STRLEN retlen)
	652	{
	653	/* Call the low level routine asking for checks */
	654	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	655	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	656	}
	657
	658	/*
	659	=for apidoc A\|STRLEN\|utf8_length\|U8 s\|U8 e
	660
	661	Return the length of the UTF-8 char encoded string C<s> in characters.
	662	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	663	up past C<e>, croaks.
	664
	665	=cut
	666	*/
	667
	668	STRLEN
	669	Perl_utf8_length(pTHX_ U8 s, U8 e)
	670	{
	671	STRLEN len = 0;
	672
	673	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	674	* the bitops (especially ~) can create illegal UTF-8.
	675	* In other words: in Perl UTF-8 is not just for Unicode. */
	676
	677	if (e < s)
	678	goto warn_and_return;
	679	while (s < e) {
	680	const U8 t = UTF8SKIP(s);
	681	if (e - s < t) {
	682	warn_and_return:
	683	if (ckWARN_d(WARN_UTF8)) {
	684	if (PL_op)
	685	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	686	"%s in %s", unees, OP_DESC(PL_op));
	687	else
	688	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	689	}
	690	return len;
	691	}
	692	s += t;
	693	len++;
	694	}
	695
	696	return len;
	697	}
	698
	699	/*
	700	=for apidoc A\|IV\|utf8_distance\|U8 a\|U8 b
	701
	702	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	703	and C<b>.
	704
	705	WARNING: use only if you know that the pointers point inside the
	706	same UTF-8 buffer.
	707
	708	=cut
	709	*/
	710
	711	IV
	712	Perl_utf8_distance(pTHX_ U8 a, U8 b)
	713	{
	714	IV off = 0;
	715
	716	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	717	* the bitops (especially ~) can create illegal UTF-8.
	718	* In other words: in Perl UTF-8 is not just for Unicode. */
	719
	720	if (a < b) {
	721	while (a < b) {
	722	const U8 c = UTF8SKIP(a);
	723	if (b - a < c)
	724	goto warn_and_return;
	725	a += c;
	726	off--;
	727	}
	728	}
	729	else {
	730	while (b < a) {
	731	const U8 c = UTF8SKIP(b);
	732
	733	if (a - b < c) {
	734	warn_and_return:
	735	if (ckWARN_d(WARN_UTF8)) {
	736	if (PL_op)
	737	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	738	"%s in %s", unees, OP_DESC(PL_op));
	739	else
	740	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	741	}
	742	return off;
	743	}
	744	b += c;
	745	off++;
	746	}
	747	}
	748
	749	return off;
	750	}
	751
	752	/*
	753	=for apidoc A\|U8 \|utf8_hop\|U8 s\|I32 off
	754
	755	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	756	forward or backward.
	757
	758	WARNING: do not use the following unless you know C<off> is within
	759	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	760	on the first byte of character or just after the last byte of a character.
	761
	762	=cut
	763	*/
	764
	765	U8 *
	766	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
	767	{
	768	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	769	* the bitops (especially ~) can create illegal UTF-8.
	770	* In other words: in Perl UTF-8 is not just for Unicode. */
	771
	772	if (off >= 0) {
	773	while (off--)
	774	s += UTF8SKIP(s);
	775	}
	776	else {
	777	while (off++) {
	778	s--;
	779	while (UTF8_IS_CONTINUATION(*s))
	780	s--;
	781	}
	782	}
	783	return (U8 *)s;
	784	}
	785
	786	/*
	787	=for apidoc A\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
	788
	789	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
	790	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	791	updates len to contain the new length.
	792	Returns zero on failure, setting C<len> to -1.
	793
	794	=cut
	795	*/
	796
	797	U8 *
	798	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	799	{
	800	U8 *send;
	801	U8 *d;
	802	U8 *save = s;
	803
	804	/* ensure valid UTF-8 and chars < 256 before updating string */
	805	for (send = s + *len; s < send; ) {
	806	U8 c = *s++;
	807
	808	if (!UTF8_IS_INVARIANT(c) &&
	809	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	810	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	811	*len = -1;
	812	return 0;
	813	}
	814	}
	815
	816	d = s = save;
	817	while (s < send) {
	818	STRLEN ulen;
	819	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	820	s += ulen;
	821	}
	822	*d = '\0';
	823	*len = d - save;
	824	return save;
	825	}
	826
	827	/*
	828	=for apidoc A\|U8 \|bytes_from_utf8\|const U8 s\|STRLEN len\|bool is_utf8
	829
	830	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
	831	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	832	the newly-created string, and updates C<len> to contain the new
	833	length. Returns the original string if no conversion occurs, C<len>
	834	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	835	0 if C<s> is converted or contains all 7bit characters.
	836
	837	=cut
	838	*/
	839
	840	U8 *
	841	Perl_bytes_from_utf8(pTHX_ U8 s, STRLEN len, bool *is_utf8)
	842	{
	843	U8 *d;
	844	const U8 *start = s;
	845	const U8 *send;
	846	I32 count = 0;
	847	const U8 *s2;
	848
	849	if (!*is_utf8)
	850	return (U8 *)start;
	851
	852	/* ensure valid UTF-8 and chars < 256 before converting string */
	853	for (send = s + *len; s < send;) {
	854	U8 c = *s++;
	855	if (!UTF8_IS_INVARIANT(c)) {
	856	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	857	(c = *s++) && UTF8_IS_CONTINUATION(c))
	858	count++;
	859	else
	860	return (U8 *)start;
	861	}
	862	}
	863
	864	*is_utf8 = 0;
	865
	866	Newxz(d, (*len) - count + 1, U8);
	867	s2 = start; start = d;
	868	while (s2 < send) {
	869	U8 c = *s2++;
	870	if (!UTF8_IS_INVARIANT(c)) {
	871	/* Then it is two-byte encoded */
	872	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s2++);
	873	c = ASCII_TO_NATIVE(c);
	874	}
	875	*d++ = c;
	876	}
	877	*d = '\0';
	878	*len = d - start;
	879	return (U8 *)start;
	880	}
	881
	882	/*
	883	=for apidoc A\|U8 \|bytes_to_utf8\|U8 s\|STRLEN *len
	884
	885	Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
	886	Returns a pointer to the newly-created string, and sets C<len> to
	887	reflect the new length.
	888
	889	If you want to convert to UTF-8 from other encodings than ASCII,
	890	see sv_recode_to_utf8().
	891
	892	=cut
	893	*/
	894
	895	U8*
	896	Perl_bytes_to_utf8(pTHX_ U8 s, STRLEN len)
	897	{
	898	const U8 * const send = s + (*len);
	899	U8 *d;
	900	U8 *dst;
	901
	902	Newxz(d, (len) 2 + 1, U8);
	903	dst = d;
	904
	905	while (s < send) {
	906	const UV uv = NATIVE_TO_ASCII(*s++);
	907	if (UNI_IS_INVARIANT(uv))
	908	*d++ = (U8)UTF_TO_NATIVE(uv);
	909	else {
	910	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	911	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	912	}
	913	}
	914	*d = '\0';
	915	*len = d-dst;
	916	return dst;
	917	}
	918
	919	/*
	920	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	921	*
	922	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	923	* We optimize for native, for obvious reasons. */
	924
	925	U8*
	926	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	927	{
	928	U8* pend;
	929	U8* dstart = d;
	930
	931	if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
	932	d[0] = 0;
	933	*newlen = 1;
	934	return d;
	935	}
	936
	937	if (bytelen & 1)
	938	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
	939
	940	pend = p + bytelen;
	941
	942	while (p < pend) {
	943	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	944	p += 2;
	945	if (uv < 0x80) {
	946	*d++ = (U8)uv;
	947	continue;
	948	}
	949	if (uv < 0x800) {
	950	*d++ = (U8)(( uv >> 6) \| 0xc0);
	951	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	952	continue;
	953	}
	954	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
	955	UV low = (p[0] << 8) + p[1];
	956	p += 2;
	957	if (low < 0xdc00 \|\| low >= 0xdfff)
	958	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	959	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	960	}
	961	if (uv < 0x10000) {
	962	*d++ = (U8)(( uv >> 12) \| 0xe0);
	963	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	964	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	965	continue;
	966	}
	967	else {
	968	*d++ = (U8)(( uv >> 18) \| 0xf0);
	969	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	970	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	971	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	972	continue;
	973	}
	974	}
	975	*newlen = d - dstart;
	976	return d;
	977	}
	978
	979	/* Note: this one is slightly destructive of the source. */
	980
	981	U8*
	982	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	983	{
	984	U8* s = (U8*)p;
	985	U8* send = s + bytelen;
	986	while (s < send) {
	987	U8 tmp = s[0];
	988	s[0] = s[1];
	989	s[1] = tmp;
	990	s += 2;
	991	}
	992	return utf16_to_utf8(p, d, bytelen, newlen);
	993	}
	994
	995	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	996
	997	bool
	998	Perl_is_uni_alnum(pTHX_ UV c)
	999	{
	1000	U8 tmpbuf[UTF8_MAXBYTES+1];
	1001	uvchr_to_utf8(tmpbuf, c);
	1002	return is_utf8_alnum(tmpbuf);
	1003	}
	1004
	1005	bool
	1006	Perl_is_uni_alnumc(pTHX_ UV c)
	1007	{
	1008	U8 tmpbuf[UTF8_MAXBYTES+1];
	1009	uvchr_to_utf8(tmpbuf, c);
	1010	return is_utf8_alnumc(tmpbuf);
	1011	}
	1012
	1013	bool
	1014	Perl_is_uni_idfirst(pTHX_ UV c)
	1015	{
	1016	U8 tmpbuf[UTF8_MAXBYTES+1];
	1017	uvchr_to_utf8(tmpbuf, c);
	1018	return is_utf8_idfirst(tmpbuf);
	1019	}
	1020
	1021	bool
	1022	Perl_is_uni_alpha(pTHX_ UV c)
	1023	{
	1024	U8 tmpbuf[UTF8_MAXBYTES+1];
	1025	uvchr_to_utf8(tmpbuf, c);
	1026	return is_utf8_alpha(tmpbuf);
	1027	}
	1028
	1029	bool
	1030	Perl_is_uni_ascii(pTHX_ UV c)
	1031	{
	1032	U8 tmpbuf[UTF8_MAXBYTES+1];
	1033	uvchr_to_utf8(tmpbuf, c);
	1034	return is_utf8_ascii(tmpbuf);
	1035	}
	1036
	1037	bool
	1038	Perl_is_uni_space(pTHX_ UV c)
	1039	{
	1040	U8 tmpbuf[UTF8_MAXBYTES+1];
	1041	uvchr_to_utf8(tmpbuf, c);
	1042	return is_utf8_space(tmpbuf);
	1043	}
	1044
	1045	bool
	1046	Perl_is_uni_digit(pTHX_ UV c)
	1047	{
	1048	U8 tmpbuf[UTF8_MAXBYTES+1];
	1049	uvchr_to_utf8(tmpbuf, c);
	1050	return is_utf8_digit(tmpbuf);
	1051	}
	1052
	1053	bool
	1054	Perl_is_uni_upper(pTHX_ UV c)
	1055	{
	1056	U8 tmpbuf[UTF8_MAXBYTES+1];
	1057	uvchr_to_utf8(tmpbuf, c);
	1058	return is_utf8_upper(tmpbuf);
	1059	}
	1060
	1061	bool
	1062	Perl_is_uni_lower(pTHX_ UV c)
	1063	{
	1064	U8 tmpbuf[UTF8_MAXBYTES+1];
	1065	uvchr_to_utf8(tmpbuf, c);
	1066	return is_utf8_lower(tmpbuf);
	1067	}
	1068
	1069	bool
	1070	Perl_is_uni_cntrl(pTHX_ UV c)
	1071	{
	1072	U8 tmpbuf[UTF8_MAXBYTES+1];
	1073	uvchr_to_utf8(tmpbuf, c);
	1074	return is_utf8_cntrl(tmpbuf);
	1075	}
	1076
	1077	bool
	1078	Perl_is_uni_graph(pTHX_ UV c)
	1079	{
	1080	U8 tmpbuf[UTF8_MAXBYTES+1];
	1081	uvchr_to_utf8(tmpbuf, c);
	1082	return is_utf8_graph(tmpbuf);
	1083	}
	1084
	1085	bool
	1086	Perl_is_uni_print(pTHX_ UV c)
	1087	{
	1088	U8 tmpbuf[UTF8_MAXBYTES+1];
	1089	uvchr_to_utf8(tmpbuf, c);
	1090	return is_utf8_print(tmpbuf);
	1091	}
	1092
	1093	bool
	1094	Perl_is_uni_punct(pTHX_ UV c)
	1095	{
	1096	U8 tmpbuf[UTF8_MAXBYTES+1];
	1097	uvchr_to_utf8(tmpbuf, c);
	1098	return is_utf8_punct(tmpbuf);
	1099	}
	1100
	1101	bool
	1102	Perl_is_uni_xdigit(pTHX_ UV c)
	1103	{
	1104	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1105	uvchr_to_utf8(tmpbuf, c);
	1106	return is_utf8_xdigit(tmpbuf);
	1107	}
	1108
	1109	UV
	1110	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1111	{
	1112	uvchr_to_utf8(p, c);
	1113	return to_utf8_upper(p, p, lenp);
	1114	}
	1115
	1116	UV
	1117	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1118	{
	1119	uvchr_to_utf8(p, c);
	1120	return to_utf8_title(p, p, lenp);
	1121	}
	1122
	1123	UV
	1124	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1125	{
	1126	uvchr_to_utf8(p, c);
	1127	return to_utf8_lower(p, p, lenp);
	1128	}
	1129
	1130	UV
	1131	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	1132	{
	1133	uvchr_to_utf8(p, c);
	1134	return to_utf8_fold(p, p, lenp);
	1135	}
	1136
	1137	/* for now these all assume no locale info available for Unicode > 255 */
	1138
	1139	bool
	1140	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1141	{
	1142	return is_uni_alnum(c); /* XXX no locale support yet */
	1143	}
	1144
	1145	bool
	1146	Perl_is_uni_alnumc_lc(pTHX_ UV c)
	1147	{
	1148	return is_uni_alnumc(c); /* XXX no locale support yet */
	1149	}
	1150
	1151	bool
	1152	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1153	{
	1154	return is_uni_idfirst(c); /* XXX no locale support yet */
	1155	}
	1156
	1157	bool
	1158	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1159	{
	1160	return is_uni_alpha(c); /* XXX no locale support yet */
	1161	}
	1162
	1163	bool
	1164	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1165	{
	1166	return is_uni_ascii(c); /* XXX no locale support yet */
	1167	}
	1168
	1169	bool
	1170	Perl_is_uni_space_lc(pTHX_ UV c)
	1171	{
	1172	return is_uni_space(c); /* XXX no locale support yet */
	1173	}
	1174
	1175	bool
	1176	Perl_is_uni_digit_lc(pTHX_ UV c)
	1177	{
	1178	return is_uni_digit(c); /* XXX no locale support yet */
	1179	}
	1180
	1181	bool
	1182	Perl_is_uni_upper_lc(pTHX_ UV c)
	1183	{
	1184	return is_uni_upper(c); /* XXX no locale support yet */
	1185	}
	1186
	1187	bool
	1188	Perl_is_uni_lower_lc(pTHX_ UV c)
	1189	{
	1190	return is_uni_lower(c); /* XXX no locale support yet */
	1191	}
	1192
	1193	bool
	1194	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1195	{
	1196	return is_uni_cntrl(c); /* XXX no locale support yet */
	1197	}
	1198
	1199	bool
	1200	Perl_is_uni_graph_lc(pTHX_ UV c)
	1201	{
	1202	return is_uni_graph(c); /* XXX no locale support yet */
	1203	}
	1204
	1205	bool
	1206	Perl_is_uni_print_lc(pTHX_ UV c)
	1207	{
	1208	return is_uni_print(c); /* XXX no locale support yet */
	1209	}
	1210
	1211	bool
	1212	Perl_is_uni_punct_lc(pTHX_ UV c)
	1213	{
	1214	return is_uni_punct(c); /* XXX no locale support yet */
	1215	}
	1216
	1217	bool
	1218	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1219	{
	1220	return is_uni_xdigit(c); /* XXX no locale support yet */
	1221	}
	1222
	1223	U32
	1224	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1225	{
	1226	/* XXX returns only the first character -- do not use XXX */
	1227	/* XXX no locale support yet */
	1228	STRLEN len;
	1229	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1230	return (U32)to_uni_upper(c, tmpbuf, &len);
	1231	}
	1232
	1233	U32
	1234	Perl_to_uni_title_lc(pTHX_ U32 c)
	1235	{
	1236	/* XXX returns only the first character XXX -- do not use XXX */
	1237	/* XXX no locale support yet */
	1238	STRLEN len;
	1239	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1240	return (U32)to_uni_title(c, tmpbuf, &len);
	1241	}
	1242
	1243	U32
	1244	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1245	{
	1246	/* XXX returns only the first character -- do not use XXX */
	1247	/* XXX no locale support yet */
	1248	STRLEN len;
	1249	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1250	return (U32)to_uni_lower(c, tmpbuf, &len);
	1251	}
	1252
	1253	bool
	1254	Perl_is_utf8_alnum(pTHX_ U8 *p)
	1255	{
	1256	if (!is_utf8_char(p))
	1257	return FALSE;
	1258	if (!PL_utf8_alnum)
	1259	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1260	* descendant of isalnum(3), in other words, it doesn't
	1261	* contain the '_'. --jhi */
	1262	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
	1263	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1264	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
	1265	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1266	if (!PL_utf8_alnum)
	1267	PL_utf8_alnum = swash_init("utf8", "",
	1268	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1269	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1270	#endif
	1271	}
	1272
	1273	bool
	1274	Perl_is_utf8_alnumc(pTHX_ U8 *p)
	1275	{
	1276	if (!is_utf8_char(p))
	1277	return FALSE;
	1278	if (!PL_utf8_alnumc)
	1279	PL_utf8_alnumc = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
	1280	return swash_fetch(PL_utf8_alnumc, p, TRUE) != 0;
	1281	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
	1282	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1283	if (!PL_utf8_alnum)
	1284	PL_utf8_alnum = swash_init("utf8", "",
	1285	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1286	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1287	#endif
	1288	}
	1289
	1290	bool
	1291	Perl_is_utf8_idfirst(pTHX_ U8 p) / The naming is historical. */
	1292	{
	1293	if (*p == '_')
	1294	return TRUE;
	1295	if (!is_utf8_char(p))
	1296	return FALSE;
	1297	if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
	1298	PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
	1299	return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
	1300	}
	1301
	1302	bool
	1303	Perl_is_utf8_idcont(pTHX_ U8 *p)
	1304	{
	1305	if (*p == '_')
	1306	return TRUE;
	1307	if (!is_utf8_char(p))
	1308	return FALSE;
	1309	if (!PL_utf8_idcont)
	1310	PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
	1311	return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
	1312	}
	1313
	1314	bool
	1315	Perl_is_utf8_alpha(pTHX_ U8 *p)
	1316	{
	1317	if (!is_utf8_char(p))
	1318	return FALSE;
	1319	if (!PL_utf8_alpha)
	1320	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
	1321	return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
	1322	}
	1323
	1324	bool
	1325	Perl_is_utf8_ascii(pTHX_ U8 *p)
	1326	{
	1327	if (!is_utf8_char(p))
	1328	return FALSE;
	1329	if (!PL_utf8_ascii)
	1330	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
	1331	return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
	1332	}
	1333
	1334	bool
	1335	Perl_is_utf8_space(pTHX_ U8 *p)
	1336	{
	1337	if (!is_utf8_char(p))
	1338	return FALSE;
	1339	if (!PL_utf8_space)
	1340	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
	1341	return swash_fetch(PL_utf8_space, p, TRUE) != 0;
	1342	}
	1343
	1344	bool
	1345	Perl_is_utf8_digit(pTHX_ U8 *p)
	1346	{
	1347	if (!is_utf8_char(p))
	1348	return FALSE;
	1349	if (!PL_utf8_digit)
	1350	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
	1351	return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
	1352	}
	1353
	1354	bool
	1355	Perl_is_utf8_upper(pTHX_ U8 *p)
	1356	{
	1357	if (!is_utf8_char(p))
	1358	return FALSE;
	1359	if (!PL_utf8_upper)
	1360	PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
	1361	return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
	1362	}
	1363
	1364	bool
	1365	Perl_is_utf8_lower(pTHX_ U8 *p)
	1366	{
	1367	if (!is_utf8_char(p))
	1368	return FALSE;
	1369	if (!PL_utf8_lower)
	1370	PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
	1371	return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
	1372	}
	1373
	1374	bool
	1375	Perl_is_utf8_cntrl(pTHX_ U8 *p)
	1376	{
	1377	if (!is_utf8_char(p))
	1378	return FALSE;
	1379	if (!PL_utf8_cntrl)
	1380	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
	1381	return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
	1382	}
	1383
	1384	bool
	1385	Perl_is_utf8_graph(pTHX_ U8 *p)
	1386	{
	1387	if (!is_utf8_char(p))
	1388	return FALSE;
	1389	if (!PL_utf8_graph)
	1390	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
	1391	return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
	1392	}
	1393
	1394	bool
	1395	Perl_is_utf8_print(pTHX_ U8 *p)
	1396	{
	1397	if (!is_utf8_char(p))
	1398	return FALSE;
	1399	if (!PL_utf8_print)
	1400	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
	1401	return swash_fetch(PL_utf8_print, p, TRUE) != 0;
	1402	}
	1403
	1404	bool
	1405	Perl_is_utf8_punct(pTHX_ U8 *p)
	1406	{
	1407	if (!is_utf8_char(p))
	1408	return FALSE;
	1409	if (!PL_utf8_punct)
	1410	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
	1411	return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
	1412	}
	1413
	1414	bool
	1415	Perl_is_utf8_xdigit(pTHX_ U8 *p)
	1416	{
	1417	if (!is_utf8_char(p))
	1418	return FALSE;
	1419	if (!PL_utf8_xdigit)
	1420	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
	1421	return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
	1422	}
	1423
	1424	bool
	1425	Perl_is_utf8_mark(pTHX_ U8 *p)
	1426	{
	1427	if (!is_utf8_char(p))
	1428	return FALSE;
	1429	if (!PL_utf8_mark)
	1430	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
	1431	return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
	1432	}
	1433
	1434	/*
	1435	=for apidoc A\|UV\|to_utf8_case\|U8 p\|U8 ustrp\|STRLEN lenp\|SV swash\|char normal\|char *special
	1436
	1437	The "p" contains the pointer to the UTF-8 string encoding
	1438	the character that is being converted.
	1439
	1440	The "ustrp" is a pointer to the character buffer to put the
	1441	conversion result to. The "lenp" is a pointer to the length
	1442	of the result.
	1443
	1444	The "swashp" is a pointer to the swash to use.
	1445
	1446	Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
	1447	and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
	1448	but not always, a multicharacter mapping), is tried first.
	1449
	1450	The "special" is a string like "utf8::ToSpecLower", which means the
	1451	hash %utf8::ToSpecLower. The access to the hash is through
	1452	Perl_to_utf8_case().
	1453
	1454	The "normal" is a string like "ToLower" which means the swash
	1455	%utf8::ToLower.
	1456
	1457	=cut */
	1458
	1459	UV
	1460	Perl_to_utf8_case(pTHX_ U8 p, U8 ustrp, STRLEN lenp, SV swashp, char normal, char *special)
	1461	{
	1462	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1463	STRLEN len = 0;
	1464
	1465	const UV uv0 = utf8_to_uvchr(p, NULL);
	1466	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1467	* are necessary in EBCDIC, they are redundant no-ops
	1468	* in ASCII-ish platforms, and hopefully optimized away. */
	1469	const UV uv1 = NATIVE_TO_UNI(uv0);
	1470	uvuni_to_utf8(tmpbuf, uv1);
	1471
	1472	if (!swashp) / load on-demand */
	1473	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1474
	1475	/* The 0xDF is the only special casing Unicode code point below 0x100. */
	1476	if (special && (uv1 == 0xDF \|\| uv1 > 0xFF)) {
	1477	/* It might be "special" (sometimes, but not always,
	1478	* a multicharacter mapping) */
	1479	HV *hv;
	1480	SV **svp;
	1481
	1482	if ((hv = get_hv(special, FALSE)) &&
	1483	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1484	(*svp)) {
	1485	const char *s;
	1486
	1487	s = SvPV_const(*svp, len);
	1488	if (len == 1)
	1489	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1490	else {
	1491	#ifdef EBCDIC
	1492	/* If we have EBCDIC we need to remap the characters
	1493	* since any characters in the low 256 are Unicode
	1494	* code points, not EBCDIC. */
	1495	U8 t = (U8)s, tend = t + len, d;
	1496
	1497	d = tmpbuf;
	1498	if (SvUTF8(*svp)) {
	1499	STRLEN tlen = 0;
	1500
	1501	while (t < tend) {
	1502	UV c = utf8_to_uvchr(t, &tlen);
	1503	if (tlen > 0) {
	1504	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1505	t += tlen;
	1506	}
	1507	else
	1508	break;
	1509	}
	1510	}
	1511	else {
	1512	while (t < tend) {
	1513	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1514	t++;
	1515	}
	1516	}
	1517	len = d - tmpbuf;
	1518	Copy(tmpbuf, ustrp, len, U8);
	1519	#else
	1520	Copy(s, ustrp, len, U8);
	1521	#endif
	1522	}
	1523	}
	1524	}
	1525
	1526	if (!len && *swashp) {
	1527	UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1528
	1529	if (uv2) {
	1530	/* It was "normal" (a single character mapping). */
	1531	UV uv3 = UNI_TO_NATIVE(uv2);
	1532
	1533	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1534	}
	1535	}
	1536
	1537	if (!len) /* Neither: just copy. */
	1538	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1539
	1540	if (lenp)
	1541	*lenp = len;
	1542
	1543	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1544	}
	1545
	1546	/*
	1547	=for apidoc A\|UV\|to_utf8_upper\|U8 p\|U8 ustrp\|STRLEN *lenp
	1548
	1549	Convert the UTF-8 encoded character at p to its uppercase version and
	1550	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1551	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1552	the uppercase version may be longer than the original character.
	1553
	1554	The first character of the uppercased version is returned
	1555	(but note, as explained above, that there may be more.)
	1556
	1557	=cut */
	1558
	1559	UV
	1560	Perl_to_utf8_upper(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1561	{
	1562	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1563	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1564	}
	1565
	1566	/*
	1567	=for apidoc A\|UV\|to_utf8_title\|U8 p\|U8 ustrp\|STRLEN *lenp
	1568
	1569	Convert the UTF-8 encoded character at p to its titlecase version and
	1570	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1571	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1572	titlecase version may be longer than the original character.
	1573
	1574	The first character of the titlecased version is returned
	1575	(but note, as explained above, that there may be more.)
	1576
	1577	=cut */
	1578
	1579	UV
	1580	Perl_to_utf8_title(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1581	{
	1582	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1583	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1584	}
	1585
	1586	/*
	1587	=for apidoc A\|UV\|to_utf8_lower\|U8 p\|U8 ustrp\|STRLEN *lenp
	1588
	1589	Convert the UTF-8 encoded character at p to its lowercase version and
	1590	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1591	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1592	lowercase version may be longer than the original character.
	1593
	1594	The first character of the lowercased version is returned
	1595	(but note, as explained above, that there may be more.)
	1596
	1597	=cut */
	1598
	1599	UV
	1600	Perl_to_utf8_lower(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1601	{
	1602	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1603	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1604	}
	1605
	1606	/*
	1607	=for apidoc A\|UV\|to_utf8_fold\|U8 p\|U8 ustrp\|STRLEN *lenp
	1608
	1609	Convert the UTF-8 encoded character at p to its foldcase version and
	1610	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1611	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1612	foldcase version may be longer than the original character (up to
	1613	three characters).
	1614
	1615	The first character of the foldcased version is returned
	1616	(but note, as explained above, that there may be more.)
	1617
	1618	=cut */
	1619
	1620	UV
	1621	Perl_to_utf8_fold(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1622	{
	1623	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1624	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	1625	}
	1626
	1627	/* a "swash" is a swatch hash */
	1628
	1629	SV*
	1630	Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
	1631	{
	1632	SV* retval;
	1633	SV* const tokenbufsv = sv_newmortal();
	1634	dSP;
	1635	const size_t pkg_len = strlen(pkg);
	1636	const size_t name_len = strlen(name);
	1637	HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
	1638	SV* errsv_save;
	1639
	1640	PUSHSTACKi(PERLSI_MAGIC);
	1641	ENTER;
	1642	SAVEI32(PL_hints);
	1643	PL_hints = 0;
	1644	save_re_context();
	1645	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1646	ENTER;
	1647	errsv_save = newSVsv(ERRSV);
	1648	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	1649	Nullsv);
	1650	if (!SvTRUE(ERRSV))
	1651	sv_setsv(ERRSV, errsv_save);
	1652	SvREFCNT_dec(errsv_save);
	1653	LEAVE;
	1654	}
	1655	SPAGAIN;
	1656	PUSHMARK(SP);
	1657	EXTEND(SP,5);
	1658	PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
	1659	PUSHs(sv_2mortal(newSVpvn(name, name_len)));
	1660	PUSHs(listsv);
	1661	PUSHs(sv_2mortal(newSViv(minbits)));
	1662	PUSHs(sv_2mortal(newSViv(none)));
	1663	PUTBACK;
	1664	if (IN_PERL_COMPILETIME) {
	1665	/* XXX ought to be handled by lex_start */
	1666	SAVEI32(PL_in_my);
	1667	PL_in_my = 0;
	1668	sv_setpv(tokenbufsv, PL_tokenbuf);
	1669	}
	1670	errsv_save = newSVsv(ERRSV);
	1671	if (call_method("SWASHNEW", G_SCALAR))
	1672	retval = newSVsv(*PL_stack_sp--);
	1673	else
	1674	retval = &PL_sv_undef;
	1675	if (!SvTRUE(ERRSV))
	1676	sv_setsv(ERRSV, errsv_save);
	1677	SvREFCNT_dec(errsv_save);
	1678	LEAVE;
	1679	POPSTACK;
	1680	if (IN_PERL_COMPILETIME) {
	1681	STRLEN len;
	1682	const char* const pv = SvPV_const(tokenbufsv, len);
	1683
	1684	Copy(pv, PL_tokenbuf, len+1, char);
	1685	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
	1686	}
	1687	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	1688	if (SvPOK(retval))
	1689	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	1690	retval);
	1691	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1692	}
	1693	return retval;
	1694	}
	1695
	1696
	1697	/* This API is wrong for special case conversions since we may need to
	1698	* return several Unicode characters for a single Unicode character
	1699	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	1700	* the lower-level routine, and it is similarly broken for returning
	1701	* multiple values. --jhi */
	1702	UV
	1703	Perl_swash_fetch(pTHX_ SV sv, U8 ptr, bool do_utf8)
	1704	{
	1705	HV* const hv = (HV*)SvRV(sv);
	1706	U32 klen;
	1707	U32 off;
	1708	STRLEN slen;
	1709	STRLEN needents;
	1710	const U8 *tmps = NULL;
	1711	U32 bit;
	1712	SV *retval;
	1713	U8 tmputf8[2];
	1714	UV c = NATIVE_TO_ASCII(*ptr);
	1715
	1716	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1717	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	1718	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	1719	ptr = tmputf8;
	1720	}
	1721	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1722	* then the "swatch" is a vec() for al the chars which start
	1723	* with 0xAA..0xYY
	1724	* So the key in the hash (klen) is length of encoded char -1
	1725	*/
	1726	klen = UTF8SKIP(ptr) - 1;
	1727	off = ptr[klen];
	1728
	1729	if (klen == 0)
	1730	{
	1731	/* If char in invariant then swatch is for all the invariant chars
	1732	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	1733	*/
	1734	needents = UTF_CONTINUATION_MARK;
	1735	off = NATIVE_TO_UTF(ptr[klen]);
	1736	}
	1737	else
	1738	{
	1739	/* If char is encoded then swatch is for the prefix */
	1740	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1741	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1742	}
	1743
	1744	/*
	1745	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1746	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1747	* it's nothing to sniff at.) Pity we usually come through at least
	1748	* two function calls to get here...
	1749	*
	1750	* NB: this code assumes that swatches are never modified, once generated!
	1751	*/
	1752
	1753	if (hv == PL_last_swash_hv &&
	1754	klen == PL_last_swash_klen &&
	1755	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	1756	{
	1757	tmps = PL_last_swash_tmps;
	1758	slen = PL_last_swash_slen;
	1759	}
	1760	else {
	1761	/* Try our second-level swatch cache, kept in a hash. */
	1762	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	1763
	1764	/* If not cached, generate it via utf8::SWASHGET */
	1765	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (const U8)SvPV_const(*svp, slen))) {
	1766	dSP;
	1767	/* We use utf8n_to_uvuni() as we want an index into
	1768	Unicode tables, not a native character number.
	1769	*/
	1770	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	1771	ckWARN(WARN_UTF8) ?
	1772	0 : UTF8_ALLOW_ANY);
	1773	SV *errsv_save;
	1774	ENTER;
	1775	SAVETMPS;
	1776	save_re_context();
	1777	PUSHSTACKi(PERLSI_MAGIC);
	1778	PUSHMARK(SP);
	1779	EXTEND(SP,3);
	1780	PUSHs((SV*)sv);
	1781	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1782	PUSHs(sv_2mortal(newSViv((klen) ?
	1783	(code_point & ~(needents - 1)) : 0)));
	1784	PUSHs(sv_2mortal(newSViv(needents)));
	1785	PUTBACK;
	1786	errsv_save = newSVsv(ERRSV);
	1787	if (call_method("SWASHGET", G_SCALAR))
	1788	retval = newSVsv(*PL_stack_sp--);
	1789	else
	1790	retval = &PL_sv_undef;
	1791	if (!SvTRUE(ERRSV))
	1792	sv_setsv(ERRSV, errsv_save);
	1793	SvREFCNT_dec(errsv_save);
	1794	POPSTACK;
	1795	FREETMPS;
	1796	LEAVE;
	1797	if (IN_PERL_COMPILETIME)
	1798	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
	1799
	1800	svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
	1801
	1802	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| (slen << 3) < needents)
	1803	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
	1804	}
	1805
	1806	PL_last_swash_hv = hv;
	1807	PL_last_swash_klen = klen;
	1808	/* FIXME change interpvar.h? */
	1809	PL_last_swash_tmps = (U8 *) tmps;
	1810	PL_last_swash_slen = slen;
	1811	if (klen)
	1812	Copy(ptr, PL_last_swash_key, klen, U8);
	1813	}
	1814
	1815	switch ((int)((slen << 3) / needents)) {
	1816	case 1:
	1817	bit = 1 << (off & 7);
	1818	off >>= 3;
	1819	return (tmps[off] & bit) != 0;
	1820	case 8:
	1821	return tmps[off];
	1822	case 16:
	1823	off <<= 1;
	1824	return (tmps[off] << 8) + tmps[off + 1] ;
	1825	case 32:
	1826	off <<= 2;
	1827	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	1828	}
	1829	Perl_croak(aTHX_ "panic: swash_fetch");
	1830	return 0;
	1831	}
	1832
	1833
	1834	/*
	1835	=for apidoc A\|U8 \|uvchr_to_utf8\|U8 d\|UV uv
	1836
	1837	Adds the UTF-8 representation of the Native codepoint C<uv> to the end
	1838	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
	1839	bytes available. The return value is the pointer to the byte after the
	1840	end of the new character. In other words,
	1841
	1842	d = uvchr_to_utf8(d, uv);
	1843
	1844	is the recommended wide native character-aware way of saying
	1845
	1846	*(d++) = uv;
	1847
	1848	=cut
	1849	*/
	1850
	1851	/* On ASCII machines this is normally a macro but we want a
	1852	real function in case XS code wants it
	1853	*/
	1854	#undef Perl_uvchr_to_utf8
	1855	U8 *
	1856	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	1857	{
	1858	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	1859	}
	1860
	1861	U8 *
	1862	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	1863	{
	1864	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	1865	}
	1866
	1867	/*
	1868	=for apidoc A\|UV\|utf8n_to_uvchr\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	1869
	1870	Returns the native character value of the first character in the string C<s>
	1871	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	1872	length, in bytes, of that character.
	1873
	1874	Allows length and flags to be passed to low level routine.
	1875
	1876	=cut
	1877	*/
	1878	/* On ASCII machines this is normally a macro but we want
	1879	a real function in case XS code wants it
	1880	*/
	1881	#undef Perl_utf8n_to_uvchr
	1882	UV
	1883	Perl_utf8n_to_uvchr(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	1884	{
	1885	const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	1886	return UNI_TO_NATIVE(uv);
	1887	}
	1888
	1889	/*
	1890	=for apidoc A\|char \|pv_uni_display\|SV dsv\|U8 *spv\|STRLEN len\|STRLEN pvlim\|UV flags
	1891
	1892	Build to the scalar dsv a displayable version of the string spv,
	1893	length len, the displayable version being at most pvlim bytes long
	1894	(if longer, the rest is truncated and "..." will be appended).
	1895
	1896	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	1897	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	1898	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	1899	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	1900	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	1901	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	1902
	1903	The pointer to the PV of the dsv is returned.
	1904
	1905	=cut */
	1906	char *
	1907	Perl_pv_uni_display(pTHX_ SV dsv, U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	1908	{
	1909	int truncated = 0;
	1910	const char s, e;
	1911
	1912	sv_setpvn(dsv, "", 0);
	1913	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	1914	UV u;
	1915	/* This serves double duty as a flag and a character to print after
	1916	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	1917	*/
	1918	char ok = 0;
	1919
	1920	if (pvlim && SvCUR(dsv) >= pvlim) {
	1921	truncated++;
	1922	break;
	1923	}
	1924	u = utf8_to_uvchr((U8*)s, 0);
	1925	if (u < 256) {
	1926	const unsigned char c = (unsigned char)u & 0xFF;
	1927	if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
	1928	switch (c) {
	1929	case '\n':
	1930	ok = 'n'; break;
	1931	case '\r':
	1932	ok = 'r'; break;
	1933	case '\t':
	1934	ok = 't'; break;
	1935	case '\f':
	1936	ok = 'f'; break;
	1937	case '\a':
	1938	ok = 'a'; break;
	1939	case '\\':
	1940	ok = '\\'; break;
	1941	default: break;
	1942	}
	1943	if (ok) {
	1944	Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
	1945	}
	1946	}
	1947	/* isPRINT() is the locale-blind version. */
	1948	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	1949	Perl_sv_catpvf(aTHX_ dsv, "%c", c);
	1950	ok = 1;
	1951	}
	1952	}
	1953	if (!ok)
	1954	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	1955	}
	1956	if (truncated)
	1957	sv_catpvn(dsv, "...", 3);
	1958
	1959	return SvPVX(dsv);
	1960	}
	1961
	1962	/*
	1963	=for apidoc A\|char \|sv_uni_display\|SV dsv\|SV *ssv\|STRLEN pvlim\|UV flags
	1964
	1965	Build to the scalar dsv a displayable version of the scalar sv,
	1966	the displayable version being at most pvlim bytes long
	1967	(if longer, the rest is truncated and "..." will be appended).
	1968
	1969	The flags argument is as in pv_uni_display().
	1970
	1971	The pointer to the PV of the dsv is returned.
	1972
	1973	=cut */
	1974	char *
	1975	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	1976	{
	1977	return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX_const(ssv),
	1978	SvCUR(ssv), pvlim, flags);
	1979	}
	1980
	1981	/*
	1982	=for apidoc A\|I32\|ibcmp_utf8\|const char s1\|char pe1\|register UV l1\|bool u1\|const char s2\|char **pe2\|register UV l2\|bool u2
	1983
	1984	Return true if the strings s1 and s2 differ case-insensitively, false
	1985	if not (if they are equal case-insensitively). If u1 is true, the
	1986	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
	1987	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
	1988	are false, the respective string is assumed to be in native 8-bit
	1989	encoding.
	1990
	1991	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
	1992	in there (they will point at the beginning of the I<next> character).
	1993	If the pointers behind pe1 or pe2 are non-NULL, they are the end
	1994	pointers beyond which scanning will not continue under any
	1995	circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
	1996	s2+l2 will be used as goal end pointers that will also stop the scan,
	1997	and which qualify towards defining a successful match: all the scans
	1998	that define an explicit length must reach their goal pointers for
	1999	a match to succeed).
	2000
	2001	For case-insensitiveness, the "casefolding" of Unicode is used
	2002	instead of upper/lowercasing both the characters, see
	2003	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	2004
	2005	=cut */
	2006	I32
	2007	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	2008	{
	2009	register const U8 p1 = (const U8)s1;
	2010	register const U8 p2 = (const U8)s2;
	2011	register const U8 f1 = 0, f2 = 0;
	2012	register U8 e1 = 0, q1 = 0;
	2013	register U8 e2 = 0, q2 = 0;
	2014	STRLEN n1 = 0, n2 = 0;
	2015	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	2016	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	2017	U8 natbuf[1+1];
	2018	STRLEN foldlen1, foldlen2;
	2019	bool match;
	2020
	2021	if (pe1)
	2022	e1 = (U8*)pe1;
	2023	if (e1 == 0 \|\| (l1 && l1 < (UV)(e1 - (const U8*)s1)))
	2024	f1 = (const U8*)s1 + l1;
	2025	if (pe2)
	2026	e2 = (U8*)pe2;
	2027	if (e2 == 0 \|\| (l2 && l2 < (UV)(e2 - (const U8*)s2)))
	2028	f2 = (const U8*)s2 + l2;
	2029
	2030	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
	2031	return 1; /* mismatch; possible infinite loop or false positive */
	2032
	2033	if (!u1 \|\| !u2)
	2034	natbuf[1] = 0; /* Need to terminate the buffer. */
	2035
	2036	while ((e1 == 0 \|\| p1 < e1) &&
	2037	(f1 == 0 \|\| p1 < f1) &&
	2038	(e2 == 0 \|\| p2 < e2) &&
	2039	(f2 == 0 \|\| p2 < f2)) {
	2040	if (n1 == 0) {
	2041	if (u1)
	2042	to_utf8_fold((U8 *)p1, foldbuf1, &foldlen1);
	2043	else {
	2044	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
	2045	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
	2046	}
	2047	q1 = foldbuf1;
	2048	n1 = foldlen1;
	2049	}
	2050	if (n2 == 0) {
	2051	if (u2)
	2052	to_utf8_fold((U8 *)p2, foldbuf2, &foldlen2);
	2053	else {
	2054	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
	2055	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
	2056	}
	2057	q2 = foldbuf2;
	2058	n2 = foldlen2;
	2059	}
	2060	while (n1 && n2) {
	2061	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
	2062	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
	2063	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
	2064	return 1; /* mismatch */
	2065	n1 -= UTF8SKIP(q1);
	2066	q1 += UTF8SKIP(q1);
	2067	n2 -= UTF8SKIP(q2);
	2068	q2 += UTF8SKIP(q2);
	2069	}
	2070	if (n1 == 0)
	2071	p1 += u1 ? UTF8SKIP(p1) : 1;
	2072	if (n2 == 0)
	2073	p2 += u2 ? UTF8SKIP(p2) : 1;
	2074
	2075	}
	2076
	2077	/* A match is defined by all the scans that specified
	2078	* an explicit length reaching their final goals. */
	2079	match = (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2);
	2080
	2081	if (match) {
	2082	if (pe1)
	2083	pe1 = (char)p1;
	2084	if (pe2)
	2085	pe2 = (char)p2;
	2086	}
	2087
	2088	return match ? 0 : 1; /* 0 match, 1 mismatch */
	2089	}
	2090
	2091	/*
	2092	* Local variables:
	2093	* c-indentation-style: bsd
	2094	* c-basic-offset: 4
	2095	* indent-tabs-mode: t
	2096	* End:
	2097	*
	2098	* ex: set ts=8 sts=4 sw=4 noet:
	2099	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/perl/5.8.8/utf8.c

Download in other formats: