File Coverage

utf8.c

Criterion	Covered	Total	%
statement	1065	1354	78.7
branch	1030	1790	57.5
condition			n/a
subroutine			n/a
total	2095	3144	66.6

line	stmt	bran	code
1			/* utf8.c
2			*
3			* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4			* by Larry Wall and others
5			*
6			* You may distribute under the terms of either the GNU General Public
7			* License or the Artistic License, as specified in the README file.
8			*
9			*/
10
11			/*
12			* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13			* heard of that we don't want to see any closer; and that's the one place
14			* we're trying to get to! And that's just where we can't get, nohow.'
15			*
16			* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
17			*
18			* 'Well do I understand your speech,' he answered in the same language;
19			* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
20			* as is the custom in the West, if you wish to be answered?'
21			* --Gandalf, addressing Théoden's door wardens
22			*
23			* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
24			*
25			* ...the travellers perceived that the floor was paved with stones of many
26			* hues; branching runes and strange devices intertwined beneath their feet.
27			*
28			* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
29			*/
30
31			#include "EXTERN.h"
32			#define PERL_IN_UTF8_C
33			#include "perl.h"
34			#include "inline_invlist.c"
35
36			static const char unees[] =
37			"Malformed UTF-8 character (unexpected end of string)";
38
39			/*
40			=head1 Unicode Support
41
42			This file contains various utility functions for manipulating UTF8-encoded
43			strings. For the uninitiated, this is a method of representing arbitrary
44			Unicode characters as a variable number of bytes, in such a way that
45			characters in the ASCII range are unmodified, and a zero byte never appears
46			within non-zero characters.
47
48			=cut
49			*/
50
51			/*
52			=for apidoc is_ascii_string
53
54			Returns true if the first C bytes of the string C ~~are the same whether~~
55			or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
56			is, if they are invariant. On ASCII-ish machines, only ASCII characters
57			fit this definition, hence the function's name.
58
59			If C is 0, it will be calculated using C.
60
61			See also L(), L(), and L().
62
63			=cut
64			*/
65
66			bool
67	533506		Perl_is_ascii_string(const U8 *s, STRLEN len)
68			{
69	533506	50	const U8* const send = s + (len ? len : strlen((const char *)s));
70			const U8* x = s;
71
72			PERL_ARGS_ASSERT_IS_ASCII_STRING;
73
74	13381006	100	for (; x < send; ++x) {
75	12849638	100	if (!UTF8_IS_INVARIANT(*x))
76			break;
77			}
78
79	533506		return x == send;
80			}
81
82			/*
83			=for apidoc uvoffuni_to_utf8_flags
84
85			THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
86			Instead, B or
87			L>.
88
89			This function is like them, but the input is a strict Unicode
90			(as opposed to native) code point. Only in very rare circumstances should code
91			not be using the native code point.
92
93			For details, see the description for L>.
94
95			=cut
96			*/
97
98			U8 *
99	8636950		Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
100			{
101			PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
102
103	8636950	100	if (UNI_IS_INVARIANT(uv)) {
104	4073184		*d++ = (U8) LATIN1_TO_NATIVE(uv);
105	4073184		return d;
106			}
107
108			/* The first problematic code point is the first surrogate */
109	4563766	100	if (uv >= UNICODE_SURROGATE_FIRST
110	946222	100	&& ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
111			{
112	923878	100	if (UNICODE_IS_SURROGATE(uv)) {
113	4936	50	if (flags & UNICODE_WARN_SURROGATE) {
114	0		Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
115			"UTF-16 surrogate U+%04"UVXf, uv);
116			}
117	4936	50	if (flags & UNICODE_DISALLOW_SURROGATE) {
118			return NULL;
119			}
120			}
121	918942	100	else if (UNICODE_IS_SUPER(uv)) {
122	77488	50	if (flags & UNICODE_WARN_SUPER
123	77488	100	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
		50
124			{
125	0		Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
126			"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
127			}
128	77488	50	if (flags & UNICODE_DISALLOW_SUPER
129	77488	100	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
		50
130			{
131			return NULL;
132			}
133			}
134	841454	100	else if (UNICODE_IS_NONCHAR(uv)) {
		100
135	912	50	if (flags & UNICODE_WARN_NONCHAR) {
136	0		Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
137			"Unicode non-character U+%04"UVXf" is illegal for open interchange",
138			uv);
139			}
140	912	50	if (flags & UNICODE_DISALLOW_NONCHAR) {
141			return NULL;
142			}
143			}
144			}
145
146			#if defined(EBCDIC)
147			{
148			STRLEN len = OFFUNISKIP(uv);
149			U8 *p = d+len-1;
150			while (p > d) {
151			*p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
152			uv >>= UTF_ACCUMULATION_SHIFT;
153			}
154			*p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
155			return d+len;
156			}
157			#else /* Non loop style */
158	4563766	100	if (uv < 0x800) {
159	1453792		*d++ = (U8)(( uv >> 6) \| 0xc0);
160	1453792		*d++ = (U8)(( uv & 0x3f) \| 0x80);
161	1453792		return d;
162			}
163	3109974	100	if (uv < 0x10000) {
164	2810290		*d++ = (U8)(( uv >> 12) \| 0xe0);
165	2810290		*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
166	2810290		*d++ = (U8)(( uv & 0x3f) \| 0x80);
167	2810290		return d;
168			}
169	299684	100	if (uv < 0x200000) {
170	222796		*d++ = (U8)(( uv >> 18) \| 0xf0);
171	222796		*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
172	222796		*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
173	222796		*d++ = (U8)(( uv & 0x3f) \| 0x80);
174	222796		return d;
175			}
176	76888	100	if (uv < 0x4000000) {
177	32		*d++ = (U8)(( uv >> 24) \| 0xf8);
178	32		*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
179	32		*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
180	32		*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
181	32		*d++ = (U8)(( uv & 0x3f) \| 0x80);
182	32		return d;
183			}
184	76856	100	if (uv < 0x80000000) {
185	68		*d++ = (U8)(( uv >> 30) \| 0xfc);
186	68		*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
187	68		*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
188	68		*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
189	68		*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
190	68		*d++ = (U8)(( uv & 0x3f) \| 0x80);
191	68		return d;
192			}
193			#ifdef HAS_QUAD
194	76788	100	if (uv < UTF8_QUAD_MAX)
195			#endif
196			{
197	34		d++ = 0xfe; / Can't match U+FEFF! */
198	34		*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
199	34		*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
200	34		*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
201	34		*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
202	34		*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
203	34		*d++ = (U8)(( uv & 0x3f) \| 0x80);
204	34		return d;
205			}
206			#ifdef HAS_QUAD
207			{
208	76754		d++ = 0xff; / Can't match U+FFFE! */
209	76754		d++ = 0x80; / 6 Reserved bits */
210	76754		d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
211	76754		*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
212	76754		*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
213	76754		*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
214	76754		*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
215	76754		*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
216	76754		*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
217	76754		*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
218	76754		*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
219	76754		*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
220	76754		*d++ = (U8)(( uv & 0x3f) \| 0x80);
221	4385292		return d;
222			}
223			#endif
224			#endif /* Non loop style */
225			}
226			/*
227			=for apidoc uvchr_to_utf8
228
229			Adds the UTF-8 representation of the native code point C to the end
230			of the string C; C should have at least C free
231			bytes available. The return value is the pointer to the byte after the
232			end of the new character. In other words,
233
234			d = uvchr_to_utf8(d, uv);
235
236			is the recommended wide native character-aware way of saying
237
238			*(d++) = uv;
239
240			This function accepts any UV as input. To forbid or warn on non-Unicode code
241			points, or those that may be problematic, see L.
242
243			=cut
244			*/
245
246			/* This is also a macro */
247			PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
248
249			U8 *
250	0		Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
251			{
252	0		return uvchr_to_utf8(d, uv);
253			}
254
255			/*
256			=for apidoc uvchr_to_utf8_flags
257
258			Adds the UTF-8 representation of the native code point C to the end
259			of the string C; C should have at least C free
260			bytes available. The return value is the pointer to the byte after the
261			end of the new character. In other words,
262
263			d = uvchr_to_utf8_flags(d, uv, flags);
264
265			or, in most cases,
266
267			d = uvchr_to_utf8_flags(d, uv, 0);
268
269			This is the Unicode-aware way of saying
270
271			*(d++) = uv;
272
273			This function will convert to UTF-8 (and not warn) even code points that aren't
274			legal Unicode or are problematic, unless C contains one or more of the
275			following flags:
276
277			If C is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
278			the function will raise a warning, provided UTF8 warnings are enabled. If instead
279			UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
280			If both flags are set, the function will both warn and return NULL.
281
282			The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
283			affect how the function handles a Unicode non-character. And likewise, the
284			UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of
285			code points that are
286			above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
287			even less portable) can be warned and/or disallowed even if other above-Unicode
288			code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
289			flags.
290
291			And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
292			above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
293			DISALLOW flags.
294
295			=cut
296			*/
297
298			/* This is also a macro */
299			PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
300
301			U8 *
302	0		Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
303			{
304	0		return uvchr_to_utf8_flags(d, uv, flags);
305			}
306
307			/*
308
309			Tests if the first C bytes of string C ~~form a valid UTF-8~~
310			character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a
311			valid UTF-8 character. The number of bytes in the UTF-8 character
312			will be returned if it is valid, otherwise 0.
313
314			This is the "slow" version as opposed to the "fast" version which is
315			the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
316			difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
317			or less you should use the IS_UTF8_CHAR(), for lengths of five or more
318			you should use the _slow(). In practice this means that the _slow()
319			will be used very rarely, since the maximum Unicode code point (as of
320			Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
321			the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into
322			five bytes or more.
323
324			=cut */
325			PERL_STATIC_INLINE STRLEN
326	32		S_is_utf8_char_slow(const U8 *s, const STRLEN len)
327			{
328			dTHX; /* The function called below requires thread context */
329
330			STRLEN actual_len;
331
332			PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
333
334	32		utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY);
335
336	32	50	return (actual_len == (STRLEN) -1) ? 0 : actual_len;
337			}
338
339			/*
340			=for apidoc is_utf8_char_buf
341
342			Returns the number of bytes that comprise the first UTF-8 encoded character in
343			buffer C. C should point to one position beyond the end of the
344			buffer. 0 is returned if C does not point to a complete, valid UTF-8
345			encoded character.
346
347			Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
348			machines) is a valid UTF-8 character.
349
350			=cut */
351
352			STRLEN
353	410108		Perl_is_utf8_char_buf(const U8 buf, const U8 buf_end)
354			{
355
356			STRLEN len;
357
358			PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
359
360	410108	50	if (buf_end <= buf) {
361			return 0;
362			}
363
364	410108		len = buf_end - buf;
365	410108	50	if (len > UTF8SKIP(buf)) {
366	0		len = UTF8SKIP(buf);
367			}
368
369			#ifdef IS_UTF8_CHAR
370	410108	100	if (IS_UTF8_CHAR_FAST(len))
371	410104	100	return IS_UTF8_CHAR(buf, len) ? len : 0;
		50
		50
		0
		50
		0
		0
		50
		0
		0
		100
		50
		50
		50
		50
		50
		50
		50
		100
		50
		50
		0
		100
		50
		50
		50
		100
		50
		100
		50
		50
		50
		50
		50
		100
372			#endif /* #ifdef IS_UTF8_CHAR */
373	205056		return is_utf8_char_slow(buf, len);
374			}
375
376			/*
377			=for apidoc is_utf8_char
378
379			Tests if some arbitrary number of bytes begins in a valid UTF-8
380			character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
381			character is a valid UTF-8 character. The actual number of bytes in the UTF-8
382			character will be returned if it is valid, otherwise 0.
383
384			This function is deprecated due to the possibility that malformed input could
385			cause reading beyond the end of the input buffer. Use L
386			instead.
387
388			=cut */
389
390			STRLEN
391	0		Perl_is_utf8_char(const U8 *s)
392			{
393			PERL_ARGS_ASSERT_IS_UTF8_CHAR;
394
395			/* Assumes we have enough space, which is why this is deprecated */
396	0		return is_utf8_char_buf(s, s + UTF8SKIP(s));
397			}
398
399
400			/*
401			=for apidoc is_utf8_string
402
403			Returns true if the first C bytes of string C ~~form a valid~~
404			UTF-8 string, false otherwise. If C is 0, it will be calculated
405			using C (which means if you use this option, that C ~~has to have a~~
406			terminating NUL byte). Note that all characters being ASCII constitute 'a
407			valid UTF-8 string'.
408
409			See also L(), L(), and L().
410
411			=cut
412			*/
413
414			bool
415	32690		Perl_is_utf8_string(const U8 *s, STRLEN len)
416			{
417	32690	100	const U8* const send = s + (len ? len : strlen((const char *)s));
418			const U8* x = s;
419
420			PERL_ARGS_ASSERT_IS_UTF8_STRING;
421
422	787575	100	while (x < send) {
423			/* Inline the easy bits of is_utf8_char() here for speed... */
424	738552	100	if (UTF8_IS_INVARIANT(*x)) {
425	442400		x++;
426			}
427			else {
428			/* ... and call is_utf8_char() only if really needed. */
429	296152		const STRLEN c = UTF8SKIP(x);
430	296152		const U8* const next_char_ptr = x + c;
431
432	296152	100	if (next_char_ptr > send) {
433			return FALSE;
434			}
435
436	296144	100	if (IS_UTF8_CHAR_FAST(c)) {
437	296116	100	if (!IS_UTF8_CHAR(x, c))
		50
		50
		0
		50
		0
		0
		50
		0
		0
		100
		50
		50
		50
		50
		50
		50
		50
		100
		50
		50
		0
		100
		50
		50
		50
		100
		50
		100
		50
		50
		50
		50
		50
		100
438			return FALSE;
439			}
440	369278	50	else if (! is_utf8_char_slow(x, c)) {
441			return FALSE;
442			}
443			x = next_char_ptr;
444			}
445			}
446
447			return TRUE;
448			}
449
450			/*
451			Implemented as a macro in utf8.h
452
453			=for apidoc is_utf8_string_loc
454
455			Like L but stores the location of the failure (in the
456			case of "utf8ness failure") or the location C~~+C (in the case of~~
457			"utf8ness success") in the C.
458
459			See also L() and L().
460
461			=for apidoc is_utf8_string_loclen
462
463			Like L() but stores the location of the failure (in the
464			case of "utf8ness failure") or the location C~~+C (in the case of~~
465			"utf8ness success") in the C, and the number of UTF-8
466			encoded characters in the C.
467
468			See also L() and L().
469
470			=cut
471			*/
472
473			bool
474	2290		Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
475			{
476	2290	50	const U8* const send = s + (len ? len : strlen((const char *)s));
477			const U8* x = s;
478			STRLEN c;
479			STRLEN outlen = 0;
480
481			PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
482
483	98919	100	while (x < send) {
484			const U8* next_char_ptr;
485
486			/* Inline the easy bits of is_utf8_char() here for speed... */
487	95494	100	if (UTF8_IS_INVARIANT(*x))
488	89540		next_char_ptr = x + 1;
489			else {
490			/* ... and call is_utf8_char() only if really needed. */
491	5954		c = UTF8SKIP(x);
492	5954		next_char_ptr = c + x;
493	5954	100	if (next_char_ptr > send) {
494			goto out;
495			}
496	5948	50	if (IS_UTF8_CHAR_FAST(c)) {
497	5948	100	if (!IS_UTF8_CHAR(x, c))
		50
		50
		0
		50
		0
		0
		50
		0
		0
		50
		50
		50
		50
		0
		0
		0
		0
		100
		50
		50
		0
		100
		50
		50
		50
		50
		50
		100
		50
		100
		50
		50
		50
		100
498			c = 0;
499			} else
500	0		c = is_utf8_char_slow(x, c);
501	5948	100	if (!c)
502			goto out;
503			}
504			x = next_char_ptr;
505	95484		outlen++;
506			}
507
508			out:
509	2290	50	if (el)
510	0		*el = outlen;
511
512	2290	50	if (ep)
513	2290		*ep = x;
514	2290		return (x == send);
515			}
516
517			/*
518
519			=for apidoc utf8n_to_uvchr
520
521			THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
522			Most code should use L() rather than call this directly.
523
524			Bottom level UTF-8 decode routine.
525			Returns the native code point value of the first character in the string C,
526			which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
527			C bytes; C<*retlen> (if C isn't NULL) will be set to
528			the length, in bytes, of that character.
529
530			The value of C determines the behavior when C ~~does not point to a~~
531			well-formed UTF-8 character. If C is 0, when a malformation is found,
532			zero is returned and C<retlen> is set so that (S + C<retlen>>) is the
533			next possible position in C ~~that could begin a non-malformed character.~~
534			Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
535
536			Various ALLOW flags can be set in C to allow (and not warn on)
537			individual types of malformations, such as the sequence being overlong (that
538			is, when there is a shorter sequence that can express the same code point;
539			overlong sequences are expressly forbidden in the UTF-8 standard due to
540			potential security issues). Another malformation example is the first byte of
541			a character not being a legal first byte. See F for the list of such
542			flags. For allowed 0 length strings, this function returns 0; for allowed
543			overlong sequences, the computed code point is returned; for all other allowed
544			malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
545			determinable reasonable value.
546
547			The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
548			flags) malformation is found. If this flag is set, the routine assumes that
549			the caller will raise a warning, and this function will silently just set
550			C to C<-1> (cast to C) and return zero.
551
552			Note that this API requires disambiguation between successful decoding a NUL
553			character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
554			in both cases, 0 is returned. To disambiguate, upon a zero return, see if the
555			first byte of C ~~is 0 as well. If so, the input was a NUL; if not, the input~~
556			had an error.
557
558			Certain code points are considered problematic. These are Unicode surrogates,
559			Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
560			By default these are considered regular code points, but certain situations
561			warrant special handling for them. If C contains
562			UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
563			malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
564			UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
565			maximum) can be set to disallow these categories individually.
566
567			The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
568			UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
569			for their respective categories, but otherwise the code points are considered
570			valid (not malformations). To get a category to both be treated as a
571			malformation and raise a warning, specify both the WARN and DISALLOW flags.
572			(But note that warnings are not raised if lexically disabled nor if
573			UTF8_CHECK_ONLY is also specified.)
574
575			Very large code points (above 0x7FFF_FFFF) are considered more problematic than
576			the others that are above the Unicode legal maximum. There are several
577			reasons: they requre at least 32 bits to represent them on ASCII platforms, are
578			not representable at all on EBCDIC platforms, and the original UTF-8
579			specification never went above this number (the current 0x10FFFF limit was
580			imposed later). (The smaller ones, those that fit into 32 bits, are
581			representable by a UV on ASCII platforms, but not by an IV, which means that
582			the number of operations that can be performed on them is quite restricted.)
583			The UTF-8 encoding on ASCII platforms for these large code points begins with a
584			byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to
585			be treated as malformations, while allowing smaller above-Unicode code points.
586			(Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
587			including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like
588			the other WARN flags, but applies just to these code points.
589
590			All other code points corresponding to Unicode characters, including private
591			use and those yet to be assigned, are never considered malformed and never
592			warn.
593
594			=cut
595			*/
596
597			UV
598	16071668		Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
599			{
600			dVAR;
601			const U8 * const s0 = s;
602			U8 overflow_byte = '\0'; /* Save byte in case of overflow */
603			U8 * send;
604	16071668		UV uv = *s;
605			STRLEN expectlen;
606			SV* sv = NULL;
607			UV outlier_ret = 0; /* return value when input is in error or problematic
608			*/
609			UV pack_warn = 0; /* Save result of packWARN() for later */
610			bool unexpected_non_continuation = FALSE;
611			bool overflowed = FALSE;
612			bool do_overlong_test = TRUE; /* May have to skip this test */
613
614			const char* const malformed_text = "Malformed UTF-8 character";
615
616			PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
617
618			/* The order of malformation tests here is important. We should consume as
619			* few bytes as possible in order to not skip any valid character. This is
620			* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
621			* http://unicode.org/reports/tr36 for more discussion as to why. For
622			* example, once we've done a UTF8SKIP, we can tell the expected number of
623			* bytes, and could fail right off the bat if the input parameters indicate
624			* that there are too few available. But it could be that just that first
625			* byte is garbled, and the intended character occupies fewer bytes. If we
626			* blindly assumed that the first byte is correct, and skipped based on
627			* that number, we could skip over a valid input character. So instead, we
628			* always examine the sequence byte-by-byte.
629			*
630			* We also should not consume too few bytes, otherwise someone could inject
631			* things. For example, an input could be deliberately designed to
632			* overflow, and if this code bailed out immediately upon discovering that,
633			* returning to the caller *retlen pointing to the very next byte (one
634			* which is actually part of of the overflowing sequence), that could look
635			* legitimate to the caller, which could discard the initial partial
636			* sequence and process the rest, inappropriately */
637
638			/* Zero length strings, if allowed, of necessity are zero */
639	16071668	100	if (UNLIKELY(curlen == 0)) {
640	10	50	if (retlen) {
641	10		*retlen = 0;
642			}
643
644	10	100	if (flags & UTF8_ALLOW_EMPTY) {
645			return 0;
646			}
647	8	100	if (! (flags & UTF8_CHECK_ONLY)) {
648	4		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
649			}
650			goto malformed;
651			}
652
653	16071658		expectlen = UTF8SKIP(s);
654
655			/* A well-formed UTF-8 character, as the vast majority of calls to this
656			* function will be for, has this expected length. For efficiency, set
657			* things up here to return it. It will be overriden only in those rare
658			* cases where a malformation is found */
659	16071658	100	if (retlen) {
660	13947306		*retlen = expectlen;
661			}
662
663			/* An invariant is trivially well-formed */
664	16071658	100	if (UTF8_IS_INVARIANT(uv)) {
665			return uv;
666			}
667
668			/* A continuation character can't start a valid sequence */
669	4409996	100	if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
670	198	100	if (flags & UTF8_ALLOW_CONTINUATION) {
671	2	50	if (retlen) {
672	2		*retlen = 1;
673			}
674			return UNICODE_REPLACEMENT;
675			}
676
677	196	100	if (! (flags & UTF8_CHECK_ONLY)) {
678	194		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
679			}
680			curlen = 1;
681			goto malformed;
682			}
683
684			/* Here is not a continuation byte, nor an invariant. The only thing left
685			* is a start byte (possibly for an overlong) */
686
687			#ifdef EBCDIC
688			uv = NATIVE_UTF8_TO_I8(uv);
689			#endif
690
691			/* Remove the leading bits that indicate the number of bytes in the
692			* character's whole UTF-8 sequence, leaving just the bits that are part of
693			* the value */
694	4409798	100	uv &= UTF_START_MASK(expectlen);
695
696			/* Now, loop through the remaining bytes in the character's sequence,
697			* accumulating each into the working value as we go. Be sure to not look
698			* past the end of the input string */
699	4409798		send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
700
701	12275328	100	for (s = s0 + 1; s < send; s++) {
702	7866060	100	if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
703			#ifndef EBCDIC /* Can't overflow in EBCDIC */
704	7865530	100	if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
705
706			/* The original implementors viewed this malformation as more
707			* serious than the others (though I, khw, don't understand
708			* why, since other malformations also give very very wrong
709			* results), so there is no way to turn off checking for it.
710			* Set a flag, but keep going in the loop, so that we absorb
711			* the rest of the bytes that comprise the character. */
712			overflowed = TRUE;
713	92		overflow_byte = s; / Save for warning message's use */
714			}
715			#endif
716	7865530		uv = UTF8_ACCUMULATE(uv, *s);
717			}
718			else {
719			/* Here, found a non-continuation before processing all expected
720			* bytes. This byte begins a new character, so quit, even if
721			* allowing this malformation. */
722			unexpected_non_continuation = TRUE;
723			break;
724			}
725			} /* End of loop through the character's bytes */
726
727			/* Save how many bytes were actually in the character */
728	4409798		curlen = s - s0;
729
730			/* The loop above finds two types of malformations: non-continuation and/or
731			* overflow. The non-continuation malformation is really a too-short
732			* malformation, as it means that the current character ended before it was
733			* expected to (being terminated prematurely by the beginning of the next
734			* character, whereas in the too-short malformation there just are too few
735			* bytes available to hold the character. In both cases, the check below
736			* that we have found the expected number of bytes would fail if executed.)
737			* Thus the non-continuation malformation is really unnecessary, being a
738			* subset of the too-short malformation. But there may be existing
739			* applications that are expecting the non-continuation type, so we retain
740			* it, and return it in preference to the too-short malformation. (If this
741			* code were being written from scratch, the two types might be collapsed
742			* into one.) I, khw, am also giving priority to returning the
743			* non-continuation and too-short malformations over overflow when multiple
744			* ones are present. I don't know of any real reason to prefer one over
745			* the other, except that it seems to me that multiple-byte errors trumps
746			* errors from a single byte */
747	4409798	100	if (UNLIKELY(unexpected_non_continuation)) {
748	530	100	if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
749	526	100	if (! (flags & UTF8_CHECK_ONLY)) {
750	272	100	if (curlen == 1) {
751	254		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, s, s0));
752			}
753			else {
754	18		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, s, (int) curlen, s0, (int)expectlen));
755			}
756			}
757			goto malformed;
758			}
759			uv = UNICODE_REPLACEMENT;
760
761			/* Skip testing for overlongs, as the REPLACEMENT may not be the same
762			* as what the original expectations were. */
763			do_overlong_test = FALSE;
764	4	50	if (retlen) {
765	4		*retlen = curlen;
766			}
767			}
768	4409268	100	else if (UNLIKELY(curlen < expectlen)) {
769	44	100	if (! (flags & UTF8_ALLOW_SHORT)) {
770	42	100	if (! (flags & UTF8_CHECK_ONLY)) {
771	40	100	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
772			}
773			goto malformed;
774			}
775			uv = UNICODE_REPLACEMENT;
776			do_overlong_test = FALSE;
777	2	50	if (retlen) {
778	2		*retlen = curlen;
779			}
780			}
781
782			#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
783	4409230	100	if ((s0 & 0xFE) == 0xFE / matches both FE, FF */
784	68358	100	&& (flags & (UTF8_WARN_FE_FF\|UTF8_DISALLOW_FE_FF)))
785			{
786			/* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
787			* generation of the sv, since no warnings are raised under CHECK */
788	150	100	if ((flags & (UTF8_WARN_FE_FF\|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
789	60	100	&& ckWARN_d(WARN_UTF8))
790			{
791			/* This message is deliberately not of the same syntax as the other
792			* messages for malformations, for backwards compatibility in the
793			* unlikely event that code is relying on its precise earlier text
794			*/
795	12		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
796			pack_warn = packWARN(WARN_UTF8);
797			}
798	150	100	if (flags & UTF8_DISALLOW_FE_FF) {
799			goto malformed;
800			}
801			}
802	4409110	100	if (UNLIKELY(overflowed)) {
803
804			/* If the first byte is FF, it will overflow a 32-bit word. If the
805			* first byte is FE, it will overflow a signed 32-bit word. The
806			* above preserves backward compatibility, since its message was used
807			* in earlier versions of this code in preference to overflow */
808	46		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
809	46		goto malformed;
810			}
811			#endif
812
813	4409064	100	if (do_overlong_test
814	4409058	100	&& expectlen > (STRLEN) OFFUNISKIP(uv)
		100
		100
		100
		100
		100
		100
		100
815	44	100	&& ! (flags & UTF8_ALLOW_LONG))
816			{
817			/* The overlong malformation has lower precedence than the others.
818			* Note that if this malformation is allowed, we return the actual
819			* value, instead of the replacement character. This is because this
820			* value is actually well-defined. */
821	42	100	if (! (flags & UTF8_CHECK_ONLY)) {
822	36	100	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
		100
		100
		100
		50
		0
		0
		50
823			}
824			goto malformed;
825			}
826
827			/* Here, the input is considered to be well-formed , but could be a
828			* problematic code point that is not allowed by the input parameters. */
829	4409022	100	if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
830	685448	100	&& (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
831			\|UTF8_WARN_ILLEGAL_INTERCHANGE)))
832			{
833	7866	100	if (UNICODE_IS_SURROGATE(uv)) {
834	64	100	if ((flags & (UTF8_WARN_SURROGATE\|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
835	20	100	&& ckWARN2_d(WARN_UTF8, WARN_SURROGATE))
836			{
837	8		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
838			pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE);
839			}
840	64	100	if (flags & UTF8_DISALLOW_SURROGATE) {
841			goto disallowed;
842			}
843			}
844	7802	100	else if ((uv > PERL_UNICODE_MAX)) {
845	86	100	if ((flags & (UTF8_WARN_SUPER\|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
846	20	100	&& ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE))
847			{
848	8		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
849			pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE);
850			}
851	86	100	if (flags & UTF8_DISALLOW_SUPER) {
852			goto disallowed;
853			}
854			}
855	7716	100	else if (UNICODE_IS_NONCHAR(uv)) {
		100
856	52	100	if ((flags & (UTF8_WARN_NONCHAR\|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
857	20	100	&& ckWARN2_d(WARN_UTF8, WARN_NONCHAR))
858			{
859	8		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
860			pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR);
861			}
862	52	100	if (flags & UTF8_DISALLOW_NONCHAR) {
863			goto disallowed;
864			}
865			}
866
867	7718	100	if (sv) {
868			outlier_ret = uv; /* Note we don't bother to convert to native,
869			as all the outlier code points are the same
870			in both ASCII and EBCDIC */
871			goto do_warn;
872			}
873
874			/* Here, this is not considered a malformed character, so drop through
875			* to return it */
876			}
877
878			return UNI_TO_NATIVE(uv);
879
880			/* There are three cases which get to beyond this point. In all 3 cases:
881			* if not null points to a string to print as a warning.
882			* is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
883			* set.
884			* is what return value to use if UTF8_CHECK_ONLY isn't set.
885			* This is done by initializing it to 0, and changing it only
886			* for case 1).
887			* The 3 cases are:
888			* 1) The input is valid but problematic, and to be warned about. The
889			* return value is the resultant code point; <*retlen> is set to
890			* , the number of bytes that comprise the code point.
891			* contains the result of packWARN() for the warning
892			* types. The entry point for this case is the label ;
893			* 2) The input is a valid code point but disallowed by the parameters to
894			* this function. The return value is 0. If UTF8_CHECK_ONLY is set,
895			* <*relen> is -1; otherwise it is , the number of bytes that
896			* comprise the code point. contains the result of
897			* packWARN() for the warning types. The entry point for this case is
898			* the label .
899			* 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
900			* is set, <*relen> is -1; otherwise it is , the number of
901			* bytes that comprise the malformation. All such malformations are
902			* assumed to be warning type . The entry point for this case
903			* is the label .
904			*/
905
906			malformed:
907
908	980	100	if (sv && ckWARN_d(WARN_UTF8)) {
		100
909			pack_warn = packWARN(WARN_UTF8);
910			}
911
912			disallowed:
913
914	1128	100	if (flags & UTF8_CHECK_ONLY) {
915	438	50	if (retlen)
916	438		*retlen = ((STRLEN) -1);
917			return 0;
918			}
919
920			do_warn:
921
922	706	100	if (pack_warn) { /* was initialized to 0, and changed only
923			if warnings are to be raised. */
924	562		const char * const string = SvPVX_const(sv);
925
926	562	100	if (PL_op)
927	548	50	Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
		0
928			else
929	14		Perl_warner(aTHX_ pack_warn, "%s", string);
930			}
931
932	706	100	if (retlen) {
933	8036132		*retlen = curlen;
934			}
935
936			return outlier_ret;
937			}
938
939			/*
940			=for apidoc utf8_to_uvchr_buf
941
942			Returns the native code point of the first character in the string C ~~which~~
943			is assumed to be in UTF-8 encoding; C points to 1 beyond the end of C.
944			C<*retlen> will be set to the length, in bytes, of that character.
945
946			If C ~~does not point to a well-formed UTF-8 character and UTF8 warnings are~~
947			enabled, zero is returned and C<*retlen> is set (if C isn't
948			NULL) to -1. If those warnings are off, the computed value, if well-defined
949			(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
950			C<retlen> is set (if C isn't NULL) so that (S + C<retlen>>) is
951			the next possible position in C ~~that could begin a non-malformed character.~~
952			See L for details on when the REPLACEMENT CHARACTER is
953			returned.
954
955			=cut
956			*/
957
958
959			UV
960	0		Perl_utf8_to_uvchr_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
961			{
962			assert(s < send);
963
964	0	0	return utf8n_to_uvchr(s, send - s, retlen,
965			ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
966			}
967
968			/* Like L(), but should only be called when it is known that
969			* there are no malformations in the input UTF-8 string C~~. surrogates,~~
970			* non-character code points, and non-Unicode code points are allowed. A macro
971			* in utf8.h is used to normally avoid this function wrapper */
972
973			UV
974	6448256		Perl_valid_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
975			{
976	6448256		UV expectlen = UTF8SKIP(s);
977	6448256		const U8* send = s + expectlen;
978	6448256		UV uv = *s;
979
980			PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
981
982	6448256	100	if (retlen) {
983	414938		*retlen = expectlen;
984			}
985
986			/* An invariant is trivially returned */
987	6448256	100	if (expectlen == 1) {
988			return uv;
989			}
990
991			#ifdef EBCDIC
992			uv = NATIVE_UTF8_TO_I8(uv);
993			#endif
994
995			/* Remove the leading bits that indicate the number of bytes, leaving just
996			* the bits that are part of the value */
997	4289160	100	uv &= UTF_START_MASK(expectlen);
998
999			/* Now, loop through the remaining bytes, accumulating each into the
1000			* working total as we go. (I khw tried unrolling the loop for up to 4
1001			* bytes, but there was no performance improvement) */
1002	11936924	100	for (++s; s < send; s++) {
1003	6568216		uv = UTF8_ACCUMULATE(uv, *s);
1004			}
1005
1006			return UNI_TO_NATIVE(uv);
1007
1008			}
1009
1010			/*
1011			=for apidoc utf8_to_uvchr
1012
1013			Returns the native code point of the first character in the string C
1014			which is assumed to be in UTF-8 encoding; C will be set to the
1015			length, in bytes, of that character.
1016
1017			Some, but not all, UTF-8 malformations are detected, and in fact, some
1018			malformed input could cause reading beyond the end of the input buffer, which
1019			is why this function is deprecated. Use L instead.
1020
1021			If C ~~points to one of the detected malformations, and UTF8 warnings are~~
1022			enabled, zero is returned and C<*retlen> is set (if C isn't
1023			NULL) to -1. If those warnings are off, the computed value if well-defined (or
1024			the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1025			is set (if C isn't NULL) so that (S + C<*retlen>>) is the
1026			next possible position in C ~~that could begin a non-malformed character.~~
1027			See L for details on when the REPLACEMENT CHARACTER is returned.
1028
1029			=cut
1030			*/
1031
1032			UV
1033	0		Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
1034			{
1035			PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
1036
1037	0	0	return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
1038			}
1039
1040			/*
1041			=for apidoc utf8_to_uvuni_buf
1042
1043			Only in very rare circumstances should code need to be dealing in Unicode
1044			(as opposed to native) code points. In those few cases, use
1045			C> instead.
1046
1047			Returns the Unicode (not-native) code point of the first character in the
1048			string C ~~which~~
1049			is assumed to be in UTF-8 encoding; C points to 1 beyond the end of C.
1050			C will be set to the length, in bytes, of that character.
1051
1052			If C ~~does not point to a well-formed UTF-8 character and UTF8 warnings are~~
1053			enabled, zero is returned and C<*retlen> is set (if C isn't
1054			NULL) to -1. If those warnings are off, the computed value if well-defined (or
1055			the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1056			is set (if C isn't NULL) so that (S + C<*retlen>>) is the
1057			next possible position in C ~~that could begin a non-malformed character.~~
1058			See L for details on when the REPLACEMENT CHARACTER is returned.
1059
1060			=cut
1061			*/
1062
1063			UV
1064	0		Perl_utf8_to_uvuni_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
1065			{
1066			PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1067
1068			assert(send > s);
1069
1070			/* Call the low level routine asking for checks */
1071	0	0	return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
1072			ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
1073			}
1074
1075			/* DEPRECATED!
1076			* Like L(), but should only be called when it is known that
1077			* there are no malformations in the input UTF-8 string C~~. Surrogates,~~
1078			* non-character code points, and non-Unicode code points are allowed */
1079
1080			UV
1081	0		Perl_valid_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
1082			{
1083			PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
1084
1085	0		return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
1086			}
1087
1088			/*
1089			=for apidoc utf8_to_uvuni
1090
1091			Returns the Unicode code point of the first character in the string C
1092			which is assumed to be in UTF-8 encoding; C will be set to the
1093			length, in bytes, of that character.
1094
1095			Some, but not all, UTF-8 malformations are detected, and in fact, some
1096			malformed input could cause reading beyond the end of the input buffer, which
1097			is one reason why this function is deprecated. The other is that only in
1098			extremely limited circumstances should the Unicode versus native code point be
1099			of any interest to you. See L for alternatives.
1100
1101			If C ~~points to one of the detected malformations, and UTF8 warnings are~~
1102			enabled, zero is returned and C<*retlen> is set (if C doesn't point to
1103			NULL) to -1. If those warnings are off, the computed value if well-defined (or
1104			the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1105			is set (if C isn't NULL) so that (S + C<*retlen>>) is the
1106			next possible position in C ~~that could begin a non-malformed character.~~
1107			See L for details on when the REPLACEMENT CHARACTER is returned.
1108
1109			=cut
1110			*/
1111
1112			UV
1113	0		Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
1114			{
1115			PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
1116
1117	0		return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
1118			}
1119
1120			/*
1121			=for apidoc utf8_length
1122
1123			Return the length of the UTF-8 char encoded string C ~~in characters.~~
1124			Stops at C (inclusive). If C s> or if the scan would end
1125			up past C, croaks.
1126
1127			=cut
1128			*/
1129
1130			STRLEN
1131	7119006		Perl_utf8_length(pTHX_ const U8 s, const U8 e)
1132			{
1133			dVAR;
1134			STRLEN len = 0;
1135
1136			PERL_ARGS_ASSERT_UTF8_LENGTH;
1137
1138			/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1139			* the bitops (especially ~) can create illegal UTF-8.
1140			* In other words: in Perl UTF-8 is not just for Unicode. */
1141
1142	7119006	50	if (e < s)
1143			goto warn_and_return;
1144	629958130	100	while (s < e) {
1145	622839124		s += UTF8SKIP(s);
1146	622839124		len++;
1147			}
1148
1149	7119006	100	if (e != s) {
1150	32		len--;
1151			warn_and_return:
1152	32	50	if (PL_op)
1153	48	50	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1154	16	0	"%s in %s", unees, OP_DESC(PL_op));
1155			else
1156	0		Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1157			}
1158
1159	7119006		return len;
1160			}
1161
1162			/*
1163			=for apidoc utf8_distance
1164
1165			Returns the number of UTF-8 characters between the UTF-8 pointers C
1166			and C.
1167
1168			WARNING: use only if you know that the pointers point inside the
1169			same UTF-8 buffer.
1170
1171			=cut
1172			*/
1173
1174			IV
1175	377416		Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
1176			{
1177			PERL_ARGS_ASSERT_UTF8_DISTANCE;
1178
1179	377416	50	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1180			}
1181
1182			/*
1183			=for apidoc utf8_hop
1184
1185			Return the UTF-8 pointer C ~~displaced by C characters, either~~
1186			forward or backward.
1187
1188			WARNING: do not use the following unless you know C is within
1189			the UTF-8 data pointed to by C and that on entry C ~~is aligned~~
1190			on the first byte of character or just after the last byte of a character.
1191
1192			=cut
1193			*/
1194
1195			U8 *
1196	61814		Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
1197			{
1198			PERL_ARGS_ASSERT_UTF8_HOP;
1199
1200			PERL_UNUSED_CONTEXT;
1201			/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1202			* the bitops (especially ~) can create illegal UTF-8.
1203			* In other words: in Perl UTF-8 is not just for Unicode. */
1204
1205	61814	100	if (off >= 0) {
1206	1224442	100	while (off--)
1207	1164038		s += UTF8SKIP(s);
1208			}
1209			else {
1210	3990	100	while (off++) {
1211	2580		s--;
1212	5085	100	while (UTF8_IS_CONTINUATION(*s))
1213	510		s--;
1214			}
1215			}
1216	61814		return (U8 *)s;
1217			}
1218
1219			/*
1220			=for apidoc bytes_cmp_utf8
1221
1222			Compares the sequence of characters (stored as octets) in C, C with the
1223			sequence of characters (stored as UTF-8) in C, C. Returns 0 if they are
1224			equal, -1 or -2 if the first string is less than the second string, +1 or +2
1225			if the first string is greater than the second string.
1226
1227			-1 or +1 is returned if the shorter string was identical to the start of the
1228			longer string. -2 or +2 is returned if the was a difference between characters
1229			within the strings.
1230
1231			=cut
1232			*/
1233
1234			int
1235	273870		Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
1236			{
1237	273870		const U8 *const bend = b + blen;
1238	273870		const U8 *const uend = u + ulen;
1239
1240			PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1241
1242			PERL_UNUSED_CONTEXT;
1243
1244	4034843	100	while (b < bend && u < uend) {
1245	3855934		U8 c = *u++;
1246	3855934	100	if (!UTF8_IS_INVARIANT(c)) {
1247	185108	100	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1248	118226	50	if (u < uend) {
1249	118226		U8 c1 = *u++;
1250	118226	50	if (UTF8_IS_CONTINUATION(c1)) {
1251	118226		c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
1252			} else {
1253	0	0	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
		0
1254			"Malformed UTF-8 character "
1255			"(unexpected non-continuation byte 0x%02x"
1256			", immediately after start byte 0x%02x)"
1257			/* Dear diag.t, it's in the pod. */
1258			"%s%s", c1, c,
1259	0		PL_op ? " in " : "",
1260	0	0	PL_op ? OP_DESC(PL_op) : "");
		0
1261	0		return -2;
1262			}
1263			} else {
1264	0	0	if (PL_op)
1265	0	0	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1266	0	0	"%s in %s", unees, OP_DESC(PL_op));
1267			else
1268	0		Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1269			return -2; /* Really want to return undef :-) */
1270			}
1271			} else {
1272			return -2;
1273			}
1274			}
1275	3789052	100	if (*b != c) {
1276	165014	100	return *b < c ? -2 : +2;
1277			}
1278	3624038		++b;
1279			}
1280
1281	41974	100	if (b == bend && u == uend)
1282			return 0;
1283
1284	137293	100	return b < bend ? +1 : -1;
1285			}
1286
1287			/*
1288			=for apidoc utf8_to_bytes
1289
1290			Converts a string C ~~of length C from UTF-8 into native byte encoding.~~
1291			Unlike L, this over-writes the original string, and
1292			updates C to contain the new length.
1293			Returns zero on failure, setting C to -1.
1294
1295			If you need a copy of the string, see L.
1296
1297			=cut
1298			*/
1299
1300			U8 *
1301	170464		Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
1302			{
1303			U8 * const save = s;
1304	170464		U8 * const send = s + *len;
1305			U8 *d;
1306
1307			PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1308
1309			/* ensure valid UTF-8 and chars < 256 before updating string */
1310	1013490	100	while (s < send) {
1311	758560	100	if (! UTF8_IS_INVARIANT(*s)) {
1312	121154	100	if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
		50
		50
1313	766		*len = ((STRLEN) -1);
1314	766		return 0;
1315			}
1316	120388		s++;
1317			}
1318	757794		s++;
1319			}
1320
1321			d = s = save;
1322	924844	100	while (s < send) {
1323	755146		U8 c = *s++;
1324	755146	100	if (! UTF8_IS_INVARIANT(c)) {
1325			/* Then it is two-byte encoded */
1326	120116		c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1327	120116		s++;
1328			}
1329	755146		*d++ = c;
1330			}
1331	169698		*d = '\0';
1332	169698		*len = d - save;
1333	170081		return save;
1334			}
1335
1336			/*
1337			=for apidoc bytes_from_utf8
1338
1339			Converts a string C ~~of length C from UTF-8 into native byte encoding.~~
1340			Unlike L but like L, returns a pointer to
1341			the newly-created string, and updates C to contain the new
1342			length. Returns the original string if no conversion occurs, C
1343			is unchanged. Do nothing if C points to 0. Sets C to
1344			0 if C ~~is converted or consisted entirely of characters that are invariant~~
1345			in utf8 (i.e., US-ASCII on non-EBCDIC machines).
1346
1347			=cut
1348			*/
1349
1350			U8 *
1351	2950312		Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
1352			{
1353			U8 *d;
1354			const U8 *start = s;
1355			const U8 *send;
1356			I32 count = 0;
1357
1358			PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1359
1360			PERL_UNUSED_CONTEXT;
1361	2950312	50	if (!*is_utf8)
1362			return (U8 *)start;
1363
1364			/* ensure valid UTF-8 and chars < 256 before converting string */
1365	10013178	100	for (send = s + *len; s < send;) {
1366	7202750	100	if (! UTF8_IS_INVARIANT(*s)) {
1367	1623176	100	if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
		50
		50
1368			return (U8 *)start;
1369			}
1370	8136		count++;
1371	8136		s++;
1372			}
1373	5587710		s++;
1374			}
1375
1376	1335272		*is_utf8 = FALSE;
1377
1378	1335272		Newx(d, (*len) - count + 1, U8);
1379			s = start; start = d;
1380	7188024	100	while (s < send) {
1381	5185116		U8 c = *s++;
1382	5185116	100	if (! UTF8_IS_INVARIANT(c)) {
1383			/* Then it is two-byte encoded */
1384	8074		c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1385	8074		s++;
1386			}
1387	5185116		*d++ = c;
1388			}
1389	1335272		*d = '\0';
1390	1335272		*len = d - start;
1391	2142792		return (U8 *)start;
1392			}
1393
1394			/*
1395			=for apidoc bytes_to_utf8
1396
1397			Converts a string C ~~of length C bytes from the native encoding into~~
1398			UTF-8.
1399			Returns a pointer to the newly-created string, and sets C to
1400			reflect the new length in bytes.
1401
1402			A NUL character will be written after the end of the string.
1403
1404			If you want to convert to UTF-8 from encodings other than
1405			the native (Latin1 or EBCDIC),
1406			see L().
1407
1408			=cut
1409			*/
1410
1411			/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1412			likewise need duplication. */
1413
1414			U8*
1415	213714		Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
1416			{
1417	213714		const U8 * const send = s + (*len);
1418			U8 *d;
1419			U8 *dst;
1420
1421			PERL_ARGS_ASSERT_BYTES_TO_UTF8;
1422			PERL_UNUSED_CONTEXT;
1423
1424	213714		Newx(d, (len) 2 + 1, U8);
1425			dst = d;
1426
1427	12019571	100	while (s < send) {
1428	11699000		append_utf8_from_native_byte(*s, &d);
1429	11699000		s++;
1430			}
1431	213714		*d = '\0';
1432	213714		*len = d-dst;
1433	213714		return dst;
1434			}
1435
1436			/*
1437			* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
1438			*
1439			* Destination must be pre-extended to 3/2 source. Do not use in-place.
1440			* We optimize for native, for obvious reasons. */
1441
1442			U8*
1443	28666		Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1444			{
1445			U8* pend;
1446			U8* dstart = d;
1447
1448			PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1449
1450	28666	100	if (bytelen & 1)
1451	6		Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
1452
1453	28660		pend = p + bytelen;
1454
1455	731254	100	while (p < pend) {
1456	688286		UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1457	688286		p += 2;
1458	688286	100	if (UNI_IS_INVARIANT(uv)) {
1459	680068		*d++ = LATIN1_TO_NATIVE((U8) uv);
1460	680068		continue;
1461			}
1462	8218	100	if (uv <= MAX_UTF8_TWO_BYTE) {
1463	120		*d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1464	120		*d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
1465	120		continue;
1466			}
1467			#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1468			#define LAST_HIGH_SURROGATE 0xDBFF
1469			#define FIRST_LOW_SURROGATE 0xDC00
1470			#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
1471	8098	100	if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
1472	7962	100	if (p >= pend) {
1473	2		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1474			} else {
1475	7960		UV low = (p[0] << 8) + p[1];
1476	7960		p += 2;
1477	7960	100	if (low < FIRST_LOW_SURROGATE \|\| low > LAST_LOW_SURROGATE)
1478	8		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1479	11928		uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1480	7952		+ (low - FIRST_LOW_SURROGATE) + 0x10000;
1481			}
1482	136	100	} else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
1483	12		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1484			}
1485			#ifdef EBCDIC
1486			d = uvoffuni_to_utf8_flags(d, uv, 0);
1487			#else
1488	8076	100	if (uv < 0x10000) {
1489	124		*d++ = (U8)(( uv >> 12) \| 0xe0);
1490	124		*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
1491	124		*d++ = (U8)(( uv & 0x3f) \| 0x80);
1492	124		continue;
1493			}
1494			else {
1495	7952		*d++ = (U8)(( uv >> 18) \| 0xf0);
1496	7952		*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
1497	7952		*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
1498	7952		*d++ = (U8)(( uv & 0x3f) \| 0x80);
1499	348108		continue;
1500			}
1501			#endif
1502			}
1503	28638		*newlen = d - dstart;
1504	28638		return d;
1505			}
1506
1507			/* Note: this one is slightly destructive of the source. */
1508
1509			U8*
1510	14356		Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1511			{
1512			U8* s = (U8*)p;
1513	14356		U8* const send = s + bytelen;
1514
1515			PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1516
1517	14356	100	if (bytelen & 1)
1518	2		Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1519			(UV)bytelen);
1520
1521	362482	100	while (s < send) {
1522	348128		const U8 tmp = s[0];
1523	348128		s[0] = s[1];
1524	348128		s[1] = tmp;
1525	348128		s += 2;
1526			}
1527	14354		return utf16_to_utf8(p, d, bytelen, newlen);
1528			}
1529
1530			bool
1531	836		Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1532			{
1533			U8 tmpbuf[UTF8_MAXBYTES+1];
1534	836		uvchr_to_utf8(tmpbuf, c);
1535	836		return _is_utf8_FOO(classnum, tmpbuf);
1536			}
1537
1538			/* for now these are all defined (inefficiently) in terms of the utf8 versions.
1539			* Note that the macros in handy.h that call these short-circuit calling them
1540			* for Latin-1 range inputs */
1541
1542			bool
1543	0		Perl_is_uni_alnum(pTHX_ UV c)
1544			{
1545			U8 tmpbuf[UTF8_MAXBYTES+1];
1546	0		uvchr_to_utf8(tmpbuf, c);
1547	0		return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf);
1548			}
1549
1550			bool
1551	0		Perl_is_uni_alnumc(pTHX_ UV c)
1552			{
1553			U8 tmpbuf[UTF8_MAXBYTES+1];
1554	0		uvchr_to_utf8(tmpbuf, c);
1555	0		return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf);
1556			}
1557
1558			/* Internal function so we can deprecate the external one, and call
1559			this one from other deprecated functions in this file */
1560
1561			PERL_STATIC_INLINE bool
1562			S_is_utf8_idfirst(pTHX_ const U8 *p)
1563			{
1564			dVAR;
1565
1566	0		if (*p == '_')
1567			return TRUE;
1568			/* is_utf8_idstart would be more logical. */
1569	0		return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1570			}
1571
1572			bool
1573	0		Perl_is_uni_idfirst(pTHX_ UV c)
1574			{
1575			U8 tmpbuf[UTF8_MAXBYTES+1];
1576	0		uvchr_to_utf8(tmpbuf, c);
1577	0		return S_is_utf8_idfirst(aTHX_ tmpbuf);
1578			}
1579
1580			bool
1581	68		Perl__is_uni_perl_idcont(pTHX_ UV c)
1582			{
1583			U8 tmpbuf[UTF8_MAXBYTES+1];
1584	68		uvchr_to_utf8(tmpbuf, c);
1585	68		return _is_utf8_perl_idcont(tmpbuf);
1586			}
1587
1588			bool
1589	60		Perl__is_uni_perl_idstart(pTHX_ UV c)
1590			{
1591			U8 tmpbuf[UTF8_MAXBYTES+1];
1592	60		uvchr_to_utf8(tmpbuf, c);
1593	60		return _is_utf8_perl_idstart(tmpbuf);
1594			}
1595
1596			bool
1597	0		Perl_is_uni_alpha(pTHX_ UV c)
1598			{
1599			U8 tmpbuf[UTF8_MAXBYTES+1];
1600	0		uvchr_to_utf8(tmpbuf, c);
1601	0		return _is_utf8_FOO(_CC_ALPHA, tmpbuf);
1602			}
1603
1604			bool
1605	0		Perl_is_uni_ascii(pTHX_ UV c)
1606			{
1607	0		return isASCII(c);
1608			}
1609
1610			bool
1611	0		Perl_is_uni_blank(pTHX_ UV c)
1612			{
1613	0	0	return isBLANK_uni(c);
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
1614			}
1615
1616			bool
1617	0		Perl_is_uni_space(pTHX_ UV c)
1618			{
1619	0	0	return isSPACE_uni(c);
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
1620			}
1621
1622			bool
1623	0		Perl_is_uni_digit(pTHX_ UV c)
1624			{
1625			U8 tmpbuf[UTF8_MAXBYTES+1];
1626	0		uvchr_to_utf8(tmpbuf, c);
1627	0		return _is_utf8_FOO(_CC_DIGIT, tmpbuf);
1628			}
1629
1630			bool
1631	0		Perl_is_uni_upper(pTHX_ UV c)
1632			{
1633			U8 tmpbuf[UTF8_MAXBYTES+1];
1634	0		uvchr_to_utf8(tmpbuf, c);
1635	0		return _is_utf8_FOO(_CC_UPPER, tmpbuf);
1636			}
1637
1638			bool
1639	0		Perl_is_uni_lower(pTHX_ UV c)
1640			{
1641			U8 tmpbuf[UTF8_MAXBYTES+1];
1642	0		uvchr_to_utf8(tmpbuf, c);
1643	0		return _is_utf8_FOO(_CC_LOWER, tmpbuf);
1644			}
1645
1646			bool
1647	0		Perl_is_uni_cntrl(pTHX_ UV c)
1648			{
1649	0	0	return isCNTRL_L1(c);
		0
1650			}
1651
1652			bool
1653	0		Perl_is_uni_graph(pTHX_ UV c)
1654			{
1655			U8 tmpbuf[UTF8_MAXBYTES+1];
1656	0		uvchr_to_utf8(tmpbuf, c);
1657	0		return _is_utf8_FOO(_CC_GRAPH, tmpbuf);
1658			}
1659
1660			bool
1661	0		Perl_is_uni_print(pTHX_ UV c)
1662			{
1663			U8 tmpbuf[UTF8_MAXBYTES+1];
1664	0		uvchr_to_utf8(tmpbuf, c);
1665	0		return _is_utf8_FOO(_CC_PRINT, tmpbuf);
1666			}
1667
1668			bool
1669	0		Perl_is_uni_punct(pTHX_ UV c)
1670			{
1671			U8 tmpbuf[UTF8_MAXBYTES+1];
1672	0		uvchr_to_utf8(tmpbuf, c);
1673	0		return _is_utf8_FOO(_CC_PUNCT, tmpbuf);
1674			}
1675
1676			bool
1677	0		Perl_is_uni_xdigit(pTHX_ UV c)
1678			{
1679	0	0	return isXDIGIT_uni(c);
		0
		0
		0
		0
		0
1680			}
1681
1682			UV
1683	62676		Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1684			{
1685			/* We have the latin1-range values compiled into the core, so just use
1686			* those, converting the result to utf8. The only difference between upper
1687			* and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1688			* either "SS" or "Ss". Which one to use is passed into the routine in
1689			* 'S_or_s' to avoid a test */
1690
1691	62676		UV converted = toUPPER_LATIN1_MOD(c);
1692
1693			PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1694
1695			assert(S_or_s == 'S' \|\| S_or_s == 's');
1696
1697	62676	100	if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for
1698			characters in this range */
1699	32292		*p = (U8) converted;
1700	32292		*lenp = 1;
1701	32292		return converted;
1702			}
1703
1704			/* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1705			* which it maps to one of them, so as to only have to have one check for
1706			* it in the main case */
1707	30384	100	if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1708	204		switch (c) {
1709			case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1710			converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1711			break;
1712			case MICRO_SIGN:
1713			converted = GREEK_CAPITAL_LETTER_MU;
1714	64		break;
1715			case LATIN_SMALL_LETTER_SHARP_S:
1716	72		*(p)++ = 'S';
1717	72		*p = S_or_s;
1718	72		*lenp = 2;
1719	72		return 'S';
1720			default:
1721	0		Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
1722			assert(0); /* NOTREACHED */
1723			}
1724			}
1725
1726	30312		*(p)++ = UTF8_TWO_BYTE_HI(converted);
1727	30312		*p = UTF8_TWO_BYTE_LO(converted);
1728	30312		*lenp = 2;
1729
1730	46494		return converted;
1731			}
1732
1733			/* Call the function to convert a UTF-8 encoded character to the specified case.
1734			* Note that there may be more than one character in the result.
1735			* INP is a pointer to the first byte of the input character
1736			* OUTP will be set to the first byte of the string of changed characters. It
1737			* needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1738			* LENP will be set to the length in bytes of the string of changed characters
1739			*
1740			* The functions return the ordinal of the first character in the string of OUTP */
1741			#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc")
1742			#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc")
1743			#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc")
1744
1745			/* This additionally has the input parameter SPECIALS, which if non-zero will
1746			* cause this to use the SPECIALS hash for folding (meaning get full case
1747			* folding); otherwise, when zero, this implies a simple case fold */
1748			#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL)
1749
1750			UV
1751	546		Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1752			{
1753			dVAR;
1754
1755			/* Convert the Unicode character whose ordinal is to its uppercase
1756			* version and store that in UTF-8 in and its length in bytes in .
1757			* Note that the needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1758			* the changed version may be longer than the original character.
1759			*
1760			* The ordinal of the first character of the changed version is returned
1761			* (but note, as explained above, that there may be more.) */
1762
1763			PERL_ARGS_ASSERT_TO_UNI_UPPER;
1764
1765	546	100	if (c < 256) {
1766	512		return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1767			}
1768
1769	34		uvchr_to_utf8(p, c);
1770	290		return CALL_UPPER_CASE(p, p, lenp);
1771			}
1772
1773			UV
1774	546		Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1775			{
1776			dVAR;
1777
1778			PERL_ARGS_ASSERT_TO_UNI_TITLE;
1779
1780	546	100	if (c < 256) {
1781	512		return _to_upper_title_latin1((U8) c, p, lenp, 's');
1782			}
1783
1784	34		uvchr_to_utf8(p, c);
1785	290		return CALL_TITLE_CASE(p, p, lenp);
1786			}
1787
1788			STATIC U8
1789			S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
1790			{
1791			/* We have the latin1-range values compiled into the core, so just use
1792			* those, converting the result to utf8. Since the result is always just
1793			* one character, we allow to be NULL */
1794
1795	60176		U8 converted = toLOWER_LATIN1(c);
1796
1797	60176	50	if (p != NULL) {
		50
		50
1798	60176	50	if (NATIVE_IS_INVARIANT(converted)) {
		50
		100
1799	31466		*p = converted;
1800	31466		*lenp = 1;
1801			}
1802			else {
1803	28710		*p = UTF8_TWO_BYTE_HI(converted);
1804	28710		*(p+1) = UTF8_TWO_BYTE_LO(converted);
1805	28710		*lenp = 2;
1806			}
1807			}
1808			return converted;
1809			}
1810
1811			UV
1812	530		Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1813			{
1814			dVAR;
1815
1816			PERL_ARGS_ASSERT_TO_UNI_LOWER;
1817
1818	530	100	if (c < 256) {
1819	768		return to_lower_latin1((U8) c, p, lenp);
1820			}
1821
1822	18		uvchr_to_utf8(p, c);
1823	274		return CALL_LOWER_CASE(p, p, lenp);
1824			}
1825
1826			UV
1827	3027170		Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
1828			{
1829			/* Corresponds to to_lower_latin1(); bits meanings:
1830			* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1831			* FOLD_FLAGS_FULL iff full folding is to be used;
1832			*
1833			* Not to be used for locale folds
1834			*/
1835
1836			UV converted;
1837
1838			PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1839
1840			assert (! (flags & FOLD_FLAGS_LOCALE));
1841
1842	3027170	100	if (c == MICRO_SIGN) {
1843			converted = GREEK_SMALL_LETTER_MU;
1844			}
1845	2986918	100	else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
1846
1847			/* If can't cross 127/128 boundary, can't return "ss"; instead return
1848			* two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
1849			* under those circumstances. */
1850	446202	100	if (flags & FOLD_FLAGS_NOMIX_ASCII) {
1851	70840		lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
1852	70840		Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1853			p, *lenp, U8);
1854	70840		return LATIN_SMALL_LETTER_LONG_S;
1855			}
1856			else {
1857	375362		*(p)++ = 's';
1858	375362		*p = 's';
1859	375362		*lenp = 2;
1860	375362		return 's';
1861			}
1862			}
1863			else { /* In this range the fold of all other characters is their lower
1864			case */
1865	2540716		converted = toLOWER_LATIN1(c);
1866			}
1867
1868	2580968	100	if (NATIVE_IS_INVARIANT(converted)) {
1869	2366680		*p = (U8) converted;
1870	2366680		*lenp = 1;
1871			}
1872			else {
1873	214288		*(p)++ = UTF8_TWO_BYTE_HI(converted);
1874	214288		*p = UTF8_TWO_BYTE_LO(converted);
1875	1620729		*lenp = 2;
1876			}
1877
1878			return converted;
1879			}
1880
1881			UV
1882	4383144		Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
1883			{
1884
1885			/* Not currently externally documented, and subject to change
1886			* bits meanings:
1887			* FOLD_FLAGS_FULL iff full folding is to be used;
1888			* FOLD_FLAGS_LOCALE iff in locale
1889			* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1890			*/
1891
1892			PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
1893
1894	4383144	100	if (c < 256) {
1895	2765492		UV result = _to_fold_latin1((U8) c, p, lenp,
1896			flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
1897			/* It is illegal for the fold to cross the 255/256 boundary under
1898			* locale; in this case return the original */
1899	1434954	50	return (result > 256 && flags & FOLD_FLAGS_LOCALE)
1900			? c
1901	2765492	100	: result;
1902			}
1903
1904			/* If no special needs, just use the macro */
1905	1617652	100	if ( ! (flags & (FOLD_FLAGS_LOCALE\|FOLD_FLAGS_NOMIX_ASCII))) {
1906	981148		uvchr_to_utf8(p, c);
1907	981148	100	return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
1908			}
1909			else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
1910			the special flags. */
1911			U8 utf8_c[UTF8_MAXBYTES + 1];
1912	636504		uvchr_to_utf8(utf8_c, c);
1913	2509824		return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL);
1914			}
1915			}
1916
1917			bool
1918	0		Perl_is_uni_alnum_lc(pTHX_ UV c)
1919			{
1920	0	0	if (c < 256) {
1921	0	0	return isALNUM_LC(c);
		0
		0
1922			}
1923	0		return _is_uni_FOO(_CC_WORDCHAR, c);
1924			}
1925
1926			bool
1927	0		Perl_is_uni_alnumc_lc(pTHX_ UV c)
1928			{
1929	0	0	if (c < 256) {
1930	0	0	return isALPHANUMERIC_LC(c);
		0
1931			}
1932	0		return _is_uni_FOO(_CC_ALPHANUMERIC, c);
1933			}
1934
1935			bool
1936	0		Perl_is_uni_idfirst_lc(pTHX_ UV c)
1937			{
1938	0	0	if (c < 256) {
1939	0	0	return isIDFIRST_LC(c);
		0
		0
1940			}
1941	0		return _is_uni_perl_idstart(c);
1942			}
1943
1944			bool
1945	0		Perl_is_uni_alpha_lc(pTHX_ UV c)
1946			{
1947	0	0	if (c < 256) {
1948	0	0	return isALPHA_LC(c);
		0
1949			}
1950	0		return _is_uni_FOO(_CC_ALPHA, c);
1951			}
1952
1953			bool
1954	0		Perl_is_uni_ascii_lc(pTHX_ UV c)
1955			{
1956	0	0	if (c < 256) {
1957	0	0	return isASCII_LC(c);
		0
1958			}
1959			return 0;
1960			}
1961
1962			bool
1963	0		Perl_is_uni_blank_lc(pTHX_ UV c)
1964			{
1965	0	0	if (c < 256) {
1966	0	0	return isBLANK_LC(c);
		0
1967			}
1968	0	0	return isBLANK_uni(c);
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
1969			}
1970
1971			bool
1972	0		Perl_is_uni_space_lc(pTHX_ UV c)
1973			{
1974	0	0	if (c < 256) {
1975	0	0	return isSPACE_LC(c);
		0
1976			}
1977	0	0	return isSPACE_uni(c);
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
1978			}
1979
1980			bool
1981	0		Perl_is_uni_digit_lc(pTHX_ UV c)
1982			{
1983	0	0	if (c < 256) {
1984	0	0	return isDIGIT_LC(c);
		0
1985			}
1986	0		return _is_uni_FOO(_CC_DIGIT, c);
1987			}
1988
1989			bool
1990	0		Perl_is_uni_upper_lc(pTHX_ UV c)
1991			{
1992	0	0	if (c < 256) {
1993	0	0	return isUPPER_LC(c);
		0
1994			}
1995	0		return _is_uni_FOO(_CC_UPPER, c);
1996			}
1997
1998			bool
1999	0		Perl_is_uni_lower_lc(pTHX_ UV c)
2000			{
2001	0	0	if (c < 256) {
2002	0	0	return isLOWER_LC(c);
		0
2003			}
2004	0		return _is_uni_FOO(_CC_LOWER, c);
2005			}
2006
2007			bool
2008	0		Perl_is_uni_cntrl_lc(pTHX_ UV c)
2009			{
2010	0	0	if (c < 256) {
2011	0	0	return isCNTRL_LC(c);
		0
2012			}
2013			return 0;
2014			}
2015
2016			bool
2017	0		Perl_is_uni_graph_lc(pTHX_ UV c)
2018			{
2019	0	0	if (c < 256) {
2020	0	0	return isGRAPH_LC(c);
		0
2021			}
2022	0		return _is_uni_FOO(_CC_GRAPH, c);
2023			}
2024
2025			bool
2026	0		Perl_is_uni_print_lc(pTHX_ UV c)
2027			{
2028	0	0	if (c < 256) {
2029	0	0	return isPRINT_LC(c);
		0
2030			}
2031	0		return _is_uni_FOO(_CC_PRINT, c);
2032			}
2033
2034			bool
2035	0		Perl_is_uni_punct_lc(pTHX_ UV c)
2036			{
2037	0	0	if (c < 256) {
2038	0	0	return isPUNCT_LC(c);
		0
2039			}
2040	0		return _is_uni_FOO(_CC_PUNCT, c);
2041			}
2042
2043			bool
2044	0		Perl_is_uni_xdigit_lc(pTHX_ UV c)
2045			{
2046	0	0	if (c < 256) {
2047	0	0	return isXDIGIT_LC(c);
		0
2048			}
2049	0	0	return isXDIGIT_uni(c);
		0
		0
		0
		0
		0
2050			}
2051
2052			U32
2053	0		Perl_to_uni_upper_lc(pTHX_ U32 c)
2054			{
2055			/* XXX returns only the first character -- do not use XXX */
2056			/* XXX no locale support yet */
2057			STRLEN len;
2058			U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2059	0		return (U32)to_uni_upper(c, tmpbuf, &len);
2060			}
2061
2062			U32
2063	0		Perl_to_uni_title_lc(pTHX_ U32 c)
2064			{
2065			/* XXX returns only the first character XXX -- do not use XXX */
2066			/* XXX no locale support yet */
2067			STRLEN len;
2068			U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2069	0		return (U32)to_uni_title(c, tmpbuf, &len);
2070			}
2071
2072			U32
2073	0		Perl_to_uni_lower_lc(pTHX_ U32 c)
2074			{
2075			/* XXX returns only the first character -- do not use XXX */
2076			/* XXX no locale support yet */
2077			STRLEN len;
2078			U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2079	0		return (U32)to_uni_lower(c, tmpbuf, &len);
2080			}
2081
2082			PERL_STATIC_INLINE bool
2083	410108		S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
2084			const char *const swashname)
2085			{
2086			/* returns a boolean giving whether or not the UTF8-encoded character that
2087			* starts at is in the swash indicated by .
2088			* contains a pointer to where the swash indicated by
2089			* is to be stored; which this routine will do, so that future calls will
2090			* look at <*swash> and only generate a swash if it is not null
2091			*
2092			* Note that it is assumed that the buffer length of is enough to
2093			* contain all the bytes that comprise the character. Thus, <*p> should
2094			* have been checked before this call for mal-formedness enough to assure
2095			* that. */
2096
2097			dVAR;
2098
2099			PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2100
2101			/* The API should have included a length for the UTF-8 character in ,
2102			* but it doesn't. We therefore assume that p has been validated at least
2103			* as far as there being enough bytes available in it to accommodate the
2104			* character without reading beyond the end, and pass that number on to the
2105			* validating routine */
2106	410108	100	if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) {
2107	4	50	if (ckWARN_d(WARN_UTF8)) {
2108	4		Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
2109			"Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
2110	4	50	if (ckWARN(WARN_UTF8)) { /* This will output details as to the
2111			what the malformation is */
2112	4	50	utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
2113			}
2114			}
2115			return FALSE;
2116			}
2117	410104	100	if (!*swash) {
2118	638		U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2119	638		*swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags);
2120			}
2121
2122	410106		return swash_fetch(*swash, p, TRUE) != 0;
2123			}
2124
2125			bool
2126	1530		Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2127			{
2128			dVAR;
2129
2130			PERL_ARGS_ASSERT__IS_UTF8_FOO;
2131
2132			assert(classnum < _FIRST_NON_SWASH_CC);
2133
2134	1530		return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]);
2135			}
2136
2137			bool
2138	0		Perl_is_utf8_alnum(pTHX_ const U8 *p)
2139			{
2140			dVAR;
2141
2142			PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
2143
2144			/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
2145			* descendant of isalnum(3), in other words, it doesn't
2146			* contain the '_'. --jhi */
2147	0		return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_WORDCHAR], "IsWord");
2148			}
2149
2150			bool
2151	0		Perl_is_utf8_alnumc(pTHX_ const U8 *p)
2152			{
2153			dVAR;
2154
2155			PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
2156
2157	0		return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHANUMERIC], "IsAlnum");
2158			}
2159
2160			bool
2161	0		Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
2162			{
2163			dVAR;
2164
2165			PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
2166
2167	0		return S_is_utf8_idfirst(aTHX_ p);
2168			}
2169
2170			bool
2171	0		Perl_is_utf8_xidfirst(pTHX_ const U8 p) / The naming is historical. */
2172			{
2173			dVAR;
2174
2175			PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
2176
2177	0	0	if (*p == '_')
2178			return TRUE;
2179			/* is_utf8_idstart would be more logical. */
2180	0		return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
2181			}
2182
2183			bool
2184	401724		Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
2185			{
2186			dVAR;
2187
2188			PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
2189
2190	401724		return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
2191			}
2192
2193			bool
2194	6840		Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2195			{
2196			dVAR;
2197
2198			PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2199
2200	6840		return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont");
2201			}
2202
2203
2204			bool
2205	0		Perl_is_utf8_idcont(pTHX_ const U8 *p)
2206			{
2207			dVAR;
2208
2209			PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
2210
2211	0		return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
2212			}
2213
2214			bool
2215	0		Perl_is_utf8_xidcont(pTHX_ const U8 *p)
2216			{
2217			dVAR;
2218
2219			PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
2220
2221	0		return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
2222			}
2223
2224			bool
2225	0		Perl_is_utf8_alpha(pTHX_ const U8 *p)
2226			{
2227			dVAR;
2228
2229			PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
2230
2231	0		return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHA], "IsAlpha");
2232			}
2233
2234			bool
2235	0		Perl_is_utf8_ascii(pTHX_ const U8 *p)
2236			{
2237			dVAR;
2238
2239			PERL_ARGS_ASSERT_IS_UTF8_ASCII;
2240
2241			/* ASCII characters are the same whether in utf8 or not. So the macro
2242			* works on both utf8 and non-utf8 representations. */
2243	0		return isASCII(*p);
2244			}
2245
2246			bool
2247	0		Perl_is_utf8_blank(pTHX_ const U8 *p)
2248			{
2249			dVAR;
2250
2251			PERL_ARGS_ASSERT_IS_UTF8_BLANK;
2252
2253	0	0	return isBLANK_utf8(p);
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
2254			}
2255
2256			bool
2257	0		Perl_is_utf8_space(pTHX_ const U8 *p)
2258			{
2259			dVAR;
2260
2261			PERL_ARGS_ASSERT_IS_UTF8_SPACE;
2262
2263	0	0	return isSPACE_utf8(p);
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
2264			}
2265
2266			bool
2267	0		Perl_is_utf8_perl_space(pTHX_ const U8 *p)
2268			{
2269			dVAR;
2270
2271			PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
2272
2273			/* Only true if is an ASCII space-like character, and ASCII is invariant
2274			* under utf8, so can just use the macro */
2275	0		return isSPACE_A(*p);
2276			}
2277
2278			bool
2279	0		Perl_is_utf8_perl_word(pTHX_ const U8 *p)
2280			{
2281			dVAR;
2282
2283			PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
2284
2285			/* Only true if is an ASCII word character, and ASCII is invariant
2286			* under utf8, so can just use the macro */
2287	0		return isWORDCHAR_A(*p);
2288			}
2289
2290			bool
2291	0		Perl_is_utf8_digit(pTHX_ const U8 *p)
2292			{
2293			dVAR;
2294
2295			PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
2296
2297	0		return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_DIGIT], "IsDigit");
2298			}
2299
2300			bool
2301	0		Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
2302			{
2303			dVAR;
2304
2305			PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
2306
2307			/* Only true if is an ASCII digit character, and ASCII is invariant
2308			* under utf8, so can just use the macro */
2309	0		return isDIGIT_A(*p);
2310			}
2311
2312			bool
2313	0		Perl_is_utf8_upper(pTHX_ const U8 *p)
2314			{
2315			dVAR;
2316
2317			PERL_ARGS_ASSERT_IS_UTF8_UPPER;
2318
2319	0		return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_UPPER], "IsUppercase");
2320			}
2321
2322			bool
2323	0		Perl_is_utf8_lower(pTHX_ const U8 *p)
2324			{
2325			dVAR;
2326
2327			PERL_ARGS_ASSERT_IS_UTF8_LOWER;
2328
2329	0		return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_LOWER], "IsLowercase");
2330			}
2331
2332			bool
2333	0		Perl_is_utf8_cntrl(pTHX_ const U8 *p)
2334			{
2335			dVAR;
2336
2337			PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
2338
2339	0	0	return isCNTRL_utf8(p);
		0
		0
		0
2340			}
2341
2342			bool
2343	0		Perl_is_utf8_graph(pTHX_ const U8 *p)
2344			{
2345			dVAR;
2346
2347			PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
2348
2349	0		return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_GRAPH], "IsGraph");
2350			}
2351
2352			bool
2353	0		Perl_is_utf8_print(pTHX_ const U8 *p)
2354			{
2355			dVAR;
2356
2357			PERL_ARGS_ASSERT_IS_UTF8_PRINT;
2358
2359	0		return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PRINT], "IsPrint");
2360			}
2361
2362			bool
2363	0		Perl_is_utf8_punct(pTHX_ const U8 *p)
2364			{
2365			dVAR;
2366
2367			PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
2368
2369	0		return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PUNCT], "IsPunct");
2370			}
2371
2372			bool
2373	0		Perl_is_utf8_xdigit(pTHX_ const U8 *p)
2374			{
2375			dVAR;
2376
2377			PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
2378
2379	0	0	return is_XDIGIT_utf8(p);
		0
		0
		0
		0
		0
		0
2380			}
2381
2382			bool
2383	14		Perl__is_utf8_mark(pTHX_ const U8 *p)
2384			{
2385			dVAR;
2386
2387			PERL_ARGS_ASSERT__IS_UTF8_MARK;
2388
2389	14		return is_utf8_common(p, &PL_utf8_mark, "IsM");
2390			}
2391
2392
2393			bool
2394	0		Perl_is_utf8_mark(pTHX_ const U8 *p)
2395			{
2396			dVAR;
2397
2398			PERL_ARGS_ASSERT_IS_UTF8_MARK;
2399
2400	0		return _is_utf8_mark(p);
2401			}
2402
2403			/*
2404			=for apidoc to_utf8_case
2405
2406			The C contains the pointer to the UTF-8 string encoding
2407			the character that is being converted. This routine assumes that the character
2408			at C is well-formed.
2409
2410			The C is a pointer to the character buffer to put the
2411			conversion result to. The C is a pointer to the length
2412			of the result.
2413
2414			The C is a pointer to the swash to use.
2415
2416			Both the special and normal mappings are stored in F,
2417			and loaded by SWASHNEW, using F. The C (usually,
2418			but not always, a multicharacter mapping), is tried first.
2419
2420			The C is a string like "utf8::ToSpecLower", which means the
2421			hash %utf8::ToSpecLower. The access to the hash is through
2422			Perl_to_utf8_case().
2423
2424			The C is a string like "ToLower" which means the swash
2425			%utf8::ToLower.
2426
2427			=cut */
2428
2429			UV
2430	2406114		Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
2431			SV *swashp, const char normal, const char *special)
2432			{
2433			dVAR;
2434	2406114		STRLEN len = 0;
2435	2406114		const UV uv1 = valid_utf8_to_uvchr(p, NULL);
2436
2437			PERL_ARGS_ASSERT_TO_UTF8_CASE;
2438
2439			/* Note that swash_fetch() doesn't output warnings for these because it
2440			* assumes we will */
2441	2406114	100	if (uv1 >= UNICODE_SURROGATE_FIRST) {
2442	586434	100	if (uv1 <= UNICODE_SURROGATE_LAST) {
2443	50	100	if (ckWARN_d(WARN_SURROGATE)) {
2444	36	50	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
		50
		0
2445	36		Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2446			"Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2447			}
2448			}
2449	586384	100	else if (UNICODE_IS_SUPER(uv1)) {
2450	54	100	if (ckWARN_d(WARN_NON_UNICODE)) {
2451	28	50	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
		50
		0
2452	28		Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2453			"Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2454			}
2455			}
2456
2457			/* Note that non-characters are perfectly legal, so no warning should
2458			* be given */
2459			}
2460
2461	2406082	100	if (!swashp) / load on-demand */
2462	100		*swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
2463
2464	2406082	100	if (special) {
2465			/* It might be "special" (sometimes, but not always,
2466			* a multicharacter mapping) */
2467	2078270		HV * const hv = get_hv(special, 0);
2468			SV **svp;
2469
2470	4156540	50	if (hv &&
		100
2471	3288656	50	(svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) &&
		100
		100
		100
		50
		50
		50
		50
2472	1210386		(*svp)) {
2473			const char *s;
2474
2475	1210386	50	s = SvPV_const(*svp, len);
2476	1210386	50	if (len == 1)
2477			/* EIGHTBIT */
2478	0		len = uvchr_to_utf8(ustrp, (U8)s) - ustrp;
2479			else {
2480	1210386		Copy(s, ustrp, len, U8);
2481			}
2482			}
2483			}
2484
2485	2406082	100	if (!len && *swashp) {
		50
2486	1195696		const UV uv2 = swash_fetch(swashp, p, TRUE / => is utf8 */);
2487
2488	1195696	100	if (uv2) {
2489			/* It was "normal" (a single character mapping). */
2490	779284		len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2491			}
2492			}
2493
2494	2406082	100	if (len) {
2495	1989670	50	if (lenp) {
2496	1989670		*lenp = len;
2497			}
2498	1989670		return valid_utf8_to_uvchr(ustrp, 0);
2499			}
2500
2501			/* Here, there was no mapping defined, which means that the code point maps
2502			* to itself. Return the inputs */
2503	416412		len = UTF8SKIP(p);
2504	416412	100	if (p != ustrp) { /* Don't copy onto itself */
2505	313988		Copy(p, ustrp, len, U8);
2506			}
2507
2508	416412	100	if (lenp)
2509	1411238		*lenp = len;
2510
2511			return uv1;
2512
2513			}
2514
2515			STATIC UV
2516	235496		S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
2517			{
2518			/* This is called when changing the case of a utf8-encoded character above
2519			* the Latin1 range, and the operation is in locale. If the result
2520			* contains a character that crosses the 255/256 boundary, disallow the
2521			* change, and return the original code point. See L for why;
2522			*
2523			* p points to the original string whose case was changed; assumed
2524			* by this routine to be well-formed
2525			* result the code point of the first character in the changed-case string
2526			* ustrp points to the changed-case string ( represents its first char)
2527			* lenp points to the length of */
2528
2529			UV original; /* To store the first code point of */
2530
2531			PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2532
2533			assert(UTF8_IS_ABOVE_LATIN1(*p));
2534
2535			/* We know immediately if the first character in the string crosses the
2536			* boundary, so can skip */
2537	235496	100	if (result > 255) {
2538
2539			/* Look at every character in the result; if any cross the
2540			* boundary, the whole thing is disallowed */
2541	71244		U8* s = ustrp + UTF8SKIP(ustrp);
2542	71244		U8* e = ustrp + *lenp;
2543	106866	100	while (s < e) {
2544	5992	50	if (! UTF8_IS_ABOVE_LATIN1(*s)) {
2545			goto bad_crossing;
2546			}
2547	0		s += UTF8SKIP(s);
2548			}
2549
2550			/* Here, no characters crossed, result is ok as-is */
2551			return result;
2552			}
2553
2554			bad_crossing:
2555
2556			/* Failed, have to return the original */
2557	170244		original = valid_utf8_to_uvchr(p, lenp);
2558	170244		Copy(p, ustrp, *lenp, char);
2559	202870		return original;
2560			}
2561
2562			/*
2563			=for apidoc to_utf8_upper
2564
2565			Instead use L.
2566
2567			=cut */
2568
2569			/* Not currently externally documented, and subject to change:
2570			* is set iff locale semantics are to be used for code points < 256
2571			* if non-null, *tainted_ptr will be set TRUE iff locale rules
2572			* were used in the calculation; otherwise unchanged. */
2573
2574			UV
2575	183772		Perl__to_utf8_upper_flags(pTHX_ const U8 p, U8 ustrp, STRLEN lenp, const bool flags, bool tainted_ptr)
2576			{
2577			dVAR;
2578
2579			UV result;
2580
2581			PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
2582
2583	183772	100	if (UTF8_IS_INVARIANT(*p)) {
2584	29730	50	if (flags) {
2585	0		result = toUPPER_LC(*p);
2586			}
2587			else {
2588	29730		return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
2589			}
2590			}
2591	154042	100	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2592	27982	100	if (flags) {
2593	2	50	result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)));
2594			}
2595			else {
2596	27980		return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)),
2597			ustrp, lenp, 'S');
2598			}
2599			}
2600			else { /* utf8, ord above 255 */
2601	126060		result = CALL_UPPER_CASE(p, ustrp, lenp);
2602
2603	126040	50	if (flags) {
2604	0		result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2605			}
2606	126040		return result;
2607			}
2608
2609			/* Here, used locale rules. Convert back to utf8 */
2610	2	50	if (UTF8_IS_INVARIANT(result)) {
2611	0		*ustrp = (U8) result;
2612	0		*lenp = 1;
2613			}
2614			else {
2615	2		*ustrp = UTF8_EIGHT_BIT_HI(result);
2616	2		*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2617	2		*lenp = 2;
2618			}
2619
2620	2	50	if (tainted_ptr) {
2621	91877		*tainted_ptr = TRUE;
2622			}
2623			return result;
2624			}
2625
2626			/*
2627			=for apidoc to_utf8_title
2628
2629			Instead use L.
2630
2631			=cut */
2632
2633			/* Not currently externally documented, and subject to change:
2634			* is set iff locale semantics are to be used for code points < 256
2635			* Since titlecase is not defined in POSIX, uppercase is used instead
2636			* for these/
2637			* if non-null, *tainted_ptr will be set TRUE iff locale rules
2638			* were used in the calculation; otherwise unchanged. */
2639
2640			UV
2641	10196		Perl__to_utf8_title_flags(pTHX_ const U8 p, U8 ustrp, STRLEN lenp, const bool flags, bool tainted_ptr)
2642			{
2643			dVAR;
2644
2645			UV result;
2646
2647			PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
2648
2649	10196	100	if (UTF8_IS_INVARIANT(*p)) {
2650	982	50	if (flags) {
2651	0		result = toUPPER_LC(*p);
2652			}
2653			else {
2654	982		return _to_upper_title_latin1(*p, ustrp, lenp, 's');
2655			}
2656			}
2657	9214	100	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2658	786	100	if (flags) {
2659	2	50	result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)));
2660			}
2661			else {
2662	784		return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)),
2663			ustrp, lenp, 's');
2664			}
2665			}
2666			else { /* utf8, ord above 255 */
2667	8428		result = CALL_TITLE_CASE(p, ustrp, lenp);
2668
2669	8422	50	if (flags) {
2670	0		result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2671			}
2672	8422		return result;
2673			}
2674
2675			/* Here, used locale rules. Convert back to utf8 */
2676	2	50	if (UTF8_IS_INVARIANT(result)) {
2677	0		*ustrp = (U8) result;
2678	0		*lenp = 1;
2679			}
2680			else {
2681	2		*ustrp = UTF8_EIGHT_BIT_HI(result);
2682	2		*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2683	2		*lenp = 2;
2684			}
2685
2686	2	50	if (tainted_ptr) {
2687	5096		*tainted_ptr = TRUE;
2688			}
2689			return result;
2690			}
2691
2692			/*
2693			=for apidoc to_utf8_lower
2694
2695			Instead use L.
2696
2697			=cut */
2698
2699			/* Not currently externally documented, and subject to change:
2700			* is set iff locale semantics are to be used for code points < 256
2701			* if non-null, *tainted_ptr will be set TRUE iff locale rules
2702			* were used in the calculation; otherwise unchanged. */
2703
2704			UV
2705	192456		Perl__to_utf8_lower_flags(pTHX_ const U8 p, U8 ustrp, STRLEN lenp, const bool flags, bool tainted_ptr)
2706			{
2707			UV result;
2708
2709			dVAR;
2710
2711			PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
2712
2713	192456	100	if (UTF8_IS_INVARIANT(*p)) {
2714	31466	100	if (flags) {
2715	256		result = toLOWER_LC(*p);
2716			}
2717			else {
2718	46815		return to_lower_latin1(*p, ustrp, lenp);
2719			}
2720			}
2721	160990	100	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2722	28714	100	if (flags) {
2723	260	50	result = toLOWER_LC(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)));
2724			}
2725			else {
2726	42681		return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)),
2727			ustrp, lenp);
2728			}
2729			}
2730			else { /* utf8, ord above 255 */
2731	132276		result = CALL_LOWER_CASE(p, ustrp, lenp);
2732
2733	132270	50	if (flags) {
2734	0		result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2735			}
2736
2737	132270		return result;
2738			}
2739
2740			/* Here, used locale rules. Convert back to utf8 */
2741	516	100	if (UTF8_IS_INVARIANT(result)) {
2742	256		*ustrp = (U8) result;
2743	256		*lenp = 1;
2744			}
2745			else {
2746	260		*ustrp = UTF8_EIGHT_BIT_HI(result);
2747	260		*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2748	260		*lenp = 2;
2749			}
2750
2751	516	50	if (tainted_ptr) {
2752	96483		*tainted_ptr = TRUE;
2753			}
2754			return result;
2755			}
2756
2757			/*
2758			=for apidoc to_utf8_fold
2759
2760			Instead use L.
2761
2762			=cut */
2763
2764			/* Not currently externally documented, and subject to change,
2765			* in
2766			* bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
2767			* points < 256. Since foldcase is not defined in
2768			* POSIX, lowercase is used instead
2769			* bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2770			* otherwise simple folds
2771			* bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2772			* prohibited
2773			* if non-null, *tainted_ptr will be set TRUE iff locale rules
2774			* were used in the calculation; otherwise unchanged. */
2775
2776			UV
2777	1282382		Perl__to_utf8_fold_flags(pTHX_ const U8 p, U8 ustrp, STRLEN lenp, U8 flags, bool tainted_ptr)
2778			{
2779			dVAR;
2780
2781			UV result;
2782
2783			PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
2784
2785			/* These are mutually exclusive */
2786			assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2787
2788			assert(p != ustrp); /* Otherwise overwrites */
2789
2790	1282382	100	if (UTF8_IS_INVARIANT(*p)) {
2791	68476	100	if (flags & FOLD_FLAGS_LOCALE) {
2792	256		result = toFOLD_LC(*p);
2793			}
2794			else {
2795	68220		return _to_fold_latin1(*p, ustrp, lenp,
2796			flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
2797			}
2798			}
2799	1213906	100	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2800	55790	100	if (flags & FOLD_FLAGS_LOCALE) {
2801	256	50	result = toFOLD_LC(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)));
2802			}
2803			else {
2804	55534		return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)),
2805			ustrp, lenp,
2806			flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
2807			}
2808			}
2809			else { /* utf8, ord above 255 */
2810	1158116	100	result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
2811
2812	1158116	100	if (flags & FOLD_FLAGS_LOCALE) {
2813
2814			/* Special case these characters, as what normally gets returned
2815			* under locale doesn't work */
2816	327246	100	if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
2817	259094	100	&& memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
2818			sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
2819			{
2820			goto return_long_s;
2821			}
2822	277472	100	else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1
2823	209320	100	&& memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8,
2824			sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1))
2825			{
2826			goto return_ligature_st;
2827			}
2828	235496		return check_locale_boundary_crossing(p, result, ustrp, lenp);
2829			}
2830	830870	100	else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2831			return result;
2832			}
2833			else {
2834			/* This is called when changing the case of a utf8-encoded
2835			* character above the ASCII range, and the result should not
2836			* contain an ASCII character. */
2837
2838			UV original; /* To store the first code point of */
2839
2840			/* Look at every character in the result; if any cross the
2841			* boundary, the whole thing is disallowed */
2842			U8* s = ustrp;
2843	312882		U8* e = ustrp + *lenp;
2844	543507	100	while (s < e) {
2845	318878	100	if (isASCII(*s)) {
2846			/* Crossed, have to return the original */
2847	244694		original = valid_utf8_to_uvchr(p, lenp);
2848
2849			/* But in these instances, there is an alternative we can
2850			* return that is valid */
2851	367041	100	if (original == LATIN_CAPITAL_LETTER_SHARP_S
2852	244694		\|\| original == LATIN_SMALL_LETTER_SHARP_S)
2853			{
2854			goto return_long_s;
2855			}
2856	196330	100	else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
2857			goto return_ligature_st;
2858			}
2859	155300		Copy(p, ustrp, *lenp, char);
2860	155300		return original;
2861			}
2862	74184		s += UTF8SKIP(s);
2863			}
2864
2865			/* Here, no characters crossed, result is ok as-is */
2866			return result;
2867			}
2868			}
2869
2870			/* Here, used locale rules. Convert back to utf8 */
2871	512	100	if (UTF8_IS_INVARIANT(result)) {
2872	256		*ustrp = (U8) result;
2873	256		*lenp = 1;
2874			}
2875			else {
2876	256		*ustrp = UTF8_EIGHT_BIT_HI(result);
2877	256		*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2878	256		*lenp = 2;
2879			}
2880
2881	512	50	if (tainted_ptr) {
2882	512		*tainted_ptr = TRUE;
2883			}
2884			return result;
2885
2886			return_long_s:
2887			/* Certain folds to 'ss' are prohibited by the options, but they do allow
2888			* folds to a string of two of these characters. By returning this
2889			* instead, then, e.g.,
2890			* fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2891			* works. */
2892
2893	98138		lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2894	98138		Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2895			ustrp, *lenp, U8);
2896	98138		return LATIN_SMALL_LETTER_LONG_S;
2897
2898			return_ligature_st:
2899			/* Two folds to 'st' are prohibited by the options; instead we pick one and
2900			* have the other one fold to it */
2901
2902	83006		*lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
2903	83006		Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
2904	682694		return LATIN_SMALL_LIGATURE_ST;
2905			}
2906
2907			/* Note:
2908			* Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
2909			* C is a pointer to a package name for SWASHNEW, should be "utf8".
2910			* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2911			*/
2912
2913			SV*
2914	182		Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
2915			{
2916			PERL_ARGS_ASSERT_SWASH_INIT;
2917
2918			/* Returns a copy of a swash initiated by the called function. This is the
2919			* public interface, and returning a copy prevents others from doing
2920			* mischief on the original */
2921
2922	182		return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
2923			}
2924
2925			SV*
2926	637998		Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV listsv, I32 minbits, I32 none, SV invlist, U8* const flags_p)
2927			{
2928			/* Initialize and return a swash, creating it if necessary. It does this
2929			* by calling utf8_heavy.pl in the general case. The returned value may be
2930			* the swash's inversion list instead if the input parameters allow it.
2931			* Which is returned should be immaterial to callers, as the only
2932			* operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
2933			* and swash_to_invlist() handle both these transparently.
2934			*
2935			* This interface should only be used by functions that won't destroy or
2936			* adversely change the swash, as doing so affects all other uses of the
2937			* swash in the program; the general public should use 'Perl_swash_init'
2938			* instead.
2939			*
2940			* pkg is the name of the package that should be in.
2941			* name is the name of the swash to find. Typically it is a Unicode
2942			* property name, including user-defined ones
2943			* listsv is a string to initialize the swash with. It must be of the form
2944			* documented as the subroutine return value in
2945			* L
2946			* minbits is the number of bits required to represent each data element.
2947			* It is '1' for binary properties.
2948			* none I (khw) do not understand this one, but it is used only in tr///.
2949			* invlist is an inversion list to initialize the swash with (or NULL)
2950			* flags_p if non-NULL is the address of various input and output flag bits
2951			* to the routine, as follows: ('I' means is input to the routine;
2952			* 'O' means output from the routine. Only flags marked O are
2953			* meaningful on return.)
2954			* _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
2955			* came from a user-defined property. (I O)
2956			* _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
2957			* when the swash cannot be located, to simply return NULL. (I)
2958			* _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
2959			* return of an inversion list instead of a swash hash if this routine
2960			* thinks that would result in faster execution of swash_fetch() later
2961			* on. (I)
2962			*
2963			* Thus there are three possible inputs to find the swash: ,
2964			* , and . At least one must be specified. The result
2965			* will be the union of the specified ones, although 's various
2966			* actions can intersect, etc. what gives.
2967			*
2968			* is only valid for binary properties */
2969
2970			dVAR;
2971			SV* retval = &PL_sv_undef;
2972			HV* swash_hv = NULL;
2973			const int invlist_swash_boundary =
2974	637844	100	(flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
2975			? 512 /* Based on some benchmarking, but not extensive, see commit
2976			message */
2977	637998	100	: -1; /* Never return just an inversion list */
2978
2979			assert(listsv != &PL_sv_undef \|\| strNE(name, "") \|\| invlist);
2980			assert(! invlist \|\| minbits == 1);
2981
2982			/* If data was passed in to go out to utf8_heavy to find the swash of, do
2983			* so */
2984	871652	100	if (listsv != &PL_sv_undef \|\| strNE(name, "")) {
		100
		50
2985	233654		dSP;
2986	233654		const size_t pkg_len = strlen(pkg);
2987	233654		const size_t name_len = strlen(name);
2988	233654		HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2989			SV* errsv_save;
2990			GV *method;
2991
2992			PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2993
2994	233654	100	PUSHSTACKi(PERLSI_MAGIC);
2995	233654		ENTER;
2996	233654		SAVEHINTS();
2997	233654		save_re_context();
2998			/* We might get here via a subroutine signature which uses a utf8
2999			* parameter name, at which point PL_subname will have been set
3000			* but not yet used. */
3001	233654		save_item(PL_subname);
3002	233654	100	if (PL_parser && PL_parser->error_count)
		100
3003	2		SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3004	233654		method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
3005	233654	100	if (!method) { /* demand load utf8 */
3006	1104		ENTER;
3007	1104	50	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3008	1104		GvSV(PL_errgv) = NULL;
3009			/* It is assumed that callers of this routine are not passing in
3010			* any user derived data. */
3011			/* Need to do this after save_re_context() as it will set
3012			* PL_tainted to 1 while saving $1 etc (see the code after getrx:
3013			* in Perl_magic_get). Even line to create errsv_save can turn on
3014			* PL_tainted. */
3015			#ifndef NO_TAINT_SUPPORT
3016	1104		SAVEBOOL(TAINT_get);
3017	1104		TAINT_NOT;
3018			#endif
3019	1104		Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3020			NULL);
3021			{
3022			/* Not ERRSV, as there is no need to vivify a scalar we are
3023			about to discard. */
3024	1104		SV * const errsv = GvSV(PL_errgv);
3025	1104	50	if (!SvTRUE(errsv)) {
		50
		0
		50
		0
		0
		50
		50
		50
		50
		0
		50
		0
		0
		0
		0
		0
		0
		0
3026	2208		GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3027	1104		SvREFCNT_dec(errsv);
3028			}
3029			}
3030	1104		LEAVE;
3031			}
3032	233654		SPAGAIN;
3033	233654	50	PUSHMARK(SP);
3034	116827		EXTEND(SP,5);
3035	233654		mPUSHp(pkg, pkg_len);
3036	233654		mPUSHp(name, name_len);
3037	233654		PUSHs(listsv);
3038	233654		mPUSHi(minbits);
3039	233654		mPUSHi(none);
3040	233654		PUTBACK;
3041	233654	50	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3042	233654		GvSV(PL_errgv) = NULL;
3043			/* If we already have a pointer to the method, no need to use
3044			* call_method() to repeat the lookup. */
3045	467308	100	if (method
		50
3046	232550		? call_sv(MUTABLE_SV(method), G_SCALAR)
3047	1104		: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
3048			{
3049	233654		retval = *PL_stack_sp--;
3050			SvREFCNT_inc(retval);
3051			}
3052			{
3053			/* Not ERRSV. See above. */
3054	233654		SV * const errsv = GvSV(PL_errgv);
3055	233654	100	if (!SvTRUE(errsv)) {
		50
		0
		100
		50
		50
		50
		50
		50
		50
		0
		50
		0
		0
		0
		0
		0
		0
		0
3056	467308		GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3057	233654		SvREFCNT_dec(errsv);
3058			}
3059			}
3060	233654		LEAVE;
3061	233654	50	POPSTACK;
3062	233654	100	if (IN_PERL_COMPILETIME) {
3063	181458		CopHINTS_set(PL_curcop, PL_hints);
3064			}
3065	233654	100	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
		50
3066	50258	50	if (SvPOK(retval))
3067
3068			/* If caller wants to handle missing properties, let them */
3069	50258	50	if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
		100
3070			return NULL;
3071			}
3072	25126		Perl_croak(aTHX_
3073			"Can't find Unicode property definition \"%"SVf"\"",
3074			SVfARG(retval));
3075			Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
3076			}
3077			} /* End of calling the module to find the swash */
3078
3079			/* If this operation fetched a swash, and we will need it later, get it */
3080	587740	100	if (retval != &PL_sv_undef
3081	183396	100	&& (minbits == 1 \|\| (flags_p
		50
3082	0	0	&& ! (*flags_p
3083	0		& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3084			{
3085	183172		swash_hv = MUTABLE_HV(SvRV(retval));
3086
3087			/* If we don't already know that there is a user-defined component to
3088			* this swash, and the user has indicated they wish to know if there is
3089			* one (by passing ), find out */
3090	183172	100	if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
		50
3091	183088		SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3092	183088	50	if (user_defined && SvUV(*user_defined)) {
		50
		100
3093	130		*flags_p \|= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3094			}
3095			}
3096			}
3097
3098			/* Make sure there is an inversion list for binary properties */
3099	587740	100	if (minbits == 1) {
3100			SV** swash_invlistsvp = NULL;
3101	587516		SV* swash_invlist = NULL;
3102			bool invlist_in_swash_is_valid = FALSE;
3103			bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3104			an unclaimed reference count */
3105
3106			/* If this operation fetched a swash, get its already existing
3107			* inversion list, or create one for it */
3108
3109	587516	100	if (swash_hv) {
3110	183172		swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
3111	183172	100	if (swash_invlistsvp) {
3112	177752		swash_invlist = *swash_invlistsvp;
3113			invlist_in_swash_is_valid = TRUE;
3114			}
3115			else {
3116	5420		swash_invlist = _swash_to_invlist(retval);
3117			swash_invlist_unclaimed = TRUE;
3118			}
3119			}
3120
3121			/* If an inversion list was passed in, have to include it */
3122	587516	100	if (invlist) {
3123
3124			/* Any fetched swash will by now have an inversion list in it;
3125			* otherwise will be NULL, indicating that we
3126			* didn't fetch a swash */
3127	405002	100	if (swash_invlist) {
3128
3129			/* Add the passed-in inversion list, which invalidates the one
3130			* already stored in the swash */
3131			invlist_in_swash_is_valid = FALSE;
3132	658		_invlist_union(invlist, swash_invlist, &swash_invlist);
3133			}
3134			else {
3135
3136			/* Here, there is no swash already. Set up a minimal one, if
3137			* we are going to return a swash */
3138	404344	100	if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
3139	6550		swash_hv = newHV();
3140	6550		retval = newRV_noinc(MUTABLE_SV(swash_hv));
3141			}
3142	404344		swash_invlist = invlist;
3143			}
3144			}
3145
3146			/* Here, we have computed the union of all the passed-in data. It may
3147			* be that there was an inversion list in the swash which didn't get
3148			* touched; otherwise save the one computed one */
3149	587516	100	if (! invlist_in_swash_is_valid
3150	614646	100	&& (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3151			{
3152	11642	50	if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
3153			{
3154	0		Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3155			}
3156			/* We just stole a reference count. */
3157	11642	100	if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3158	6550		else SvREFCNT_inc_simple_void_NN(swash_invlist);
3159			}
3160
3161			/* Use the inversion list stand-alone if small enough */
3162	881274	100	if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3163	398126		SvREFCNT_dec(retval);
3164	398126	100	if (!swash_invlist_unclaimed)
3165	397798		SvREFCNT_inc_simple_void_NN(swash_invlist);
3166	398126		retval = newRV_noinc(swash_invlist);
3167			}
3168			}
3169
3170	600306		return retval;
3171			}
3172
3173
3174			/* This API is wrong for special case conversions since we may need to
3175			* return several Unicode characters for a single Unicode character
3176			* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3177			* the lower-level routine, and it is similarly broken for returning
3178			* multiple values. --jhi
3179			* For those, you should use to_utf8_case() instead */
3180			/* Now SWASHGET is recasted into S_swatch_get in this file. */
3181
3182			/* Note:
3183			* Returns the value of property/mapping C for the first character
3184			* of the string C. If C is true, the string C is
3185			* assumed to be in well-formed utf8. If C is false, the string C
3186			* is assumed to be in native 8-bit encoding. Caches the swatch in C.
3187			*
3188			* A "swash" is a hash which contains initially the keys/values set up by
3189			* SWASHNEW. The purpose is to be able to completely represent a Unicode
3190			* property for all possible code points. Things are stored in a compact form
3191			* (see utf8_heavy.pl) so that calculation is required to find the actual
3192			* property value for a given code point. As code points are looked up, new
3193			* key/value pairs are added to the hash, so that the calculation doesn't have
3194			* to ever be re-done. Further, each calculation is done, not just for the
3195			* desired one, but for a whole block of code points adjacent to that one.
3196			* For binary properties on ASCII machines, the block is usually for 64 code
3197			* points, starting with a code point evenly divisible by 64. Thus if the
3198			* property value for code point 257 is requested, the code goes out and
3199			* calculates the property values for all 64 code points between 256 and 319,
3200			* and stores these as a single 64-bit long bit vector, called a "swatch",
3201			* under the key for code point 256. The key is the UTF-8 encoding for code
3202			* point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3203			* for a code point is 13 bytes, the key will be 12 bytes long. If the value
3204			* for code point 258 is then requested, this code realizes that it would be
3205			* stored under the key for 256, and would find that value and extract the
3206			* relevant bit, offset from 256.
3207			*
3208			* Non-binary properties are stored in as many bits as necessary to represent
3209			* their values (32 currently, though the code is more general than that), not
3210			* as single bits, but the principal is the same: the value for each key is a
3211			* vector that encompasses the property values for all code points whose UTF-8
3212			* representations are represented by the key. That is, for all code points
3213			* whose UTF-8 representations are length N bytes, and the key is the first N-1
3214			* bytes of that.
3215			*/
3216			UV
3217	9535344		Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
3218			{
3219			dVAR;
3220	9535344		HV *const hv = MUTABLE_HV(SvRV(swash));
3221			U32 klen;
3222			U32 off;
3223			STRLEN slen;
3224			STRLEN needents;
3225			const U8 *tmps = NULL;
3226			U32 bit;
3227			SV *swatch;
3228	9535344		const U8 c = *ptr;
3229
3230			PERL_ARGS_ASSERT_SWASH_FETCH;
3231
3232			/* If it really isn't a hash, it isn't really swash; must be an inversion
3233			* list */
3234	9535344	100	if (SvTYPE(hv) != SVt_PVHV) {
3235	2874804	50	return _invlist_contains_cp((SV*)hv,
3236			(do_utf8)
3237			? valid_utf8_to_uvchr(ptr, NULL)
3238			: c);
3239			}
3240
3241			/* We store the values in a "swatch" which is a vec() value in a swash
3242			* hash. Code points 0-255 are a single vec() stored with key length
3243			* (klen) 0. All other code points have a UTF-8 representation
3244			* 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3245			* share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3246			* length for them is the length of the encoded char - 1. ptr[klen] is the
3247			* final byte in the sequence representing the character */
3248	8097942	50	if (!do_utf8 \|\| UTF8_IS_INVARIANT(c)) {
		100
3249			klen = 0;
3250			needents = 256;
3251	14106		off = c;
3252			}
3253	8083836	100	else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3254			klen = 0;
3255			needents = 256;
3256	3928		off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
3257			}
3258			else {
3259	8079908		klen = UTF8SKIP(ptr) - 1;
3260
3261			/* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3262			* the vec is the final byte in the sequence. (In EBCDIC this is
3263			* converted to I8 to get consecutive values.) To help you visualize
3264			* all this:
3265			* Straight 1047 After final byte
3266			* UTF-8 UTF-EBCDIC I8 transform
3267			* U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3268			* U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3269			* ...
3270			* U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3271			* U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3272			* ...
3273			* U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3274			* U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3275			* ...
3276			* U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3277			* U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3278			* ...
3279			* U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3280			* U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3281			*
3282			* (There are no discontinuities in the elided (...) entries.)
3283			* The UTF-8 key for these 33 code points is '\xD0' (which also is the
3284			* key for the next 31, up through U+043F, whose UTF-8 final byte is
3285			* \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3286			* The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3287			* index into the vec() swatch (after subtracting 0x80, which we
3288			* actually do with an '&').
3289			* In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3290			* code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3291			* dicontinuities which go away by transforming it into I8, and we
3292			* effectively subtract 0xA0 to get the index. */
3293			needents = (1 << UTF_ACCUMULATION_SHIFT);
3294	8079908		off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
3295			}
3296
3297			/*
3298			* This single-entry cache saves about 1/3 of the utf8 overhead in test
3299			* suite. (That is, only 7-8% overall over just a hash cache. Still,
3300			* it's nothing to sniff at.) Pity we usually come through at least
3301			* two function calls to get here...
3302			*
3303			* NB: this code assumes that swatches are never modified, once generated!
3304			*/
3305
3306	12128315	100	if (hv == PL_last_swash_hv &&
		100
3307	12021991	100	klen == PL_last_swash_klen &&
3308	7982572	100	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
3309			{
3310	7832052		tmps = PL_last_swash_tmps;
3311	7832052		slen = PL_last_swash_slen;
3312			}
3313			else {
3314			/* Try our second-level swatch cache, kept in a hash. */
3315	265890		SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
3316
3317			/* If not cached, generate it via swatch_get */
3318	265890	100	if (!svp \|\| !SvPOK(*svp)
		50
3319	244590	50	\|\| !(tmps = (const U8)SvPV_const(svp, slen)))
		50
3320			{
3321	21300	100	if (klen) {
3322	20738		const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3323	20738		swatch = swatch_get(swash,
3324			code_point & ~((UV)needents - 1),
3325			needents);
3326			}
3327			else { /* For the first 256 code points, the swatch has a key of
3328			length 0 */
3329	562		swatch = swatch_get(swash, 0, needents);
3330			}
3331
3332	21300	100	if (IN_PERL_COMPILETIME)
3333	4918		CopHINTS_set(PL_curcop, PL_hints);
3334
3335	21300		svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
3336
3337	21300	50	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
		50
		50
3338	21300	50	\|\| (slen << 3) < needents)
3339	0		Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3340			"svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3341			svp, tmps, (UV)slen, (UV)needents);
3342			}
3343
3344	265890		PL_last_swash_hv = hv;
3345			assert(klen <= sizeof(PL_last_swash_key));
3346	265890		PL_last_swash_klen = (U8)klen;
3347			/* FIXME change interpvar.h? */
3348	265890		PL_last_swash_tmps = (U8 *) tmps;
3349	265890		PL_last_swash_slen = slen;
3350	265890	100	if (klen)
3351	256902		Copy(ptr, PL_last_swash_key, klen, U8);
3352			}
3353
3354	8097942		switch ((int)((slen << 3) / needents)) {
3355			case 1:
3356	6899056		bit = 1 << (off & 7);
3357	6899056		off >>= 3;
3358	6899056		return (tmps[off] & bit) != 0;
3359			case 8:
3360	70		return tmps[off];
3361			case 16:
3362	2986		off <<= 1;
3363	2986		return (tmps[off] << 8) + tmps[off + 1] ;
3364			case 32:
3365	1195830		off <<= 2;
3366	1195830		return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
3367			}
3368	4767672		Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3369			"slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
3370			NORETURN_FUNCTION_END;
3371			}
3372
3373			/* Read a single line of the main body of the swash input text. These are of
3374			* the form:
3375			* 0053 0056 0073
3376			* where each number is hex. The first two numbers form the minimum and
3377			* maximum of a range, and the third is the value associated with the range.
3378			* Not all swashes should have a third number
3379			*
3380			* On input: l points to the beginning of the line to be examined; it points
3381			* to somewhere in the string of the whole input text, and is
3382			* terminated by a \n or the null string terminator.
3383			* lend points to the null terminator of that string
3384			* wants_value is non-zero if the swash expects a third number
3385			* typestr is the name of the swash's mapping, like 'ToLower'
3386			* On output: min, max, and *val are set to the values read from the line.
3387			* returns a pointer just beyond the line examined. If there was no
3388			* valid min number on the line, returns lend+1
3389			*/
3390
3391			STATIC U8*
3392	2303340		S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3393			const bool wants_value, const U8* const typestr)
3394			{
3395	2303340	100	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
		100
3396			STRLEN numlen; /* Length of the number */
3397	2303340		I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3398			\| PERL_SCAN_DISALLOW_PREFIX
3399			\| PERL_SCAN_SILENT_NON_PORTABLE;
3400
3401			/* nl points to the next \n in the scan */
3402	2303340		U8* const nl = (U8*)memchr(l, '\n', lend - l);
3403
3404			/* Get the first number on the line: the range minimum */
3405	2303340		numlen = lend - l;
3406	2303340		min = grok_hex((char )l, &numlen, &flags, NULL);
3407	2303340	50	if (numlen) /* If found a hex number, position past it */
3408	2303340		l += numlen;
3409	0	0	else if (nl) { /* Else, go handle next line, if any */
3410	0		return nl + 1; /* 1 is length of "\n" */
3411			}
3412			else { /* Else, no next line */
3413	0		return lend + 1; /* to LIST's end at which \n is not found */
3414			}
3415
3416			/* The max range value follows, separated by a BLANK */
3417	2303340	100	if (isBLANK(*l)) {
3418	2303198		++l;
3419	2303198		flags = PERL_SCAN_SILENT_ILLDIGIT
3420			\| PERL_SCAN_DISALLOW_PREFIX
3421			\| PERL_SCAN_SILENT_NON_PORTABLE;
3422	2303198		numlen = lend - l;
3423	2303198		max = grok_hex((char )l, &numlen, &flags, NULL);
3424	2303198	100	if (numlen)
3425	1402214		l += numlen;
3426			else /* If no value here, it is a single element range */
3427	900984		max = min;
3428
3429			/* Non-binary tables have a third entry: what the first element of the
3430			* range maps to */
3431	2303198	100	if (wants_value) {
3432	541538	50	if (isBLANK(*l)) {
3433	541538		++l;
3434
3435			/* The ToLc, etc table mappings are not in hex, and must be
3436			* corrected by adding the code point to them */
3437	541538	100	if (typeto) {
3438	541128		char after_strtol = (char ) lend;
3439	541128		val = Strtol((char )l, &after_strtol, 10);
3440			l = (U8 *) after_strtol;
3441			}
3442			else { /* Other tables are in hex, and are the correct result
3443			without tweaking */
3444	410		flags = PERL_SCAN_SILENT_ILLDIGIT
3445			\| PERL_SCAN_DISALLOW_PREFIX
3446			\| PERL_SCAN_SILENT_NON_PORTABLE;
3447	410		numlen = lend - l;
3448	410		val = grok_hex((char )l, &numlen, &flags, NULL);
3449	410	50	if (numlen)
3450			l += numlen;
3451			else
3452	0		*val = 0;
3453			}
3454			}
3455			else {
3456	0		*val = 0;
3457	0	0	if (typeto) {
3458			/* diag_listed_as: To%s: illegal mapping '%s' */
3459	0		Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3460			typestr, l);
3461			}
3462			}
3463			}
3464			else
3465	1761660		val = 0; / bits == 1, then any val should be ignored */
3466			}
3467			else { /* Nothing following range min, should be single element with no
3468			mapping expected */
3469	142		max = min;
3470	142	50	if (wants_value) {
3471	0		*val = 0;
3472	0	0	if (typeto) {
3473			/* diag_listed_as: To%s: illegal mapping '%s' */
3474	0		Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3475			}
3476			}
3477			else
3478	142		val = 0; / bits == 1, then val should be ignored */
3479			}
3480
3481			/* Position to next line if any, or EOF */
3482	2303340	100	if (nl)
3483	2303268		l = nl + 1;
3484			else
3485			l = lend;
3486
3487	2303340		return l;
3488			}
3489
3490			/* Note:
3491			* Returns a swatch (a bit vector string) for a code point sequence
3492			* that starts from the value C and comprises the number C.
3493			* A C must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3494			* Should be used via swash_fetch, which will cache the swatch in C.
3495			*/
3496			STATIC SV*
3497	21300		S_swatch_get(pTHX_ SV* swash, UV start, UV span)
3498			{
3499			SV *swatch;
3500			U8 l, lend, x, xend, s, send;
3501			STRLEN lcur, xcur, scur;
3502	21300		HV *const hv = MUTABLE_HV(SvRV(swash));
3503	21300		SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
3504
3505			SV** listsvp = NULL; /* The string containing the main body of the table */
3506			SV** extssvp = NULL;
3507			SV** invert_it_svp = NULL;
3508			U8* typestr = NULL;
3509			STRLEN bits;
3510			STRLEN octets; /* if bits == 1, then octets == 0 */
3511			UV none;
3512	21300		UV end = start + span;
3513
3514	21300	100	if (invlistsvp == NULL) {
3515	1070		SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3516	1070		SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3517	1070		SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3518	1070		extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3519	1070		listsvp = hv_fetchs(hv, "LIST", FALSE);
3520	1070		invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3521
3522	1070	50	bits = SvUV(*bitssvp);
3523	1070	50	none = SvUV(*nonesvp);
3524	1070	50	typestr = (U8)SvPV_nolen(typesvp);
3525			}
3526			else {
3527			bits = 1;
3528			none = 0;
3529			}
3530	21300		octets = bits >> 3; /* if bits == 1, then octets == 0 */
3531
3532			PERL_ARGS_ASSERT_SWATCH_GET;
3533
3534	21300	100	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
		50
3535	0		Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
3536			(UV)bits);
3537			}
3538
3539			/* If overflowed, use the max possible */
3540	21300	100	if (end < start) {
3541			end = UV_MAX;
3542	10		span = end - start;
3543			}
3544
3545			/* create and initialize $swatch */
3546	21300	100	scur = octets ? (span * octets) : (span + 7) / 8;
3547	21300		swatch = newSV(scur);
3548	21300		SvPOK_on(swatch);
3549	21300		s = (U8*)SvPVX(swatch);
3550	21300	100	if (octets && none) {
3551	230		const U8* const e = s + scur;
3552	28889	100	while (s < e) {
3553	28544	100	if (bits == 8)
3554	4352		*s++ = (U8)(none & 0xff);
3555	24192	100	else if (bits == 16) {
3556	16640		*s++ = (U8)((none >> 8) & 0xff);
3557	16640		*s++ = (U8)( none & 0xff);
3558			}
3559	7552	50	else if (bits == 32) {
3560	7552		*s++ = (U8)((none >> 24) & 0xff);
3561	7552		*s++ = (U8)((none >> 16) & 0xff);
3562	7552		*s++ = (U8)((none >> 8) & 0xff);
3563	18048		*s++ = (U8)( none & 0xff);
3564			}
3565			}
3566	230		*s = '\0';
3567			}
3568			else {
3569	21070		(void)memzero((U8*)s, scur + 1);
3570			}
3571	21300		SvCUR_set(swatch, scur);
3572	21300		s = (U8*)SvPVX(swatch);
3573
3574	21300	100	if (invlistsvp) { /* If has an inversion list set up use that */
3575	20230		_invlist_populate_swatch(*invlistsvp, start, end, s);
3576	20230		return swatch;
3577			}
3578
3579			/* read $swash->{LIST} */
3580	1070	50	l = (U8)SvPV(listsvp, lcur);
3581	1070		lend = l + lcur;
3582	528023	100	while (l < lend) {
3583			UV min, max, val, upper;
3584	526418		l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3585			cBOOL(octets), typestr);
3586	526418	50	if (l > lend) {
3587			break;
3588			}
3589
3590			/* If looking for something beyond this range, go try the next one */
3591	526418	100	if (max < start)
3592	279486		continue;
3593
3594			/* is generally 1 beyond where we want to set things, but at the
3595			* platform's infinity, where we can't go any higher, we want to
3596			* include the code point at */
3597			upper = (max < end)
3598			? max
3599	252919	100	: (max != UV_MAX \|\| end != UV_MAX)
3600			? end - 1
3601	234958	50	: end;
3602
3603	246932	50	if (octets) {
3604			UV key;
3605	246932	100	if (min < start) {
3606	116	100	if (!none \|\| val < none) {
		100
3607	98		val += start - min;
3608			}
3609	116		min = start;
3610			}
3611	269928	100	for (key = min; key <= upper; key++) {
3612			STRLEN offset;
3613			/* offset must be non-negative (start <= min <= key < end) */
3614	22996		offset = octets * (key - start);
3615	22996	100	if (bits == 8)
3616	26		s[offset] = (U8)(val & 0xff);
3617	22970	100	else if (bits == 16) {
3618	3092		s[offset ] = (U8)((val >> 8) & 0xff);
3619	3092		s[offset + 1] = (U8)( val & 0xff);
3620			}
3621	19878	50	else if (bits == 32) {
3622	19878		s[offset ] = (U8)((val >> 24) & 0xff);
3623	19878		s[offset + 1] = (U8)((val >> 16) & 0xff);
3624	19878		s[offset + 2] = (U8)((val >> 8) & 0xff);
3625	19878		s[offset + 3] = (U8)( val & 0xff);
3626			}
3627
3628	22996	100	if (!none \|\| val < none)
		100
3629	20316		++val;
3630			}
3631			}
3632			else { /* bits == 1, then val should be ignored */
3633			UV key;
3634	0	0	if (min < start)
3635	0		min = start;
3636
3637	263209	0	for (key = min; key <= upper; key++) {
3638	0		const STRLEN offset = (STRLEN)(key - start);
3639	0		s[offset >> 3] \|= 1 << (offset & 7);
3640			}
3641			}
3642			} /* while */
3643
3644			/* Invert if the data says it should be. Assumes that bits == 1 */
3645	1070	100	if (invert_it_svp && SvUV(*invert_it_svp)) {
		50
		50
3646
3647			/* Unicode properties should come with all bits above PERL_UNICODE_MAX
3648			* be 0, and their inversion should also be 0, as we don't succeed any
3649			* Unicode property matches for non-Unicode code points */
3650	0	0	if (start <= PERL_UNICODE_MAX) {
3651
3652			/* The code below assumes that we never cross the
3653			* Unicode/above-Unicode boundary in a range, as otherwise we would
3654			* have to figure out where to stop flipping the bits. Since this
3655			* boundary is divisible by a large power of 2, and swatches comes
3656			* in small powers of 2, this should be a valid assumption */
3657			assert(start + span - 1 <= PERL_UNICODE_MAX);
3658
3659	0		send = s + scur;
3660	0	0	while (s < send) {
3661	0		s = ~(s);
3662	0		s++;
3663			}
3664			}
3665			}
3666
3667			/* read $swash->{EXTRAS}
3668			* This code also copied to swash_to_invlist() below */
3669	1070	50	x = (U8)SvPV(extssvp, xcur);
3670	1070		xend = x + xcur;
3671	11950	100	while (x < xend) {
3672			STRLEN namelen;
3673			U8 *namestr;
3674			SV** othersvp;
3675			HV* otherhv;
3676			STRLEN otherbits;
3677			SV *otherbitssvp, other;
3678			U8 s, o, *nl;
3679			STRLEN slen, olen;
3680
3681	230		const U8 opc = *x++;
3682	230	50	if (opc == '\n')
3683	0		continue;
3684
3685	230		nl = (U8*)memchr(x, '\n', xend - x);
3686
3687	230	50	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
		50
3688	230	50	if (nl) {
3689	230		x = nl + 1; /* 1 is length of "\n" */
3690	230		continue;
3691			}
3692			else {
3693			x = xend; /* to EXTRAS' end at which \n is not found */
3694			break;
3695			}
3696			}
3697
3698			namestr = x;
3699	0	0	if (nl) {
3700	0		namelen = nl - namestr;
3701	0		x = nl + 1;
3702			}
3703			else {
3704	0		namelen = xend - namestr;
3705			x = xend;
3706			}
3707
3708	0		othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3709	0		otherhv = MUTABLE_HV(SvRV(*othersvp));
3710	0		otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3711	0	0	otherbits = (STRLEN)SvUV(*otherbitssvp);
3712	0	0	if (bits < otherbits)
3713	0		Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3714			"bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
3715
3716			/* The "other" swatch must be destroyed after. */
3717	0		other = swatch_get(*othersvp, start, span);
3718	0	0	o = (U8*)SvPV(other, olen);
3719
3720	0	0	if (!olen)
3721	0		Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
3722
3723	0	0	s = (U8*)SvPV(swatch, slen);
3724	0	0	if (bits == 1 && otherbits == 1) {
3725	0	0	if (slen != olen)
3726	0		Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3727			"mismatch, slen=%"UVuf", olen=%"UVuf,
3728			(UV)slen, (UV)olen);
3729
3730	0		switch (opc) {
3731			case '+':
3732	0	0	while (slen--)
3733	0		s++ \|= o++;
3734			break;
3735			case '!':
3736	0	0	while (slen--)
3737	0		s++ \|= ~o++;
3738			break;
3739			case '-':
3740	0	0	while (slen--)
3741	0		s++ &= ~o++;
3742			break;
3743			case '&':
3744	0	0	while (slen--)
3745	0		s++ &= o++;
3746			break;
3747			default:
3748			break;
3749			}
3750			}
3751			else {
3752	0		STRLEN otheroctets = otherbits >> 3;
3753			STRLEN offset = 0;
3754	0		U8* const send = s + slen;
3755
3756	0	0	while (s < send) {
3757			UV otherval = 0;
3758
3759	0	0	if (otherbits == 1) {
3760	0		otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3761	0		++offset;
3762			}
3763			else {
3764			STRLEN vlen = otheroctets;
3765	0		otherval = *o++;
3766	0	0	while (--vlen) {
3767	0		otherval <<= 8;
3768	0		otherval \|= *o++;
3769			}
3770			}
3771
3772	0	0	if (opc == '+' && otherval)
3773			NOOP; /* replace with otherval */
3774	0	0	else if (opc == '!' && !otherval)
3775			otherval = 1;
3776	0	0	else if (opc == '-' && otherval)
3777			otherval = 0;
3778	0	0	else if (opc == '&' && !otherval)
3779			otherval = 0;
3780			else {
3781	0		s += octets; /* no replacement */
3782	0		continue;
3783			}
3784
3785	0	0	if (bits == 8)
3786	0		*s++ = (U8)( otherval & 0xff);
3787	0	0	else if (bits == 16) {
3788	0		*s++ = (U8)((otherval >> 8) & 0xff);
3789	0		*s++ = (U8)( otherval & 0xff);
3790			}
3791	0	0	else if (bits == 32) {
3792	0		*s++ = (U8)((otherval >> 24) & 0xff);
3793	0		*s++ = (U8)((otherval >> 16) & 0xff);
3794	0		*s++ = (U8)((otherval >> 8) & 0xff);
3795	0		*s++ = (U8)( otherval & 0xff);
3796			}
3797			}
3798			}
3799	115		sv_free(other); /* through with it! */
3800			} /* while */
3801			return swatch;
3802			}
3803
3804			HV*
3805	24		Perl__swash_inversion_hash(pTHX_ SV* const swash)
3806			{
3807
3808			/* Subject to change or removal. For use only in regcomp.c and regexec.c
3809			* Can't be used on a property that is subject to user override, as it
3810			* relies on the value of SPECIALS in the swash which would be set by
3811			* utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
3812			* for overridden properties
3813			*
3814			* Returns a hash which is the inversion and closure of a swash mapping.
3815			* For example, consider the input lines:
3816			* 004B 006B
3817			* 004C 006C
3818			* 212A 006B
3819			*
3820			* The returned hash would have two keys, the utf8 for 006B and the utf8 for
3821			* 006C. The value for each key is an array. For 006C, the array would
3822			* have two elements, the utf8 for itself, and for 004C. For 006B, there
3823			* would be three elements in its array, the utf8 for 006B, 004B and 212A.
3824			*
3825			* Essentially, for any code point, it gives all the code points that map to
3826			* it, or the list of 'froms' for that point.
3827			*
3828			* Currently it ignores any additions or deletions from other swashes,
3829			* looking at just the main body of the swash, and if there are SPECIALS
3830			* in the swash, at that hash
3831			*
3832			* The specials hash can be extra code points, and most likely consists of
3833			* maps from single code points to multiple ones (each expressed as a string
3834			* of utf8 characters). This function currently returns only 1-1 mappings.
3835			* However consider this possible input in the specials hash:
3836			* "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
3837			* "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
3838			*
3839			* Both FB05 and FB06 map to the same multi-char sequence, which we don't
3840			* currently handle. But it also means that FB05 and FB06 are equivalent in
3841			* a 1-1 mapping which we should handle, and this relationship may not be in
3842			* the main table. Therefore this function examines all the multi-char
3843			* sequences and adds the 1-1 mappings that come out of that. */
3844
3845			U8 l, lend;
3846			STRLEN lcur;
3847	24		HV *const hv = MUTABLE_HV(SvRV(swash));
3848
3849			/* The string containing the main body of the table. This will have its
3850			* assertion fail if the swash has been converted to its inversion list */
3851	24		SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3852
3853	24		SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3854	24		SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3855	24		SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3856			/SV* const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
3857	24	50	const U8* const typestr = (U8)SvPV_nolen(typesvp);
3858	24	50	const STRLEN bits = SvUV(*bitssvp);
3859	24		const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3860	24	50	const UV none = SvUV(*nonesvp);
3861	24		SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
3862
3863	24		HV* ret = newHV();
3864
3865			PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
3866
3867			/* Must have at least 8 bits to get the mappings */
3868	24	50	if (bits != 8 && bits != 16 && bits != 32) {
		50
3869	0		Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
3870			(UV)bits);
3871			}
3872
3873	24	50	if (specials_p) { /* It might be "special" (sometimes, but not always, a
3874			mapping to more than one character */
3875
3876			/* Construct an inverse mapping hash for the specials */
3877	24		HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
3878	24		HV * specials_inverse = newHV();
3879			char char_from; / the lhs of the map */
3880			I32 from_len; /* its byte length */
3881			char char_to; / the rhs of the map */
3882			I32 to_len; /* its byte length */
3883			SV sv_to; / and in a sv */
3884			AV* from_list; /* list of things that map to each 'to' */
3885
3886	24		hv_iterinit(specials_hv);
3887
3888			/* The keys are the characters (in utf8) that map to the corresponding
3889			* utf8 string value. Iterate through the list creating the inverse
3890			* list. */
3891	2532	100	while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
3892			SV** listp;
3893	2496	50	if (! SvPOK(sv_to)) {
3894	0		Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
3895			"unexpectedly is not a string, flags=%lu",
3896	0		(unsigned long)SvFLAGS(sv_to));
3897			}
3898			/DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8) char_from, 0), valid_utf8_to_uvchr((U8) SvPVX(sv_to), 0)));/
3899
3900			/* Each key in the inverse list is a mapped-to value, and the key's
3901			* hash value is a list of the strings (each in utf8) that map to
3902			* it. Those strings are all one character long */
3903	2496	100	if ((listp = hv_fetch(specials_inverse,
3904			SvPVX(sv_to),
3905			SvCUR(sv_to), 0)))
3906			{
3907	744		from_list = (AV) listp;
3908			}
3909			else { /* No entry yet for it: create one */
3910	1752		from_list = newAV();
3911	1752	50	if (! hv_store(specials_inverse,
3912			SvPVX(sv_to),
3913			SvCUR(sv_to),
3914			(SV*) from_list, 0))
3915			{
3916	0		Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3917			}
3918			}
3919
3920			/* Here have the list associated with this 'to' (perhaps newly
3921			* created and empty). Just add to it. Note that we ASSUME that
3922			* the input is guaranteed to not have duplications, so we don't
3923			* check for that. Duplications just slow down execution time. */
3924	2496		av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
3925			}
3926
3927			/* Here, 'specials_inverse' contains the inverse mapping. Go through
3928			* it looking for cases like the FB05/FB06 examples above. There would
3929			* be an entry in the hash like
3930			* 'st' => [ FB05, FB06 ]
3931			* In this example we will create two lists that get stored in the
3932			* returned hash, 'ret':
3933			* FB05 => [ FB05, FB06 ]
3934			* FB06 => [ FB05, FB06 ]
3935			*
3936			* Note that there is nothing to do if the array only has one element.
3937			* (In the normal 1-1 case handled below, we don't have to worry about
3938			* two lists, as everything gets tied to the single list that is
3939			* generated for the single character 'to'. But here, we are omitting
3940			* that list, ('st' in the example), so must have multiple lists.) */
3941	1776	100	while ((from_list = (AV *) hv_iternextsv(specials_inverse,
3942			&char_to, &to_len)))
3943			{
3944	1752	100	if (av_len(from_list) > 0) {
3945			SSize_t i;
3946
3947			/* We iterate over all combinations of i,j to place each code
3948			* point on each list */
3949	1872	100	for (i = 0; i <= av_len(from_list); i++) {
3950			SSize_t j;
3951	1488		AV* i_list = newAV();
3952	1488		SV** entryp = av_fetch(from_list, i, FALSE);
3953	1488	50	if (entryp == NULL) {
3954	0		Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3955			}
3956	1488	50	if (hv_fetch(ret, SvPVX(entryp), SvCUR(entryp), FALSE)) {
3957	0		Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
3958			}
3959	1488	50	if (! hv_store(ret, SvPVX(entryp), SvCUR(entryp),
3960			(SV*) i_list, FALSE))
3961			{
3962	0		Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3963			}
3964
3965			/* For debugging: UV u = valid_utf8_to_uvchr((U8) SvPVX(entryp), 0);*/
3966	3720	100	for (j = 0; j <= av_len(from_list); j++) {
3967	2976		entryp = av_fetch(from_list, j, FALSE);
3968	2976	50	if (entryp == NULL) {
3969	0		Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3970			}
3971
3972			/* When i==j this adds itself to the list */
3973	2976	50	av_push(i_list, newSVuv(utf8_to_uvchr_buf(
3974			(U8) SvPVX(entryp),
3975			(U8) SvPVX(entryp) + SvCUR(*entryp),
3976			0)));
3977			/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8) SvPVX(entryp), 0), u));/
3978			}
3979			}
3980			}
3981			}
3982	24		SvREFCNT_dec(specials_inverse); /* done with it */
3983			} /* End of specials */
3984
3985			/* read $swash->{LIST} */
3986	24	50	l = (U8)SvPV(listsvp, lcur);
3987	24		lend = l + lcur;
3988
3989			/* Go through each input line */
3990	15156	100	while (l < lend) {
3991			UV min, max, val;
3992			UV inverse;
3993	15120		l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3994			cBOOL(octets), typestr);
3995	15120	50	if (l > lend) {
3996			break;
3997			}
3998
3999			/* Each element in the range is to be inverted */
4000	40440	100	for (inverse = min; inverse <= max; inverse++) {
4001			AV* list;
4002			SV** listp;
4003			IV i;
4004			bool found_key = FALSE;
4005			bool found_inverse = FALSE;
4006
4007			/* The key is the inverse mapping */
4008			char key[UTF8_MAXBYTES+1];
4009	25320		char* key_end = (char ) uvchr_to_utf8((U8) key, val);
4010	25320		STRLEN key_len = key_end - key;
4011
4012			/* Get the list for the map */
4013	25320	100	if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
4014	1176		list = (AV) listp;
4015			}
4016			else { /* No entry yet for it: create one */
4017	24144		list = newAV();
4018	24144	50	if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
4019	0		Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
4020			}
4021			}
4022
4023			/* Look through list to see if this inverse mapping already is
4024			* listed, or if there is a mapping to itself already */
4025	27048	100	for (i = 0; i <= av_len(list); i++) {
4026	2400		SV** entryp = av_fetch(list, i, FALSE);
4027			SV* entry;
4028	2400	50	if (entryp == NULL) {
4029	0		Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
4030			}
4031	2400		entry = *entryp;
4032			/DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));/
4033	2400	50	if (SvUV(entry) == val) {
		100
4034			found_key = TRUE;
4035			}
4036	2400	50	if (SvUV(entry) == inverse) {
		100
4037			found_inverse = TRUE;
4038			}
4039
4040			/* No need to continue searching if found everything we are
4041			* looking for */
4042	2400	100	if (found_key && found_inverse) {
		100
4043			break;
4044			}
4045			}
4046
4047			/* Make sure there is a mapping to itself on the list */
4048	25320	100	if (! found_key) {
4049	24144		av_push(list, newSVuv(val));
4050			/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));/
4051			}
4052
4053
4054			/* Simply add the value to the list */
4055	25320	100	if (! found_inverse) {
4056	24648		av_push(list, newSVuv(inverse));
4057			/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));/
4058			}
4059
4060			/* swatch_get() increments the value of val for each element in the
4061			* range. That makes more compact tables possible. You can
4062			* express the capitalization, for example, of all consecutive
4063			* letters with a single line: 0061\t007A\t0041 This maps 0061 to
4064			* 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
4065			* and it's not documented; it appears to be used only in
4066			* implementing tr//; I copied the semantics from swatch_get(), just
4067			* in case */
4068	25320	50	if (!none \|\| val < none) {
		0
4069	25320		++val;
4070			}
4071			}
4072			}
4073
4074	24		return ret;
4075			}
4076
4077			SV*
4078	6680		Perl__swash_to_invlist(pTHX_ SV* const swash)
4079			{
4080
4081			/* Subject to change or removal. For use only in one place in regcomp.c.
4082			* Ownership is given to one reference count in the returned SV* */
4083
4084			U8 l, lend;
4085			char *loc;
4086			STRLEN lcur;
4087	6680		HV *const hv = MUTABLE_HV(SvRV(swash));
4088			UV elements = 0; /* Number of elements in the inversion list */
4089	6680		U8 empty[] = "";
4090			SV** listsvp;
4091			SV** typesvp;
4092			SV** bitssvp;
4093			SV** extssvp;
4094			SV** invert_it_svp;
4095
4096			U8* typestr;
4097			STRLEN bits;
4098			STRLEN octets; /* if bits == 1, then octets == 0 */
4099			U8 x, xend;
4100			STRLEN xcur;
4101
4102			SV* invlist;
4103
4104			PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
4105
4106			/* If not a hash, it must be the swash's inversion list instead */
4107	6680	100	if (SvTYPE(hv) != SVt_PVHV) {
4108	28		return SvREFCNT_inc_simple_NN((SV*) hv);
4109			}
4110
4111			/* The string containing the main body of the table */
4112	6652		listsvp = hv_fetchs(hv, "LIST", FALSE);
4113	6652		typesvp = hv_fetchs(hv, "TYPE", FALSE);
4114	6652		bitssvp = hv_fetchs(hv, "BITS", FALSE);
4115	6652		extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
4116	6652		invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
4117
4118	6652	50	typestr = (U8)SvPV_nolen(typesvp);
4119	6652	50	bits = SvUV(*bitssvp);
4120	6652		octets = bits >> 3; /* if bits == 1, then octets == 0 */
4121
4122			/* read $swash->{LIST} */
4123	6652	50	if (SvPOK(*listsvp)) {
4124	6652	50	l = (U8)SvPV(listsvp, lcur);
4125			}
4126			else {
4127			/* LIST legitimately doesn't contain a string during compilation phases
4128			* of Perl itself, before the Unicode tables are generated. In this
4129			* case, just fake things up by creating an empty list */
4130			l = empty;
4131	0		lcur = 0;
4132			}
4133			loc = (char *) l;
4134	6652		lend = l + lcur;
4135
4136			/* Scan the input to count the number of lines to preallocate array size
4137			* based on worst possible case, which is each line in the input creates 2
4138			* elements in the inversion list: 1) the beginning of a range in the list;
4139			* 2) the beginning of a range not in the list. */
4140	1771708	100	while ((loc = (strchr(loc, '\n'))) != NULL) {
4141	1761730		elements += 2;
4142	1761730		loc++;
4143			}
4144
4145			/* If the ending is somehow corrupt and isn't a new line, add another
4146			* element for the final range that isn't in the inversion list */
4147	9978	50	if (! (*lend == '\n'
		50
4148	9978	100	\|\| (lend == '\0' && (lcur == 0 \|\| (lend - 1) == '\n'))))
		100
4149			{
4150	72		elements++;
4151			}
4152
4153	6652		invlist = _new_invlist(elements);
4154
4155			/* Now go through the input again, adding each range to the list */
4156	1771780	100	while (l < lend) {
4157			UV start, end;
4158			UV val; /* Not used by this function */
4159
4160	1761802		l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
4161			cBOOL(octets), typestr);
4162
4163	1761802	50	if (l > lend) {
4164			break;
4165			}
4166
4167	1761802		invlist = _add_range_to_invlist(invlist, start, end);
4168			}
4169
4170			/* Invert if the data says it should be */
4171	6652	100	if (invert_it_svp && SvUV(*invert_it_svp)) {
		50
		100
4172	94		_invlist_invert_prop(invlist);
4173			}
4174
4175			/* This code is copied from swatch_get()
4176			* read $swash->{EXTRAS} */
4177	6652	50	x = (U8)SvPV(extssvp, xcur);
4178	6652		xend = x + xcur;
4179	11942	100	while (x < xend) {
4180			STRLEN namelen;
4181			U8 *namestr;
4182			SV** othersvp;
4183			HV* otherhv;
4184			STRLEN otherbits;
4185			SV *otherbitssvp, other;
4186			U8 *nl;
4187
4188	1964		const U8 opc = *x++;
4189	1964	100	if (opc == '\n')
4190	2		continue;
4191
4192	1962		nl = (U8*)memchr(x, '\n', xend - x);
4193
4194	1962	100	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
		100
4195	844	50	if (nl) {
4196	844		x = nl + 1; /* 1 is length of "\n" */
4197	844		continue;
4198			}
4199			else {
4200			x = xend; /* to EXTRAS' end at which \n is not found */
4201			break;
4202			}
4203			}
4204
4205			namestr = x;
4206	1118	100	if (nl) {
4207	1104		namelen = nl - namestr;
4208	1104		x = nl + 1;
4209			}
4210			else {
4211	14		namelen = xend - namestr;
4212			x = xend;
4213			}
4214
4215	1118		othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
4216	1118		otherhv = MUTABLE_HV(SvRV(*othersvp));
4217	1118		otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
4218	1118	50	otherbits = (STRLEN)SvUV(*otherbitssvp);
4219
4220	1118	50	if (bits != otherbits \|\| bits != 1) {
4221	0		Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
4222			"properties, bits=%"UVuf", otherbits=%"UVuf,
4223			(UV)bits, (UV)otherbits);
4224			}
4225
4226			/* The "other" swatch must be destroyed after. */
4227	1118		other = _swash_to_invlist((SV )othersvp);
4228
4229			/* End of code copied from swatch_get() */
4230	1118		switch (opc) {
4231			case '+':
4232	710		_invlist_union(invlist, other, &invlist);
4233	710		break;
4234			case '!':
4235	310		_invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
4236	310		break;
4237			case '-':
4238	80		_invlist_subtract(invlist, other, &invlist);
4239	80		break;
4240			case '&':
4241	18		_invlist_intersection(invlist, other, &invlist);
4242	18		break;
4243			default:
4244			break;
4245			}
4246	1541		sv_free(other); /* through with it! */
4247			}
4248
4249	6666		return invlist;
4250			}
4251
4252			SV*
4253	180730		Perl__get_swash_invlist(pTHX_ SV* const swash)
4254			{
4255			SV** ptr;
4256
4257			PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
4258
4259	180730	50	if (! SvROK(swash)) {
4260			return NULL;
4261			}
4262
4263			/* If it really isn't a hash, it isn't really swash; must be an inversion
4264			* list */
4265	180730	50	if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
4266	0		return SvRV(swash);
4267			}
4268
4269	180730		ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
4270	180730	50	if (! ptr) {
4271			return NULL;
4272			}
4273
4274	180730		return *ptr;
4275			}
4276
4277			bool
4278	21060		Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
4279			{
4280			/* May change: warns if surrogates, non-character code points, or
4281			* non-Unicode code points are in s which has length len bytes. Returns
4282			* TRUE if none found; FALSE otherwise. The only other validity check is
4283			* to make sure that this won't exceed the string's length */
4284
4285	21060		const U8* const e = s + len;
4286			bool ok = TRUE;
4287
4288			PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
4289
4290	2379248	100	while (s < e) {
4291	2347658	50	if (UTF8SKIP(s) > len) {
4292	0	0	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
4293	0	0	"%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
		0
4294	0		return FALSE;
4295			}
4296	2347658	100	if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
4297			STRLEN char_len;
4298	9974	100	if (UTF8_IS_SUPER(s)) {
		50
		100
4299	12	100	if (ckWARN_d(WARN_NON_UNICODE)) {
4300	10	100	UV uv = utf8_to_uvchr_buf(s, e, &char_len);
4301	10		Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
4302			"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
4303			ok = FALSE;
4304			}
4305			}
4306	9962	100	else if (UTF8_IS_SURROGATE(s)) {
		100
4307	14	100	if (ckWARN_d(WARN_SURROGATE)) {
4308	12	100	UV uv = utf8_to_uvchr_buf(s, e, &char_len);
4309	12		Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
4310			"Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
4311			ok = FALSE;
4312			}
4313			}
4314			else if
4315	32667	100	((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
		100
		50
		50
		50
		50
		100
		50
		50
		100
4316	31863	100	&& (ckWARN_d(WARN_NONCHAR)))
		100
		100
		100
		100
		100
		50
		100
		100
4317			{
4318	80	100	UV uv = utf8_to_uvchr_buf(s, e, &char_len);
4319	80		Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
4320			"Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
4321			ok = FALSE;
4322			}
4323			}
4324	2347658		s += UTF8SKIP(s);
4325			}
4326
4327			return ok;
4328			}
4329
4330			/*
4331			=for apidoc pv_uni_display
4332
4333			Build to the scalar C a displayable version of the string C,
4334			length C, the displayable version being at most C bytes long
4335			(if longer, the rest is truncated and "..." will be appended).
4336
4337			The C argument can have UNI_DISPLAY_ISPRINT set to display
4338			isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
4339			to display the \\[nrfta\\] as the backslashed versions (like '\n')
4340			(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
4341			UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
4342			UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
4343
4344			The pointer to the PV of the C is returned.
4345
4346			=cut */
4347			char *
4348	33996		Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
4349			{
4350			int truncated = 0;
4351			const char s, e;
4352
4353			PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4354
4355	33996		sv_setpvs(dsv, "");
4356	33996		SvUTF8_off(dsv);
4357	68154	100	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
4358			UV u;
4359			/* This serves double duty as a flag and a character to print after
4360			a \ when flags & UNI_DISPLAY_BACKSLASH is true.
4361			*/
4362			char ok = 0;
4363
4364	34172	50	if (pvlim && SvCUR(dsv) >= pvlim) {
		100
4365			truncated++;
4366			break;
4367			}
4368	34158	100	u = utf8_to_uvchr_buf((U8)s, (U8)e, 0);
4369	34158	100	if (u < 256) {
4370	328		const unsigned char c = (unsigned char)u & 0xFF;
4371	328	100	if (flags & UNI_DISPLAY_BACKSLASH) {
4372	66		switch (c) {
4373			case '\n':
4374	2		ok = 'n'; break;
4375			case '\r':
4376	2		ok = 'r'; break;
4377			case '\t':
4378	2		ok = 't'; break;
4379			case '\f':
4380	2		ok = 'f'; break;
4381			case '\a':
4382	2		ok = 'a'; break;
4383			case '\\':
4384	2		ok = '\\'; break;
4385			default: break;
4386			}
4387	66	100	if (ok) {
4388	12		const char string = ok;
4389	12		sv_catpvs(dsv, "\\");
4390	12		sv_catpvn(dsv, &string, 1);
4391			}
4392			}
4393			/* isPRINT() is the locale-blind version. */
4394	328	100	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
		50
		100
4395	160		const char string = c;
4396	160		sv_catpvn(dsv, &string, 1);
4397			ok = 1;
4398			}
4399			}
4400	34158	100	if (!ok)
4401	33986		Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
4402			}
4403	33996	100	if (truncated)
4404	14		sv_catpvs(dsv, "...");
4405
4406	33996		return SvPVX(dsv);
4407			}
4408
4409			/*
4410			=for apidoc sv_uni_display
4411
4412			Build to the scalar C a displayable version of the scalar C,
4413			the displayable version being at most C bytes long
4414			(if longer, the rest is truncated and "..." will be appended).
4415
4416			The C argument is as in L().
4417
4418			The pointer to the PV of the C is returned.
4419
4420			=cut
4421			*/
4422			char *
4423	33996		Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
4424			{
4425			const char * const ptr =
4426	33996	50	isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
		50
4427
4428			PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4429
4430	33996		return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4431	33996		SvCUR(ssv), pvlim, flags);
4432			}
4433
4434			/*
4435			=for apidoc foldEQ_utf8
4436
4437			Returns true if the leading portions of the strings C and C (either or both
4438			of which may be in UTF-8) are the same case-insensitively; false otherwise.
4439			How far into the strings to compare is determined by other input parameters.
4440
4441			If C is true, the string C is assumed to be in UTF-8-encoded Unicode;
4442			otherwise it is assumed to be in native 8-bit encoding. Correspondingly for C
4443			with respect to C.
4444
4445			If the byte length C is non-zero, it says how far into C to check for fold
4446			equality. In other words, C+C will be used as a goal to reach. The
4447			scan will not be considered to be a match unless the goal is reached, and
4448			scanning won't continue past that goal. Correspondingly for C with respect to
4449			C.
4450
4451			If C is non-NULL and the pointer it points to is not NULL, that pointer is
4452			considered an end pointer to the position 1 byte past the maximum point
4453			in C beyond which scanning will not continue under any circumstances.
4454			(This routine assumes that UTF-8 encoded input strings are not malformed;
4455			malformed input can cause it to read past C).
4456			This means that if both C and C are specified, and C
4457			is less than C+C, the match will never be successful because it can
4458			never
4459			get as far as its goal (and in fact is asserted against). Correspondingly for
4460			C with respect to C.
4461
4462			At least one of C and C must have a goal (at least one of C and
4463			C must be non-zero), and if both do, both have to be
4464			reached for a successful match. Also, if the fold of a character is multiple
4465			characters, all of them must be matched (see tr21 reference below for
4466			'folding').
4467
4468			Upon a successful match, if C is non-NULL,
4469			it will be set to point to the beginning of the I character of C
4470			beyond what was matched. Correspondingly for C and C.
4471
4472			For case-insensitiveness, the "casefolding" of Unicode is used
4473			instead of upper/lowercasing both the characters, see
4474			L (Case Mappings).
4475
4476			=cut */
4477
4478			/* A flags parameter has been added which may change, and hence isn't
4479			* externally documented. Currently it is:
4480			* 0 for as-documented above
4481			* FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4482			ASCII one, to not match
4483			* FOLDEQ_UTF8_LOCALE meaning that locale rules are to be used for code
4484			* points below 256; unicode rules for above 255; and
4485			* folds that cross those boundaries are disallowed,
4486			* like the NOMIX_ASCII option
4487			* FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
4488			* routine. This allows that step to be skipped.
4489			* FOLDEQ_S2_ALREADY_FOLDED Similarly.
4490			*/
4491			I32
4492	1342716		Perl_foldEQ_utf8_flags(pTHX_ const char s1, char pe1, UV l1, bool u1, const char s2, char **pe2, UV l2, bool u2, U32 flags)
4493			{
4494			dVAR;
4495			const U8 p1 = (const U8)s1; /* Point to current char */
4496			const U8 p2 = (const U8)s2;
4497			const U8 g1 = NULL; / goal for s1 */
4498			const U8 *g2 = NULL;
4499			const U8 e1 = NULL; / Don't scan s1 past this */
4500			U8 f1 = NULL; / Point to current folded */
4501			const U8 *e2 = NULL;
4502			U8 *f2 = NULL;
4503	1342716		STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
4504			U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4505			U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4506
4507			PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4508
4509			assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII \| FOLDEQ_UTF8_LOCALE))
4510			&& (flags & (FOLDEQ_S1_ALREADY_FOLDED \| FOLDEQ_S2_ALREADY_FOLDED))));
4511			/* The algorithm is to trial the folds without regard to the flags on
4512			* the first line of the above assert(), and then see if the result
4513			* violates them. This means that the inputs can't be pre-folded to a
4514			* violating result, hence the assert. This could be changed, with the
4515			* addition of extra tests here for the already-folded case, which would
4516			* slow it down. That cost is more than any possible gain for when these
4517			* flags are specified, as the flags indicate /il or /iaa matching which
4518			* is less common than /iu, and I (khw) also believe that real-world /il
4519			* and /iaa matches are most likely to involve code points 0-255, and this
4520			* function only under rare conditions gets called for 0-255. */
4521
4522	1342716	100	if (pe1) {
4523	417046		e1 = (U8*)pe1;
4524			}
4525
4526	1342716	100	if (l1) {
4527	925670		g1 = (const U8*)s1 + l1;
4528			}
4529
4530	1342716	100	if (pe2) {
4531	925670		e2 = (U8*)pe2;
4532			}
4533
4534	1342716	100	if (l2) {
4535	417046		g2 = (const U8*)s2 + l2;
4536			}
4537
4538			/* Must have at least one goal */
4539			assert(g1 \|\| g2);
4540
4541	1342716	100	if (g1) {
4542
4543			/* Will never match if goal is out-of-bounds */
4544			assert(! e1 \|\| e1 >= g1);
4545
4546			/* Here, there isn't an end pointer, or it is beyond the goal. We
4547			* only go as far as the goal */
4548			e1 = g1;
4549			}
4550			else {
4551			assert(e1); /* Must have an end for looking at s1 */
4552			}
4553
4554			/* Same for goal for s2 */
4555	1342716	100	if (g2) {
4556			assert(! e2 \|\| e2 >= g2);
4557			e2 = g2;
4558			}
4559			else {
4560			assert(e2);
4561			}
4562
4563			/* If both operands are already folded, we could just do a memEQ on the
4564			* whole strings at once, but it would be better if the caller realized
4565			* this and didn't even call us */
4566
4567			/* Look through both strings, a character at a time */
4568	2542844	100	while (p1 < e1 && p2 < e2) {
4569
4570			/* If at the beginning of a new character in s1, get its fold to use
4571			* and the length of the fold. (exception: locale rules just get the
4572			* character to a single byte) */
4573	1609316	100	if (n1 == 0) {
4574	1582220	100	if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4575			f1 = (U8 *) p1;
4576	523574		n1 = UTF8SKIP(f1);
4577			}
4578			else {
4579			/* If in locale matching, we use two sets of rules, depending
4580			* on if the code point is above or below 255. Here, we test
4581			* for and handle locale rules */
4582	1058646	100	if ((flags & FOLDEQ_UTF8_LOCALE)
4583	507440	100	&& (! u1 \|\| ! UTF8_IS_ABOVE_LATIN1(*p1)))
		100
4584			{
4585			/* There is no mixing of code points above and below 255. */
4586	433594	100	if (u2 && UTF8_IS_ABOVE_LATIN1(*p2)) {
		100
4587			return 0;
4588			}
4589
4590			/* We handle locale rules by converting, if necessary, the
4591			* code point to a single byte. */
4592	391934	100	if (! u1 \|\| UTF8_IS_INVARIANT(*p1)) {
		100
4593	372000		foldbuf1 = p1;
4594			}
4595			else {
4596	19934		foldbuf1 = TWO_BYTE_UTF8_TO_NATIVE(p1, *(p1 + 1));
4597			}
4598	391934		n1 = 1;
4599			}
4600	625052	100	else if (isASCII(p1)) { / Note, that here won't be both
4601			ASCII and using locale rules */
4602
4603			/* If trying to mix non- with ASCII, and not supposed to,
4604			* fail */
4605	342356	100	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
		100
4606			return 0;
4607			}
4608	318190		n1 = 1;
4609	318190	100	foldbuf1 = toFOLD(p1);
4610			}
4611	282696	100	else if (u1) {
4612	244502		to_utf8_fold(p1, foldbuf1, &n1);
4613			}
4614			else { /* Not utf8, get utf8 fold */
4615	38194		to_uni_fold(*p1, foldbuf1, &n1);
4616			}
4617			f1 = foldbuf1;
4618			}
4619			}
4620
4621	1543490	100	if (n2 == 0) { /* Same for s2 */
4622	1487888	100	if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4623			f2 = (U8 *) p2;
4624	182386		n2 = UTF8SKIP(f2);
4625			}
4626			else {
4627	1305502	100	if ((flags & FOLDEQ_UTF8_LOCALE)
4628	467818	100	&& (! u2 \|\| ! UTF8_IS_ABOVE_LATIN1(*p2)))
		100
4629			{
4630			/* Here, the next char in s2 is < 256. We've already
4631			* worked on s1, and if it isn't also < 256, can't match */
4632	455012	100	if (u1 && UTF8_IS_ABOVE_LATIN1(*p1)) {
		100
4633			return 0;
4634			}
4635	391934	100	if (! u2 \|\| UTF8_IS_INVARIANT(*p2)) {
		100
4636	379464		foldbuf2 = p2;
4637			}
4638			else {
4639	12470		foldbuf2 = TWO_BYTE_UTF8_TO_NATIVE(p2, *(p2 + 1));
4640			}
4641
4642			/* Use another function to handle locale rules. We've made
4643			* sure that both characters to compare are single bytes */
4644	391934	100	if (! foldEQ_locale((char ) f1, (char ) foldbuf2, 1)) {
4645			return 0;
4646			}
4647	302340		n1 = n2 = 0;
4648			}
4649	850490	100	else if (isASCII(*p2)) {
4650	536880	100	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
		100
4651			return 0;
4652			}
4653	492254		n2 = 1;
4654	492254	100	foldbuf2 = toFOLD(p2);
4655			}
4656	313610	100	else if (u2) {
4657	803144		to_utf8_fold(p2, foldbuf2, &n2);
4658			}
4659			else {
4660	53514		to_uni_fold(*p2, foldbuf2, &n2);
4661			}
4662			f2 = foldbuf2;
4663			}
4664			}
4665
4666			/* Here f1 and f2 point to the beginning of the strings to compare.
4667			* These strings are the folds of the next character from each input
4668			* string, stored in utf8. */
4669
4670			/* While there is more to look for in both folds, see if they
4671			* continue to match */
4672	2251468	100	while (n1 && n2) {
		100
4673	1051340		U8 fold_length = UTF8SKIP(f1);
4674	1051340	100	if (fold_length != UTF8SKIP(f2)
4675	986510	100	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
		100
4676			function call for single
4677			byte */
4678	905722	100	\|\| memNE((char)f1, (char)f2, fold_length))
4679			{
4680			return 0; /* mismatch */
4681			}
4682
4683			/* Here, they matched, advance past them */
4684	905276		n1 -= fold_length;
4685	905276		f1 += fold_length;
4686	905276		n2 -= fold_length;
4687	905276		f2 += fold_length;
4688			}
4689
4690			/* When reach the end of any fold, advance the input past it */
4691	1200128	100	if (n1 == 0) {
4692	1172938	100	p1 += u1 ? UTF8SKIP(p1) : 1;
4693			}
4694	1200128	100	if (n2 == 0) {
4695	1171507	100	p2 += u2 ? UTF8SKIP(p2) : 1;
4696			}
4697			} /* End of loop through both strings */
4698
4699			/* A match is defined by each scan that specified an explicit length
4700			* reaching its final goal, and the other not having matched a partial
4701			* character (which can happen when the fold of a character is more than one
4702			* character). */
4703	933528	100	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
		100
		100
		100
4704			return 0;
4705			}
4706
4707			/* Successful match. Set output pointers */
4708	734446	100	if (pe1) {
4709	267984		pe1 = (char)p1;
4710			}
4711	734446	100	if (pe2) {
4712	904589		pe2 = (char)p2;
4713			}
4714			return 1;
4715	0		}
4716
4717			/*
4718			* Local variables:
4719			* c-indentation-style: bsd
4720			* c-basic-offset: 4
4721			* indent-tabs-mode: nil
4722			* End:
4723			*
4724			* ex: set ts=8 sts=4 sw=4 et:
4725			*/