File Coverage

utf8.c
Criterion Covered Total %
statement 1065 1354 78.7
branch 1030 1790 57.5
condition n/a
subroutine n/a
total 2095 3144 66.6


line stmt bran cond sub time code
1           /* utf8.c
2           *
3           * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4           * by Larry Wall and others
5           *
6           * You may distribute under the terms of either the GNU General Public
7           * License or the Artistic License, as specified in the README file.
8           *
9           */
10            
11           /*
12           * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13           * heard of that we don't want to see any closer; and that's the one place
14           * we're trying to get to! And that's just where we can't get, nohow.'
15           *
16           * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
17           *
18           * 'Well do I understand your speech,' he answered in the same language;
19           * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
20           * as is the custom in the West, if you wish to be answered?'
21           * --Gandalf, addressing Théoden's door wardens
22           *
23           * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
24           *
25           * ...the travellers perceived that the floor was paved with stones of many
26           * hues; branching runes and strange devices intertwined beneath their feet.
27           *
28           * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
29           */
30            
31           #include "EXTERN.h"
32           #define PERL_IN_UTF8_C
33           #include "perl.h"
34           #include "inline_invlist.c"
35            
36           static const char unees[] =
37           "Malformed UTF-8 character (unexpected end of string)";
38            
39           /*
40           =head1 Unicode Support
41            
42           This file contains various utility functions for manipulating UTF8-encoded
43           strings. For the uninitiated, this is a method of representing arbitrary
44           Unicode characters as a variable number of bytes, in such a way that
45           characters in the ASCII range are unmodified, and a zero byte never appears
46           within non-zero characters.
47            
48           =cut
49           */
50            
51           /*
52           =for apidoc is_ascii_string
53            
54           Returns true if the first C bytes of the string C are the same whether
55           or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
56           is, if they are invariant. On ASCII-ish machines, only ASCII characters
57           fit this definition, hence the function's name.
58            
59           If C is 0, it will be calculated using C.
60            
61           See also L(), L(), and L().
62            
63           =cut
64           */
65            
66           bool
67 533506         Perl_is_ascii_string(const U8 *s, STRLEN len)
68           {
69 533506 50       const U8* const send = s + (len ? len : strlen((const char *)s));
70           const U8* x = s;
71            
72           PERL_ARGS_ASSERT_IS_ASCII_STRING;
73            
74 13381006 100       for (; x < send; ++x) {
75 12849638 100       if (!UTF8_IS_INVARIANT(*x))
76           break;
77           }
78            
79 533506         return x == send;
80           }
81            
82           /*
83           =for apidoc uvoffuni_to_utf8_flags
84            
85           THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
86           Instead, B or
87           L>.
88            
89           This function is like them, but the input is a strict Unicode
90           (as opposed to native) code point. Only in very rare circumstances should code
91           not be using the native code point.
92            
93           For details, see the description for L>.
94            
95           =cut
96           */
97            
98           U8 *
99 8636950         Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
100           {
101           PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
102            
103 8636950 100       if (UNI_IS_INVARIANT(uv)) {
104 4073184         *d++ = (U8) LATIN1_TO_NATIVE(uv);
105 4073184         return d;
106           }
107            
108           /* The first problematic code point is the first surrogate */
109 4563766 100       if (uv >= UNICODE_SURROGATE_FIRST
110 946222 100       && ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
111           {
112 923878 100       if (UNICODE_IS_SURROGATE(uv)) {
113 4936 50       if (flags & UNICODE_WARN_SURROGATE) {
114 0         Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
115           "UTF-16 surrogate U+%04"UVXf, uv);
116           }
117 4936 50       if (flags & UNICODE_DISALLOW_SURROGATE) {
118           return NULL;
119           }
120           }
121 918942 100       else if (UNICODE_IS_SUPER(uv)) {
122 77488 50       if (flags & UNICODE_WARN_SUPER
123 77488 100       || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
    50        
124           {
125 0         Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
126           "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
127           }
128 77488 50       if (flags & UNICODE_DISALLOW_SUPER
129 77488 100       || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
    50        
130           {
131           return NULL;
132           }
133           }
134 841454 100       else if (UNICODE_IS_NONCHAR(uv)) {
    100        
135 912 50       if (flags & UNICODE_WARN_NONCHAR) {
136 0         Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
137           "Unicode non-character U+%04"UVXf" is illegal for open interchange",
138           uv);
139           }
140 912 50       if (flags & UNICODE_DISALLOW_NONCHAR) {
141           return NULL;
142           }
143           }
144           }
145            
146           #if defined(EBCDIC)
147           {
148           STRLEN len = OFFUNISKIP(uv);
149           U8 *p = d+len-1;
150           while (p > d) {
151           *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
152           uv >>= UTF_ACCUMULATION_SHIFT;
153           }
154           *p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
155           return d+len;
156           }
157           #else /* Non loop style */
158 4563766 100       if (uv < 0x800) {
159 1453792         *d++ = (U8)(( uv >> 6) | 0xc0);
160 1453792         *d++ = (U8)(( uv & 0x3f) | 0x80);
161 1453792         return d;
162           }
163 3109974 100       if (uv < 0x10000) {
164 2810290         *d++ = (U8)(( uv >> 12) | 0xe0);
165 2810290         *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
166 2810290         *d++ = (U8)(( uv & 0x3f) | 0x80);
167 2810290         return d;
168           }
169 299684 100       if (uv < 0x200000) {
170 222796         *d++ = (U8)(( uv >> 18) | 0xf0);
171 222796         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
172 222796         *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
173 222796         *d++ = (U8)(( uv & 0x3f) | 0x80);
174 222796         return d;
175           }
176 76888 100       if (uv < 0x4000000) {
177 32         *d++ = (U8)(( uv >> 24) | 0xf8);
178 32         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
179 32         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
180 32         *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
181 32         *d++ = (U8)(( uv & 0x3f) | 0x80);
182 32         return d;
183           }
184 76856 100       if (uv < 0x80000000) {
185 68         *d++ = (U8)(( uv >> 30) | 0xfc);
186 68         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
187 68         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
188 68         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
189 68         *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
190 68         *d++ = (U8)(( uv & 0x3f) | 0x80);
191 68         return d;
192           }
193           #ifdef HAS_QUAD
194 76788 100       if (uv < UTF8_QUAD_MAX)
195           #endif
196           {
197 34         *d++ = 0xfe; /* Can't match U+FEFF! */
198 34         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
199 34         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
200 34         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
201 34         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
202 34         *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
203 34         *d++ = (U8)(( uv & 0x3f) | 0x80);
204 34         return d;
205           }
206           #ifdef HAS_QUAD
207           {
208 76754         *d++ = 0xff; /* Can't match U+FFFE! */
209 76754         *d++ = 0x80; /* 6 Reserved bits */
210 76754         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
211 76754         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
212 76754         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
213 76754         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
214 76754         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
215 76754         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
216 76754         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
217 76754         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
218 76754         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
219 76754         *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
220 76754         *d++ = (U8)(( uv & 0x3f) | 0x80);
221 4385292         return d;
222           }
223           #endif
224           #endif /* Non loop style */
225           }
226           /*
227           =for apidoc uvchr_to_utf8
228            
229           Adds the UTF-8 representation of the native code point C to the end
230           of the string C; C should have at least C free
231           bytes available. The return value is the pointer to the byte after the
232           end of the new character. In other words,
233            
234           d = uvchr_to_utf8(d, uv);
235            
236           is the recommended wide native character-aware way of saying
237            
238           *(d++) = uv;
239            
240           This function accepts any UV as input. To forbid or warn on non-Unicode code
241           points, or those that may be problematic, see L.
242            
243           =cut
244           */
245            
246           /* This is also a macro */
247           PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
248            
249           U8 *
250 0         Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
251           {
252 0         return uvchr_to_utf8(d, uv);
253           }
254            
255           /*
256           =for apidoc uvchr_to_utf8_flags
257            
258           Adds the UTF-8 representation of the native code point C to the end
259           of the string C; C should have at least C free
260           bytes available. The return value is the pointer to the byte after the
261           end of the new character. In other words,
262            
263           d = uvchr_to_utf8_flags(d, uv, flags);
264            
265           or, in most cases,
266            
267           d = uvchr_to_utf8_flags(d, uv, 0);
268            
269           This is the Unicode-aware way of saying
270            
271           *(d++) = uv;
272            
273           This function will convert to UTF-8 (and not warn) even code points that aren't
274           legal Unicode or are problematic, unless C contains one or more of the
275           following flags:
276            
277           If C is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
278           the function will raise a warning, provided UTF8 warnings are enabled. If instead
279           UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
280           If both flags are set, the function will both warn and return NULL.
281            
282           The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
283           affect how the function handles a Unicode non-character. And likewise, the
284           UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of
285           code points that are
286           above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
287           even less portable) can be warned and/or disallowed even if other above-Unicode
288           code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
289           flags.
290            
291           And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
292           above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
293           DISALLOW flags.
294            
295           =cut
296           */
297            
298           /* This is also a macro */
299           PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
300            
301           U8 *
302 0         Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
303           {
304 0         return uvchr_to_utf8_flags(d, uv, flags);
305           }
306            
307           /*
308            
309           Tests if the first C bytes of string C form a valid UTF-8
310           character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a
311           valid UTF-8 character. The number of bytes in the UTF-8 character
312           will be returned if it is valid, otherwise 0.
313            
314           This is the "slow" version as opposed to the "fast" version which is
315           the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
316           difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
317           or less you should use the IS_UTF8_CHAR(), for lengths of five or more
318           you should use the _slow(). In practice this means that the _slow()
319           will be used very rarely, since the maximum Unicode code point (as of
320           Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
321           the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into
322           five bytes or more.
323            
324           =cut */
325           PERL_STATIC_INLINE STRLEN
326 32         S_is_utf8_char_slow(const U8 *s, const STRLEN len)
327           {
328           dTHX; /* The function called below requires thread context */
329            
330           STRLEN actual_len;
331            
332           PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
333            
334 32         utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY);
335            
336 32 50       return (actual_len == (STRLEN) -1) ? 0 : actual_len;
337           }
338            
339           /*
340           =for apidoc is_utf8_char_buf
341            
342           Returns the number of bytes that comprise the first UTF-8 encoded character in
343           buffer C. C should point to one position beyond the end of the
344           buffer. 0 is returned if C does not point to a complete, valid UTF-8
345           encoded character.
346            
347           Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
348           machines) is a valid UTF-8 character.
349            
350           =cut */
351            
352           STRLEN
353 410108         Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
354           {
355            
356           STRLEN len;
357            
358           PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
359            
360 410108 50       if (buf_end <= buf) {
361           return 0;
362           }
363            
364 410108         len = buf_end - buf;
365 410108 50       if (len > UTF8SKIP(buf)) {
366 0         len = UTF8SKIP(buf);
367           }
368            
369           #ifdef IS_UTF8_CHAR
370 410108 100       if (IS_UTF8_CHAR_FAST(len))
371 410104 100       return IS_UTF8_CHAR(buf, len) ? len : 0;
    50        
    50        
    0        
    50        
    0        
    0        
    50        
    0        
    0        
    100        
    50        
    50        
    50        
    50        
    50        
    50        
    50        
    100        
    50        
    50        
    0        
    100        
    50        
    50        
    50        
    100        
    50        
    100        
    50        
    50        
    50        
    50        
    50        
    100        
372           #endif /* #ifdef IS_UTF8_CHAR */
373 205056         return is_utf8_char_slow(buf, len);
374           }
375            
376           /*
377           =for apidoc is_utf8_char
378            
379           Tests if some arbitrary number of bytes begins in a valid UTF-8
380           character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
381           character is a valid UTF-8 character. The actual number of bytes in the UTF-8
382           character will be returned if it is valid, otherwise 0.
383            
384           This function is deprecated due to the possibility that malformed input could
385           cause reading beyond the end of the input buffer. Use L
386           instead.
387            
388           =cut */
389            
390           STRLEN
391 0         Perl_is_utf8_char(const U8 *s)
392           {
393           PERL_ARGS_ASSERT_IS_UTF8_CHAR;
394            
395           /* Assumes we have enough space, which is why this is deprecated */
396 0         return is_utf8_char_buf(s, s + UTF8SKIP(s));
397           }
398            
399            
400           /*
401           =for apidoc is_utf8_string
402            
403           Returns true if the first C bytes of string C form a valid
404           UTF-8 string, false otherwise. If C is 0, it will be calculated
405           using C (which means if you use this option, that C has to have a
406           terminating NUL byte). Note that all characters being ASCII constitute 'a
407           valid UTF-8 string'.
408            
409           See also L(), L(), and L().
410            
411           =cut
412           */
413            
414           bool
415 32690         Perl_is_utf8_string(const U8 *s, STRLEN len)
416           {
417 32690 100       const U8* const send = s + (len ? len : strlen((const char *)s));
418           const U8* x = s;
419            
420           PERL_ARGS_ASSERT_IS_UTF8_STRING;
421            
422 787575 100       while (x < send) {
423           /* Inline the easy bits of is_utf8_char() here for speed... */
424 738552 100       if (UTF8_IS_INVARIANT(*x)) {
425 442400         x++;
426           }
427           else {
428           /* ... and call is_utf8_char() only if really needed. */
429 296152         const STRLEN c = UTF8SKIP(x);
430 296152         const U8* const next_char_ptr = x + c;
431            
432 296152 100       if (next_char_ptr > send) {
433           return FALSE;
434           }
435            
436 296144 100       if (IS_UTF8_CHAR_FAST(c)) {
437 296116 100       if (!IS_UTF8_CHAR(x, c))
    50        
    50        
    0        
    50        
    0        
    0        
    50        
    0        
    0        
    100        
    50        
    50        
    50        
    50        
    50        
    50        
    50        
    100        
    50        
    50        
    0        
    100        
    50        
    50        
    50        
    100        
    50        
    100        
    50        
    50        
    50        
    50        
    50        
    100        
438           return FALSE;
439           }
440 369278 50       else if (! is_utf8_char_slow(x, c)) {
441           return FALSE;
442           }
443           x = next_char_ptr;
444           }
445           }
446            
447           return TRUE;
448           }
449            
450           /*
451           Implemented as a macro in utf8.h
452            
453           =for apidoc is_utf8_string_loc
454            
455           Like L but stores the location of the failure (in the
456           case of "utf8ness failure") or the location C+C (in the case of
457           "utf8ness success") in the C.
458            
459           See also L() and L().
460            
461           =for apidoc is_utf8_string_loclen
462            
463           Like L() but stores the location of the failure (in the
464           case of "utf8ness failure") or the location C+C (in the case of
465           "utf8ness success") in the C, and the number of UTF-8
466           encoded characters in the C.
467            
468           See also L() and L().
469            
470           =cut
471           */
472            
473           bool
474 2290         Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
475           {
476 2290 50       const U8* const send = s + (len ? len : strlen((const char *)s));
477           const U8* x = s;
478           STRLEN c;
479           STRLEN outlen = 0;
480            
481           PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
482            
483 98919 100       while (x < send) {
484           const U8* next_char_ptr;
485            
486           /* Inline the easy bits of is_utf8_char() here for speed... */
487 95494 100       if (UTF8_IS_INVARIANT(*x))
488 89540         next_char_ptr = x + 1;
489           else {
490           /* ... and call is_utf8_char() only if really needed. */
491 5954         c = UTF8SKIP(x);
492 5954         next_char_ptr = c + x;
493 5954 100       if (next_char_ptr > send) {
494           goto out;
495           }
496 5948 50       if (IS_UTF8_CHAR_FAST(c)) {
497 5948 100       if (!IS_UTF8_CHAR(x, c))
    50        
    50        
    0        
    50        
    0        
    0        
    50        
    0        
    0        
    50        
    50        
    50        
    50        
    0        
    0        
    0        
    0        
    100        
    50        
    50        
    0        
    100        
    50        
    50        
    50        
    50        
    50        
    100        
    50        
    100        
    50        
    50        
    50        
    100        
498           c = 0;
499           } else
500 0         c = is_utf8_char_slow(x, c);
501 5948 100       if (!c)
502           goto out;
503           }
504           x = next_char_ptr;
505 95484         outlen++;
506           }
507            
508           out:
509 2290 50       if (el)
510 0         *el = outlen;
511            
512 2290 50       if (ep)
513 2290         *ep = x;
514 2290         return (x == send);
515           }
516            
517           /*
518            
519           =for apidoc utf8n_to_uvchr
520            
521           THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
522           Most code should use L() rather than call this directly.
523            
524           Bottom level UTF-8 decode routine.
525           Returns the native code point value of the first character in the string C,
526           which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
527           C bytes; C<*retlen> (if C isn't NULL) will be set to
528           the length, in bytes, of that character.
529            
530           The value of C determines the behavior when C does not point to a
531           well-formed UTF-8 character. If C is 0, when a malformation is found,
532           zero is returned and C<*retlen> is set so that (S + C<*retlen>>) is the
533           next possible position in C that could begin a non-malformed character.
534           Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
535            
536           Various ALLOW flags can be set in C to allow (and not warn on)
537           individual types of malformations, such as the sequence being overlong (that
538           is, when there is a shorter sequence that can express the same code point;
539           overlong sequences are expressly forbidden in the UTF-8 standard due to
540           potential security issues). Another malformation example is the first byte of
541           a character not being a legal first byte. See F for the list of such
542           flags. For allowed 0 length strings, this function returns 0; for allowed
543           overlong sequences, the computed code point is returned; for all other allowed
544           malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
545           determinable reasonable value.
546            
547           The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
548           flags) malformation is found. If this flag is set, the routine assumes that
549           the caller will raise a warning, and this function will silently just set
550           C to C<-1> (cast to C) and return zero.
551            
552           Note that this API requires disambiguation between successful decoding a NUL
553           character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
554           in both cases, 0 is returned. To disambiguate, upon a zero return, see if the
555           first byte of C is 0 as well. If so, the input was a NUL; if not, the input
556           had an error.
557            
558           Certain code points are considered problematic. These are Unicode surrogates,
559           Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
560           By default these are considered regular code points, but certain situations
561           warrant special handling for them. If C contains
562           UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
563           malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
564           UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
565           maximum) can be set to disallow these categories individually.
566            
567           The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
568           UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
569           for their respective categories, but otherwise the code points are considered
570           valid (not malformations). To get a category to both be treated as a
571           malformation and raise a warning, specify both the WARN and DISALLOW flags.
572           (But note that warnings are not raised if lexically disabled nor if
573           UTF8_CHECK_ONLY is also specified.)
574            
575           Very large code points (above 0x7FFF_FFFF) are considered more problematic than
576           the others that are above the Unicode legal maximum. There are several
577           reasons: they requre at least 32 bits to represent them on ASCII platforms, are
578           not representable at all on EBCDIC platforms, and the original UTF-8
579           specification never went above this number (the current 0x10FFFF limit was
580           imposed later). (The smaller ones, those that fit into 32 bits, are
581           representable by a UV on ASCII platforms, but not by an IV, which means that
582           the number of operations that can be performed on them is quite restricted.)
583           The UTF-8 encoding on ASCII platforms for these large code points begins with a
584           byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to
585           be treated as malformations, while allowing smaller above-Unicode code points.
586           (Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
587           including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like
588           the other WARN flags, but applies just to these code points.
589            
590           All other code points corresponding to Unicode characters, including private
591           use and those yet to be assigned, are never considered malformed and never
592           warn.
593            
594           =cut
595           */
596            
597           UV
598 16071668         Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
599           {
600           dVAR;
601           const U8 * const s0 = s;
602           U8 overflow_byte = '\0'; /* Save byte in case of overflow */
603           U8 * send;
604 16071668         UV uv = *s;
605           STRLEN expectlen;
606           SV* sv = NULL;
607           UV outlier_ret = 0; /* return value when input is in error or problematic
608           */
609           UV pack_warn = 0; /* Save result of packWARN() for later */
610           bool unexpected_non_continuation = FALSE;
611           bool overflowed = FALSE;
612           bool do_overlong_test = TRUE; /* May have to skip this test */
613            
614           const char* const malformed_text = "Malformed UTF-8 character";
615            
616           PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
617            
618           /* The order of malformation tests here is important. We should consume as
619           * few bytes as possible in order to not skip any valid character. This is
620           * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
621           * http://unicode.org/reports/tr36 for more discussion as to why. For
622           * example, once we've done a UTF8SKIP, we can tell the expected number of
623           * bytes, and could fail right off the bat if the input parameters indicate
624           * that there are too few available. But it could be that just that first
625           * byte is garbled, and the intended character occupies fewer bytes. If we
626           * blindly assumed that the first byte is correct, and skipped based on
627           * that number, we could skip over a valid input character. So instead, we
628           * always examine the sequence byte-by-byte.
629           *
630           * We also should not consume too few bytes, otherwise someone could inject
631           * things. For example, an input could be deliberately designed to
632           * overflow, and if this code bailed out immediately upon discovering that,
633           * returning to the caller *retlen pointing to the very next byte (one
634           * which is actually part of of the overflowing sequence), that could look
635           * legitimate to the caller, which could discard the initial partial
636           * sequence and process the rest, inappropriately */
637            
638           /* Zero length strings, if allowed, of necessity are zero */
639 16071668 100       if (UNLIKELY(curlen == 0)) {
640 10 50       if (retlen) {
641 10         *retlen = 0;
642           }
643            
644 10 100       if (flags & UTF8_ALLOW_EMPTY) {
645           return 0;
646           }
647 8 100       if (! (flags & UTF8_CHECK_ONLY)) {
648 4         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
649           }
650           goto malformed;
651           }
652            
653 16071658         expectlen = UTF8SKIP(s);
654            
655           /* A well-formed UTF-8 character, as the vast majority of calls to this
656           * function will be for, has this expected length. For efficiency, set
657           * things up here to return it. It will be overriden only in those rare
658           * cases where a malformation is found */
659 16071658 100       if (retlen) {
660 13947306         *retlen = expectlen;
661           }
662            
663           /* An invariant is trivially well-formed */
664 16071658 100       if (UTF8_IS_INVARIANT(uv)) {
665           return uv;
666           }
667            
668           /* A continuation character can't start a valid sequence */
669 4409996 100       if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
670 198 100       if (flags & UTF8_ALLOW_CONTINUATION) {
671 2 50       if (retlen) {
672 2         *retlen = 1;
673           }
674           return UNICODE_REPLACEMENT;
675           }
676            
677 196 100       if (! (flags & UTF8_CHECK_ONLY)) {
678 194         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
679           }
680           curlen = 1;
681           goto malformed;
682           }
683            
684           /* Here is not a continuation byte, nor an invariant. The only thing left
685           * is a start byte (possibly for an overlong) */
686            
687           #ifdef EBCDIC
688           uv = NATIVE_UTF8_TO_I8(uv);
689           #endif
690            
691           /* Remove the leading bits that indicate the number of bytes in the
692           * character's whole UTF-8 sequence, leaving just the bits that are part of
693           * the value */
694 4409798 100       uv &= UTF_START_MASK(expectlen);
695            
696           /* Now, loop through the remaining bytes in the character's sequence,
697           * accumulating each into the working value as we go. Be sure to not look
698           * past the end of the input string */
699 4409798         send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
700            
701 12275328 100       for (s = s0 + 1; s < send; s++) {
702 7866060 100       if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
703           #ifndef EBCDIC /* Can't overflow in EBCDIC */
704 7865530 100       if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
705            
706           /* The original implementors viewed this malformation as more
707           * serious than the others (though I, khw, don't understand
708           * why, since other malformations also give very very wrong
709           * results), so there is no way to turn off checking for it.
710           * Set a flag, but keep going in the loop, so that we absorb
711           * the rest of the bytes that comprise the character. */
712           overflowed = TRUE;
713 92         overflow_byte = *s; /* Save for warning message's use */
714           }
715           #endif
716 7865530         uv = UTF8_ACCUMULATE(uv, *s);
717           }
718           else {
719           /* Here, found a non-continuation before processing all expected
720           * bytes. This byte begins a new character, so quit, even if
721           * allowing this malformation. */
722           unexpected_non_continuation = TRUE;
723           break;
724           }
725           } /* End of loop through the character's bytes */
726            
727           /* Save how many bytes were actually in the character */
728 4409798         curlen = s - s0;
729            
730           /* The loop above finds two types of malformations: non-continuation and/or
731           * overflow. The non-continuation malformation is really a too-short
732           * malformation, as it means that the current character ended before it was
733           * expected to (being terminated prematurely by the beginning of the next
734           * character, whereas in the too-short malformation there just are too few
735           * bytes available to hold the character. In both cases, the check below
736           * that we have found the expected number of bytes would fail if executed.)
737           * Thus the non-continuation malformation is really unnecessary, being a
738           * subset of the too-short malformation. But there may be existing
739           * applications that are expecting the non-continuation type, so we retain
740           * it, and return it in preference to the too-short malformation. (If this
741           * code were being written from scratch, the two types might be collapsed
742           * into one.) I, khw, am also giving priority to returning the
743           * non-continuation and too-short malformations over overflow when multiple
744           * ones are present. I don't know of any real reason to prefer one over
745           * the other, except that it seems to me that multiple-byte errors trumps
746           * errors from a single byte */
747 4409798 100       if (UNLIKELY(unexpected_non_continuation)) {
748 530 100       if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
749 526 100       if (! (flags & UTF8_CHECK_ONLY)) {
750 272 100       if (curlen == 1) {
751 254         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
752           }
753           else {
754 18         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
755           }
756           }
757           goto malformed;
758           }
759           uv = UNICODE_REPLACEMENT;
760            
761           /* Skip testing for overlongs, as the REPLACEMENT may not be the same
762           * as what the original expectations were. */
763           do_overlong_test = FALSE;
764 4 50       if (retlen) {
765 4         *retlen = curlen;
766           }
767           }
768 4409268 100       else if (UNLIKELY(curlen < expectlen)) {
769 44 100       if (! (flags & UTF8_ALLOW_SHORT)) {
770 42 100       if (! (flags & UTF8_CHECK_ONLY)) {
771 40 100       sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
772           }
773           goto malformed;
774           }
775           uv = UNICODE_REPLACEMENT;
776           do_overlong_test = FALSE;
777 2 50       if (retlen) {
778 2         *retlen = curlen;
779           }
780           }
781            
782           #ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
783 4409230 100       if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
784 68358 100       && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
785           {
786           /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
787           * generation of the sv, since no warnings are raised under CHECK */
788 150 100       if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
789 60 100       && ckWARN_d(WARN_UTF8))
790           {
791           /* This message is deliberately not of the same syntax as the other
792           * messages for malformations, for backwards compatibility in the
793           * unlikely event that code is relying on its precise earlier text
794           */
795 12         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
796           pack_warn = packWARN(WARN_UTF8);
797           }
798 150 100       if (flags & UTF8_DISALLOW_FE_FF) {
799           goto malformed;
800           }
801           }
802 4409110 100       if (UNLIKELY(overflowed)) {
803            
804           /* If the first byte is FF, it will overflow a 32-bit word. If the
805           * first byte is FE, it will overflow a signed 32-bit word. The
806           * above preserves backward compatibility, since its message was used
807           * in earlier versions of this code in preference to overflow */
808 46         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
809 46         goto malformed;
810           }
811           #endif
812            
813 4409064 100       if (do_overlong_test
814 4409058 100       && expectlen > (STRLEN) OFFUNISKIP(uv)
    100        
    100        
    100        
    100        
    100        
    100        
    100        
815 44 100       && ! (flags & UTF8_ALLOW_LONG))
816           {
817           /* The overlong malformation has lower precedence than the others.
818           * Note that if this malformation is allowed, we return the actual
819           * value, instead of the replacement character. This is because this
820           * value is actually well-defined. */
821 42 100       if (! (flags & UTF8_CHECK_ONLY)) {
822 36 100       sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
    100        
    100        
    100        
    50        
    0        
    0        
    50        
823           }
824           goto malformed;
825           }
826            
827           /* Here, the input is considered to be well-formed , but could be a
828           * problematic code point that is not allowed by the input parameters. */
829 4409022 100       if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
830 685448 100       && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
831           |UTF8_WARN_ILLEGAL_INTERCHANGE)))
832           {
833 7866 100       if (UNICODE_IS_SURROGATE(uv)) {
834 64 100       if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
835 20 100       && ckWARN2_d(WARN_UTF8, WARN_SURROGATE))
836           {
837 8         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
838           pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE);
839           }
840 64 100       if (flags & UTF8_DISALLOW_SURROGATE) {
841           goto disallowed;
842           }
843           }
844 7802 100       else if ((uv > PERL_UNICODE_MAX)) {
845 86 100       if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
846 20 100       && ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE))
847           {
848 8         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
849           pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE);
850           }
851 86 100       if (flags & UTF8_DISALLOW_SUPER) {
852           goto disallowed;
853           }
854           }
855 7716 100       else if (UNICODE_IS_NONCHAR(uv)) {
    100        
856 52 100       if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
857 20 100       && ckWARN2_d(WARN_UTF8, WARN_NONCHAR))
858           {
859 8         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
860           pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR);
861           }
862 52 100       if (flags & UTF8_DISALLOW_NONCHAR) {
863           goto disallowed;
864           }
865           }
866            
867 7718 100       if (sv) {
868           outlier_ret = uv; /* Note we don't bother to convert to native,
869           as all the outlier code points are the same
870           in both ASCII and EBCDIC */
871           goto do_warn;
872           }
873            
874           /* Here, this is not considered a malformed character, so drop through
875           * to return it */
876           }
877            
878           return UNI_TO_NATIVE(uv);
879            
880           /* There are three cases which get to beyond this point. In all 3 cases:
881           * if not null points to a string to print as a warning.
882           * is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
883           * set.
884           * is what return value to use if UTF8_CHECK_ONLY isn't set.
885           * This is done by initializing it to 0, and changing it only
886           * for case 1).
887           * The 3 cases are:
888           * 1) The input is valid but problematic, and to be warned about. The
889           * return value is the resultant code point; <*retlen> is set to
890           * , the number of bytes that comprise the code point.
891           * contains the result of packWARN() for the warning
892           * types. The entry point for this case is the label ;
893           * 2) The input is a valid code point but disallowed by the parameters to
894           * this function. The return value is 0. If UTF8_CHECK_ONLY is set,
895           * <*relen> is -1; otherwise it is , the number of bytes that
896           * comprise the code point. contains the result of
897           * packWARN() for the warning types. The entry point for this case is
898           * the label .
899           * 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
900           * is set, <*relen> is -1; otherwise it is , the number of
901           * bytes that comprise the malformation. All such malformations are
902           * assumed to be warning type . The entry point for this case
903           * is the label .
904           */
905            
906           malformed:
907            
908 980 100       if (sv && ckWARN_d(WARN_UTF8)) {
    100        
909           pack_warn = packWARN(WARN_UTF8);
910           }
911            
912           disallowed:
913            
914 1128 100       if (flags & UTF8_CHECK_ONLY) {
915 438 50       if (retlen)
916 438         *retlen = ((STRLEN) -1);
917           return 0;
918           }
919            
920           do_warn:
921            
922 706 100       if (pack_warn) { /* was initialized to 0, and changed only
923           if warnings are to be raised. */
924 562         const char * const string = SvPVX_const(sv);
925            
926 562 100       if (PL_op)
927 548 50       Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
    0        
928           else
929 14         Perl_warner(aTHX_ pack_warn, "%s", string);
930           }
931            
932 706 100       if (retlen) {
933 8036132         *retlen = curlen;
934           }
935            
936           return outlier_ret;
937           }
938            
939           /*
940           =for apidoc utf8_to_uvchr_buf
941            
942           Returns the native code point of the first character in the string C which
943           is assumed to be in UTF-8 encoding; C points to 1 beyond the end of C.
944           C<*retlen> will be set to the length, in bytes, of that character.
945            
946           If C does not point to a well-formed UTF-8 character and UTF8 warnings are
947           enabled, zero is returned and C<*retlen> is set (if C isn't
948           NULL) to -1. If those warnings are off, the computed value, if well-defined
949           (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
950           C<*retlen> is set (if C isn't NULL) so that (S + C<*retlen>>) is
951           the next possible position in C that could begin a non-malformed character.
952           See L for details on when the REPLACEMENT CHARACTER is
953           returned.
954            
955           =cut
956           */
957            
958            
959           UV
960 0         Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
961           {
962           assert(s < send);
963            
964 0 0       return utf8n_to_uvchr(s, send - s, retlen,
965           ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
966           }
967            
968           /* Like L(), but should only be called when it is known that
969           * there are no malformations in the input UTF-8 string C. surrogates,
970           * non-character code points, and non-Unicode code points are allowed. A macro
971           * in utf8.h is used to normally avoid this function wrapper */
972            
973           UV
974 6448256         Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
975           {
976 6448256         UV expectlen = UTF8SKIP(s);
977 6448256         const U8* send = s + expectlen;
978 6448256         UV uv = *s;
979            
980           PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
981            
982 6448256 100       if (retlen) {
983 414938         *retlen = expectlen;
984           }
985            
986           /* An invariant is trivially returned */
987 6448256 100       if (expectlen == 1) {
988           return uv;
989           }
990            
991           #ifdef EBCDIC
992           uv = NATIVE_UTF8_TO_I8(uv);
993           #endif
994            
995           /* Remove the leading bits that indicate the number of bytes, leaving just
996           * the bits that are part of the value */
997 4289160 100       uv &= UTF_START_MASK(expectlen);
998            
999           /* Now, loop through the remaining bytes, accumulating each into the
1000           * working total as we go. (I khw tried unrolling the loop for up to 4
1001           * bytes, but there was no performance improvement) */
1002 11936924 100       for (++s; s < send; s++) {
1003 6568216         uv = UTF8_ACCUMULATE(uv, *s);
1004           }
1005            
1006           return UNI_TO_NATIVE(uv);
1007            
1008           }
1009            
1010           /*
1011           =for apidoc utf8_to_uvchr
1012            
1013           Returns the native code point of the first character in the string C
1014           which is assumed to be in UTF-8 encoding; C will be set to the
1015           length, in bytes, of that character.
1016            
1017           Some, but not all, UTF-8 malformations are detected, and in fact, some
1018           malformed input could cause reading beyond the end of the input buffer, which
1019           is why this function is deprecated. Use L instead.
1020            
1021           If C points to one of the detected malformations, and UTF8 warnings are
1022           enabled, zero is returned and C<*retlen> is set (if C isn't
1023           NULL) to -1. If those warnings are off, the computed value if well-defined (or
1024           the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1025           is set (if C isn't NULL) so that (S + C<*retlen>>) is the
1026           next possible position in C that could begin a non-malformed character.
1027           See L for details on when the REPLACEMENT CHARACTER is returned.
1028            
1029           =cut
1030           */
1031            
1032           UV
1033 0         Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
1034           {
1035           PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
1036            
1037 0 0       return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
1038           }
1039            
1040           /*
1041           =for apidoc utf8_to_uvuni_buf
1042            
1043           Only in very rare circumstances should code need to be dealing in Unicode
1044           (as opposed to native) code points. In those few cases, use
1045           C> instead.
1046            
1047           Returns the Unicode (not-native) code point of the first character in the
1048           string C which
1049           is assumed to be in UTF-8 encoding; C points to 1 beyond the end of C.
1050           C will be set to the length, in bytes, of that character.
1051            
1052           If C does not point to a well-formed UTF-8 character and UTF8 warnings are
1053           enabled, zero is returned and C<*retlen> is set (if C isn't
1054           NULL) to -1. If those warnings are off, the computed value if well-defined (or
1055           the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1056           is set (if C isn't NULL) so that (S + C<*retlen>>) is the
1057           next possible position in C that could begin a non-malformed character.
1058           See L for details on when the REPLACEMENT CHARACTER is returned.
1059            
1060           =cut
1061           */
1062            
1063           UV
1064 0         Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1065           {
1066           PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1067            
1068           assert(send > s);
1069            
1070           /* Call the low level routine asking for checks */
1071 0 0       return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
1072           ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
1073           }
1074            
1075           /* DEPRECATED!
1076           * Like L(), but should only be called when it is known that
1077           * there are no malformations in the input UTF-8 string C. Surrogates,
1078           * non-character code points, and non-Unicode code points are allowed */
1079            
1080           UV
1081 0         Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
1082           {
1083           PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
1084            
1085 0         return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
1086           }
1087            
1088           /*
1089           =for apidoc utf8_to_uvuni
1090            
1091           Returns the Unicode code point of the first character in the string C
1092           which is assumed to be in UTF-8 encoding; C will be set to the
1093           length, in bytes, of that character.
1094            
1095           Some, but not all, UTF-8 malformations are detected, and in fact, some
1096           malformed input could cause reading beyond the end of the input buffer, which
1097           is one reason why this function is deprecated. The other is that only in
1098           extremely limited circumstances should the Unicode versus native code point be
1099           of any interest to you. See L for alternatives.
1100            
1101           If C points to one of the detected malformations, and UTF8 warnings are
1102           enabled, zero is returned and C<*retlen> is set (if C doesn't point to
1103           NULL) to -1. If those warnings are off, the computed value if well-defined (or
1104           the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1105           is set (if C isn't NULL) so that (S + C<*retlen>>) is the
1106           next possible position in C that could begin a non-malformed character.
1107           See L for details on when the REPLACEMENT CHARACTER is returned.
1108            
1109           =cut
1110           */
1111            
1112           UV
1113 0         Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
1114           {
1115           PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
1116            
1117 0         return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
1118           }
1119            
1120           /*
1121           =for apidoc utf8_length
1122            
1123           Return the length of the UTF-8 char encoded string C in characters.
1124           Stops at C (inclusive). If C s> or if the scan would end
1125           up past C, croaks.
1126            
1127           =cut
1128           */
1129            
1130           STRLEN
1131 7119006         Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
1132           {
1133           dVAR;
1134           STRLEN len = 0;
1135            
1136           PERL_ARGS_ASSERT_UTF8_LENGTH;
1137            
1138           /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1139           * the bitops (especially ~) can create illegal UTF-8.
1140           * In other words: in Perl UTF-8 is not just for Unicode. */
1141            
1142 7119006 50       if (e < s)
1143           goto warn_and_return;
1144 629958130 100       while (s < e) {
1145 622839124         s += UTF8SKIP(s);
1146 622839124         len++;
1147           }
1148            
1149 7119006 100       if (e != s) {
1150 32         len--;
1151           warn_and_return:
1152 32 50       if (PL_op)
1153 48 50       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1154 16 0       "%s in %s", unees, OP_DESC(PL_op));
1155           else
1156 0         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1157           }
1158            
1159 7119006         return len;
1160           }
1161            
1162           /*
1163           =for apidoc utf8_distance
1164            
1165           Returns the number of UTF-8 characters between the UTF-8 pointers C
1166           and C.
1167            
1168           WARNING: use only if you *know* that the pointers point inside the
1169           same UTF-8 buffer.
1170            
1171           =cut
1172           */
1173            
1174           IV
1175 377416         Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1176           {
1177           PERL_ARGS_ASSERT_UTF8_DISTANCE;
1178            
1179 377416 50       return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1180           }
1181            
1182           /*
1183           =for apidoc utf8_hop
1184            
1185           Return the UTF-8 pointer C displaced by C characters, either
1186           forward or backward.
1187            
1188           WARNING: do not use the following unless you *know* C is within
1189           the UTF-8 data pointed to by C *and* that on entry C is aligned
1190           on the first byte of character or just after the last byte of a character.
1191            
1192           =cut
1193           */
1194            
1195           U8 *
1196 61814         Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
1197           {
1198           PERL_ARGS_ASSERT_UTF8_HOP;
1199            
1200           PERL_UNUSED_CONTEXT;
1201           /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1202           * the bitops (especially ~) can create illegal UTF-8.
1203           * In other words: in Perl UTF-8 is not just for Unicode. */
1204            
1205 61814 100       if (off >= 0) {
1206 1224442 100       while (off--)
1207 1164038         s += UTF8SKIP(s);
1208           }
1209           else {
1210 3990 100       while (off++) {
1211 2580         s--;
1212 5085 100       while (UTF8_IS_CONTINUATION(*s))
1213 510         s--;
1214           }
1215           }
1216 61814         return (U8 *)s;
1217           }
1218            
1219           /*
1220           =for apidoc bytes_cmp_utf8
1221            
1222           Compares the sequence of characters (stored as octets) in C, C with the
1223           sequence of characters (stored as UTF-8) in C, C. Returns 0 if they are
1224           equal, -1 or -2 if the first string is less than the second string, +1 or +2
1225           if the first string is greater than the second string.
1226            
1227           -1 or +1 is returned if the shorter string was identical to the start of the
1228           longer string. -2 or +2 is returned if the was a difference between characters
1229           within the strings.
1230            
1231           =cut
1232           */
1233            
1234           int
1235 273870         Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1236           {
1237 273870         const U8 *const bend = b + blen;
1238 273870         const U8 *const uend = u + ulen;
1239            
1240           PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1241            
1242           PERL_UNUSED_CONTEXT;
1243            
1244 4034843 100       while (b < bend && u < uend) {
1245 3855934         U8 c = *u++;
1246 3855934 100       if (!UTF8_IS_INVARIANT(c)) {
1247 185108 100       if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1248 118226 50       if (u < uend) {
1249 118226         U8 c1 = *u++;
1250 118226 50       if (UTF8_IS_CONTINUATION(c1)) {
1251 118226         c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
1252           } else {
1253 0 0       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
    0        
1254           "Malformed UTF-8 character "
1255           "(unexpected non-continuation byte 0x%02x"
1256           ", immediately after start byte 0x%02x)"
1257           /* Dear diag.t, it's in the pod. */
1258           "%s%s", c1, c,
1259 0         PL_op ? " in " : "",
1260 0 0       PL_op ? OP_DESC(PL_op) : "");
    0        
1261 0         return -2;
1262           }
1263           } else {
1264 0 0       if (PL_op)
1265 0 0       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1266 0 0       "%s in %s", unees, OP_DESC(PL_op));
1267           else
1268 0         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1269           return -2; /* Really want to return undef :-) */
1270           }
1271           } else {
1272           return -2;
1273           }
1274           }
1275 3789052 100       if (*b != c) {
1276 165014 100       return *b < c ? -2 : +2;
1277           }
1278 3624038         ++b;
1279           }
1280            
1281 41974 100       if (b == bend && u == uend)
1282           return 0;
1283            
1284 137293 100       return b < bend ? +1 : -1;
1285           }
1286            
1287           /*
1288           =for apidoc utf8_to_bytes
1289            
1290           Converts a string C of length C from UTF-8 into native byte encoding.
1291           Unlike L, this over-writes the original string, and
1292           updates C to contain the new length.
1293           Returns zero on failure, setting C to -1.
1294            
1295           If you need a copy of the string, see L.
1296            
1297           =cut
1298           */
1299            
1300           U8 *
1301 170464         Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
1302           {
1303           U8 * const save = s;
1304 170464         U8 * const send = s + *len;
1305           U8 *d;
1306            
1307           PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1308            
1309           /* ensure valid UTF-8 and chars < 256 before updating string */
1310 1013490 100       while (s < send) {
1311 758560 100       if (! UTF8_IS_INVARIANT(*s)) {
1312 121154 100       if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
    50        
    50        
1313 766         *len = ((STRLEN) -1);
1314 766         return 0;
1315           }
1316 120388         s++;
1317           }
1318 757794         s++;
1319           }
1320            
1321           d = s = save;
1322 924844 100       while (s < send) {
1323 755146         U8 c = *s++;
1324 755146 100       if (! UTF8_IS_INVARIANT(c)) {
1325           /* Then it is two-byte encoded */
1326 120116         c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1327 120116         s++;
1328           }
1329 755146         *d++ = c;
1330           }
1331 169698         *d = '\0';
1332 169698         *len = d - save;
1333 170081         return save;
1334           }
1335            
1336           /*
1337           =for apidoc bytes_from_utf8
1338            
1339           Converts a string C of length C from UTF-8 into native byte encoding.
1340           Unlike L but like L, returns a pointer to
1341           the newly-created string, and updates C to contain the new
1342           length. Returns the original string if no conversion occurs, C
1343           is unchanged. Do nothing if C points to 0. Sets C to
1344           0 if C is converted or consisted entirely of characters that are invariant
1345           in utf8 (i.e., US-ASCII on non-EBCDIC machines).
1346            
1347           =cut
1348           */
1349            
1350           U8 *
1351 2950312         Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
1352           {
1353           U8 *d;
1354           const U8 *start = s;
1355           const U8 *send;
1356           I32 count = 0;
1357            
1358           PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1359            
1360           PERL_UNUSED_CONTEXT;
1361 2950312 50       if (!*is_utf8)
1362           return (U8 *)start;
1363            
1364           /* ensure valid UTF-8 and chars < 256 before converting string */
1365 10013178 100       for (send = s + *len; s < send;) {
1366 7202750 100       if (! UTF8_IS_INVARIANT(*s)) {
1367 1623176 100       if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
    50        
    50        
1368           return (U8 *)start;
1369           }
1370 8136         count++;
1371 8136         s++;
1372           }
1373 5587710         s++;
1374           }
1375            
1376 1335272         *is_utf8 = FALSE;
1377            
1378 1335272         Newx(d, (*len) - count + 1, U8);
1379           s = start; start = d;
1380 7188024 100       while (s < send) {
1381 5185116         U8 c = *s++;
1382 5185116 100       if (! UTF8_IS_INVARIANT(c)) {
1383           /* Then it is two-byte encoded */
1384 8074         c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1385 8074         s++;
1386           }
1387 5185116         *d++ = c;
1388           }
1389 1335272         *d = '\0';
1390 1335272         *len = d - start;
1391 2142792         return (U8 *)start;
1392           }
1393            
1394           /*
1395           =for apidoc bytes_to_utf8
1396            
1397           Converts a string C of length C bytes from the native encoding into
1398           UTF-8.
1399           Returns a pointer to the newly-created string, and sets C to
1400           reflect the new length in bytes.
1401            
1402           A NUL character will be written after the end of the string.
1403            
1404           If you want to convert to UTF-8 from encodings other than
1405           the native (Latin1 or EBCDIC),
1406           see L().
1407            
1408           =cut
1409           */
1410            
1411           /* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1412           likewise need duplication. */
1413            
1414           U8*
1415 213714         Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
1416           {
1417 213714         const U8 * const send = s + (*len);
1418           U8 *d;
1419           U8 *dst;
1420            
1421           PERL_ARGS_ASSERT_BYTES_TO_UTF8;
1422           PERL_UNUSED_CONTEXT;
1423            
1424 213714         Newx(d, (*len) * 2 + 1, U8);
1425           dst = d;
1426            
1427 12019571 100       while (s < send) {
1428 11699000         append_utf8_from_native_byte(*s, &d);
1429 11699000         s++;
1430           }
1431 213714         *d = '\0';
1432 213714         *len = d-dst;
1433 213714         return dst;
1434           }
1435            
1436           /*
1437           * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
1438           *
1439           * Destination must be pre-extended to 3/2 source. Do not use in-place.
1440           * We optimize for native, for obvious reasons. */
1441            
1442           U8*
1443 28666         Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1444           {
1445           U8* pend;
1446           U8* dstart = d;
1447            
1448           PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1449            
1450 28666 100       if (bytelen & 1)
1451 6         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
1452            
1453 28660         pend = p + bytelen;
1454            
1455 731254 100       while (p < pend) {
1456 688286         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1457 688286         p += 2;
1458 688286 100       if (UNI_IS_INVARIANT(uv)) {
1459 680068         *d++ = LATIN1_TO_NATIVE((U8) uv);
1460 680068         continue;
1461           }
1462 8218 100       if (uv <= MAX_UTF8_TWO_BYTE) {
1463 120         *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1464 120         *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
1465 120         continue;
1466           }
1467           #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1468           #define LAST_HIGH_SURROGATE 0xDBFF
1469           #define FIRST_LOW_SURROGATE 0xDC00
1470           #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
1471 8098 100       if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
1472 7962 100       if (p >= pend) {
1473 2         Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1474           } else {
1475 7960         UV low = (p[0] << 8) + p[1];
1476 7960         p += 2;
1477 7960 100       if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE)
1478 8         Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1479 11928         uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1480 7952         + (low - FIRST_LOW_SURROGATE) + 0x10000;
1481           }
1482 136 100       } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
1483 12         Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1484           }
1485           #ifdef EBCDIC
1486           d = uvoffuni_to_utf8_flags(d, uv, 0);
1487           #else
1488 8076 100       if (uv < 0x10000) {
1489 124         *d++ = (U8)(( uv >> 12) | 0xe0);
1490 124         *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1491 124         *d++ = (U8)(( uv & 0x3f) | 0x80);
1492 124         continue;
1493           }
1494           else {
1495 7952         *d++ = (U8)(( uv >> 18) | 0xf0);
1496 7952         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1497 7952         *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1498 7952         *d++ = (U8)(( uv & 0x3f) | 0x80);
1499 348108         continue;
1500           }
1501           #endif
1502           }
1503 28638         *newlen = d - dstart;
1504 28638         return d;
1505           }
1506            
1507           /* Note: this one is slightly destructive of the source. */
1508            
1509           U8*
1510 14356         Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1511           {
1512           U8* s = (U8*)p;
1513 14356         U8* const send = s + bytelen;
1514            
1515           PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1516            
1517 14356 100       if (bytelen & 1)
1518 2         Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1519           (UV)bytelen);
1520            
1521 362482 100       while (s < send) {
1522 348128         const U8 tmp = s[0];
1523 348128         s[0] = s[1];
1524 348128         s[1] = tmp;
1525 348128         s += 2;
1526           }
1527 14354         return utf16_to_utf8(p, d, bytelen, newlen);
1528           }
1529            
1530           bool
1531 836         Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1532           {
1533           U8 tmpbuf[UTF8_MAXBYTES+1];
1534 836         uvchr_to_utf8(tmpbuf, c);
1535 836         return _is_utf8_FOO(classnum, tmpbuf);
1536           }
1537            
1538           /* for now these are all defined (inefficiently) in terms of the utf8 versions.
1539           * Note that the macros in handy.h that call these short-circuit calling them
1540           * for Latin-1 range inputs */
1541            
1542           bool
1543 0         Perl_is_uni_alnum(pTHX_ UV c)
1544           {
1545           U8 tmpbuf[UTF8_MAXBYTES+1];
1546 0         uvchr_to_utf8(tmpbuf, c);
1547 0         return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf);
1548           }
1549            
1550           bool
1551 0         Perl_is_uni_alnumc(pTHX_ UV c)
1552           {
1553           U8 tmpbuf[UTF8_MAXBYTES+1];
1554 0         uvchr_to_utf8(tmpbuf, c);
1555 0         return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf);
1556           }
1557            
1558           /* Internal function so we can deprecate the external one, and call
1559           this one from other deprecated functions in this file */
1560            
1561           PERL_STATIC_INLINE bool
1562           S_is_utf8_idfirst(pTHX_ const U8 *p)
1563           {
1564           dVAR;
1565            
1566 0         if (*p == '_')
1567           return TRUE;
1568           /* is_utf8_idstart would be more logical. */
1569 0         return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1570           }
1571            
1572           bool
1573 0         Perl_is_uni_idfirst(pTHX_ UV c)
1574           {
1575           U8 tmpbuf[UTF8_MAXBYTES+1];
1576 0         uvchr_to_utf8(tmpbuf, c);
1577 0         return S_is_utf8_idfirst(aTHX_ tmpbuf);
1578           }
1579            
1580           bool
1581 68         Perl__is_uni_perl_idcont(pTHX_ UV c)
1582           {
1583           U8 tmpbuf[UTF8_MAXBYTES+1];
1584 68         uvchr_to_utf8(tmpbuf, c);
1585 68         return _is_utf8_perl_idcont(tmpbuf);
1586           }
1587            
1588           bool
1589 60         Perl__is_uni_perl_idstart(pTHX_ UV c)
1590           {
1591           U8 tmpbuf[UTF8_MAXBYTES+1];
1592 60         uvchr_to_utf8(tmpbuf, c);
1593 60         return _is_utf8_perl_idstart(tmpbuf);
1594           }
1595            
1596           bool
1597 0         Perl_is_uni_alpha(pTHX_ UV c)
1598           {
1599           U8 tmpbuf[UTF8_MAXBYTES+1];
1600 0         uvchr_to_utf8(tmpbuf, c);
1601 0         return _is_utf8_FOO(_CC_ALPHA, tmpbuf);
1602           }
1603            
1604           bool
1605 0         Perl_is_uni_ascii(pTHX_ UV c)
1606           {
1607 0         return isASCII(c);
1608           }
1609            
1610           bool
1611 0         Perl_is_uni_blank(pTHX_ UV c)
1612           {
1613 0 0       return isBLANK_uni(c);
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
1614           }
1615            
1616           bool
1617 0         Perl_is_uni_space(pTHX_ UV c)
1618           {
1619 0 0       return isSPACE_uni(c);
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
1620           }
1621            
1622           bool
1623 0         Perl_is_uni_digit(pTHX_ UV c)
1624           {
1625           U8 tmpbuf[UTF8_MAXBYTES+1];
1626 0         uvchr_to_utf8(tmpbuf, c);
1627 0         return _is_utf8_FOO(_CC_DIGIT, tmpbuf);
1628           }
1629            
1630           bool
1631 0         Perl_is_uni_upper(pTHX_ UV c)
1632           {
1633           U8 tmpbuf[UTF8_MAXBYTES+1];
1634 0         uvchr_to_utf8(tmpbuf, c);
1635 0         return _is_utf8_FOO(_CC_UPPER, tmpbuf);
1636           }
1637            
1638           bool
1639 0         Perl_is_uni_lower(pTHX_ UV c)
1640           {
1641           U8 tmpbuf[UTF8_MAXBYTES+1];
1642 0         uvchr_to_utf8(tmpbuf, c);
1643 0         return _is_utf8_FOO(_CC_LOWER, tmpbuf);
1644           }
1645            
1646           bool
1647 0         Perl_is_uni_cntrl(pTHX_ UV c)
1648           {
1649 0 0       return isCNTRL_L1(c);
    0        
1650           }
1651            
1652           bool
1653 0         Perl_is_uni_graph(pTHX_ UV c)
1654           {
1655           U8 tmpbuf[UTF8_MAXBYTES+1];
1656 0         uvchr_to_utf8(tmpbuf, c);
1657 0         return _is_utf8_FOO(_CC_GRAPH, tmpbuf);
1658           }
1659            
1660           bool
1661 0         Perl_is_uni_print(pTHX_ UV c)
1662           {
1663           U8 tmpbuf[UTF8_MAXBYTES+1];
1664 0         uvchr_to_utf8(tmpbuf, c);
1665 0         return _is_utf8_FOO(_CC_PRINT, tmpbuf);
1666           }
1667            
1668           bool
1669 0         Perl_is_uni_punct(pTHX_ UV c)
1670           {
1671           U8 tmpbuf[UTF8_MAXBYTES+1];
1672 0         uvchr_to_utf8(tmpbuf, c);
1673 0         return _is_utf8_FOO(_CC_PUNCT, tmpbuf);
1674           }
1675            
1676           bool
1677 0         Perl_is_uni_xdigit(pTHX_ UV c)
1678           {
1679 0 0       return isXDIGIT_uni(c);
    0        
    0        
    0        
    0        
    0        
1680           }
1681            
1682           UV
1683 62676         Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1684           {
1685           /* We have the latin1-range values compiled into the core, so just use
1686           * those, converting the result to utf8. The only difference between upper
1687           * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1688           * either "SS" or "Ss". Which one to use is passed into the routine in
1689           * 'S_or_s' to avoid a test */
1690            
1691 62676         UV converted = toUPPER_LATIN1_MOD(c);
1692            
1693           PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1694            
1695           assert(S_or_s == 'S' || S_or_s == 's');
1696            
1697 62676 100       if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for
1698           characters in this range */
1699 32292         *p = (U8) converted;
1700 32292         *lenp = 1;
1701 32292         return converted;
1702           }
1703            
1704           /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1705           * which it maps to one of them, so as to only have to have one check for
1706           * it in the main case */
1707 30384 100       if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1708 204         switch (c) {
1709           case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1710           converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1711           break;
1712           case MICRO_SIGN:
1713           converted = GREEK_CAPITAL_LETTER_MU;
1714 64         break;
1715           case LATIN_SMALL_LETTER_SHARP_S:
1716 72         *(p)++ = 'S';
1717 72         *p = S_or_s;
1718 72         *lenp = 2;
1719 72         return 'S';
1720           default:
1721 0         Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
1722           assert(0); /* NOTREACHED */
1723           }
1724           }
1725            
1726 30312         *(p)++ = UTF8_TWO_BYTE_HI(converted);
1727 30312         *p = UTF8_TWO_BYTE_LO(converted);
1728 30312         *lenp = 2;
1729            
1730 46494         return converted;
1731           }
1732            
1733           /* Call the function to convert a UTF-8 encoded character to the specified case.
1734           * Note that there may be more than one character in the result.
1735           * INP is a pointer to the first byte of the input character
1736           * OUTP will be set to the first byte of the string of changed characters. It
1737           * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1738           * LENP will be set to the length in bytes of the string of changed characters
1739           *
1740           * The functions return the ordinal of the first character in the string of OUTP */
1741           #define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc")
1742           #define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc")
1743           #define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc")
1744            
1745           /* This additionally has the input parameter SPECIALS, which if non-zero will
1746           * cause this to use the SPECIALS hash for folding (meaning get full case
1747           * folding); otherwise, when zero, this implies a simple case fold */
1748           #define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL)
1749            
1750           UV
1751 546         Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1752           {
1753           dVAR;
1754            
1755           /* Convert the Unicode character whose ordinal is to its uppercase
1756           * version and store that in UTF-8 in

and its length in bytes in .

1757           * Note that the

needs to be at least UTF8_MAXBYTES_CASE+1 bytes since

1758           * the changed version may be longer than the original character.
1759           *
1760           * The ordinal of the first character of the changed version is returned
1761           * (but note, as explained above, that there may be more.) */
1762            
1763           PERL_ARGS_ASSERT_TO_UNI_UPPER;
1764            
1765 546 100       if (c < 256) {
1766 512         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1767           }
1768            
1769 34         uvchr_to_utf8(p, c);
1770 290         return CALL_UPPER_CASE(p, p, lenp);
1771           }
1772            
1773           UV
1774 546         Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1775           {
1776           dVAR;
1777            
1778           PERL_ARGS_ASSERT_TO_UNI_TITLE;
1779            
1780 546 100       if (c < 256) {
1781 512         return _to_upper_title_latin1((U8) c, p, lenp, 's');
1782           }
1783            
1784 34         uvchr_to_utf8(p, c);
1785 290         return CALL_TITLE_CASE(p, p, lenp);
1786           }
1787            
1788           STATIC U8
1789           S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
1790           {
1791           /* We have the latin1-range values compiled into the core, so just use
1792           * those, converting the result to utf8. Since the result is always just
1793           * one character, we allow

to be NULL */

1794            
1795 60176         U8 converted = toLOWER_LATIN1(c);
1796            
1797 60176 50       if (p != NULL) {
    50        
    50        
1798 60176 50       if (NATIVE_IS_INVARIANT(converted)) {
    50        
    100        
1799 31466         *p = converted;
1800 31466         *lenp = 1;
1801           }
1802           else {
1803 28710         *p = UTF8_TWO_BYTE_HI(converted);
1804 28710         *(p+1) = UTF8_TWO_BYTE_LO(converted);
1805 28710         *lenp = 2;
1806           }
1807           }
1808           return converted;
1809           }
1810            
1811           UV
1812 530         Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1813           {
1814           dVAR;
1815            
1816           PERL_ARGS_ASSERT_TO_UNI_LOWER;
1817            
1818 530 100       if (c < 256) {
1819 768         return to_lower_latin1((U8) c, p, lenp);
1820           }
1821            
1822 18         uvchr_to_utf8(p, c);
1823 274         return CALL_LOWER_CASE(p, p, lenp);
1824           }
1825            
1826           UV
1827 3027170         Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
1828           {
1829           /* Corresponds to to_lower_latin1(); bits meanings:
1830           * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1831           * FOLD_FLAGS_FULL iff full folding is to be used;
1832           *
1833           * Not to be used for locale folds
1834           */
1835            
1836           UV converted;
1837            
1838           PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1839            
1840           assert (! (flags & FOLD_FLAGS_LOCALE));
1841            
1842 3027170 100       if (c == MICRO_SIGN) {
1843           converted = GREEK_SMALL_LETTER_MU;
1844           }
1845 2986918 100       else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
1846            
1847           /* If can't cross 127/128 boundary, can't return "ss"; instead return
1848           * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
1849           * under those circumstances. */
1850 446202 100       if (flags & FOLD_FLAGS_NOMIX_ASCII) {
1851 70840         *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
1852 70840         Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1853           p, *lenp, U8);
1854 70840         return LATIN_SMALL_LETTER_LONG_S;
1855           }
1856           else {
1857 375362         *(p)++ = 's';
1858 375362         *p = 's';
1859 375362         *lenp = 2;
1860 375362         return 's';
1861           }
1862           }
1863           else { /* In this range the fold of all other characters is their lower
1864           case */
1865 2540716         converted = toLOWER_LATIN1(c);
1866           }
1867            
1868 2580968 100       if (NATIVE_IS_INVARIANT(converted)) {
1869 2366680         *p = (U8) converted;
1870 2366680         *lenp = 1;
1871           }
1872           else {
1873 214288         *(p)++ = UTF8_TWO_BYTE_HI(converted);
1874 214288         *p = UTF8_TWO_BYTE_LO(converted);
1875 1620729         *lenp = 2;
1876           }
1877            
1878           return converted;
1879           }
1880            
1881           UV
1882 4383144         Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
1883           {
1884            
1885           /* Not currently externally documented, and subject to change
1886           * bits meanings:
1887           * FOLD_FLAGS_FULL iff full folding is to be used;
1888           * FOLD_FLAGS_LOCALE iff in locale
1889           * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1890           */
1891            
1892           PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
1893            
1894 4383144 100       if (c < 256) {
1895 2765492         UV result = _to_fold_latin1((U8) c, p, lenp,
1896           flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
1897           /* It is illegal for the fold to cross the 255/256 boundary under
1898           * locale; in this case return the original */
1899 1434954 50       return (result > 256 && flags & FOLD_FLAGS_LOCALE)
1900           ? c
1901 2765492 100       : result;
1902           }
1903            
1904           /* If no special needs, just use the macro */
1905 1617652 100       if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
1906 981148         uvchr_to_utf8(p, c);
1907 981148 100       return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
1908           }
1909           else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
1910           the special flags. */
1911           U8 utf8_c[UTF8_MAXBYTES + 1];
1912 636504         uvchr_to_utf8(utf8_c, c);
1913 2509824         return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL);
1914           }
1915           }
1916            
1917           bool
1918 0         Perl_is_uni_alnum_lc(pTHX_ UV c)
1919           {
1920 0 0       if (c < 256) {
1921 0 0       return isALNUM_LC(c);
    0        
    0        
1922           }
1923 0         return _is_uni_FOO(_CC_WORDCHAR, c);
1924           }
1925            
1926           bool
1927 0         Perl_is_uni_alnumc_lc(pTHX_ UV c)
1928           {
1929 0 0       if (c < 256) {
1930 0 0       return isALPHANUMERIC_LC(c);
    0        
1931           }
1932 0         return _is_uni_FOO(_CC_ALPHANUMERIC, c);
1933           }
1934            
1935           bool
1936 0         Perl_is_uni_idfirst_lc(pTHX_ UV c)
1937           {
1938 0 0       if (c < 256) {
1939 0 0       return isIDFIRST_LC(c);
    0        
    0        
1940           }
1941 0         return _is_uni_perl_idstart(c);
1942           }
1943            
1944           bool
1945 0         Perl_is_uni_alpha_lc(pTHX_ UV c)
1946           {
1947 0 0       if (c < 256) {
1948 0 0       return isALPHA_LC(c);
    0        
1949           }
1950 0         return _is_uni_FOO(_CC_ALPHA, c);
1951           }
1952            
1953           bool
1954 0         Perl_is_uni_ascii_lc(pTHX_ UV c)
1955           {
1956 0 0       if (c < 256) {
1957 0 0       return isASCII_LC(c);
    0        
1958           }
1959           return 0;
1960           }
1961            
1962           bool
1963 0         Perl_is_uni_blank_lc(pTHX_ UV c)
1964           {
1965 0 0       if (c < 256) {
1966 0 0       return isBLANK_LC(c);
    0        
1967           }
1968 0 0       return isBLANK_uni(c);
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
1969           }
1970            
1971           bool
1972 0         Perl_is_uni_space_lc(pTHX_ UV c)
1973           {
1974 0 0       if (c < 256) {
1975 0 0       return isSPACE_LC(c);
    0        
1976           }
1977 0 0       return isSPACE_uni(c);
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
1978           }
1979            
1980           bool
1981 0         Perl_is_uni_digit_lc(pTHX_ UV c)
1982           {
1983 0 0       if (c < 256) {
1984 0 0       return isDIGIT_LC(c);
    0        
1985           }
1986 0         return _is_uni_FOO(_CC_DIGIT, c);
1987           }
1988            
1989           bool
1990 0         Perl_is_uni_upper_lc(pTHX_ UV c)
1991           {
1992 0 0       if (c < 256) {
1993 0 0       return isUPPER_LC(c);
    0        
1994           }
1995 0         return _is_uni_FOO(_CC_UPPER, c);
1996           }
1997            
1998           bool
1999 0         Perl_is_uni_lower_lc(pTHX_ UV c)
2000           {
2001 0 0       if (c < 256) {
2002 0 0       return isLOWER_LC(c);
    0        
2003           }
2004 0         return _is_uni_FOO(_CC_LOWER, c);
2005           }
2006            
2007           bool
2008 0         Perl_is_uni_cntrl_lc(pTHX_ UV c)
2009           {
2010 0 0       if (c < 256) {
2011 0 0       return isCNTRL_LC(c);
    0        
2012           }
2013           return 0;
2014           }
2015            
2016           bool
2017 0         Perl_is_uni_graph_lc(pTHX_ UV c)
2018           {
2019 0 0       if (c < 256) {
2020 0 0       return isGRAPH_LC(c);
    0        
2021           }
2022 0         return _is_uni_FOO(_CC_GRAPH, c);
2023           }
2024            
2025           bool
2026 0         Perl_is_uni_print_lc(pTHX_ UV c)
2027           {
2028 0 0       if (c < 256) {
2029 0 0       return isPRINT_LC(c);
    0        
2030           }
2031 0         return _is_uni_FOO(_CC_PRINT, c);
2032           }
2033            
2034           bool
2035 0         Perl_is_uni_punct_lc(pTHX_ UV c)
2036           {
2037 0 0       if (c < 256) {
2038 0 0       return isPUNCT_LC(c);
    0        
2039           }
2040 0         return _is_uni_FOO(_CC_PUNCT, c);
2041           }
2042            
2043           bool
2044 0         Perl_is_uni_xdigit_lc(pTHX_ UV c)
2045           {
2046 0 0       if (c < 256) {
2047 0 0       return isXDIGIT_LC(c);
    0        
2048           }
2049 0 0       return isXDIGIT_uni(c);
    0        
    0        
    0        
    0        
    0        
2050           }
2051            
2052           U32
2053 0         Perl_to_uni_upper_lc(pTHX_ U32 c)
2054           {
2055           /* XXX returns only the first character -- do not use XXX */
2056           /* XXX no locale support yet */
2057           STRLEN len;
2058           U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2059 0         return (U32)to_uni_upper(c, tmpbuf, &len);
2060           }
2061            
2062           U32
2063 0         Perl_to_uni_title_lc(pTHX_ U32 c)
2064           {
2065           /* XXX returns only the first character XXX -- do not use XXX */
2066           /* XXX no locale support yet */
2067           STRLEN len;
2068           U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2069 0         return (U32)to_uni_title(c, tmpbuf, &len);
2070           }
2071            
2072           U32
2073 0         Perl_to_uni_lower_lc(pTHX_ U32 c)
2074           {
2075           /* XXX returns only the first character -- do not use XXX */
2076           /* XXX no locale support yet */
2077           STRLEN len;
2078           U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2079 0         return (U32)to_uni_lower(c, tmpbuf, &len);
2080           }
2081            
2082           PERL_STATIC_INLINE bool
2083 410108         S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
2084           const char *const swashname)
2085           {
2086           /* returns a boolean giving whether or not the UTF8-encoded character that
2087           * starts at

is in the swash indicated by .

2088           * contains a pointer to where the swash indicated by
2089           * is to be stored; which this routine will do, so that future calls will
2090           * look at <*swash> and only generate a swash if it is not null
2091           *
2092           * Note that it is assumed that the buffer length of

is enough to

2093           * contain all the bytes that comprise the character. Thus, <*p> should
2094           * have been checked before this call for mal-formedness enough to assure
2095           * that. */
2096            
2097           dVAR;
2098            
2099           PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2100            
2101           /* The API should have included a length for the UTF-8 character in

,

2102           * but it doesn't. We therefore assume that p has been validated at least
2103           * as far as there being enough bytes available in it to accommodate the
2104           * character without reading beyond the end, and pass that number on to the
2105           * validating routine */
2106 410108 100       if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) {
2107 4 50       if (ckWARN_d(WARN_UTF8)) {
2108 4         Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
2109           "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
2110 4 50       if (ckWARN(WARN_UTF8)) { /* This will output details as to the
2111           what the malformation is */
2112 4 50       utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
2113           }
2114           }
2115           return FALSE;
2116           }
2117 410104 100       if (!*swash) {
2118 638         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2119 638         *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags);
2120           }
2121            
2122 410106         return swash_fetch(*swash, p, TRUE) != 0;
2123           }
2124            
2125           bool
2126 1530         Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2127           {
2128           dVAR;
2129            
2130           PERL_ARGS_ASSERT__IS_UTF8_FOO;
2131            
2132           assert(classnum < _FIRST_NON_SWASH_CC);
2133            
2134 1530         return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]);
2135           }
2136            
2137           bool
2138 0         Perl_is_utf8_alnum(pTHX_ const U8 *p)
2139           {
2140           dVAR;
2141            
2142           PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
2143            
2144           /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
2145           * descendant of isalnum(3), in other words, it doesn't
2146           * contain the '_'. --jhi */
2147 0         return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_WORDCHAR], "IsWord");
2148           }
2149            
2150           bool
2151 0         Perl_is_utf8_alnumc(pTHX_ const U8 *p)
2152           {
2153           dVAR;
2154            
2155           PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
2156            
2157 0         return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHANUMERIC], "IsAlnum");
2158           }
2159            
2160           bool
2161 0         Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
2162           {
2163           dVAR;
2164            
2165           PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
2166            
2167 0         return S_is_utf8_idfirst(aTHX_ p);
2168           }
2169            
2170           bool
2171 0         Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
2172           {
2173           dVAR;
2174            
2175           PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
2176            
2177 0 0       if (*p == '_')
2178           return TRUE;
2179           /* is_utf8_idstart would be more logical. */
2180 0         return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
2181           }
2182            
2183           bool
2184 401724         Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
2185           {
2186           dVAR;
2187            
2188           PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
2189            
2190 401724         return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
2191           }
2192            
2193           bool
2194 6840         Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2195           {
2196           dVAR;
2197            
2198           PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2199            
2200 6840         return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont");
2201           }
2202            
2203            
2204           bool
2205 0         Perl_is_utf8_idcont(pTHX_ const U8 *p)
2206           {
2207           dVAR;
2208            
2209           PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
2210            
2211 0         return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
2212           }
2213            
2214           bool
2215 0         Perl_is_utf8_xidcont(pTHX_ const U8 *p)
2216           {
2217           dVAR;
2218            
2219           PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
2220            
2221 0         return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
2222           }
2223            
2224           bool
2225 0         Perl_is_utf8_alpha(pTHX_ const U8 *p)
2226           {
2227           dVAR;
2228            
2229           PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
2230            
2231 0         return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHA], "IsAlpha");
2232           }
2233            
2234           bool
2235 0         Perl_is_utf8_ascii(pTHX_ const U8 *p)
2236           {
2237           dVAR;
2238            
2239           PERL_ARGS_ASSERT_IS_UTF8_ASCII;
2240            
2241           /* ASCII characters are the same whether in utf8 or not. So the macro
2242           * works on both utf8 and non-utf8 representations. */
2243 0         return isASCII(*p);
2244           }
2245            
2246           bool
2247 0         Perl_is_utf8_blank(pTHX_ const U8 *p)
2248           {
2249           dVAR;
2250            
2251           PERL_ARGS_ASSERT_IS_UTF8_BLANK;
2252            
2253 0 0       return isBLANK_utf8(p);
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
2254           }
2255            
2256           bool
2257 0         Perl_is_utf8_space(pTHX_ const U8 *p)
2258           {
2259           dVAR;
2260            
2261           PERL_ARGS_ASSERT_IS_UTF8_SPACE;
2262            
2263 0 0       return isSPACE_utf8(p);
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
2264           }
2265            
2266           bool
2267 0         Perl_is_utf8_perl_space(pTHX_ const U8 *p)
2268           {
2269           dVAR;
2270            
2271           PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
2272            
2273           /* Only true if is an ASCII space-like character, and ASCII is invariant
2274           * under utf8, so can just use the macro */
2275 0         return isSPACE_A(*p);
2276           }
2277            
2278           bool
2279 0         Perl_is_utf8_perl_word(pTHX_ const U8 *p)
2280           {
2281           dVAR;
2282            
2283           PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
2284            
2285           /* Only true if is an ASCII word character, and ASCII is invariant
2286           * under utf8, so can just use the macro */
2287 0         return isWORDCHAR_A(*p);
2288           }
2289            
2290           bool
2291 0         Perl_is_utf8_digit(pTHX_ const U8 *p)
2292           {
2293           dVAR;
2294            
2295           PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
2296            
2297 0         return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_DIGIT], "IsDigit");
2298           }
2299            
2300           bool
2301 0         Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
2302           {
2303           dVAR;
2304            
2305           PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
2306            
2307           /* Only true if is an ASCII digit character, and ASCII is invariant
2308           * under utf8, so can just use the macro */
2309 0         return isDIGIT_A(*p);
2310           }
2311            
2312           bool
2313 0         Perl_is_utf8_upper(pTHX_ const U8 *p)
2314           {
2315           dVAR;
2316            
2317           PERL_ARGS_ASSERT_IS_UTF8_UPPER;
2318            
2319 0         return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_UPPER], "IsUppercase");
2320           }
2321            
2322           bool
2323 0         Perl_is_utf8_lower(pTHX_ const U8 *p)
2324           {
2325           dVAR;
2326            
2327           PERL_ARGS_ASSERT_IS_UTF8_LOWER;
2328            
2329 0         return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_LOWER], "IsLowercase");
2330           }
2331            
2332           bool
2333 0         Perl_is_utf8_cntrl(pTHX_ const U8 *p)
2334           {
2335           dVAR;
2336            
2337           PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
2338            
2339 0 0       return isCNTRL_utf8(p);
    0        
    0        
    0        
2340           }
2341            
2342           bool
2343 0         Perl_is_utf8_graph(pTHX_ const U8 *p)
2344           {
2345           dVAR;
2346            
2347           PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
2348            
2349 0         return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_GRAPH], "IsGraph");
2350           }
2351            
2352           bool
2353 0         Perl_is_utf8_print(pTHX_ const U8 *p)
2354           {
2355           dVAR;
2356            
2357           PERL_ARGS_ASSERT_IS_UTF8_PRINT;
2358            
2359 0         return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PRINT], "IsPrint");
2360           }
2361            
2362           bool
2363 0         Perl_is_utf8_punct(pTHX_ const U8 *p)
2364           {
2365           dVAR;
2366            
2367           PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
2368            
2369 0         return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PUNCT], "IsPunct");
2370           }
2371            
2372           bool
2373 0         Perl_is_utf8_xdigit(pTHX_ const U8 *p)
2374           {
2375           dVAR;
2376            
2377           PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
2378            
2379 0 0       return is_XDIGIT_utf8(p);
    0        
    0        
    0        
    0        
    0        
    0        
2380           }
2381            
2382           bool
2383 14         Perl__is_utf8_mark(pTHX_ const U8 *p)
2384           {
2385           dVAR;
2386            
2387           PERL_ARGS_ASSERT__IS_UTF8_MARK;
2388            
2389 14         return is_utf8_common(p, &PL_utf8_mark, "IsM");
2390           }
2391            
2392            
2393           bool
2394 0         Perl_is_utf8_mark(pTHX_ const U8 *p)
2395           {
2396           dVAR;
2397            
2398           PERL_ARGS_ASSERT_IS_UTF8_MARK;
2399            
2400 0         return _is_utf8_mark(p);
2401           }
2402            
2403           /*
2404           =for apidoc to_utf8_case
2405            
2406           The C

contains the pointer to the UTF-8 string encoding

2407           the character that is being converted. This routine assumes that the character
2408           at C

is well-formed.

2409            
2410           The C is a pointer to the character buffer to put the
2411           conversion result to. The C is a pointer to the length
2412           of the result.
2413            
2414           The C is a pointer to the swash to use.
2415            
2416           Both the special and normal mappings are stored in F,
2417           and loaded by SWASHNEW, using F. The C (usually,
2418           but not always, a multicharacter mapping), is tried first.
2419            
2420           The C is a string like "utf8::ToSpecLower", which means the
2421           hash %utf8::ToSpecLower. The access to the hash is through
2422           Perl_to_utf8_case().
2423            
2424           The C is a string like "ToLower" which means the swash
2425           %utf8::ToLower.
2426            
2427           =cut */
2428            
2429           UV
2430 2406114         Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2431           SV **swashp, const char *normal, const char *special)
2432           {
2433           dVAR;
2434 2406114         STRLEN len = 0;
2435 2406114         const UV uv1 = valid_utf8_to_uvchr(p, NULL);
2436            
2437           PERL_ARGS_ASSERT_TO_UTF8_CASE;
2438            
2439           /* Note that swash_fetch() doesn't output warnings for these because it
2440           * assumes we will */
2441 2406114 100       if (uv1 >= UNICODE_SURROGATE_FIRST) {
2442 586434 100       if (uv1 <= UNICODE_SURROGATE_LAST) {
2443 50 100       if (ckWARN_d(WARN_SURROGATE)) {
2444 36 50       const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
    50        
    0        
2445 36         Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2446           "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2447           }
2448           }
2449 586384 100       else if (UNICODE_IS_SUPER(uv1)) {
2450 54 100       if (ckWARN_d(WARN_NON_UNICODE)) {
2451 28 50       const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
    50        
    0        
2452 28         Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2453           "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2454           }
2455           }
2456            
2457           /* Note that non-characters are perfectly legal, so no warning should
2458           * be given */
2459           }
2460            
2461 2406082 100       if (!*swashp) /* load on-demand */
2462 100         *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
2463            
2464 2406082 100       if (special) {
2465           /* It might be "special" (sometimes, but not always,
2466           * a multicharacter mapping) */
2467 2078270         HV * const hv = get_hv(special, 0);
2468           SV **svp;
2469            
2470 4156540 50       if (hv &&
    100        
2471 3288656 50       (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) &&
    100        
    100        
    100        
    50        
    50        
    50        
    50        
2472 1210386         (*svp)) {
2473           const char *s;
2474            
2475 1210386 50       s = SvPV_const(*svp, len);
2476 1210386 50       if (len == 1)
2477           /* EIGHTBIT */
2478 0         len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2479           else {
2480 1210386         Copy(s, ustrp, len, U8);
2481           }
2482           }
2483           }
2484            
2485 2406082 100       if (!len && *swashp) {
    50        
2486 1195696         const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */);
2487            
2488 1195696 100       if (uv2) {
2489           /* It was "normal" (a single character mapping). */
2490 779284         len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2491           }
2492           }
2493            
2494 2406082 100       if (len) {
2495 1989670 50       if (lenp) {
2496 1989670         *lenp = len;
2497           }
2498 1989670         return valid_utf8_to_uvchr(ustrp, 0);
2499           }
2500            
2501           /* Here, there was no mapping defined, which means that the code point maps
2502           * to itself. Return the inputs */
2503 416412         len = UTF8SKIP(p);
2504 416412 100       if (p != ustrp) { /* Don't copy onto itself */
2505 313988         Copy(p, ustrp, len, U8);
2506           }
2507            
2508 416412 100       if (lenp)
2509 1411238         *lenp = len;
2510            
2511           return uv1;
2512            
2513           }
2514            
2515           STATIC UV
2516 235496         S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
2517           {
2518           /* This is called when changing the case of a utf8-encoded character above
2519           * the Latin1 range, and the operation is in locale. If the result
2520           * contains a character that crosses the 255/256 boundary, disallow the
2521           * change, and return the original code point. See L for why;
2522           *
2523           * p points to the original string whose case was changed; assumed
2524           * by this routine to be well-formed
2525           * result the code point of the first character in the changed-case string
2526           * ustrp points to the changed-case string ( represents its first char)
2527           * lenp points to the length of */
2528            
2529           UV original; /* To store the first code point of

*/

2530            
2531           PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2532            
2533           assert(UTF8_IS_ABOVE_LATIN1(*p));
2534            
2535           /* We know immediately if the first character in the string crosses the
2536           * boundary, so can skip */
2537 235496 100       if (result > 255) {
2538            
2539           /* Look at every character in the result; if any cross the
2540           * boundary, the whole thing is disallowed */
2541 71244         U8* s = ustrp + UTF8SKIP(ustrp);
2542 71244         U8* e = ustrp + *lenp;
2543 106866 100       while (s < e) {
2544 5992 50       if (! UTF8_IS_ABOVE_LATIN1(*s)) {
2545           goto bad_crossing;
2546           }
2547 0         s += UTF8SKIP(s);
2548           }
2549            
2550           /* Here, no characters crossed, result is ok as-is */
2551           return result;
2552           }
2553            
2554           bad_crossing:
2555            
2556           /* Failed, have to return the original */
2557 170244         original = valid_utf8_to_uvchr(p, lenp);
2558 170244         Copy(p, ustrp, *lenp, char);
2559 202870         return original;
2560           }
2561            
2562           /*
2563           =for apidoc to_utf8_upper
2564            
2565           Instead use L.
2566            
2567           =cut */
2568            
2569           /* Not currently externally documented, and subject to change:
2570           * is set iff locale semantics are to be used for code points < 256
2571           * if non-null, *tainted_ptr will be set TRUE iff locale rules
2572           * were used in the calculation; otherwise unchanged. */
2573            
2574           UV
2575 183772         Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2576           {
2577           dVAR;
2578            
2579           UV result;
2580            
2581           PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
2582            
2583 183772 100       if (UTF8_IS_INVARIANT(*p)) {
2584 29730 50       if (flags) {
2585 0         result = toUPPER_LC(*p);
2586           }
2587           else {
2588 29730         return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
2589           }
2590           }
2591 154042 100       else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2592 27982 100       if (flags) {
2593 2 50       result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
2594           }
2595           else {
2596 27980         return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2597           ustrp, lenp, 'S');
2598           }
2599           }
2600           else { /* utf8, ord above 255 */
2601 126060         result = CALL_UPPER_CASE(p, ustrp, lenp);
2602            
2603 126040 50       if (flags) {
2604 0         result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2605           }
2606 126040         return result;
2607           }
2608            
2609           /* Here, used locale rules. Convert back to utf8 */
2610 2 50       if (UTF8_IS_INVARIANT(result)) {
2611 0         *ustrp = (U8) result;
2612 0         *lenp = 1;
2613           }
2614           else {
2615 2         *ustrp = UTF8_EIGHT_BIT_HI(result);
2616 2         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2617 2         *lenp = 2;
2618           }
2619            
2620 2 50       if (tainted_ptr) {
2621 91877         *tainted_ptr = TRUE;
2622           }
2623           return result;
2624           }
2625            
2626           /*
2627           =for apidoc to_utf8_title
2628            
2629           Instead use L.
2630            
2631           =cut */
2632            
2633           /* Not currently externally documented, and subject to change:
2634           * is set iff locale semantics are to be used for code points < 256
2635           * Since titlecase is not defined in POSIX, uppercase is used instead
2636           * for these/
2637           * if non-null, *tainted_ptr will be set TRUE iff locale rules
2638           * were used in the calculation; otherwise unchanged. */
2639            
2640           UV
2641 10196         Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2642           {
2643           dVAR;
2644            
2645           UV result;
2646            
2647           PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
2648            
2649 10196 100       if (UTF8_IS_INVARIANT(*p)) {
2650 982 50       if (flags) {
2651 0         result = toUPPER_LC(*p);
2652           }
2653           else {
2654 982         return _to_upper_title_latin1(*p, ustrp, lenp, 's');
2655           }
2656           }
2657 9214 100       else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2658 786 100       if (flags) {
2659 2 50       result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
2660           }
2661           else {
2662 784         return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2663           ustrp, lenp, 's');
2664           }
2665           }
2666           else { /* utf8, ord above 255 */
2667 8428         result = CALL_TITLE_CASE(p, ustrp, lenp);
2668            
2669 8422 50       if (flags) {
2670 0         result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2671           }
2672 8422         return result;
2673           }
2674            
2675           /* Here, used locale rules. Convert back to utf8 */
2676 2 50       if (UTF8_IS_INVARIANT(result)) {
2677 0         *ustrp = (U8) result;
2678 0         *lenp = 1;
2679           }
2680           else {
2681 2         *ustrp = UTF8_EIGHT_BIT_HI(result);
2682 2         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2683 2         *lenp = 2;
2684           }
2685            
2686 2 50       if (tainted_ptr) {
2687 5096         *tainted_ptr = TRUE;
2688           }
2689           return result;
2690           }
2691            
2692           /*
2693           =for apidoc to_utf8_lower
2694            
2695           Instead use L.
2696            
2697           =cut */
2698            
2699           /* Not currently externally documented, and subject to change:
2700           * is set iff locale semantics are to be used for code points < 256
2701           * if non-null, *tainted_ptr will be set TRUE iff locale rules
2702           * were used in the calculation; otherwise unchanged. */
2703            
2704           UV
2705 192456         Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2706           {
2707           UV result;
2708            
2709           dVAR;
2710            
2711           PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
2712            
2713 192456 100       if (UTF8_IS_INVARIANT(*p)) {
2714 31466 100       if (flags) {
2715 256         result = toLOWER_LC(*p);
2716           }
2717           else {
2718 46815         return to_lower_latin1(*p, ustrp, lenp);
2719           }
2720           }
2721 160990 100       else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2722 28714 100       if (flags) {
2723 260 50       result = toLOWER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
2724           }
2725           else {
2726 42681         return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2727           ustrp, lenp);
2728           }
2729           }
2730           else { /* utf8, ord above 255 */
2731 132276         result = CALL_LOWER_CASE(p, ustrp, lenp);
2732            
2733 132270 50       if (flags) {
2734 0         result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2735           }
2736            
2737 132270         return result;
2738           }
2739            
2740           /* Here, used locale rules. Convert back to utf8 */
2741 516 100       if (UTF8_IS_INVARIANT(result)) {
2742 256         *ustrp = (U8) result;
2743 256         *lenp = 1;
2744           }
2745           else {
2746 260         *ustrp = UTF8_EIGHT_BIT_HI(result);
2747 260         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2748 260         *lenp = 2;
2749           }
2750            
2751 516 50       if (tainted_ptr) {
2752 96483         *tainted_ptr = TRUE;
2753           }
2754           return result;
2755           }
2756            
2757           /*
2758           =for apidoc to_utf8_fold
2759            
2760           Instead use L.
2761            
2762           =cut */
2763            
2764           /* Not currently externally documented, and subject to change,
2765           * in
2766           * bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
2767           * points < 256. Since foldcase is not defined in
2768           * POSIX, lowercase is used instead
2769           * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2770           * otherwise simple folds
2771           * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2772           * prohibited
2773           * if non-null, *tainted_ptr will be set TRUE iff locale rules
2774           * were used in the calculation; otherwise unchanged. */
2775            
2776           UV
2777 1282382         Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
2778           {
2779           dVAR;
2780            
2781           UV result;
2782            
2783           PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
2784            
2785           /* These are mutually exclusive */
2786           assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2787            
2788           assert(p != ustrp); /* Otherwise overwrites */
2789            
2790 1282382 100       if (UTF8_IS_INVARIANT(*p)) {
2791 68476 100       if (flags & FOLD_FLAGS_LOCALE) {
2792 256         result = toFOLD_LC(*p);
2793           }
2794           else {
2795 68220         return _to_fold_latin1(*p, ustrp, lenp,
2796           flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2797           }
2798           }
2799 1213906 100       else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2800 55790 100       if (flags & FOLD_FLAGS_LOCALE) {
2801 256 50       result = toFOLD_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
2802           }
2803           else {
2804 55534         return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2805           ustrp, lenp,
2806           flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2807           }
2808           }
2809           else { /* utf8, ord above 255 */
2810 1158116 100       result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
2811            
2812 1158116 100       if (flags & FOLD_FLAGS_LOCALE) {
2813            
2814           /* Special case these characters, as what normally gets returned
2815           * under locale doesn't work */
2816 327246 100       if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
2817 259094 100       && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
2818           sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
2819           {
2820           goto return_long_s;
2821           }
2822 277472 100       else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1
2823 209320 100       && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8,
2824           sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1))
2825           {
2826           goto return_ligature_st;
2827           }
2828 235496         return check_locale_boundary_crossing(p, result, ustrp, lenp);
2829           }
2830 830870 100       else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2831           return result;
2832           }
2833           else {
2834           /* This is called when changing the case of a utf8-encoded
2835           * character above the ASCII range, and the result should not
2836           * contain an ASCII character. */
2837            
2838           UV original; /* To store the first code point of

*/

2839            
2840           /* Look at every character in the result; if any cross the
2841           * boundary, the whole thing is disallowed */
2842           U8* s = ustrp;
2843 312882         U8* e = ustrp + *lenp;
2844 543507 100       while (s < e) {
2845 318878 100       if (isASCII(*s)) {
2846           /* Crossed, have to return the original */
2847 244694         original = valid_utf8_to_uvchr(p, lenp);
2848            
2849           /* But in these instances, there is an alternative we can
2850           * return that is valid */
2851 367041 100       if (original == LATIN_CAPITAL_LETTER_SHARP_S
2852 244694         || original == LATIN_SMALL_LETTER_SHARP_S)
2853           {
2854           goto return_long_s;
2855           }
2856 196330 100       else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
2857           goto return_ligature_st;
2858           }
2859 155300         Copy(p, ustrp, *lenp, char);
2860 155300         return original;
2861           }
2862 74184         s += UTF8SKIP(s);
2863           }
2864            
2865           /* Here, no characters crossed, result is ok as-is */
2866           return result;
2867           }
2868           }
2869            
2870           /* Here, used locale rules. Convert back to utf8 */
2871 512 100       if (UTF8_IS_INVARIANT(result)) {
2872 256         *ustrp = (U8) result;
2873 256         *lenp = 1;
2874           }
2875           else {
2876 256         *ustrp = UTF8_EIGHT_BIT_HI(result);
2877 256         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2878 256         *lenp = 2;
2879           }
2880            
2881 512 50       if (tainted_ptr) {
2882 512         *tainted_ptr = TRUE;
2883           }
2884           return result;
2885            
2886           return_long_s:
2887           /* Certain folds to 'ss' are prohibited by the options, but they do allow
2888           * folds to a string of two of these characters. By returning this
2889           * instead, then, e.g.,
2890           * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2891           * works. */
2892            
2893 98138         *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2894 98138         Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2895           ustrp, *lenp, U8);
2896 98138         return LATIN_SMALL_LETTER_LONG_S;
2897            
2898           return_ligature_st:
2899           /* Two folds to 'st' are prohibited by the options; instead we pick one and
2900           * have the other one fold to it */
2901            
2902 83006         *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
2903 83006         Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
2904 682694         return LATIN_SMALL_LIGATURE_ST;
2905           }
2906            
2907           /* Note:
2908           * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
2909           * C is a pointer to a package name for SWASHNEW, should be "utf8".
2910           * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2911           */
2912            
2913           SV*
2914 182         Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
2915           {
2916           PERL_ARGS_ASSERT_SWASH_INIT;
2917            
2918           /* Returns a copy of a swash initiated by the called function. This is the
2919           * public interface, and returning a copy prevents others from doing
2920           * mischief on the original */
2921            
2922 182         return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
2923           }
2924            
2925           SV*
2926 637998         Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
2927           {
2928           /* Initialize and return a swash, creating it if necessary. It does this
2929           * by calling utf8_heavy.pl in the general case. The returned value may be
2930           * the swash's inversion list instead if the input parameters allow it.
2931           * Which is returned should be immaterial to callers, as the only
2932           * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
2933           * and swash_to_invlist() handle both these transparently.
2934           *
2935           * This interface should only be used by functions that won't destroy or
2936           * adversely change the swash, as doing so affects all other uses of the
2937           * swash in the program; the general public should use 'Perl_swash_init'
2938           * instead.
2939           *
2940           * pkg is the name of the package that should be in.
2941           * name is the name of the swash to find. Typically it is a Unicode
2942           * property name, including user-defined ones
2943           * listsv is a string to initialize the swash with. It must be of the form
2944           * documented as the subroutine return value in
2945           * L
2946           * minbits is the number of bits required to represent each data element.
2947           * It is '1' for binary properties.
2948           * none I (khw) do not understand this one, but it is used only in tr///.
2949           * invlist is an inversion list to initialize the swash with (or NULL)
2950           * flags_p if non-NULL is the address of various input and output flag bits
2951           * to the routine, as follows: ('I' means is input to the routine;
2952           * 'O' means output from the routine. Only flags marked O are
2953           * meaningful on return.)
2954           * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
2955           * came from a user-defined property. (I O)
2956           * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
2957           * when the swash cannot be located, to simply return NULL. (I)
2958           * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
2959           * return of an inversion list instead of a swash hash if this routine
2960           * thinks that would result in faster execution of swash_fetch() later
2961           * on. (I)
2962           *
2963           * Thus there are three possible inputs to find the swash: ,
2964           * , and . At least one must be specified. The result
2965           * will be the union of the specified ones, although 's various
2966           * actions can intersect, etc. what gives.
2967           *
2968           * is only valid for binary properties */
2969            
2970           dVAR;
2971           SV* retval = &PL_sv_undef;
2972           HV* swash_hv = NULL;
2973           const int invlist_swash_boundary =
2974 637844 100       (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
2975           ? 512 /* Based on some benchmarking, but not extensive, see commit
2976           message */
2977 637998 100       : -1; /* Never return just an inversion list */
2978            
2979           assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2980           assert(! invlist || minbits == 1);
2981            
2982           /* If data was passed in to go out to utf8_heavy to find the swash of, do
2983           * so */
2984 871652 100       if (listsv != &PL_sv_undef || strNE(name, "")) {
    100        
    50        
2985 233654         dSP;
2986 233654         const size_t pkg_len = strlen(pkg);
2987 233654         const size_t name_len = strlen(name);
2988 233654         HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2989           SV* errsv_save;
2990           GV *method;
2991            
2992           PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2993            
2994 233654 100       PUSHSTACKi(PERLSI_MAGIC);
2995 233654         ENTER;
2996 233654         SAVEHINTS();
2997 233654         save_re_context();
2998           /* We might get here via a subroutine signature which uses a utf8
2999           * parameter name, at which point PL_subname will have been set
3000           * but not yet used. */
3001 233654         save_item(PL_subname);
3002 233654 100       if (PL_parser && PL_parser->error_count)
    100        
3003 2         SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3004 233654         method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
3005 233654 100       if (!method) { /* demand load utf8 */
3006 1104         ENTER;
3007 1104 50       if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3008 1104         GvSV(PL_errgv) = NULL;
3009           /* It is assumed that callers of this routine are not passing in
3010           * any user derived data. */
3011           /* Need to do this after save_re_context() as it will set
3012           * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3013           * in Perl_magic_get). Even line to create errsv_save can turn on
3014           * PL_tainted. */
3015           #ifndef NO_TAINT_SUPPORT
3016 1104         SAVEBOOL(TAINT_get);
3017 1104         TAINT_NOT;
3018           #endif
3019 1104         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3020           NULL);
3021           {
3022           /* Not ERRSV, as there is no need to vivify a scalar we are
3023           about to discard. */
3024 1104         SV * const errsv = GvSV(PL_errgv);
3025 1104 50       if (!SvTRUE(errsv)) {
    50        
    0        
    50        
    0        
    0        
    50        
    50        
    50        
    50        
    0        
    50        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
3026 2208         GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3027 1104         SvREFCNT_dec(errsv);
3028           }
3029           }
3030 1104         LEAVE;
3031           }
3032 233654         SPAGAIN;
3033 233654 50       PUSHMARK(SP);
3034 116827         EXTEND(SP,5);
3035 233654         mPUSHp(pkg, pkg_len);
3036 233654         mPUSHp(name, name_len);
3037 233654         PUSHs(listsv);
3038 233654         mPUSHi(minbits);
3039 233654         mPUSHi(none);
3040 233654         PUTBACK;
3041 233654 50       if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3042 233654         GvSV(PL_errgv) = NULL;
3043           /* If we already have a pointer to the method, no need to use
3044           * call_method() to repeat the lookup. */
3045 467308 100       if (method
    50        
3046 232550         ? call_sv(MUTABLE_SV(method), G_SCALAR)
3047 1104         : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3048           {
3049 233654         retval = *PL_stack_sp--;
3050           SvREFCNT_inc(retval);
3051           }
3052           {
3053           /* Not ERRSV. See above. */
3054 233654         SV * const errsv = GvSV(PL_errgv);
3055 233654 100       if (!SvTRUE(errsv)) {
    50        
    0        
    100        
    50        
    50        
    50        
    50        
    50        
    50        
    0        
    50        
    0        
    0        
    0        
    0        
    0        
    0        
    0        
3056 467308         GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3057 233654         SvREFCNT_dec(errsv);
3058           }
3059           }
3060 233654         LEAVE;
3061 233654 50       POPSTACK;
3062 233654 100       if (IN_PERL_COMPILETIME) {
3063 181458         CopHINTS_set(PL_curcop, PL_hints);
3064           }
3065 233654 100       if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
    50        
3066 50258 50       if (SvPOK(retval))
3067            
3068           /* If caller wants to handle missing properties, let them */
3069 50258 50       if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
    100        
3070           return NULL;
3071           }
3072 25126         Perl_croak(aTHX_
3073           "Can't find Unicode property definition \"%"SVf"\"",
3074           SVfARG(retval));
3075           Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
3076           }
3077           } /* End of calling the module to find the swash */
3078            
3079           /* If this operation fetched a swash, and we will need it later, get it */
3080 587740 100       if (retval != &PL_sv_undef
3081 183396 100       && (minbits == 1 || (flags_p
    50        
3082 0 0       && ! (*flags_p
3083 0         & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3084           {
3085 183172         swash_hv = MUTABLE_HV(SvRV(retval));
3086            
3087           /* If we don't already know that there is a user-defined component to
3088           * this swash, and the user has indicated they wish to know if there is
3089           * one (by passing ), find out */
3090 183172 100       if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
    50        
3091 183088         SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3092 183088 50       if (user_defined && SvUV(*user_defined)) {
    50        
    100        
3093 130         *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3094           }
3095           }
3096           }
3097            
3098           /* Make sure there is an inversion list for binary properties */
3099 587740 100       if (minbits == 1) {
3100           SV** swash_invlistsvp = NULL;
3101 587516         SV* swash_invlist = NULL;
3102           bool invlist_in_swash_is_valid = FALSE;
3103           bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3104           an unclaimed reference count */
3105            
3106           /* If this operation fetched a swash, get its already existing
3107           * inversion list, or create one for it */
3108            
3109 587516 100       if (swash_hv) {
3110 183172         swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
3111 183172 100       if (swash_invlistsvp) {
3112 177752         swash_invlist = *swash_invlistsvp;
3113           invlist_in_swash_is_valid = TRUE;
3114           }
3115           else {
3116 5420         swash_invlist = _swash_to_invlist(retval);
3117           swash_invlist_unclaimed = TRUE;
3118           }
3119           }
3120            
3121           /* If an inversion list was passed in, have to include it */
3122 587516 100       if (invlist) {
3123            
3124           /* Any fetched swash will by now have an inversion list in it;
3125           * otherwise will be NULL, indicating that we
3126           * didn't fetch a swash */
3127 405002 100       if (swash_invlist) {
3128            
3129           /* Add the passed-in inversion list, which invalidates the one
3130           * already stored in the swash */
3131           invlist_in_swash_is_valid = FALSE;
3132 658         _invlist_union(invlist, swash_invlist, &swash_invlist);
3133           }
3134           else {
3135            
3136           /* Here, there is no swash already. Set up a minimal one, if
3137           * we are going to return a swash */
3138 404344 100       if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
3139 6550         swash_hv = newHV();
3140 6550         retval = newRV_noinc(MUTABLE_SV(swash_hv));
3141           }
3142 404344         swash_invlist = invlist;
3143           }
3144           }
3145            
3146           /* Here, we have computed the union of all the passed-in data. It may
3147           * be that there was an inversion list in the swash which didn't get
3148           * touched; otherwise save the one computed one */
3149 587516 100       if (! invlist_in_swash_is_valid
3150 614646 100       && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3151           {
3152 11642 50       if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
3153           {
3154 0         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3155           }
3156           /* We just stole a reference count. */
3157 11642 100       if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3158 6550         else SvREFCNT_inc_simple_void_NN(swash_invlist);
3159           }
3160            
3161           /* Use the inversion list stand-alone if small enough */
3162 881274 100       if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3163 398126         SvREFCNT_dec(retval);
3164 398126 100       if (!swash_invlist_unclaimed)
3165 397798         SvREFCNT_inc_simple_void_NN(swash_invlist);
3166 398126         retval = newRV_noinc(swash_invlist);
3167           }
3168           }
3169            
3170 600306         return retval;
3171           }
3172            
3173            
3174           /* This API is wrong for special case conversions since we may need to
3175           * return several Unicode characters for a single Unicode character
3176           * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3177           * the lower-level routine, and it is similarly broken for returning
3178           * multiple values. --jhi
3179           * For those, you should use to_utf8_case() instead */
3180           /* Now SWASHGET is recasted into S_swatch_get in this file. */
3181            
3182           /* Note:
3183           * Returns the value of property/mapping C for the first character
3184           * of the string C. If C is true, the string C is
3185           * assumed to be in well-formed utf8. If C is false, the string C
3186           * is assumed to be in native 8-bit encoding. Caches the swatch in C.
3187           *
3188           * A "swash" is a hash which contains initially the keys/values set up by
3189           * SWASHNEW. The purpose is to be able to completely represent a Unicode
3190           * property for all possible code points. Things are stored in a compact form
3191           * (see utf8_heavy.pl) so that calculation is required to find the actual
3192           * property value for a given code point. As code points are looked up, new
3193           * key/value pairs are added to the hash, so that the calculation doesn't have
3194           * to ever be re-done. Further, each calculation is done, not just for the
3195           * desired one, but for a whole block of code points adjacent to that one.
3196           * For binary properties on ASCII machines, the block is usually for 64 code
3197           * points, starting with a code point evenly divisible by 64. Thus if the
3198           * property value for code point 257 is requested, the code goes out and
3199           * calculates the property values for all 64 code points between 256 and 319,
3200           * and stores these as a single 64-bit long bit vector, called a "swatch",
3201           * under the key for code point 256. The key is the UTF-8 encoding for code
3202           * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3203           * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3204           * for code point 258 is then requested, this code realizes that it would be
3205           * stored under the key for 256, and would find that value and extract the
3206           * relevant bit, offset from 256.
3207           *
3208           * Non-binary properties are stored in as many bits as necessary to represent
3209           * their values (32 currently, though the code is more general than that), not
3210           * as single bits, but the principal is the same: the value for each key is a
3211           * vector that encompasses the property values for all code points whose UTF-8
3212           * representations are represented by the key. That is, for all code points
3213           * whose UTF-8 representations are length N bytes, and the key is the first N-1
3214           * bytes of that.
3215           */
3216           UV
3217 9535344         Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
3218           {
3219           dVAR;
3220 9535344         HV *const hv = MUTABLE_HV(SvRV(swash));
3221           U32 klen;
3222           U32 off;
3223           STRLEN slen;
3224           STRLEN needents;
3225           const U8 *tmps = NULL;
3226           U32 bit;
3227           SV *swatch;
3228 9535344         const U8 c = *ptr;
3229            
3230           PERL_ARGS_ASSERT_SWASH_FETCH;
3231            
3232           /* If it really isn't a hash, it isn't really swash; must be an inversion
3233           * list */
3234 9535344 100       if (SvTYPE(hv) != SVt_PVHV) {
3235 2874804 50       return _invlist_contains_cp((SV*)hv,
3236           (do_utf8)
3237           ? valid_utf8_to_uvchr(ptr, NULL)
3238           : c);
3239           }
3240            
3241           /* We store the values in a "swatch" which is a vec() value in a swash
3242           * hash. Code points 0-255 are a single vec() stored with key length
3243           * (klen) 0. All other code points have a UTF-8 representation
3244           * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3245           * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3246           * length for them is the length of the encoded char - 1. ptr[klen] is the
3247           * final byte in the sequence representing the character */
3248 8097942 50       if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
    100        
3249           klen = 0;
3250           needents = 256;
3251 14106         off = c;
3252           }
3253 8083836 100       else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3254           klen = 0;
3255           needents = 256;
3256 3928         off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
3257           }
3258           else {
3259 8079908         klen = UTF8SKIP(ptr) - 1;
3260            
3261           /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3262           * the vec is the final byte in the sequence. (In EBCDIC this is
3263           * converted to I8 to get consecutive values.) To help you visualize
3264           * all this:
3265           * Straight 1047 After final byte
3266           * UTF-8 UTF-EBCDIC I8 transform
3267           * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3268           * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3269           * ...
3270           * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3271           * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3272           * ...
3273           * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3274           * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3275           * ...
3276           * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3277           * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3278           * ...
3279           * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3280           * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3281           *
3282           * (There are no discontinuities in the elided (...) entries.)
3283           * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3284           * key for the next 31, up through U+043F, whose UTF-8 final byte is
3285           * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3286           * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3287           * index into the vec() swatch (after subtracting 0x80, which we
3288           * actually do with an '&').
3289           * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3290           * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3291           * dicontinuities which go away by transforming it into I8, and we
3292           * effectively subtract 0xA0 to get the index. */
3293           needents = (1 << UTF_ACCUMULATION_SHIFT);
3294 8079908         off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
3295           }
3296            
3297           /*
3298           * This single-entry cache saves about 1/3 of the utf8 overhead in test
3299           * suite. (That is, only 7-8% overall over just a hash cache. Still,
3300           * it's nothing to sniff at.) Pity we usually come through at least
3301           * two function calls to get here...
3302           *
3303           * NB: this code assumes that swatches are never modified, once generated!
3304           */
3305            
3306 12128315 100       if (hv == PL_last_swash_hv &&
    100        
3307 12021991 100       klen == PL_last_swash_klen &&
3308 7982572 100       (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
3309           {
3310 7832052         tmps = PL_last_swash_tmps;
3311 7832052         slen = PL_last_swash_slen;
3312           }
3313           else {
3314           /* Try our second-level swatch cache, kept in a hash. */
3315 265890         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
3316            
3317           /* If not cached, generate it via swatch_get */
3318 265890 100       if (!svp || !SvPOK(*svp)
    50        
3319 244590 50       || !(tmps = (const U8*)SvPV_const(*svp, slen)))
    50        
3320           {
3321 21300 100       if (klen) {
3322 20738         const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3323 20738         swatch = swatch_get(swash,
3324           code_point & ~((UV)needents - 1),
3325           needents);
3326           }
3327           else { /* For the first 256 code points, the swatch has a key of
3328           length 0 */
3329 562         swatch = swatch_get(swash, 0, needents);
3330           }
3331            
3332 21300 100       if (IN_PERL_COMPILETIME)
3333 4918         CopHINTS_set(PL_curcop, PL_hints);
3334            
3335 21300         svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
3336            
3337 21300 50       if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
    50        
    50        
3338 21300 50       || (slen << 3) < needents)
3339 0         Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3340           "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3341           svp, tmps, (UV)slen, (UV)needents);
3342           }
3343            
3344 265890         PL_last_swash_hv = hv;
3345           assert(klen <= sizeof(PL_last_swash_key));
3346 265890         PL_last_swash_klen = (U8)klen;
3347           /* FIXME change interpvar.h? */
3348 265890         PL_last_swash_tmps = (U8 *) tmps;
3349 265890         PL_last_swash_slen = slen;
3350 265890 100       if (klen)
3351 256902         Copy(ptr, PL_last_swash_key, klen, U8);
3352           }
3353            
3354 8097942         switch ((int)((slen << 3) / needents)) {
3355           case 1:
3356 6899056         bit = 1 << (off & 7);
3357 6899056         off >>= 3;
3358 6899056         return (tmps[off] & bit) != 0;
3359           case 8:
3360 70         return tmps[off];
3361           case 16:
3362 2986         off <<= 1;
3363 2986         return (tmps[off] << 8) + tmps[off + 1] ;
3364           case 32:
3365 1195830         off <<= 2;
3366 1195830         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
3367           }
3368 4767672         Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3369           "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
3370           NORETURN_FUNCTION_END;
3371           }
3372            
3373           /* Read a single line of the main body of the swash input text. These are of
3374           * the form:
3375           * 0053 0056 0073
3376           * where each number is hex. The first two numbers form the minimum and
3377           * maximum of a range, and the third is the value associated with the range.
3378           * Not all swashes should have a third number
3379           *
3380           * On input: l points to the beginning of the line to be examined; it points
3381           * to somewhere in the string of the whole input text, and is
3382           * terminated by a \n or the null string terminator.
3383           * lend points to the null terminator of that string
3384           * wants_value is non-zero if the swash expects a third number
3385           * typestr is the name of the swash's mapping, like 'ToLower'
3386           * On output: *min, *max, and *val are set to the values read from the line.
3387           * returns a pointer just beyond the line examined. If there was no
3388           * valid min number on the line, returns lend+1
3389           */
3390            
3391           STATIC U8*
3392 2303340         S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3393           const bool wants_value, const U8* const typestr)
3394           {
3395 2303340 100       const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
    100        
3396           STRLEN numlen; /* Length of the number */
3397 2303340         I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3398           | PERL_SCAN_DISALLOW_PREFIX
3399           | PERL_SCAN_SILENT_NON_PORTABLE;
3400            
3401           /* nl points to the next \n in the scan */
3402 2303340         U8* const nl = (U8*)memchr(l, '\n', lend - l);
3403            
3404           /* Get the first number on the line: the range minimum */
3405 2303340         numlen = lend - l;
3406 2303340         *min = grok_hex((char *)l, &numlen, &flags, NULL);
3407 2303340 50       if (numlen) /* If found a hex number, position past it */
3408 2303340         l += numlen;
3409 0 0       else if (nl) { /* Else, go handle next line, if any */
3410 0         return nl + 1; /* 1 is length of "\n" */
3411           }
3412           else { /* Else, no next line */
3413 0         return lend + 1; /* to LIST's end at which \n is not found */
3414           }
3415            
3416           /* The max range value follows, separated by a BLANK */
3417 2303340 100       if (isBLANK(*l)) {
3418 2303198         ++l;
3419 2303198         flags = PERL_SCAN_SILENT_ILLDIGIT
3420           | PERL_SCAN_DISALLOW_PREFIX
3421           | PERL_SCAN_SILENT_NON_PORTABLE;
3422 2303198         numlen = lend - l;
3423 2303198         *max = grok_hex((char *)l, &numlen, &flags, NULL);
3424 2303198 100       if (numlen)
3425 1402214         l += numlen;
3426           else /* If no value here, it is a single element range */
3427 900984         *max = *min;
3428            
3429           /* Non-binary tables have a third entry: what the first element of the
3430           * range maps to */
3431 2303198 100       if (wants_value) {
3432 541538 50       if (isBLANK(*l)) {
3433 541538         ++l;
3434            
3435           /* The ToLc, etc table mappings are not in hex, and must be
3436           * corrected by adding the code point to them */
3437 541538 100       if (typeto) {
3438 541128         char *after_strtol = (char *) lend;
3439 541128         *val = Strtol((char *)l, &after_strtol, 10);
3440           l = (U8 *) after_strtol;
3441           }
3442           else { /* Other tables are in hex, and are the correct result
3443           without tweaking */
3444 410         flags = PERL_SCAN_SILENT_ILLDIGIT
3445           | PERL_SCAN_DISALLOW_PREFIX
3446           | PERL_SCAN_SILENT_NON_PORTABLE;
3447 410         numlen = lend - l;
3448 410         *val = grok_hex((char *)l, &numlen, &flags, NULL);
3449 410 50       if (numlen)
3450           l += numlen;
3451           else
3452 0         *val = 0;
3453           }
3454           }
3455           else {
3456 0         *val = 0;
3457 0 0       if (typeto) {
3458           /* diag_listed_as: To%s: illegal mapping '%s' */
3459 0         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3460           typestr, l);
3461           }
3462           }
3463           }
3464           else
3465 1761660         *val = 0; /* bits == 1, then any val should be ignored */
3466           }
3467           else { /* Nothing following range min, should be single element with no
3468           mapping expected */
3469 142         *max = *min;
3470 142 50       if (wants_value) {
3471 0         *val = 0;
3472 0 0       if (typeto) {
3473           /* diag_listed_as: To%s: illegal mapping '%s' */
3474 0         Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3475           }
3476           }
3477           else
3478 142         *val = 0; /* bits == 1, then val should be ignored */
3479           }
3480            
3481           /* Position to next line if any, or EOF */
3482 2303340 100       if (nl)
3483 2303268         l = nl + 1;
3484           else
3485           l = lend;
3486            
3487 2303340         return l;
3488           }
3489            
3490           /* Note:
3491           * Returns a swatch (a bit vector string) for a code point sequence
3492           * that starts from the value C and comprises the number C.
3493           * A C must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3494           * Should be used via swash_fetch, which will cache the swatch in C.
3495           */
3496           STATIC SV*
3497 21300         S_swatch_get(pTHX_ SV* swash, UV start, UV span)
3498           {
3499           SV *swatch;
3500           U8 *l, *lend, *x, *xend, *s, *send;
3501           STRLEN lcur, xcur, scur;
3502 21300         HV *const hv = MUTABLE_HV(SvRV(swash));
3503 21300         SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
3504            
3505           SV** listsvp = NULL; /* The string containing the main body of the table */
3506           SV** extssvp = NULL;
3507           SV** invert_it_svp = NULL;
3508           U8* typestr = NULL;
3509           STRLEN bits;
3510           STRLEN octets; /* if bits == 1, then octets == 0 */
3511           UV none;
3512 21300         UV end = start + span;
3513            
3514 21300 100       if (invlistsvp == NULL) {
3515 1070         SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3516 1070         SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3517 1070         SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3518 1070         extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3519 1070         listsvp = hv_fetchs(hv, "LIST", FALSE);
3520 1070         invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3521            
3522 1070 50       bits = SvUV(*bitssvp);
3523 1070 50       none = SvUV(*nonesvp);
3524 1070 50       typestr = (U8*)SvPV_nolen(*typesvp);
3525           }
3526           else {
3527           bits = 1;
3528           none = 0;
3529           }
3530 21300         octets = bits >> 3; /* if bits == 1, then octets == 0 */
3531            
3532           PERL_ARGS_ASSERT_SWATCH_GET;
3533            
3534 21300 100       if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
    50        
3535 0         Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
3536           (UV)bits);
3537           }
3538            
3539           /* If overflowed, use the max possible */
3540 21300 100       if (end < start) {
3541           end = UV_MAX;
3542 10         span = end - start;
3543           }
3544            
3545           /* create and initialize $swatch */
3546 21300 100       scur = octets ? (span * octets) : (span + 7) / 8;
3547 21300         swatch = newSV(scur);
3548 21300         SvPOK_on(swatch);
3549 21300         s = (U8*)SvPVX(swatch);
3550 21300 100       if (octets && none) {
3551 230         const U8* const e = s + scur;
3552 28889 100       while (s < e) {
3553 28544 100       if (bits == 8)
3554 4352         *s++ = (U8)(none & 0xff);
3555 24192 100       else if (bits == 16) {
3556 16640         *s++ = (U8)((none >> 8) & 0xff);
3557 16640         *s++ = (U8)( none & 0xff);
3558           }
3559 7552 50       else if (bits == 32) {
3560 7552         *s++ = (U8)((none >> 24) & 0xff);
3561 7552         *s++ = (U8)((none >> 16) & 0xff);
3562 7552         *s++ = (U8)((none >> 8) & 0xff);
3563 18048         *s++ = (U8)( none & 0xff);
3564           }
3565           }
3566 230         *s = '\0';
3567           }
3568           else {
3569 21070         (void)memzero((U8*)s, scur + 1);
3570           }
3571 21300         SvCUR_set(swatch, scur);
3572 21300         s = (U8*)SvPVX(swatch);
3573            
3574 21300 100       if (invlistsvp) { /* If has an inversion list set up use that */
3575 20230         _invlist_populate_swatch(*invlistsvp, start, end, s);
3576 20230         return swatch;
3577           }
3578            
3579           /* read $swash->{LIST} */
3580 1070 50       l = (U8*)SvPV(*listsvp, lcur);
3581 1070         lend = l + lcur;
3582 528023 100       while (l < lend) {
3583           UV min, max, val, upper;
3584 526418         l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3585           cBOOL(octets), typestr);
3586 526418 50       if (l > lend) {
3587           break;
3588           }
3589            
3590           /* If looking for something beyond this range, go try the next one */
3591 526418 100       if (max < start)
3592 279486         continue;
3593            
3594           /* is generally 1 beyond where we want to set things, but at the
3595           * platform's infinity, where we can't go any higher, we want to
3596           * include the code point at */
3597           upper = (max < end)
3598           ? max
3599 252919 100       : (max != UV_MAX || end != UV_MAX)
3600           ? end - 1
3601 234958 50       : end;
3602            
3603 246932 50       if (octets) {
3604           UV key;
3605 246932 100       if (min < start) {
3606 116 100       if (!none || val < none) {
    100        
3607 98         val += start - min;
3608           }
3609 116         min = start;
3610           }
3611 269928 100       for (key = min; key <= upper; key++) {
3612           STRLEN offset;
3613           /* offset must be non-negative (start <= min <= key < end) */
3614 22996         offset = octets * (key - start);
3615 22996 100       if (bits == 8)
3616 26         s[offset] = (U8)(val & 0xff);
3617 22970 100       else if (bits == 16) {
3618 3092         s[offset ] = (U8)((val >> 8) & 0xff);
3619 3092         s[offset + 1] = (U8)( val & 0xff);
3620           }
3621 19878 50       else if (bits == 32) {
3622 19878         s[offset ] = (U8)((val >> 24) & 0xff);
3623 19878         s[offset + 1] = (U8)((val >> 16) & 0xff);
3624 19878         s[offset + 2] = (U8)((val >> 8) & 0xff);
3625 19878         s[offset + 3] = (U8)( val & 0xff);
3626           }
3627            
3628 22996 100       if (!none || val < none)
    100        
3629 20316         ++val;
3630           }
3631           }
3632           else { /* bits == 1, then val should be ignored */
3633           UV key;
3634 0 0       if (min < start)
3635 0         min = start;
3636            
3637 263209 0       for (key = min; key <= upper; key++) {
3638 0         const STRLEN offset = (STRLEN)(key - start);
3639 0         s[offset >> 3] |= 1 << (offset & 7);
3640           }
3641           }
3642           } /* while */
3643            
3644           /* Invert if the data says it should be. Assumes that bits == 1 */
3645 1070 100       if (invert_it_svp && SvUV(*invert_it_svp)) {
    50        
    50        
3646            
3647           /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3648           * be 0, and their inversion should also be 0, as we don't succeed any
3649           * Unicode property matches for non-Unicode code points */
3650 0 0       if (start <= PERL_UNICODE_MAX) {
3651            
3652           /* The code below assumes that we never cross the
3653           * Unicode/above-Unicode boundary in a range, as otherwise we would
3654           * have to figure out where to stop flipping the bits. Since this
3655           * boundary is divisible by a large power of 2, and swatches comes
3656           * in small powers of 2, this should be a valid assumption */
3657           assert(start + span - 1 <= PERL_UNICODE_MAX);
3658            
3659 0         send = s + scur;
3660 0 0       while (s < send) {
3661 0         *s = ~(*s);
3662 0         s++;
3663           }
3664           }
3665           }
3666            
3667           /* read $swash->{EXTRAS}
3668           * This code also copied to swash_to_invlist() below */
3669 1070 50       x = (U8*)SvPV(*extssvp, xcur);
3670 1070         xend = x + xcur;
3671 11950 100       while (x < xend) {
3672           STRLEN namelen;
3673           U8 *namestr;
3674           SV** othersvp;
3675           HV* otherhv;
3676           STRLEN otherbits;
3677           SV **otherbitssvp, *other;
3678           U8 *s, *o, *nl;
3679           STRLEN slen, olen;
3680            
3681 230         const U8 opc = *x++;
3682 230 50       if (opc == '\n')
3683 0         continue;
3684            
3685 230         nl = (U8*)memchr(x, '\n', xend - x);
3686            
3687 230 50       if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
    50        
3688 230 50       if (nl) {
3689 230         x = nl + 1; /* 1 is length of "\n" */
3690 230         continue;
3691           }
3692           else {
3693           x = xend; /* to EXTRAS' end at which \n is not found */
3694           break;
3695           }
3696           }
3697            
3698           namestr = x;
3699 0 0       if (nl) {
3700 0         namelen = nl - namestr;
3701 0         x = nl + 1;
3702           }
3703           else {
3704 0         namelen = xend - namestr;
3705           x = xend;
3706           }
3707            
3708 0         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3709 0         otherhv = MUTABLE_HV(SvRV(*othersvp));
3710 0         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3711 0 0       otherbits = (STRLEN)SvUV(*otherbitssvp);
3712 0 0       if (bits < otherbits)
3713 0         Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3714           "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
3715            
3716           /* The "other" swatch must be destroyed after. */
3717 0         other = swatch_get(*othersvp, start, span);
3718 0 0       o = (U8*)SvPV(other, olen);
3719            
3720 0 0       if (!olen)
3721 0         Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
3722            
3723 0 0       s = (U8*)SvPV(swatch, slen);
3724 0 0       if (bits == 1 && otherbits == 1) {
3725 0 0       if (slen != olen)
3726 0         Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3727           "mismatch, slen=%"UVuf", olen=%"UVuf,
3728           (UV)slen, (UV)olen);
3729            
3730 0         switch (opc) {
3731           case '+':
3732 0 0       while (slen--)
3733 0         *s++ |= *o++;
3734           break;
3735           case '!':
3736 0 0       while (slen--)
3737 0         *s++ |= ~*o++;
3738           break;
3739           case '-':
3740 0 0       while (slen--)
3741 0         *s++ &= ~*o++;
3742           break;
3743           case '&':
3744 0 0       while (slen--)
3745 0         *s++ &= *o++;
3746           break;
3747           default:
3748           break;
3749           }
3750           }
3751           else {
3752 0         STRLEN otheroctets = otherbits >> 3;
3753           STRLEN offset = 0;
3754 0         U8* const send = s + slen;
3755            
3756 0 0       while (s < send) {
3757           UV otherval = 0;
3758            
3759 0 0       if (otherbits == 1) {
3760 0         otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3761 0         ++offset;
3762           }
3763           else {
3764           STRLEN vlen = otheroctets;
3765 0         otherval = *o++;
3766 0 0       while (--vlen) {
3767 0         otherval <<= 8;
3768 0         otherval |= *o++;
3769           }
3770           }
3771            
3772 0 0       if (opc == '+' && otherval)
3773           NOOP; /* replace with otherval */
3774 0 0       else if (opc == '!' && !otherval)
3775           otherval = 1;
3776 0 0       else if (opc == '-' && otherval)
3777           otherval = 0;
3778 0 0       else if (opc == '&' && !otherval)
3779           otherval = 0;
3780           else {
3781 0         s += octets; /* no replacement */
3782 0         continue;
3783           }
3784            
3785 0 0       if (bits == 8)
3786 0         *s++ = (U8)( otherval & 0xff);
3787 0 0       else if (bits == 16) {
3788 0         *s++ = (U8)((otherval >> 8) & 0xff);
3789 0         *s++ = (U8)( otherval & 0xff);
3790           }
3791 0 0       else if (bits == 32) {
3792 0         *s++ = (U8)((otherval >> 24) & 0xff);
3793 0         *s++ = (U8)((otherval >> 16) & 0xff);
3794 0         *s++ = (U8)((otherval >> 8) & 0xff);
3795 0         *s++ = (U8)( otherval & 0xff);
3796           }
3797           }
3798           }
3799 115         sv_free(other); /* through with it! */
3800           } /* while */
3801           return swatch;
3802           }
3803            
3804           HV*
3805 24         Perl__swash_inversion_hash(pTHX_ SV* const swash)
3806           {
3807            
3808           /* Subject to change or removal. For use only in regcomp.c and regexec.c
3809           * Can't be used on a property that is subject to user override, as it
3810           * relies on the value of SPECIALS in the swash which would be set by
3811           * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
3812           * for overridden properties
3813           *
3814           * Returns a hash which is the inversion and closure of a swash mapping.
3815           * For example, consider the input lines:
3816           * 004B 006B
3817           * 004C 006C
3818           * 212A 006B
3819           *
3820           * The returned hash would have two keys, the utf8 for 006B and the utf8 for
3821           * 006C. The value for each key is an array. For 006C, the array would
3822           * have two elements, the utf8 for itself, and for 004C. For 006B, there
3823           * would be three elements in its array, the utf8 for 006B, 004B and 212A.
3824           *
3825           * Essentially, for any code point, it gives all the code points that map to
3826           * it, or the list of 'froms' for that point.
3827           *
3828           * Currently it ignores any additions or deletions from other swashes,
3829           * looking at just the main body of the swash, and if there are SPECIALS
3830           * in the swash, at that hash
3831           *
3832           * The specials hash can be extra code points, and most likely consists of
3833           * maps from single code points to multiple ones (each expressed as a string
3834           * of utf8 characters). This function currently returns only 1-1 mappings.
3835           * However consider this possible input in the specials hash:
3836           * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
3837           * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
3838           *
3839           * Both FB05 and FB06 map to the same multi-char sequence, which we don't
3840           * currently handle. But it also means that FB05 and FB06 are equivalent in
3841           * a 1-1 mapping which we should handle, and this relationship may not be in
3842           * the main table. Therefore this function examines all the multi-char
3843           * sequences and adds the 1-1 mappings that come out of that. */
3844            
3845           U8 *l, *lend;
3846           STRLEN lcur;
3847 24         HV *const hv = MUTABLE_HV(SvRV(swash));
3848            
3849           /* The string containing the main body of the table. This will have its
3850           * assertion fail if the swash has been converted to its inversion list */
3851 24         SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3852            
3853 24         SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3854 24         SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3855 24         SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3856           /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
3857 24 50       const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3858 24 50       const STRLEN bits = SvUV(*bitssvp);
3859 24         const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3860 24 50       const UV none = SvUV(*nonesvp);
3861 24         SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
3862            
3863 24         HV* ret = newHV();
3864            
3865           PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
3866            
3867           /* Must have at least 8 bits to get the mappings */
3868 24 50       if (bits != 8 && bits != 16 && bits != 32) {
    50        
3869 0         Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
3870           (UV)bits);
3871           }
3872            
3873 24 50       if (specials_p) { /* It might be "special" (sometimes, but not always, a
3874           mapping to more than one character */
3875            
3876           /* Construct an inverse mapping hash for the specials */
3877 24         HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
3878 24         HV * specials_inverse = newHV();
3879           char *char_from; /* the lhs of the map */
3880           I32 from_len; /* its byte length */
3881           char *char_to; /* the rhs of the map */
3882           I32 to_len; /* its byte length */
3883           SV *sv_to; /* and in a sv */
3884           AV* from_list; /* list of things that map to each 'to' */
3885            
3886 24         hv_iterinit(specials_hv);
3887            
3888           /* The keys are the characters (in utf8) that map to the corresponding
3889           * utf8 string value. Iterate through the list creating the inverse
3890           * list. */
3891 2532 100       while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
3892           SV** listp;
3893 2496 50       if (! SvPOK(sv_to)) {
3894 0         Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
3895           "unexpectedly is not a string, flags=%lu",
3896 0         (unsigned long)SvFLAGS(sv_to));
3897           }
3898           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
3899            
3900           /* Each key in the inverse list is a mapped-to value, and the key's
3901           * hash value is a list of the strings (each in utf8) that map to
3902           * it. Those strings are all one character long */
3903 2496 100       if ((listp = hv_fetch(specials_inverse,
3904           SvPVX(sv_to),
3905           SvCUR(sv_to), 0)))
3906           {
3907 744         from_list = (AV*) *listp;
3908           }
3909           else { /* No entry yet for it: create one */
3910 1752         from_list = newAV();
3911 1752 50       if (! hv_store(specials_inverse,
3912           SvPVX(sv_to),
3913           SvCUR(sv_to),
3914           (SV*) from_list, 0))
3915           {
3916 0         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3917           }
3918           }
3919            
3920           /* Here have the list associated with this 'to' (perhaps newly
3921           * created and empty). Just add to it. Note that we ASSUME that
3922           * the input is guaranteed to not have duplications, so we don't
3923           * check for that. Duplications just slow down execution time. */
3924 2496         av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
3925           }
3926            
3927           /* Here, 'specials_inverse' contains the inverse mapping. Go through
3928           * it looking for cases like the FB05/FB06 examples above. There would
3929           * be an entry in the hash like
3930           * 'st' => [ FB05, FB06 ]
3931           * In this example we will create two lists that get stored in the
3932           * returned hash, 'ret':
3933           * FB05 => [ FB05, FB06 ]
3934           * FB06 => [ FB05, FB06 ]
3935           *
3936           * Note that there is nothing to do if the array only has one element.
3937           * (In the normal 1-1 case handled below, we don't have to worry about
3938           * two lists, as everything gets tied to the single list that is
3939           * generated for the single character 'to'. But here, we are omitting
3940           * that list, ('st' in the example), so must have multiple lists.) */
3941 1776 100       while ((from_list = (AV *) hv_iternextsv(specials_inverse,
3942           &char_to, &to_len)))
3943           {
3944 1752 100       if (av_len(from_list) > 0) {
3945           SSize_t i;
3946            
3947           /* We iterate over all combinations of i,j to place each code
3948           * point on each list */
3949 1872 100       for (i = 0; i <= av_len(from_list); i++) {
3950           SSize_t j;
3951 1488         AV* i_list = newAV();
3952 1488         SV** entryp = av_fetch(from_list, i, FALSE);
3953 1488 50       if (entryp == NULL) {
3954 0         Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3955           }
3956 1488 50       if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
3957 0         Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
3958           }
3959 1488 50       if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
3960           (SV*) i_list, FALSE))
3961           {
3962 0         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3963           }
3964            
3965           /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
3966 3720 100       for (j = 0; j <= av_len(from_list); j++) {
3967 2976         entryp = av_fetch(from_list, j, FALSE);
3968 2976 50       if (entryp == NULL) {
3969 0         Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3970           }
3971            
3972           /* When i==j this adds itself to the list */
3973 2976 50       av_push(i_list, newSVuv(utf8_to_uvchr_buf(
3974           (U8*) SvPVX(*entryp),
3975           (U8*) SvPVX(*entryp) + SvCUR(*entryp),
3976           0)));
3977           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
3978           }
3979           }
3980           }
3981           }
3982 24         SvREFCNT_dec(specials_inverse); /* done with it */
3983           } /* End of specials */
3984            
3985           /* read $swash->{LIST} */
3986 24 50       l = (U8*)SvPV(*listsvp, lcur);
3987 24         lend = l + lcur;
3988            
3989           /* Go through each input line */
3990 15156 100       while (l < lend) {
3991           UV min, max, val;
3992           UV inverse;
3993 15120         l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3994           cBOOL(octets), typestr);
3995 15120 50       if (l > lend) {
3996           break;
3997           }
3998            
3999           /* Each element in the range is to be inverted */
4000 40440 100       for (inverse = min; inverse <= max; inverse++) {
4001           AV* list;
4002           SV** listp;
4003           IV i;
4004           bool found_key = FALSE;
4005           bool found_inverse = FALSE;
4006            
4007           /* The key is the inverse mapping */
4008           char key[UTF8_MAXBYTES+1];
4009 25320         char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
4010 25320         STRLEN key_len = key_end - key;
4011            
4012           /* Get the list for the map */
4013 25320 100       if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
4014 1176         list = (AV*) *listp;
4015           }
4016           else { /* No entry yet for it: create one */
4017 24144         list = newAV();
4018 24144 50       if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
4019 0         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
4020           }
4021           }
4022            
4023           /* Look through list to see if this inverse mapping already is
4024           * listed, or if there is a mapping to itself already */
4025 27048 100       for (i = 0; i <= av_len(list); i++) {
4026 2400         SV** entryp = av_fetch(list, i, FALSE);
4027           SV* entry;
4028 2400 50       if (entryp == NULL) {
4029 0         Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
4030           }
4031 2400         entry = *entryp;
4032           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/
4033 2400 50       if (SvUV(entry) == val) {
    100        
4034           found_key = TRUE;
4035           }
4036 2400 50       if (SvUV(entry) == inverse) {
    100        
4037           found_inverse = TRUE;
4038           }
4039            
4040           /* No need to continue searching if found everything we are
4041           * looking for */
4042 2400 100       if (found_key && found_inverse) {
    100        
4043           break;
4044           }
4045           }
4046            
4047           /* Make sure there is a mapping to itself on the list */
4048 25320 100       if (! found_key) {
4049 24144         av_push(list, newSVuv(val));
4050           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));*/
4051           }
4052            
4053            
4054           /* Simply add the value to the list */
4055 25320 100       if (! found_inverse) {
4056 24648         av_push(list, newSVuv(inverse));
4057           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));*/
4058           }
4059            
4060           /* swatch_get() increments the value of val for each element in the
4061           * range. That makes more compact tables possible. You can
4062           * express the capitalization, for example, of all consecutive
4063           * letters with a single line: 0061\t007A\t0041 This maps 0061 to
4064           * 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
4065           * and it's not documented; it appears to be used only in
4066           * implementing tr//; I copied the semantics from swatch_get(), just
4067           * in case */
4068 25320 50       if (!none || val < none) {
    0        
4069 25320         ++val;
4070           }
4071           }
4072           }
4073            
4074 24         return ret;
4075           }
4076            
4077           SV*
4078 6680         Perl__swash_to_invlist(pTHX_ SV* const swash)
4079           {
4080            
4081           /* Subject to change or removal. For use only in one place in regcomp.c.
4082           * Ownership is given to one reference count in the returned SV* */
4083            
4084           U8 *l, *lend;
4085           char *loc;
4086           STRLEN lcur;
4087 6680         HV *const hv = MUTABLE_HV(SvRV(swash));
4088           UV elements = 0; /* Number of elements in the inversion list */
4089 6680         U8 empty[] = "";
4090           SV** listsvp;
4091           SV** typesvp;
4092           SV** bitssvp;
4093           SV** extssvp;
4094           SV** invert_it_svp;
4095            
4096           U8* typestr;
4097           STRLEN bits;
4098           STRLEN octets; /* if bits == 1, then octets == 0 */
4099           U8 *x, *xend;
4100           STRLEN xcur;
4101            
4102           SV* invlist;
4103            
4104           PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
4105            
4106           /* If not a hash, it must be the swash's inversion list instead */
4107 6680 100       if (SvTYPE(hv) != SVt_PVHV) {
4108 28         return SvREFCNT_inc_simple_NN((SV*) hv);
4109           }
4110            
4111           /* The string containing the main body of the table */
4112 6652         listsvp = hv_fetchs(hv, "LIST", FALSE);
4113 6652         typesvp = hv_fetchs(hv, "TYPE", FALSE);
4114 6652         bitssvp = hv_fetchs(hv, "BITS", FALSE);
4115 6652         extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
4116 6652         invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
4117            
4118 6652 50       typestr = (U8*)SvPV_nolen(*typesvp);
4119 6652 50       bits = SvUV(*bitssvp);
4120 6652         octets = bits >> 3; /* if bits == 1, then octets == 0 */
4121            
4122           /* read $swash->{LIST} */
4123 6652 50       if (SvPOK(*listsvp)) {
4124 6652 50       l = (U8*)SvPV(*listsvp, lcur);
4125           }
4126           else {
4127           /* LIST legitimately doesn't contain a string during compilation phases
4128           * of Perl itself, before the Unicode tables are generated. In this
4129           * case, just fake things up by creating an empty list */
4130           l = empty;
4131 0         lcur = 0;
4132           }
4133           loc = (char *) l;
4134 6652         lend = l + lcur;
4135            
4136           /* Scan the input to count the number of lines to preallocate array size
4137           * based on worst possible case, which is each line in the input creates 2
4138           * elements in the inversion list: 1) the beginning of a range in the list;
4139           * 2) the beginning of a range not in the list. */
4140 1771708 100       while ((loc = (strchr(loc, '\n'))) != NULL) {
4141 1761730         elements += 2;
4142 1761730         loc++;
4143           }
4144            
4145           /* If the ending is somehow corrupt and isn't a new line, add another
4146           * element for the final range that isn't in the inversion list */
4147 9978 50       if (! (*lend == '\n'
    50        
4148 9978 100       || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
    100        
4149           {
4150 72         elements++;
4151           }
4152            
4153 6652         invlist = _new_invlist(elements);
4154            
4155           /* Now go through the input again, adding each range to the list */
4156 1771780 100       while (l < lend) {
4157           UV start, end;
4158           UV val; /* Not used by this function */
4159            
4160 1761802         l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
4161           cBOOL(octets), typestr);
4162            
4163 1761802 50       if (l > lend) {
4164           break;
4165           }
4166            
4167 1761802         invlist = _add_range_to_invlist(invlist, start, end);
4168           }
4169            
4170           /* Invert if the data says it should be */
4171 6652 100       if (invert_it_svp && SvUV(*invert_it_svp)) {
    50        
    100        
4172 94         _invlist_invert_prop(invlist);
4173           }
4174            
4175           /* This code is copied from swatch_get()
4176           * read $swash->{EXTRAS} */
4177 6652 50       x = (U8*)SvPV(*extssvp, xcur);
4178 6652         xend = x + xcur;
4179 11942 100       while (x < xend) {
4180           STRLEN namelen;
4181           U8 *namestr;
4182           SV** othersvp;
4183           HV* otherhv;
4184           STRLEN otherbits;
4185           SV **otherbitssvp, *other;
4186           U8 *nl;
4187            
4188 1964         const U8 opc = *x++;
4189 1964 100       if (opc == '\n')
4190 2         continue;
4191            
4192 1962         nl = (U8*)memchr(x, '\n', xend - x);
4193            
4194 1962 100       if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
    100        
4195 844 50       if (nl) {
4196 844         x = nl + 1; /* 1 is length of "\n" */
4197 844         continue;
4198           }
4199           else {
4200