utf8.c | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 1065 | 1354 | 78.7 |
branch | 1030 | 1790 | 57.5 |
condition | n/a | ||
subroutine | n/a | ||
total | 2095 | 3144 | 66.6 |
line | stmt | bran | cond | sub | time | code |
---|---|---|---|---|---|---|
1 | /* utf8.c | |||||
2 | * | |||||
3 | * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 | |||||
4 | * by Larry Wall and others | |||||
5 | * | |||||
6 | * You may distribute under the terms of either the GNU General Public | |||||
7 | * License or the Artistic License, as specified in the README file. | |||||
8 | * | |||||
9 | */ | |||||
10 | ||||||
11 | /* | |||||
12 | * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever | |||||
13 | * heard of that we don't want to see any closer; and that's the one place | |||||
14 | * we're trying to get to! And that's just where we can't get, nohow.' | |||||
15 | * | |||||
16 | * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"] | |||||
17 | * | |||||
18 | * 'Well do I understand your speech,' he answered in the same language; | |||||
19 | * 'yet few strangers do so. Why then do you not speak in the Common Tongue, | |||||
20 | * as is the custom in the West, if you wish to be answered?' | |||||
21 | * --Gandalf, addressing Théoden's door wardens | |||||
22 | * | |||||
23 | * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] | |||||
24 | * | |||||
25 | * ...the travellers perceived that the floor was paved with stones of many | |||||
26 | * hues; branching runes and strange devices intertwined beneath their feet. | |||||
27 | * | |||||
28 | * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] | |||||
29 | */ | |||||
30 | ||||||
31 | #include "EXTERN.h" | |||||
32 | #define PERL_IN_UTF8_C | |||||
33 | #include "perl.h" | |||||
34 | #include "inline_invlist.c" | |||||
35 | ||||||
36 | static const char unees[] = | |||||
37 | "Malformed UTF-8 character (unexpected end of string)"; | |||||
38 | ||||||
39 | /* | |||||
40 | =head1 Unicode Support | |||||
41 | ||||||
42 | This file contains various utility functions for manipulating UTF8-encoded | |||||
43 | strings. For the uninitiated, this is a method of representing arbitrary | |||||
44 | Unicode characters as a variable number of bytes, in such a way that | |||||
45 | characters in the ASCII range are unmodified, and a zero byte never appears | |||||
46 | within non-zero characters. | |||||
47 | ||||||
48 | =cut | |||||
49 | */ | |||||
50 | ||||||
51 | /* | |||||
52 | =for apidoc is_ascii_string | |||||
53 | ||||||
54 | Returns true if the first C |
|||||
55 | or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That | |||||
56 | is, if they are invariant. On ASCII-ish machines, only ASCII characters | |||||
57 | fit this definition, hence the function's name. | |||||
58 | ||||||
59 | If C |
|||||
60 | ||||||
61 | See also L(), L(), and L(). | |||||
62 | ||||||
63 | =cut | |||||
64 | */ | |||||
65 | ||||||
66 | bool | |||||
67 | 533506 | Perl_is_ascii_string(const U8 *s, STRLEN len) | ||||
68 | { | |||||
69 | 533506 | 50 | const U8* const send = s + (len ? len : strlen((const char *)s)); | |||
70 | const U8* x = s; | |||||
71 | ||||||
72 | PERL_ARGS_ASSERT_IS_ASCII_STRING; | |||||
73 | ||||||
74 | 13381006 | 100 | for (; x < send; ++x) { | |||
75 | 12849638 | 100 | if (!UTF8_IS_INVARIANT(*x)) | |||
76 | break; | |||||
77 | } | |||||
78 | ||||||
79 | 533506 | return x == send; | ||||
80 | } | |||||
81 | ||||||
82 | /* | |||||
83 | =for apidoc uvoffuni_to_utf8_flags | |||||
84 | ||||||
85 | THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. | |||||
86 | Instead, B |
|||||
87 | L>. | |||||
88 | ||||||
89 | This function is like them, but the input is a strict Unicode | |||||
90 | (as opposed to native) code point. Only in very rare circumstances should code | |||||
91 | not be using the native code point. | |||||
92 | ||||||
93 | For details, see the description for L>. | |||||
94 | ||||||
95 | =cut | |||||
96 | */ | |||||
97 | ||||||
98 | U8 * | |||||
99 | 8636950 | Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) | ||||
100 | { | |||||
101 | PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS; | |||||
102 | ||||||
103 | 8636950 | 100 | if (UNI_IS_INVARIANT(uv)) { | |||
104 | 4073184 | *d++ = (U8) LATIN1_TO_NATIVE(uv); | ||||
105 | 4073184 | return d; | ||||
106 | } | |||||
107 | ||||||
108 | /* The first problematic code point is the first surrogate */ | |||||
109 | 4563766 | 100 | if (uv >= UNICODE_SURROGATE_FIRST | |||
110 | 946222 | 100 | && ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR)) | |||
111 | { | |||||
112 | 923878 | 100 | if (UNICODE_IS_SURROGATE(uv)) { | |||
113 | 4936 | 50 | if (flags & UNICODE_WARN_SURROGATE) { | |||
114 | 0 | Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), | ||||
115 | "UTF-16 surrogate U+%04"UVXf, uv); | |||||
116 | } | |||||
117 | 4936 | 50 | if (flags & UNICODE_DISALLOW_SURROGATE) { | |||
118 | return NULL; | |||||
119 | } | |||||
120 | } | |||||
121 | 918942 | 100 | else if (UNICODE_IS_SUPER(uv)) { | |||
122 | 77488 | 50 | if (flags & UNICODE_WARN_SUPER | |||
123 | 77488 | 100 | || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF))) | |||
50 | ||||||
124 | { | |||||
125 | 0 | Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), | ||||
126 | "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv); | |||||
127 | } | |||||
128 | 77488 | 50 | if (flags & UNICODE_DISALLOW_SUPER | |||
129 | 77488 | 100 | || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF))) | |||
50 | ||||||
130 | { | |||||
131 | return NULL; | |||||
132 | } | |||||
133 | } | |||||
134 | 841454 | 100 | else if (UNICODE_IS_NONCHAR(uv)) { | |||
100 | ||||||
135 | 912 | 50 | if (flags & UNICODE_WARN_NONCHAR) { | |||
136 | 0 | Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), | ||||
137 | "Unicode non-character U+%04"UVXf" is illegal for open interchange", | |||||
138 | uv); | |||||
139 | } | |||||
140 | 912 | 50 | if (flags & UNICODE_DISALLOW_NONCHAR) { | |||
141 | return NULL; | |||||
142 | } | |||||
143 | } | |||||
144 | } | |||||
145 | ||||||
146 | #if defined(EBCDIC) | |||||
147 | { | |||||
148 | STRLEN len = OFFUNISKIP(uv); | |||||
149 | U8 *p = d+len-1; | |||||
150 | while (p > d) { | |||||
151 | *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); | |||||
152 | uv >>= UTF_ACCUMULATION_SHIFT; | |||||
153 | } | |||||
154 | *p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); | |||||
155 | return d+len; | |||||
156 | } | |||||
157 | #else /* Non loop style */ | |||||
158 | 4563766 | 100 | if (uv < 0x800) { | |||
159 | 1453792 | *d++ = (U8)(( uv >> 6) | 0xc0); | ||||
160 | 1453792 | *d++ = (U8)(( uv & 0x3f) | 0x80); | ||||
161 | 1453792 | return d; | ||||
162 | } | |||||
163 | 3109974 | 100 | if (uv < 0x10000) { | |||
164 | 2810290 | *d++ = (U8)(( uv >> 12) | 0xe0); | ||||
165 | 2810290 | *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); | ||||
166 | 2810290 | *d++ = (U8)(( uv & 0x3f) | 0x80); | ||||
167 | 2810290 | return d; | ||||
168 | } | |||||
169 | 299684 | 100 | if (uv < 0x200000) { | |||
170 | 222796 | *d++ = (U8)(( uv >> 18) | 0xf0); | ||||
171 | 222796 | *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); | ||||
172 | 222796 | *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); | ||||
173 | 222796 | *d++ = (U8)(( uv & 0x3f) | 0x80); | ||||
174 | 222796 | return d; | ||||
175 | } | |||||
176 | 76888 | 100 | if (uv < 0x4000000) { | |||
177 | 32 | *d++ = (U8)(( uv >> 24) | 0xf8); | ||||
178 | 32 | *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); | ||||
179 | 32 | *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); | ||||
180 | 32 | *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); | ||||
181 | 32 | *d++ = (U8)(( uv & 0x3f) | 0x80); | ||||
182 | 32 | return d; | ||||
183 | } | |||||
184 | 76856 | 100 | if (uv < 0x80000000) { | |||
185 | 68 | *d++ = (U8)(( uv >> 30) | 0xfc); | ||||
186 | 68 | *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); | ||||
187 | 68 | *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); | ||||
188 | 68 | *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); | ||||
189 | 68 | *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); | ||||
190 | 68 | *d++ = (U8)(( uv & 0x3f) | 0x80); | ||||
191 | 68 | return d; | ||||
192 | } | |||||
193 | #ifdef HAS_QUAD | |||||
194 | 76788 | 100 | if (uv < UTF8_QUAD_MAX) | |||
195 | #endif | |||||
196 | { | |||||
197 | 34 | *d++ = 0xfe; /* Can't match U+FEFF! */ | ||||
198 | 34 | *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80); | ||||
199 | 34 | *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); | ||||
200 | 34 | *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); | ||||
201 | 34 | *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); | ||||
202 | 34 | *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); | ||||
203 | 34 | *d++ = (U8)(( uv & 0x3f) | 0x80); | ||||
204 | 34 | return d; | ||||
205 | } | |||||
206 | #ifdef HAS_QUAD | |||||
207 | { | |||||
208 | 76754 | *d++ = 0xff; /* Can't match U+FFFE! */ | ||||
209 | 76754 | *d++ = 0x80; /* 6 Reserved bits */ | ||||
210 | 76754 | *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */ | ||||
211 | 76754 | *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80); | ||||
212 | 76754 | *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80); | ||||
213 | 76754 | *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80); | ||||
214 | 76754 | *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80); | ||||
215 | 76754 | *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80); | ||||
216 | 76754 | *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); | ||||
217 | 76754 | *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); | ||||
218 | 76754 | *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); | ||||
219 | 76754 | *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); | ||||
220 | 76754 | *d++ = (U8)(( uv & 0x3f) | 0x80); | ||||
221 | 4385292 | return d; | ||||
222 | } | |||||
223 | #endif | |||||
224 | #endif /* Non loop style */ | |||||
225 | } | |||||
226 | /* | |||||
227 | =for apidoc uvchr_to_utf8 | |||||
228 | ||||||
229 | Adds the UTF-8 representation of the native code point C |
|||||
230 | of the string C |
|||||
231 | bytes available. The return value is the pointer to the byte after the | |||||
232 | end of the new character. In other words, | |||||
233 | ||||||
234 | d = uvchr_to_utf8(d, uv); | |||||
235 | ||||||
236 | is the recommended wide native character-aware way of saying | |||||
237 | ||||||
238 | *(d++) = uv; | |||||
239 | ||||||
240 | This function accepts any UV as input. To forbid or warn on non-Unicode code | |||||
241 | points, or those that may be problematic, see L. | |||||
242 | ||||||
243 | =cut | |||||
244 | */ | |||||
245 | ||||||
246 | /* This is also a macro */ | |||||
247 | PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); | |||||
248 | ||||||
249 | U8 * | |||||
250 | 0 | Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) | ||||
251 | { | |||||
252 | 0 | return uvchr_to_utf8(d, uv); | ||||
253 | } | |||||
254 | ||||||
255 | /* | |||||
256 | =for apidoc uvchr_to_utf8_flags | |||||
257 | ||||||
258 | Adds the UTF-8 representation of the native code point C |
|||||
259 | of the string C |
|||||
260 | bytes available. The return value is the pointer to the byte after the | |||||
261 | end of the new character. In other words, | |||||
262 | ||||||
263 | d = uvchr_to_utf8_flags(d, uv, flags); | |||||
264 | ||||||
265 | or, in most cases, | |||||
266 | ||||||
267 | d = uvchr_to_utf8_flags(d, uv, 0); | |||||
268 | ||||||
269 | This is the Unicode-aware way of saying | |||||
270 | ||||||
271 | *(d++) = uv; | |||||
272 | ||||||
273 | This function will convert to UTF-8 (and not warn) even code points that aren't | |||||
274 | legal Unicode or are problematic, unless C |
|||||
275 | following flags: | |||||
276 | ||||||
277 | If C |
|||||
278 | the function will raise a warning, provided UTF8 warnings are enabled. If instead | |||||
279 | UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL. | |||||
280 | If both flags are set, the function will both warn and return NULL. | |||||
281 | ||||||
282 | The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly | |||||
283 | affect how the function handles a Unicode non-character. And likewise, the | |||||
284 | UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of | |||||
285 | code points that are | |||||
286 | above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are | |||||
287 | even less portable) can be warned and/or disallowed even if other above-Unicode | |||||
288 | code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF | |||||
289 | flags. | |||||
290 | ||||||
291 | And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the | |||||
292 | above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four | |||||
293 | DISALLOW flags. | |||||
294 | ||||||
295 | =cut | |||||
296 | */ | |||||
297 | ||||||
298 | /* This is also a macro */ | |||||
299 | PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags); | |||||
300 | ||||||
301 | U8 * | |||||
302 | 0 | Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) | ||||
303 | { | |||||
304 | 0 | return uvchr_to_utf8_flags(d, uv, flags); | ||||
305 | } | |||||
306 | ||||||
307 | /* | |||||
308 | ||||||
309 | Tests if the first C |
|||||
310 | character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a | |||||
311 | valid UTF-8 character. The number of bytes in the UTF-8 character | |||||
312 | will be returned if it is valid, otherwise 0. | |||||
313 | ||||||
314 | This is the "slow" version as opposed to the "fast" version which is | |||||
315 | the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed | |||||
316 | difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four | |||||
317 | or less you should use the IS_UTF8_CHAR(), for lengths of five or more | |||||
318 | you should use the _slow(). In practice this means that the _slow() | |||||
319 | will be used very rarely, since the maximum Unicode code point (as of | |||||
320 | Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only | |||||
321 | the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into | |||||
322 | five bytes or more. | |||||
323 | ||||||
324 | =cut */ | |||||
325 | PERL_STATIC_INLINE STRLEN | |||||
326 | 32 | S_is_utf8_char_slow(const U8 *s, const STRLEN len) | ||||
327 | { | |||||
328 | dTHX; /* The function called below requires thread context */ | |||||
329 | ||||||
330 | STRLEN actual_len; | |||||
331 | ||||||
332 | PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW; | |||||
333 | ||||||
334 | 32 | utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY); | ||||
335 | ||||||
336 | 32 | 50 | return (actual_len == (STRLEN) -1) ? 0 : actual_len; | |||
337 | } | |||||
338 | ||||||
339 | /* | |||||
340 | =for apidoc is_utf8_char_buf | |||||
341 | ||||||
342 | Returns the number of bytes that comprise the first UTF-8 encoded character in | |||||
343 | buffer C |
|||||
344 | buffer. 0 is returned if C |
|||||
345 | encoded character. | |||||
346 | ||||||
347 | Note that an INVARIANT character (i.e. ASCII on non-EBCDIC | |||||
348 | machines) is a valid UTF-8 character. | |||||
349 | ||||||
350 | =cut */ | |||||
351 | ||||||
352 | STRLEN | |||||
353 | 410108 | Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) | ||||
354 | { | |||||
355 | ||||||
356 | STRLEN len; | |||||
357 | ||||||
358 | PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF; | |||||
359 | ||||||
360 | 410108 | 50 | if (buf_end <= buf) { | |||
361 | return 0; | |||||
362 | } | |||||
363 | ||||||
364 | 410108 | len = buf_end - buf; | ||||
365 | 410108 | 50 | if (len > UTF8SKIP(buf)) { | |||
366 | 0 | len = UTF8SKIP(buf); | ||||
367 | } | |||||
368 | ||||||
369 | #ifdef IS_UTF8_CHAR | |||||
370 | 410108 | 100 | if (IS_UTF8_CHAR_FAST(len)) | |||
371 | 410104 | 100 | return IS_UTF8_CHAR(buf, len) ? len : 0; | |||
50 | ||||||
50 | ||||||
0 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
0 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
100 | ||||||
50 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
100 | ||||||
372 | #endif /* #ifdef IS_UTF8_CHAR */ | |||||
373 | 205056 | return is_utf8_char_slow(buf, len); | ||||
374 | } | |||||
375 | ||||||
376 | /* | |||||
377 | =for apidoc is_utf8_char | |||||
378 | ||||||
379 | Tests if some arbitrary number of bytes begins in a valid UTF-8 | |||||
380 | character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines) | |||||
381 | character is a valid UTF-8 character. The actual number of bytes in the UTF-8 | |||||
382 | character will be returned if it is valid, otherwise 0. | |||||
383 | ||||||
384 | This function is deprecated due to the possibility that malformed input could | |||||
385 | cause reading beyond the end of the input buffer. Use L | |||||
386 | instead. | |||||
387 | ||||||
388 | =cut */ | |||||
389 | ||||||
390 | STRLEN | |||||
391 | 0 | Perl_is_utf8_char(const U8 *s) | ||||
392 | { | |||||
393 | PERL_ARGS_ASSERT_IS_UTF8_CHAR; | |||||
394 | ||||||
395 | /* Assumes we have enough space, which is why this is deprecated */ | |||||
396 | 0 | return is_utf8_char_buf(s, s + UTF8SKIP(s)); | ||||
397 | } | |||||
398 | ||||||
399 | ||||||
400 | /* | |||||
401 | =for apidoc is_utf8_string | |||||
402 | ||||||
403 | Returns true if the first C |
|||||
404 | UTF-8 string, false otherwise. If C |
|||||
405 | using C |
|||||
406 | terminating NUL byte). Note that all characters being ASCII constitute 'a | |||||
407 | valid UTF-8 string'. | |||||
408 | ||||||
409 | See also L(), L(), and L(). | |||||
410 | ||||||
411 | =cut | |||||
412 | */ | |||||
413 | ||||||
414 | bool | |||||
415 | 32690 | Perl_is_utf8_string(const U8 *s, STRLEN len) | ||||
416 | { | |||||
417 | 32690 | 100 | const U8* const send = s + (len ? len : strlen((const char *)s)); | |||
418 | const U8* x = s; | |||||
419 | ||||||
420 | PERL_ARGS_ASSERT_IS_UTF8_STRING; | |||||
421 | ||||||
422 | 787575 | 100 | while (x < send) { | |||
423 | /* Inline the easy bits of is_utf8_char() here for speed... */ | |||||
424 | 738552 | 100 | if (UTF8_IS_INVARIANT(*x)) { | |||
425 | 442400 | x++; | ||||
426 | } | |||||
427 | else { | |||||
428 | /* ... and call is_utf8_char() only if really needed. */ | |||||
429 | 296152 | const STRLEN c = UTF8SKIP(x); | ||||
430 | 296152 | const U8* const next_char_ptr = x + c; | ||||
431 | ||||||
432 | 296152 | 100 | if (next_char_ptr > send) { | |||
433 | return FALSE; | |||||
434 | } | |||||
435 | ||||||
436 | 296144 | 100 | if (IS_UTF8_CHAR_FAST(c)) { | |||
437 | 296116 | 100 | if (!IS_UTF8_CHAR(x, c)) | |||
50 | ||||||
50 | ||||||
0 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
0 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
100 | ||||||
50 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
100 | ||||||
438 | return FALSE; | |||||
439 | } | |||||
440 | 369278 | 50 | else if (! is_utf8_char_slow(x, c)) { | |||
441 | return FALSE; | |||||
442 | } | |||||
443 | x = next_char_ptr; | |||||
444 | } | |||||
445 | } | |||||
446 | ||||||
447 | return TRUE; | |||||
448 | } | |||||
449 | ||||||
450 | /* | |||||
451 | Implemented as a macro in utf8.h | |||||
452 | ||||||
453 | =for apidoc is_utf8_string_loc | |||||
454 | ||||||
455 | Like L but stores the location of the failure (in the | |||||
456 | case of "utf8ness failure") or the location C |
|||||
457 | "utf8ness success") in the C |
|||||
458 | ||||||
459 | See also L() and L(). | |||||
460 | ||||||
461 | =for apidoc is_utf8_string_loclen | |||||
462 | ||||||
463 | Like L() but stores the location of the failure (in the | |||||
464 | case of "utf8ness failure") or the location C |
|||||
465 | "utf8ness success") in the C |
|||||
466 | encoded characters in the C |
|||||
467 | ||||||
468 | See also L() and L(). | |||||
469 | ||||||
470 | =cut | |||||
471 | */ | |||||
472 | ||||||
473 | bool | |||||
474 | 2290 | Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) | ||||
475 | { | |||||
476 | 2290 | 50 | const U8* const send = s + (len ? len : strlen((const char *)s)); | |||
477 | const U8* x = s; | |||||
478 | STRLEN c; | |||||
479 | STRLEN outlen = 0; | |||||
480 | ||||||
481 | PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; | |||||
482 | ||||||
483 | 98919 | 100 | while (x < send) { | |||
484 | const U8* next_char_ptr; | |||||
485 | ||||||
486 | /* Inline the easy bits of is_utf8_char() here for speed... */ | |||||
487 | 95494 | 100 | if (UTF8_IS_INVARIANT(*x)) | |||
488 | 89540 | next_char_ptr = x + 1; | ||||
489 | else { | |||||
490 | /* ... and call is_utf8_char() only if really needed. */ | |||||
491 | 5954 | c = UTF8SKIP(x); | ||||
492 | 5954 | next_char_ptr = c + x; | ||||
493 | 5954 | 100 | if (next_char_ptr > send) { | |||
494 | goto out; | |||||
495 | } | |||||
496 | 5948 | 50 | if (IS_UTF8_CHAR_FAST(c)) { | |||
497 | 5948 | 100 | if (!IS_UTF8_CHAR(x, c)) | |||
50 | ||||||
50 | ||||||
0 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
0 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
100 | ||||||
50 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
100 | ||||||
498 | c = 0; | |||||
499 | } else | |||||
500 | 0 | c = is_utf8_char_slow(x, c); | ||||
501 | 5948 | 100 | if (!c) | |||
502 | goto out; | |||||
503 | } | |||||
504 | x = next_char_ptr; | |||||
505 | 95484 | outlen++; | ||||
506 | } | |||||
507 | ||||||
508 | out: | |||||
509 | 2290 | 50 | if (el) | |||
510 | 0 | *el = outlen; | ||||
511 | ||||||
512 | 2290 | 50 | if (ep) | |||
513 | 2290 | *ep = x; | ||||
514 | 2290 | return (x == send); | ||||
515 | } | |||||
516 | ||||||
517 | /* | |||||
518 | ||||||
519 | =for apidoc utf8n_to_uvchr | |||||
520 | ||||||
521 | THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. | |||||
522 | Most code should use L() rather than call this directly. | |||||
523 | ||||||
524 | Bottom level UTF-8 decode routine. | |||||
525 | Returns the native code point value of the first character in the string C |
|||||
526 | which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than | |||||
527 | C |
|||||
528 | the length, in bytes, of that character. | |||||
529 | ||||||
530 | The value of C |
|||||
531 | well-formed UTF-8 character. If C |
|||||
532 | zero is returned and C<*retlen> is set so that (S |
|||||
533 | next possible position in C |
|||||
534 | Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised. | |||||
535 | ||||||
536 | Various ALLOW flags can be set in C |
|||||
537 | individual types of malformations, such as the sequence being overlong (that | |||||
538 | is, when there is a shorter sequence that can express the same code point; | |||||
539 | overlong sequences are expressly forbidden in the UTF-8 standard due to | |||||
540 | potential security issues). Another malformation example is the first byte of | |||||
541 | a character not being a legal first byte. See F |
|||||
542 | flags. For allowed 0 length strings, this function returns 0; for allowed | |||||
543 | overlong sequences, the computed code point is returned; for all other allowed | |||||
544 | malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no | |||||
545 | determinable reasonable value. | |||||
546 | ||||||
547 | The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other | |||||
548 | flags) malformation is found. If this flag is set, the routine assumes that | |||||
549 | the caller will raise a warning, and this function will silently just set | |||||
550 | C |
|||||
551 | ||||||
552 | Note that this API requires disambiguation between successful decoding a NUL | |||||
553 | character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as | |||||
554 | in both cases, 0 is returned. To disambiguate, upon a zero return, see if the | |||||
555 | first byte of C |
|||||
556 | had an error. | |||||
557 | ||||||
558 | Certain code points are considered problematic. These are Unicode surrogates, | |||||
559 | Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF. | |||||
560 | By default these are considered regular code points, but certain situations | |||||
561 | warrant special handling for them. If C |
|||||
562 | UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as | |||||
563 | malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE, | |||||
564 | UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode | |||||
565 | maximum) can be set to disallow these categories individually. | |||||
566 | ||||||
567 | The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE, | |||||
568 | UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised | |||||
569 | for their respective categories, but otherwise the code points are considered | |||||
570 | valid (not malformations). To get a category to both be treated as a | |||||
571 | malformation and raise a warning, specify both the WARN and DISALLOW flags. | |||||
572 | (But note that warnings are not raised if lexically disabled nor if | |||||
573 | UTF8_CHECK_ONLY is also specified.) | |||||
574 | ||||||
575 | Very large code points (above 0x7FFF_FFFF) are considered more problematic than | |||||
576 | the others that are above the Unicode legal maximum. There are several | |||||
577 | reasons: they requre at least 32 bits to represent them on ASCII platforms, are | |||||
578 | not representable at all on EBCDIC platforms, and the original UTF-8 | |||||
579 | specification never went above this number (the current 0x10FFFF limit was | |||||
580 | imposed later). (The smaller ones, those that fit into 32 bits, are | |||||
581 | representable by a UV on ASCII platforms, but not by an IV, which means that | |||||
582 | the number of operations that can be performed on them is quite restricted.) | |||||
583 | The UTF-8 encoding on ASCII platforms for these large code points begins with a | |||||
584 | byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to | |||||
585 | be treated as malformations, while allowing smaller above-Unicode code points. | |||||
586 | (Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points, | |||||
587 | including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like | |||||
588 | the other WARN flags, but applies just to these code points. | |||||
589 | ||||||
590 | All other code points corresponding to Unicode characters, including private | |||||
591 | use and those yet to be assigned, are never considered malformed and never | |||||
592 | warn. | |||||
593 | ||||||
594 | =cut | |||||
595 | */ | |||||
596 | ||||||
597 | UV | |||||
598 | 16071668 | Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) | ||||
599 | { | |||||
600 | dVAR; | |||||
601 | const U8 * const s0 = s; | |||||
602 | U8 overflow_byte = '\0'; /* Save byte in case of overflow */ | |||||
603 | U8 * send; | |||||
604 | 16071668 | UV uv = *s; | ||||
605 | STRLEN expectlen; | |||||
606 | SV* sv = NULL; | |||||
607 | UV outlier_ret = 0; /* return value when input is in error or problematic | |||||
608 | */ | |||||
609 | UV pack_warn = 0; /* Save result of packWARN() for later */ | |||||
610 | bool unexpected_non_continuation = FALSE; | |||||
611 | bool overflowed = FALSE; | |||||
612 | bool do_overlong_test = TRUE; /* May have to skip this test */ | |||||
613 | ||||||
614 | const char* const malformed_text = "Malformed UTF-8 character"; | |||||
615 | ||||||
616 | PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; | |||||
617 | ||||||
618 | /* The order of malformation tests here is important. We should consume as | |||||
619 | * few bytes as possible in order to not skip any valid character. This is | |||||
620 | * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also | |||||
621 | * http://unicode.org/reports/tr36 for more discussion as to why. For | |||||
622 | * example, once we've done a UTF8SKIP, we can tell the expected number of | |||||
623 | * bytes, and could fail right off the bat if the input parameters indicate | |||||
624 | * that there are too few available. But it could be that just that first | |||||
625 | * byte is garbled, and the intended character occupies fewer bytes. If we | |||||
626 | * blindly assumed that the first byte is correct, and skipped based on | |||||
627 | * that number, we could skip over a valid input character. So instead, we | |||||
628 | * always examine the sequence byte-by-byte. | |||||
629 | * | |||||
630 | * We also should not consume too few bytes, otherwise someone could inject | |||||
631 | * things. For example, an input could be deliberately designed to | |||||
632 | * overflow, and if this code bailed out immediately upon discovering that, | |||||
633 | * returning to the caller *retlen pointing to the very next byte (one | |||||
634 | * which is actually part of of the overflowing sequence), that could look | |||||
635 | * legitimate to the caller, which could discard the initial partial | |||||
636 | * sequence and process the rest, inappropriately */ | |||||
637 | ||||||
638 | /* Zero length strings, if allowed, of necessity are zero */ | |||||
639 | 16071668 | 100 | if (UNLIKELY(curlen == 0)) { | |||
640 | 10 | 50 | if (retlen) { | |||
641 | 10 | *retlen = 0; | ||||
642 | } | |||||
643 | ||||||
644 | 10 | 100 | if (flags & UTF8_ALLOW_EMPTY) { | |||
645 | return 0; | |||||
646 | } | |||||
647 | 8 | 100 | if (! (flags & UTF8_CHECK_ONLY)) { | |||
648 | 4 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text)); | ||||
649 | } | |||||
650 | goto malformed; | |||||
651 | } | |||||
652 | ||||||
653 | 16071658 | expectlen = UTF8SKIP(s); | ||||
654 | ||||||
655 | /* A well-formed UTF-8 character, as the vast majority of calls to this | |||||
656 | * function will be for, has this expected length. For efficiency, set | |||||
657 | * things up here to return it. It will be overriden only in those rare | |||||
658 | * cases where a malformation is found */ | |||||
659 | 16071658 | 100 | if (retlen) { | |||
660 | 13947306 | *retlen = expectlen; | ||||
661 | } | |||||
662 | ||||||
663 | /* An invariant is trivially well-formed */ | |||||
664 | 16071658 | 100 | if (UTF8_IS_INVARIANT(uv)) { | |||
665 | return uv; | |||||
666 | } | |||||
667 | ||||||
668 | /* A continuation character can't start a valid sequence */ | |||||
669 | 4409996 | 100 | if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { | |||
670 | 198 | 100 | if (flags & UTF8_ALLOW_CONTINUATION) { | |||
671 | 2 | 50 | if (retlen) { | |||
672 | 2 | *retlen = 1; | ||||
673 | } | |||||
674 | return UNICODE_REPLACEMENT; | |||||
675 | } | |||||
676 | ||||||
677 | 196 | 100 | if (! (flags & UTF8_CHECK_ONLY)) { | |||
678 | 194 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0)); | ||||
679 | } | |||||
680 | curlen = 1; | |||||
681 | goto malformed; | |||||
682 | } | |||||
683 | ||||||
684 | /* Here is not a continuation byte, nor an invariant. The only thing left | |||||
685 | * is a start byte (possibly for an overlong) */ | |||||
686 | ||||||
687 | #ifdef EBCDIC | |||||
688 | uv = NATIVE_UTF8_TO_I8(uv); | |||||
689 | #endif | |||||
690 | ||||||
691 | /* Remove the leading bits that indicate the number of bytes in the | |||||
692 | * character's whole UTF-8 sequence, leaving just the bits that are part of | |||||
693 | * the value */ | |||||
694 | 4409798 | 100 | uv &= UTF_START_MASK(expectlen); | |||
695 | ||||||
696 | /* Now, loop through the remaining bytes in the character's sequence, | |||||
697 | * accumulating each into the working value as we go. Be sure to not look | |||||
698 | * past the end of the input string */ | |||||
699 | 4409798 | send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen); | ||||
700 | ||||||
701 | 12275328 | 100 | for (s = s0 + 1; s < send; s++) { | |||
702 | 7866060 | 100 | if (LIKELY(UTF8_IS_CONTINUATION(*s))) { | |||
703 | #ifndef EBCDIC /* Can't overflow in EBCDIC */ | |||||
704 | 7865530 | 100 | if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) { | |||
705 | ||||||
706 | /* The original implementors viewed this malformation as more | |||||
707 | * serious than the others (though I, khw, don't understand | |||||
708 | * why, since other malformations also give very very wrong | |||||
709 | * results), so there is no way to turn off checking for it. | |||||
710 | * Set a flag, but keep going in the loop, so that we absorb | |||||
711 | * the rest of the bytes that comprise the character. */ | |||||
712 | overflowed = TRUE; | |||||
713 | 92 | overflow_byte = *s; /* Save for warning message's use */ | ||||
714 | } | |||||
715 | #endif | |||||
716 | 7865530 | uv = UTF8_ACCUMULATE(uv, *s); | ||||
717 | } | |||||
718 | else { | |||||
719 | /* Here, found a non-continuation before processing all expected | |||||
720 | * bytes. This byte begins a new character, so quit, even if | |||||
721 | * allowing this malformation. */ | |||||
722 | unexpected_non_continuation = TRUE; | |||||
723 | break; | |||||
724 | } | |||||
725 | } /* End of loop through the character's bytes */ | |||||
726 | ||||||
727 | /* Save how many bytes were actually in the character */ | |||||
728 | 4409798 | curlen = s - s0; | ||||
729 | ||||||
730 | /* The loop above finds two types of malformations: non-continuation and/or | |||||
731 | * overflow. The non-continuation malformation is really a too-short | |||||
732 | * malformation, as it means that the current character ended before it was | |||||
733 | * expected to (being terminated prematurely by the beginning of the next | |||||
734 | * character, whereas in the too-short malformation there just are too few | |||||
735 | * bytes available to hold the character. In both cases, the check below | |||||
736 | * that we have found the expected number of bytes would fail if executed.) | |||||
737 | * Thus the non-continuation malformation is really unnecessary, being a | |||||
738 | * subset of the too-short malformation. But there may be existing | |||||
739 | * applications that are expecting the non-continuation type, so we retain | |||||
740 | * it, and return it in preference to the too-short malformation. (If this | |||||
741 | * code were being written from scratch, the two types might be collapsed | |||||
742 | * into one.) I, khw, am also giving priority to returning the | |||||
743 | * non-continuation and too-short malformations over overflow when multiple | |||||
744 | * ones are present. I don't know of any real reason to prefer one over | |||||
745 | * the other, except that it seems to me that multiple-byte errors trumps | |||||
746 | * errors from a single byte */ | |||||
747 | 4409798 | 100 | if (UNLIKELY(unexpected_non_continuation)) { | |||
748 | 530 | 100 | if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) { | |||
749 | 526 | 100 | if (! (flags & UTF8_CHECK_ONLY)) { | |||
750 | 272 | 100 | if (curlen == 1) { | |||
751 | 254 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0)); | ||||
752 | } | |||||
753 | else { | |||||
754 | 18 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen)); | ||||
755 | } | |||||
756 | } | |||||
757 | goto malformed; | |||||
758 | } | |||||
759 | uv = UNICODE_REPLACEMENT; | |||||
760 | ||||||
761 | /* Skip testing for overlongs, as the REPLACEMENT may not be the same | |||||
762 | * as what the original expectations were. */ | |||||
763 | do_overlong_test = FALSE; | |||||
764 | 4 | 50 | if (retlen) { | |||
765 | 4 | *retlen = curlen; | ||||
766 | } | |||||
767 | } | |||||
768 | 4409268 | 100 | else if (UNLIKELY(curlen < expectlen)) { | |||
769 | 44 | 100 | if (! (flags & UTF8_ALLOW_SHORT)) { | |||
770 | 42 | 100 | if (! (flags & UTF8_CHECK_ONLY)) { | |||
771 | 40 | 100 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0)); | |||
772 | } | |||||
773 | goto malformed; | |||||
774 | } | |||||
775 | uv = UNICODE_REPLACEMENT; | |||||
776 | do_overlong_test = FALSE; | |||||
777 | 2 | 50 | if (retlen) { | |||
778 | 2 | *retlen = curlen; | ||||
779 | } | |||||
780 | } | |||||
781 | ||||||
782 | #ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */ | |||||
783 | 4409230 | 100 | if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */ | |||
784 | 68358 | 100 | && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF))) | |||
785 | { | |||||
786 | /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary | |||||
787 | * generation of the sv, since no warnings are raised under CHECK */ | |||||
788 | 150 | 100 | if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF | |||
789 | 60 | 100 | && ckWARN_d(WARN_UTF8)) | |||
790 | { | |||||
791 | /* This message is deliberately not of the same syntax as the other | |||||
792 | * messages for malformations, for backwards compatibility in the | |||||
793 | * unlikely event that code is relying on its precise earlier text | |||||
794 | */ | |||||
795 | 12 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0)); | ||||
796 | pack_warn = packWARN(WARN_UTF8); | |||||
797 | } | |||||
798 | 150 | 100 | if (flags & UTF8_DISALLOW_FE_FF) { | |||
799 | goto malformed; | |||||
800 | } | |||||
801 | } | |||||
802 | 4409110 | 100 | if (UNLIKELY(overflowed)) { | |||
803 | ||||||
804 | /* If the first byte is FF, it will overflow a 32-bit word. If the | |||||
805 | * first byte is FE, it will overflow a signed 32-bit word. The | |||||
806 | * above preserves backward compatibility, since its message was used | |||||
807 | * in earlier versions of this code in preference to overflow */ | |||||
808 | 46 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0)); | ||||
809 | 46 | goto malformed; | ||||
810 | } | |||||
811 | #endif | |||||
812 | ||||||
813 | 4409064 | 100 | if (do_overlong_test | |||
814 | 4409058 | 100 | && expectlen > (STRLEN) OFFUNISKIP(uv) | |||
100 | ||||||
100 | ||||||
100 | ||||||
100 | ||||||
100 | ||||||
100 | ||||||
100 | ||||||
815 | 44 | 100 | && ! (flags & UTF8_ALLOW_LONG)) | |||
816 | { | |||||
817 | /* The overlong malformation has lower precedence than the others. | |||||
818 | * Note that if this malformation is allowed, we return the actual | |||||
819 | * value, instead of the replacement character. This is because this | |||||
820 | * value is actually well-defined. */ | |||||
821 | 42 | 100 | if (! (flags & UTF8_CHECK_ONLY)) { | |||
822 | 36 | 100 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0)); | |||
100 | ||||||
100 | ||||||
100 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
50 | ||||||
823 | } | |||||
824 | goto malformed; | |||||
825 | } | |||||
826 | ||||||
827 | /* Here, the input is considered to be well-formed , but could be a | |||||
828 | * problematic code point that is not allowed by the input parameters. */ | |||||
829 | 4409022 | 100 | if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */ | |||
830 | 685448 | 100 | && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE | |||
831 | |UTF8_WARN_ILLEGAL_INTERCHANGE))) | |||||
832 | { | |||||
833 | 7866 | 100 | if (UNICODE_IS_SURROGATE(uv)) { | |||
834 | 64 | 100 | if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE | |||
835 | 20 | 100 | && ckWARN2_d(WARN_UTF8, WARN_SURROGATE)) | |||
836 | { | |||||
837 | 8 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv)); | ||||
838 | pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE); | |||||
839 | } | |||||
840 | 64 | 100 | if (flags & UTF8_DISALLOW_SURROGATE) { | |||
841 | goto disallowed; | |||||
842 | } | |||||
843 | } | |||||
844 | 7802 | 100 | else if ((uv > PERL_UNICODE_MAX)) { | |||
845 | 86 | 100 | if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER | |||
846 | 20 | 100 | && ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE)) | |||
847 | { | |||||
848 | 8 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv)); | ||||
849 | pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE); | |||||
850 | } | |||||
851 | 86 | 100 | if (flags & UTF8_DISALLOW_SUPER) { | |||
852 | goto disallowed; | |||||
853 | } | |||||
854 | } | |||||
855 | 7716 | 100 | else if (UNICODE_IS_NONCHAR(uv)) { | |||
100 | ||||||
856 | 52 | 100 | if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR | |||
857 | 20 | 100 | && ckWARN2_d(WARN_UTF8, WARN_NONCHAR)) | |||
858 | { | |||||
859 | 8 | sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv)); | ||||
860 | pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR); | |||||
861 | } | |||||
862 | 52 | 100 | if (flags & UTF8_DISALLOW_NONCHAR) { | |||
863 | goto disallowed; | |||||
864 | } | |||||
865 | } | |||||
866 | ||||||
867 | 7718 | 100 | if (sv) { | |||
868 | outlier_ret = uv; /* Note we don't bother to convert to native, | |||||
869 | as all the outlier code points are the same | |||||
870 | in both ASCII and EBCDIC */ | |||||
871 | goto do_warn; | |||||
872 | } | |||||
873 | ||||||
874 | /* Here, this is not considered a malformed character, so drop through | |||||
875 | * to return it */ | |||||
876 | } | |||||
877 | ||||||
878 | return UNI_TO_NATIVE(uv); | |||||
879 | ||||||
880 | /* There are three cases which get to beyond this point. In all 3 cases: | |||||
881 | * |
|||||
882 | * |
|||||
883 | * set. | |||||
884 | * |
|||||
885 | * This is done by initializing it to 0, and changing it only | |||||
886 | * for case 1). | |||||
887 | * The 3 cases are: | |||||
888 | * 1) The input is valid but problematic, and to be warned about. The | |||||
889 | * return value is the resultant code point; <*retlen> is set to | |||||
890 | * |
|||||
891 | * |
|||||
892 | * types. The entry point for this case is the label |
|||||
893 | * 2) The input is a valid code point but disallowed by the parameters to | |||||
894 | * this function. The return value is 0. If UTF8_CHECK_ONLY is set, | |||||
895 | * <*relen> is -1; otherwise it is |
|||||
896 | * comprise the code point. |
|||||
897 | * packWARN() for the warning types. The entry point for this case is | |||||
898 | * the label |
|||||
899 | * 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY | |||||
900 | * is set, <*relen> is -1; otherwise it is |
|||||
901 | * bytes that comprise the malformation. All such malformations are | |||||
902 | * assumed to be warning type |
|||||
903 | * is the label |
|||||
904 | */ | |||||
905 | ||||||
906 | malformed: | |||||
907 | ||||||
908 | 980 | 100 | if (sv && ckWARN_d(WARN_UTF8)) { | |||
100 | ||||||
909 | pack_warn = packWARN(WARN_UTF8); | |||||
910 | } | |||||
911 | ||||||
912 | disallowed: | |||||
913 | ||||||
914 | 1128 | 100 | if (flags & UTF8_CHECK_ONLY) { | |||
915 | 438 | 50 | if (retlen) | |||
916 | 438 | *retlen = ((STRLEN) -1); | ||||
917 | return 0; | |||||
918 | } | |||||
919 | ||||||
920 | do_warn: | |||||
921 | ||||||
922 | 706 | 100 | if (pack_warn) { /* |
|||
923 | if warnings are to be raised. */ | |||||
924 | 562 | const char * const string = SvPVX_const(sv); | ||||
925 | ||||||
926 | 562 | 100 | if (PL_op) | |||
927 | 548 | 50 | Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op)); | |||
0 | ||||||
928 | else | |||||
929 | 14 | Perl_warner(aTHX_ pack_warn, "%s", string); | ||||
930 | } | |||||
931 | ||||||
932 | 706 | 100 | if (retlen) { | |||
933 | 8036132 | *retlen = curlen; | ||||
934 | } | |||||
935 | ||||||
936 | return outlier_ret; | |||||
937 | } | |||||
938 | ||||||
939 | /* | |||||
940 | =for apidoc utf8_to_uvchr_buf | |||||
941 | ||||||
942 | Returns the native code point of the first character in the string C |
|||||
943 | is assumed to be in UTF-8 encoding; C |
|||||
944 | C<*retlen> will be set to the length, in bytes, of that character. | |||||
945 | ||||||
946 | If C |
|||||
947 | enabled, zero is returned and C<*retlen> is set (if C |
|||||
948 | NULL) to -1. If those warnings are off, the computed value, if well-defined | |||||
949 | (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and | |||||
950 | C<*retlen> is set (if C |
|||||
951 | the next possible position in C |
|||||
952 | See L for details on when the REPLACEMENT CHARACTER is | |||||
953 | returned. | |||||
954 | ||||||
955 | =cut | |||||
956 | */ | |||||
957 | ||||||
958 | ||||||
959 | UV | |||||
960 | 0 | Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) | ||||
961 | { | |||||
962 | assert(s < send); | |||||
963 | ||||||
964 | 0 | 0 | return utf8n_to_uvchr(s, send - s, retlen, | |||
965 | ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); | |||||
966 | } | |||||
967 | ||||||
968 | /* Like L(), but should only be called when it is known that | |||||
969 | * there are no malformations in the input UTF-8 string C |
|||||
970 | * non-character code points, and non-Unicode code points are allowed. A macro | |||||
971 | * in utf8.h is used to normally avoid this function wrapper */ | |||||
972 | ||||||
973 | UV | |||||
974 | 6448256 | Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) | ||||
975 | { | |||||
976 | 6448256 | UV expectlen = UTF8SKIP(s); | ||||
977 | 6448256 | const U8* send = s + expectlen; | ||||
978 | 6448256 | UV uv = *s; | ||||
979 | ||||||
980 | PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; | |||||
981 | ||||||
982 | 6448256 | 100 | if (retlen) { | |||
983 | 414938 | *retlen = expectlen; | ||||
984 | } | |||||
985 | ||||||
986 | /* An invariant is trivially returned */ | |||||
987 | 6448256 | 100 | if (expectlen == 1) { | |||
988 | return uv; | |||||
989 | } | |||||
990 | ||||||
991 | #ifdef EBCDIC | |||||
992 | uv = NATIVE_UTF8_TO_I8(uv); | |||||
993 | #endif | |||||
994 | ||||||
995 | /* Remove the leading bits that indicate the number of bytes, leaving just | |||||
996 | * the bits that are part of the value */ | |||||
997 | 4289160 | 100 | uv &= UTF_START_MASK(expectlen); | |||
998 | ||||||
999 | /* Now, loop through the remaining bytes, accumulating each into the | |||||
1000 | * working total as we go. (I khw tried unrolling the loop for up to 4 | |||||
1001 | * bytes, but there was no performance improvement) */ | |||||
1002 | 11936924 | 100 | for (++s; s < send; s++) { | |||
1003 | 6568216 | uv = UTF8_ACCUMULATE(uv, *s); | ||||
1004 | } | |||||
1005 | ||||||
1006 | return UNI_TO_NATIVE(uv); | |||||
1007 | ||||||
1008 | } | |||||
1009 | ||||||
1010 | /* | |||||
1011 | =for apidoc utf8_to_uvchr | |||||
1012 | ||||||
1013 | Returns the native code point of the first character in the string C |
|||||
1014 | which is assumed to be in UTF-8 encoding; C |
|||||
1015 | length, in bytes, of that character. | |||||
1016 | ||||||
1017 | Some, but not all, UTF-8 malformations are detected, and in fact, some | |||||
1018 | malformed input could cause reading beyond the end of the input buffer, which | |||||
1019 | is why this function is deprecated. Use L instead. | |||||
1020 | ||||||
1021 | If C |
|||||
1022 | enabled, zero is returned and C<*retlen> is set (if C |
|||||
1023 | NULL) to -1. If those warnings are off, the computed value if well-defined (or | |||||
1024 | the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> | |||||
1025 | is set (if C |
|||||
1026 | next possible position in C |
|||||
1027 | See L for details on when the REPLACEMENT CHARACTER is returned. | |||||
1028 | ||||||
1029 | =cut | |||||
1030 | */ | |||||
1031 | ||||||
1032 | UV | |||||
1033 | 0 | Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) | ||||
1034 | { | |||||
1035 | PERL_ARGS_ASSERT_UTF8_TO_UVCHR; | |||||
1036 | ||||||
1037 | 0 | 0 | return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen); | |||
1038 | } | |||||
1039 | ||||||
1040 | /* | |||||
1041 | =for apidoc utf8_to_uvuni_buf | |||||
1042 | ||||||
1043 | Only in very rare circumstances should code need to be dealing in Unicode | |||||
1044 | (as opposed to native) code points. In those few cases, use | |||||
1045 | C |
|||||
1046 | ||||||
1047 | Returns the Unicode (not-native) code point of the first character in the | |||||
1048 | string C |
|||||
1049 | is assumed to be in UTF-8 encoding; C |
|||||
1050 | C |
|||||
1051 | ||||||
1052 | If C |
|||||
1053 | enabled, zero is returned and C<*retlen> is set (if C |
|||||
1054 | NULL) to -1. If those warnings are off, the computed value if well-defined (or | |||||
1055 | the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> | |||||
1056 | is set (if C |
|||||
1057 | next possible position in C |
|||||
1058 | See L for details on when the REPLACEMENT CHARACTER is returned. | |||||
1059 | ||||||
1060 | =cut | |||||
1061 | */ | |||||
1062 | ||||||
1063 | UV | |||||
1064 | 0 | Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) | ||||
1065 | { | |||||
1066 | PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF; | |||||
1067 | ||||||
1068 | assert(send > s); | |||||
1069 | ||||||
1070 | /* Call the low level routine asking for checks */ | |||||
1071 | 0 | 0 | return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen, | |||
1072 | ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)); | |||||
1073 | } | |||||
1074 | ||||||
1075 | /* DEPRECATED! | |||||
1076 | * Like L(), but should only be called when it is known that | |||||
1077 | * there are no malformations in the input UTF-8 string C |
|||||
1078 | * non-character code points, and non-Unicode code points are allowed */ | |||||
1079 | ||||||
1080 | UV | |||||
1081 | 0 | Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) | ||||
1082 | { | |||||
1083 | PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI; | |||||
1084 | ||||||
1085 | 0 | return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen)); | ||||
1086 | } | |||||
1087 | ||||||
1088 | /* | |||||
1089 | =for apidoc utf8_to_uvuni | |||||
1090 | ||||||
1091 | Returns the Unicode code point of the first character in the string C |
|||||
1092 | which is assumed to be in UTF-8 encoding; C |
|||||
1093 | length, in bytes, of that character. | |||||
1094 | ||||||
1095 | Some, but not all, UTF-8 malformations are detected, and in fact, some | |||||
1096 | malformed input could cause reading beyond the end of the input buffer, which | |||||
1097 | is one reason why this function is deprecated. The other is that only in | |||||
1098 | extremely limited circumstances should the Unicode versus native code point be | |||||
1099 | of any interest to you. See L for alternatives. | |||||
1100 | ||||||
1101 | If C |
|||||
1102 | enabled, zero is returned and C<*retlen> is set (if C |
|||||
1103 | NULL) to -1. If those warnings are off, the computed value if well-defined (or | |||||
1104 | the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> | |||||
1105 | is set (if C |
|||||
1106 | next possible position in C |
|||||
1107 | See L for details on when the REPLACEMENT CHARACTER is returned. | |||||
1108 | ||||||
1109 | =cut | |||||
1110 | */ | |||||
1111 | ||||||
1112 | UV | |||||
1113 | 0 | Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) | ||||
1114 | { | |||||
1115 | PERL_ARGS_ASSERT_UTF8_TO_UVUNI; | |||||
1116 | ||||||
1117 | 0 | return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen)); | ||||
1118 | } | |||||
1119 | ||||||
1120 | /* | |||||
1121 | =for apidoc utf8_length | |||||
1122 | ||||||
1123 | Return the length of the UTF-8 char encoded string C |
|||||
1124 | Stops at C |
|||||
1125 | up past C |
|||||
1126 | ||||||
1127 | =cut | |||||
1128 | */ | |||||
1129 | ||||||
1130 | STRLEN | |||||
1131 | 7119006 | Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) | ||||
1132 | { | |||||
1133 | dVAR; | |||||
1134 | STRLEN len = 0; | |||||
1135 | ||||||
1136 | PERL_ARGS_ASSERT_UTF8_LENGTH; | |||||
1137 | ||||||
1138 | /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. | |||||
1139 | * the bitops (especially ~) can create illegal UTF-8. | |||||
1140 | * In other words: in Perl UTF-8 is not just for Unicode. */ | |||||
1141 | ||||||
1142 | 7119006 | 50 | if (e < s) | |||
1143 | goto warn_and_return; | |||||
1144 | 629958130 | 100 | while (s < e) { | |||
1145 | 622839124 | s += UTF8SKIP(s); | ||||
1146 | 622839124 | len++; | ||||
1147 | } | |||||
1148 | ||||||
1149 | 7119006 | 100 | if (e != s) { | |||
1150 | 32 | len--; | ||||
1151 | warn_and_return: | |||||
1152 | 32 | 50 | if (PL_op) | |||
1153 | 48 | 50 | Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), | |||
1154 | 16 | 0 | "%s in %s", unees, OP_DESC(PL_op)); | |||
1155 | else | |||||
1156 | 0 | Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees); | ||||
1157 | } | |||||
1158 | ||||||
1159 | 7119006 | return len; | ||||
1160 | } | |||||
1161 | ||||||
1162 | /* | |||||
1163 | =for apidoc utf8_distance | |||||
1164 | ||||||
1165 | Returns the number of UTF-8 characters between the UTF-8 pointers C | |||||
1166 | and C. | |||||
1167 | ||||||
1168 | WARNING: use only if you *know* that the pointers point inside the | |||||
1169 | same UTF-8 buffer. | |||||
1170 | ||||||
1171 | =cut | |||||
1172 | */ | |||||
1173 | ||||||
1174 | IV | |||||
1175 | 377416 | Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) | ||||
1176 | { | |||||
1177 | PERL_ARGS_ASSERT_UTF8_DISTANCE; | |||||
1178 | ||||||
1179 | 377416 | 50 | return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); | |||
1180 | } | |||||
1181 | ||||||
1182 | /* | |||||
1183 | =for apidoc utf8_hop | |||||
1184 | ||||||
1185 | Return the UTF-8 pointer C |
|||||
1186 | forward or backward. | |||||
1187 | ||||||
1188 | WARNING: do not use the following unless you *know* C |
|||||
1189 | the UTF-8 data pointed to by C |
|||||
1190 | on the first byte of character or just after the last byte of a character. | |||||
1191 | ||||||
1192 | =cut | |||||
1193 | */ | |||||
1194 | ||||||
1195 | U8 * | |||||
1196 | 61814 | Perl_utf8_hop(pTHX_ const U8 *s, I32 off) | ||||
1197 | { | |||||
1198 | PERL_ARGS_ASSERT_UTF8_HOP; | |||||
1199 | ||||||
1200 | PERL_UNUSED_CONTEXT; | |||||
1201 | /* Note: cannot use UTF8_IS_...() too eagerly here since e.g | |||||
1202 | * the bitops (especially ~) can create illegal UTF-8. | |||||
1203 | * In other words: in Perl UTF-8 is not just for Unicode. */ | |||||
1204 | ||||||
1205 | 61814 | 100 | if (off >= 0) { | |||
1206 | 1224442 | 100 | while (off--) | |||
1207 | 1164038 | s += UTF8SKIP(s); | ||||
1208 | } | |||||
1209 | else { | |||||
1210 | 3990 | 100 | while (off++) { | |||
1211 | 2580 | s--; | ||||
1212 | 5085 | 100 | while (UTF8_IS_CONTINUATION(*s)) | |||
1213 | 510 | s--; | ||||
1214 | } | |||||
1215 | } | |||||
1216 | 61814 | return (U8 *)s; | ||||
1217 | } | |||||
1218 | ||||||
1219 | /* | |||||
1220 | =for apidoc bytes_cmp_utf8 | |||||
1221 | ||||||
1222 | Compares the sequence of characters (stored as octets) in C, C |
|||||
1223 | sequence of characters (stored as UTF-8) in C, C |
|||||
1224 | equal, -1 or -2 if the first string is less than the second string, +1 or +2 | |||||
1225 | if the first string is greater than the second string. | |||||
1226 | ||||||
1227 | -1 or +1 is returned if the shorter string was identical to the start of the | |||||
1228 | longer string. -2 or +2 is returned if the was a difference between characters | |||||
1229 | within the strings. | |||||
1230 | ||||||
1231 | =cut | |||||
1232 | */ | |||||
1233 | ||||||
1234 | int | |||||
1235 | 273870 | Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) | ||||
1236 | { | |||||
1237 | 273870 | const U8 *const bend = b + blen; | ||||
1238 | 273870 | const U8 *const uend = u + ulen; | ||||
1239 | ||||||
1240 | PERL_ARGS_ASSERT_BYTES_CMP_UTF8; | |||||
1241 | ||||||
1242 | PERL_UNUSED_CONTEXT; | |||||
1243 | ||||||
1244 | 4034843 | 100 | while (b < bend && u < uend) { | |||
1245 | 3855934 | U8 c = *u++; | ||||
1246 | 3855934 | 100 | if (!UTF8_IS_INVARIANT(c)) { | |||
1247 | 185108 | 100 | if (UTF8_IS_DOWNGRADEABLE_START(c)) { | |||
1248 | 118226 | 50 | if (u < uend) { | |||
1249 | 118226 | U8 c1 = *u++; | ||||
1250 | 118226 | 50 | if (UTF8_IS_CONTINUATION(c1)) { | |||
1251 | 118226 | c = TWO_BYTE_UTF8_TO_NATIVE(c, c1); | ||||
1252 | } else { | |||||
1253 | 0 | 0 | Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), | |||
0 | ||||||
1254 | "Malformed UTF-8 character " | |||||
1255 | "(unexpected non-continuation byte 0x%02x" | |||||
1256 | ", immediately after start byte 0x%02x)" | |||||
1257 | /* Dear diag.t, it's in the pod. */ | |||||
1258 | "%s%s", c1, c, | |||||
1259 | 0 | PL_op ? " in " : "", | ||||
1260 | 0 | 0 | PL_op ? OP_DESC(PL_op) : ""); | |||
0 | ||||||
1261 | 0 | return -2; | ||||
1262 | } | |||||
1263 | } else { | |||||
1264 | 0 | 0 | if (PL_op) | |||
1265 | 0 | 0 | Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), | |||
1266 | 0 | 0 | "%s in %s", unees, OP_DESC(PL_op)); | |||
1267 | else | |||||
1268 | 0 | Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees); | ||||
1269 | return -2; /* Really want to return undef :-) */ | |||||
1270 | } | |||||
1271 | } else { | |||||
1272 | return -2; | |||||
1273 | } | |||||
1274 | } | |||||
1275 | 3789052 | 100 | if (*b != c) { | |||
1276 | 165014 | 100 | return *b < c ? -2 : +2; | |||
1277 | } | |||||
1278 | 3624038 | ++b; | ||||
1279 | } | |||||
1280 | ||||||
1281 | 41974 | 100 | if (b == bend && u == uend) | |||
1282 | return 0; | |||||
1283 | ||||||
1284 | 137293 | 100 | return b < bend ? +1 : -1; | |||
1285 | } | |||||
1286 | ||||||
1287 | /* | |||||
1288 | =for apidoc utf8_to_bytes | |||||
1289 | ||||||
1290 | Converts a string C |
|||||
1291 | Unlike L, this over-writes the original string, and | |||||
1292 | updates C |
|||||
1293 | Returns zero on failure, setting C |
|||||
1294 | ||||||
1295 | If you need a copy of the string, see L. | |||||
1296 | ||||||
1297 | =cut | |||||
1298 | */ | |||||
1299 | ||||||
1300 | U8 * | |||||
1301 | 170464 | Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) | ||||
1302 | { | |||||
1303 | U8 * const save = s; | |||||
1304 | 170464 | U8 * const send = s + *len; | ||||
1305 | U8 *d; | |||||
1306 | ||||||
1307 | PERL_ARGS_ASSERT_UTF8_TO_BYTES; | |||||
1308 | ||||||
1309 | /* ensure valid UTF-8 and chars < 256 before updating string */ | |||||
1310 | 1013490 | 100 | while (s < send) { | |||
1311 | 758560 | 100 | if (! UTF8_IS_INVARIANT(*s)) { | |||
1312 | 121154 | 100 | if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { | |||
50 | ||||||
50 | ||||||
1313 | 766 | *len = ((STRLEN) -1); | ||||
1314 | 766 | return 0; | ||||
1315 | } | |||||
1316 | 120388 | s++; | ||||
1317 | } | |||||
1318 | 757794 | s++; | ||||
1319 | } | |||||
1320 | ||||||
1321 | d = s = save; | |||||
1322 | 924844 | 100 | while (s < send) { | |||
1323 | 755146 | U8 c = *s++; | ||||
1324 | 755146 | 100 | if (! UTF8_IS_INVARIANT(c)) { | |||
1325 | /* Then it is two-byte encoded */ | |||||
1326 | 120116 | c = TWO_BYTE_UTF8_TO_NATIVE(c, *s); | ||||
1327 | 120116 | s++; | ||||
1328 | } | |||||
1329 | 755146 | *d++ = c; | ||||
1330 | } | |||||
1331 | 169698 | *d = '\0'; | ||||
1332 | 169698 | *len = d - save; | ||||
1333 | 170081 | return save; | ||||
1334 | } | |||||
1335 | ||||||
1336 | /* | |||||
1337 | =for apidoc bytes_from_utf8 | |||||
1338 | ||||||
1339 | Converts a string C |
|||||
1340 | Unlike L but like L, returns a pointer to | |||||
1341 | the newly-created string, and updates C |
|||||
1342 | length. Returns the original string if no conversion occurs, C |
|||||
1343 | is unchanged. Do nothing if C |
|||||
1344 | 0 if C |
|||||
1345 | in utf8 (i.e., US-ASCII on non-EBCDIC machines). | |||||
1346 | ||||||
1347 | =cut | |||||
1348 | */ | |||||
1349 | ||||||
1350 | U8 * | |||||
1351 | 2950312 | Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) | ||||
1352 | { | |||||
1353 | U8 *d; | |||||
1354 | const U8 *start = s; | |||||
1355 | const U8 *send; | |||||
1356 | I32 count = 0; | |||||
1357 | ||||||
1358 | PERL_ARGS_ASSERT_BYTES_FROM_UTF8; | |||||
1359 | ||||||
1360 | PERL_UNUSED_CONTEXT; | |||||
1361 | 2950312 | 50 | if (!*is_utf8) | |||
1362 | return (U8 *)start; | |||||
1363 | ||||||
1364 | /* ensure valid UTF-8 and chars < 256 before converting string */ | |||||
1365 | 10013178 | 100 | for (send = s + *len; s < send;) { | |||
1366 | 7202750 | 100 | if (! UTF8_IS_INVARIANT(*s)) { | |||
1367 | 1623176 | 100 | if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { | |||
50 | ||||||
50 | ||||||
1368 | return (U8 *)start; | |||||
1369 | } | |||||
1370 | 8136 | count++; | ||||
1371 | 8136 | s++; | ||||
1372 | } | |||||
1373 | 5587710 | s++; | ||||
1374 | } | |||||
1375 | ||||||
1376 | 1335272 | *is_utf8 = FALSE; | ||||
1377 | ||||||
1378 | 1335272 | Newx(d, (*len) - count + 1, U8); | ||||
1379 | s = start; start = d; | |||||
1380 | 7188024 | 100 | while (s < send) { | |||
1381 | 5185116 | U8 c = *s++; | ||||
1382 | 5185116 | 100 | if (! UTF8_IS_INVARIANT(c)) { | |||
1383 | /* Then it is two-byte encoded */ | |||||
1384 | 8074 | c = TWO_BYTE_UTF8_TO_NATIVE(c, *s); | ||||
1385 | 8074 | s++; | ||||
1386 | } | |||||
1387 | 5185116 | *d++ = c; | ||||
1388 | } | |||||
1389 | 1335272 | *d = '\0'; | ||||
1390 | 1335272 | *len = d - start; | ||||
1391 | 2142792 | return (U8 *)start; | ||||
1392 | } | |||||
1393 | ||||||
1394 | /* | |||||
1395 | =for apidoc bytes_to_utf8 | |||||
1396 | ||||||
1397 | Converts a string C |
|||||
1398 | UTF-8. | |||||
1399 | Returns a pointer to the newly-created string, and sets C |
|||||
1400 | reflect the new length in bytes. | |||||
1401 | ||||||
1402 | A NUL character will be written after the end of the string. | |||||
1403 | ||||||
1404 | If you want to convert to UTF-8 from encodings other than | |||||
1405 | the native (Latin1 or EBCDIC), | |||||
1406 | see L(). | |||||
1407 | ||||||
1408 | =cut | |||||
1409 | */ | |||||
1410 | ||||||
1411 | /* This logic is duplicated in sv_catpvn_flags, so any bug fixes will | |||||
1412 | likewise need duplication. */ | |||||
1413 | ||||||
1414 | U8* | |||||
1415 | 213714 | Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len) | ||||
1416 | { | |||||
1417 | 213714 | const U8 * const send = s + (*len); | ||||
1418 | U8 *d; | |||||
1419 | U8 *dst; | |||||
1420 | ||||||
1421 | PERL_ARGS_ASSERT_BYTES_TO_UTF8; | |||||
1422 | PERL_UNUSED_CONTEXT; | |||||
1423 | ||||||
1424 | 213714 | Newx(d, (*len) * 2 + 1, U8); | ||||
1425 | dst = d; | |||||
1426 | ||||||
1427 | 12019571 | 100 | while (s < send) { | |||
1428 | 11699000 | append_utf8_from_native_byte(*s, &d); | ||||
1429 | 11699000 | s++; | ||||
1430 | } | |||||
1431 | 213714 | *d = '\0'; | ||||
1432 | 213714 | *len = d-dst; | ||||
1433 | 213714 | return dst; | ||||
1434 | } | |||||
1435 | ||||||
1436 | /* | |||||
1437 | * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. | |||||
1438 | * | |||||
1439 | * Destination must be pre-extended to 3/2 source. Do not use in-place. | |||||
1440 | * We optimize for native, for obvious reasons. */ | |||||
1441 | ||||||
1442 | U8* | |||||
1443 | 28666 | Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) | ||||
1444 | { | |||||
1445 | U8* pend; | |||||
1446 | U8* dstart = d; | |||||
1447 | ||||||
1448 | PERL_ARGS_ASSERT_UTF16_TO_UTF8; | |||||
1449 | ||||||
1450 | 28666 | 100 | if (bytelen & 1) | |||
1451 | 6 | Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen); | ||||
1452 | ||||||
1453 | 28660 | pend = p + bytelen; | ||||
1454 | ||||||
1455 | 731254 | 100 | while (p < pend) { | |||
1456 | 688286 | UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ | ||||
1457 | 688286 | p += 2; | ||||
1458 | 688286 | 100 | if (UNI_IS_INVARIANT(uv)) { | |||
1459 | 680068 | *d++ = LATIN1_TO_NATIVE((U8) uv); | ||||
1460 | 680068 | continue; | ||||
1461 | } | |||||
1462 | 8218 | 100 | if (uv <= MAX_UTF8_TWO_BYTE) { | |||
1463 | 120 | *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv)); | ||||
1464 | 120 | *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv)); | ||||
1465 | 120 | continue; | ||||
1466 | } | |||||
1467 | #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST | |||||
1468 | #define LAST_HIGH_SURROGATE 0xDBFF | |||||
1469 | #define FIRST_LOW_SURROGATE 0xDC00 | |||||
1470 | #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST | |||||
1471 | 8098 | 100 | if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) { | |||
1472 | 7962 | 100 | if (p >= pend) { | |||
1473 | 2 | Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); | ||||
1474 | } else { | |||||
1475 | 7960 | UV low = (p[0] << 8) + p[1]; | ||||
1476 | 7960 | p += 2; | ||||
1477 | 7960 | 100 | if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE) | |||
1478 | 8 | Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); | ||||
1479 | 11928 | uv = ((uv - FIRST_HIGH_SURROGATE) << 10) | ||||
1480 | 7952 | + (low - FIRST_LOW_SURROGATE) + 0x10000; | ||||
1481 | } | |||||
1482 | 136 | 100 | } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) { | |||
1483 | 12 | Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); | ||||
1484 | } | |||||
1485 | #ifdef EBCDIC | |||||
1486 | d = uvoffuni_to_utf8_flags(d, uv, 0); | |||||
1487 | #else | |||||
1488 | 8076 | 100 | if (uv < 0x10000) { | |||
1489 | 124 | *d++ = (U8)(( uv >> 12) | 0xe0); | ||||
1490 | 124 | *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); | ||||
1491 | 124 | *d++ = (U8)(( uv & 0x3f) | 0x80); | ||||
1492 | 124 | continue; | ||||
1493 | } | |||||
1494 | else { | |||||
1495 | 7952 | *d++ = (U8)(( uv >> 18) | 0xf0); | ||||
1496 | 7952 | *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); | ||||
1497 | 7952 | *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); | ||||
1498 | 7952 | *d++ = (U8)(( uv & 0x3f) | 0x80); | ||||
1499 | 348108 | continue; | ||||
1500 | } | |||||
1501 | #endif | |||||
1502 | } | |||||
1503 | 28638 | *newlen = d - dstart; | ||||
1504 | 28638 | return d; | ||||
1505 | } | |||||
1506 | ||||||
1507 | /* Note: this one is slightly destructive of the source. */ | |||||
1508 | ||||||
1509 | U8* | |||||
1510 | 14356 | Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) | ||||
1511 | { | |||||
1512 | U8* s = (U8*)p; | |||||
1513 | 14356 | U8* const send = s + bytelen; | ||||
1514 | ||||||
1515 | PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; | |||||
1516 | ||||||
1517 | 14356 | 100 | if (bytelen & 1) | |||
1518 | 2 | Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf, | ||||
1519 | (UV)bytelen); | |||||
1520 | ||||||
1521 | 362482 | 100 | while (s < send) { | |||
1522 | 348128 | const U8 tmp = s[0]; | ||||
1523 | 348128 | s[0] = s[1]; | ||||
1524 | 348128 | s[1] = tmp; | ||||
1525 | 348128 | s += 2; | ||||
1526 | } | |||||
1527 | 14354 | return utf16_to_utf8(p, d, bytelen, newlen); | ||||
1528 | } | |||||
1529 | ||||||
1530 | bool | |||||
1531 | 836 | Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) | ||||
1532 | { | |||||
1533 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1534 | 836 | uvchr_to_utf8(tmpbuf, c); | ||||
1535 | 836 | return _is_utf8_FOO(classnum, tmpbuf); | ||||
1536 | } | |||||
1537 | ||||||
1538 | /* for now these are all defined (inefficiently) in terms of the utf8 versions. | |||||
1539 | * Note that the macros in handy.h that call these short-circuit calling them | |||||
1540 | * for Latin-1 range inputs */ | |||||
1541 | ||||||
1542 | bool | |||||
1543 | 0 | Perl_is_uni_alnum(pTHX_ UV c) | ||||
1544 | { | |||||
1545 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1546 | 0 | uvchr_to_utf8(tmpbuf, c); | ||||
1547 | 0 | return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf); | ||||
1548 | } | |||||
1549 | ||||||
1550 | bool | |||||
1551 | 0 | Perl_is_uni_alnumc(pTHX_ UV c) | ||||
1552 | { | |||||
1553 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1554 | 0 | uvchr_to_utf8(tmpbuf, c); | ||||
1555 | 0 | return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf); | ||||
1556 | } | |||||
1557 | ||||||
1558 | /* Internal function so we can deprecate the external one, and call | |||||
1559 | this one from other deprecated functions in this file */ | |||||
1560 | ||||||
1561 | PERL_STATIC_INLINE bool | |||||
1562 | S_is_utf8_idfirst(pTHX_ const U8 *p) | |||||
1563 | { | |||||
1564 | dVAR; | |||||
1565 | ||||||
1566 | 0 | if (*p == '_') | ||||
1567 | return TRUE; | |||||
1568 | /* is_utf8_idstart would be more logical. */ | |||||
1569 | 0 | return is_utf8_common(p, &PL_utf8_idstart, "IdStart"); | ||||
1570 | } | |||||
1571 | ||||||
1572 | bool | |||||
1573 | 0 | Perl_is_uni_idfirst(pTHX_ UV c) | ||||
1574 | { | |||||
1575 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1576 | 0 | uvchr_to_utf8(tmpbuf, c); | ||||
1577 | 0 | return S_is_utf8_idfirst(aTHX_ tmpbuf); | ||||
1578 | } | |||||
1579 | ||||||
1580 | bool | |||||
1581 | 68 | Perl__is_uni_perl_idcont(pTHX_ UV c) | ||||
1582 | { | |||||
1583 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1584 | 68 | uvchr_to_utf8(tmpbuf, c); | ||||
1585 | 68 | return _is_utf8_perl_idcont(tmpbuf); | ||||
1586 | } | |||||
1587 | ||||||
1588 | bool | |||||
1589 | 60 | Perl__is_uni_perl_idstart(pTHX_ UV c) | ||||
1590 | { | |||||
1591 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1592 | 60 | uvchr_to_utf8(tmpbuf, c); | ||||
1593 | 60 | return _is_utf8_perl_idstart(tmpbuf); | ||||
1594 | } | |||||
1595 | ||||||
1596 | bool | |||||
1597 | 0 | Perl_is_uni_alpha(pTHX_ UV c) | ||||
1598 | { | |||||
1599 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1600 | 0 | uvchr_to_utf8(tmpbuf, c); | ||||
1601 | 0 | return _is_utf8_FOO(_CC_ALPHA, tmpbuf); | ||||
1602 | } | |||||
1603 | ||||||
1604 | bool | |||||
1605 | 0 | Perl_is_uni_ascii(pTHX_ UV c) | ||||
1606 | { | |||||
1607 | 0 | return isASCII(c); | ||||
1608 | } | |||||
1609 | ||||||
1610 | bool | |||||
1611 | 0 | Perl_is_uni_blank(pTHX_ UV c) | ||||
1612 | { | |||||
1613 | 0 | 0 | return isBLANK_uni(c); | |||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
1614 | } | |||||
1615 | ||||||
1616 | bool | |||||
1617 | 0 | Perl_is_uni_space(pTHX_ UV c) | ||||
1618 | { | |||||
1619 | 0 | 0 | return isSPACE_uni(c); | |||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
1620 | } | |||||
1621 | ||||||
1622 | bool | |||||
1623 | 0 | Perl_is_uni_digit(pTHX_ UV c) | ||||
1624 | { | |||||
1625 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1626 | 0 | uvchr_to_utf8(tmpbuf, c); | ||||
1627 | 0 | return _is_utf8_FOO(_CC_DIGIT, tmpbuf); | ||||
1628 | } | |||||
1629 | ||||||
1630 | bool | |||||
1631 | 0 | Perl_is_uni_upper(pTHX_ UV c) | ||||
1632 | { | |||||
1633 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1634 | 0 | uvchr_to_utf8(tmpbuf, c); | ||||
1635 | 0 | return _is_utf8_FOO(_CC_UPPER, tmpbuf); | ||||
1636 | } | |||||
1637 | ||||||
1638 | bool | |||||
1639 | 0 | Perl_is_uni_lower(pTHX_ UV c) | ||||
1640 | { | |||||
1641 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1642 | 0 | uvchr_to_utf8(tmpbuf, c); | ||||
1643 | 0 | return _is_utf8_FOO(_CC_LOWER, tmpbuf); | ||||
1644 | } | |||||
1645 | ||||||
1646 | bool | |||||
1647 | 0 | Perl_is_uni_cntrl(pTHX_ UV c) | ||||
1648 | { | |||||
1649 | 0 | 0 | return isCNTRL_L1(c); | |||
0 | ||||||
1650 | } | |||||
1651 | ||||||
1652 | bool | |||||
1653 | 0 | Perl_is_uni_graph(pTHX_ UV c) | ||||
1654 | { | |||||
1655 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1656 | 0 | uvchr_to_utf8(tmpbuf, c); | ||||
1657 | 0 | return _is_utf8_FOO(_CC_GRAPH, tmpbuf); | ||||
1658 | } | |||||
1659 | ||||||
1660 | bool | |||||
1661 | 0 | Perl_is_uni_print(pTHX_ UV c) | ||||
1662 | { | |||||
1663 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1664 | 0 | uvchr_to_utf8(tmpbuf, c); | ||||
1665 | 0 | return _is_utf8_FOO(_CC_PRINT, tmpbuf); | ||||
1666 | } | |||||
1667 | ||||||
1668 | bool | |||||
1669 | 0 | Perl_is_uni_punct(pTHX_ UV c) | ||||
1670 | { | |||||
1671 | U8 tmpbuf[UTF8_MAXBYTES+1]; | |||||
1672 | 0 | uvchr_to_utf8(tmpbuf, c); | ||||
1673 | 0 | return _is_utf8_FOO(_CC_PUNCT, tmpbuf); | ||||
1674 | } | |||||
1675 | ||||||
1676 | bool | |||||
1677 | 0 | Perl_is_uni_xdigit(pTHX_ UV c) | ||||
1678 | { | |||||
1679 | 0 | 0 | return isXDIGIT_uni(c); | |||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
1680 | } | |||||
1681 | ||||||
1682 | UV | |||||
1683 | 62676 | Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s) | ||||
1684 | { | |||||
1685 | /* We have the latin1-range values compiled into the core, so just use | |||||
1686 | * those, converting the result to utf8. The only difference between upper | |||||
1687 | * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is | |||||
1688 | * either "SS" or "Ss". Which one to use is passed into the routine in | |||||
1689 | * 'S_or_s' to avoid a test */ | |||||
1690 | ||||||
1691 | 62676 | UV converted = toUPPER_LATIN1_MOD(c); | ||||
1692 | ||||||
1693 | PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1; | |||||
1694 | ||||||
1695 | assert(S_or_s == 'S' || S_or_s == 's'); | |||||
1696 | ||||||
1697 | 62676 | 100 | if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for | |||
1698 | characters in this range */ | |||||
1699 | 32292 | *p = (U8) converted; | ||||
1700 | 32292 | *lenp = 1; | ||||
1701 | 32292 | return converted; | ||||
1702 | } | |||||
1703 | ||||||
1704 | /* toUPPER_LATIN1_MOD gives the correct results except for three outliers, | |||||
1705 | * which it maps to one of them, so as to only have to have one check for | |||||
1706 | * it in the main case */ | |||||
1707 | 30384 | 100 | if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { | |||
1708 | 204 | switch (c) { | ||||
1709 | case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: | |||||
1710 | converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; | |||||
1711 | break; | |||||
1712 | case MICRO_SIGN: | |||||
1713 | converted = GREEK_CAPITAL_LETTER_MU; | |||||
1714 | 64 | break; | ||||
1715 | case LATIN_SMALL_LETTER_SHARP_S: | |||||
1716 | 72 | *(p)++ = 'S'; | ||||
1717 | 72 | *p = S_or_s; | ||||
1718 | 72 | *lenp = 2; | ||||
1719 | 72 | return 'S'; | ||||
1720 | default: | |||||
1721 | 0 | Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS); | ||||
1722 | assert(0); /* NOTREACHED */ | |||||
1723 | } | |||||
1724 | } | |||||
1725 | ||||||
1726 | 30312 | *(p)++ = UTF8_TWO_BYTE_HI(converted); | ||||
1727 | 30312 | *p = UTF8_TWO_BYTE_LO(converted); | ||||
1728 | 30312 | *lenp = 2; | ||||
1729 | ||||||
1730 | 46494 | return converted; | ||||
1731 | } | |||||
1732 | ||||||
1733 | /* Call the function to convert a UTF-8 encoded character to the specified case. | |||||
1734 | * Note that there may be more than one character in the result. | |||||
1735 | * INP is a pointer to the first byte of the input character | |||||
1736 | * OUTP will be set to the first byte of the string of changed characters. It | |||||
1737 | * needs to have space for UTF8_MAXBYTES_CASE+1 bytes | |||||
1738 | * LENP will be set to the length in bytes of the string of changed characters | |||||
1739 | * | |||||
1740 | * The functions return the ordinal of the first character in the string of OUTP */ | |||||
1741 | #define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc") | |||||
1742 | #define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc") | |||||
1743 | #define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc") | |||||
1744 | ||||||
1745 | /* This additionally has the input parameter SPECIALS, which if non-zero will | |||||
1746 | * cause this to use the SPECIALS hash for folding (meaning get full case | |||||
1747 | * folding); otherwise, when zero, this implies a simple case fold */ | |||||
1748 | #define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL) | |||||
1749 | ||||||
1750 | UV | |||||
1751 | 546 | Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) | ||||
1752 | { | |||||
1753 | dVAR; | |||||
1754 | ||||||
1755 | /* Convert the Unicode character whose ordinal is |
|||||
1756 | * version and store that in UTF-8 in and its length in bytes in |
|||||
1757 | * Note that the needs to be at least UTF8_MAXBYTES_CASE+1 bytes since |
|||||
1758 | * the changed version may be longer than the original character. | |||||
1759 | * | |||||
1760 | * The ordinal of the first character of the changed version is returned | |||||
1761 | * (but note, as explained above, that there may be more.) */ | |||||
1762 | ||||||
1763 | PERL_ARGS_ASSERT_TO_UNI_UPPER; | |||||
1764 | ||||||
1765 | 546 | 100 | if (c < 256) { | |||
1766 | 512 | return _to_upper_title_latin1((U8) c, p, lenp, 'S'); | ||||
1767 | } | |||||
1768 | ||||||
1769 | 34 | uvchr_to_utf8(p, c); | ||||
1770 | 290 | return CALL_UPPER_CASE(p, p, lenp); | ||||
1771 | } | |||||
1772 | ||||||
1773 | UV | |||||
1774 | 546 | Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) | ||||
1775 | { | |||||
1776 | dVAR; | |||||
1777 | ||||||
1778 | PERL_ARGS_ASSERT_TO_UNI_TITLE; | |||||
1779 | ||||||
1780 | 546 | 100 | if (c < 256) { | |||
1781 | 512 | return _to_upper_title_latin1((U8) c, p, lenp, 's'); | ||||
1782 | } | |||||
1783 | ||||||
1784 | 34 | uvchr_to_utf8(p, c); | ||||
1785 | 290 | return CALL_TITLE_CASE(p, p, lenp); | ||||
1786 | } | |||||
1787 | ||||||
1788 | STATIC U8 | |||||
1789 | S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp) | |||||
1790 | { | |||||
1791 | /* We have the latin1-range values compiled into the core, so just use | |||||
1792 | * those, converting the result to utf8. Since the result is always just | |||||
1793 | * one character, we allow to be NULL */ |
|||||
1794 | ||||||
1795 | 60176 | U8 converted = toLOWER_LATIN1(c); | ||||
1796 | ||||||
1797 | 60176 | 50 | if (p != NULL) { | |||
50 | ||||||
50 | ||||||
1798 | 60176 | 50 | if (NATIVE_IS_INVARIANT(converted)) { | |||
50 | ||||||
100 | ||||||
1799 | 31466 | *p = converted; | ||||
1800 | 31466 | *lenp = 1; | ||||
1801 | } | |||||
1802 | else { | |||||
1803 | 28710 | *p = UTF8_TWO_BYTE_HI(converted); | ||||
1804 | 28710 | *(p+1) = UTF8_TWO_BYTE_LO(converted); | ||||
1805 | 28710 | *lenp = 2; | ||||
1806 | } | |||||
1807 | } | |||||
1808 | return converted; | |||||
1809 | } | |||||
1810 | ||||||
1811 | UV | |||||
1812 | 530 | Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) | ||||
1813 | { | |||||
1814 | dVAR; | |||||
1815 | ||||||
1816 | PERL_ARGS_ASSERT_TO_UNI_LOWER; | |||||
1817 | ||||||
1818 | 530 | 100 | if (c < 256) { | |||
1819 | 768 | return to_lower_latin1((U8) c, p, lenp); | ||||
1820 | } | |||||
1821 | ||||||
1822 | 18 | uvchr_to_utf8(p, c); | ||||
1823 | 274 | return CALL_LOWER_CASE(p, p, lenp); | ||||
1824 | } | |||||
1825 | ||||||
1826 | UV | |||||
1827 | 3027170 | Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) | ||||
1828 | { | |||||
1829 | /* Corresponds to to_lower_latin1(); |
|||||
1830 | * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited | |||||
1831 | * FOLD_FLAGS_FULL iff full folding is to be used; | |||||
1832 | * | |||||
1833 | * Not to be used for locale folds | |||||
1834 | */ | |||||
1835 | ||||||
1836 | UV converted; | |||||
1837 | ||||||
1838 | PERL_ARGS_ASSERT__TO_FOLD_LATIN1; | |||||
1839 | ||||||
1840 | assert (! (flags & FOLD_FLAGS_LOCALE)); | |||||
1841 | ||||||
1842 | 3027170 | 100 | if (c == MICRO_SIGN) { | |||
1843 | converted = GREEK_SMALL_LETTER_MU; | |||||
1844 | } | |||||
1845 | 2986918 | 100 | else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) { | |||
1846 | ||||||
1847 | /* If can't cross 127/128 boundary, can't return "ss"; instead return | |||||
1848 | * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}") | |||||
1849 | * under those circumstances. */ | |||||
1850 | 446202 | 100 | if (flags & FOLD_FLAGS_NOMIX_ASCII) { | |||
1851 | 70840 | *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; | ||||
1852 | 70840 | Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, | ||||
1853 | p, *lenp, U8); | |||||
1854 | 70840 | return LATIN_SMALL_LETTER_LONG_S; | ||||
1855 | } | |||||
1856 | else { | |||||
1857 | 375362 | *(p)++ = 's'; | ||||
1858 | 375362 | *p = 's'; | ||||
1859 | 375362 | *lenp = 2; | ||||
1860 | 375362 | return 's'; | ||||
1861 | } | |||||
1862 | } | |||||
1863 | else { /* In this range the fold of all other characters is their lower | |||||
1864 | case */ | |||||
1865 | 2540716 | converted = toLOWER_LATIN1(c); | ||||
1866 | } | |||||
1867 | ||||||
1868 | 2580968 | 100 | if (NATIVE_IS_INVARIANT(converted)) { | |||
1869 | 2366680 | *p = (U8) converted; | ||||
1870 | 2366680 | *lenp = 1; | ||||
1871 | } | |||||
1872 | else { | |||||
1873 | 214288 | *(p)++ = UTF8_TWO_BYTE_HI(converted); | ||||
1874 | 214288 | *p = UTF8_TWO_BYTE_LO(converted); | ||||
1875 | 1620729 | *lenp = 2; | ||||
1876 | } | |||||
1877 | ||||||
1878 | return converted; | |||||
1879 | } | |||||
1880 | ||||||
1881 | UV | |||||
1882 | 4383144 | Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags) | ||||
1883 | { | |||||
1884 | ||||||
1885 | /* Not currently externally documented, and subject to change | |||||
1886 | * |
|||||
1887 | * FOLD_FLAGS_FULL iff full folding is to be used; | |||||
1888 | * FOLD_FLAGS_LOCALE iff in locale | |||||
1889 | * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited | |||||
1890 | */ | |||||
1891 | ||||||
1892 | PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; | |||||
1893 | ||||||
1894 | 4383144 | 100 | if (c < 256) { | |||
1895 | 2765492 | UV result = _to_fold_latin1((U8) c, p, lenp, | ||||
1896 | flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); | |||||
1897 | /* It is illegal for the fold to cross the 255/256 boundary under | |||||
1898 | * locale; in this case return the original */ | |||||
1899 | 1434954 | 50 | return (result > 256 && flags & FOLD_FLAGS_LOCALE) | |||
1900 | ? c | |||||
1901 | 2765492 | 100 | : result; | |||
1902 | } | |||||
1903 | ||||||
1904 | /* If no special needs, just use the macro */ | |||||
1905 | 1617652 | 100 | if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) { | |||
1906 | 981148 | uvchr_to_utf8(p, c); | ||||
1907 | 981148 | 100 | return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL); | |||
1908 | } | |||||
1909 | else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with | |||||
1910 | the special flags. */ | |||||
1911 | U8 utf8_c[UTF8_MAXBYTES + 1]; | |||||
1912 | 636504 | uvchr_to_utf8(utf8_c, c); | ||||
1913 | 2509824 | return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL); | ||||
1914 | } | |||||
1915 | } | |||||
1916 | ||||||
1917 | bool | |||||
1918 | 0 | Perl_is_uni_alnum_lc(pTHX_ UV c) | ||||
1919 | { | |||||
1920 | 0 | 0 | if (c < 256) { | |||
1921 | 0 | 0 | return isALNUM_LC(c); | |||
0 | ||||||
0 | ||||||
1922 | } | |||||
1923 | 0 | return _is_uni_FOO(_CC_WORDCHAR, c); | ||||
1924 | } | |||||
1925 | ||||||
1926 | bool | |||||
1927 | 0 | Perl_is_uni_alnumc_lc(pTHX_ UV c) | ||||
1928 | { | |||||
1929 | 0 | 0 | if (c < 256) { | |||
1930 | 0 | 0 | return isALPHANUMERIC_LC(c); | |||
0 | ||||||
1931 | } | |||||
1932 | 0 | return _is_uni_FOO(_CC_ALPHANUMERIC, c); | ||||
1933 | } | |||||
1934 | ||||||
1935 | bool | |||||
1936 | 0 | Perl_is_uni_idfirst_lc(pTHX_ UV c) | ||||
1937 | { | |||||
1938 | 0 | 0 | if (c < 256) { | |||
1939 | 0 | 0 | return isIDFIRST_LC(c); | |||
0 | ||||||
0 | ||||||
1940 | } | |||||
1941 | 0 | return _is_uni_perl_idstart(c); | ||||
1942 | } | |||||
1943 | ||||||
1944 | bool | |||||
1945 | 0 | Perl_is_uni_alpha_lc(pTHX_ UV c) | ||||
1946 | { | |||||
1947 | 0 | 0 | if (c < 256) { | |||
1948 | 0 | 0 | return isALPHA_LC(c); | |||
0 | ||||||
1949 | } | |||||
1950 | 0 | return _is_uni_FOO(_CC_ALPHA, c); | ||||
1951 | } | |||||
1952 | ||||||
1953 | bool | |||||
1954 | 0 | Perl_is_uni_ascii_lc(pTHX_ UV c) | ||||
1955 | { | |||||
1956 | 0 | 0 | if (c < 256) { | |||
1957 | 0 | 0 | return isASCII_LC(c); | |||
0 | ||||||
1958 | } | |||||
1959 | return 0; | |||||
1960 | } | |||||
1961 | ||||||
1962 | bool | |||||
1963 | 0 | Perl_is_uni_blank_lc(pTHX_ UV c) | ||||
1964 | { | |||||
1965 | 0 | 0 | if (c < 256) { | |||
1966 | 0 | 0 | return isBLANK_LC(c); | |||
0 | ||||||
1967 | } | |||||
1968 | 0 | 0 | return isBLANK_uni(c); | |||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
1969 | } | |||||
1970 | ||||||
1971 | bool | |||||
1972 | 0 | Perl_is_uni_space_lc(pTHX_ UV c) | ||||
1973 | { | |||||
1974 | 0 | 0 | if (c < 256) { | |||
1975 | 0 | 0 | return isSPACE_LC(c); | |||
0 | ||||||
1976 | } | |||||
1977 | 0 | 0 | return isSPACE_uni(c); | |||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
1978 | } | |||||
1979 | ||||||
1980 | bool | |||||
1981 | 0 | Perl_is_uni_digit_lc(pTHX_ UV c) | ||||
1982 | { | |||||
1983 | 0 | 0 | if (c < 256) { | |||
1984 | 0 | 0 | return isDIGIT_LC(c); | |||
0 | ||||||
1985 | } | |||||
1986 | 0 | return _is_uni_FOO(_CC_DIGIT, c); | ||||
1987 | } | |||||
1988 | ||||||
1989 | bool | |||||
1990 | 0 | Perl_is_uni_upper_lc(pTHX_ UV c) | ||||
1991 | { | |||||
1992 | 0 | 0 | if (c < 256) { | |||
1993 | 0 | 0 | return isUPPER_LC(c); | |||
0 | ||||||
1994 | } | |||||
1995 | 0 | return _is_uni_FOO(_CC_UPPER, c); | ||||
1996 | } | |||||
1997 | ||||||
1998 | bool | |||||
1999 | 0 | Perl_is_uni_lower_lc(pTHX_ UV c) | ||||
2000 | { | |||||
2001 | 0 | 0 | if (c < 256) { | |||
2002 | 0 | 0 | return isLOWER_LC(c); | |||
0 | ||||||
2003 | } | |||||
2004 | 0 | return _is_uni_FOO(_CC_LOWER, c); | ||||
2005 | } | |||||
2006 | ||||||
2007 | bool | |||||
2008 | 0 | Perl_is_uni_cntrl_lc(pTHX_ UV c) | ||||
2009 | { | |||||
2010 | 0 | 0 | if (c < 256) { | |||
2011 | 0 | 0 | return isCNTRL_LC(c); | |||
0 | ||||||
2012 | } | |||||
2013 | return 0; | |||||
2014 | } | |||||
2015 | ||||||
2016 | bool | |||||
2017 | 0 | Perl_is_uni_graph_lc(pTHX_ UV c) | ||||
2018 | { | |||||
2019 | 0 | 0 | if (c < 256) { | |||
2020 | 0 | 0 | return isGRAPH_LC(c); | |||
0 | ||||||
2021 | } | |||||
2022 | 0 | return _is_uni_FOO(_CC_GRAPH, c); | ||||
2023 | } | |||||
2024 | ||||||
2025 | bool | |||||
2026 | 0 | Perl_is_uni_print_lc(pTHX_ UV c) | ||||
2027 | { | |||||
2028 | 0 | 0 | if (c < 256) { | |||
2029 | 0 | 0 | return isPRINT_LC(c); | |||
0 | ||||||
2030 | } | |||||
2031 | 0 | return _is_uni_FOO(_CC_PRINT, c); | ||||
2032 | } | |||||
2033 | ||||||
2034 | bool | |||||
2035 | 0 | Perl_is_uni_punct_lc(pTHX_ UV c) | ||||
2036 | { | |||||
2037 | 0 | 0 | if (c < 256) { | |||
2038 | 0 | 0 | return isPUNCT_LC(c); | |||
0 | ||||||
2039 | } | |||||
2040 | 0 | return _is_uni_FOO(_CC_PUNCT, c); | ||||
2041 | } | |||||
2042 | ||||||
2043 | bool | |||||
2044 | 0 | Perl_is_uni_xdigit_lc(pTHX_ UV c) | ||||
2045 | { | |||||
2046 | 0 | 0 | if (c < 256) { | |||
2047 | 0 | 0 | return isXDIGIT_LC(c); | |||
0 | ||||||
2048 | } | |||||
2049 | 0 | 0 | return isXDIGIT_uni(c); | |||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
2050 | } | |||||
2051 | ||||||
2052 | U32 | |||||
2053 | 0 | Perl_to_uni_upper_lc(pTHX_ U32 c) | ||||
2054 | { | |||||
2055 | /* XXX returns only the first character -- do not use XXX */ | |||||
2056 | /* XXX no locale support yet */ | |||||
2057 | STRLEN len; | |||||
2058 | U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; | |||||
2059 | 0 | return (U32)to_uni_upper(c, tmpbuf, &len); | ||||
2060 | } | |||||
2061 | ||||||
2062 | U32 | |||||
2063 | 0 | Perl_to_uni_title_lc(pTHX_ U32 c) | ||||
2064 | { | |||||
2065 | /* XXX returns only the first character XXX -- do not use XXX */ | |||||
2066 | /* XXX no locale support yet */ | |||||
2067 | STRLEN len; | |||||
2068 | U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; | |||||
2069 | 0 | return (U32)to_uni_title(c, tmpbuf, &len); | ||||
2070 | } | |||||
2071 | ||||||
2072 | U32 | |||||
2073 | 0 | Perl_to_uni_lower_lc(pTHX_ U32 c) | ||||
2074 | { | |||||
2075 | /* XXX returns only the first character -- do not use XXX */ | |||||
2076 | /* XXX no locale support yet */ | |||||
2077 | STRLEN len; | |||||
2078 | U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; | |||||
2079 | 0 | return (U32)to_uni_lower(c, tmpbuf, &len); | ||||
2080 | } | |||||
2081 | ||||||
2082 | PERL_STATIC_INLINE bool | |||||
2083 | 410108 | S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, | ||||
2084 | const char *const swashname) | |||||
2085 | { | |||||
2086 | /* returns a boolean giving whether or not the UTF8-encoded character that | |||||
2087 | * starts at is in the swash indicated by |
|||||
2088 | * contains a pointer to where the swash indicated by |
|||||
2089 | * is to be stored; which this routine will do, so that future calls will | |||||
2090 | * look at <*swash> and only generate a swash if it is not null | |||||
2091 | * | |||||
2092 | * Note that it is assumed that the buffer length of is enough to |
|||||
2093 | * contain all the bytes that comprise the character. Thus, <*p> should | |||||
2094 | * have been checked before this call for mal-formedness enough to assure | |||||
2095 | * that. */ | |||||
2096 | ||||||
2097 | dVAR; | |||||
2098 | ||||||
2099 | PERL_ARGS_ASSERT_IS_UTF8_COMMON; | |||||
2100 | ||||||
2101 | /* The API should have included a length for the UTF-8 character in , |
|||||
2102 | * but it doesn't. We therefore assume that p has been validated at least | |||||
2103 | * as far as there being enough bytes available in it to accommodate the | |||||
2104 | * character without reading beyond the end, and pass that number on to the | |||||
2105 | * validating routine */ | |||||
2106 | 410108 | 100 | if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) { | |||
2107 | 4 | 50 | if (ckWARN_d(WARN_UTF8)) { | |||
2108 | 4 | Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8), | ||||
2109 | "Passing malformed UTF-8 to \"%s\" is deprecated", swashname); | |||||
2110 | 4 | 50 | if (ckWARN(WARN_UTF8)) { /* This will output details as to the | |||
2111 | what the malformation is */ | |||||
2112 | 4 | 50 | utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL); | |||
2113 | } | |||||
2114 | } | |||||
2115 | return FALSE; | |||||
2116 | } | |||||
2117 | 410104 | 100 | if (!*swash) { | |||
2118 | 638 | U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; | ||||
2119 | 638 | *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags); | ||||
2120 | } | |||||
2121 | ||||||
2122 | 410106 | return swash_fetch(*swash, p, TRUE) != 0; | ||||
2123 | } | |||||
2124 | ||||||
2125 | bool | |||||
2126 | 1530 | Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p) | ||||
2127 | { | |||||
2128 | dVAR; | |||||
2129 | ||||||
2130 | PERL_ARGS_ASSERT__IS_UTF8_FOO; | |||||
2131 | ||||||
2132 | assert(classnum < _FIRST_NON_SWASH_CC); | |||||
2133 | ||||||
2134 | 1530 | return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]); | ||||
2135 | } | |||||
2136 | ||||||
2137 | bool | |||||
2138 | 0 | Perl_is_utf8_alnum(pTHX_ const U8 *p) | ||||
2139 | { | |||||
2140 | dVAR; | |||||
2141 | ||||||
2142 | PERL_ARGS_ASSERT_IS_UTF8_ALNUM; | |||||
2143 | ||||||
2144 | /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true | |||||
2145 | * descendant of isalnum(3), in other words, it doesn't | |||||
2146 | * contain the '_'. --jhi */ | |||||
2147 | 0 | return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_WORDCHAR], "IsWord"); | ||||
2148 | } | |||||
2149 | ||||||
2150 | bool | |||||
2151 | 0 | Perl_is_utf8_alnumc(pTHX_ const U8 *p) | ||||
2152 | { | |||||
2153 | dVAR; | |||||
2154 | ||||||
2155 | PERL_ARGS_ASSERT_IS_UTF8_ALNUMC; | |||||
2156 | ||||||
2157 | 0 | return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHANUMERIC], "IsAlnum"); | ||||
2158 | } | |||||
2159 | ||||||
2160 | bool | |||||
2161 | 0 | Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */ | ||||
2162 | { | |||||
2163 | dVAR; | |||||
2164 | ||||||
2165 | PERL_ARGS_ASSERT_IS_UTF8_IDFIRST; | |||||
2166 | ||||||
2167 | 0 | return S_is_utf8_idfirst(aTHX_ p); | ||||
2168 | } | |||||
2169 | ||||||
2170 | bool | |||||
2171 | 0 | Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */ | ||||
2172 | { | |||||
2173 | dVAR; | |||||
2174 | ||||||
2175 | PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST; | |||||
2176 | ||||||
2177 | 0 | 0 | if (*p == '_') | |||
2178 | return TRUE; | |||||
2179 | /* is_utf8_idstart would be more logical. */ | |||||
2180 | 0 | return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart"); | ||||
2181 | } | |||||
2182 | ||||||
2183 | bool | |||||
2184 | 401724 | Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) | ||||
2185 | { | |||||
2186 | dVAR; | |||||
2187 | ||||||
2188 | PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART; | |||||
2189 | ||||||
2190 | 401724 | return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart"); | ||||
2191 | } | |||||
2192 | ||||||
2193 | bool | |||||
2194 | 6840 | Perl__is_utf8_perl_idcont(pTHX_ const U8 *p) | ||||
2195 | { | |||||
2196 | dVAR; | |||||
2197 | ||||||
2198 | PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT; | |||||
2199 | ||||||
2200 | 6840 | return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont"); | ||||
2201 | } | |||||
2202 | ||||||
2203 | ||||||
2204 | bool | |||||
2205 | 0 | Perl_is_utf8_idcont(pTHX_ const U8 *p) | ||||
2206 | { | |||||
2207 | dVAR; | |||||
2208 | ||||||
2209 | PERL_ARGS_ASSERT_IS_UTF8_IDCONT; | |||||
2210 | ||||||
2211 | 0 | return is_utf8_common(p, &PL_utf8_idcont, "IdContinue"); | ||||
2212 | } | |||||
2213 | ||||||
2214 | bool | |||||
2215 | 0 | Perl_is_utf8_xidcont(pTHX_ const U8 *p) | ||||
2216 | { | |||||
2217 | dVAR; | |||||
2218 | ||||||
2219 | PERL_ARGS_ASSERT_IS_UTF8_XIDCONT; | |||||
2220 | ||||||
2221 | 0 | return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue"); | ||||
2222 | } | |||||
2223 | ||||||
2224 | bool | |||||
2225 | 0 | Perl_is_utf8_alpha(pTHX_ const U8 *p) | ||||
2226 | { | |||||
2227 | dVAR; | |||||
2228 | ||||||
2229 | PERL_ARGS_ASSERT_IS_UTF8_ALPHA; | |||||
2230 | ||||||
2231 | 0 | return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHA], "IsAlpha"); | ||||
2232 | } | |||||
2233 | ||||||
2234 | bool | |||||
2235 | 0 | Perl_is_utf8_ascii(pTHX_ const U8 *p) | ||||
2236 | { | |||||
2237 | dVAR; | |||||
2238 | ||||||
2239 | PERL_ARGS_ASSERT_IS_UTF8_ASCII; | |||||
2240 | ||||||
2241 | /* ASCII characters are the same whether in utf8 or not. So the macro | |||||
2242 | * works on both utf8 and non-utf8 representations. */ | |||||
2243 | 0 | return isASCII(*p); | ||||
2244 | } | |||||
2245 | ||||||
2246 | bool | |||||
2247 | 0 | Perl_is_utf8_blank(pTHX_ const U8 *p) | ||||
2248 | { | |||||
2249 | dVAR; | |||||
2250 | ||||||
2251 | PERL_ARGS_ASSERT_IS_UTF8_BLANK; | |||||
2252 | ||||||
2253 | 0 | 0 | return isBLANK_utf8(p); | |||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
2254 | } | |||||
2255 | ||||||
2256 | bool | |||||
2257 | 0 | Perl_is_utf8_space(pTHX_ const U8 *p) | ||||
2258 | { | |||||
2259 | dVAR; | |||||
2260 | ||||||
2261 | PERL_ARGS_ASSERT_IS_UTF8_SPACE; | |||||
2262 | ||||||
2263 | 0 | 0 | return isSPACE_utf8(p); | |||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
2264 | } | |||||
2265 | ||||||
2266 | bool | |||||
2267 | 0 | Perl_is_utf8_perl_space(pTHX_ const U8 *p) | ||||
2268 | { | |||||
2269 | dVAR; | |||||
2270 | ||||||
2271 | PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE; | |||||
2272 | ||||||
2273 | /* Only true if is an ASCII space-like character, and ASCII is invariant | |||||
2274 | * under utf8, so can just use the macro */ | |||||
2275 | 0 | return isSPACE_A(*p); | ||||
2276 | } | |||||
2277 | ||||||
2278 | bool | |||||
2279 | 0 | Perl_is_utf8_perl_word(pTHX_ const U8 *p) | ||||
2280 | { | |||||
2281 | dVAR; | |||||
2282 | ||||||
2283 | PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD; | |||||
2284 | ||||||
2285 | /* Only true if is an ASCII word character, and ASCII is invariant | |||||
2286 | * under utf8, so can just use the macro */ | |||||
2287 | 0 | return isWORDCHAR_A(*p); | ||||
2288 | } | |||||
2289 | ||||||
2290 | bool | |||||
2291 | 0 | Perl_is_utf8_digit(pTHX_ const U8 *p) | ||||
2292 | { | |||||
2293 | dVAR; | |||||
2294 | ||||||
2295 | PERL_ARGS_ASSERT_IS_UTF8_DIGIT; | |||||
2296 | ||||||
2297 | 0 | return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_DIGIT], "IsDigit"); | ||||
2298 | } | |||||
2299 | ||||||
2300 | bool | |||||
2301 | 0 | Perl_is_utf8_posix_digit(pTHX_ const U8 *p) | ||||
2302 | { | |||||
2303 | dVAR; | |||||
2304 | ||||||
2305 | PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT; | |||||
2306 | ||||||
2307 | /* Only true if is an ASCII digit character, and ASCII is invariant | |||||
2308 | * under utf8, so can just use the macro */ | |||||
2309 | 0 | return isDIGIT_A(*p); | ||||
2310 | } | |||||
2311 | ||||||
2312 | bool | |||||
2313 | 0 | Perl_is_utf8_upper(pTHX_ const U8 *p) | ||||
2314 | { | |||||
2315 | dVAR; | |||||
2316 | ||||||
2317 | PERL_ARGS_ASSERT_IS_UTF8_UPPER; | |||||
2318 | ||||||
2319 | 0 | return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_UPPER], "IsUppercase"); | ||||
2320 | } | |||||
2321 | ||||||
2322 | bool | |||||
2323 | 0 | Perl_is_utf8_lower(pTHX_ const U8 *p) | ||||
2324 | { | |||||
2325 | dVAR; | |||||
2326 | ||||||
2327 | PERL_ARGS_ASSERT_IS_UTF8_LOWER; | |||||
2328 | ||||||
2329 | 0 | return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_LOWER], "IsLowercase"); | ||||
2330 | } | |||||
2331 | ||||||
2332 | bool | |||||
2333 | 0 | Perl_is_utf8_cntrl(pTHX_ const U8 *p) | ||||
2334 | { | |||||
2335 | dVAR; | |||||
2336 | ||||||
2337 | PERL_ARGS_ASSERT_IS_UTF8_CNTRL; | |||||
2338 | ||||||
2339 | 0 | 0 | return isCNTRL_utf8(p); | |||
0 | ||||||
0 | ||||||
0 | ||||||
2340 | } | |||||
2341 | ||||||
2342 | bool | |||||
2343 | 0 | Perl_is_utf8_graph(pTHX_ const U8 *p) | ||||
2344 | { | |||||
2345 | dVAR; | |||||
2346 | ||||||
2347 | PERL_ARGS_ASSERT_IS_UTF8_GRAPH; | |||||
2348 | ||||||
2349 | 0 | return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_GRAPH], "IsGraph"); | ||||
2350 | } | |||||
2351 | ||||||
2352 | bool | |||||
2353 | 0 | Perl_is_utf8_print(pTHX_ const U8 *p) | ||||
2354 | { | |||||
2355 | dVAR; | |||||
2356 | ||||||
2357 | PERL_ARGS_ASSERT_IS_UTF8_PRINT; | |||||
2358 | ||||||
2359 | 0 | return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PRINT], "IsPrint"); | ||||
2360 | } | |||||
2361 | ||||||
2362 | bool | |||||
2363 | 0 | Perl_is_utf8_punct(pTHX_ const U8 *p) | ||||
2364 | { | |||||
2365 | dVAR; | |||||
2366 | ||||||
2367 | PERL_ARGS_ASSERT_IS_UTF8_PUNCT; | |||||
2368 | ||||||
2369 | 0 | return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PUNCT], "IsPunct"); | ||||
2370 | } | |||||
2371 | ||||||
2372 | bool | |||||
2373 | 0 | Perl_is_utf8_xdigit(pTHX_ const U8 *p) | ||||
2374 | { | |||||
2375 | dVAR; | |||||
2376 | ||||||
2377 | PERL_ARGS_ASSERT_IS_UTF8_XDIGIT; | |||||
2378 | ||||||
2379 | 0 | 0 | return is_XDIGIT_utf8(p); | |||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
2380 | } | |||||
2381 | ||||||
2382 | bool | |||||
2383 | 14 | Perl__is_utf8_mark(pTHX_ const U8 *p) | ||||
2384 | { | |||||
2385 | dVAR; | |||||
2386 | ||||||
2387 | PERL_ARGS_ASSERT__IS_UTF8_MARK; | |||||
2388 | ||||||
2389 | 14 | return is_utf8_common(p, &PL_utf8_mark, "IsM"); | ||||
2390 | } | |||||
2391 | ||||||
2392 | ||||||
2393 | bool | |||||
2394 | 0 | Perl_is_utf8_mark(pTHX_ const U8 *p) | ||||
2395 | { | |||||
2396 | dVAR; | |||||
2397 | ||||||
2398 | PERL_ARGS_ASSERT_IS_UTF8_MARK; | |||||
2399 | ||||||
2400 | 0 | return _is_utf8_mark(p); | ||||
2401 | } | |||||
2402 | ||||||
2403 | /* | |||||
2404 | =for apidoc to_utf8_case | |||||
2405 | ||||||
2406 | The C contains the pointer to the UTF-8 string encoding |
|||||
2407 | the character that is being converted. This routine assumes that the character | |||||
2408 | at C is well-formed. |
|||||
2409 | ||||||
2410 | The C |
|||||
2411 | conversion result to. The C |
|||||
2412 | of the result. | |||||
2413 | ||||||
2414 | The C |
|||||
2415 | ||||||
2416 | Both the special and normal mappings are stored in F |
|||||
2417 | and loaded by SWASHNEW, using F |
|||||
2418 | but not always, a multicharacter mapping), is tried first. | |||||
2419 | ||||||
2420 | The C |
|||||
2421 | hash %utf8::ToSpecLower. The access to the hash is through | |||||
2422 | Perl_to_utf8_case(). | |||||
2423 | ||||||
2424 | The C |
|||||
2425 | %utf8::ToLower. | |||||
2426 | ||||||
2427 | =cut */ | |||||
2428 | ||||||
2429 | UV | |||||
2430 | 2406114 | Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, | ||||
2431 | SV **swashp, const char *normal, const char *special) | |||||
2432 | { | |||||
2433 | dVAR; | |||||
2434 | 2406114 | STRLEN len = 0; | ||||
2435 | 2406114 | const UV uv1 = valid_utf8_to_uvchr(p, NULL); | ||||
2436 | ||||||
2437 | PERL_ARGS_ASSERT_TO_UTF8_CASE; | |||||
2438 | ||||||
2439 | /* Note that swash_fetch() doesn't output warnings for these because it | |||||
2440 | * assumes we will */ | |||||
2441 | 2406114 | 100 | if (uv1 >= UNICODE_SURROGATE_FIRST) { | |||
2442 | 586434 | 100 | if (uv1 <= UNICODE_SURROGATE_LAST) { | |||
2443 | 50 | 100 | if (ckWARN_d(WARN_SURROGATE)) { | |||
2444 | 36 | 50 | const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; | |||
50 | ||||||
0 | ||||||
2445 | 36 | Perl_warner(aTHX_ packWARN(WARN_SURROGATE), | ||||
2446 | "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1); | |||||
2447 | } | |||||
2448 | } | |||||
2449 | 586384 | 100 | else if (UNICODE_IS_SUPER(uv1)) { | |||
2450 | 54 | 100 | if (ckWARN_d(WARN_NON_UNICODE)) { | |||
2451 | 28 | 50 | const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; | |||
50 | ||||||
0 | ||||||
2452 | 28 | Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), | ||||
2453 | "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1); | |||||
2454 | } | |||||
2455 | } | |||||
2456 | ||||||
2457 | /* Note that non-characters are perfectly legal, so no warning should | |||||
2458 | * be given */ | |||||
2459 | } | |||||
2460 | ||||||
2461 | 2406082 | 100 | if (!*swashp) /* load on-demand */ | |||
2462 | 100 | *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL); | ||||
2463 | ||||||
2464 | 2406082 | 100 | if (special) { | |||
2465 | /* It might be "special" (sometimes, but not always, | |||||
2466 | * a multicharacter mapping) */ | |||||
2467 | 2078270 | HV * const hv = get_hv(special, 0); | ||||
2468 | SV **svp; | |||||
2469 | ||||||
2470 | 4156540 | 50 | if (hv && | |||
100 | ||||||
2471 | 3288656 | 50 | (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) && | |||
100 | ||||||
100 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
2472 | 1210386 | (*svp)) { | ||||
2473 | const char *s; | |||||
2474 | ||||||
2475 | 1210386 | 50 | s = SvPV_const(*svp, len); | |||
2476 | 1210386 | 50 | if (len == 1) | |||
2477 | /* EIGHTBIT */ | |||||
2478 | 0 | len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp; | ||||
2479 | else { | |||||
2480 | 1210386 | Copy(s, ustrp, len, U8); | ||||
2481 | } | |||||
2482 | } | |||||
2483 | } | |||||
2484 | ||||||
2485 | 2406082 | 100 | if (!len && *swashp) { | |||
50 | ||||||
2486 | 1195696 | const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */); | ||||
2487 | ||||||
2488 | 1195696 | 100 | if (uv2) { | |||
2489 | /* It was "normal" (a single character mapping). */ | |||||
2490 | 779284 | len = uvchr_to_utf8(ustrp, uv2) - ustrp; | ||||
2491 | } | |||||
2492 | } | |||||
2493 | ||||||
2494 | 2406082 | 100 | if (len) { | |||
2495 | 1989670 | 50 | if (lenp) { | |||
2496 | 1989670 | *lenp = len; | ||||
2497 | } | |||||
2498 | 1989670 | return valid_utf8_to_uvchr(ustrp, 0); | ||||
2499 | } | |||||
2500 | ||||||
2501 | /* Here, there was no mapping defined, which means that the code point maps | |||||
2502 | * to itself. Return the inputs */ | |||||
2503 | 416412 | len = UTF8SKIP(p); | ||||
2504 | 416412 | 100 | if (p != ustrp) { /* Don't copy onto itself */ | |||
2505 | 313988 | Copy(p, ustrp, len, U8); | ||||
2506 | } | |||||
2507 | ||||||
2508 | 416412 | 100 | if (lenp) | |||
2509 | 1411238 | *lenp = len; | ||||
2510 | ||||||
2511 | return uv1; | |||||
2512 | ||||||
2513 | } | |||||
2514 | ||||||
2515 | STATIC UV | |||||
2516 | 235496 | S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp) | ||||
2517 | { | |||||
2518 | /* This is called when changing the case of a utf8-encoded character above | |||||
2519 | * the Latin1 range, and the operation is in locale. If the result | |||||
2520 | * contains a character that crosses the 255/256 boundary, disallow the | |||||
2521 | * change, and return the original code point. See L |
|||||
2522 | * | |||||
2523 | * p points to the original string whose case was changed; assumed | |||||
2524 | * by this routine to be well-formed | |||||
2525 | * result the code point of the first character in the changed-case string | |||||
2526 | * ustrp points to the changed-case string ( |
|||||
2527 | * lenp points to the length of |
|||||
2528 | ||||||
2529 | UV original; /* To store the first code point of */ |
|||||
2530 | ||||||
2531 | PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING; | |||||
2532 | ||||||
2533 | assert(UTF8_IS_ABOVE_LATIN1(*p)); | |||||
2534 | ||||||
2535 | /* We know immediately if the first character in the string crosses the | |||||
2536 | * boundary, so can skip */ | |||||
2537 | 235496 | 100 | if (result > 255) { | |||
2538 | ||||||
2539 | /* Look at every character in the result; if any cross the | |||||
2540 | * boundary, the whole thing is disallowed */ | |||||
2541 | 71244 | U8* s = ustrp + UTF8SKIP(ustrp); | ||||
2542 | 71244 | U8* e = ustrp + *lenp; | ||||
2543 | 106866 | 100 | while (s < e) { | |||
2544 | 5992 | 50 | if (! UTF8_IS_ABOVE_LATIN1(*s)) { | |||
2545 | goto bad_crossing; | |||||
2546 | } | |||||
2547 | 0 | s += UTF8SKIP(s); | ||||
2548 | } | |||||
2549 | ||||||
2550 | /* Here, no characters crossed, result is ok as-is */ | |||||
2551 | return result; | |||||
2552 | } | |||||
2553 | ||||||
2554 | bad_crossing: | |||||
2555 | ||||||
2556 | /* Failed, have to return the original */ | |||||
2557 | 170244 | original = valid_utf8_to_uvchr(p, lenp); | ||||
2558 | 170244 | Copy(p, ustrp, *lenp, char); | ||||
2559 | 202870 | return original; | ||||
2560 | } | |||||
2561 | ||||||
2562 | /* | |||||
2563 | =for apidoc to_utf8_upper | |||||
2564 | ||||||
2565 | Instead use L. | |||||
2566 | ||||||
2567 | =cut */ | |||||
2568 | ||||||
2569 | /* Not currently externally documented, and subject to change: | |||||
2570 | * |
|||||
2571 | * |
|||||
2572 | * were used in the calculation; otherwise unchanged. */ | |||||
2573 | ||||||
2574 | UV | |||||
2575 | 183772 | Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr) | ||||
2576 | { | |||||
2577 | dVAR; | |||||
2578 | ||||||
2579 | UV result; | |||||
2580 | ||||||
2581 | PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS; | |||||
2582 | ||||||
2583 | 183772 | 100 | if (UTF8_IS_INVARIANT(*p)) { | |||
2584 | 29730 | 50 | if (flags) { | |||
2585 | 0 | result = toUPPER_LC(*p); | ||||
2586 | } | |||||
2587 | else { | |||||
2588 | 29730 | return _to_upper_title_latin1(*p, ustrp, lenp, 'S'); | ||||
2589 | } | |||||
2590 | } | |||||
2591 | 154042 | 100 | else if UTF8_IS_DOWNGRADEABLE_START(*p) { | |||
2592 | 27982 | 100 | if (flags) { | |||
2593 | 2 | 50 | result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); | |||
2594 | } | |||||
2595 | else { | |||||
2596 | 27980 | return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), | ||||
2597 | ustrp, lenp, 'S'); | |||||
2598 | } | |||||
2599 | } | |||||
2600 | else { /* utf8, ord above 255 */ | |||||
2601 | 126060 | result = CALL_UPPER_CASE(p, ustrp, lenp); | ||||
2602 | ||||||
2603 | 126040 | 50 | if (flags) { | |||
2604 | 0 | result = check_locale_boundary_crossing(p, result, ustrp, lenp); | ||||
2605 | } | |||||
2606 | 126040 | return result; | ||||
2607 | } | |||||
2608 | ||||||
2609 | /* Here, used locale rules. Convert back to utf8 */ | |||||
2610 | 2 | 50 | if (UTF8_IS_INVARIANT(result)) { | |||
2611 | 0 | *ustrp = (U8) result; | ||||
2612 | 0 | *lenp = 1; | ||||
2613 | } | |||||
2614 | else { | |||||
2615 | 2 | *ustrp = UTF8_EIGHT_BIT_HI(result); | ||||
2616 | 2 | *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); | ||||
2617 | 2 | *lenp = 2; | ||||
2618 | } | |||||
2619 | ||||||
2620 | 2 | 50 | if (tainted_ptr) { | |||
2621 | 91877 | *tainted_ptr = TRUE; | ||||
2622 | } | |||||
2623 | return result; | |||||
2624 | } | |||||
2625 | ||||||
2626 | /* | |||||
2627 | =for apidoc to_utf8_title | |||||
2628 | ||||||
2629 | Instead use L. | |||||
2630 | ||||||
2631 | =cut */ | |||||
2632 | ||||||
2633 | /* Not currently externally documented, and subject to change: | |||||
2634 | * |
|||||
2635 | * Since titlecase is not defined in POSIX, uppercase is used instead | |||||
2636 | * for these/ | |||||
2637 | * |
|||||
2638 | * were used in the calculation; otherwise unchanged. */ | |||||
2639 | ||||||
2640 | UV | |||||
2641 | 10196 | Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr) | ||||
2642 | { | |||||
2643 | dVAR; | |||||
2644 | ||||||
2645 | UV result; | |||||
2646 | ||||||
2647 | PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; | |||||
2648 | ||||||
2649 | 10196 | 100 | if (UTF8_IS_INVARIANT(*p)) { | |||
2650 | 982 | 50 | if (flags) { | |||
2651 | 0 | result = toUPPER_LC(*p); | ||||
2652 | } | |||||
2653 | else { | |||||
2654 | 982 | return _to_upper_title_latin1(*p, ustrp, lenp, 's'); | ||||
2655 | } | |||||
2656 | } | |||||
2657 | 9214 | 100 | else if UTF8_IS_DOWNGRADEABLE_START(*p) { | |||
2658 | 786 | 100 | if (flags) { | |||
2659 | 2 | 50 | result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); | |||
2660 | } | |||||
2661 | else { | |||||
2662 | 784 | return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), | ||||
2663 | ustrp, lenp, 's'); | |||||
2664 | } | |||||
2665 | } | |||||
2666 | else { /* utf8, ord above 255 */ | |||||
2667 | 8428 | result = CALL_TITLE_CASE(p, ustrp, lenp); | ||||
2668 | ||||||
2669 | 8422 | 50 | if (flags) { | |||
2670 | 0 | result = check_locale_boundary_crossing(p, result, ustrp, lenp); | ||||
2671 | } | |||||
2672 | 8422 | return result; | ||||
2673 | } | |||||
2674 | ||||||
2675 | /* Here, used locale rules. Convert back to utf8 */ | |||||
2676 | 2 | 50 | if (UTF8_IS_INVARIANT(result)) { | |||
2677 | 0 | *ustrp = (U8) result; | ||||
2678 | 0 | *lenp = 1; | ||||
2679 | } | |||||
2680 | else { | |||||
2681 | 2 | *ustrp = UTF8_EIGHT_BIT_HI(result); | ||||
2682 | 2 | *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); | ||||
2683 | 2 | *lenp = 2; | ||||
2684 | } | |||||
2685 | ||||||
2686 | 2 | 50 | if (tainted_ptr) { | |||
2687 | 5096 | *tainted_ptr = TRUE; | ||||
2688 | } | |||||
2689 | return result; | |||||
2690 | } | |||||
2691 | ||||||
2692 | /* | |||||
2693 | =for apidoc to_utf8_lower | |||||
2694 | ||||||
2695 | Instead use L. | |||||
2696 | ||||||
2697 | =cut */ | |||||
2698 | ||||||
2699 | /* Not currently externally documented, and subject to change: | |||||
2700 | * |
|||||
2701 | * |
|||||
2702 | * were used in the calculation; otherwise unchanged. */ | |||||
2703 | ||||||
2704 | UV | |||||
2705 | 192456 | Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr) | ||||
2706 | { | |||||
2707 | UV result; | |||||
2708 | ||||||
2709 | dVAR; | |||||
2710 | ||||||
2711 | PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; | |||||
2712 | ||||||
2713 | 192456 | 100 | if (UTF8_IS_INVARIANT(*p)) { | |||
2714 | 31466 | 100 | if (flags) { | |||
2715 | 256 | result = toLOWER_LC(*p); | ||||
2716 | } | |||||
2717 | else { | |||||
2718 | 46815 | return to_lower_latin1(*p, ustrp, lenp); | ||||
2719 | } | |||||
2720 | } | |||||
2721 | 160990 | 100 | else if UTF8_IS_DOWNGRADEABLE_START(*p) { | |||
2722 | 28714 | 100 | if (flags) { | |||
2723 | 260 | 50 | result = toLOWER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); | |||
2724 | } | |||||
2725 | else { | |||||
2726 | 42681 | return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), | ||||
2727 | ustrp, lenp); | |||||
2728 | } | |||||
2729 | } | |||||
2730 | else { /* utf8, ord above 255 */ | |||||
2731 | 132276 | result = CALL_LOWER_CASE(p, ustrp, lenp); | ||||
2732 | ||||||
2733 | 132270 | 50 | if (flags) { | |||
2734 | 0 | result = check_locale_boundary_crossing(p, result, ustrp, lenp); | ||||
2735 | } | |||||
2736 | ||||||
2737 | 132270 | return result; | ||||
2738 | } | |||||
2739 | ||||||
2740 | /* Here, used locale rules. Convert back to utf8 */ | |||||
2741 | 516 | 100 | if (UTF8_IS_INVARIANT(result)) { | |||
2742 | 256 | *ustrp = (U8) result; | ||||
2743 | 256 | *lenp = 1; | ||||
2744 | } | |||||
2745 | else { | |||||
2746 | 260 | *ustrp = UTF8_EIGHT_BIT_HI(result); | ||||
2747 | 260 | *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); | ||||
2748 | 260 | *lenp = 2; | ||||
2749 | } | |||||
2750 | ||||||
2751 | 516 | 50 | if (tainted_ptr) { | |||
2752 | 96483 | *tainted_ptr = TRUE; | ||||
2753 | } | |||||
2754 | return result; | |||||
2755 | } | |||||
2756 | ||||||
2757 | /* | |||||
2758 | =for apidoc to_utf8_fold | |||||
2759 | ||||||
2760 | Instead use L. | |||||
2761 | ||||||
2762 | =cut */ | |||||
2763 | ||||||
2764 | /* Not currently externally documented, and subject to change, | |||||
2765 | * in |
|||||
2766 | * bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code | |||||
2767 | * points < 256. Since foldcase is not defined in | |||||
2768 | * POSIX, lowercase is used instead | |||||
2769 | * bit FOLD_FLAGS_FULL is set iff full case folds are to be used; | |||||
2770 | * otherwise simple folds | |||||
2771 | * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are | |||||
2772 | * prohibited | |||||
2773 | * |
|||||
2774 | * were used in the calculation; otherwise unchanged. */ | |||||
2775 | ||||||
2776 | UV | |||||
2777 | 1282382 | Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr) | ||||
2778 | { | |||||
2779 | dVAR; | |||||
2780 | ||||||
2781 | UV result; | |||||
2782 | ||||||
2783 | PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS; | |||||
2784 | ||||||
2785 | /* These are mutually exclusive */ | |||||
2786 | assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII))); | |||||
2787 | ||||||
2788 | assert(p != ustrp); /* Otherwise overwrites */ | |||||
2789 | ||||||
2790 | 1282382 | 100 | if (UTF8_IS_INVARIANT(*p)) { | |||
2791 | 68476 | 100 | if (flags & FOLD_FLAGS_LOCALE) { | |||
2792 | 256 | result = toFOLD_LC(*p); | ||||
2793 | } | |||||
2794 | else { | |||||
2795 | 68220 | return _to_fold_latin1(*p, ustrp, lenp, | ||||
2796 | flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); | |||||
2797 | } | |||||
2798 | } | |||||
2799 | 1213906 | 100 | else if UTF8_IS_DOWNGRADEABLE_START(*p) { | |||
2800 | 55790 | 100 | if (flags & FOLD_FLAGS_LOCALE) { | |||
2801 | 256 | 50 | result = toFOLD_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); | |||
2802 | } | |||||
2803 | else { | |||||
2804 | 55534 | return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), | ||||
2805 | ustrp, lenp, | |||||
2806 | flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); | |||||
2807 | } | |||||
2808 | } | |||||
2809 | else { /* utf8, ord above 255 */ | |||||
2810 | 1158116 | 100 | result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL); | |||
2811 | ||||||
2812 | 1158116 | 100 | if (flags & FOLD_FLAGS_LOCALE) { | |||
2813 | ||||||
2814 | /* Special case these characters, as what normally gets returned | |||||
2815 | * under locale doesn't work */ | |||||
2816 | 327246 | 100 | if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1 | |||
2817 | 259094 | 100 | && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8, | |||
2818 | sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1)) | |||||
2819 | { | |||||
2820 | goto return_long_s; | |||||
2821 | } | |||||
2822 | 277472 | 100 | else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1 | |||
2823 | 209320 | 100 | && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8, | |||
2824 | sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1)) | |||||
2825 | { | |||||
2826 | goto return_ligature_st; | |||||
2827 | } | |||||
2828 | 235496 | return check_locale_boundary_crossing(p, result, ustrp, lenp); | ||||
2829 | } | |||||
2830 | 830870 | 100 | else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { | |||
2831 | return result; | |||||
2832 | } | |||||
2833 | else { | |||||
2834 | /* This is called when changing the case of a utf8-encoded | |||||
2835 | * character above the ASCII range, and the result should not | |||||
2836 | * contain an ASCII character. */ | |||||
2837 | ||||||
2838 | UV original; /* To store the first code point of */ |
|||||
2839 | ||||||
2840 | /* Look at every character in the result; if any cross the | |||||
2841 | * boundary, the whole thing is disallowed */ | |||||
2842 | U8* s = ustrp; | |||||
2843 | 312882 | U8* e = ustrp + *lenp; | ||||
2844 | 543507 | 100 | while (s < e) { | |||
2845 | 318878 | 100 | if (isASCII(*s)) { | |||
2846 | /* Crossed, have to return the original */ | |||||
2847 | 244694 | original = valid_utf8_to_uvchr(p, lenp); | ||||
2848 | ||||||
2849 | /* But in these instances, there is an alternative we can | |||||
2850 | * return that is valid */ | |||||
2851 | 367041 | 100 | if (original == LATIN_CAPITAL_LETTER_SHARP_S | |||
2852 | 244694 | || original == LATIN_SMALL_LETTER_SHARP_S) | ||||
2853 | { | |||||
2854 | goto return_long_s; | |||||
2855 | } | |||||
2856 | 196330 | 100 | else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) { | |||
2857 | goto return_ligature_st; | |||||
2858 | } | |||||
2859 | 155300 | Copy(p, ustrp, *lenp, char); | ||||
2860 | 155300 | return original; | ||||
2861 | } | |||||
2862 | 74184 | s += UTF8SKIP(s); | ||||
2863 | } | |||||
2864 | ||||||
2865 | /* Here, no characters crossed, result is ok as-is */ | |||||
2866 | return result; | |||||
2867 | } | |||||
2868 | } | |||||
2869 | ||||||
2870 | /* Here, used locale rules. Convert back to utf8 */ | |||||
2871 | 512 | 100 | if (UTF8_IS_INVARIANT(result)) { | |||
2872 | 256 | *ustrp = (U8) result; | ||||
2873 | 256 | *lenp = 1; | ||||
2874 | } | |||||
2875 | else { | |||||
2876 | 256 | *ustrp = UTF8_EIGHT_BIT_HI(result); | ||||
2877 | 256 | *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result); | ||||
2878 | 256 | *lenp = 2; | ||||
2879 | } | |||||
2880 | ||||||
2881 | 512 | 50 | if (tainted_ptr) { | |||
2882 | 512 | *tainted_ptr = TRUE; | ||||
2883 | } | |||||
2884 | return result; | |||||
2885 | ||||||
2886 | return_long_s: | |||||
2887 | /* Certain folds to 'ss' are prohibited by the options, but they do allow | |||||
2888 | * folds to a string of two of these characters. By returning this | |||||
2889 | * instead, then, e.g., | |||||
2890 | * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}") | |||||
2891 | * works. */ | |||||
2892 | ||||||
2893 | 98138 | *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; | ||||
2894 | 98138 | Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, | ||||
2895 | ustrp, *lenp, U8); | |||||
2896 | 98138 | return LATIN_SMALL_LETTER_LONG_S; | ||||
2897 | ||||||
2898 | return_ligature_st: | |||||
2899 | /* Two folds to 'st' are prohibited by the options; instead we pick one and | |||||
2900 | * have the other one fold to it */ | |||||
2901 | ||||||
2902 | 83006 | *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1; | ||||
2903 | 83006 | Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8); | ||||
2904 | 682694 | return LATIN_SMALL_LIGATURE_ST; | ||||
2905 | } | |||||
2906 | ||||||
2907 | /* Note: | |||||
2908 | * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch(). | |||||
2909 | * C |
|||||
2910 | * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl. | |||||
2911 | */ | |||||
2912 | ||||||
2913 | SV* | |||||
2914 | 182 | Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none) | ||||
2915 | { | |||||
2916 | PERL_ARGS_ASSERT_SWASH_INIT; | |||||
2917 | ||||||
2918 | /* Returns a copy of a swash initiated by the called function. This is the | |||||
2919 | * public interface, and returning a copy prevents others from doing | |||||
2920 | * mischief on the original */ | |||||
2921 | ||||||
2922 | 182 | return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL)); | ||||
2923 | } | |||||
2924 | ||||||
2925 | SV* | |||||
2926 | 637998 | Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p) | ||||
2927 | { | |||||
2928 | /* Initialize and return a swash, creating it if necessary. It does this | |||||
2929 | * by calling utf8_heavy.pl in the general case. The returned value may be | |||||
2930 | * the swash's inversion list instead if the input parameters allow it. | |||||
2931 | * Which is returned should be immaterial to callers, as the only | |||||
2932 | * operations permitted on a swash, swash_fetch(), _get_swash_invlist(), | |||||
2933 | * and swash_to_invlist() handle both these transparently. | |||||
2934 | * | |||||
2935 | * This interface should only be used by functions that won't destroy or | |||||
2936 | * adversely change the swash, as doing so affects all other uses of the | |||||
2937 | * swash in the program; the general public should use 'Perl_swash_init' | |||||
2938 | * instead. | |||||
2939 | * | |||||
2940 | * pkg is the name of the package that |
|||||
2941 | * name is the name of the swash to find. Typically it is a Unicode | |||||
2942 | * property name, including user-defined ones | |||||
2943 | * listsv is a string to initialize the swash with. It must be of the form | |||||
2944 | * documented as the subroutine return value in | |||||
2945 | * L |
|||||
2946 | * minbits is the number of bits required to represent each data element. | |||||
2947 | * It is '1' for binary properties. | |||||
2948 | * none I (khw) do not understand this one, but it is used only in tr///. | |||||
2949 | * invlist is an inversion list to initialize the swash with (or NULL) | |||||
2950 | * flags_p if non-NULL is the address of various input and output flag bits | |||||
2951 | * to the routine, as follows: ('I' means is input to the routine; | |||||
2952 | * 'O' means output from the routine. Only flags marked O are | |||||
2953 | * meaningful on return.) | |||||
2954 | * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash | |||||
2955 | * came from a user-defined property. (I O) | |||||
2956 | * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking | |||||
2957 | * when the swash cannot be located, to simply return NULL. (I) | |||||
2958 | * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a | |||||
2959 | * return of an inversion list instead of a swash hash if this routine | |||||
2960 | * thinks that would result in faster execution of swash_fetch() later | |||||
2961 | * on. (I) | |||||
2962 | * | |||||
2963 | * Thus there are three possible inputs to find the swash: |
|||||
2964 | * |
|||||
2965 | * will be the union of the specified ones, although |
|||||
2966 | * actions can intersect, etc. what |
|||||
2967 | * | |||||
2968 | * |
|||||
2969 | ||||||
2970 | dVAR; | |||||
2971 | SV* retval = &PL_sv_undef; | |||||
2972 | HV* swash_hv = NULL; | |||||
2973 | const int invlist_swash_boundary = | |||||
2974 | 637844 | 100 | (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST) | |||
2975 | ? 512 /* Based on some benchmarking, but not extensive, see commit | |||||
2976 | message */ | |||||
2977 | 637998 | 100 | : -1; /* Never return just an inversion list */ | |||
2978 | ||||||
2979 | assert(listsv != &PL_sv_undef || strNE(name, "") || invlist); | |||||
2980 | assert(! invlist || minbits == 1); | |||||
2981 | ||||||
2982 | /* If data was passed in to go out to utf8_heavy to find the swash of, do | |||||
2983 | * so */ | |||||
2984 | 871652 | 100 | if (listsv != &PL_sv_undef || strNE(name, "")) { | |||
100 | ||||||
50 | ||||||
2985 | 233654 | dSP; | ||||
2986 | 233654 | const size_t pkg_len = strlen(pkg); | ||||
2987 | 233654 | const size_t name_len = strlen(name); | ||||
2988 | 233654 | HV * const stash = gv_stashpvn(pkg, pkg_len, 0); | ||||
2989 | SV* errsv_save; | |||||
2990 | GV *method; | |||||
2991 | ||||||
2992 | PERL_ARGS_ASSERT__CORE_SWASH_INIT; | |||||
2993 | ||||||
2994 | 233654 | 100 | PUSHSTACKi(PERLSI_MAGIC); | |||
2995 | 233654 | ENTER; | ||||
2996 | 233654 | SAVEHINTS(); | ||||
2997 | 233654 | save_re_context(); | ||||
2998 | /* We might get here via a subroutine signature which uses a utf8 | |||||
2999 | * parameter name, at which point PL_subname will have been set | |||||
3000 | * but not yet used. */ | |||||
3001 | 233654 | save_item(PL_subname); | ||||
3002 | 233654 | 100 | if (PL_parser && PL_parser->error_count) | |||
100 | ||||||
3003 | 2 | SAVEI8(PL_parser->error_count), PL_parser->error_count = 0; | ||||
3004 | 233654 | method = gv_fetchmeth(stash, "SWASHNEW", 8, -1); | ||||
3005 | 233654 | 100 | if (!method) { /* demand load utf8 */ | |||
3006 | 1104 | ENTER; | ||||
3007 | 1104 | 50 | if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save); | |||
3008 | 1104 | GvSV(PL_errgv) = NULL; | ||||
3009 | /* It is assumed that callers of this routine are not passing in | |||||
3010 | * any user derived data. */ | |||||
3011 | /* Need to do this after save_re_context() as it will set | |||||
3012 | * PL_tainted to 1 while saving $1 etc (see the code after getrx: | |||||
3013 | * in Perl_magic_get). Even line to create errsv_save can turn on | |||||
3014 | * PL_tainted. */ | |||||
3015 | #ifndef NO_TAINT_SUPPORT | |||||
3016 | 1104 | SAVEBOOL(TAINT_get); | ||||
3017 | 1104 | TAINT_NOT; | ||||
3018 | #endif | |||||
3019 | 1104 | Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len), | ||||
3020 | NULL); | |||||
3021 | { | |||||
3022 | /* Not ERRSV, as there is no need to vivify a scalar we are | |||||
3023 | about to discard. */ | |||||
3024 | 1104 | SV * const errsv = GvSV(PL_errgv); | ||||
3025 | 1104 | 50 | if (!SvTRUE(errsv)) { | |||
50 | ||||||
0 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
0 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
3026 | 2208 | GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save); | ||||
3027 | 1104 | SvREFCNT_dec(errsv); | ||||
3028 | } | |||||
3029 | } | |||||
3030 | 1104 | LEAVE; | ||||
3031 | } | |||||
3032 | 233654 | SPAGAIN; | ||||
3033 | 233654 | 50 | PUSHMARK(SP); | |||
3034 | 116827 | EXTEND(SP,5); | ||||
3035 | 233654 | mPUSHp(pkg, pkg_len); | ||||
3036 | 233654 | mPUSHp(name, name_len); | ||||
3037 | 233654 | PUSHs(listsv); | ||||
3038 | 233654 | mPUSHi(minbits); | ||||
3039 | 233654 | mPUSHi(none); | ||||
3040 | 233654 | PUTBACK; | ||||
3041 | 233654 | 50 | if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save); | |||
3042 | 233654 | GvSV(PL_errgv) = NULL; | ||||
3043 | /* If we already have a pointer to the method, no need to use | |||||
3044 | * call_method() to repeat the lookup. */ | |||||
3045 | 467308 | 100 | if (method | |||
50 | ||||||
3046 | 232550 | ? call_sv(MUTABLE_SV(method), G_SCALAR) | ||||
3047 | 1104 | : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD)) | ||||
3048 | { | |||||
3049 | 233654 | retval = *PL_stack_sp--; | ||||
3050 | SvREFCNT_inc(retval); | |||||
3051 | } | |||||
3052 | { | |||||
3053 | /* Not ERRSV. See above. */ | |||||
3054 | 233654 | SV * const errsv = GvSV(PL_errgv); | ||||
3055 | 233654 | 100 | if (!SvTRUE(errsv)) { | |||
50 | ||||||
0 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
0 | ||||||
50 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
0 | ||||||
3056 | 467308 | GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save); | ||||
3057 | 233654 | SvREFCNT_dec(errsv); | ||||
3058 | } | |||||
3059 | } | |||||
3060 | 233654 | LEAVE; | ||||
3061 | 233654 | 50 | POPSTACK; | |||
3062 | 233654 | 100 | if (IN_PERL_COMPILETIME) { | |||
3063 | 181458 | CopHINTS_set(PL_curcop, PL_hints); | ||||
3064 | } | |||||
3065 | 233654 | 100 | if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) { | |||
50 | ||||||
3066 | 50258 | 50 | if (SvPOK(retval)) | |||
3067 | ||||||
3068 | /* If caller wants to handle missing properties, let them */ | |||||
3069 | 50258 | 50 | if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) { | |||
100 | ||||||
3070 | return NULL; | |||||
3071 | } | |||||
3072 | 25126 | Perl_croak(aTHX_ | ||||
3073 | "Can't find Unicode property definition \"%"SVf"\"", | |||||
3074 | SVfARG(retval)); | |||||
3075 | Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref"); | |||||
3076 | } | |||||
3077 | } /* End of calling the module to find the swash */ | |||||
3078 | ||||||
3079 | /* If this operation fetched a swash, and we will need it later, get it */ | |||||
3080 | 587740 | 100 | if (retval != &PL_sv_undef | |||
3081 | 183396 | 100 | && (minbits == 1 || (flags_p | |||
50 | ||||||
3082 | 0 | 0 | && ! (*flags_p | |||
3083 | 0 | & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)))) | ||||
3084 | { | |||||
3085 | 183172 | swash_hv = MUTABLE_HV(SvRV(retval)); | ||||
3086 | ||||||
3087 | /* If we don't already know that there is a user-defined component to | |||||
3088 | * this swash, and the user has indicated they wish to know if there is | |||||
3089 | * one (by passing |
|||||
3090 | 183172 | 100 | if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) { | |||
50 | ||||||
3091 | 183088 | SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE); | ||||
3092 | 183088 | 50 | if (user_defined && SvUV(*user_defined)) { | |||
50 | ||||||
100 | ||||||
3093 | 130 | *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; | ||||
3094 | } | |||||
3095 | } | |||||
3096 | } | |||||
3097 | ||||||
3098 | /* Make sure there is an inversion list for binary properties */ | |||||
3099 | 587740 | 100 | if (minbits == 1) { | |||
3100 | SV** swash_invlistsvp = NULL; | |||||
3101 | 587516 | SV* swash_invlist = NULL; | ||||
3102 | bool invlist_in_swash_is_valid = FALSE; | |||||
3103 | bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has | |||||
3104 | an unclaimed reference count */ | |||||
3105 | ||||||
3106 | /* If this operation fetched a swash, get its already existing | |||||
3107 | * inversion list, or create one for it */ | |||||
3108 | ||||||
3109 | 587516 | 100 | if (swash_hv) { | |||
3110 | 183172 | swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE); | ||||
3111 | 183172 | 100 | if (swash_invlistsvp) { | |||
3112 | 177752 | swash_invlist = *swash_invlistsvp; | ||||
3113 | invlist_in_swash_is_valid = TRUE; | |||||
3114 | } | |||||
3115 | else { | |||||
3116 | 5420 | swash_invlist = _swash_to_invlist(retval); | ||||
3117 | swash_invlist_unclaimed = TRUE; | |||||
3118 | } | |||||
3119 | } | |||||
3120 | ||||||
3121 | /* If an inversion list was passed in, have to include it */ | |||||
3122 | 587516 | 100 | if (invlist) { | |||
3123 | ||||||
3124 | /* Any fetched swash will by now have an inversion list in it; | |||||
3125 | * otherwise |
|||||
3126 | * didn't fetch a swash */ | |||||
3127 | 405002 | 100 | if (swash_invlist) { | |||
3128 | ||||||
3129 | /* Add the passed-in inversion list, which invalidates the one | |||||
3130 | * already stored in the swash */ | |||||
3131 | invlist_in_swash_is_valid = FALSE; | |||||
3132 | 658 | _invlist_union(invlist, swash_invlist, &swash_invlist); | ||||
3133 | } | |||||
3134 | else { | |||||
3135 | ||||||
3136 | /* Here, there is no swash already. Set up a minimal one, if | |||||
3137 | * we are going to return a swash */ | |||||
3138 | 404344 | 100 | if ((int) _invlist_len(invlist) > invlist_swash_boundary) { | |||
3139 | 6550 | swash_hv = newHV(); | ||||
3140 | 6550 | retval = newRV_noinc(MUTABLE_SV(swash_hv)); | ||||
3141 | } | |||||
3142 | 404344 | swash_invlist = invlist; | ||||
3143 | } | |||||
3144 | } | |||||
3145 | ||||||
3146 | /* Here, we have computed the union of all the passed-in data. It may | |||||
3147 | * be that there was an inversion list in the swash which didn't get | |||||
3148 | * touched; otherwise save the one computed one */ | |||||
3149 | 587516 | 100 | if (! invlist_in_swash_is_valid | |||
3150 | 614646 | 100 | && (int) _invlist_len(swash_invlist) > invlist_swash_boundary) | |||
3151 | { | |||||
3152 | 11642 | 50 | if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist)) | |||
3153 | { | |||||
3154 | 0 | Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); | ||||
3155 | } | |||||
3156 | /* We just stole a reference count. */ | |||||
3157 | 11642 | 100 | if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE; | |||
3158 | 6550 | else SvREFCNT_inc_simple_void_NN(swash_invlist); | ||||
3159 | } | |||||
3160 | ||||||
3161 | /* Use the inversion list stand-alone if small enough */ | |||||
3162 | 881274 | 100 | if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) { | |||
3163 | 398126 | SvREFCNT_dec(retval); | ||||
3164 | 398126 | 100 | if (!swash_invlist_unclaimed) | |||
3165 | 397798 | SvREFCNT_inc_simple_void_NN(swash_invlist); | ||||
3166 | 398126 | retval = newRV_noinc(swash_invlist); | ||||
3167 | } | |||||
3168 | } | |||||
3169 | ||||||
3170 | 600306 | return retval; | ||||
3171 | } | |||||
3172 | ||||||
3173 | ||||||
3174 | /* This API is wrong for special case conversions since we may need to | |||||
3175 | * return several Unicode characters for a single Unicode character | |||||
3176 | * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is | |||||
3177 | * the lower-level routine, and it is similarly broken for returning | |||||
3178 | * multiple values. --jhi | |||||
3179 | * For those, you should use to_utf8_case() instead */ | |||||
3180 | /* Now SWASHGET is recasted into S_swatch_get in this file. */ | |||||
3181 | ||||||
3182 | /* Note: | |||||
3183 | * Returns the value of property/mapping C |
|||||
3184 | * of the string C |
|||||
3185 | * assumed to be in well-formed utf8. If C |
|||||
3186 | * is assumed to be in native 8-bit encoding. Caches the swatch in C |
|||||
3187 | * | |||||
3188 | * A "swash" is a hash which contains initially the keys/values set up by | |||||
3189 | * SWASHNEW. The purpose is to be able to completely represent a Unicode | |||||
3190 | * property for all possible code points. Things are stored in a compact form | |||||
3191 | * (see utf8_heavy.pl) so that calculation is required to find the actual | |||||
3192 | * property value for a given code point. As code points are looked up, new | |||||
3193 | * key/value pairs are added to the hash, so that the calculation doesn't have | |||||
3194 | * to ever be re-done. Further, each calculation is done, not just for the | |||||
3195 | * desired one, but for a whole block of code points adjacent to that one. | |||||
3196 | * For binary properties on ASCII machines, the block is usually for 64 code | |||||
3197 | * points, starting with a code point evenly divisible by 64. Thus if the | |||||
3198 | * property value for code point 257 is requested, the code goes out and | |||||
3199 | * calculates the property values for all 64 code points between 256 and 319, | |||||
3200 | * and stores these as a single 64-bit long bit vector, called a "swatch", | |||||
3201 | * under the key for code point 256. The key is the UTF-8 encoding for code | |||||
3202 | * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding | |||||
3203 | * for a code point is 13 bytes, the key will be 12 bytes long. If the value | |||||
3204 | * for code point 258 is then requested, this code realizes that it would be | |||||
3205 | * stored under the key for 256, and would find that value and extract the | |||||
3206 | * relevant bit, offset from 256. | |||||
3207 | * | |||||
3208 | * Non-binary properties are stored in as many bits as necessary to represent | |||||
3209 | * their values (32 currently, though the code is more general than that), not | |||||
3210 | * as single bits, but the principal is the same: the value for each key is a | |||||
3211 | * vector that encompasses the property values for all code points whose UTF-8 | |||||
3212 | * representations are represented by the key. That is, for all code points | |||||
3213 | * whose UTF-8 representations are length N bytes, and the key is the first N-1 | |||||
3214 | * bytes of that. | |||||
3215 | */ | |||||
3216 | UV | |||||
3217 | 9535344 | Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) | ||||
3218 | { | |||||
3219 | dVAR; | |||||
3220 | 9535344 | HV *const hv = MUTABLE_HV(SvRV(swash)); | ||||
3221 | U32 klen; | |||||
3222 | U32 off; | |||||
3223 | STRLEN slen; | |||||
3224 | STRLEN needents; | |||||
3225 | const U8 *tmps = NULL; | |||||
3226 | U32 bit; | |||||
3227 | SV *swatch; | |||||
3228 | 9535344 | const U8 c = *ptr; | ||||
3229 | ||||||
3230 | PERL_ARGS_ASSERT_SWASH_FETCH; | |||||
3231 | ||||||
3232 | /* If it really isn't a hash, it isn't really swash; must be an inversion | |||||
3233 | * list */ | |||||
3234 | 9535344 | 100 | if (SvTYPE(hv) != SVt_PVHV) { | |||
3235 | 2874804 | 50 | return _invlist_contains_cp((SV*)hv, | |||
3236 | (do_utf8) | |||||
3237 | ? valid_utf8_to_uvchr(ptr, NULL) | |||||
3238 | : c); | |||||
3239 | } | |||||
3240 | ||||||
3241 | /* We store the values in a "swatch" which is a vec() value in a swash | |||||
3242 | * hash. Code points 0-255 are a single vec() stored with key length | |||||
3243 | * (klen) 0. All other code points have a UTF-8 representation | |||||
3244 | * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which | |||||
3245 | * share 0xAA..0xYY, which is the key in the hash to that vec. So the key | |||||
3246 | * length for them is the length of the encoded char - 1. ptr[klen] is the | |||||
3247 | * final byte in the sequence representing the character */ | |||||
3248 | 8097942 | 50 | if (!do_utf8 || UTF8_IS_INVARIANT(c)) { | |||
100 | ||||||
3249 | klen = 0; | |||||
3250 | needents = 256; | |||||
3251 | 14106 | off = c; | ||||
3252 | } | |||||
3253 | 8083836 | 100 | else if (UTF8_IS_DOWNGRADEABLE_START(c)) { | |||
3254 | klen = 0; | |||||
3255 | needents = 256; | |||||
3256 | 3928 | off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1)); | ||||
3257 | } | |||||
3258 | else { | |||||
3259 | 8079908 | klen = UTF8SKIP(ptr) - 1; | ||||
3260 | ||||||
3261 | /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into | |||||
3262 | * the vec is the final byte in the sequence. (In EBCDIC this is | |||||
3263 | * converted to I8 to get consecutive values.) To help you visualize | |||||
3264 | * all this: | |||||
3265 | * Straight 1047 After final byte | |||||
3266 | * UTF-8 UTF-EBCDIC I8 transform | |||||
3267 | * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0 | |||||
3268 | * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1 | |||||
3269 | * ... | |||||
3270 | * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9 | |||||
3271 | * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA | |||||
3272 | * ... | |||||
3273 | * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2 | |||||
3274 | * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3 | |||||
3275 | * ... | |||||
3276 | * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB | |||||
3277 | * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC | |||||
3278 | * ... | |||||
3279 | * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF | |||||
3280 | * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41 | |||||
3281 | * | |||||
3282 | * (There are no discontinuities in the elided (...) entries.) | |||||
3283 | * The UTF-8 key for these 33 code points is '\xD0' (which also is the | |||||
3284 | * key for the next 31, up through U+043F, whose UTF-8 final byte is | |||||
3285 | * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points. | |||||
3286 | * The final UTF-8 byte, which ranges between \x80 and \xBF, is an | |||||
3287 | * index into the vec() swatch (after subtracting 0x80, which we | |||||
3288 | * actually do with an '&'). | |||||
3289 | * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32 | |||||
3290 | * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has | |||||
3291 | * dicontinuities which go away by transforming it into I8, and we | |||||
3292 | * effectively subtract 0xA0 to get the index. */ | |||||
3293 | needents = (1 << UTF_ACCUMULATION_SHIFT); | |||||
3294 | 8079908 | off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK; | ||||
3295 | } | |||||
3296 | ||||||
3297 | /* | |||||
3298 | * This single-entry cache saves about 1/3 of the utf8 overhead in test | |||||
3299 | * suite. (That is, only 7-8% overall over just a hash cache. Still, | |||||
3300 | * it's nothing to sniff at.) Pity we usually come through at least | |||||
3301 | * two function calls to get here... | |||||
3302 | * | |||||
3303 | * NB: this code assumes that swatches are never modified, once generated! | |||||
3304 | */ | |||||
3305 | ||||||
3306 | 12128315 | 100 | if (hv == PL_last_swash_hv && | |||
100 | ||||||
3307 | 12021991 | 100 | klen == PL_last_swash_klen && | |||
3308 | 7982572 | 100 | (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) ) | |||
3309 | { | |||||
3310 | 7832052 | tmps = PL_last_swash_tmps; | ||||
3311 | 7832052 | slen = PL_last_swash_slen; | ||||
3312 | } | |||||
3313 | else { | |||||
3314 | /* Try our second-level swatch cache, kept in a hash. */ | |||||
3315 | 265890 | SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE); | ||||
3316 | ||||||
3317 | /* If not cached, generate it via swatch_get */ | |||||
3318 | 265890 | 100 | if (!svp || !SvPOK(*svp) | |||
50 | ||||||
3319 | 244590 | 50 | || !(tmps = (const U8*)SvPV_const(*svp, slen))) | |||
50 | ||||||
3320 | { | |||||
3321 | 21300 | 100 | if (klen) { | |||
3322 | 20738 | const UV code_point = valid_utf8_to_uvchr(ptr, NULL); | ||||
3323 | 20738 | swatch = swatch_get(swash, | ||||
3324 | code_point & ~((UV)needents - 1), | |||||
3325 | needents); | |||||
3326 | } | |||||
3327 | else { /* For the first 256 code points, the swatch has a key of | |||||
3328 | length 0 */ | |||||
3329 | 562 | swatch = swatch_get(swash, 0, needents); | ||||
3330 | } | |||||
3331 | ||||||
3332 | 21300 | 100 | if (IN_PERL_COMPILETIME) | |||
3333 | 4918 | CopHINTS_set(PL_curcop, PL_hints); | ||||
3334 | ||||||
3335 | 21300 | svp = hv_store(hv, (const char *)ptr, klen, swatch, 0); | ||||
3336 | ||||||
3337 | 21300 | 50 | if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) | |||
50 | ||||||
50 | ||||||
3338 | 21300 | 50 | || (slen << 3) < needents) | |||
3339 | 0 | Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, " | ||||
3340 | "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf, | |||||
3341 | svp, tmps, (UV)slen, (UV)needents); | |||||
3342 | } | |||||
3343 | ||||||
3344 | 265890 | PL_last_swash_hv = hv; | ||||
3345 | assert(klen <= sizeof(PL_last_swash_key)); | |||||
3346 | 265890 | PL_last_swash_klen = (U8)klen; | ||||
3347 | /* FIXME change interpvar.h? */ | |||||
3348 | 265890 | PL_last_swash_tmps = (U8 *) tmps; | ||||
3349 | 265890 | PL_last_swash_slen = slen; | ||||
3350 | 265890 | 100 | if (klen) | |||
3351 | 256902 | Copy(ptr, PL_last_swash_key, klen, U8); | ||||
3352 | } | |||||
3353 | ||||||
3354 | 8097942 | switch ((int)((slen << 3) / needents)) { | ||||
3355 | case 1: | |||||
3356 | 6899056 | bit = 1 << (off & 7); | ||||
3357 | 6899056 | off >>= 3; | ||||
3358 | 6899056 | return (tmps[off] & bit) != 0; | ||||
3359 | case 8: | |||||
3360 | 70 | return tmps[off]; | ||||
3361 | case 16: | |||||
3362 | 2986 | off <<= 1; | ||||
3363 | 2986 | return (tmps[off] << 8) + tmps[off + 1] ; | ||||
3364 | case 32: | |||||
3365 | 1195830 | off <<= 2; | ||||
3366 | 1195830 | return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ; | ||||
3367 | } | |||||
3368 | 4767672 | Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, " | ||||
3369 | "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents); | |||||
3370 | NORETURN_FUNCTION_END; | |||||
3371 | } | |||||
3372 | ||||||
3373 | /* Read a single line of the main body of the swash input text. These are of | |||||
3374 | * the form: | |||||
3375 | * 0053 0056 0073 | |||||
3376 | * where each number is hex. The first two numbers form the minimum and | |||||
3377 | * maximum of a range, and the third is the value associated with the range. | |||||
3378 | * Not all swashes should have a third number | |||||
3379 | * | |||||
3380 | * On input: l points to the beginning of the line to be examined; it points | |||||
3381 | * to somewhere in the string of the whole input text, and is | |||||
3382 | * terminated by a \n or the null string terminator. | |||||
3383 | * lend points to the null terminator of that string | |||||
3384 | * wants_value is non-zero if the swash expects a third number | |||||
3385 | * typestr is the name of the swash's mapping, like 'ToLower' | |||||
3386 | * On output: *min, *max, and *val are set to the values read from the line. | |||||
3387 | * returns a pointer just beyond the line examined. If there was no | |||||
3388 | * valid min number on the line, returns lend+1 | |||||
3389 | */ | |||||
3390 | ||||||
3391 | STATIC U8* | |||||
3392 | 2303340 | S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val, | ||||
3393 | const bool wants_value, const U8* const typestr) | |||||
3394 | { | |||||
3395 | 2303340 | 100 | const int typeto = typestr[0] == 'T' && typestr[1] == 'o'; | |||
100 | ||||||
3396 | STRLEN numlen; /* Length of the number */ | |||||
3397 | 2303340 | I32 flags = PERL_SCAN_SILENT_ILLDIGIT | ||||
3398 | | PERL_SCAN_DISALLOW_PREFIX | |||||
3399 | | PERL_SCAN_SILENT_NON_PORTABLE; | |||||
3400 | ||||||
3401 | /* nl points to the next \n in the scan */ | |||||
3402 | 2303340 | U8* const nl = (U8*)memchr(l, '\n', lend - l); | ||||
3403 | ||||||
3404 | /* Get the first number on the line: the range minimum */ | |||||
3405 | 2303340 | numlen = lend - l; | ||||
3406 | 2303340 | *min = grok_hex((char *)l, &numlen, &flags, NULL); | ||||
3407 | 2303340 | 50 | if (numlen) /* If found a hex number, position past it */ | |||
3408 | 2303340 | l += numlen; | ||||
3409 | 0 | 0 | else if (nl) { /* Else, go handle next line, if any */ | |||
3410 | 0 | return nl + 1; /* 1 is length of "\n" */ | ||||
3411 | } | |||||
3412 | else { /* Else, no next line */ | |||||
3413 | 0 | return lend + 1; /* to LIST's end at which \n is not found */ | ||||
3414 | } | |||||
3415 | ||||||
3416 | /* The max range value follows, separated by a BLANK */ | |||||
3417 | 2303340 | 100 | if (isBLANK(*l)) { | |||
3418 | 2303198 | ++l; | ||||
3419 | 2303198 | flags = PERL_SCAN_SILENT_ILLDIGIT | ||||
3420 | | PERL_SCAN_DISALLOW_PREFIX | |||||
3421 | | PERL_SCAN_SILENT_NON_PORTABLE; | |||||
3422 | 2303198 | numlen = lend - l; | ||||
3423 | 2303198 | *max = grok_hex((char *)l, &numlen, &flags, NULL); | ||||
3424 | 2303198 | 100 | if (numlen) | |||
3425 | 1402214 | l += numlen; | ||||
3426 | else /* If no value here, it is a single element range */ | |||||
3427 | 900984 | *max = *min; | ||||
3428 | ||||||
3429 | /* Non-binary tables have a third entry: what the first element of the | |||||
3430 | * range maps to */ | |||||
3431 | 2303198 | 100 | if (wants_value) { | |||
3432 | 541538 | 50 | if (isBLANK(*l)) { | |||
3433 | 541538 | ++l; | ||||
3434 | ||||||
3435 | /* The ToLc, etc table mappings are not in hex, and must be | |||||
3436 | * corrected by adding the code point to them */ | |||||
3437 | 541538 | 100 | if (typeto) { | |||
3438 | 541128 | char *after_strtol = (char *) lend; | ||||
3439 | 541128 | *val = Strtol((char *)l, &after_strtol, 10); | ||||
3440 | l = (U8 *) after_strtol; | |||||
3441 | } | |||||
3442 | else { /* Other tables are in hex, and are the correct result | |||||
3443 | without tweaking */ | |||||
3444 | 410 | flags = PERL_SCAN_SILENT_ILLDIGIT | ||||
3445 | | PERL_SCAN_DISALLOW_PREFIX | |||||
3446 | | PERL_SCAN_SILENT_NON_PORTABLE; | |||||
3447 | 410 | numlen = lend - l; | ||||
3448 | 410 | *val = grok_hex((char *)l, &numlen, &flags, NULL); | ||||
3449 | 410 | 50 | if (numlen) | |||
3450 | l += numlen; | |||||
3451 | else | |||||
3452 | 0 | *val = 0; | ||||
3453 | } | |||||
3454 | } | |||||
3455 | else { | |||||
3456 | 0 | *val = 0; | ||||
3457 | 0 | 0 | if (typeto) { | |||
3458 | /* diag_listed_as: To%s: illegal mapping '%s' */ | |||||
3459 | 0 | Perl_croak(aTHX_ "%s: illegal mapping '%s'", | ||||
3460 | typestr, l); | |||||
3461 | } | |||||
3462 | } | |||||
3463 | } | |||||
3464 | else | |||||
3465 | 1761660 | *val = 0; /* bits == 1, then any val should be ignored */ | ||||
3466 | } | |||||
3467 | else { /* Nothing following range min, should be single element with no | |||||
3468 | mapping expected */ | |||||
3469 | 142 | *max = *min; | ||||
3470 | 142 | 50 | if (wants_value) { | |||
3471 | 0 | *val = 0; | ||||
3472 | 0 | 0 | if (typeto) { | |||
3473 | /* diag_listed_as: To%s: illegal mapping '%s' */ | |||||
3474 | 0 | Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l); | ||||
3475 | } | |||||
3476 | } | |||||
3477 | else | |||||
3478 | 142 | *val = 0; /* bits == 1, then val should be ignored */ | ||||
3479 | } | |||||
3480 | ||||||
3481 | /* Position to next line if any, or EOF */ | |||||
3482 | 2303340 | 100 | if (nl) | |||
3483 | 2303268 | l = nl + 1; | ||||
3484 | else | |||||
3485 | l = lend; | |||||
3486 | ||||||
3487 | 2303340 | return l; | ||||
3488 | } | |||||
3489 | ||||||
3490 | /* Note: | |||||
3491 | * Returns a swatch (a bit vector string) for a code point sequence | |||||
3492 | * that starts from the value C |
|||||
3493 | * A C |
|||||
3494 | * Should be used via swash_fetch, which will cache the swatch in C |
|||||
3495 | */ | |||||
3496 | STATIC SV* | |||||
3497 | 21300 | S_swatch_get(pTHX_ SV* swash, UV start, UV span) | ||||
3498 | { | |||||
3499 | SV *swatch; | |||||
3500 | U8 *l, *lend, *x, *xend, *s, *send; | |||||
3501 | STRLEN lcur, xcur, scur; | |||||
3502 | 21300 | HV *const hv = MUTABLE_HV(SvRV(swash)); | ||||
3503 | 21300 | SV** const invlistsvp = hv_fetchs(hv, "V", FALSE); | ||||
3504 | ||||||
3505 | SV** listsvp = NULL; /* The string containing the main body of the table */ | |||||
3506 | SV** extssvp = NULL; | |||||
3507 | SV** invert_it_svp = NULL; | |||||
3508 | U8* typestr = NULL; | |||||
3509 | STRLEN bits; | |||||
3510 | STRLEN octets; /* if bits == 1, then octets == 0 */ | |||||
3511 | UV none; | |||||
3512 | 21300 | UV end = start + span; | ||||
3513 | ||||||
3514 | 21300 | 100 | if (invlistsvp == NULL) { | |||
3515 | 1070 | SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); | ||||
3516 | 1070 | SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); | ||||
3517 | 1070 | SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); | ||||
3518 | 1070 | extssvp = hv_fetchs(hv, "EXTRAS", FALSE); | ||||
3519 | 1070 | listsvp = hv_fetchs(hv, "LIST", FALSE); | ||||
3520 | 1070 | invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE); | ||||
3521 | ||||||
3522 | 1070 | 50 | bits = SvUV(*bitssvp); | |||
3523 | 1070 | 50 | none = SvUV(*nonesvp); | |||
3524 | 1070 | 50 | typestr = (U8*)SvPV_nolen(*typesvp); | |||
3525 | } | |||||
3526 | else { | |||||
3527 | bits = 1; | |||||
3528 | none = 0; | |||||
3529 | } | |||||
3530 | 21300 | octets = bits >> 3; /* if bits == 1, then octets == 0 */ | ||||
3531 | ||||||
3532 | PERL_ARGS_ASSERT_SWATCH_GET; | |||||
3533 | ||||||
3534 | 21300 | 100 | if (bits != 1 && bits != 8 && bits != 16 && bits != 32) { | |||
50 | ||||||
3535 | 0 | Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf, | ||||
3536 | (UV)bits); | |||||
3537 | } | |||||
3538 | ||||||
3539 | /* If overflowed, use the max possible */ | |||||
3540 | 21300 | 100 | if (end < start) { | |||
3541 | end = UV_MAX; | |||||
3542 | 10 | span = end - start; | ||||
3543 | } | |||||
3544 | ||||||
3545 | /* create and initialize $swatch */ | |||||
3546 | 21300 | 100 | scur = octets ? (span * octets) : (span + 7) / 8; | |||
3547 | 21300 | swatch = newSV(scur); | ||||
3548 | 21300 | SvPOK_on(swatch); | ||||
3549 | 21300 | s = (U8*)SvPVX(swatch); | ||||
3550 | 21300 | 100 | if (octets && none) { | |||
3551 | 230 | const U8* const e = s + scur; | ||||
3552 | 28889 | 100 | while (s < e) { | |||
3553 | 28544 | 100 | if (bits == 8) | |||
3554 | 4352 | *s++ = (U8)(none & 0xff); | ||||
3555 | 24192 | 100 | else if (bits == 16) { | |||
3556 | 16640 | *s++ = (U8)((none >> 8) & 0xff); | ||||
3557 | 16640 | *s++ = (U8)( none & 0xff); | ||||
3558 | } | |||||
3559 | 7552 | 50 | else if (bits == 32) { | |||
3560 | 7552 | *s++ = (U8)((none >> 24) & 0xff); | ||||
3561 | 7552 | *s++ = (U8)((none >> 16) & 0xff); | ||||
3562 | 7552 | *s++ = (U8)((none >> 8) & 0xff); | ||||
3563 | 18048 | *s++ = (U8)( none & 0xff); | ||||
3564 | } | |||||
3565 | } | |||||
3566 | 230 | *s = '\0'; | ||||
3567 | } | |||||
3568 | else { | |||||
3569 | 21070 | (void)memzero((U8*)s, scur + 1); | ||||
3570 | } | |||||
3571 | 21300 | SvCUR_set(swatch, scur); | ||||
3572 | 21300 | s = (U8*)SvPVX(swatch); | ||||
3573 | ||||||
3574 | 21300 | 100 | if (invlistsvp) { /* If has an inversion list set up use that */ | |||
3575 | 20230 | _invlist_populate_swatch(*invlistsvp, start, end, s); | ||||
3576 | 20230 | return swatch; | ||||
3577 | } | |||||
3578 | ||||||
3579 | /* read $swash->{LIST} */ | |||||
3580 | 1070 | 50 | l = (U8*)SvPV(*listsvp, lcur); | |||
3581 | 1070 | lend = l + lcur; | ||||
3582 | 528023 | 100 | while (l < lend) { | |||
3583 | UV min, max, val, upper; | |||||
3584 | 526418 | l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val, | ||||
3585 | cBOOL(octets), typestr); | |||||
3586 | 526418 | 50 | if (l > lend) { | |||
3587 | break; | |||||
3588 | } | |||||
3589 | ||||||
3590 | /* If looking for something beyond this range, go try the next one */ | |||||
3591 | 526418 | 100 | if (max < start) | |||
3592 | 279486 | continue; | ||||
3593 | ||||||
3594 | /* |
|||||
3595 | * platform's infinity, where we can't go any higher, we want to | |||||
3596 | * include the code point at |
|||||
3597 | upper = (max < end) | |||||
3598 | ? max | |||||
3599 | 252919 | 100 | : (max != UV_MAX || end != UV_MAX) | |||
3600 | ? end - 1 | |||||
3601 | 234958 | 50 | : end; | |||
3602 | ||||||
3603 | 246932 | 50 | if (octets) { | |||
3604 | UV key; | |||||
3605 | 246932 | 100 | if (min < start) { | |||
3606 | 116 | 100 | if (!none || val < none) { | |||
100 | ||||||
3607 | 98 | val += start - min; | ||||
3608 | } | |||||
3609 | 116 | min = start; | ||||
3610 | } | |||||
3611 | 269928 | 100 | for (key = min; key <= upper; key++) { | |||
3612 | STRLEN offset; | |||||
3613 | /* offset must be non-negative (start <= min <= key < end) */ | |||||
3614 | 22996 | offset = octets * (key - start); | ||||
3615 | 22996 | 100 | if (bits == 8) | |||
3616 | 26 | s[offset] = (U8)(val & 0xff); | ||||
3617 | 22970 | 100 | else if (bits == 16) { | |||
3618 | 3092 | s[offset ] = (U8)((val >> 8) & 0xff); | ||||
3619 | 3092 | s[offset + 1] = (U8)( val & 0xff); | ||||
3620 | } | |||||
3621 | 19878 | 50 | else if (bits == 32) { | |||
3622 | 19878 | s[offset ] = (U8)((val >> 24) & 0xff); | ||||
3623 | 19878 | s[offset + 1] = (U8)((val >> 16) & 0xff); | ||||
3624 | 19878 | s[offset + 2] = (U8)((val >> 8) & 0xff); | ||||
3625 | 19878 | s[offset + 3] = (U8)( val & 0xff); | ||||
3626 | } | |||||
3627 | ||||||
3628 | 22996 | 100 | if (!none || val < none) | |||
100 | ||||||
3629 | 20316 | ++val; | ||||
3630 | } | |||||
3631 | } | |||||
3632 | else { /* bits == 1, then val should be ignored */ | |||||
3633 | UV key; | |||||
3634 | 0 | 0 | if (min < start) | |||
3635 | 0 | min = start; | ||||
3636 | ||||||
3637 | 263209 | 0 | for (key = min; key <= upper; key++) { | |||
3638 | 0 | const STRLEN offset = (STRLEN)(key - start); | ||||
3639 | 0 | s[offset >> 3] |= 1 << (offset & 7); | ||||
3640 | } | |||||
3641 | } | |||||
3642 | } /* while */ | |||||
3643 | ||||||
3644 | /* Invert if the data says it should be. Assumes that bits == 1 */ | |||||
3645 | 1070 | 100 | if (invert_it_svp && SvUV(*invert_it_svp)) { | |||
50 | ||||||
50 | ||||||
3646 | ||||||
3647 | /* Unicode properties should come with all bits above PERL_UNICODE_MAX | |||||
3648 | * be 0, and their inversion should also be 0, as we don't succeed any | |||||
3649 | * Unicode property matches for non-Unicode code points */ | |||||
3650 | 0 | 0 | if (start <= PERL_UNICODE_MAX) { | |||
3651 | ||||||
3652 | /* The code below assumes that we never cross the | |||||
3653 | * Unicode/above-Unicode boundary in a range, as otherwise we would | |||||
3654 | * have to figure out where to stop flipping the bits. Since this | |||||
3655 | * boundary is divisible by a large power of 2, and swatches comes | |||||
3656 | * in small powers of 2, this should be a valid assumption */ | |||||
3657 | assert(start + span - 1 <= PERL_UNICODE_MAX); | |||||
3658 | ||||||
3659 | 0 | send = s + scur; | ||||
3660 | 0 | 0 | while (s < send) { | |||
3661 | 0 | *s = ~(*s); | ||||
3662 | 0 | s++; | ||||
3663 | } | |||||
3664 | } | |||||
3665 | } | |||||
3666 | ||||||
3667 | /* read $swash->{EXTRAS} | |||||
3668 | * This code also copied to swash_to_invlist() below */ | |||||
3669 | 1070 | 50 | x = (U8*)SvPV(*extssvp, xcur); | |||
3670 | 1070 | xend = x + xcur; | ||||
3671 | 11950 | 100 | while (x < xend) { | |||
3672 | STRLEN namelen; | |||||
3673 | U8 *namestr; | |||||
3674 | SV** othersvp; | |||||
3675 | HV* otherhv; | |||||
3676 | STRLEN otherbits; | |||||
3677 | SV **otherbitssvp, *other; | |||||
3678 | U8 *s, *o, *nl; | |||||
3679 | STRLEN slen, olen; | |||||
3680 | ||||||
3681 | 230 | const U8 opc = *x++; | ||||
3682 | 230 | 50 | if (opc == '\n') | |||
3683 | 0 | continue; | ||||
3684 | ||||||
3685 | 230 | nl = (U8*)memchr(x, '\n', xend - x); | ||||
3686 | ||||||
3687 | 230 | 50 | if (opc != '-' && opc != '+' && opc != '!' && opc != '&') { | |||
50 | ||||||
3688 | 230 | 50 | if (nl) { | |||
3689 | 230 | x = nl + 1; /* 1 is length of "\n" */ | ||||
3690 | 230 | continue; | ||||
3691 | } | |||||
3692 | else { | |||||
3693 | x = xend; /* to EXTRAS' end at which \n is not found */ | |||||
3694 | break; | |||||
3695 | } | |||||
3696 | } | |||||
3697 | ||||||
3698 | namestr = x; | |||||
3699 | 0 | 0 | if (nl) { | |||
3700 | 0 | namelen = nl - namestr; | ||||
3701 | 0 | x = nl + 1; | ||||
3702 | } | |||||
3703 | else { | |||||
3704 | 0 | namelen = xend - namestr; | ||||
3705 | x = xend; | |||||
3706 | } | |||||
3707 | ||||||
3708 | 0 | othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE); | ||||
3709 | 0 | otherhv = MUTABLE_HV(SvRV(*othersvp)); | ||||
3710 | 0 | otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE); | ||||
3711 | 0 | 0 | otherbits = (STRLEN)SvUV(*otherbitssvp); | |||
3712 | 0 | 0 | if (bits < otherbits) | |||
3713 | 0 | Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, " | ||||
3714 | "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits); | |||||
3715 | ||||||
3716 | /* The "other" swatch must be destroyed after. */ | |||||
3717 | 0 | other = swatch_get(*othersvp, start, span); | ||||
3718 | 0 | 0 | o = (U8*)SvPV(other, olen); | |||
3719 | ||||||
3720 | 0 | 0 | if (!olen) | |||
3721 | 0 | Perl_croak(aTHX_ "panic: swatch_get got improper swatch"); | ||||
3722 | ||||||
3723 | 0 | 0 | s = (U8*)SvPV(swatch, slen); | |||
3724 | 0 | 0 | if (bits == 1 && otherbits == 1) { | |||
3725 | 0 | 0 | if (slen != olen) | |||
3726 | 0 | Perl_croak(aTHX_ "panic: swatch_get found swatch length " | ||||
3727 | "mismatch, slen=%"UVuf", olen=%"UVuf, | |||||
3728 | (UV)slen, (UV)olen); | |||||
3729 | ||||||
3730 | 0 | switch (opc) { | ||||
3731 | case '+': | |||||
3732 | 0 | 0 | while (slen--) | |||
3733 | 0 | *s++ |= *o++; | ||||
3734 | break; | |||||
3735 | case '!': | |||||
3736 | 0 | 0 | while (slen--) | |||
3737 | 0 | *s++ |= ~*o++; | ||||
3738 | break; | |||||
3739 | case '-': | |||||
3740 | 0 | 0 | while (slen--) | |||
3741 | 0 | *s++ &= ~*o++; | ||||
3742 | break; | |||||
3743 | case '&': | |||||
3744 | 0 | 0 | while (slen--) | |||
3745 | 0 | *s++ &= *o++; | ||||
3746 | break; | |||||
3747 | default: | |||||
3748 | break; | |||||
3749 | } | |||||
3750 | } | |||||
3751 | else { | |||||
3752 | 0 | STRLEN otheroctets = otherbits >> 3; | ||||
3753 | STRLEN offset = 0; | |||||
3754 | 0 | U8* const send = s + slen; | ||||
3755 | ||||||
3756 | 0 | 0 | while (s < send) { | |||
3757 | UV otherval = 0; | |||||
3758 | ||||||
3759 | 0 | 0 | if (otherbits == 1) { | |||
3760 | 0 | otherval = (o[offset >> 3] >> (offset & 7)) & 1; | ||||
3761 | 0 | ++offset; | ||||
3762 | } | |||||
3763 | else { | |||||
3764 | STRLEN vlen = otheroctets; | |||||
3765 | 0 | otherval = *o++; | ||||
3766 | 0 | 0 | while (--vlen) { | |||
3767 | 0 | otherval <<= 8; | ||||
3768 | 0 | otherval |= *o++; | ||||
3769 | } | |||||
3770 | } | |||||
3771 | ||||||
3772 | 0 | 0 | if (opc == '+' && otherval) | |||
3773 | NOOP; /* replace with otherval */ | |||||
3774 | 0 | 0 | else if (opc == '!' && !otherval) | |||
3775 | otherval = 1; | |||||
3776 | 0 | 0 | else if (opc == '-' && otherval) | |||
3777 | otherval = 0; | |||||
3778 | 0 | 0 | else if (opc == '&' && !otherval) | |||
3779 | otherval = 0; | |||||
3780 | else { | |||||
3781 | 0 | s += octets; /* no replacement */ | ||||
3782 | 0 | continue; | ||||
3783 | } | |||||
3784 | ||||||
3785 | 0 | 0 | if (bits == 8) | |||
3786 | 0 | *s++ = (U8)( otherval & 0xff); | ||||
3787 | 0 | 0 | else if (bits == 16) { | |||
3788 | 0 | *s++ = (U8)((otherval >> 8) & 0xff); | ||||
3789 | 0 | *s++ = (U8)( otherval & 0xff); | ||||
3790 | } | |||||
3791 | 0 | 0 | else if (bits == 32) { | |||
3792 | 0 | *s++ = (U8)((otherval >> 24) & 0xff); | ||||
3793 | 0 | *s++ = (U8)((otherval >> 16) & 0xff); | ||||
3794 | 0 | *s++ = (U8)((otherval >> 8) & 0xff); | ||||
3795 | 0 | *s++ = (U8)( otherval & 0xff); | ||||
3796 | } | |||||
3797 | } | |||||
3798 | } | |||||
3799 | 115 | sv_free(other); /* through with it! */ | ||||
3800 | } /* while */ | |||||
3801 | return swatch; | |||||
3802 | } | |||||
3803 | ||||||
3804 | HV* | |||||
3805 | 24 | Perl__swash_inversion_hash(pTHX_ SV* const swash) | ||||
3806 | { | |||||
3807 | ||||||
3808 | /* Subject to change or removal. For use only in regcomp.c and regexec.c | |||||
3809 | * Can't be used on a property that is subject to user override, as it | |||||
3810 | * relies on the value of SPECIALS in the swash which would be set by | |||||
3811 | * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set | |||||
3812 | * for overridden properties | |||||
3813 | * | |||||
3814 | * Returns a hash which is the inversion and closure of a swash mapping. | |||||
3815 | * For example, consider the input lines: | |||||
3816 | * 004B 006B | |||||
3817 | * 004C 006C | |||||
3818 | * 212A 006B | |||||
3819 | * | |||||
3820 | * The returned hash would have two keys, the utf8 for 006B and the utf8 for | |||||
3821 | * 006C. The value for each key is an array. For 006C, the array would | |||||
3822 | * have two elements, the utf8 for itself, and for 004C. For 006B, there | |||||
3823 | * would be three elements in its array, the utf8 for 006B, 004B and 212A. | |||||
3824 | * | |||||
3825 | * Essentially, for any code point, it gives all the code points that map to | |||||
3826 | * it, or the list of 'froms' for that point. | |||||
3827 | * | |||||
3828 | * Currently it ignores any additions or deletions from other swashes, | |||||
3829 | * looking at just the main body of the swash, and if there are SPECIALS | |||||
3830 | * in the swash, at that hash | |||||
3831 | * | |||||
3832 | * The specials hash can be extra code points, and most likely consists of | |||||
3833 | * maps from single code points to multiple ones (each expressed as a string | |||||
3834 | * of utf8 characters). This function currently returns only 1-1 mappings. | |||||
3835 | * However consider this possible input in the specials hash: | |||||
3836 | * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074 | |||||
3837 | * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074 | |||||
3838 | * | |||||
3839 | * Both FB05 and FB06 map to the same multi-char sequence, which we don't | |||||
3840 | * currently handle. But it also means that FB05 and FB06 are equivalent in | |||||
3841 | * a 1-1 mapping which we should handle, and this relationship may not be in | |||||
3842 | * the main table. Therefore this function examines all the multi-char | |||||
3843 | * sequences and adds the 1-1 mappings that come out of that. */ | |||||
3844 | ||||||
3845 | U8 *l, *lend; | |||||
3846 | STRLEN lcur; | |||||
3847 | 24 | HV *const hv = MUTABLE_HV(SvRV(swash)); | ||||
3848 | ||||||
3849 | /* The string containing the main body of the table. This will have its | |||||
3850 | * assertion fail if the swash has been converted to its inversion list */ | |||||
3851 | 24 | SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); | ||||
3852 | ||||||
3853 | 24 | SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); | ||||
3854 | 24 | SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); | ||||
3855 | 24 | SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); | ||||
3856 | /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/ | |||||
3857 | 24 | 50 | const U8* const typestr = (U8*)SvPV_nolen(*typesvp); | |||
3858 | 24 | 50 | const STRLEN bits = SvUV(*bitssvp); | |||
3859 | 24 | const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */ | ||||
3860 | 24 | 50 | const UV none = SvUV(*nonesvp); | |||
3861 | 24 | SV **specials_p = hv_fetchs(hv, "SPECIALS", 0); | ||||
3862 | ||||||
3863 | 24 | HV* ret = newHV(); | ||||
3864 | ||||||
3865 | PERL_ARGS_ASSERT__SWASH_INVERSION_HASH; | |||||
3866 | ||||||
3867 | /* Must have at least 8 bits to get the mappings */ | |||||
3868 | 24 | 50 | if (bits != 8 && bits != 16 && bits != 32) { | |||
50 | ||||||
3869 | 0 | Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf, | ||||
3870 | (UV)bits); | |||||
3871 | } | |||||
3872 | ||||||
3873 | 24 | 50 | if (specials_p) { /* It might be "special" (sometimes, but not always, a | |||
3874 | mapping to more than one character */ | |||||
3875 | ||||||
3876 | /* Construct an inverse mapping hash for the specials */ | |||||
3877 | 24 | HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p)); | ||||
3878 | 24 | HV * specials_inverse = newHV(); | ||||
3879 | char *char_from; /* the lhs of the map */ | |||||
3880 | I32 from_len; /* its byte length */ | |||||
3881 | char *char_to; /* the rhs of the map */ | |||||
3882 | I32 to_len; /* its byte length */ | |||||
3883 | SV *sv_to; /* and in a sv */ | |||||
3884 | AV* from_list; /* list of things that map to each 'to' */ | |||||
3885 | ||||||
3886 | 24 | hv_iterinit(specials_hv); | ||||
3887 | ||||||
3888 | /* The keys are the characters (in utf8) that map to the corresponding | |||||
3889 | * utf8 string value. Iterate through the list creating the inverse | |||||
3890 | * list. */ | |||||
3891 | 2532 | 100 | while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) { | |||
3892 | SV** listp; | |||||
3893 | 2496 | 50 | if (! SvPOK(sv_to)) { | |||
3894 | 0 | Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() " | ||||
3895 | "unexpectedly is not a string, flags=%lu", | |||||
3896 | 0 | (unsigned long)SvFLAGS(sv_to)); | ||||
3897 | } | |||||
3898 | /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/ | |||||
3899 | ||||||
3900 | /* Each key in the inverse list is a mapped-to value, and the key's | |||||
3901 | * hash value is a list of the strings (each in utf8) that map to | |||||
3902 | * it. Those strings are all one character long */ | |||||
3903 | 2496 | 100 | if ((listp = hv_fetch(specials_inverse, | |||
3904 | SvPVX(sv_to), | |||||
3905 | SvCUR(sv_to), 0))) | |||||
3906 | { | |||||
3907 | 744 | from_list = (AV*) *listp; | ||||
3908 | } | |||||
3909 | else { /* No entry yet for it: create one */ | |||||
3910 | 1752 | from_list = newAV(); | ||||
3911 | 1752 | 50 | if (! hv_store(specials_inverse, | |||
3912 | SvPVX(sv_to), | |||||
3913 | SvCUR(sv_to), | |||||
3914 | (SV*) from_list, 0)) | |||||
3915 | { | |||||
3916 | 0 | Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); | ||||
3917 | } | |||||
3918 | } | |||||
3919 | ||||||
3920 | /* Here have the list associated with this 'to' (perhaps newly | |||||
3921 | * created and empty). Just add to it. Note that we ASSUME that | |||||
3922 | * the input is guaranteed to not have duplications, so we don't | |||||
3923 | * check for that. Duplications just slow down execution time. */ | |||||
3924 | 2496 | av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE)); | ||||
3925 | } | |||||
3926 | ||||||
3927 | /* Here, 'specials_inverse' contains the inverse mapping. Go through | |||||
3928 | * it looking for cases like the FB05/FB06 examples above. There would | |||||
3929 | * be an entry in the hash like | |||||
3930 | * 'st' => [ FB05, FB06 ] | |||||
3931 | * In this example we will create two lists that get stored in the | |||||
3932 | * returned hash, 'ret': | |||||
3933 | * FB05 => [ FB05, FB06 ] | |||||
3934 | * FB06 => [ FB05, FB06 ] | |||||
3935 | * | |||||
3936 | * Note that there is nothing to do if the array only has one element. | |||||
3937 | * (In the normal 1-1 case handled below, we don't have to worry about | |||||
3938 | * two lists, as everything gets tied to the single list that is | |||||
3939 | * generated for the single character 'to'. But here, we are omitting | |||||
3940 | * that list, ('st' in the example), so must have multiple lists.) */ | |||||
3941 | 1776 | 100 | while ((from_list = (AV *) hv_iternextsv(specials_inverse, | |||
3942 | &char_to, &to_len))) | |||||
3943 | { | |||||
3944 | 1752 | 100 | if (av_len(from_list) > 0) { | |||
3945 | SSize_t i; | |||||
3946 | ||||||
3947 | /* We iterate over all combinations of i,j to place each code | |||||
3948 | * point on each list */ | |||||
3949 | 1872 | 100 | for (i = 0; i <= av_len(from_list); i++) { | |||
3950 | SSize_t j; | |||||
3951 | 1488 | AV* i_list = newAV(); | ||||
3952 | 1488 | SV** entryp = av_fetch(from_list, i, FALSE); | ||||
3953 | 1488 | 50 | if (entryp == NULL) { | |||
3954 | 0 | Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); | ||||
3955 | } | |||||
3956 | 1488 | 50 | if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) { | |||
3957 | 0 | Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp)); | ||||
3958 | } | |||||
3959 | 1488 | 50 | if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp), | |||
3960 | (SV*) i_list, FALSE)) | |||||
3961 | { | |||||
3962 | 0 | Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); | ||||
3963 | } | |||||
3964 | ||||||
3965 | /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ | |||||
3966 | 3720 | 100 | for (j = 0; j <= av_len(from_list); j++) { | |||
3967 | 2976 | entryp = av_fetch(from_list, j, FALSE); | ||||
3968 | 2976 | 50 | if (entryp == NULL) { | |||
3969 | 0 | Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); | ||||
3970 | } | |||||
3971 | ||||||
3972 | /* When i==j this adds itself to the list */ | |||||
3973 | 2976 | 50 | av_push(i_list, newSVuv(utf8_to_uvchr_buf( | |||
3974 | (U8*) SvPVX(*entryp), | |||||
3975 | (U8*) SvPVX(*entryp) + SvCUR(*entryp), | |||||
3976 | 0))); | |||||
3977 | /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/ | |||||
3978 | } | |||||
3979 | } | |||||
3980 | } | |||||
3981 | } | |||||
3982 | 24 | SvREFCNT_dec(specials_inverse); /* done with it */ | ||||
3983 | } /* End of specials */ | |||||
3984 | ||||||
3985 | /* read $swash->{LIST} */ | |||||
3986 | 24 | 50 | l = (U8*)SvPV(*listsvp, lcur); | |||
3987 | 24 | lend = l + lcur; | ||||
3988 | ||||||
3989 | /* Go through each input line */ | |||||
3990 | 15156 | 100 | while (l < lend) { | |||
3991 | UV min, max, val; | |||||
3992 | UV inverse; | |||||
3993 | 15120 | l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val, | ||||
3994 | cBOOL(octets), typestr); | |||||
3995 | 15120 | 50 | if (l > lend) { | |||
3996 | break; | |||||
3997 | } | |||||
3998 | ||||||
3999 | /* Each element in the range is to be inverted */ | |||||
4000 | 40440 | 100 | for (inverse = min; inverse <= max; inverse++) { | |||
4001 | AV* list; | |||||
4002 | SV** listp; | |||||
4003 | IV i; | |||||
4004 | bool found_key = FALSE; | |||||
4005 | bool found_inverse = FALSE; | |||||
4006 | ||||||
4007 | /* The key is the inverse mapping */ | |||||
4008 | char key[UTF8_MAXBYTES+1]; | |||||
4009 | 25320 | char* key_end = (char *) uvchr_to_utf8((U8*) key, val); | ||||
4010 | 25320 | STRLEN key_len = key_end - key; | ||||
4011 | ||||||
4012 | /* Get the list for the map */ | |||||
4013 | 25320 | 100 | if ((listp = hv_fetch(ret, key, key_len, FALSE))) { | |||
4014 | 1176 | list = (AV*) *listp; | ||||
4015 | } | |||||
4016 | else { /* No entry yet for it: create one */ | |||||
4017 | 24144 | list = newAV(); | ||||
4018 | 24144 | 50 | if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) { | |||
4019 | 0 | Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); | ||||
4020 | } | |||||
4021 | } | |||||
4022 | ||||||
4023 | /* Look through list to see if this inverse mapping already is | |||||
4024 | * listed, or if there is a mapping to itself already */ | |||||
4025 | 27048 | 100 | for (i = 0; i <= av_len(list); i++) { | |||
4026 | 2400 | SV** entryp = av_fetch(list, i, FALSE); | ||||
4027 | SV* entry; | |||||
4028 | 2400 | 50 | if (entryp == NULL) { | |||
4029 | 0 | Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); | ||||
4030 | } | |||||
4031 | 2400 | entry = *entryp; | ||||
4032 | /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/ | |||||
4033 | 2400 | 50 | if (SvUV(entry) == val) { | |||
100 | ||||||
4034 | found_key = TRUE; | |||||
4035 | } | |||||
4036 | 2400 | 50 | if (SvUV(entry) == inverse) { | |||
100 | ||||||
4037 | found_inverse = TRUE; | |||||
4038 | } | |||||
4039 | ||||||
4040 | /* No need to continue searching if found everything we are | |||||
4041 | * looking for */ | |||||
4042 | 2400 | 100 | if (found_key && found_inverse) { | |||
100 | ||||||
4043 | break; | |||||
4044 | } | |||||
4045 | } | |||||
4046 | ||||||
4047 | /* Make sure there is a mapping to itself on the list */ | |||||
4048 | 25320 | 100 | if (! found_key) { | |||
4049 | 24144 | av_push(list, newSVuv(val)); | ||||
4050 | /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));*/ | |||||
4051 | } | |||||
4052 | ||||||
4053 | ||||||
4054 | /* Simply add the value to the list */ | |||||
4055 | 25320 | 100 | if (! found_inverse) { | |||
4056 | 24648 | av_push(list, newSVuv(inverse)); | ||||
4057 | /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));*/ | |||||
4058 | } | |||||
4059 | ||||||
4060 | /* swatch_get() increments the value of val for each element in the | |||||
4061 | * range. That makes more compact tables possible. You can | |||||
4062 | * express the capitalization, for example, of all consecutive | |||||
4063 | * letters with a single line: 0061\t007A\t0041 This maps 0061 to | |||||
4064 | * 0041, 0062 to 0042, etc. I (khw) have never understood 'none', | |||||
4065 | * and it's not documented; it appears to be used only in | |||||
4066 | * implementing tr//; I copied the semantics from swatch_get(), just | |||||
4067 | * in case */ | |||||
4068 | 25320 | 50 | if (!none || val < none) { | |||
0 | ||||||
4069 | 25320 | ++val; | ||||
4070 | } | |||||
4071 | } | |||||
4072 | } | |||||
4073 | ||||||
4074 | 24 | return ret; | ||||
4075 | } | |||||
4076 | ||||||
4077 | SV* | |||||
4078 | 6680 | Perl__swash_to_invlist(pTHX_ SV* const swash) | ||||
4079 | { | |||||
4080 | ||||||
4081 | /* Subject to change or removal. For use only in one place in regcomp.c. | |||||
4082 | * Ownership is given to one reference count in the returned SV* */ | |||||
4083 | ||||||
4084 | U8 *l, *lend; | |||||
4085 | char *loc; | |||||
4086 | STRLEN lcur; | |||||
4087 | 6680 | HV *const hv = MUTABLE_HV(SvRV(swash)); | ||||
4088 | UV elements = 0; /* Number of elements in the inversion list */ | |||||
4089 | 6680 | U8 empty[] = ""; | ||||
4090 | SV** listsvp; | |||||
4091 | SV** typesvp; | |||||
4092 | SV** bitssvp; | |||||
4093 | SV** extssvp; | |||||
4094 | SV** invert_it_svp; | |||||
4095 | ||||||
4096 | U8* typestr; | |||||
4097 | STRLEN bits; | |||||
4098 | STRLEN octets; /* if bits == 1, then octets == 0 */ | |||||
4099 | U8 *x, *xend; | |||||
4100 | STRLEN xcur; | |||||
4101 | ||||||
4102 | SV* invlist; | |||||
4103 | ||||||
4104 | PERL_ARGS_ASSERT__SWASH_TO_INVLIST; | |||||
4105 | ||||||
4106 | /* If not a hash, it must be the swash's inversion list instead */ | |||||
4107 | 6680 | 100 | if (SvTYPE(hv) != SVt_PVHV) { | |||
4108 | 28 | return SvREFCNT_inc_simple_NN((SV*) hv); | ||||
4109 | } | |||||
4110 | ||||||
4111 | /* The string containing the main body of the table */ | |||||
4112 | 6652 | listsvp = hv_fetchs(hv, "LIST", FALSE); | ||||
4113 | 6652 | typesvp = hv_fetchs(hv, "TYPE", FALSE); | ||||
4114 | 6652 | bitssvp = hv_fetchs(hv, "BITS", FALSE); | ||||
4115 | 6652 | extssvp = hv_fetchs(hv, "EXTRAS", FALSE); | ||||
4116 | 6652 | invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE); | ||||
4117 | ||||||
4118 | 6652 | 50 | typestr = (U8*)SvPV_nolen(*typesvp); | |||
4119 | 6652 | 50 | bits = SvUV(*bitssvp); | |||
4120 | 6652 | octets = bits >> 3; /* if bits == 1, then octets == 0 */ | ||||
4121 | ||||||
4122 | /* read $swash->{LIST} */ | |||||
4123 | 6652 | 50 | if (SvPOK(*listsvp)) { | |||
4124 | 6652 | 50 | l = (U8*)SvPV(*listsvp, lcur); | |||
4125 | } | |||||
4126 | else { | |||||
4127 | /* LIST legitimately doesn't contain a string during compilation phases | |||||
4128 | * of Perl itself, before the Unicode tables are generated. In this | |||||
4129 | * case, just fake things up by creating an empty list */ | |||||
4130 | l = empty; | |||||
4131 | 0 | lcur = 0; | ||||
4132 | } | |||||
4133 | loc = (char *) l; | |||||
4134 | 6652 | lend = l + lcur; | ||||
4135 | ||||||
4136 | /* Scan the input to count the number of lines to preallocate array size | |||||
4137 | * based on worst possible case, which is each line in the input creates 2 | |||||
4138 | * elements in the inversion list: 1) the beginning of a range in the list; | |||||
4139 | * 2) the beginning of a range not in the list. */ | |||||
4140 | 1771708 | 100 | while ((loc = (strchr(loc, '\n'))) != NULL) { | |||
4141 | 1761730 | elements += 2; | ||||
4142 | 1761730 | loc++; | ||||
4143 | } | |||||
4144 | ||||||
4145 | /* If the ending is somehow corrupt and isn't a new line, add another | |||||
4146 | * element for the final range that isn't in the inversion list */ | |||||
4147 | 9978 | 50 | if (! (*lend == '\n' | |||
50 | ||||||
4148 | 9978 | 100 | || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n')))) | |||
100 | ||||||
4149 | { | |||||
4150 | 72 | elements++; | ||||
4151 | } | |||||
4152 | ||||||
4153 | 6652 | invlist = _new_invlist(elements); | ||||
4154 | ||||||
4155 | /* Now go through the input again, adding each range to the list */ | |||||
4156 | 1771780 | 100 | while (l < lend) { | |||
4157 | UV start, end; | |||||
4158 | UV val; /* Not used by this function */ | |||||
4159 | ||||||
4160 | 1761802 | l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val, | ||||
4161 | cBOOL(octets), typestr); | |||||
4162 | ||||||
4163 | 1761802 | 50 | if (l > lend) { | |||
4164 | break; | |||||
4165 | } | |||||
4166 | ||||||
4167 | 1761802 | invlist = _add_range_to_invlist(invlist, start, end); | ||||
4168 | } | |||||
4169 | ||||||
4170 | /* Invert if the data says it should be */ | |||||
4171 | 6652 | 100 | if (invert_it_svp && SvUV(*invert_it_svp)) { | |||
50 | ||||||
100 | ||||||
4172 | 94 | _invlist_invert_prop(invlist); | ||||
4173 | } | |||||
4174 | ||||||
4175 | /* This code is copied from swatch_get() | |||||
4176 | * read $swash->{EXTRAS} */ | |||||
4177 | 6652 | 50 | x = (U8*)SvPV(*extssvp, xcur); | |||
4178 | 6652 | xend = x + xcur; | ||||
4179 | 11942 | 100 | while (x < xend) { | |||
4180 | STRLEN namelen; | |||||
4181 | U8 *namestr; | |||||
4182 | SV** othersvp; | |||||
4183 | HV* otherhv; | |||||
4184 | STRLEN otherbits; | |||||
4185 | SV **otherbitssvp, *other; | |||||
4186 | U8 *nl; | |||||
4187 | ||||||
4188 | 1964 | const U8 opc = *x++; | ||||
4189 | 1964 | 100 | if (opc == '\n') | |||
4190 | 2 | continue; | ||||
4191 | ||||||
4192 | 1962 | nl = (U8*)memchr(x, '\n', xend - x); | ||||
4193 | ||||||
4194 | 1962 | 100 | if (opc != '-' && opc != '+' && opc != '!' && opc != '&') { | |||
100 | ||||||
4195 | 844 | 50 | if (nl) { | |||
4196 | 844 | x = nl + 1; /* 1 is length of "\n" */ | ||||
4197 | 844 | continue; | ||||
4198 | } | |||||
4199 | else { | |||||
4200 | x = xend; /* to EXTRAS' end at which \n is not found */ | |||||
4201 | break; | |||||
4202 | } | |||||
4203 | } | |||||
4204 | ||||||
4205 | namestr = x; | |||||
4206 | 1118 | 100 | if (nl) { | |||
4207 | 1104 | namelen = nl - namestr; | ||||
4208 | 1104 | x = nl + 1; | ||||
4209 | } | |||||
4210 | else { | |||||
4211 | 14 | namelen = xend - namestr; | ||||
4212 | x = xend; | |||||
4213 | } | |||||
4214 | ||||||
4215 | 1118 | othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE); | ||||
4216 | 1118 | otherhv = MUTABLE_HV(SvRV(*othersvp)); | ||||
4217 | 1118 | otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE); | ||||
4218 | 1118 | 50 | otherbits = (STRLEN)SvUV(*otherbitssvp); | |||
4219 | ||||||
4220 | 1118 | 50 | if (bits != otherbits || bits != 1) { | |||
4221 | 0 | Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean " | ||||
4222 | "properties, bits=%"UVuf", otherbits=%"UVuf, | |||||
4223 | (UV)bits, (UV)otherbits); | |||||
4224 | } | |||||
4225 | ||||||
4226 | /* The "other" swatch must be destroyed after. */ | |||||
4227 | 1118 | other = _swash_to_invlist((SV *)*othersvp); | ||||
4228 | ||||||
4229 | /* End of code copied from swatch_get() */ | |||||
4230 | 1118 | switch (opc) { | ||||
4231 | case '+': | |||||
4232 | 710 | _invlist_union(invlist, other, &invlist); | ||||
4233 | 710 | break; | ||||
4234 | case '!': | |||||
4235 | 310 | _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist); | ||||
4236 | 310 | break; | ||||
4237 | case '-': | |||||
4238 | 80 | _invlist_subtract(invlist, other, &invlist); | ||||
4239 | 80 | break; | ||||
4240 | case '&': | |||||
4241 | 18 | _invlist_intersection(invlist, other, &invlist); | ||||
4242 | 18 | break; | ||||
4243 | default: | |||||
4244 | break; | |||||
4245 | } | |||||
4246 | 1541 | sv_free(other); /* through with it! */ | ||||
4247 | } | |||||
4248 | ||||||
4249 | 6666 | return invlist; | ||||
4250 | } | |||||
4251 | ||||||
4252 | SV* | |||||
4253 | 180730 | Perl__get_swash_invlist(pTHX_ SV* const swash) | ||||
4254 | { | |||||
4255 | SV** ptr; | |||||
4256 | ||||||
4257 | PERL_ARGS_ASSERT__GET_SWASH_INVLIST; | |||||
4258 | ||||||
4259 | 180730 | 50 | if (! SvROK(swash)) { | |||
4260 | return NULL; | |||||
4261 | } | |||||
4262 | ||||||
4263 | /* If it really isn't a hash, it isn't really swash; must be an inversion | |||||
4264 | * list */ | |||||
4265 | 180730 | 50 | if (SvTYPE(SvRV(swash)) != SVt_PVHV) { | |||
4266 | 0 | return SvRV(swash); | ||||
4267 | } | |||||
4268 | ||||||
4269 | 180730 | ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE); | ||||
4270 | 180730 | 50 | if (! ptr) { | |||
4271 | return NULL; | |||||
4272 | } | |||||
4273 | ||||||
4274 | 180730 | return *ptr; | ||||
4275 | } | |||||
4276 | ||||||
4277 | bool | |||||
4278 | 21060 | Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) | ||||
4279 | { | |||||
4280 | /* May change: warns if surrogates, non-character code points, or | |||||
4281 | * non-Unicode code points are in s which has length len bytes. Returns | |||||
4282 | * TRUE if none found; FALSE otherwise. The only other validity check is | |||||
4283 | * to make sure that this won't exceed the string's length */ | |||||
4284 | ||||||
4285 | 21060 | const U8* const e = s + len; | ||||
4286 | bool ok = TRUE; | |||||
4287 | ||||||
4288 | PERL_ARGS_ASSERT_CHECK_UTF8_PRINT; | |||||
4289 | ||||||
4290 | 2379248 | 100 | while (s < e) { | |||
4291 | 2347658 | 50 | if (UTF8SKIP(s) > len) { | |||
4292 | 0 | 0 | Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), | |||
4293 | 0 | 0 | "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print"); | |||
0 | ||||||
4294 | 0 | return FALSE; | ||||
4295 | } | |||||
4296 | 2347658 | 100 | if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) { | |||
4297 | STRLEN char_len; | |||||
4298 | 9974 | 100 | if (UTF8_IS_SUPER(s)) { | |||
50 | ||||||
100 | ||||||
4299 | 12 | 100 | if (ckWARN_d(WARN_NON_UNICODE)) { | |||
4300 | 10 | 100 | UV uv = utf8_to_uvchr_buf(s, e, &char_len); | |||
4301 | 10 | Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), | ||||
4302 | "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv); | |||||
4303 | ok = FALSE; | |||||
4304 | } | |||||
4305 | } | |||||
4306 | 9962 | 100 | else if (UTF8_IS_SURROGATE(s)) { | |||
100 | ||||||
4307 | 14 | 100 | if (ckWARN_d(WARN_SURROGATE)) { | |||
4308 | 12 | 100 | UV uv = utf8_to_uvchr_buf(s, e, &char_len); | |||
4309 | 12 | Perl_warner(aTHX_ packWARN(WARN_SURROGATE), | ||||
4310 | "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv); | |||||
4311 | ok = FALSE; | |||||
4312 | } | |||||
4313 | } | |||||
4314 | else if | |||||
4315 | 32667 | 100 | ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s)) | |||
100 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
50 | ||||||
100 | ||||||
50 | ||||||
50 | ||||||
100 | ||||||
4316 | 31863 | 100 | && (ckWARN_d(WARN_NONCHAR))) | |||
100 | ||||||
100 | ||||||
100 | ||||||
100 | ||||||
100 | ||||||
50 | ||||||
100 | ||||||
100 | ||||||
4317 | { | |||||
4318 | 80 | 100 | UV uv = utf8_to_uvchr_buf(s, e, &char_len); | |||
4319 | 80 | Perl_warner(aTHX_ packWARN(WARN_NONCHAR), | ||||
4320 | "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv); | |||||
4321 | ok = FALSE; | |||||
4322 | } | |||||
4323 | } | |||||
4324 | 2347658 | s += UTF8SKIP(s); | ||||
4325 | } | |||||
4326 | ||||||
4327 | return ok; | |||||
4328 | } | |||||
4329 | ||||||
4330 | /* | |||||
4331 | =for apidoc pv_uni_display | |||||
4332 | ||||||
4333 | Build to the scalar C |
|||||
4334 | length C |
|||||
4335 | (if longer, the rest is truncated and "..." will be appended). | |||||
4336 | ||||||
4337 | The C |
|||||
4338 | isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH | |||||
4339 | to display the \\[nrfta\\] as the backslashed versions (like '\n') | |||||
4340 | (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\). | |||||
4341 | UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both | |||||
4342 | UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on. | |||||
4343 | ||||||
4344 | The pointer to the PV of the C |
|||||
4345 | ||||||
4346 | =cut */ | |||||
4347 | char * | |||||
4348 | 33996 | Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags) | ||||
4349 | { | |||||
4350 | int truncated = 0; | |||||
4351 | const char *s, *e; | |||||
4352 | ||||||
4353 | PERL_ARGS_ASSERT_PV_UNI_DISPLAY; | |||||
4354 | ||||||
4355 | 33996 | sv_setpvs(dsv, ""); | ||||
4356 | 33996 | SvUTF8_off(dsv); | ||||
4357 | 68154 | 100 | for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { | |||
4358 | UV u; | |||||
4359 | /* This serves double duty as a flag and a character to print after | |||||
4360 | a \ when flags & UNI_DISPLAY_BACKSLASH is true. | |||||
4361 | */ | |||||
4362 | char ok = 0; | |||||
4363 | ||||||
4364 | 34172 | 50 | if (pvlim && SvCUR(dsv) >= pvlim) { | |||
100 | ||||||
4365 | truncated++; | |||||
4366 | break; | |||||
4367 | } | |||||
4368 | 34158 | 100 | u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0); | |||
4369 | 34158 | 100 | if (u < 256) { | |||
4370 | 328 | const unsigned char c = (unsigned char)u & 0xFF; | ||||
4371 | 328 | 100 | if (flags & UNI_DISPLAY_BACKSLASH) { | |||
4372 | 66 | switch (c) { | ||||
4373 | case '\n': | |||||
4374 | 2 | ok = 'n'; break; | ||||
4375 | case '\r': | |||||
4376 | 2 | ok = 'r'; break; | ||||
4377 | case '\t': | |||||
4378 | 2 | ok = 't'; break; | ||||
4379 | case '\f': | |||||
4380 | 2 | ok = 'f'; break; | ||||
4381 | case '\a': | |||||
4382 | 2 | ok = 'a'; break; | ||||
4383 | case '\\': | |||||
4384 | 2 | ok = '\\'; break; | ||||
4385 | default: break; | |||||
4386 | } | |||||
4387 | 66 | 100 | if (ok) { | |||
4388 | 12 | const char string = ok; | ||||
4389 | 12 | sv_catpvs(dsv, "\\"); | ||||
4390 | 12 | sv_catpvn(dsv, &string, 1); | ||||
4391 | } | |||||
4392 | } | |||||
4393 | /* isPRINT() is the locale-blind version. */ | |||||
4394 | 328 | 100 | if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { | |||
50 | ||||||
100 | ||||||
4395 | 160 | const char string = c; | ||||
4396 | 160 | sv_catpvn(dsv, &string, 1); | ||||
4397 | ok = 1; | |||||
4398 | } | |||||
4399 | } | |||||
4400 | 34158 | 100 | if (!ok) | |||
4401 | 33986 | Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u); | ||||
4402 | } | |||||
4403 | 33996 | 100 | if (truncated) | |||
4404 | 14 | sv_catpvs(dsv, "..."); | ||||
4405 | ||||||
4406 | 33996 | return SvPVX(dsv); | ||||
4407 | } | |||||
4408 | ||||||
4409 | /* | |||||
4410 | =for apidoc sv_uni_display | |||||
4411 | ||||||
4412 | Build to the scalar C |
|||||
4413 | the displayable version being at most C |
|||||
4414 | (if longer, the rest is truncated and "..." will be appended). | |||||
4415 | ||||||
4416 | The C |
|||||
4417 | ||||||
4418 | The pointer to the PV of the C |
|||||
4419 | ||||||
4420 | =cut | |||||
4421 | */ | |||||
4422 | char * | |||||
4423 | 33996 | Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) | ||||
4424 | { | |||||
4425 | const char * const ptr = | |||||
4426 | 33996 | 50 | isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv); | |||
50 | ||||||
4427 | ||||||
4428 | PERL_ARGS_ASSERT_SV_UNI_DISPLAY; | |||||
4429 | ||||||
4430 | 33996 | return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr, | ||||
4431 | 33996 | SvCUR(ssv), pvlim, flags); | ||||
4432 | } | |||||
4433 | ||||||
4434 | /* | |||||
4435 | =for apidoc foldEQ_utf8 | |||||
4436 | ||||||
4437 | Returns true if the leading portions of the strings C |
|||||
4438 | of which may be in UTF-8) are the same case-insensitively; false otherwise. | |||||
4439 | How far into the strings to compare is determined by other input parameters. | |||||
4440 | ||||||
4441 | If C |
|||||
4442 | otherwise it is assumed to be in native 8-bit encoding. Correspondingly for C |
|||||
4443 | with respect to C |
|||||
4444 | ||||||
4445 | If the byte length C |
|||||
4446 | equality. In other words, C |
|||||
4447 | scan will not be considered to be a match unless the goal is reached, and | |||||
4448 | scanning won't continue past that goal. Correspondingly for C |
|||||
4449 | C |
|||||
4450 | ||||||
4451 | If C |
|||||
4452 | considered an end pointer to the position 1 byte past the maximum point | |||||
4453 | in C |
|||||
4454 | (This routine assumes that UTF-8 encoded input strings are not malformed; | |||||
4455 | malformed input can cause it to read past C |
|||||
4456 | This means that if both C |
|||||
4457 | is less than C |
|||||
4458 | never | |||||
4459 | get as far as its goal (and in fact is asserted against). Correspondingly for | |||||
4460 | C |
|||||
4461 | ||||||
4462 | At least one of C |
|||||
4463 | C |
|||||
4464 | reached for a successful match. Also, if the fold of a character is multiple | |||||
4465 | characters, all of them must be matched (see tr21 reference below for | |||||
4466 | 'folding'). | |||||
4467 | ||||||
4468 | Upon a successful match, if C |
|||||
4469 | it will be set to point to the beginning of the I |
|||||
4470 | beyond what was matched. Correspondingly for C |
|||||
4471 | ||||||
4472 | For case-insensitiveness, the "casefolding" of Unicode is used | |||||
4473 | instead of upper/lowercasing both the characters, see | |||||
4474 | L |
|||||
4475 | ||||||
4476 | =cut */ | |||||
4477 | ||||||
4478 | /* A flags parameter has been added which may change, and hence isn't | |||||
4479 | * externally documented. Currently it is: | |||||
4480 | * 0 for as-documented above | |||||
4481 | * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an | |||||
4482 | ASCII one, to not match | |||||
4483 | * FOLDEQ_UTF8_LOCALE meaning that locale rules are to be used for code | |||||
4484 | * points below 256; unicode rules for above 255; and | |||||
4485 | * folds that cross those boundaries are disallowed, | |||||
4486 | * like the NOMIX_ASCII option | |||||
4487 | * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this | |||||
4488 | * routine. This allows that step to be skipped. | |||||
4489 | * FOLDEQ_S2_ALREADY_FOLDED Similarly. | |||||
4490 | */ | |||||
4491 | I32 | |||||
4492 | 1342716 | Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags) | ||||
4493 | { | |||||
4494 | dVAR; | |||||
4495 | const U8 *p1 = (const U8*)s1; /* Point to current char */ | |||||
4496 | const U8 *p2 = (const U8*)s2; | |||||
4497 | const U8 *g1 = NULL; /* goal for s1 */ | |||||
4498 | const U8 *g2 = NULL; | |||||
4499 | const U8 *e1 = NULL; /* Don't scan s1 past this */ | |||||
4500 | U8 *f1 = NULL; /* Point to current folded */ | |||||
4501 | const U8 *e2 = NULL; | |||||
4502 | U8 *f2 = NULL; | |||||
4503 | 1342716 | STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */ | ||||
4504 | U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; | |||||
4505 | U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; | |||||
4506 | ||||||
4507 | PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS; | |||||
4508 | ||||||
4509 | assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_UTF8_LOCALE)) | |||||
4510 | && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED)))); | |||||
4511 | /* The algorithm is to trial the folds without regard to the flags on | |||||
4512 | * the first line of the above assert(), and then see if the result | |||||
4513 | * violates them. This means that the inputs can't be pre-folded to a | |||||
4514 | * violating result, hence the assert. This could be changed, with the | |||||
4515 | * addition of extra tests here for the already-folded case, which would | |||||
4516 | * slow it down. That cost is more than any possible gain for when these | |||||
4517 | * flags are specified, as the flags indicate /il or /iaa matching which | |||||
4518 | * is less common than /iu, and I (khw) also believe that real-world /il | |||||
4519 | * and /iaa matches are most likely to involve code points 0-255, and this | |||||
4520 | * function only under rare conditions gets called for 0-255. */ | |||||
4521 | ||||||
4522 | 1342716 | 100 | if (pe1) { | |||
4523 | 417046 | e1 = *(U8**)pe1; | ||||
4524 | } | |||||
4525 | ||||||
4526 | 1342716 | 100 | if (l1) { | |||
4527 | 925670 | g1 = (const U8*)s1 + l1; | ||||
4528 | } | |||||
4529 | ||||||
4530 | 1342716 | 100 | if (pe2) { | |||
4531 | 925670 | e2 = *(U8**)pe2; | ||||
4532 | } | |||||
4533 | ||||||
4534 | 1342716 | 100 | if (l2) { | |||
4535 | 417046 | g2 = (const U8*)s2 + l2; | ||||
4536 | } | |||||
4537 | ||||||
4538 | /* Must have at least one goal */ | |||||
4539 | assert(g1 || g2); | |||||
4540 | ||||||
4541 | 1342716 | 100 | if (g1) { | |||
4542 | ||||||
4543 | /* Will never match if goal is out-of-bounds */ | |||||
4544 | assert(! e1 || e1 >= g1); | |||||
4545 | ||||||
4546 | /* Here, there isn't an end pointer, or it is beyond the goal. We | |||||
4547 | * only go as far as the goal */ | |||||
4548 | e1 = g1; | |||||
4549 | } | |||||
4550 | else { | |||||
4551 | assert(e1); /* Must have an end for looking at s1 */ | |||||
4552 | } | |||||
4553 | ||||||
4554 | /* Same for goal for s2 */ | |||||
4555 | 1342716 | 100 | if (g2) { | |||
4556 | assert(! e2 || e2 >= g2); | |||||
4557 | e2 = g2; | |||||
4558 | } | |||||
4559 | else { | |||||
4560 | assert(e2); | |||||
4561 | } | |||||
4562 | ||||||
4563 | /* If both operands are already folded, we could just do a memEQ on the | |||||
4564 | * whole strings at once, but it would be better if the caller realized | |||||
4565 | * this and didn't even call us */ | |||||
4566 | ||||||
4567 | /* Look through both strings, a character at a time */ | |||||
4568 | 2542844 | 100 | while (p1 < e1 && p2 < e2) { | |||
4569 | ||||||
4570 | /* If at the beginning of a new character in s1, get its fold to use | |||||
4571 | * and the length of the fold. (exception: locale rules just get the | |||||
4572 | * character to a single byte) */ | |||||
4573 | 1609316 | 100 | if (n1 == 0) { | |||
4574 | 1582220 | 100 | if (flags & FOLDEQ_S1_ALREADY_FOLDED) { | |||
4575 | f1 = (U8 *) p1; | |||||
4576 | 523574 | n1 = UTF8SKIP(f1); | ||||
4577 | } | |||||
4578 | else { | |||||
4579 | /* If in locale matching, we use two sets of rules, depending | |||||
4580 | * on if the code point is above or below 255. Here, we test | |||||
4581 | * for and handle locale rules */ | |||||
4582 | 1058646 | 100 | if ((flags & FOLDEQ_UTF8_LOCALE) | |||
4583 | 507440 | 100 | && (! u1 || ! UTF8_IS_ABOVE_LATIN1(*p1))) | |||
100 | ||||||
4584 | { | |||||
4585 | /* There is no mixing of code points above and below 255. */ | |||||
4586 | 433594 | 100 | if (u2 && UTF8_IS_ABOVE_LATIN1(*p2)) { | |||
100 | ||||||
4587 | return 0; | |||||
4588 | } | |||||
4589 | ||||||
4590 | /* We handle locale rules by converting, if necessary, the | |||||
4591 | * code point to a single byte. */ | |||||
4592 | 391934 | 100 | if (! u1 || UTF8_IS_INVARIANT(*p1)) { | |||
100 | ||||||
4593 | 372000 | *foldbuf1 = *p1; | ||||
4594 | } | |||||
4595 | else { | |||||
4596 | 19934 | *foldbuf1 = TWO_BYTE_UTF8_TO_NATIVE(*p1, *(p1 + 1)); | ||||
4597 | } | |||||
4598 | 391934 | n1 = 1; | ||||
4599 | } | |||||
4600 | 625052 | 100 | else if (isASCII(*p1)) { /* Note, that here won't be both | |||
4601 | ASCII and using locale rules */ | |||||
4602 | ||||||
4603 | /* If trying to mix non- with ASCII, and not supposed to, | |||||
4604 | * fail */ | |||||
4605 | 342356 | 100 | if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) { | |||
100 | ||||||
4606 | return 0; | |||||
4607 | } | |||||
4608 | 318190 | n1 = 1; | ||||
4609 | 318190 | 100 | *foldbuf1 = toFOLD(*p1); | |||
4610 | } | |||||
4611 | 282696 | 100 | else if (u1) { | |||
4612 | 244502 | to_utf8_fold(p1, foldbuf1, &n1); | ||||
4613 | } | |||||
4614 | else { /* Not utf8, get utf8 fold */ | |||||
4615 | 38194 | to_uni_fold(*p1, foldbuf1, &n1); | ||||
4616 | } | |||||
4617 | f1 = foldbuf1; | |||||
4618 | } | |||||
4619 | } | |||||
4620 | ||||||
4621 | 1543490 | 100 | if (n2 == 0) { /* Same for s2 */ | |||
4622 | 1487888 | 100 | if (flags & FOLDEQ_S2_ALREADY_FOLDED) { | |||
4623 | f2 = (U8 *) p2; | |||||
4624 | 182386 | n2 = UTF8SKIP(f2); | ||||
4625 | } | |||||
4626 | else { | |||||
4627 | 1305502 | 100 | if ((flags & FOLDEQ_UTF8_LOCALE) | |||
4628 | 467818 | 100 | && (! u2 || ! UTF8_IS_ABOVE_LATIN1(*p2))) | |||
100 | ||||||
4629 | { | |||||
4630 | /* Here, the next char in s2 is < 256. We've already | |||||
4631 | * worked on s1, and if it isn't also < 256, can't match */ | |||||
4632 | 455012 | 100 | if (u1 && UTF8_IS_ABOVE_LATIN1(*p1)) { | |||
100 | ||||||
4633 | return 0; | |||||
4634 | } | |||||
4635 | 391934 | 100 | if (! u2 || UTF8_IS_INVARIANT(*p2)) { | |||
100 | ||||||
4636 | 379464 | *foldbuf2 = *p2; | ||||
4637 | } | |||||
4638 | else { | |||||
4639 | 12470 | *foldbuf2 = TWO_BYTE_UTF8_TO_NATIVE(*p2, *(p2 + 1)); | ||||
4640 | } | |||||
4641 | ||||||
4642 | /* Use another function to handle locale rules. We've made | |||||
4643 | * sure that both characters to compare are single bytes */ | |||||
4644 | 391934 | 100 | if (! foldEQ_locale((char *) f1, (char *) foldbuf2, 1)) { | |||
4645 | return 0; | |||||
4646 | } | |||||
4647 | 302340 | n1 = n2 = 0; | ||||
4648 | } | |||||
4649 | 850490 | 100 | else if (isASCII(*p2)) { | |||
4650 | 536880 | 100 | if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) { | |||
100 | ||||||
4651 | return 0; | |||||
4652 | } | |||||
4653 | 492254 | n2 = 1; | ||||
4654 | 492254 | 100 | *foldbuf2 = toFOLD(*p2); | |||
4655 | } | |||||
4656 | 313610 | 100 | else if (u2) { | |||
4657 | 803144 | to_utf8_fold(p2, foldbuf2, &n2); | ||||
4658 | } | |||||
4659 | else { | |||||
4660 | 53514 | to_uni_fold(*p2, foldbuf2, &n2); | ||||
4661 | } | |||||
4662 | f2 = foldbuf2; | |||||
4663 | } | |||||
4664 | } | |||||
4665 | ||||||
4666 | /* Here f1 and f2 point to the beginning of the strings to compare. | |||||
4667 | * These strings are the folds of the next character from each input | |||||
4668 | * string, stored in utf8. */ | |||||
4669 | ||||||
4670 | /* While there is more to look for in both folds, see if they | |||||
4671 | * continue to match */ | |||||
4672 | 2251468 | 100 | while (n1 && n2) { | |||
100 | ||||||
4673 | 1051340 | U8 fold_length = UTF8SKIP(f1); | ||||
4674 | 1051340 | 100 | if (fold_length != UTF8SKIP(f2) | |||
4675 | 986510 | 100 | || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE | |||
100 | ||||||
4676 | function call for single | |||||
4677 | byte */ | |||||
4678 | 905722 | 100 | || memNE((char*)f1, (char*)f2, fold_length)) | |||
4679 | { | |||||
4680 | return 0; /* mismatch */ | |||||
4681 | } | |||||
4682 | ||||||
4683 | /* Here, they matched, advance past them */ | |||||
4684 | 905276 | n1 -= fold_length; | ||||
4685 | 905276 | f1 += fold_length; | ||||
4686 | 905276 | n2 -= fold_length; | ||||
4687 | 905276 | f2 += fold_length; | ||||
4688 | } | |||||
4689 | ||||||
4690 | /* When reach the end of any fold, advance the input past it */ | |||||
4691 | 1200128 | 100 | if (n1 == 0) { | |||
4692 | 1172938 | 100 | p1 += u1 ? UTF8SKIP(p1) : 1; | |||
4693 | } | |||||
4694 | 1200128 | 100 | if (n2 == 0) { | |||
4695 | 1171507 | 100 | p2 += u2 ? UTF8SKIP(p2) : 1; | |||
4696 | } | |||||
4697 | } /* End of loop through both strings */ | |||||
4698 | ||||||
4699 | /* A match is defined by each scan that specified an explicit length | |||||
4700 | * reaching its final goal, and the other not having matched a partial | |||||
4701 | * character (which can happen when the fold of a character is more than one | |||||
4702 | * character). */ | |||||
4703 | 933528 | 100 | if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) { | |||
100 | ||||||
100 | ||||||
100 | ||||||
4704 | return 0; | |||||
4705 | } | |||||
4706 | ||||||
4707 | /* Successful match. Set output pointers */ | |||||
4708 | 734446 | 100 | if (pe1) { | |||
4709 | 267984 | *pe1 = (char*)p1; | ||||
4710 | } | |||||
4711 | 734446 | 100 | if (pe2) { | |||
4712 | 904589 | *pe2 = (char*)p2; | ||||
4713 | } | |||||
4714 | return 1; | |||||
4715 | 0 | } | ||||
4716 | ||||||
4717 | /* | |||||
4718 | * Local variables: | |||||
4719 | * c-indentation-style: bsd | |||||
4720 | * c-basic-offset: 4 | |||||
4721 | * indent-tabs-mode: nil | |||||
4722 | * End: | |||||
4723 | * | |||||
4724 | * ex: set ts=8 sts=4 sw=4 et: | |||||
4725 | */ |