File Coverage

ext/re/re_comp.c
Criterion Covered Total %
statement 4613 5001 92.2
branch n/a
condition n/a
subroutine n/a
total 4613 5001 92.2


line stmt bran cond sub time code
1           /* regcomp.c
2           */
3            
4           /*
5           * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6           *
7           * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8           */
9            
10           /* This file contains functions for compiling a regular expression. See
11           * also regexec.c which funnily enough, contains functions for executing
12           * a regular expression.
13           *
14           * This file is also copied at build time to ext/re/re_comp.c, where
15           * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16           * This causes the main functions to be compiled under new names and with
17           * debugging support added, which makes "use re 'debug'" work.
18           */
19            
20           /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21           * confused with the original package (see point 3 below). Thanks, Henry!
22           */
23            
24           /* Additional note: this code is very heavily munged from Henry's version
25           * in places. In some spots I've traded clarity for efficiency, so don't
26           * blame Henry for some of the lack of readability.
27           */
28            
29           /* The names of the functions have been changed from regcomp and
30           * regexec to pregcomp and pregexec in order to avoid conflicts
31           * with the POSIX routines of the same names.
32           */
33            
34           #ifdef PERL_EXT_RE_BUILD
35           #include "re_top.h"
36           #endif
37            
38           /*
39           * pregcomp and pregexec -- regsub and regerror are not used in perl
40           *
41           * Copyright (c) 1986 by University of Toronto.
42           * Written by Henry Spencer. Not derived from licensed software.
43           *
44           * Permission is granted to anyone to use this software for any
45           * purpose on any computer system, and to redistribute it freely,
46           * subject to the following restrictions:
47           *
48           * 1. The author is not responsible for the consequences of use of
49           * this software, no matter how awful, even if they arise
50           * from defects in it.
51           *
52           * 2. The origin of this software must not be misrepresented, either
53           * by explicit claim or by omission.
54           *
55           * 3. Altered versions must be plainly marked as such, and must not
56           * be misrepresented as being the original software.
57           *
58           *
59           **** Alterations to Henry's code are...
60           ****
61           **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62           **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63           **** by Larry Wall and others
64           ****
65           **** You may distribute under the terms of either the GNU General Public
66           **** License or the Artistic License, as specified in the README file.
67            
68           *
69           * Beware that some of this code is subtly aware of the way operator
70           * precedence is structured in regular expressions. Serious changes in
71           * regular-expression syntax might require a total rethink.
72           */
73           #include "EXTERN.h"
74           #define PERL_IN_REGCOMP_C
75           #include "perl.h"
76            
77           #ifndef PERL_IN_XSUB_RE
78           # include "INTERN.h"
79           #endif
80            
81           #define REG_COMP_C
82           #ifdef PERL_IN_XSUB_RE
83           # include "re_comp.h"
84           extern const struct regexp_engine my_reg_engine;
85           #else
86           # include "regcomp.h"
87           #endif
88            
89           #include "dquote_static.c"
90           #include "charclass_invlists.h"
91           #include "inline_invlist.c"
92           #include "unicode_constants.h"
93            
94           #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95           #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96           #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
97            
98           #ifdef op
99           #undef op
100           #endif /* op */
101            
102           #ifdef MSDOS
103           # if defined(BUGGY_MSC6)
104           /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
105           # pragma optimize("a",off)
106           /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
107           # pragma optimize("w",on )
108           # endif /* BUGGY_MSC6 */
109           #endif /* MSDOS */
110            
111           #ifndef STATIC
112           #define STATIC static
113           #endif
114            
115            
116           typedef struct RExC_state_t {
117           U32 flags; /* RXf_* are we folding, multilining? */
118           U32 pm_flags; /* PMf_* stuff from the calling PMOP */
119           char *precomp; /* uncompiled string. */
120           REGEXP *rx_sv; /* The SV that is the regexp. */
121           regexp *rx; /* perl core regexp structure */
122           regexp_internal *rxi; /* internal data for regexp object pprivate field */
123           char *start; /* Start of input for compile */
124           char *end; /* End of input for compile */
125           char *parse; /* Input-scan pointer. */
126           SSize_t whilem_seen; /* number of WHILEM in this expr */
127           regnode *emit_start; /* Start of emitted-code area */
128           regnode *emit_bound; /* First regnode outside of the allocated space */
129           regnode *emit; /* Code-emit pointer; if = &emit_dummy,
130           implies compiling, so don't emit */
131           regnode emit_dummy; /* placeholder for emit to point to */
132           I32 naughty; /* How bad is this pattern? */
133           I32 sawback; /* Did we see \1, ...? */
134           U32 seen;
135           SSize_t size; /* Code size. */
136           I32 npar; /* Capture buffer count, (OPEN). */
137           I32 cpar; /* Capture buffer count, (CLOSE). */
138           I32 nestroot; /* root parens we are in - used by accept */
139           I32 extralen;
140           I32 seen_zerolen;
141           regnode **open_parens; /* pointers to open parens */
142           regnode **close_parens; /* pointers to close parens */
143           regnode *opend; /* END node in program */
144           I32 utf8; /* whether the pattern is utf8 or not */
145           I32 orig_utf8; /* whether the pattern was originally in utf8 */
146           /* XXX use this for future optimisation of case
147           * where pattern must be upgraded to utf8. */
148           I32 uni_semantics; /* If a d charset modifier should use unicode
149           rules, even if the pattern is not in
150           utf8 */
151           HV *paren_names; /* Paren names */
152          
153           regnode **recurse; /* Recurse regops */
154           I32 recurse_count; /* Number of recurse regops */
155           I32 in_lookbehind;
156           I32 contains_locale;
157           I32 override_recoding;
158           I32 in_multi_char_class;
159           struct reg_code_block *code_blocks; /* positions of literal (?{})
160           within pattern */
161           int num_code_blocks; /* size of code_blocks[] */
162           int code_index; /* next code_blocks[] slot */
163           #if ADD_TO_REGEXEC
164           char *starttry; /* -Dr: where regtry was called. */
165           #define RExC_starttry (pRExC_state->starttry)
166           #endif
167           SV *runtime_code_qr; /* qr with the runtime code blocks */
168           #ifdef DEBUGGING
169           const char *lastparse;
170           I32 lastnum;
171           AV *paren_name_list; /* idx -> name */
172           #define RExC_lastparse (pRExC_state->lastparse)
173           #define RExC_lastnum (pRExC_state->lastnum)
174           #define RExC_paren_name_list (pRExC_state->paren_name_list)
175           #endif
176           } RExC_state_t;
177            
178           #define RExC_flags (pRExC_state->flags)
179           #define RExC_pm_flags (pRExC_state->pm_flags)
180           #define RExC_precomp (pRExC_state->precomp)
181           #define RExC_rx_sv (pRExC_state->rx_sv)
182           #define RExC_rx (pRExC_state->rx)
183           #define RExC_rxi (pRExC_state->rxi)
184           #define RExC_start (pRExC_state->start)
185           #define RExC_end (pRExC_state->end)
186           #define RExC_parse (pRExC_state->parse)
187           #define RExC_whilem_seen (pRExC_state->whilem_seen)
188           #ifdef RE_TRACK_PATTERN_OFFSETS
189           #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
190           #endif
191           #define RExC_emit (pRExC_state->emit)
192           #define RExC_emit_dummy (pRExC_state->emit_dummy)
193           #define RExC_emit_start (pRExC_state->emit_start)
194           #define RExC_emit_bound (pRExC_state->emit_bound)
195           #define RExC_naughty (pRExC_state->naughty)
196           #define RExC_sawback (pRExC_state->sawback)
197           #define RExC_seen (pRExC_state->seen)
198           #define RExC_size (pRExC_state->size)
199           #define RExC_npar (pRExC_state->npar)
200           #define RExC_nestroot (pRExC_state->nestroot)
201           #define RExC_extralen (pRExC_state->extralen)
202           #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
203           #define RExC_utf8 (pRExC_state->utf8)
204           #define RExC_uni_semantics (pRExC_state->uni_semantics)
205           #define RExC_orig_utf8 (pRExC_state->orig_utf8)
206           #define RExC_open_parens (pRExC_state->open_parens)
207           #define RExC_close_parens (pRExC_state->close_parens)
208           #define RExC_opend (pRExC_state->opend)
209           #define RExC_paren_names (pRExC_state->paren_names)
210           #define RExC_recurse (pRExC_state->recurse)
211           #define RExC_recurse_count (pRExC_state->recurse_count)
212           #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
213           #define RExC_contains_locale (pRExC_state->contains_locale)
214           #define RExC_override_recoding (pRExC_state->override_recoding)
215           #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
216            
217            
218           #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
219           #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
220           ((*s) == '{' && regcurly(s, FALSE)))
221            
222           #ifdef SPSTART
223           #undef SPSTART /* dratted cpp namespace... */
224           #endif
225           /*
226           * Flags to be passed up and down.
227           */
228           #define WORST 0 /* Worst case. */
229           #define HASWIDTH 0x01 /* Known to match non-null strings. */
230            
231           /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
232           * character. (There needs to be a case: in the switch statement in regexec.c
233           * for any node marked SIMPLE.) Note that this is not the same thing as
234           * REGNODE_SIMPLE */
235           #define SIMPLE 0x02
236           #define SPSTART 0x04 /* Starts with * or + */
237           #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
238           #define TRYAGAIN 0x10 /* Weeded out a declaration. */
239           #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
240            
241           #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
242            
243           /* whether trie related optimizations are enabled */
244           #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
245           #define TRIE_STUDY_OPT
246           #define FULL_TRIE_STUDY
247           #define TRIE_STCLASS
248           #endif
249            
250            
251            
252           #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
253           #define PBITVAL(paren) (1 << ((paren) & 7))
254           #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
255           #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
256           #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
257            
258           #define REQUIRE_UTF8 STMT_START { \
259           if (!UTF) { \
260           *flagp = RESTART_UTF8; \
261           return NULL; \
262           } \
263           } STMT_END
264            
265           /* This converts the named class defined in regcomp.h to its equivalent class
266           * number defined in handy.h. */
267           #define namedclass_to_classnum(class) ((int) ((class) / 2))
268           #define classnum_to_namedclass(classnum) ((classnum) * 2)
269            
270           /* About scan_data_t.
271            
272           During optimisation we recurse through the regexp program performing
273           various inplace (keyhole style) optimisations. In addition study_chunk
274           and scan_commit populate this data structure with information about
275           what strings MUST appear in the pattern. We look for the longest
276           string that must appear at a fixed location, and we look for the
277           longest string that may appear at a floating location. So for instance
278           in the pattern:
279          
280           /FOO[xX]A.*B[xX]BAR/
281          
282           Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
283           strings (because they follow a .* construct). study_chunk will identify
284           both FOO and BAR as being the longest fixed and floating strings respectively.
285          
286           The strings can be composites, for instance
287          
288           /(f)(o)(o)/
289          
290           will result in a composite fixed substring 'foo'.
291          
292           For each string some basic information is maintained:
293          
294           - offset or min_offset
295           This is the position the string must appear at, or not before.
296           It also implicitly (when combined with minlenp) tells us how many
297           characters must match before the string we are searching for.
298           Likewise when combined with minlenp and the length of the string it
299           tells us how many characters must appear after the string we have
300           found.
301          
302           - max_offset
303           Only used for floating strings. This is the rightmost point that
304           the string can appear at. If set to SSize_t_MAX it indicates that the
305           string can occur infinitely far to the right.
306          
307           - minlenp
308           A pointer to the minimum number of characters of the pattern that the
309           string was found inside. This is important as in the case of positive
310           lookahead or positive lookbehind we can have multiple patterns
311           involved. Consider
312          
313           /(?=FOO).*F/
314          
315           The minimum length of the pattern overall is 3, the minimum length
316           of the lookahead part is 3, but the minimum length of the part that
317           will actually match is 1. So 'FOO's minimum length is 3, but the
318           minimum length for the F is 1. This is important as the minimum length
319           is used to determine offsets in front of and behind the string being
320           looked for. Since strings can be composites this is the length of the
321           pattern at the time it was committed with a scan_commit. Note that
322           the length is calculated by study_chunk, so that the minimum lengths
323           are not known until the full pattern has been compiled, thus the
324           pointer to the value.
325          
326           - lookbehind
327          
328           In the case of lookbehind the string being searched for can be
329           offset past the start point of the final matching string.
330           If this value was just blithely removed from the min_offset it would
331           invalidate some of the calculations for how many chars must match
332           before or after (as they are derived from min_offset and minlen and
333           the length of the string being searched for).
334           When the final pattern is compiled and the data is moved from the
335           scan_data_t structure into the regexp structure the information
336           about lookbehind is factored in, with the information that would
337           have been lost precalculated in the end_shift field for the
338           associated string.
339            
340           The fields pos_min and pos_delta are used to store the minimum offset
341           and the delta to the maximum offset at the current point in the pattern.
342            
343           */
344            
345           typedef struct scan_data_t {
346           /*I32 len_min; unused */
347           /*I32 len_delta; unused */
348           SSize_t pos_min;
349           SSize_t pos_delta;
350           SV *last_found;
351           SSize_t last_end; /* min value, <0 unless valid. */
352           SSize_t last_start_min;
353           SSize_t last_start_max;
354           SV **longest; /* Either &l_fixed, or &l_float. */
355           SV *longest_fixed; /* longest fixed string found in pattern */
356           SSize_t offset_fixed; /* offset where it starts */
357           SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
358           I32 lookbehind_fixed; /* is the position of the string modfied by LB */
359           SV *longest_float; /* longest floating string found in pattern */
360           SSize_t offset_float_min; /* earliest point in string it can appear */
361           SSize_t offset_float_max; /* latest point in string it can appear */
362           SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
363           SSize_t lookbehind_float; /* is the pos of the string modified by LB */
364           I32 flags;
365           I32 whilem_c;
366           SSize_t *last_closep;
367           struct regnode_charclass_class *start_class;
368           } scan_data_t;
369            
370           /* The below is perhaps overboard, but this allows us to save a test at the
371           * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
372           * and 'a' differ by a single bit; the same with the upper and lower case of
373           * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
374           * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
375           * then inverts it to form a mask, with just a single 0, in the bit position
376           * where the upper- and lowercase differ. XXX There are about 40 other
377           * instances in the Perl core where this micro-optimization could be used.
378           * Should decide if maintenance cost is worse, before changing those
379           *
380           * Returns a boolean as to whether or not 'v' is either a lowercase or
381           * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
382           * compile-time constant, the generated code is better than some optimizing
383           * compilers figure out, amounting to a mask and test. The results are
384           * meaningless if 'c' is not one of [A-Za-z] */
385           #define isARG2_lower_or_UPPER_ARG1(c, v) \
386           (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
387            
388           /*
389           * Forward declarations for pregcomp()'s friends.
390           */
391            
392           static const scan_data_t zero_scan_data =
393           { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
394            
395           #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
396           #define SF_BEFORE_SEOL 0x0001
397           #define SF_BEFORE_MEOL 0x0002
398           #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
399           #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
400            
401           #ifdef NO_UNARY_PLUS
402           # define SF_FIX_SHIFT_EOL (0+2)
403           # define SF_FL_SHIFT_EOL (0+4)
404           #else
405           # define SF_FIX_SHIFT_EOL (+2)
406           # define SF_FL_SHIFT_EOL (+4)
407           #endif
408            
409           #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
410           #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
411            
412           #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
413           #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
414           #define SF_IS_INF 0x0040
415           #define SF_HAS_PAR 0x0080
416           #define SF_IN_PAR 0x0100
417           #define SF_HAS_EVAL 0x0200
418           #define SCF_DO_SUBSTR 0x0400
419           #define SCF_DO_STCLASS_AND 0x0800
420           #define SCF_DO_STCLASS_OR 0x1000
421           #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
422           #define SCF_WHILEM_VISITED_POS 0x2000
423            
424           #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
425           #define SCF_SEEN_ACCEPT 0x8000
426           #define SCF_TRIE_DOING_RESTUDY 0x10000
427            
428           #define UTF cBOOL(RExC_utf8)
429            
430           /* The enums for all these are ordered so things work out correctly */
431           #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
432           #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
433           #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
434           #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
435           #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
436           #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
437           #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
438            
439           #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
440            
441           #define OOB_NAMEDCLASS -1
442            
443           /* There is no code point that is out-of-bounds, so this is problematic. But
444           * its only current use is to initialize a variable that is always set before
445           * looked at. */
446           #define OOB_UNICODE 0xDEADBEEF
447            
448           #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
449           #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
450            
451            
452           /* length of regex to show in messages that don't mark a position within */
453           #define RegexLengthToShowInErrorMessages 127
454            
455           /*
456           * If MARKER[12] are adjusted, be sure to adjust the constants at the top
457           * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
458           * op/pragma/warn/regcomp.
459           */
460           #define MARKER1 "<-- HERE" /* marker as it appears in the description */
461           #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
462            
463           #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
464            
465           /*
466           * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
467           * arg. Show regex, up to a maximum length. If it's too long, chop and add
468           * "...".
469           */
470           #define _FAIL(code) STMT_START { \
471           const char *ellipses = ""; \
472           IV len = RExC_end - RExC_precomp; \
473           \
474           if (!SIZE_ONLY) \
475           SAVEFREESV(RExC_rx_sv); \
476           if (len > RegexLengthToShowInErrorMessages) { \
477           /* chop 10 shorter than the max, to ensure meaning of "..." */ \
478           len = RegexLengthToShowInErrorMessages - 10; \
479           ellipses = "..."; \
480           } \
481           code; \
482           } STMT_END
483            
484           #define FAIL(msg) _FAIL( \
485           Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
486           msg, (int)len, RExC_precomp, ellipses))
487            
488           #define FAIL2(msg,arg) _FAIL( \
489           Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
490           arg, (int)len, RExC_precomp, ellipses))
491            
492           /*
493           * Simple_vFAIL -- like FAIL, but marks the current location in the scan
494           */
495           #define Simple_vFAIL(m) STMT_START { \
496           const IV offset = RExC_parse - RExC_precomp; \
497           Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
498           m, (int)offset, RExC_precomp, RExC_precomp + offset); \
499           } STMT_END
500            
501           /*
502           * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
503           */
504           #define vFAIL(m) STMT_START { \
505           if (!SIZE_ONLY) \
506           SAVEFREESV(RExC_rx_sv); \
507           Simple_vFAIL(m); \
508           } STMT_END
509            
510           /*
511           * Like Simple_vFAIL(), but accepts two arguments.
512           */
513           #define Simple_vFAIL2(m,a1) STMT_START { \
514           const IV offset = RExC_parse - RExC_precomp; \
515           S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
516           (int)offset, RExC_precomp, RExC_precomp + offset); \
517           } STMT_END
518            
519           /*
520           * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
521           */
522           #define vFAIL2(m,a1) STMT_START { \
523           if (!SIZE_ONLY) \
524           SAVEFREESV(RExC_rx_sv); \
525           Simple_vFAIL2(m, a1); \
526           } STMT_END
527            
528            
529           /*
530           * Like Simple_vFAIL(), but accepts three arguments.
531           */
532           #define Simple_vFAIL3(m, a1, a2) STMT_START { \
533           const IV offset = RExC_parse - RExC_precomp; \
534           S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
535           (int)offset, RExC_precomp, RExC_precomp + offset); \
536           } STMT_END
537            
538           /*
539           * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
540           */
541           #define vFAIL3(m,a1,a2) STMT_START { \
542           if (!SIZE_ONLY) \
543           SAVEFREESV(RExC_rx_sv); \
544           Simple_vFAIL3(m, a1, a2); \
545           } STMT_END
546            
547           /*
548           * Like Simple_vFAIL(), but accepts four arguments.
549           */
550           #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
551           const IV offset = RExC_parse - RExC_precomp; \
552           S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
553           (int)offset, RExC_precomp, RExC_precomp + offset); \
554           } STMT_END
555            
556           #define vFAIL4(m,a1,a2,a3) STMT_START { \
557           if (!SIZE_ONLY) \
558           SAVEFREESV(RExC_rx_sv); \
559           Simple_vFAIL4(m, a1, a2, a3); \
560           } STMT_END
561            
562           /* m is not necessarily a "literal string", in this macro */
563           #define reg_warn_non_literal_string(loc, m) STMT_START { \
564           const IV offset = loc - RExC_precomp; \
565           Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
566           m, (int)offset, RExC_precomp, RExC_precomp + offset); \
567           } STMT_END
568            
569           #define ckWARNreg(loc,m) STMT_START { \
570           const IV offset = loc - RExC_precomp; \
571           Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
572           (int)offset, RExC_precomp, RExC_precomp + offset); \
573           } STMT_END
574            
575           #define vWARN_dep(loc, m) STMT_START { \
576           const IV offset = loc - RExC_precomp; \
577           Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
578           (int)offset, RExC_precomp, RExC_precomp + offset); \
579           } STMT_END
580            
581           #define ckWARNdep(loc,m) STMT_START { \
582           const IV offset = loc - RExC_precomp; \
583           Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
584           m REPORT_LOCATION, \
585           (int)offset, RExC_precomp, RExC_precomp + offset); \
586           } STMT_END
587            
588           #define ckWARNregdep(loc,m) STMT_START { \
589           const IV offset = loc - RExC_precomp; \
590           Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
591           m REPORT_LOCATION, \
592           (int)offset, RExC_precomp, RExC_precomp + offset); \
593           } STMT_END
594            
595           #define ckWARN2reg_d(loc,m, a1) STMT_START { \
596           const IV offset = loc - RExC_precomp; \
597           Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
598           m REPORT_LOCATION, \
599           a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
600           } STMT_END
601            
602           #define ckWARN2reg(loc, m, a1) STMT_START { \
603           const IV offset = loc - RExC_precomp; \
604           Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
605           a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
606           } STMT_END
607            
608           #define vWARN3(loc, m, a1, a2) STMT_START { \
609           const IV offset = loc - RExC_precomp; \
610           Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
611           a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
612           } STMT_END
613            
614           #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
615           const IV offset = loc - RExC_precomp; \
616           Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
617           a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
618           } STMT_END
619            
620           #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
621           const IV offset = loc - RExC_precomp; \
622           Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
623           a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
624           } STMT_END
625            
626           #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
627           const IV offset = loc - RExC_precomp; \
628           Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
629           a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
630           } STMT_END
631            
632           #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
633           const IV offset = loc - RExC_precomp; \
634           Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
635           a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
636           } STMT_END
637            
638            
639           /* Allow for side effects in s */
640           #define REGC(c,s) STMT_START { \
641           if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
642           } STMT_END
643            
644           /* Macros for recording node offsets. 20001227 mjd@plover.com
645           * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
646           * element 2*n-1 of the array. Element #2n holds the byte length node #n.
647           * Element 0 holds the number n.
648           * Position is 1 indexed.
649           */
650           #ifndef RE_TRACK_PATTERN_OFFSETS
651           #define Set_Node_Offset_To_R(node,byte)
652           #define Set_Node_Offset(node,byte)
653           #define Set_Cur_Node_Offset
654           #define Set_Node_Length_To_R(node,len)
655           #define Set_Node_Length(node,len)
656           #define Set_Node_Cur_Length(node,start)
657           #define Node_Offset(n)
658           #define Node_Length(n)
659           #define Set_Node_Offset_Length(node,offset,len)
660           #define ProgLen(ri) ri->u.proglen
661           #define SetProgLen(ri,x) ri->u.proglen = x
662           #else
663           #define ProgLen(ri) ri->u.offsets[0]
664           #define SetProgLen(ri,x) ri->u.offsets[0] = x
665           #define Set_Node_Offset_To_R(node,byte) STMT_START { \
666           if (! SIZE_ONLY) { \
667           MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
668           __LINE__, (int)(node), (int)(byte))); \
669           if((node) < 0) { \
670           Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
671           } else { \
672           RExC_offsets[2*(node)-1] = (byte); \
673           } \
674           } \
675           } STMT_END
676            
677           #define Set_Node_Offset(node,byte) \
678           Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
679           #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
680            
681           #define Set_Node_Length_To_R(node,len) STMT_START { \
682           if (! SIZE_ONLY) { \
683           MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
684           __LINE__, (int)(node), (int)(len))); \
685           if((node) < 0) { \
686           Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
687           } else { \
688           RExC_offsets[2*(node)] = (len); \
689           } \
690           } \
691           } STMT_END
692            
693           #define Set_Node_Length(node,len) \
694           Set_Node_Length_To_R((node)-RExC_emit_start, len)
695           #define Set_Node_Cur_Length(node, start) \
696           Set_Node_Length(node, RExC_parse - start)
697            
698           /* Get offsets and lengths */
699           #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
700           #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
701            
702           #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
703           Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
704           Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
705           } STMT_END
706           #endif
707            
708           #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
709           #define EXPERIMENTAL_INPLACESCAN
710           #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
711            
712           #define DEBUG_STUDYDATA(str,data,depth) \
713           DEBUG_OPTIMISE_MORE_r(if(data){ \
714           PerlIO_printf(Perl_debug_log, \
715           "%*s" str "Pos:%"IVdf"/%"IVdf \
716           " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
717           (int)(depth)*2, "", \
718           (IV)((data)->pos_min), \
719           (IV)((data)->pos_delta), \
720           (UV)((data)->flags), \
721           (IV)((data)->whilem_c), \
722           (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
723           is_inf ? "INF " : "" \
724           ); \
725           if ((data)->last_found) \
726           PerlIO_printf(Perl_debug_log, \
727           "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
728           " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
729           SvPVX_const((data)->last_found), \
730           (IV)((data)->last_end), \
731           (IV)((data)->last_start_min), \
732           (IV)((data)->last_start_max), \
733           ((data)->longest && \
734           (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
735           SvPVX_const((data)->longest_fixed), \
736           (IV)((data)->offset_fixed), \
737           ((data)->longest && \
738           (data)->longest==&((data)->longest_float)) ? "*" : "", \
739           SvPVX_const((data)->longest_float), \
740           (IV)((data)->offset_float_min), \
741           (IV)((data)->offset_float_max) \
742           ); \
743           PerlIO_printf(Perl_debug_log,"\n"); \
744           });
745            
746           /* Mark that we cannot extend a found fixed substring at this point.
747           Update the longest found anchored substring and the longest found
748           floating substrings if needed. */
749            
750           STATIC void
751 24403259         S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
752           SSize_t *minlenp, int is_inf)
753           {
754 24403259         const STRLEN l = CHR_SVLEN(data->last_found);
755 24403259         const STRLEN old_l = CHR_SVLEN(*data->longest);
756 24403259         GET_RE_DEBUG_FLAGS_DECL;
757            
758 9017128         PERL_ARGS_ASSERT_SCAN_COMMIT;
759            
760 9017128         if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
761 4708910         SvSetMagicSV(*data->longest, data->last_found);
762 4708910         if (*data->longest == data->longest_fixed) {
763 519868         data->offset_fixed = l ? data->last_start_min : data->pos_min;
764 4189152         if (data->flags & SF_BEFORE_EOL)
765           data->flags
766 4708788         |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
767           else
768 4708904         data->flags &= ~SF_FIX_BEFORE_EOL;
769 4308292         data->minlen_fixed=minlenp;
770 4308292         data->lookbehind_fixed=0;
771           }
772           else { /* *data->longest == data->longest_float */
773 4308182         data->offset_float_min = l ? data->last_start_min : data->pos_min;
774 1181149         data->offset_float_max = (l
775           ? data->last_start_max
776 4308182         : (data->pos_delta == SSize_t_MAX
777           ? SSize_t_MAX
778 2937189         : data->pos_min + data->pos_delta));
779 1371253         if (is_inf
780 4308182         || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
781 1693699         data->offset_float_max = SSize_t_MAX;
782 2614483         if (data->flags & SF_BEFORE_EOL)
783           data->flags
784 4308180         |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
785           else
786 4308178         data->flags &= ~SF_FL_BEFORE_EOL;
787 24403101         data->minlen_float=minlenp;
788 24403101         data->lookbehind_float=0;
789           }
790           }
791 24403259         SvCUR_set(data->last_found, 0);
792           {
793 1237694         SV * const sv = data->last_found;
794 1237694         if (SvUTF8(sv) && SvMAGICAL(sv)) {
795 1237550         MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
796 24403115         if (mg)
797 24403115         mg->mg_len = 0;
798           }
799           }
800 24403259         data->last_end = -1;
801 11174063         data->flags &= ~SF_BEFORE_EOL;
802 11174063         DEBUG_STUDYDATA("commit: ",data,0);
803 11174063         }
804            
805           /* These macros set, clear and test whether the synthetic start class ('ssc',
806           * given by the parameter) matches an empty string (EOS). This uses the
807           * 'next_off' field in the node, to save a bit in the flags field. The ssc
808           * stands alone, so there is never a next_off, so this field is otherwise
809           * unused. The EOS information is used only for compilation, but theoretically
810           * it could be passed on to the execution code. This could be used to store
811           * more than one bit of information, but only this one is currently used. */
812           #define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
813           #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
814           #define TEST_SSC_EOS(node) cBOOL((node)->next_off)
815            
816           /* Can match anything (initialization) */
817           STATIC void
818 11174083         S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
819           {
820 11174083         PERL_ARGS_ASSERT_CL_ANYTHING;
821            
822 322450         ANYOF_BITMAP_SETALL(cl);
823 322450         cl->flags = ANYOF_UNICODE_ALL;
824 10851817         SET_SSC_EOS(cl);
825            
826           /* If any portion of the regex is to operate under locale rules,
827           * initialization includes it. The reason this isn't done for all regexes
828           * is that the optimizer was written under the assumption that locale was
829           * all-or-nothing. Given the complexity and lack of documentation in the
830           * optimizer, and that there are inadequate test cases for locale, so many
831           * parts of it may not work properly, it is safest to avoid locale unless
832           * necessary. */
833 11174083         if (RExC_contains_locale) {
834 561264         ANYOF_CLASS_SETALL(cl); /* /l uses class */
835 9000016         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
836           }
837           else {
838 8472778         ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
839           }
840 527606         }
841            
842           /* Can match anything (initialization) */
843           STATIC int
844 289256         S_cl_is_anything(const struct regnode_charclass_class *cl)
845           {
846           int value;
847            
848 323810         PERL_ARGS_ASSERT_CL_IS_ANYTHING;
849            
850 11157999         for (value = 0; value < ANYOF_MAX; value += 2)
851 11157999         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
852           return 1;
853 11157999         if (!(cl->flags & ANYOF_UNICODE_ALL))
854           return 0;
855 11157999         if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
856           return 0;
857 11157999         return 1;
858           }
859            
860           /* Can match anything (initialization) */
861           STATIC void
862 2996302         S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
863           {
864 2996302         PERL_ARGS_ASSERT_CL_INIT;
865            
866           Zero(cl, 1, struct regnode_charclass_class);
867 2897626         cl->type = ANYOF;
868 2765006         cl_anything(pRExC_state, cl);
869 2765006         ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
870 2765006         }
871            
872           /* These two functions currently do the exact same thing */
873           #define cl_init_zero cl_init
874            
875           /* 'AND' a given class with another one. Can create false positives. 'cl'
876           * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
877           * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
878           STATIC void
879 2764828         S_cl_and(struct regnode_charclass_class *cl,
880           const struct regnode_charclass_class *and_with)
881           {
882 476246         PERL_ARGS_ASSERT_CL_AND;
883            
884 468742         assert(PL_regkind[and_with->type] == ANYOF);
885            
886           /* I (khw) am not sure all these restrictions are necessary XXX */
887 89415577         if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
888 88005574         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
889 2996124         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
890 47614         && !(and_with->flags & ANYOF_LOC_FOLD)
891 47614         && !(cl->flags & ANYOF_LOC_FOLD)) {
892           int i;
893            
894 47614         if (and_with->flags & ANYOF_INVERT)
895 47608         for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
896 0         cl->bitmap[i] &= ~and_with->bitmap[i];
897           else
898 47800         for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
899 3578         cl->bitmap[i] &= and_with->bitmap[i];
900           } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
901            
902 2948516         if (and_with->flags & ANYOF_INVERT) {
903            
904           /* Here, the and'ed node is inverted. Get the AND of the flags that
905           * aren't affected by the inversion. Those that are affected are
906           * handled individually below */
907 2389512         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
908 489192         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
909 489192         cl->flags |= affected_flags;
910            
911           /* We currently don't know how to deal with things that aren't in the
912           * bitmap, but we know that the intersection is no greater than what
913           * is already in cl, so let there be false positives that get sorted
914           * out after the synthetic start class succeeds, and the node is
915           * matched for real. */
916            
917           /* The inversion of these two flags indicate that the resulting
918           * intersection doesn't have them */
919 558998         if (and_with->flags & ANYOF_UNICODE_ALL) {
920 558996         cl->flags &= ~ANYOF_UNICODE_ALL;
921           }
922 558996         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
923 558996         cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
924           }
925           }
926           else { /* and'd node is not inverted */
927           U8 outside_bitmap_but_not_utf8; /* Temp variable */
928            
929 2948516         if (! ANYOF_NONBITMAP(and_with)) {
930            
931           /* Here 'and_with' doesn't match anything outside the bitmap
932           * (except possibly ANYOF_UNICODE_ALL), which means the
933           * intersection can't either, except for ANYOF_UNICODE_ALL, in
934           * which case we don't know what the intersection is, but it's no
935           * greater than what cl already has, so can just leave it alone,
936           * with possible false positives */
937 2948516         if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
938 2948510         ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
939 2996118         cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
940           }
941           }
942 2760550         else if (! ANYOF_NONBITMAP(cl)) {
943            
944           /* Here, 'and_with' does match something outside the bitmap, and cl
945           * doesn't have a list of things to match outside the bitmap. If
946           * cl can match all code points above 255, the intersection will
947           * be those above-255 code points that 'and_with' matches. If cl
948           * can't match all Unicode code points, it means that it can't
949           * match anything outside the bitmap (since the 'if' that got us
950           * into this block tested for that), so we leave the bitmap empty.
951           */
952 2760550         if (cl->flags & ANYOF_UNICODE_ALL) {
953 14920         ARG_SET(cl, ARG(and_with));
954            
955           /* and_with's ARG may match things that don't require UTF8.
956           * And now cl's will too, in spite of this being an 'and'. See
957           * the comments below about the kludge */
958 14920         cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
959           }
960           }
961           else {
962           /* Here, both 'and_with' and cl match something outside the
963           * bitmap. Currently we do not do the intersection, so just match
964           * whatever cl had at the beginning. */
965           }
966            
967            
968           /* Take the intersection of the two sets of flags. However, the
969           * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
970           * kludge around the fact that this flag is not treated like the others
971           * which are initialized in cl_anything(). The way the optimizer works
972           * is that the synthetic start class (SSC) is initialized to match
973           * anything, and then the first time a real node is encountered, its
974           * values are AND'd with the SSC's with the result being the values of
975           * the real node. However, there are paths through the optimizer where
976           * the AND never gets called, so those initialized bits are set
977           * inappropriately, which is not usually a big deal, as they just cause
978           * false positives in the SSC, which will just mean a probably
979           * imperceptible slow down in execution. However this bit has a
980           * higher false positive consequence in that it can cause utf8.pm,
981           * utf8_heavy.pl ... to be loaded when not necessary, which is a much
982           * bigger slowdown and also causes significant extra memory to be used.
983           * In order to prevent this, the code now takes a different tack. The
984           * bit isn't set unless some part of the regular expression needs it,
985           * but once set it won't get cleared. This means that these extra
986           * modules won't get loaded unless there was some path through the
987           * pattern that would have required them anyway, and so any false
988           * positives that occur by not ANDing them out when they could be
989           * aren't as severe as they would be if we treated this bit like all
990           * the others */
991 6         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
992           & ANYOF_NONBITMAP_NON_UTF8;
993 6         cl->flags &= and_with->flags;
994 6         cl->flags |= outside_bitmap_but_not_utf8;
995           }
996 6         }
997            
998           /* 'OR' a given class with another one. Can create false positives. 'cl'
999           * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
1000           * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
1001           STATIC void
1002 46         S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
1003           {
1004 46         PERL_ARGS_ASSERT_CL_OR;
1005            
1006 14966         if (or_with->flags & ANYOF_INVERT) {
1007            
1008           /* Here, the or'd node is to be inverted. This means we take the
1009           * complement of everything not in the bitmap, but currently we don't
1010           * know what that is, so give up and match anything */
1011 2745630         if (ANYOF_NONBITMAP(or_with)) {
1012 2745630         cl_anything(pRExC_state, cl);
1013           }
1014           /* We do not use
1015           * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
1016           * <= (B1 | !B2) | (CL1 | !CL2)
1017           * which is wasteful if CL2 is small, but we ignore CL2:
1018           * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
1019           * XXXX Can we handle case-fold? Unclear:
1020           * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
1021           * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1022           */
1023 1408133         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1024 89256193         && !(or_with->flags & ANYOF_LOC_FOLD)
1025 87860160         && !(cl->flags & ANYOF_LOC_FOLD) ) {
1026           int i;
1027            
1028 2745630         for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1029 24200         cl->bitmap[i] |= ~or_with->bitmap[i];
1030           } /* XXXX: logic is complicated otherwise */
1031           else {
1032 0         cl_anything(pRExC_state, cl);
1033           }
1034            
1035           /* And, we can just take the union of the flags that aren't affected
1036           * by the inversion */
1037 2745630         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1038            
1039           /* For the remaining flags:
1040           ANYOF_UNICODE_ALL and inverted means to not match anything above
1041           255, which means that the union with cl should just be
1042           what cl has in it, so can ignore this flag
1043           ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1044           is (ASCII) 127-255 to match them, but then invert that, so
1045           the union with cl should just be what cl has in it, so can
1046           ignore this flag
1047           */
1048           } else { /* 'or_with' is not inverted */
1049           /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1050 140606         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1051 140192         && (!(or_with->flags & ANYOF_LOC_FOLD)
1052 414         || (cl->flags & ANYOF_LOC_FOLD)) ) {
1053           int i;
1054            
1055           /* OR char bitmap and class bitmap separately */
1056 1886         for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1057 1472         cl->bitmap[i] |= or_with->bitmap[i];
1058 460         if (or_with->flags & ANYOF_CLASS) {
1059 2745630         ANYOF_CLASS_OR(or_with, cl);
1060           }
1061           }
1062           else { /* XXXX: logic is complicated, leave it along for a moment. */
1063 2760550         cl_anything(pRExC_state, cl);
1064           }
1065            
1066 630257         if (ANYOF_NONBITMAP(or_with)) {
1067            
1068           /* Use the added node's outside-the-bit-map match if there isn't a
1069           * conflict. If there is a conflict (both nodes match something
1070           * outside the bitmap, but what they match outside is not the same
1071           * pointer, and hence not easily compared until XXX we extend
1072           * inversion lists this far), give up and allow the start class to
1073           * match everything outside the bitmap. If that stuff is all above
1074           * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1075 630211         if (! ANYOF_NONBITMAP(cl)) {
1076 630211         ARG_SET(cl, ARG(or_with));
1077           }
1078 630211         else if (ARG(cl) != ARG(or_with)) {
1079            
1080 630211         if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1081 630211         cl_anything(pRExC_state, cl);
1082           }
1083           else {
1084 630211         cl->flags |= ANYOF_UNICODE_ALL;
1085           }
1086           }
1087           }
1088            
1089           /* Take the union */
1090 92314         cl->flags |= or_with->flags;
1091           }
1092 46         }
1093            
1094           #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1095           #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1096           #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1097           #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1098            
1099            
1100           #ifdef DEBUGGING
1101           /*
1102           dump_trie(trie,widecharmap,revcharmap)
1103           dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1104           dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1105            
1106           These routines dump out a trie in a somewhat readable format.
1107           The _interim_ variants are used for debugging the interim
1108           tables that are used to generate the final compressed
1109           representation which is what dump_trie expects.
1110            
1111           Part of the reason for their existence is to provide a form
1112           of documentation as to how the different representations function.
1113            
1114           */
1115            
1116           /*
1117           Dumps the final compressed table form of the trie to Perl_debug_log.
1118           Used for debugging make_trie().
1119           */
1120            
1121           STATIC void
1122 6         S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1123           AV *revcharmap, U32 depth)
1124           {
1125           U32 state;
1126 6         SV *sv=sv_newmortal();
1127 630217         int colwidth= widecharmap ? 6 : 4;
1128           U16 word;
1129 630217         GET_RE_DEBUG_FLAGS_DECL;
1130            
1131 630217         PERL_ARGS_ASSERT_DUMP_TRIE;
1132            
1133 630217         PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1134 630217         (int)depth * 2 + 2,"",
1135           "Match","Base","Ofs" );
1136            
1137 630311         for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1138 630305         SV ** const tmp = av_fetch( revcharmap, state, 0);
1139 538037         if ( tmp ) {
1140 630681         PerlIO_printf( Perl_debug_log, "%*s",
1141           colwidth,
1142 630681         pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1143           PL_colors[0], PL_colors[1],
1144           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1145           PERL_PV_ESCAPE_FIRSTCHAR
1146           )
1147           );
1148           }
1149           }
1150 630217         PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1151 6         (int)depth * 2 + 2,"");
1152            
1153 630311         for( state = 0 ; state < trie->uniquecharcount ; state++ )
1154 259262         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1155 2557967         PerlIO_printf( Perl_debug_log, "\n");
1156            
1157 1927846         for( state = 1 ; state < trie->statecount ; state++ ) {
1158 1927840         const U32 base = trie->states[ state ].trans.base;
1159            
1160 1927840         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1161            
1162 1927840         if ( trie->states[ state ].wordnum ) {
1163 1927800         PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1164           } else {
1165 5728         PerlIO_printf( Perl_debug_log, "%6s", "" );
1166           }
1167            
1168 5778         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1169            
1170 142         if ( base ) {
1171           U32 ofs = 0;
1172            
1173 842         while( ( base + ofs < trie->uniquecharcount ) ||
1174 358         ( base + ofs - trie->uniquecharcount < trie->lasttrans
1175 5942         && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1176 6074         ofs++;
1177            
1178 1922160         PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1179            
1180 1722742         for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1181 1723784         if ( ( base + ofs >= trie->uniquecharcount ) &&
1182 978878         ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1183 7175658         trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1184           {
1185 6198054         PerlIO_printf( Perl_debug_log, "%*"UVXf,
1186           colwidth,
1187 6198054         (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1188           } else {
1189 6198948         PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1190           }
1191           }
1192            
1193 287120         PerlIO_printf( Perl_debug_log, "]");
1194            
1195           }
1196 287164         PerlIO_printf( Perl_debug_log, "\n" );
1197           }
1198 118829         PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1199 79262         for (word=1; word <= trie->wordcount; word++) {
1200 207954         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1201 75074         (int)word, (int)(trie->wordinfo[word].prev),
1202 132880         (int)(trie->wordinfo[word].len));
1203           }
1204 132836         PerlIO_printf(Perl_debug_log, "\n" );
1205 5910800         }
1206           /*
1207           Dumps a fully constructed but uncompressed trie in list form.
1208           List tries normally only are used for construction when the number of
1209           possible chars (trie->uniquecharcount) is very high.
1210           Used for debugging make_trie().
1211           */
1212           STATIC void
1213 5910794         S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1214           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1215           U32 depth)
1216           {
1217           U32 state;
1218 5682834         SV *sv=sv_newmortal();
1219 227960         int colwidth= widecharmap ? 6 : 4;
1220 14588         GET_RE_DEBUG_FLAGS_DECL;
1221            
1222 213372         PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1223            
1224           /* print out the table precompression. */
1225 213372         PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1226 6197868         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1227           "------:-----+-----------------\n" );
1228          
1229 6050074         for( state=1 ; state < next_alloc ; state ++ ) {
1230           U16 charid;
1231          
1232 367052         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1233 367052         (int)depth * 2 + 2,"", (UV)state );
1234 258210         if ( ! trie->states[ state ].wordnum ) {
1235 258210         PerlIO_printf( Perl_debug_log, "%5s| ","");
1236           } else {
1237 6050074         PerlIO_printf( Perl_debug_log, "W%4x| ",
1238 3437251         trie->states[ state ].wordnum
1239           );
1240           }
1241 3437251         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1242 6050074         SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1243 1642398         if ( tmp ) {
1244 1642398         PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1245           colwidth,
1246 1642398         pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1247           PL_colors[0], PL_colors[1],
1248           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1249           PERL_PV_ESCAPE_FIRSTCHAR
1250           ) ,
1251 1642314         TRIE_LIST_ITEM(state,charid).forid,
1252 1220         (UV)TRIE_LIST_ITEM(state,charid).newstate
1253           );
1254 147794         if (!(charid % 10))
1255 35058         PerlIO_printf(Perl_debug_log, "\n%*s| ",
1256 147794         (int)((depth * 2) + 14), "");
1257           }
1258           }
1259 147794         PerlIO_printf( Perl_debug_log, "\n");
1260           }
1261 0         }
1262            
1263           /*
1264           Dumps a fully constructed but uncompressed trie in table form.
1265           This is the normal DFA style state transition table, with a few
1266           twists to facilitate compression later.
1267           Used for debugging make_trie().
1268           */
1269           STATIC void
1270 147794         S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1271           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1272           U32 depth)
1273           {
1274           U32 state;
1275           U16 charid;
1276 141522         SV *sv=sv_newmortal();
1277 141522         int colwidth= widecharmap ? 6 : 4;
1278 1922114         GET_RE_DEBUG_FLAGS_DECL;
1279            
1280 630211         PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1281          
1282           /*
1283           print out the table precompression so that we can do a visual check
1284           that they are identical.
1285           */
1286          
1287 630211         PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1288            
1289 1291903         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 259772         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1291 1032131         if ( tmp ) {
1292 175469         PerlIO_printf( Perl_debug_log, "%*s",
1293           colwidth,
1294 630211         pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1295           PL_colors[0], PL_colors[1],
1296           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1297           PERL_PV_ESCAPE_FIRSTCHAR
1298           )
1299           );
1300           }
1301           }
1302            
1303 630211         PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1304            
1305 630211         for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1306 4         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1307           }
1308            
1309 4         PerlIO_printf( Perl_debug_log, "\n" );
1310            
1311 4         for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1312            
1313 79008         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1314 79004         (int)depth * 2 + 2,"",
1315 79004         (UV)TRIE_NODENUM( state ) );
1316            
1317 79004         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1318 79004         UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1319 0         if (v)
1320 0         PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1321           else
1322 0         PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1323           }
1324 0         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1325 79004         PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1326           } else {
1327 197606         PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1328 79068         trie->states[ TRIE_NODENUM( state ) ].wordnum );
1329           }
1330           }
1331 79068         }
1332            
1333           #endif
1334            
1335            
1336           /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1337           startbranch: the first branch in the whole branch sequence
1338           first : start branch of sequence of branch-exact nodes.
1339           May be the same as startbranch
1340           last : Thing following the last branch.
1341           May be the same as tail.
1342           tail : item following the branch sequence
1343           count : words in the sequence
1344           flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1345           depth : indent depth
1346            
1347           Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1348            
1349           A trie is an N'ary tree where the branches are determined by digital
1350           decomposition of the key. IE, at the root node you look up the 1st character and
1351           follow that branch repeat until you find the end of the branches. Nodes can be
1352           marked as "accepting" meaning they represent a complete word. Eg:
1353            
1354           /he|she|his|hers/
1355            
1356           would convert into the following structure. Numbers represent states, letters
1357           following numbers represent valid transitions on the letter from that state, if
1358           the number is in square brackets it represents an accepting state, otherwise it
1359           will be in parenthesis.
1360            
1361           +-h->+-e->[3]-+-r->(8)-+-s->[9]
1362           | |
1363           | (2)
1364           | |
1365           (1) +-i->(6)-+-s->[7]
1366           |
1367           +-s->(3)-+-h->(4)-+-e->[5]
1368            
1369           Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1370            
1371           This shows that when matching against the string 'hers' we will begin at state 1
1372           read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1373           then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1374           is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1375           single traverse. We store a mapping from accepting to state to which word was
1376           matched, and then when we have multiple possibilities we try to complete the
1377           rest of the regex in the order in which they occured in the alternation.
1378            
1379           The only prior NFA like behaviour that would be changed by the TRIE support is
1380           the silent ignoring of duplicate alternations which are of the form:
1381            
1382           / (DUPE|DUPE) X? (?{ ... }) Y /x
1383            
1384           Thus EVAL blocks following a trie may be called a different number of times with
1385           and without the optimisation. With the optimisations dupes will be silently
1386           ignored. This inconsistent behaviour of EVAL type nodes is well established as
1387           the following demonstrates:
1388            
1389           'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1390            
1391           which prints out 'word' three times, but
1392            
1393           'words'=~/(word|word|word)(?{ print $1 })S/
1394            
1395           which doesnt print it out at all. This is due to other optimisations kicking in.
1396            
1397           Example of what happens on a structural level:
1398            
1399           The regexp /(ac|ad|ab)+/ will produce the following debug output:
1400            
1401           1: CURLYM[1] {1,32767}(18)
1402           5: BRANCH(8)
1403           6: EXACT (16)
1404           8: BRANCH(11)
1405           9: EXACT (16)
1406           11: BRANCH(14)
1407           12: EXACT (16)
1408           16: SUCCEED(0)
1409           17: NOTHING(18)
1410           18: END(0)
1411            
1412           This would be optimizable with startbranch=5, first=5, last=16, tail=16
1413           and should turn into:
1414            
1415           1: CURLYM[1] {1,32767}(18)
1416           5: TRIE(16)
1417           [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1418          
1419          
1420          
1421           16: SUCCEED(0)
1422           17: NOTHING(18)
1423           18: END(0)
1424            
1425           Cases where tail != last would be like /(?foo|bar)baz/:
1426            
1427           1: BRANCH(4)
1428           2: EXACT (8)
1429           4: BRANCH(7)
1430           5: EXACT (8)
1431           7: TAIL(8)
1432           8: EXACT (10)
1433           10: END(0)
1434            
1435           which would be optimizable with startbranch=1, first=1, last=7, tail=8
1436           and would end up looking like:
1437            
1438           1: TRIE(8)
1439           [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1440          
1441          
1442           7: TAIL(8)
1443           8: EXACT (10)
1444           10: END(0)
1445            
1446           d = uvchr_to_utf8_flags(d, uv, 0);
1447            
1448           is the recommended Unicode-aware way of saying
1449            
1450           *(d++) = uv;
1451           */
1452            
1453           #define TRIE_STORE_REVCHAR(val) \
1454           STMT_START { \
1455           if (UTF) { \
1456           SV *zlopp = newSV(7); /* XXX: optimize me */ \
1457           unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1458           unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1459           SvCUR_set(zlopp, kapow - flrbbbbb); \
1460           SvPOK_on(zlopp); \
1461           SvUTF8_on(zlopp); \
1462           av_push(revcharmap, zlopp); \
1463           } else { \
1464           char ooooff = (char)val; \
1465           av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1466           } \
1467           } STMT_END
1468            
1469           /* This gets the next character from the input, folding it if not already
1470           * folded. */
1471           #define TRIE_READ_CHAR STMT_START { \
1472           wordlen++; \
1473           if ( UTF ) { \
1474           /* if it is UTF then it is either already folded, or does not need \
1475           * folding */ \
1476           uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1477           } \
1478           else if (folder == PL_fold_latin1) { \
1479           /* This folder implies Unicode rules, which in the range expressible \
1480           * by not UTF is the lower case, with the two exceptions, one of \
1481           * which should have been taken care of before calling this */ \
1482           assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1483           uvc = toLOWER_L1(*uc); \
1484           if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1485           len = 1; \
1486           } else { \
1487           /* raw data, will be folded later if needed */ \
1488           uvc = (U32)*uc; \
1489           len = 1; \
1490           } \
1491           } STMT_END
1492            
1493            
1494            
1495           #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1496           if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1497           U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1498           Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1499           } \
1500           TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1501           TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1502           TRIE_LIST_CUR( state )++; \
1503           } STMT_END
1504            
1505           #define TRIE_LIST_NEW(state) STMT_START { \
1506           Newxz( trie->states[ state ].trans.list, \
1507           4, reg_trie_trans_le ); \
1508           TRIE_LIST_CUR( state ) = 1; \
1509           TRIE_LIST_LEN( state ) = 4; \
1510           } STMT_END
1511            
1512           #define TRIE_HANDLE_WORD(state) STMT_START { \
1513           U16 dupe= trie->states[ state ].wordnum; \
1514           regnode * const noper_next = regnext( noper ); \
1515           \
1516           DEBUG_r({ \
1517           /* store the word for dumping */ \
1518           SV* tmp; \
1519           if (OP(noper) != NOTHING) \
1520           tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1521           else \
1522           tmp = newSVpvn_utf8( "", 0, UTF ); \
1523           av_push( trie_words, tmp ); \
1524           }); \
1525           \
1526           curword++; \
1527           trie->wordinfo[curword].prev = 0; \
1528           trie->wordinfo[curword].len = wordlen; \
1529           trie->wordinfo[curword].accept = state; \
1530           \
1531           if ( noper_next < tail ) { \
1532           if (!trie->jump) \
1533           trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1534           trie->jump[curword] = (U16)(noper_next - convert); \
1535           if (!jumper) \
1536           jumper = noper_next; \
1537           if (!nextbranch) \
1538           nextbranch= regnext(cur); \
1539           } \
1540           \
1541           if ( dupe ) { \
1542           /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1543           /* chain, so that when the bits of chain are later */\
1544           /* linked together, the dups appear in the chain */\
1545           trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1546           trie->wordinfo[dupe].prev = curword; \
1547           } else { \
1548           /* we haven't inserted this word yet. */ \
1549           trie->states[ state ].wordnum = curword; \
1550           } \
1551           } STMT_END
1552            
1553            
1554           #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1555           ( ( base + charid >= ucharcount \
1556           && base + charid < ubound \
1557           && state == trie->trans[ base - ucharcount + charid ].check \
1558           && trie->trans[ base - ucharcount + charid ].next ) \
1559           ? trie->trans[ base - ucharcount + charid ].next \
1560           : ( state==1 ? special : 0 ) \
1561           )
1562            
1563           #define MADE_TRIE 1
1564           #define MADE_JUMP_TRIE 2
1565           #define MADE_EXACT_TRIE 4
1566            
1567           STATIC I32
1568 96         S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1569           {
1570           dVAR;
1571           /* first pass, loop through and scan words */
1572           reg_trie_data *trie;
1573           HV *widecharmap = NULL;
1574 78988         AV *revcharmap = newAV();
1575           regnode *cur;
1576 78988         const U32 uniflags = UTF8_ALLOW_DEFAULT;
1577 78988         STRLEN len = 0;
1578 79076         UV uvc = 0;
1579           U16 curword = 0;
1580           U32 next_alloc = 0;
1581           regnode *jumper = NULL;
1582           regnode *nextbranch = NULL;
1583           regnode *convert = NULL;
1584           U32 *prev_states; /* temp array mapping each state to previous one */
1585           /* we just use folder as a flag in utf8 */
1586           const U8 * folder = NULL;
1587            
1588           #ifdef DEBUGGING
1589 79076         const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1590           AV *trie_words = NULL;
1591           /* along with revcharmap, this only used during construction but both are
1592           * useful during debugging so we store them in the struct when debugging.
1593           */
1594           #else
1595           const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1596           STRLEN trie_charcount=0;
1597           #endif
1598           SV *re_trie_maxbuff;
1599 79076         GET_RE_DEBUG_FLAGS_DECL;
1600            
1601 72         PERL_ARGS_ASSERT_MAKE_TRIE;
1602           #ifndef DEBUGGING
1603           PERL_UNUSED_ARG(depth);
1604           #endif
1605            
1606 780243576         switch (flags) {
1607           case EXACT: break;
1608           case EXACTFA:
1609           case EXACTFU_SS:
1610 780164502         case EXACTFU: folder = PL_fold_latin1; break;
1611 0         case EXACTF: folder = PL_fold; break;
1612 0         case EXACTFL: folder = PL_fold_locale; break;
1613 79068         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1614           }
1615            
1616 79076         trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1617 79076         trie->refcount = 1;
1618 79076         trie->startstate = 1;
1619 79076         trie->wordcount = word_count;
1620 8         RExC_rxi->data->data[ data_slot ] = (void*)trie;
1621 79012         trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1622 12         if (flags == EXACT)
1623 10         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1624 12         trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1625           trie->wordcount+1, sizeof(reg_trie_wordinfo));
1626            
1627 12         DEBUG_r({
1628           trie_words = newAV();
1629           });
1630            
1631 12         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1632 79084         if (!SvIOK(re_trie_maxbuff)) {
1633 79072         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1634           }
1635 76         DEBUG_TRIE_COMPILE_r({
1636           PerlIO_printf( Perl_debug_log,
1637           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1638           (int)depth * 2 + 2, "",
1639           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1640           REG_NODE_NUM(last), REG_NODE_NUM(tail),
1641           (int)depth);
1642           });
1643          
1644           /* Find the node we are going to overwrite */
1645 79076         if ( first == startbranch && OP( last ) != BRANCH ) {
1646           /* whole branch chain */
1647           convert = first;
1648           } else {
1649           /* branch sub-chain */
1650 79000         convert = NEXTOPER( first );
1651           }
1652          
1653           /* -- First loop and Setup --
1654            
1655           We first traverse the branches and scan each word to determine if it
1656           contains widechars, and how many unique chars there are, this is
1657           important as we have to build a table with at least as many columns as we
1658           have unique chars.
1659            
1660           We use an array of integers to represent the character codes 0..255
1661           (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1662           native representation of the character value as the key and IV's for the
1663           coded index.
1664            
1665           *TODO* If we keep track of how many times each character is used we can
1666           remap the columns so that the table compression later on is more
1667           efficient in terms of memory by ensuring the most common value is in the
1668           middle and the least common are on the outside. IMO this would be better
1669           than a most to least common mapping as theres a decent chance the most
1670           common letter will share a node with the least common, meaning the node
1671           will not be compressible. With a middle is most common approach the worst
1672           case is when we have the least common nodes twice.
1673            
1674           */
1675            
1676 79064         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1677 79056         regnode *noper = NEXTOPER( cur );
1678 124         const U8 *uc = (U8*)STRING( noper );
1679 56         const U8 *e = uc + STR_LEN( noper );
1680           STRLEN foldlen = 0;
1681           U32 wordlen = 0; /* required init */
1682           STRLEN minbytes = 0;
1683           STRLEN maxbytes = 0;
1684 56         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1685            
1686 56         if (OP(noper) == NOTHING) {
1687 2         regnode *noper_next= regnext(noper);
1688 70         if (noper_next != tail && OP(noper_next) == flags) {
1689           noper = noper_next;
1690 68         uc= (U8*)STRING(noper);
1691 79064         e= uc + STR_LEN(noper);
1692 79060         trie->minlen= STR_LEN(noper);
1693           } else {
1694 30         trie->minlen= 0;
1695 30         continue;
1696           }
1697           }
1698            
1699 82         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1700 54         TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1701           regardless of encoding */
1702 90         if (OP( noper ) == EXACTFU_SS) {
1703           /* false positives are ok, so just set this */
1704 36         TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
1705           }
1706           }
1707 220         for ( ; uc < e ; uc += len ) {
1708 220         TRIE_CHARCOUNT(trie)++;
1709 79190         TRIE_READ_CHAR;
1710            
1711           /* Acummulate to the current values, the range in the number of
1712           * bytes that this character could match. The max is presumed to
1713           * be the same as the folded input (which TRIE_READ_CHAR returns),
1714           * except that when this is not in UTF-8, it could be matched
1715           * against a string which is UTF-8, and the variant characters
1716           * could be 2 bytes instead of the 1 here. Likewise, for the
1717           * minimum number of bytes when not folded. When folding, the min
1718           * is assumed to be 1 byte could fold to match the single character
1719           * here, or in the case of a multi-char fold, 1 byte can fold to
1720           * the whole sequence. 'foldlen' is used to denote whether we are
1721           * in such a sequence, skipping the min setting if so. XXX TODO
1722           * Use the exact list of what folds to each character, from
1723           * PL_utf8_foldclosures */
1724 79188         if (UTF) {
1725 79004         maxbytes += UTF8SKIP(uc);
1726 79004         if (! folder) {
1727           /* A non-UTF-8 string could be 1 byte to match our 2 */
1728 4         minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
1729           ? 1
1730 68         : UTF8SKIP(uc);
1731           }
1732           else {
1733 79072         if (foldlen) {
1734 4         foldlen -= UTF8SKIP(uc);
1735           }
1736           else {
1737 630207         foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
1738 630207         minbytes++;
1739           }
1740           }
1741           }
1742           else {
1743 630391         maxbytes += (UNI_IS_INVARIANT(*uc))
1744           ? 1
1745           : 2;
1746 630391         if (! folder) {
1747 630283         minbytes++;
1748           }
1749           else {
1750 2479061         if (foldlen) {
1751 1848746         foldlen--;
1752           }
1753           else {
1754 1848854         foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
1755 1848854         minbytes++;
1756           }
1757           }
1758           }
1759 1848930         if ( uvc < 256 ) {
1760 5872         if ( folder ) {
1761 5796         U8 folded= folder[ (U8) uvc ];
1762 160         if ( !trie->charmap[ folded ] ) {
1763 88         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1764 1848782         TRIE_STORE_REVCHAR( folded );
1765           }
1766           }
1767 10937261         if ( !trie->charmap[ uvc ] ) {
1768 6118870         trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1769 6118870         TRIE_STORE_REVCHAR( uvc );
1770           }
1771 6050170         if ( set_bit ) {
1772           /* store the codepoint in the bitmap, and its folded
1773           * equivalent. */
1774 68840         TRIE_BITMAP_SET(trie, uvc);
1775            
1776           /* store the folded codepoint */
1777 68840         if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1778            
1779 6118826         if ( !UTF ) {
1780           /* store first byte of utf8 representation of
1781           variant codepoints */
1782 6118826         if (! NATIVE_IS_INVARIANT(uvc)) {
1783 6118800         TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1784           }
1785           }
1786           set_bit = 0; /* We've done our bit :-) */
1787           }
1788           } else {
1789           SV** svpp;
1790 5825601         if ( !widecharmap )
1791 5825601         widecharmap = newHV();
1792            
1793 5825601         svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1794            
1795 5825601         if ( !svpp )
1796 5825601         Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1797            
1798 6118800         if ( !SvTRUE( *svpp ) ) {
1799 0         sv_setiv( *svpp, ++trie->uniquecharcount );
1800 1848746         TRIE_STORE_REVCHAR(uvc);
1801           }
1802           }
1803           }
1804 1848800         if( cur == first ) {
1805 630215         trie->minlen = minbytes;
1806 630215         trie->maxlen = maxbytes;
1807 7086061         } else if (minbytes < trie->minlen) {
1808 6455810         trie->minlen = minbytes;
1809 6455852         } else if (maxbytes > trie->maxlen) {
1810 6455810         trie->maxlen = maxbytes;
1811           }
1812           } /* end first pass */
1813 6455816         DEBUG_TRIE_COMPILE_r(
1814           PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1815           (int)depth * 2 + 2,"",
1816           ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1817           (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1818           (int)trie->minlen, (int)trie->maxlen )
1819           );
1820            
1821           /*
1822           We now know what we are dealing with in terms of unique chars and
1823           string sizes so we can calculate how much memory a naive
1824           representation using a flat table will take. If it's over a reasonable
1825           limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1826           conservative but potentially much slower representation using an array
1827           of lists.
1828            
1829           At the end we convert both representations into the same compressed
1830           form that will be used in regexec.c for matching with. The latter
1831           is a form that cannot be used to construct with but has memory
1832           properties similar to the list form and access properties similar
1833           to the table form making it both suitable for fast searches and
1834           small enough that its feasable to store for the duration of a program.
1835            
1836           See the comment in the code where the compressed table is produced
1837           inplace from the flat tabe representation for an explanation of how
1838           the compression works.
1839            
1840           */
1841            
1842            
1843 30925548         Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1844 28538549         prev_states[1] = 0;
1845            
1846 7486107         if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1847           /*
1848           Second Pass -- Array Of Lists Representation
1849            
1850           Each state will be represented by a list of charid:state records
1851           (reg_trie_trans_le) the first such element holds the CUR and LEN
1852           points of the allocated array. (See defines above).
1853            
1854           We build the initial structure using the lists, and then convert
1855           it into the compressed table form which allows faster lookups
1856           (but cant be modified once converted).
1857           */
1858            
1859           STRLEN transcount = 1;
1860            
1861 5825601         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1862           "%*sCompiling trie using list compiler\n",
1863           (int)depth * 2 + 2, ""));
1864            
1865 3434215         trie->states = (reg_trie_state *)
1866 2964532         PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1867           sizeof(reg_trie_state) );
1868 4068809         TRIE_LIST_NEW(1);
1869           next_alloc = 2;
1870            
1871 4068809         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1872            
1873 4068809         regnode *noper = NEXTOPER( cur );
1874 4068809         U8 *uc = (U8*)STRING( noper );
1875 1756792         const U8 *e = uc + STR_LEN( noper );
1876           U32 state = 1; /* required init */
1877           U16 charid = 0; /* sanity init */
1878           U32 wordlen = 0; /* required init */
1879            
1880 3417290         if (OP(noper) == NOTHING) {
1881 682779         regnode *noper_next= regnext(noper);
1882 3417290         if (noper_next != tail && OP(noper_next) == flags) {
1883           noper = noper_next;
1884 3417290         uc= (U8*)STRING(noper);
1885 3417290         e= uc + STR_LEN(noper);
1886           }
1887           }
1888            
1889 630207         if (OP(noper) != NOTHING) {
1890 630207         for ( ; uc < e ; uc += len ) {
1891            
1892 630207         TRIE_READ_CHAR;
1893            
1894 630211         if ( uvc < 256 ) {
1895 630211         charid = trie->charmap[ uvc ];
1896           } else {
1897 630211         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1898 630211         if ( !svpp ) {
1899           charid = 0;
1900           } else {
1901 259168         charid=(U16)SvIV( *svpp );
1902           }
1903           }
1904           /* charid is now 0 if we dont know the char read, or nonzero if we do */
1905 630211         if ( charid ) {
1906            
1907           U16 check;
1908           U32 newstate = 0;
1909            
1910 630211         charid--;
1911 193429         if ( !trie->states[ state ].trans.list ) {
1912 364702         TRIE_LIST_NEW( state );
1913           }
1914 364702         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1915 1237152         if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1916 2062410         newstate = TRIE_LIST_ITEM( state, check ).newstate;
1917 2062404         break;
1918           }
1919           }
1920 1387048         if ( ! newstate ) {
1921 706821         newstate = next_alloc++;
1922 348827         prev_states[newstate] = state;
1923 348827         TRIE_LIST_PUSH( state, charid, newstate );
1924 348827         transcount++;
1925           }
1926           state = newstate;
1927           } else {
1928 13335         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1929           }
1930           }
1931           }
1932 12815         TRIE_HANDLE_WORD(state);
1933            
1934           } /* end second pass */
1935            
1936           /* next alloc is the NEXT state to be allocated */
1937 12815         trie->statecount = next_alloc;
1938 11429         trie->states = (reg_trie_state *)
1939 11429         PerlMemShared_realloc( trie->states,
1940           next_alloc
1941           * sizeof(reg_trie_state) );
1942            
1943           /* and now dump it out before we compress it */
1944 11429         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1945           revcharmap, next_alloc,
1946           depth+1)
1947           );
1948            
1949 11429         trie->trans = (reg_trie_trans *)
1950 0         PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1951           {
1952           U32 state;
1953           U32 tp = 0;
1954           U32 zp = 0;
1955            
1956            
1957 13335         for( state=1 ; state < next_alloc ; state ++ ) {
1958           U32 base=0;
1959            
1960           /*
1961           DEBUG_TRIE_COMPILE_MORE_r(
1962           PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1963           );
1964           */
1965            
1966 13335         if (trie->states[state].trans.list) {
1967 0         U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1968           U16 maxid=minid;
1969           U16 idx;
1970            
1971 371329         for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1972 364702         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1973 16395         if ( forid < minid ) {
1974           minid=forid;
1975 16395         } else if ( forid > maxid ) {
1976           maxid=forid;
1977           }
1978           }
1979 16395         if ( transcount < tp + maxid - minid + 1) {
1980 12817         transcount *= 2;
1981 12817         trie->trans = (reg_trie_trans *)
1982 12817         PerlMemShared_realloc( trie->trans,
1983           transcount
1984           * sizeof(reg_trie_trans) );
1985 16395         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1986           }
1987 40630         base = trie->uniquecharcount + tp - minid;
1988 16397         if ( maxid == minid ) {
1989           U32 set = 0;
1990 348309         for ( ; zp < tp ; zp++ ) {
1991 348309         if ( ! trie->trans[ zp ].next ) {
1992 12817         base = trie->uniquecharcount + zp - minid;
1993 12817         trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1994 12817         trie->trans[ zp ].check = state;
1995           set = 1;
1996 12817         break;
1997           }
1998           }
1999 12817         if ( !set ) {
2000 12817         trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2001 2         trie->trans[ tp ].check = state;
2002 630211         tp++;
2003           zp = tp;
2004           }
2005           } else {
2006 630211         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2007 630209         const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2008 630209         trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2009 630209         trie->trans[ tid ].check = state;
2010           }
2011 191162         tp += ( maxid - minid + 1 );
2012           }
2013 630209         Safefree(trie->states[ state ].trans.list);
2014           }
2015           /*
2016           DEBUG_TRIE_COMPILE_MORE_r(
2017           PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2018           );
2019           */
2020 623187         trie->states[ state ].trans.base=base;
2021           }
2022 531169         trie->lasttrans = tp + 1;
2023           }
2024           } else {
2025           /*
2026           Second Pass -- Flat Table Representation.
2027            
2028           we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
2029           We know that we will need Charcount+1 trans at most to store the data
2030           (one row per char at worst case) So we preallocate both structures
2031           assuming worst case.
2032            
2033           We then construct the trie using only the .next slots of the entry
2034           structs.
2035            
2036           We use the .check field of the first entry of the node temporarily to
2037           make compression both faster and easier by keeping track of how many non
2038           zero fields are in the node.
2039            
2040           Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2041           transition.
2042            
2043           There are two terms at use here: state as a TRIE_NODEIDX() which is a
2044           number representing the first entry of the node, and state as a
2045           TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2046           TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2047           are 2 entrys per node. eg:
2048            
2049           A B A B
2050           1. 2 4 1. 3 7
2051           2. 0 3 3. 0 5
2052           3. 0 0 5. 0 0
2053           4. 0 0 7. 0 0
2054            
2055           The table is internally in the right hand, idx form. However as we also
2056           have to deal with the states array which is indexed by nodenum we have to
2057           use TRIE_NODENUM() to convert.
2058            
2059           */
2060 90126         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2061           "%*sCompiling trie using table compiler\n",
2062           (int)depth * 2 + 2, ""));
2063            
2064 90126         trie->trans = (reg_trie_trans *)
2065 90126         PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2066           * trie->uniquecharcount + 1,
2067           sizeof(reg_trie_trans) );
2068 90126         trie->states = (reg_trie_state *)
2069 540099         PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2070           sizeof(reg_trie_state) );
2071 630218         next_alloc = trie->uniquecharcount + 1;
2072            
2073            
2074 2247419         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2075            
2076 1927806         regnode *noper = NEXTOPER( cur );
2077 5990         const U8 *uc = (U8*)STRING( noper );
2078 1921872         const U8 *e = uc + STR_LEN( noper );
2079            
2080           U32 state = 1; /* required init */
2081            
2082           U16 charid = 0; /* sanity init */
2083           U32 accept_state = 0; /* sanity init */
2084            
2085           U32 wordlen = 0; /* required init */
2086            
2087 8727781         if (OP(noper) == NOTHING) {
2088 7782298         regnode *noper_next= regnext(noper);
2089 7782298         if (noper_next != tail && OP(noper_next) == flags) {
2090           noper = noper_next;
2091 6010852         uc= (U8*)STRING(noper);
2092 6010852         e= uc + STR_LEN(noper);
2093           }
2094           }
2095            
2096 1921872         if ( OP(noper) != NOTHING ) {
2097 630633         for ( ; uc < e ; uc += len ) {
2098            
2099 630395         TRIE_READ_CHAR;
2100            
2101 630395         if ( uvc < 256 ) {
2102 941001         charid = trie->charmap[ uvc ];
2103           } else {
2104 630211         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2105 25678         charid = svpp ? (U16)SvIV(*svpp) : 0;
2106           }
2107 25862         if ( charid ) {
2108 25862         charid--;
2109 25862         if ( !trie->trans[ state + charid ].next ) {
2110 25778         trie->trans[ state + charid ].next = next_alloc;
2111 25778         trie->trans[ state ].check++;
2112 25778         prev_states[TRIE_NODENUM(next_alloc)]
2113 25778         = TRIE_NODENUM(state);
2114 25778         next_alloc += trie->uniquecharcount;
2115           }
2116 25862         state = trie->trans[ state + charid ].next;
2117           } else {
2118 25678         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2119           }
2120           /* charid is now 0 if we dont know the char read, or nonzero if we do */
2121           }
2122           }
2123 25734         accept_state = TRIE_NODENUM( state );
2124 25734         TRIE_HANDLE_WORD(accept_state);
2125            
2126           } /* end second pass */
2127            
2128           /* and now dump it out before we compress it */
2129 25686         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2130           revcharmap,
2131           next_alloc, depth+1));
2132            
2133           {
2134           /*
2135           * Inplace compress the table.*
2136            
2137           For sparse data sets the table constructed by the trie algorithm will
2138           be mostly 0/FAIL transitions or to put it another way mostly empty.
2139           (Note that leaf nodes will not contain any transitions.)
2140            
2141           This algorithm compresses the tables by eliminating most such
2142           transitions, at the cost of a modest bit of extra work during lookup:
2143            
2144           - Each states[] entry contains a .base field which indicates the
2145           index in the state[] array wheres its transition data is stored.
2146            
2147           - If .base is 0 there are no valid transitions from that node.
2148            
2149           - If .base is nonzero then charid is added to it to find an entry in
2150           the trans array.
2151            
2152           -If trans[states[state].base+charid].check!=state then the
2153           transition is taken to be a 0/Fail transition. Thus if there are fail
2154           transitions at the front of the node then the .base offset will point
2155           somewhere inside the previous nodes data (or maybe even into a node
2156           even earlier), but the .check field determines if the transition is
2157           valid.
2158            
2159           XXX - wrong maybe?
2160           The following process inplace converts the table to the compressed
2161           table: We first do not compress the root node 1,and mark all its
2162           .check pointers as 1 and set its .base pointer as 1 as well. This
2163           allows us to do a DFA construction from the compressed table later,
2164           and ensures that any .base pointers we calculate later are greater
2165           than 0.
2166            
2167           - We set 'pos' to indicate the first entry of the second node.
2168            
2169           - We then iterate over the columns of the node, finding the first and
2170           last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2171           and set the .check pointers accordingly, and advance pos
2172           appropriately and repreat for the next node. Note that when we copy
2173           the next pointers we have to convert them from the original
2174           NODEIDX form to NODENUM form as the former is not valid post
2175           compression.
2176            
2177           - If a node has no transitions used we mark its base as 0 and do not
2178           advance the pos pointer.
2179            
2180           - If a node only has one transition we use a second pointer into the
2181           structure to fill in allocated fail transitions from other states.
2182           This pointer is independent of the main pointer and scans forward
2183           looking for null transitions that are allocated to a state. When it
2184           finds one it writes the single transition into the "hole". If the
2185           pointer doesnt find one the single transition is appended as normal.
2186            
2187           - Once compressed we can Renew/realloc the structures to release the
2188           excess space.
2189            
2190           See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2191           specifically Fig 3.47 and the associated pseudocode.
2192            
2193           demq
2194           */
2195 25686         const U32 laststate = TRIE_NODENUM( next_alloc );
2196           U32 state, charid;
2197           U32 pos = 0, zp=0;
2198 25686         trie->statecount = laststate;
2199            
2200 25794         for ( state = 1 ; state < laststate ; state++ ) {
2201           U8 flag = 0;
2202 25786         const U32 stateidx = TRIE_NODEIDX( state );
2203 25786         const U32 o_used = trie->trans[ stateidx ].check;
2204 330712         U32 used = trie->trans[ stateidx ].check;
2205 305034         trie->trans[ stateidx ].check = 0;
2206            
2207 305684         for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2208 145700         if ( flag || trie->trans[ stateidx + charid ].next ) {
2209 145242         if ( trie->trans[ stateidx + charid ].next ) {
2210 436852         if (o_used == 1) {
2211 411092         for ( ; zp < pos ; zp++ ) {
2212 411122         if ( ! trie->trans[ zp ].next ) {
2213           break;
2214           }
2215           }
2216 1566276129         trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2217 1565852036         trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2218 1565852036         trie->trans[ zp ].check = state;
2219 310038         if ( ++zp > pos ) pos = zp;
2220           break;
2221           }
2222 310058         used--;
2223           }
2224 310190         if ( !flag ) {
2225           flag = 1;
2226 266082         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2227           }
2228 266256         trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2229 266256         trie->trans[ pos ].check = state;
2230 216         pos++;
2231           }
2232           }
2233           }
2234 266072         trie->lasttrans = pos + 1;
2235 25686         trie->states = (reg_trie_state *)
2236 25686         PerlMemShared_realloc( trie->states, laststate
2237           * sizeof(reg_trie_state) );
2238 25686         DEBUG_TRIE_COMPILE_MORE_r(
2239           PerlIO_printf( Perl_debug_log,
2240           "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2241           (int)depth * 2 + 2,"",
2242           (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2243           (IV)next_alloc,
2244           (IV)pos,
2245           ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2246           );
2247            
2248           } /* end table compress */
2249           }
2250 14511669         DEBUG_TRIE_COMPILE_MORE_r(
2251           PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2252           (int)depth * 2 + 2, "",
2253           (UV)trie->statecount,
2254           (UV)trie->lasttrans)
2255           );
2256           /* resize the trans array to remove unused space */
2257 14511669         trie->trans = (reg_trie_trans *)
2258 14511669         PerlMemShared_realloc( trie->trans, trie->lasttrans
2259           * sizeof(reg_trie_trans) );
2260            
2261           { /* Modify the program and insert the new TRIE node */
2262 22779147         U8 nodetype =(U8)(flags & 0xFF);
2263           char *str=NULL;
2264          
2265           #ifdef DEBUGGING
2266           regnode *optimize = NULL;
2267           #ifdef RE_TRACK_PATTERN_OFFSETS
2268            
2269           U32 mjd_offset = 0;
2270           U32 mjd_nodelen = 0;
2271           #endif /* RE_TRACK_PATTERN_OFFSETS */
2272           #endif /* DEBUGGING */
2273           /*
2274           This means we convert either the first branch or the first Exact,
2275           depending on whether the thing following (in 'last') is a branch
2276           or not and whther first is the startbranch (ie is it a sub part of
2277           the alternation or is it the whole thing.)
2278           Assuming its a sub part we convert the EXACT otherwise we convert
2279           the whole branch sequence, including the first.
2280           */
2281           /* Find the node we are going to overwrite */
2282 13896921         if ( first != startbranch || OP( last ) == BRANCH ) {
2283           /* branch sub-chain */
2284 12795297         NEXT_OFF( first ) = (U16)(last - first);
2285           #ifdef RE_TRACK_PATTERN_OFFSETS
2286 1111076         DEBUG_r({
2287           mjd_offset= Node_Offset((convert));
2288           mjd_nodelen= Node_Length((convert));
2289           });
2290           #endif
2291           /* whole branch chain */
2292           }
2293           #ifdef RE_TRACK_PATTERN_OFFSETS
2294           else {
2295 1111084         DEBUG_r({
2296           const regnode *nop = NEXTOPER( convert );
2297           mjd_offset= Node_Offset((nop));
2298           mjd_nodelen= Node_Length((nop));
2299           });
2300           }
2301 1111084         DEBUG_OPTIMISE_r(
2302           PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2303           (int)depth * 2 + 2, "",
2304           (UV)mjd_offset, (UV)mjd_nodelen)
2305           );
2306           #endif
2307           /* But first we check to see if there is a common prefix we can
2308           split out as an EXACT and put in front of the TRIE node. */
2309 1111084         trie->startstate= 1;
2310 1101624         if ( trie->bitmap && !widecharmap && !trie->jump ) {
2311           U32 state;
2312 1101622         for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2313           U32 ofs = 0;
2314           I32 idx = -1;
2315           U32 count = 0;
2316 1101628         const U32 base = trie->states[ state ].trans.base;
2317            
2318 9472         if ( trie->states[state].wordnum )
2319           count = 1;
2320            
2321 9488         for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2322 9556         if ( ( base + ofs >= trie->uniquecharcount ) &&
2323 9556         ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2324 9050         trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2325           {
2326 9012         if ( ++count > 1 ) {
2327 9002         SV **tmp = av_fetch( revcharmap, ofs, 0);
2328 9002         const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2329 14511677         if ( state == 1 ) break;
2330 14511673         if ( count == 2 ) {
2331 14511663         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2332 1832660         DEBUG_OPTIMISE_r(
2333           PerlIO_printf(Perl_debug_log,
2334           "%*sNew Start State=%"UVuf" Class: [",
2335           (int)depth * 2 + 2, "",
2336           (UV)state));
2337 1832660         if (idx >= 0) {
2338 1832660         SV ** const tmp = av_fetch( revcharmap, idx, 0);
2339 1327376         const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2340            
2341 529534         TRIE_BITMAP_SET(trie,*ch);
2342 529534         if ( folder )
2343 303114         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2344 303116         DEBUG_OPTIMISE_r(
2345           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2346           );
2347           }
2348           }
2349 226430         TRIE_BITMAP_SET(trie,*ch);
2350 48448         if ( folder )
2351 47878         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2352 46774         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2353           }
2354 46784         idx = ofs;
2355           }
2356           }
2357 46774         if ( count == 1 ) {
2358 179662         SV **tmp = av_fetch( revcharmap, idx, 0);
2359           STRLEN len;
2360 269490         char *ch = SvPV( *tmp, len );
2361 179662         DEBUG_OPTIMISE_r({
2362           SV *sv=sv_newmortal();
2363           PerlIO_printf( Perl_debug_log,
2364           "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2365           (int)depth * 2 + 2, "",
2366           (UV)state, (UV)idx,
2367           pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2368           PL_colors[0], PL_colors[1],
2369           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2370           PERL_PV_ESCAPE_FIRSTCHAR
2371           )
2372           );
2373           });
2374 158640         if ( state==1 ) {
2375 137614         OP( convert ) = nodetype;
2376 137614         str=STRING(convert);
2377 104372         STR_LEN(convert)=0;
2378           }
2379 73492         STR_LEN(convert) += len;
2380 11178         while (len--)
2381 11166         *str++ = *ch++;
2382           } else {
2383           #ifdef DEBUGGING
2384 62332         if (state>1)
2385 62328         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2386           #endif
2387           break;
2388           }
2389           }
2390 372401         trie->prefixlen = (state-1);
2391 1034822         if (str) {
2392 100918         regnode *n = convert+NODE_SZ_STR(convert);
2393 64648         NEXT_OFF(convert) = NODE_SZ_STR(convert);
2394 17018         trie->startstate = state;
2395 17018         trie->minlen -= (state - 1);
2396 17018         trie->maxlen -= (state - 1);
2397           #ifdef DEBUGGING
2398           /* At least the UNICOS C compiler choked on this
2399           * being argument to DEBUG_r(), so let's just have
2400           * it right here. */
2401           if (
2402           #ifdef PERL_EXT_RE_BUILD
2403           1
2404           #else
2405           DEBUG_r_TEST
2406           #endif
2407           ) {
2408           regnode *fix = convert;
2409 47632         U32 word = trie->wordcount;
2410 47632         mjd_nodelen++;
2411 981532         Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2412 526450         while( ++fix < n ) {
2413 1994023         Set_Node_Offset_Length(fix, 0, 0);
2414           }
2415 1224706         while (word--) {
2416 1224704         SV ** const tmp = av_fetch( trie_words, word, 0 );
2417 1172530         if (tmp) {
2418 18088         if ( STR_LEN(convert) <= SvCUR(*tmp) )
2419 1172530         sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2420           else
2421 1172518         sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2422           }
2423           }
2424           }
2425           #endif
2426 52176         if (trie->maxlen) {
2427           convert = n;
2428           } else {
2429 49618         NEXT_OFF(convert) = (U16)(tail - convert);
2430           DEBUG_r(optimize= n);
2431           }
2432           }
2433           }
2434 46010         if (!jumper)
2435           jumper = last;
2436 41578         if ( trie->maxlen ) {
2437 39484         NEXT_OFF( convert ) = (U16)(tail - convert);
2438 52182         ARG_SET( convert, data_slot );
2439           /* Store the offset to the first unabsorbed branch in
2440           jump[0], which is otherwise unused by the jump logic.
2441           We use this when dumping a trie and during optimisation. */
2442 703241         if (trie->jump)
2443 14511661         trie->jump[0] = (U16)(nextbranch - convert);
2444          
2445           /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2446           * and there is a bitmap
2447           * and the first "jump target" node we found leaves enough room
2448           * then convert the TRIE node into a TRIEC node, with the bitmap
2449           * embedded inline in the opcode - this is hypothetically faster.
2450           */
2451 17472848         if ( !trie->states[trie->startstate].wordnum
2452 17472846         && trie->bitmap
2453 17472844         && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2454           {
2455 17472842         OP( convert ) = TRIEC;
2456 17472842         Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2457 21805354         PerlMemShared_free(trie->bitmap);
2458 9141715         trie->bitmap= NULL;
2459           } else
2460 643351         OP( convert ) = TRIE;
2461            
2462           /* store the type in the flags */
2463 68931023         convert->flags = nodetype;
2464 53853883         DEBUG_r({
2465           optimize = convert
2466           + NODE_STEP_REGNODE
2467           + regarglen[ OP( convert ) ];
2468           });
2469           /* XXX We really should free up the resource in trie now,
2470           as we won't use them - (which resources?) dmq */
2471           }
2472           /* needed for dumping*/
2473 53853883         DEBUG_r(if (optimize) {
2474           regnode *opt = convert;
2475            
2476           while ( ++opt < optimize) {
2477           Set_Node_Offset_Length(opt,0,0);
2478           }
2479           /*
2480           Try to clean up some of the debris left after the
2481           optimisation.
2482           */
2483           while( optimize < jumper ) {
2484           mjd_nodelen += Node_Length((optimize));
2485           OP( optimize ) = OPTIMIZED;
2486           Set_Node_Offset_Length(optimize,0,0);
2487           optimize++;
2488           }
2489           Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2490           });
2491           } /* end node insert */
2492            
2493           /* Finish populating the prev field of the wordinfo array. Walk back
2494           * from each accept state until we find another accept state, and if
2495           * so, point the first word's .prev field at the second word. If the
2496           * second already has a .prev field set, stop now. This will be the
2497           * case either if we've already processed that word's accept state,
2498           * or that state had multiple words, and the overspill words were
2499           * already linked up earlier.
2500           */
2501           {
2502           U16 word;
2503           U32 state;
2504           U16 prev;
2505            
2506 53853931         for (word=1; word <= trie->wordcount; word++) {
2507           prev = 0;
2508 53853931         if (trie->wordinfo[word].prev)
2509 102878242         continue;
2510 51439177         state = trie->wordinfo[word].accept;
2511 51439377         while (state) {
2512 51439321         state = prev_states[state];
2513 51439321         if (!state)
2514           break;
2515 77481289         prev = trie->states[state].wordnum;
2516 46795196         if (prev)
2517           break;
2518           }
2519 46051257         trie->wordinfo[word].prev = prev;
2520           }
2521 747025         Safefree(prev_states);
2522           }
2523            
2524            
2525           /* and now dump out the compressed format */
2526 747025         DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2527            
2528 51439129         RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2529           #ifdef DEBUGGING
2530 238911         RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2531 51200226         RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2532           #else
2533           SvREFCNT_dec_NN(revcharmap);
2534           #endif
2535 53853891         return trie->jump
2536           ? MADE_JUMP_TRIE
2537 52728374         : trie->startstate>1
2538           ? MADE_EXACT_TRIE
2539           : MADE_TRIE;
2540           }
2541            
2542           STATIC void
2543 1129043         S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2544           {
2545           /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2546            
2547           This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2548           "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2549           ISBN 0-201-10088-6
2550            
2551           We find the fail state for each state in the trie, this state is the longest proper
2552           suffix of the current state's 'word' that is also a proper prefix of another word in our
2553           trie. State 1 represents the word '' and is thus the default fail state. This allows
2554           the DFA not to have to restart after its tried and failed a word at a given point, it
2555           simply continues as though it had been matching the other word in the first place.
2556           Consider
2557           'abcdgu'=~/abcdefg|cdgu/
2558           When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2559           fail, which would bring us to the state representing 'd' in the second word where we would
2560           try 'g' and succeed, proceeding to match 'cdgu'.
2561           */
2562           /* add a fail transition */
2563 1129043         const U32 trie_offset = ARG(source);
2564 1129043         reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2565           U32 *q;
2566 1129043         const U32 ucharcount = trie->uniquecharcount;
2567 1129043         const U32 numstates = trie->statecount;
2568 723937         const U32 ubound = trie->lasttrans + ucharcount;
2569           U32 q_read = 0;
2570           U32 q_write = 0;
2571           U32 charid;
2572 1129043         U32 base = trie->states[ 1 ].trans.base;
2573           U32 *fail;
2574           reg_ac_data *aho;
2575 929480         const U32 data_slot = add_data( pRExC_state, 1, "T" );
2576 4607645         GET_RE_DEBUG_FLAGS_DECL;
2577            
2578 3478606         PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2579           #ifndef DEBUGGING
2580           PERL_UNUSED_ARG(depth);
2581           #endif
2582            
2583            
2584 3478606         ARG_SET( stclass, data_slot );
2585 3478606         aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2586 3468846         RExC_rxi->data->data[ data_slot ] = (void*)aho;
2587 3468846         aho->trie=trie_offset;
2588 9764         aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2589 3478606         Copy( trie->states, aho->states, numstates, reg_trie_state );
2590 3478606         Newxz( q, numstates, U32);
2591 3478606         aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2592 3478606         aho->refcount = 1;
2593 5398         fail = aho->fail;
2594           /* initialize fail[0..1] to be 1 so that we always have
2595           a valid final fail state */
2596 3478606         fail[ 0 ] = fail[ 1 ] = 1;
2597            
2598 2227890         for ( charid = 0; charid < ucharcount ; charid++ ) {
2599 2227886         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2600 3478684         if ( newstate ) {
2601 3413278         q[ q_write ] = newstate;
2602           /* set to point at the root */
2603 3478608         fail[ q[ q_write++ ] ]=1;
2604           }
2605           }
2606 3478678         while ( q_read < q_write) {
2607 3478674         const U32 cur = q[ q_read++ % numstates ];
2608 3130034         base = trie->states[ cur ].trans.base;
2609            
2610 1334762         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2611 3480658         const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2612 3480658         if (ch_state) {
2613           U32 fail_state = cur;
2614           U32 fail_base;
2615           do {
2616 182392         fail_state = fail[ fail_state ];
2617 3478668         fail_base = aho->states[ fail_state ].trans.base;
2618 182         } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2619            
2620 182         fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2621 182         fail[ ch_state ] = fail_state;
2622 182         if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2623           {
2624 116         aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2625           }
2626 3478668         q[ q_write++ % numstates] = ch_state;
2627           }
2628           }
2629           }
2630           /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2631           when we fail in state 1, this allows us to use the
2632           charclass scan to find a valid start char. This is based on the principle
2633           that theres a good chance the string being searched contains lots of stuff
2634           that cant be a start char.
2635           */
2636 3468846         fail[ 0 ] = fail[ 1 ] = 0;
2637 210992         DEBUG_TRIE_COMPILE_r({
2638           PerlIO_printf(Perl_debug_log,
2639           "%*sStclass Failtable (%"UVuf" states): 0",
2640           (int)(depth * 2), "", (UV)numstates
2641           );
2642           for( q_read=1; q_read
2643           PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2644           }
2645           PerlIO_printf(Perl_debug_log, "\n");
2646           });
2647 3468846         Safefree(q);
2648           /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2649 3478606         }
2650            
2651            
2652           /*
2653           * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2654           * These need to be revisited when a newer toolchain becomes available.
2655           */
2656           #if defined(__sparc64__) && defined(__GNUC__)
2657           # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2658           # undef SPARC64_GCC_WORKAROUND
2659           # define SPARC64_GCC_WORKAROUND 1
2660           # endif
2661           #endif
2662            
2663           #define DEBUG_PEEP(str,scan,depth) \
2664           DEBUG_OPTIMISE_r({if (scan){ \
2665           SV * const mysv=sv_newmortal(); \
2666           regnode *Next = regnext(scan); \
2667           regprop(RExC_rx, mysv, scan); \
2668           PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2669           (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2670           Next ? (REG_NODE_NUM(Next)) : 0 ); \
2671           }});
2672            
2673            
2674           /* The below joins as many adjacent EXACTish nodes as possible into a single
2675           * one. The regop may be changed if the node(s) contain certain sequences that
2676           * require special handling. The joining is only done if:
2677           * 1) there is room in the current conglomerated node to entirely contain the
2678           * next one.
2679           * 2) they are the exact same node type
2680           *
2681           * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2682           * these get optimized out
2683           *
2684           * If a node is to match under /i (folded), the number of characters it matches
2685           * can be different than its character length if it contains a multi-character
2686           * fold. *min_subtract is set to the total delta of the input nodes.
2687           *
2688           * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2689           * and contains LATIN SMALL LETTER SHARP S
2690           *
2691           * This is as good a place as any to discuss the design of handling these
2692           * multi-character fold sequences. It's been wrong in Perl for a very long
2693           * time. There are three code points in Unicode whose multi-character folds
2694           * were long ago discovered to mess things up. The previous designs for
2695           * dealing with these involved assigning a special node for them. This
2696           * approach doesn't work, as evidenced by this example:
2697           * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2698           * Both these fold to "sss", but if the pattern is parsed to create a node that
2699           * would match just the \xDF, it won't be able to handle the case where a
2700           * successful match would have to cross the node's boundary. The new approach
2701           * that hopefully generally solves the problem generates an EXACTFU_SS node
2702           * that is "sss".
2703           *
2704           * It turns out that there are problems with all multi-character folds, and not
2705           * just these three. Now the code is general, for all such cases. The
2706           * approach taken is:
2707           * 1) This routine examines each EXACTFish node that could contain multi-
2708           * character fold sequences. It returns in *min_subtract how much to
2709           * subtract from the the actual length of the string to get a real minimum
2710           * match length; it is 0 if there are no multi-char folds. This delta is
2711           * used by the caller to adjust the min length of the match, and the delta
2712           * between min and max, so that the optimizer doesn't reject these
2713           * possibilities based on size constraints.
2714           * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2715           * is used for an EXACTFU node that contains at least one "ss" sequence in
2716           * it. For non-UTF-8 patterns and strings, this is the only case where
2717           * there is a possible fold length change. That means that a regular
2718           * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2719           * with length changes, and so can be processed faster. regexec.c takes
2720           * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2721           * pre-folded by regcomp.c. This saves effort in regex matching.
2722           * However, the pre-folding isn't done for non-UTF8 patterns because the
2723           * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2724           * down by forcing the pattern into UTF8 unless necessary. Also what
2725           * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2726           * possibilities for the non-UTF8 patterns are quite simple, except for
2727           * the sharp s. All the ones that don't involve a UTF-8 target string are
2728           * members of a fold-pair, and arrays are set up for all of them so that
2729           * the other member of the pair can be found quickly. Code elsewhere in
2730           * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2731           * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2732           * described in the next item.
2733           * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
2734           * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2735           * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
2736           * (probably unwittingly, in Perl_regexec_flags()) makes is that a
2737           * character in the pattern corresponds to at most a single character in
2738           * the target string. (And I do mean character, and not byte here, unlike
2739           * other parts of the documentation that have never been updated to
2740           * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
2741           * two character string 'ss'; in EXACTFA nodes it can match
2742           * "\x{17F}\x{17F}". These violate the assumption, and they are the only
2743           * instances where it is violated. I'm reluctant to try to change the
2744           * assumption, as the code involved is impenetrable to me (khw), so
2745           * instead the code here punts. This routine examines (when the pattern
2746           * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2747           * boolean indicating whether or not the node contains a sharp s. When it
2748           * is true, the caller sets a flag that later causes the optimizer in this
2749           * file to not set values for the floating and fixed string lengths, and
2750           * thus avoids the optimizer code in regexec.c that makes the invalid
2751           * assumption. Thus, there is no optimization based on string lengths for
2752           * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2753           * (The reason the assumption is wrong only in these two cases is that all
2754           * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
2755           * other folds to their expanded versions. We can't prefold sharp s to
2756           * 'ss' in EXACTF nodes because we don't know at compile time if it
2757           * actually matches 'ss' or not. It will match iff the target string is
2758           * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
2759           * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
2760           * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
2761           * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
2762           * require the pattern to be forced into UTF-8, the overhead of which we
2763           * want to avoid.)
2764           *
2765           * Similarly, the code that generates tries doesn't currently handle
2766           * not-already-folded multi-char folds, and it looks like a pain to change
2767           * that. Therefore, trie generation of EXACTFA nodes with the sharp s
2768           * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
2769           * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
2770           * using /iaa matching will be doing so almost entirely with ASCII
2771           * strings, so this should rarely be encountered in practice */
2772            
2773           #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2774           if (PL_regkind[OP(scan)] == EXACT) \
2775           join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2776            
2777           STATIC U32
2778 2862741         S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2779           /* Merge several consecutive EXACTish nodes into one. */
2780 1129217         regnode *n = regnext(scan);
2781           U32 stringok = 1;
2782 1129217         regnode *next = scan + NODE_SZ_STR(scan);
2783           U32 merged = 0;
2784           U32 stopnow = 0;
2785           #ifdef DEBUGGING
2786           regnode *stop = scan;
2787 723995         GET_RE_DEBUG_FLAGS_DECL;
2788           #else
2789           PERL_UNUSED_ARG(depth);
2790           #endif
2791            
2792 723995         PERL_ARGS_ASSERT_JOIN_EXACT;
2793           #ifndef EXPERIMENTAL_INPLACESCAN
2794           PERL_UNUSED_ARG(flags);
2795           PERL_UNUSED_ARG(val);
2796           #endif
2797 122836         DEBUG_PEEP("join",scan,depth);
2798            
2799           /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2800           * EXACT ones that are mergeable to the current one. */
2801 601377         while (n
2802 724035         && (PL_regkind[OP(n)] == NOTHING
2803 650941         || (stringok && OP(n) == OP(scan)))
2804 1129079         && NEXT_OFF(n)
2805 1129079         && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2806           {
2807          
2808 974993         if (OP(n) == TAIL || n > next)
2809           stringok = 0;
2810 969927         if (PL_regkind[OP(n)] == NOTHING) {
2811 1129051         DEBUG_PEEP("skip:",n,depth);
2812 136894         NEXT_OFF(scan) += NEXT_OFF(n);
2813 136894         next = n + NODE_STEP_REGNODE;
2814           #ifdef DEBUGGING
2815 126964         if (stringok)
2816           stop = n;
2817           #endif
2818 126964         n = regnext(n);
2819           }
2820 992185         else if (stringok) {
2821 586219         const unsigned int oldl = STR_LEN(scan);
2822 413109         regnode * const nnext = regnext(n);
2823            
2824           /* XXX I (khw) kind of doubt that this works on platforms where
2825           * U8_MAX is above 255 because of lots of other assumptions */
2826           /* Don't join if the sum can't fit into a single node */
2827 413109         if (oldl + STR_LEN(n) > U8_MAX)
2828           break;
2829          
2830 173138         DEBUG_PEEP("merg",n,depth);
2831 173138         merged++;
2832            
2833 173138         NEXT_OFF(scan) += NEXT_OFF(n);
2834 173138         STR_LEN(scan) += STR_LEN(n);
2835 173138         next = n + NODE_SZ_STR(n);
2836           /* Now we can overwrite *n : */
2837 173138         Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2838           #ifdef DEBUGGING
2839 1129067         stop = next - 1;
2840           #endif
2841           n = nnext;
2842           if (stopnow) break;
2843           }
2844            
2845           #ifdef EXPERIMENTAL_INPLACESCAN
2846           if (flags && !NEXT_OFF(n)) {
2847           DEBUG_PEEP("atch", val, depth);
2848           if (reg_off_by_arg[OP(n)]) {
2849           ARG_SET(n, val - n);
2850           }
2851           else {
2852           NEXT_OFF(n) = val - n;
2853           }
2854           stopnow = 1;
2855           }
2856           #endif
2857           }
2858            
2859 1125687         *min_subtract = 0;
2860 1011573         *has_exactf_sharp_s = FALSE;
2861            
2862           /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2863           * can now analyze for sequences of problematic code points. (Prior to
2864           * this final joining, sequences could have been split over boundaries, and
2865           * hence missed). The sequences only happen in folding, hence for any
2866           * non-EXACT EXACTish node */
2867 1011573         if (OP(scan) != EXACT) {
2868 18868         const U8 * const s0 = (U8*) STRING(scan);
2869           const U8 * s = s0;
2870 1125543         const U8 * const s_end = s0 + STR_LEN(scan);
2871            
2872           /* One pass is made over the node's string looking for all the
2873           * possibilities. to avoid some tests in the loop, there are two main
2874           * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2875           * non-UTF-8 */
2876 1122151         if (UTF) {
2877            
2878           /* Examine the string for a multi-character fold sequence. UTF-8
2879           * patterns have all characters pre-folded by the time this code is
2880           * executed */
2881 2170020         while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2882           length sequence we are looking for is 2 */
2883           {
2884           int count = 0;
2885 494764         int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2886 4033070         if (! len) { /* Not a multi-char fold: get next char */
2887 3464092         s += UTF8SKIP(s);
2888 3464092         continue;
2889           }
2890            
2891           /* Nodes with 'ss' require special handling, except for EXACTFL
2892           * and EXACTFA-ish for which there is no multi-char fold to
2893           * this */
2894 3464092         if (len == 2 && *s == 's' && *(s+1) == 's'
2895 3464092         && OP(scan) != EXACTFL
2896 2206944         && OP(scan) != EXACTFA
2897 2206944         && OP(scan) != EXACTFA_NO_TRIE)
2898           {
2899           count = 2;
2900 2206944         OP(scan) = EXACTFU_SS;
2901 2200916         s += 2;
2902           }
2903           else { /* Here is a generic multi-char fold. */
2904 1297931         const U8* multi_end = s + len;
2905            
2906           /* Count how many characters in it. In the case of /l and
2907           * /aa, no folds which contain ASCII code points are
2908           * allowed, so check for those, and skip if found. (In
2909           * EXACTFL, no folds are allowed to any Latin1 code point,
2910           * not just ASCII. But there aren't any of these
2911           * currently, nor ever likely, so don't take the time to
2912           * test for them. The code that generates the
2913           * is_MULTI_foo() macros croaks should one actually get put
2914           * into Unicode .) */
2915 1297931         if (OP(scan) != EXACTFL
2916 376         && OP(scan) != EXACTFA
2917 376         && OP(scan) != EXACTFA_NO_TRIE)
2918           {
2919 376         count = utf8_length(s, multi_end);
2920 376         s = multi_end;
2921           }
2922           else {
2923 376         while (s < multi_end) {
2924 324         if (isASCII(*s)) {
2925 1297555         s++;
2926 1297931         goto next_iteration;
2927           }
2928           else {
2929 1297875         s += UTF8SKIP(s);
2930           }
2931 2166161         count++;
2932           }
2933           }
2934           }
2935            
2936           /* The delta is how long the sequence is minus 1 (1 is how long
2937           * the character that folds to the sequence is) */
2938 111824         *min_subtract += count - 1;
2939           next_iteration: ;
2940           }
2941           }
2942 111854         else if (OP(scan) == EXACTFA) {
2943            
2944           /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
2945           * fold to the ASCII range (and there are no existing ones in the
2946           * upper latin1 range). But, as outlined in the comments preceding
2947           * this function, we need to flag any occurrences of the sharp s.
2948           * This character forbids trie formation (because of added
2949           * complexity) */
2950 2166161         while (s < s_end) {
2951 1257148         if (*s == LATIN_SMALL_LETTER_SHARP_S) {
2952 1122117         OP(scan) = EXACTFA_NO_TRIE;
2953 518391         *has_exactf_sharp_s = TRUE;
2954 518391         break;
2955           }
2956 773267         s++;
2957 518391         continue;
2958           }
2959           }
2960 512773         else if (OP(scan) != EXACTFL) {
2961            
2962           /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
2963           * multi-char folds that are all Latin1. (This code knows that
2964           * there are no current multi-char folds possible with EXACTFL,
2965           * relying on fold_grind.t to catch any errors if the very unlikely
2966           * event happens that some get added in future Unicode versions.)
2967           * As explained in the comments preceding this function, we look
2968           * also for the sharp s in EXACTF nodes; it can be in the final
2969           * position. Otherwise we can stop looking 1 byte earlier because
2970           * have to find at least two characters for a multi-fold */
2971 32651         const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2972            
2973 32765         while (s < upper) {
2974 32697         int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2975 23193         if (! len) { /* Not a multi-char fold. */
2976 80         if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2977           {
2978 0         *has_exactf_sharp_s = TRUE;
2979           }
2980 80         s++;
2981 80         continue;
2982           }
2983            
2984 0         if (len == 2
2985 0         && isARG2_lower_or_UPPER_ARG1('s', *s)
2986 0         && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
2987           {
2988            
2989           /* EXACTF nodes need to know that the minimum length
2990           * changed so that a sharp s in the string can match this
2991           * ss in the pattern, but they remain EXACTF nodes, as they
2992           * won't match this unless the target string is is UTF-8,
2993           * which we don't know until runtime */
2994 0         if (OP(scan) != EXACTF) {
2995 1129039         OP(scan) = EXACTFU_SS;
2996           }
2997           }
2998            
2999 52724836         *min_subtract += len - 1;
3000 21780         s += len;
3001           }
3002           }
3003           }
3004            
3005           #ifdef DEBUGGING
3006           /* Allow dumping but overwriting the collection of skipped
3007           * ops and/or strings with fake optimized ops */
3008 6470         n = scan + NODE_SZ_STR(scan);
3009 6574         while (n <= stop) {
3010 6218         OP(n) = OPTIMIZED;
3011 6218         FLAGS(n) = 0;
3012 6218         NEXT_OFF(n) = 0;
3013 182         n++;
3014           }
3015           #endif
3016 306         DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3017 6470         return stopnow;
3018           }
3019            
3020           /* REx optimizer. Converts nodes into quicker variants "in place".
3021           Finds fixed substrings. */
3022            
3023           /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3024           to the position after last scanned or to NULL. */
3025            
3026           #define INIT_AND_WITHP \
3027           assert(!and_withp); \
3028           Newx(and_withp,1,struct regnode_charclass_class); \
3029           SAVEFREEPV(and_withp)
3030            
3031           /* this is a chain of data about sub patterns we are processing that
3032           need to be handled separately/specially in study_chunk. Its so
3033           we can simulate recursion without losing state. */
3034           struct scan_frame;
3035           typedef struct scan_frame {
3036           regnode *last; /* last node to process in this frame */
3037           regnode *next; /* next node to process when last is reached */
3038           struct scan_frame *prev; /*previous frame*/
3039           I32 stop; /* what stopparen do we use */
3040           } scan_frame;
3041            
3042            
3043           #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3044            
3045           STATIC SSize_t
3046 3306         S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3047           SSize_t *minlenp, SSize_t *deltap,
3048           regnode *last,
3049           scan_data_t *data,
3050           I32 stopparen,
3051           U8* recursed,
3052           struct regnode_charclass_class *and_withp,
3053           U32 flags, U32 depth)
3054           /* scanp: Start here (read-write). */
3055           /* deltap: Write maxlen-minlen here. */
3056           /* last: Stop before this one. */
3057           /* data: string data about the pattern */
3058           /* stopparen: treat close N as END */
3059           /* recursed: which subroutines have we recursed into */
3060           /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3061           {
3062           dVAR;
3063           /* There must be at least this number of characters to match */
3064           SSize_t min = 0;
3065           I32 pars = 0, code;
3066 3306         regnode *scan = *scanp, *next;
3067           SSize_t delta = 0;
3068 6488         int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3069           int is_inf_internal = 0; /* The studied chunk is infinite */
3070 4366         I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3071           scan_data_t data_fake;
3072           SV *re_trie_maxbuff = NULL;
3073 4366         regnode *first_non_open = scan;
3074           SSize_t stopmin = SSize_t_MAX;
3075           scan_frame *frame = NULL;
3076 2318         GET_RE_DEBUG_FLAGS_DECL;
3077            
3078 196         PERL_ARGS_ASSERT_STUDY_CHUNK;
3079            
3080           #ifdef DEBUGGING
3081 196         StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3082           #endif
3083            
3084 2318         if ( depth == 0 ) {
3085 168         while (first_non_open && OP(first_non_open) == OPEN)
3086 2130         first_non_open=regnext(first_non_open);
3087           }
3088            
3089            
3090           fake_study_recurse:
3091 15992         while ( scan && OP(scan) != END && scan < last ){
3092 15796         UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3093           node length to get a real minimum (because
3094           the folded version may be shorter) */
3095 15796         bool has_exactf_sharp_s = FALSE;
3096           /* Peephole optimizer: */
3097 22088         DEBUG_STUDYDATA("Peep:", data,depth);
3098 19966         DEBUG_PEEP("Peep",scan,depth);
3099            
3100           /* Its not clear to khw or hv why this is done here, and not in the
3101           * clauses that deal with EXACT nodes. khw's guess is that it's
3102           * because of a previous design */
3103 19966         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3104            
3105           /* Follow the next-chain of the current node and optimize
3106           away all the NOTHINGs from it. */
3107 19966         if (OP(scan) != CURLYX) {
3108 20274         const int max = (reg_off_by_arg[OP(scan)]
3109 19966         ? I32_MAX
3110           /* I32 may be smaller than U16 on CRAYs! */
3111 19966         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3112 19966         int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3113           int noff;
3114 52703364         regnode *n = scan;
3115            
3116           /* Skip NOTHING and LONGJMP. */
3117 12679619         while ((n = regnext(n))
3118 12679311         && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3119 928774         || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3120 928466         && off + noff < max)
3121 928466         off += noff;
3122 11750845         if (reg_off_by_arg[OP(scan)])
3123 12679003         ARG(scan) = off;
3124           else
3125 12679311         NEXT_OFF(scan) = off;
3126           }
3127            
3128            
3129            
3130           /* The principal pseudo-switch. Cannot be a switch, since we
3131           look into several different things. */
3132 7981987         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3133 3494223         || OP(scan) == IFTHEN) {
3134 3493931         next = regnext(scan);
3135 3493931         code = OP(scan);
3136           /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3137            
3138 7981687         if (OP(next) == code || code == IFTHEN) {
3139           /* NOTE - There is similar code to this block below for handling
3140           TRIE nodes on a re-study. If you change stuff here check there
3141           too. */
3142           SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3143           struct regnode_charclass_class accum;
3144 7981687         regnode * const startbranch=scan;
3145            
3146 511202         if (flags & SCF_DO_SUBSTR)
3147 7981687         SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3148 4457770         if (flags & SCF_DO_STCLASS)
3149 8112063         cl_init_zero(pRExC_state, &accum);
3150            
3151 7981743         while (OP(scan) == code) {
3152           SSize_t deltanext, minnext, fake;
3153           I32 f = 0;
3154           struct regnode_charclass_class this_class;
3155            
3156 129794         num++;
3157 7981735         data_fake.flags = 0;
3158 7981735         if (data) {
3159 7981735         data_fake.whilem_c = data->whilem_c;
3160 12679059         data_fake.last_closep = data->last_closep;
3161           }
3162           else
3163 9116401         data_fake.last_closep = &fake;
3164            
3165 6053568         data_fake.pos_delta = delta;
3166 6049736         next = regnext(scan);
3167 56         scan = NEXTOPER(scan);
3168 40798         if (code != BRANCH)
3169 6134996         scan = NEXTOPER(scan);
3170 6135052         if (flags & SCF_DO_STCLASS) {
3171 6135042         cl_init(pRExC_state, &this_class);
3172 6053558         data_fake.start_class = &this_class;
3173           f = SCF_DO_STCLASS_AND;
3174           }
3175 81540         if (flags & SCF_WHILEM_VISITED_POS)
3176 20900702         f |= SCF_WHILEM_VISITED_POS;
3177            
3178           /* we suppose the run is continuous, last=next...*/
3179 20859960         minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3180           next, &data_fake,
3181           stopparen, recursed, NULL, f,depth+1);
3182 2363092         if (min1 > minnext)
3183           min1 = minnext;
3184 6135052         if (deltanext == SSize_t_MAX) {
3185           is_inf = is_inf_internal = 1;
3186           max1 = SSize_t_MAX;
3187 6135052         } else if (max1 < minnext + deltanext)
3188 6053522         max1 = minnext + deltanext;
3189 6544063         scan = next;
3190 432984         if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3191 432848         pars++;
3192 136         if (data_fake.flags & SCF_SEEN_ACCEPT) {
3193 432928         if ( stopmin > minnext)
3194 432928         stopmin = min + min1;
3195 12679003         flags &= ~SCF_DO_SUBSTR;
3196 40024053         if (data)
3197 1832658         data->flags |= SCF_SEEN_ACCEPT;
3198           }
3199 1832714         if (data) {
3200 1832714         if (data_fake.flags & SF_HAS_EVAL)
3201 1018818         data->flags |= SF_HAS_EVAL;
3202 1832714         data->whilem_c = data_fake.whilem_c;
3203           }
3204 797898         if (flags & SCF_DO_STCLASS)
3205 797888         cl_or(pRExC_state, &accum, &this_class);
3206           }
3207 797850         if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3208           min1 = 0;
3209 1832666         if (flags & SCF_DO_SUBSTR) {
3210 35100         data->pos_min += min1;
3211 1832666         if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3212 1832658         data->pos_delta = SSize_t_MAX;
3213           else
3214 1832666         data->pos_delta += max1 - min1;
3215 1018826         if (max1 != min1 || is_inf)
3216 1018822         data->longest = &(data->longest_float);
3217           }
3218 8         min += min1;
3219 1018826         if (delta == SSize_t_MAX
3220 1018826         || SSize_t_MAX - delta - (max1 - min1) < 0)
3221           delta = SSize_t_MAX;
3222           else
3223 127476         delta += max1 - min1;
3224 1832666         if (flags & SCF_DO_STCLASS_OR) {
3225 482397         cl_or(pRExC_state, data->start_class, &accum);
3226 274802         if (min1) {
3227 229976         cl_and(data->start_class, and_withp);
3228 35727         flags &= ~SCF_DO_STCLASS;
3229           }
3230           }
3231 346264         else if (flags & SCF_DO_STCLASS_AND) {
3232 346262         if (min1) {
3233 346262         cl_and(data->start_class, &accum);
3234 274808         flags &= ~SCF_DO_STCLASS;
3235           }
3236           else {
3237           /* Switch to OR mode: cache the old value of
3238           * data->start_class */
3239 274802         INIT_AND_WITHP;
3240 274802         StructCopy(data->start_class, and_withp,
3241           struct regnode_charclass_class);
3242 44826         flags &= ~SCF_DO_STCLASS_AND;
3243 229976         StructCopy(&accum, data->start_class,
3244           struct regnode_charclass_class);
3245 229976         flags |= SCF_DO_STCLASS_OR;
3246 202190         SET_SSC_EOS(data->start_class);
3247           }
3248           }
3249            
3250 48912         if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3251           /* demq.
3252            
3253           Assuming this was/is a branch we are dealing with: 'scan' now
3254           points at the item that follows the branch sequence, whatever
3255           it is. We now start at the beginning of the sequence and look
3256           for subsequences of
3257            
3258           BRANCH->EXACT=>x1
3259           BRANCH->EXACT=>x2
3260           tail
3261            
3262           which would be constructed from a pattern like /A|LIST|OF|WORDS/
3263            
3264           If we can find such a subsequence we need to turn the first
3265           element into a trie and then add the subsequent branch exact
3266           strings to the trie.
3267            
3268           We have two cases
3269            
3270           1. patterns where the whole set of branches can be converted.
3271            
3272           2. patterns where only a subset can be converted.
3273            
3274           In case 1 we can replace the whole set with a single regop
3275           for the trie. In case 2 we need to keep the start and end
3276           branches so
3277            
3278           'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3279           becomes BRANCH TRIE; BRANCH X;
3280            
3281           There is an additional case, that being where there is a
3282           common prefix, which gets split out into an EXACT like node
3283           preceding the TRIE node.
3284            
3285           If x(1..n)==tail then we can do a simple trie, if not we make
3286           a "jump" trie, such that when we match the appropriate word
3287           we "jump" to the appropriate tail node. Essentially we turn
3288           a nested if into a case structure of sorts.
3289            
3290           */
3291            
3292           int made=0;
3293 153294         if (!re_trie_maxbuff) {
3294 6912         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3295 6912         if (!SvIOK(re_trie_maxbuff))
3296 71454         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3297           }
3298 18327959         if ( SvIV(re_trie_maxbuff)>=0 ) {
3299           regnode *cur;
3300           regnode *first = (regnode *)NULL;
3301           regnode *last = (regnode *)NULL;
3302 18292232         regnode *tail = scan;
3303           U8 trietype = 0;
3304           U32 count=0;
3305            
3306           #ifdef DEBUGGING
3307 2072174         SV * const mysv = sv_newmortal(); /* for dumping */
3308           #endif
3309           /* var tail is used because there may be a TAIL
3310           regop in the way. Ie, the exacts will point to the
3311           thing following the TAIL, but the last branch will
3312           point at the TAIL. So we advance tail. If we
3313           have nested (?:) we may have to move through several
3314           tails.
3315           */
3316            
3317 1486420         while ( OP( tail ) == TAIL ) {
3318           /* this is the TAIL generated by (?:) */
3319 59536         tail = regnext( tail );
3320           }
3321            
3322          
3323 39232         DEBUG_TRIE_COMPILE_r({
3324           regprop(RExC_rx, mysv, tail );
3325           PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3326           (int)depth * 2 + 2, "",
3327           "Looking for TRIE'able sequences. Tail node is: ",
3328           SvPV_nolen_const( mysv )
3329           );
3330           });
3331          
3332           /*
3333            
3334           Step through the branches
3335           cur represents each branch,
3336           noper is the first thing to be matched as part of that branch
3337           noper_next is the regnext() of that node.
3338            
3339           We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3340           via a "jump trie" but we also support building with NOJUMPTRIE,
3341           which restricts the trie logic to structures like /FOO|BAR/.
3342            
3343           If noper is a trieable nodetype then the branch is a possible optimization
3344           target. If we are building under NOJUMPTRIE then we require that noper_next
3345           is the same as scan (our current position in the regex program).
3346            
3347           Once we have two or more consecutive such branches we can create a
3348           trie of the EXACT's contents and stitch it in place into the program.
3349            
3350           If the sequence represents all of the branches in the alternation we
3351           replace the entire thing with a single TRIE node.
3352            
3353           Otherwise when it is a subsequence we need to stitch it in place and
3354           replace only the relevant branches. This means the first branch has
3355           to remain as it is used by the alternation logic, and its next pointer,
3356           and needs to be repointed at the item on the branch chain following
3357           the last branch we have optimized away.
3358            
3359           This could be either a BRANCH, in which case the subsequence is internal,
3360           or it could be the item following the branch sequence in which case the
3361           subsequence is at the end (which does not necessarily mean the first node
3362           is the start of the alternation).
3363            
3364           TRIE_TYPE(X) is a define which maps the optype to a trietype.
3365            
3366           optype | trietype
3367           ----------------+-----------
3368           NOTHING | NOTHING
3369           EXACT | EXACT
3370           EXACTFU | EXACTFU
3371           EXACTFU_SS | EXACTFU
3372           EXACTFA | EXACTFA
3373            
3374            
3375           */
3376           #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3377           ( EXACT == (X) ) ? EXACT : \
3378           ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3379           ( EXACTFA == (X) ) ? EXACTFA : \
3380           0 )
3381            
3382           /* dont use tail as the end marker for this traverse */
3383 37628         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3384 37620         regnode * const noper = NEXTOPER( cur );
3385 56         U8 noper_type = OP( noper );
3386 56         U8 noper_trietype = TRIE_TYPE( noper_type );
3387           #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3388 56         regnode * const noper_next = regnext( noper );
3389 56         U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3390 56         U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3391           #endif
3392            
3393 56         DEBUG_TRIE_COMPILE_r({
3394           regprop(RExC_rx, mysv, cur);
3395           PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3396           (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3397            
3398           regprop(RExC_rx, mysv, noper);
3399           PerlIO_printf( Perl_debug_log, " -> %s",
3400           SvPV_nolen_const(mysv));
3401            
3402           if ( noper_next ) {
3403           regprop(RExC_rx, mysv, noper_next );
3404           PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3405           SvPV_nolen_const(mysv));
3406           }
3407           PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3408           REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3409           PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3410           );
3411           });
3412            
3413           /* Is noper a trieable nodetype that can be merged with the
3414           * current trie (if there is one)? */
3415 56         if ( noper_trietype
3416 56         &&
3417           (
3418 39280         ( noper_trietype == NOTHING)
3419 59590         || ( trietype == NOTHING )
3420 1832712         || ( trietype == noper_trietype )
3421           )
3422           #ifdef NOJUMPTRIE
3423           && noper_next == tail
3424           #endif
3425 38191443         && count < U16_MAX)
3426           {
3427           /* Handle mergable triable node
3428           * Either we are the first node in a new trieable sequence,
3429           * in which case we do some bookkeeping, otherwise we update
3430           * the end pointer. */
3431 10896442         if ( !first ) {
3432           first = cur;
3433 10896394         if ( noper_trietype == NOTHING ) {
3434           #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3435           regnode * const noper_next = regnext( noper );
3436           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3437           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3438           #endif
3439            
3440 10896394         if ( noper_next_trietype ) {
3441           trietype = noper_next_trietype;
3442 2414754         } else if (noper_next_type) {
3443           /* a NOTHING regop is 1 regop wide. We need at least two
3444           * for a trie so we can't merge this in */
3445           first = NULL;
3446           }
3447           } else {
3448           trietype = noper_trietype;
3449           }
3450           } else {
3451 2414802         if ( trietype == NOTHING )
3452           trietype = noper_trietype;
3453           last = cur;
3454           }
3455 1144186         if (first)
3456 963624         count++;
3457           } /* end handle mergable triable node */
3458           else {
3459           /* handle unmergable node -
3460           * noper may either be a triable node which can not be tried
3461           * together with the current trie, or a non triable node */
3462 963584         if ( last ) {
3463           /* If last is set and trietype is not NOTHING then we have found
3464           * at least two triable branch sequences in a row of a similar
3465           * trietype so we can turn them into a trie. If/when we
3466           * allow NOTHING to start a trie sequence this condition will be
3467           * required, and it isn't expensive so we leave it in for now. */
3468 296026         if ( trietype && trietype != NOTHING )
3469 296026         make_trie( pRExC_state,
3470           startbranch, first, cur, tail, count,
3471           trietype, depth+1 );
3472           last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3473           }
3474 296034         if ( noper_trietype
3475           #ifdef NOJUMPTRIE
3476           && noper_next == tail
3477           #endif
3478           ){
3479           /* noper is triable, so we can start a new trie sequence */
3480           count = 1;
3481           first = cur;
3482           trietype = noper_trietype;
3483 848112         } else if (first) {
3484           /* if we already saw a first but the current node is not triable then we have
3485           * to reset the first information. */
3486           count = 0;
3487           first = NULL;
3488           trietype = 0;
3489           }
3490           } /* end handle unmergable node */
3491           } /* loop over branches */
3492 667558         DEBUG_TRIE_COMPILE_r({
3493           regprop(RExC_rx, mysv, cur);
3494           PerlIO_printf( Perl_debug_log,
3495           "%*s- %s (%d) \n", (int)depth * 2 + 2,
3496           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3497            
3498           });
3499 848120         if ( last && trietype ) {
3500 3711545         if ( trietype != NOTHING ) {
3501           /* the last branch of the sequence was part of a trie,
3502           * so we have to construct it here outside of the loop
3503           */
3504 418977         made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3505           #ifdef TRIE_STUDY_OPT
3506 418985         if ( ((made == MADE_EXACT_TRIE &&
3507 418977         startbranch == first)
3508 3292574         || ( first_non_open == first )) &&
3509           depth==0 ) {
3510 3292574         flags |= SCF_TRIE_RESTUDY;
3511 2975736         if ( startbranch == first
3512 2975736         && scan == tail )
3513           {
3514 4441028         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3515           }
3516           }
3517           #endif
3518           } else {
3519           /* at this point we know whatever we have is a NOTHING sequence/branch
3520           * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3521           */
3522 806         if ( startbranch == first ) {
3523           regnode *opt;
3524           /* the entire thing is a NOTHING sequence, something like this:
3525           * (?:|) So we can turn it into a plain NOTHING op. */
3526 4440386         DEBUG_TRIE_COMPILE_r({
3527           regprop(RExC_rx, mysv, cur);
3528           PerlIO_printf( Perl_debug_log,
3529           "%*s- %s (%d) \n", (int)depth * 2 + 2,
3530           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3531            
3532           });
3533 4440386         OP(startbranch)= NOTHING;
3534 4441024         NEXT_OFF(startbranch)= tail - startbranch;
3535 4441024         for ( opt= startbranch + 1; opt < tail ; opt++ )
3536 2414754         OP(opt)= OPTIMIZED;
3537           }
3538           }
3539           } /* end if ( last) */
3540           } /* TRIE_MAXBUF is non zero */
3541          
3542           } /* do trie */
3543          
3544           }
3545 2414754         else if ( code == BRANCHJ ) { /* single branch is optimized. */
3546 4441024         scan = NEXTOPER(NEXTOPER(scan));
3547           } else /* single branch is optimized. */
3548 4441024         scan = NEXTOPER(scan);
3549 5156027         continue;
3550 3965620         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3551           scan_frame *newframe = NULL;
3552           I32 paren;
3553           regnode *start;
3554           regnode *end;
3555            
3556 3965320         if (OP(scan) != SUSPEND) {
3557           /* set the pointer */
3558 5156019         if (OP(scan) == GOSUB) {
3559 5155657         paren = ARG(scan);
3560 5155657         RExC_recurse[ARG2L(scan)] = scan;
3561 5155657         start = RExC_open_parens[paren-1];
3562 860149         end = RExC_close_parens[paren-1];
3563           } else {
3564           paren = 0;
3565 5156019         start = RExC_rxi->program + 1;
3566 1939909         end = RExC_opend;
3567           }
3568 1939909         if (!recursed) {
3569 1939909         Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3570 1939909         SAVEFREEPV(recursed);
3571           }
3572 1939909         if (!PAREN_TEST(recursed,paren+1)) {
3573 5156019         PAREN_SET(recursed,paren+1);
3574 426292         Newx(newframe,1,scan_frame);
3575           } else {
3576 5156019         if (flags & SCF_DO_SUBSTR) {
3577 5156019         SCAN_COMMIT(pRExC_state,data,minlenp);
3578 1939909         data->longest = &(data->longest_float);
3579           }
3580           is_inf = is_inf_internal = 1;
3581 5156019         if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3582 3606443         cl_anything(pRExC_state, data->start_class);
3583 245366         flags &= ~SCF_DO_STCLASS;
3584           }
3585           } else {
3586 3361077         Newx(newframe,1,scan_frame);
3587           paren = stopparen;
3588 985875         start = scan+2;
3589 985875         end = regnext(scan);
3590           }
3591 985875         if (newframe) {
3592 985875         assert(start);
3593 985875         assert(end);
3594 985875         SAVEFREEPV(newframe);
3595 1549576         newframe->next = regnext(scan);
3596 61936         newframe->last = last;
3597 61936         newframe->stop = stopparen;
3598 1487640         newframe->prev = frame;
3599            
3600           frame = newframe;
3601 646732         scan = start;
3602           stopparen = paren;
3603           last = end;
3604            
3605 1549576         continue;
3606           }
3607           }
3608 5156319         else if (OP(scan) == EXACT) {
3609 2741119         SSize_t l = STR_LEN(scan);
3610           UV uc;
3611 5156163         if (UTF) {
3612 5154387         const U8 * const s = (U8*)STRING(scan);
3613 2482219         uc = utf8_to_uvchr_buf(s, s + l, NULL);
3614 7076         l = utf8_length(s, s + l);
3615           } else {
3616 338         uc = *((U8*)STRING(scan));
3617           }
3618 166         min += l;
3619 166         if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3620           /* The code below prefers earlier match for fixed
3621           offset, later match for variable offset. */
3622 136         if (data->last_end == -1) { /* Update the start info. */
3623 5156017         data->last_start_min = data->pos_min;
3624 7688130         data->last_start_max = is_inf
3625 4320789         ? SSize_t_MAX : data->pos_min + data->pos_delta;
3626           }
3627 5156133         sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3628 5156133         if (UTF)
3629 2014207         SvUTF8_on(data->last_found);
3630           {
3631 5156133         SV * const sv = data->last_found;
3632 2414888         MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3633 1179612         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3634 478270         if (mg && mg->mg_len >= 0)
3635 478152         mg->mg_len += utf8_length((U8*)STRING(scan),
3636           (U8*)STRING(scan)+STR_LEN(scan));
3637           }
3638 478270         data->last_end = data->pos_min + l;
3639 282985         data->pos_min += l; /* As in the first entry. */
3640 182450         data->flags &= ~SF_BEFORE_EOL;
3641           }
3642 412500         if (flags & SCF_DO_STCLASS_AND) {
3643           /* Check whether it is compatible with what we know already! */
3644           int compat = 1;
3645            
3646            
3647           /* If compatible, we or it in below. It is compatible if is
3648           * in the bitmp and either 1) its bit or its fold is set, or 2)
3649           * it's for a locale. Even if there isn't unicode semantics
3650           * here, at runtime there may be because of matching against a
3651           * utf8 string, so accept a possible false positive for
3652           * latin1-range folds */
3653 412612         if (uc >= 0x100 ||
3654 412478         (!(data->start_class->flags & ANYOF_LOCALE)
3655 470         && !ANYOF_BITMAP_TEST(data->start_class, uc)
3656 348         && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3657 412356         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3658           )
3659           {
3660           compat = 0;
3661           }
3662 412490         ANYOF_CLASS_ZERO(data->start_class);
3663 412490         ANYOF_BITMAP_ZERO(data->start_class);
3664 5156149         if (compat)
3665 2002516         ANYOF_BITMAP_SET(data->start_class, uc);
3666 1453778         else if (uc >= 0x100) {
3667           int i;
3668            
3669           /* Some Unicode code points fold to the Latin1 range; as
3670           * XXX temporary code, instead of figuring out if this is
3671           * one, just assume it is and set all the start class bits
3672           * that could be some such above 255 code point's fold
3673           * which will generate fals positives. As the code
3674           * elsewhere that does compute the fold settles down, it
3675           * can be extracted out and re-used here */
3676 1456838         for (i = 0; i < 256; i++){
3677 527818         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3678 515052         ANYOF_BITMAP_SET(data->start_class, i);
3679           }
3680           }
3681           }
3682 514838         CLEAR_SSC_EOS(data->start_class);
3683 657695         if (uc < 0x100)
3684 794420         data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3685           }
3686 514714         else if (flags & SCF_DO_STCLASS_OR) {
3687           /* false positive possible if the class is case-folded */
3688 630904         if (uc < 0x100)
3689 116200         ANYOF_BITMAP_SET(data->start_class, uc);
3690           else
3691 116200         data->start_class->flags |= ANYOF_UNICODE_ALL;
3692 116200         CLEAR_SSC_EOS(data->start_class);
3693 348         cl_and(data->start_class, and_withp);
3694           }
3695 492         flags &= ~SCF_DO_STCLASS;
3696           }
3697 116356         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3698 116234         SSize_t l = STR_LEN(scan);
3699 116234         UV uc = *((U8*)STRING(scan));
3700            
3701           /* Search for fixed substrings supports EXACT only. */
3702 398538         if (flags & SCF_DO_SUBSTR) {
3703 4641317         assert(data);
3704 1487696         SCAN_COMMIT(pRExC_state, data, minlenp);
3705           }
3706 1487724         if (UTF) {
3707 709004         const U8 * const s = (U8 *)STRING(scan);
3708 708790         uc = utf8_to_uvchr_buf(s, s + l, NULL);
3709 708790         l = utf8_length(s, s + l);
3710           }
3711 34         if (has_exactf_sharp_s) {
3712 1062285         RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3713           }
3714 708824         min += l - min_subtract;
3715 5156049         assert (min >= 0);
3716 631579         delta += min_subtract;
3717 5156049         if (flags & SCF_DO_SUBSTR) {
3718 3965322         data->pos_min += l - min_subtract;
3719 3965322         if (data->pos_min < 0) {
3720 118234         data->pos_min = 0;
3721           }
3722 118240         data->pos_delta += min_subtract;
3723 118240         if (min_subtract) {
3724 118234         data->longest = &(data->longest_float);
3725           }
3726           }
3727 118268         if (flags & SCF_DO_STCLASS_AND) {
3728           /* Check whether it is compatible with what we know already! */
3729           int compat = 1;
3730 3272         if (uc >= 0x100 ||
3731 118262         (!(data->start_class->flags & ANYOF_LOCALE)
3732 118262         && !ANYOF_BITMAP_TEST(data->start_class, uc)
3733 118234         && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3734           {
3735           compat = 0;
3736           }
3737 111634         ANYOF_CLASS_ZERO(data->start_class);
3738 14296         ANYOF_BITMAP_ZERO(data->start_class);
3739 14296         if (compat) {
3740 14296         ANYOF_BITMAP_SET(data->start_class, uc);
3741 14296         CLEAR_SSC_EOS(data->start_class);
3742 14296         if (OP(scan) == EXACTFL) {
3743           /* XXX This set is probably no longer necessary, and
3744           * probably wrong as LOCALE now is on in the initial
3745           * state */
3746 14268         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3747           }
3748           else {
3749            
3750           /* Also set the other member of the fold pair. In case
3751           * that unicode semantics is called for at runtime, use
3752           * the full latin1 fold. (Can't do this for locale,
3753           * because not known until runtime) */
3754 6988         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3755            
3756           /* All other (EXACTFL handled above) folds except under
3757           * /iaa that include s, S, and sharp_s also may include
3758           * the others */
3759 14296         if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
3760           {
3761 14296         if (uc == 's' || uc == 'S') {
3762 0         ANYOF_BITMAP_SET(data->start_class,
3763           LATIN_SMALL_LETTER_SHARP_S);
3764           }
3765 14296         else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3766 6628         ANYOF_BITMAP_SET(data->start_class, 's');
3767 9942         ANYOF_BITMAP_SET(data->start_class, 'S');
3768           }
3769           }
3770           }
3771           }
3772 6628         else if (uc >= 0x100) {
3773           int i;
3774 3965316         for (i = 0; i < 256; i++){
3775 5570619         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3776 3277556         ANYOF_BITMAP_SET(data->start_class, i);
3777           }
3778           }
3779           }
3780           }
3781 794028         else if (flags & SCF_DO_STCLASS_OR) {
3782 6283198         if (data->start_class->flags & ANYOF_LOC_FOLD) {
3783           /* false positive possible if the class is case-folded.
3784           Assume that the locale settings are the same... */
3785 6342588         if (uc < 0x100) {
3786 3965316         ANYOF_BITMAP_SET(data->start_class, uc);
3787 3943698         if (OP(scan) != EXACTFL) {
3788            
3789           /* And set the other member of the fold pair, but
3790           * can't do that in locale because not known until
3791           * run-time */
3792 3943698         ANYOF_BITMAP_SET(data->start_class,
3793           PL_fold_latin1[uc]);
3794            
3795           /* All folds except under /iaa that include s, S,
3796           * and sharp_s also may include the others */
3797 113028         if (OP(scan) != EXACTFA
3798 57382         && OP(scan) != EXACTFA_NO_TRIE)
3799           {
3800 116236         if (uc == 's' || uc == 'S') {
3801 113028         ANYOF_BITMAP_SET(data->start_class,
3802           LATIN_SMALL_LETTER_SHARP_S);
3803           }
3804 3208         else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3805 113028         ANYOF_BITMAP_SET(data->start_class, 's');
3806 113028         ANYOF_BITMAP_SET(data->start_class, 'S');
3807           }
3808           }
3809           }
3810           }
3811 113028         CLEAR_SSC_EOS(data->start_class);
3812           }
3813 113028         cl_and(data->start_class, and_withp);
3814           }
3815 127210         flags &= ~SCF_DO_STCLASS;
3816           }
3817 14270         else if (REGNODE_VARIES(OP(scan))) {
3818           SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
3819 14148         I32 fl = 0, f = flags;
3820 3943698         regnode * const oscan = scan;
3821           struct regnode_charclass_class this_class;
3822           struct regnode_charclass_class *oclass = NULL;
3823           I32 next_is_eval = 0;
3824            
3825 3965316         switch (PL_regkind[OP(scan)]) {
3826           case WHILEM: /* End of (?:...)* . */
3827 5156015         scan = NEXTOPER(scan);
3828 672         goto finish;
3829           case PLUS:
3830 8448583         if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3831 7893119         next = NEXTOPER(scan);
3832 932226         if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3833           mincount = 1;
3834           maxcount = REG_INFTY;
3835 932226         next = regnext(scan);
3836 8448583         scan = NEXTOPER(scan);
3837 33053         goto do_curly;
3838           }
3839           }
3840 25923         if (flags & SCF_DO_SUBSTR)
3841 25923         data->pos_min++;
3842 33053         min++;
3843           /* Fall through. */
3844           case STAR:
3845 360         if (flags & SCF_DO_STCLASS) {
3846           mincount = 0;
3847           maxcount = REG_INFTY;
3848 33053         next = regnext(scan);
3849 27295001         scan = NEXTOPER(scan);
3850 18188         goto do_curly;
3851           }
3852           is_inf = is_inf_internal = 1;
3853 17490         scan = regnext(scan);
3854 17490         if (flags & SCF_DO_SUBSTR) {
3855 4486185         SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3856 4477440         data->longest = &(data->longest_float);
3857           }
3858           goto optimize_curly_tail;
3859           case CURLY:
3860 4389990         if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3861 0         && (scan->flags == stopparen))
3862           {
3863           mincount = 1;
3864           maxcount = 1;
3865           } else {
3866 0         mincount = ARG1(scan);
3867 0         maxcount = ARG2(scan);
3868           }
3869 17490         next = regnext(scan);
3870 0         if (OP(scan) == CURLYX) {
3871 17490         I32 lp = (data ? *(data->last_closep) : 0);
3872 18188         scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3873           }
3874 18188         scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3875 18188         next_is_eval = (OP(scan) == EVAL);
3876           do_curly:
3877 1116         if (flags & SCF_DO_SUBSTR) {
3878 1116         if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3879 1116         pos_before = data->pos_min;
3880           }
3881 1116         if (data) {
3882 27276813         fl = data->flags;
3883 4266835         data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3884 2184871         if (is_inf)
3885 2184871         data->flags |= SF_IS_INF;
3886           }
3887 4266835         if (flags & SCF_DO_STCLASS) {
3888 4266835         cl_init(pRExC_state, &this_class);
3889 1456087         oclass = data->start_class;
3890 1456087         data->start_class = &this_class;
3891 167964         f |= SCF_DO_STCLASS_AND;
3892 0         f &= ~SCF_DO_STCLASS_OR;
3893           }
3894           /* Exclude from super-linear cache processing any {n,m}
3895           regops for which the combination of input pos and regex
3896           pos is not enough information to determine if a match
3897           will be possible.
3898            
3899           For example, in the regex /foo(bar\s*){4,8}baz/ with the
3900           regex pos at the \s*, the prospects for a match depend not
3901           only on the input position but also on how many (bar\s*)
3902           repeats into the {4,8} we are. */
3903 279325         if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3904 111361         f &= ~SCF_WHILEM_VISITED_POS;
3905            
3906           /* This will finish on WHILEM, setting scan, or on NULL: */
3907 442         minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3908           last, data, stopparen, recursed, NULL,
3909           (mincount == 0
3910           ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3911            
3912 28         if (flags & SCF_DO_STCLASS)
3913 442         data->start_class = oclass;
3914 111361         if (mincount == 0 || minnext == 0) {
3915 110947         if (flags & SCF_DO_STCLASS_OR) {
3916 856944         cl_or(pRExC_state, data->start_class, &this_class);
3917           }
3918 768424         else if (flags & SCF_DO_STCLASS_AND) {
3919           /* Switch to OR mode: cache the old value of
3920           * data->start_class */
3921 88520         INIT_AND_WITHP;
3922 294204         StructCopy(data->start_class, and_withp,
3923           struct regnode_charclass_class);
3924 294204         flags &= ~SCF_DO_STCLASS_AND;
3925 291516         StructCopy(&this_class, data->start_class,
3926           struct regnode_charclass_class);
3927 290952         flags |= SCF_DO_STCLASS_OR;
3928 74774664         SET_SSC_EOS(data->start_class);
3929           }
3930           } else { /* Non-zero len */
3931 74483712         if (flags & SCF_DO_STCLASS_OR) {
3932 64631706         cl_or(pRExC_state, data->start_class, &this_class);
3933 2688         cl_and(data->start_class, and_withp);
3934           }
3935 58         else if (flags & SCF_DO_STCLASS_AND)
3936 674595         cl_and(data->start_class, &this_class);
3937 673280         flags &= ~SCF_DO_STCLASS;
3938           }
3939 59340         if (!scan) /* It was not CURLYX, but CURLY. */
3940 25614         scan = next;
3941 25614         if (!(flags & SCF_TRIE_DOING_RESTUDY)
3942           /* ? quantifier ok, except for (?{ ... }) */
3943 22470         && (next_is_eval || !(mincount == 0 && maxcount == 1))
3944 22470         && (minnext == 0) && (deltanext == 0)
3945 5774790         && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3946 5752320         && maxcount <= REG_INFTY/3) /* Complement check for big count */
3947           {
3948           /* Fatal warnings may leak the regexp without this: */
3949 1325032         SAVEFREESV(RExC_rx_sv);
3950 3144         ckWARNreg(RExC_parse,
3951           "Quantifier unexpected on zero-length expression");
3952 0         (void)ReREFCNT_inc(RExC_rx_sv);
3953           }
3954            
3955 806616         min += minnext * mincount;
3956 804864         is_inf_internal |= deltanext == SSize_t_MAX
3957 765560         || (maxcount == REG_INFTY && minnext + deltanext > 0);
3958 3144         is_inf |= is_inf_internal;
3959 3144         if (is_inf)
3960           delta = SSize_t_MAX;
3961           else
3962 1456087         delta += (minnext + deltanext) * maxcount - minnext * mincount;
3963            
3964           /* Try powerful optimization CURLYX => CURLYN. */
3965 94794         if ( OP(oscan) == CURLYX && data
3966           && data->flags & SF_IN_PAR
3967 1456087         && !(data->flags & SF_HAS_EVAL)
3968 23009978         && !deltanext && minnext == 1 ) {
3969           /* Try to optimize to CURLYN. */
3970 4741990         regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3971           regnode * const nxt1 = nxt;
3972           #ifdef DEBUGGING
3973           regnode *nxt2;
3974           #endif
3975            
3976           /* Skip open. */
3977 2370995         nxt = regnext(nxt);
3978 2370995         if (!REGNODE_SIMPLE(OP(nxt))
3979 20638983         && !(PL_regkind[OP(nxt)] == EXACT
3980 219885         && STR_LEN(nxt) == 1))
3981           goto nogo;
3982           #ifdef DEBUGGING
3983           nxt2 = nxt;
3984           #endif
3985 219885         nxt = regnext(nxt);
3986 219885         if (OP(nxt) != CLOSE)
3987           goto nogo;
3988 144920         if (RExC_open_parens) {
3989 71920         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3990 0         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3991           }
3992           /* Now we know that nxt2 is the only contents: */
3993 0         oscan->flags = (U8)ARG(nxt);
3994 0         OP(oscan) = CURLYN;
3995 0         OP(nxt1) = NOTHING; /* was OPEN. */
3996            
3997           #ifdef DEBUGGING
3998 0         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3999 0         NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4000 0         NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4001 0         OP(nxt) = OPTIMIZED; /* was CLOSE. */
4002 219885         OP(nxt + 1) = OPTIMIZED; /* was count. */
4003 219885         NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4004           #endif
4005           }
4006           nogo:
4007            
4008           /* Try optimization CURLYX => CURLYM. */
4009 219885         if ( OP(oscan) == CURLYX && data
4010           && !(data->flags & SF_HAS_PAR)
4011 219885         && !(data->flags & SF_HAS_EVAL)
4012 219885         && !deltanext /* atom is fixed width */
4013 0         && minnext != 0 /* CURLYM can't handle zero width */
4014 219885         && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4015           ) {
4016           /* XXXX How to optimize if data == 0? */
4017           /* Optimize to a simpler form. */
4018 219885         regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4019           regnode *nxt2;
4020            
4021 64270         OP(oscan) = CURLYM;
4022 9444         while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4023 9444         && (OP(nxt2) != WHILEM))
4024           nxt = nxt2;
4025 219885         OP(nxt2) = SUCCEED; /* Whas WHILEM */
4026           /* Need to optimize away parenths. */
4027 217387         if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4028           /* Set the parenth number. */
4029 219885         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4030            
4031 219885         oscan->flags = (U8)ARG(nxt);
4032 219885         if (RExC_open_parens) {
4033 219885         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4034 46724         RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4035           }
4036 16         OP(nxt1) = OPTIMIZED; /* was OPEN. */
4037 46708         OP(nxt) = OPTIMIZED; /* was CLOSE. */
4038            
4039           #ifdef DEBUGGING
4040 2         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4041 46706         OP(nxt + 1) = OPTIMIZED; /* was count. */
4042 219867         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4043 219867         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4044           #endif
4045           #if 0
4046           while ( nxt1 && (OP(nxt1) != WHILEM)) {
4047           regnode *nnxt = regnext(nxt1);
4048           if (nnxt == nxt) {
4049           if (reg_off_by_arg[OP(nxt1)])
4050           ARG_SET(nxt1, nxt2 - nxt1);
4051           else if (nxt2 - nxt1 < U16_MAX)
4052           NEXT_OFF(nxt1) = nxt2 - nxt1;
4053           else
4054           OP(nxt) = NOTHING; /* Cannot beautify */
4055           }
4056           nxt1 = nnxt;
4057           }
4058           #endif
4059           /* Optimize again: */
4060 1634         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4061           NULL, stopparen, recursed, NULL, 0,depth+1);
4062           }
4063           else
4064 219867         oscan->flags = 0;
4065           }
4066 0         else if ((OP(oscan) == CURLYX)
4067 219867         && (flags & SCF_WHILEM_VISITED_POS)
4068           /* See the comment on a similar expression above.
4069           However, this time it's not a subexpression
4070           we care about, but the expression itself. */
4071 219867         && (maxcount == REG_INFTY)
4072 9444         && data && ++data->whilem_c < 16) {
4073           /* This stays as CURLYX, we can put the count/of pair. */
4074           /* Find WHILEM (as in regexec.c) */
4075 178         regnode *nxt = oscan + NEXT_OFF(oscan);
4076            
4077 9266         if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4078 9266         nxt += ARG(nxt);
4079 9266         PREVOPER(nxt)->flags = (U8)(data->whilem_c
4080 9266         | (RExC_whilem_seen << 4)); /* On WHILEM */
4081           }
4082 20419098         if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4083 5604918         pars++;
4084 5601450         if (flags & SCF_DO_SUBSTR) {
4085           SV *last_str = NULL;
4086 14814180         int counted = mincount != 0;
4087            
4088 5602032         if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4089           #if defined(SPARC64_GCC_WORKAROUND)
4090           SSize_t b = 0;
4091           STRLEN l = 0;
4092           const char *s = NULL;
4093           SSize_t old = 0;
4094            
4095           if (pos_before >= data->last_start_min)
4096           b = pos_before;
4097           else
4098           b = data->last_start_min;
4099            
4100           l = 0;
4101           s = SvPV_const(data->last_found, l);
4102           old = b - data->last_start_min;
4103            
4104           #else
4105 5601450         SSize_t b = pos_before >= data->last_start_min
4106 2388919         ? pos_before : data->last_start_min;
4107           STRLEN l;
4108 2388919         const char * const s = SvPV_const(data->last_found, l);
4109 5601450         SSize_t old = b - data->last_start_min;
4110           #endif
4111            
4112 5601450         if (UTF)
4113 9212148         old = utf8_hop((U8*)s, old) - (U8*)s;
4114 114456         l -= old;
4115           /* Get the added string: */
4116 114456         last_str = newSVpvn_utf8(s + old, l, UTF);
4117 9097692         if (deltanext == 0 && pos_before == b) {
4118           /* What was added is a constant string */
4119 746         if (mincount > 1) {
4120 506         SvGROW(last_str, (mincount * l) + 1);
4121 506         repeatcpy(SvPVX(last_str) + l,
4122 746         SvPVX_const(last_str), l, mincount - 1);
4123 116         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4124           /* Add additional parts. */
4125 116         SvCUR_set(data->last_found,
4126           SvCUR(data->last_found) - l);
4127 9096946         sv_catsv(data->last_found, last_str);
4128           {
4129 111760         SV * sv = data->last_found;
4130           MAGIC *mg =
4131 2856         SvUTF8(sv) && SvMAGICAL(sv) ?
4132 2856         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4133 111760         if (mg && mg->mg_len >= 0)
4134 158         mg->mg_len += CHR_SVLEN(last_str) - l;
4135           }
4136 111760         data->last_end += l * (mincount - 1);
4137           }
4138           } else {
4139           /* start offset must point into the last copy */
4140 8985186         data->last_start_min += minnext * (mincount - 1);
4141 184878         data->last_start_max += is_inf ? SSize_t_MAX
4142 184878         : (maxcount - 1) * (minnext + data->pos_delta);
4143           }
4144           }
4145           /* It is counted once already... */
4146 123612         data->pos_min += minnext * (mincount - counted);
4147           #if 0
4148           PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4149           " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4150           " maxcount=%"UVdf" mincount=%"UVdf"\n",
4151           (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4152           (UV)mincount);
4153           if (deltanext != SSize_t_MAX)
4154           PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4155           (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4156           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4157           #endif
4158 28         if (deltanext == SSize_t_MAX ||
4159 123612         -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4160 60         data->pos_delta = SSize_t_MAX;
4161           else
4162 0         data->pos_delta += - counted * deltanext +
4163 0         (minnext + deltanext) * maxcount - minnext * mincount;
4164 8861574         if (mincount != maxcount) {
4165           /* Cannot extend fixed substrings found inside
4166           the group. */
4167 36473         SCAN_COMMIT(pRExC_state,data,minlenp);
4168 36473         if (mincount && last_str) {
4169 36473         SV * const sv = data->last_found;
4170 36473         MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4171 32863         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4172            
4173 36473         if (mg)
4174 214         mg->mg_len = -1;
4175 36473         sv_setsv(sv, last_str);
4176 34725         data->last_end = data->pos_min;
4177 34725         data->last_start_min =
4178 6016         data->pos_min - CHR_SVLEN(last_str);
4179 5142         data->last_start_max = is_inf
4180           ? SSize_t_MAX
4181 5142         : data->pos_min + data->pos_delta
4182 5142         - CHR_SVLEN(last_str);
4183           }
4184 4470         data->longest = &(data->longest_float);
4185           }
4186 4470         SvREFCNT_dec(last_str);
4187           }
4188 672         if (data && (fl & SF_HAS_EVAL))
4189 5142         data->flags |= SF_HAS_EVAL;
4190           optimize_curly_tail:
4191 5142         if (OP(oscan) != CURLYX) {
4192 0         while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4193 0         && NEXT_OFF(next))
4194 5142         NEXT_OFF(oscan) += NEXT_OFF(next);
4195           }
4196 4470         continue;
4197           default: /* REF, and CLUMP only? */
4198 5142         if (flags & SCF_DO_SUBSTR) {
4199 3766         SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4200 1748         data->longest = &(data->longest_float);
4201           }
4202           is_inf = is_inf_internal = 1;
4203 3766         if (flags & SCF_DO_STCLASS_OR)
4204 3766         cl_anything(pRExC_state, data->start_class);
4205 5142         flags &= ~SCF_DO_STCLASS;
4206           break;
4207           }
4208           }
4209 3100         else if (OP(scan) == LNBREAK) {
4210 5142         if (flags & SCF_DO_STCLASS) {
4211           int value = 0;
4212 2298         CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4213 5142         if (flags & SCF_DO_STCLASS_AND) {
4214 3970         for (value = 0; value < 256; value++)
4215 1440         if (!is_VERTWS_cp(value))
4216 5142         ANYOF_BITMAP_CLEAR(data->start_class, value);
4217           }
4218           else {
4219 834         for (value = 0; value < 256; value++)
4220 5142         if (is_VERTWS_cp(value))
4221 0         ANYOF_BITMAP_SET(data->start_class, value);
4222           }
4223 0         if (flags & SCF_DO_STCLASS_OR)
4224 0         cl_and(data->start_class, and_withp);
4225 0         flags &= ~SCF_DO_STCLASS;
4226           }
4227 0         min++;
4228 5142         delta++; /* Because of the 2 char string cr-lf */
4229 4470         if (flags & SCF_DO_SUBSTR) {
4230 24         SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4231 4470         data->pos_min += 1;
4232 5142         data->pos_delta += 1;
4233 0         data->longest = &(data->longest_float);
4234           }
4235           }
4236 36595         else if (REGNODE_SIMPLE(OP(scan))) {
4237           int value = 0;
4238            
4239 32869         if (flags & SCF_DO_SUBSTR) {
4240 32869         SCAN_COMMIT(pRExC_state,data,minlenp);
4241 32869         data->pos_min++;
4242           }
4243 30019         min++;
4244 36479         if (flags & SCF_DO_STCLASS) {
4245           int loop_max = 256;
4246 36473         CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4247            
4248           /* Some of the logic below assumes that switching
4249           locale on will only add false positives. */
4250 36473         switch (PL_regkind[OP(scan)]) {
4251           U8 classnum;
4252            
4253           case SANY:
4254           default:
4255           #ifdef DEBUGGING
4256 42         Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4257           #endif
4258           do_default:
4259 42         if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4260 42         cl_anything(pRExC_state, data->start_class);
4261           break;
4262           case REG_ANY:
4263 42         if (OP(scan) == SANY)
4264           goto do_default;
4265 36431         if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4266 172         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4267 0         || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4268 0         cl_anything(pRExC_state, data->start_class);
4269           }
4270 172         if (flags & SCF_DO_STCLASS_AND || !value)
4271 172         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4272           break;
4273           case ANYOF:
4274 172         if (flags & SCF_DO_STCLASS_AND)
4275 172         cl_and(data->start_class,
4276           (struct regnode_charclass_class*)scan);
4277           else
4278 172         cl_or(pRExC_state, data->start_class,
4279           (struct regnode_charclass_class*)scan);
4280           break;
4281           case POSIXA:
4282           loop_max = 128;
4283           /* FALL THROUGH */
4284           case POSIXL:
4285           case POSIXD:
4286           case POSIXU:
4287 172         classnum = FLAGS(scan);
4288 36473         if (flags & SCF_DO_STCLASS_AND) {
4289 36473         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4290 46720021         ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4291 15077722         for (value = 0; value < loop_max; value++) {
4292 19658         if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4293 19658         ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4294           }
4295           }
4296           }
4297           }
4298           else {
4299 19658         if (data->start_class->flags & ANYOF_LOCALE) {
4300 19658         ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4301           }
4302           else {
4303            
4304           /* Even if under locale, set the bits for non-locale
4305           * in case it isn't a true locale-node. This will
4306           * create false positives if it truly is locale */
4307 19658         for (value = 0; value < loop_max; value++) {
4308 17472818         if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4309 17472818         ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4310           }
4311           }
4312           }
4313           }
4314           break;
4315           case NPOSIXA:
4316           loop_max = 128;
4317           /* FALL THROUGH */
4318           case NPOSIXL:
4319           case NPOSIXU:
4320           case NPOSIXD:
4321 17472818         classnum = FLAGS(scan);
4322 4343511         if (flags & SCF_DO_STCLASS_AND) {
4323 17472818         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4324 17472818         ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4325 1306136         for (value = 0; value < loop_max; value++) {
4326 1306136         if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4327 16166682         ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4328           }
4329           }
4330           }
4331           }
4332           else {
4333 3838307         if (data->start_class->flags & ANYOF_LOCALE) {
4334 3838307         ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4335           }
4336           else {
4337            
4338           /* Even if under locale, set the bits for non-locale in
4339           * case it isn't a true locale-node. This will create
4340           * false positives if it truly is locale */
4341 17472818         for (value = 0; value < loop_max; value++) {
4342 382429         if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4343 17472818         ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4344           }
4345           }
4346 33895         if (PL_regkind[OP(scan)] == NPOSIXD) {
4347 17472818         data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4348           }
4349           }
4350           }
4351           break;
4352           }
4353 2753131         if (flags & SCF_DO_STCLASS_OR)
4354 2753131         cl_and(data->start_class, and_withp);
4355 2753131         flags &= ~SCF_DO_STCLASS;
4356           }
4357           }
4358 2753247         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4359 666260         data->flags |= (OP(scan) == MEOL
4360           ? SF_BEFORE_MEOL
4361 2086883         : SF_BEFORE_SEOL);
4362 2753135         SCAN_COMMIT(pRExC_state, data, minlenp);
4363            
4364           }
4365 2753243         else if ( PL_regkind[OP(scan)] == BRANCHJ
4366           /* Lookbehind, or need to calculate parens/evals/stclass: */
4367 2753131         && (scan->flags || data || (flags & SCF_DO_STCLASS))
4368 48         && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4369 48         if ( OP(scan) == UNLESSM &&
4370 48         scan->flags == 0 &&
4371 0         OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4372 0         OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4373           ) {
4374           regnode *opt;
4375 0         regnode *upto= regnext(scan);
4376 0         DEBUG_PARSE_r({
4377           SV * const mysv_val=sv_newmortal();
4378           DEBUG_STUDYDATA("OPFAIL",data,depth);
4379            
4380           /*DEBUG_PARSE_MSG("opfail");*/
4381           regprop(RExC_rx, mysv_val, upto);
4382           PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4383           SvPV_nolen_const(mysv_val),
4384           (IV)REG_NODE_NUM(upto),
4385           (IV)(upto - scan)
4386           );
4387           });
4388 0         OP(scan) = OPFAIL;
4389 0         NEXT_OFF(scan) = upto - scan;
4390 0         for (opt= scan + 1; opt < upto ; opt++)
4391 0         OP(opt) = OPTIMIZED;
4392 336         scan= upto;
4393 288         continue;
4394           }
4395           if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4396           || OP(scan) == UNLESSM )
4397           {
4398           /* Negative Lookahead/lookbehind
4399           In this case we can't do fixed string optimisation.
4400           */
4401            
4402 48         SSize_t deltanext, minnext, fake = 0;
4403           regnode *nscan;
4404           struct regnode_charclass_class intrnl;
4405           int f = 0;
4406            
4407 48         data_fake.flags = 0;
4408 4931834         if (data) {
4409 4931834         data_fake.whilem_c = data->whilem_c;
4410 4477820         data_fake.last_closep = data->last_closep;
4411           }
4412           else
4413 4477820         data_fake.last_closep = &fake;
4414 4477814         data_fake.pos_delta = delta;
4415 4477814         if ( flags & SCF_DO_STCLASS && !scan->flags
4416 56         && OP(scan) == IFMATCH ) { /* Lookahead */
4417 454014         cl_init(pRExC_state, &intrnl);
4418 50512         data_fake.start_class = &intrnl;
4419           f |= SCF_DO_STCLASS_AND;
4420           }
4421 50512         if (flags & SCF_WHILEM_VISITED_POS)
4422 2547655         f |= SCF_WHILEM_VISITED_POS;
4423 0         next = regnext(scan);
4424 0         nscan = NEXTOPER(NEXTOPER(scan));
4425 0         minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4426           last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4427 0         if (scan->flags) {
4428 0         if (deltanext) {
4429 0         FAIL("Variable length lookbehind not implemented");
4430           }
4431 312412         else if (minnext > (I32)U8_MAX) {
4432 312412         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4433           }
4434 312412         scan->flags = (U8)minnext;
4435           }
4436 7322298         if (data) {
4437 6853680         if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4438 6853676         pars++;
4439 4         if (data_fake.flags & SF_HAS_EVAL)
4440 4         data->flags |= SF_HAS_EVAL;
4441 6853680         data->whilem_c = data_fake.whilem_c;
4442           }
4443 3482         if (f & SCF_DO_STCLASS_AND) {
4444 202         if (flags & SCF_DO_STCLASS_OR) {
4445           /* OR before, AND after: ideally we would recurse with
4446           * data_fake to get the AND applied by study of the
4447           * remainder of the pattern, and then derecurse;
4448           * *** HACK *** for now just treat as "no information".
4449           * See [perl #56690].
4450           */
4451 202         cl_init(pRExC_state, data->start_class);
4452           } else {
4453           /* AND before and after: combine and continue */
4454 3280         const int was = TEST_SSC_EOS(data->start_class);
4455            
4456 202         cl_and(data->start_class, &intrnl);
4457 202         if (was)
4458 6853680         SET_SSC_EOS(data->start_class);
4459           }
4460           }
4461           }
4462           #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4463           else {
4464           /* Positive Lookahead/lookbehind
4465           In this case we can do fixed string optimisation,
4466           but we must be careful about it. Note in the case of
4467           lookbehind the positions will be offset by the minimum
4468           length of the pattern, something we won't know about
4469           until after the recurse.
4470           */
4471           SSize_t deltanext;
4472           I32 fake = 0;
4473           regnode *nscan;
4474           struct regnode_charclass_class intrnl;
4475           int f = 0;
4476           /* We use SAVEFREEPV so that when the full compile
4477           is finished perl will clean up the allocated
4478           minlens when it's all done. This way we don't
4479           have to worry about freeing them when we know
4480           they wont be used, which would be a pain.
4481           */
4482           SSize_t *minnextp;
4483           Newx( minnextp, 1, SSize_t );
4484           SAVEFREEPV(minnextp);
4485            
4486           if (data) {
4487           StructCopy(data, &data_fake, scan_data_t);
4488           if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4489           f |= SCF_DO_SUBSTR;
4490           if (scan->flags)
4491           SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4492           data_fake.last_found=newSVsv(data->last_found);
4493           }
4494           }
4495           else
4496           data_fake.last_closep = &fake;
4497           data_fake.flags = 0;
4498           data_fake.pos_delta = delta;
4499           if (is_inf)
4500           data_fake.flags |= SF_IS_INF;
4501           if ( flags & SCF_DO_STCLASS && !scan->flags
4502           && OP(scan) == IFMATCH ) { /* Lookahead */
4503           cl_init(pRExC_state, &intrnl);
4504           data_fake.start_class = &intrnl;
4505           f |= SCF_DO_STCLASS_AND;
4506           }
4507           if (flags & SCF_WHILEM_VISITED_POS)
4508           f |= SCF_WHILEM_VISITED_POS;
4509           next = regnext(scan);
4510           nscan = NEXTOPER(NEXTOPER(scan));
4511            
4512           *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4513           last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4514           if (scan->flags) {
4515           if (deltanext) {
4516           FAIL("Variable length lookbehind not implemented");
4517           }
4518           else if (*minnextp > (I32)U8_MAX) {
4519           FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4520           }
4521           scan->flags = (U8)*minnextp;
4522           }
4523            
4524           *minnextp += min;
4525            
4526           if (f & SCF_DO_STCLASS_AND) {
4527           const int was = TEST_SSC_EOS(data.start_class);
4528            
4529           cl_and(data->start_class, &intrnl);
4530           if (was)
4531           SET_SSC_EOS(data->start_class);
4532           }
4533           if (data) {
4534           if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4535           pars++;
4536           if (data_fake.flags & SF_HAS_EVAL)
4537           data->flags |= SF_HAS_EVAL;
4538           data->whilem_c = data_fake.whilem_c;
4539           if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4540           if (RExC_rx->minlen<*minnextp)
4541           RExC_rx->minlen=*minnextp;
4542           SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4543           SvREFCNT_dec_NN(data_fake.last_found);
4544          
4545           if ( data_fake.minlen_fixed != minlenp )
4546           {
4547           data->offset_fixed= data_fake.offset_fixed;
4548           data->minlen_fixed= data_fake.minlen_fixed;
4549           data->lookbehind_fixed+= scan->flags;
4550           }
4551           if ( data_fake.minlen_float != minlenp )
4552           {
4553           data->minlen_float= data_fake.minlen_float;
4554           data->offset_float_min=data_fake.offset_float_min;
4555           data->offset_float_max=data_fake.offset_float_max;
4556           data->lookbehind_float+= scan->flags;
4557           }
4558           }
4559           }
4560           }
4561           #endif
4562           }
4563 6853792         else if (OP(scan) == OPEN) {
4564 312422         if (stopparen != (I32)ARG(scan))
4565 312422         pars++;
4566           }
4567 312514         else if (OP(scan) == CLOSE) {
4568 312422         if (stopparen == (I32)ARG(scan)) {
4569           break;
4570           }
4571 312422         if ((I32)ARG(scan) == is_par) {
4572 312420         next = regnext(scan);
4573            
4574 26813582         if ( next && (OP(next) != WHILEM) && next < last)
4575           is_par = 0; /* Disable optimization */
4576           }
4577 26813584         if (data)
4578 6931925         *(data->last_closep) = ARG(scan);
4579           }
4580 6932007         else if (OP(scan) == EVAL) {
4581 116880278         if (data)
4582 45033410         data->flags |= SF_HAS_EVAL;
4583           }
4584 45033342         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4585 16         if (flags & SCF_DO_SUBSTR) {
4586 45033314         SCAN_COMMIT(pRExC_state,data,minlenp);
4587 45033330         flags &= ~SCF_DO_SUBSTR;
4588           }
4589 36         if (data && OP(scan)==ACCEPT) {
4590 36         data->flags |= SCF_SEEN_ACCEPT;
4591 0         if (stopmin > min)
4592           stopmin = min;
4593           }
4594           }
4595 48         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4596           {
4597 8         if (flags & SCF_DO_SUBSTR) {
4598 8         SCAN_COMMIT(pRExC_state,data,minlenp);
4599 8         data->longest = &(data->longest_float);
4600           }
4601           is_inf = is_inf_internal = 1;
4602 0         if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4603 0         cl_anything(pRExC_state, data->start_class);
4604 28         flags &= ~SCF_DO_STCLASS;
4605           }
4606 48         else if (OP(scan) == GPOS) {
4607 36         if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4608 36         !(delta || is_inf || (data && data->pos_delta)))
4609           {
4610 45033294         if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4611 4498516         RExC_rx->extflags |= RXf_ANCH_GPOS;
4612 11068         if (RExC_rx->gofs < (STRLEN)min)
4613 11066         RExC_rx->gofs = min;
4614           } else {
4615 11066         RExC_rx->extflags |= RXf_GPOS_FLOAT;
4616 11066         RExC_rx->gofs = 0;
4617           }
4618           }
4619           #ifdef TRIE_STUDY_OPT
4620           #ifdef FULL_TRIE_STUDY
4621 11078         else if (PL_regkind[OP(scan)] == TRIE) {
4622           /* NOTE - There is similar code to this block above for handling
4623           BRANCH nodes on the initial study. If you change stuff here
4624           check there too. */
4625 11072         regnode *trie_node= scan;
4626 4498522         regnode *tail= regnext(scan);
4627 22249075         reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4628           SSize_t max1 = 0, min1 = SSize_t_MAX;
4629           struct regnode_charclass_class accum;
4630            
4631 45033300         if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4632 15504         SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4633 15504         if (flags & SCF_DO_STCLASS)
4634 4         cl_init_zero(pRExC_state, &accum);
4635          
4636 10         if (!trie->jump) {
4637 10         min1= trie->minlen;
4638 6         max1= trie->maxlen;
4639           } else {
4640           const regnode *nextbranch= NULL;
4641           U32 word;
4642          
4643 45033392         for ( word=1 ; word <= trie->wordcount ; word++)
4644           {
4645 14         SSize_t deltanext=0, minnext=0, f = 0, fake;
4646           struct regnode_charclass_class this_class;
4647          
4648 14         data_fake.flags = 0;
4649 14         if (data) {
4650 45033378         data_fake.whilem_c = data->whilem_c;
4651 102         data_fake.last_closep = data->last_closep;
4652           }
4653           else
4654 102         data_fake.last_closep = &fake;
4655 110         data_fake.pos_delta = delta;
4656 102         if (flags & SCF_DO_STCLASS) {
4657 8         cl_init(pRExC_state, &this_class);
4658 98         data_fake.start_class = &this_class;
4659           f = SCF_DO_STCLASS_AND;
4660           }
4661 45033280         if (flags & SCF_WHILEM_VISITED_POS)
4662 15021019         f |= SCF_WHILEM_VISITED_POS;
4663          
4664 45033280         if (trie->jump[word]) {
4665 25151657         if (!nextbranch)
4666 25151657         nextbranch = trie_node + trie->jump[0];
4667 25151657         scan= trie_node + trie->jump[word];
4668           /* We go from the jump point to the branch that follows
4669           it. Note this means we need the vestigal unused branches
4670           even though they arent otherwise used.
4671           */
4672 2132         minnext = study_chunk(pRExC_state, &scan, minlenp,
4673           &deltanext, (regnode *)nextbranch, &data_fake,
4674           stopparen, recursed, NULL, f,depth+1);
4675           }
4676 2132         if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4677 2132         nextbranch= regnext((regnode*)nextbranch);
4678          
4679 25151657         if (min1 > (SSize_t)(minnext + trie->minlen))
4680 45033280         min1 = minnext + trie->minlen;
4681 11062         if (deltanext == SSize_t_MAX) {
4682           is_inf = is_inf_internal = 1;
4683           max1 = SSize_t_MAX;
4684 45033294         } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
4685 1593078         max1 = minnext + deltanext + trie->maxlen;
4686          
4687 1593078         if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4688 1593078         pars++;
4689 103156         if (data_fake.flags & SCF_SEEN_ACCEPT) {
4690 103156         if ( stopmin > min + min1)
4691 103156         stopmin = min + min1;
4692 206370         flags &= ~SCF_DO_SUBSTR;
4693 103214         if (data)
4694 103214         data->flags |= SCF_SEEN_ACCEPT;
4695           }
4696 103214         if (data) {
4697 103214         if (data_fake.flags & SF_HAS_EVAL)
4698 103214         data->flags |= SF_HAS_EVAL;
4699 103214         data->whilem_c = data_fake.whilem_c;
4700           }
4701 103214         if (flags & SCF_DO_STCLASS)
4702 154821         cl_or(pRExC_state, &accum, &this_class);
4703           }
4704           }
4705 103220         if (flags & SCF_DO_SUBSTR) {
4706 26813580         data->pos_min += min1;
4707 6931921         data->pos_delta += max1 - min1;
4708 26813580         if (max1 != min1 || is_inf)
4709 5172         data->longest = &(data->longest_float);
4710           }
4711 391241         min += min1;
4712 388688         delta += max1 - min1;
4713 304954         if (flags & SCF_DO_STCLASS_OR) {
4714 5184         cl_or(pRExC_state, data->start_class, &accum);
4715 5184         if (min1) {
4716 5184         cl_and(data->start_class, and_withp);
4717 386131         flags &= ~SCF_DO_STCLASS;
4718           }
4719           }
4720 5272         else if (flags & SCF_DO_STCLASS_AND) {
4721 5210         if (min1) {
4722 64         cl_and(data->start_class, &accum);
4723 64         flags &= ~SCF_DO_STCLASS;
4724           }
4725           else {
4726           /* Switch to OR mode: cache the old value of
4727           * data->start_class */
4728 0         INIT_AND_WITHP;
4729 0         StructCopy(data->start_class, and_withp,
4730           struct regnode_charclass_class);
4731 64         flags &= ~SCF_DO_STCLASS_AND;
4732 64         StructCopy(&accum, data->start_class,
4733           struct regnode_charclass_class);
4734 2730         flags |= SCF_DO_STCLASS_OR;
4735 2666         SET_SSC_EOS(data->start_class);
4736           }
4737           }
4738 6         scan= tail;
4739 70         continue;
4740           }
4741           #else
4742           else if (PL_regkind[OP(scan)] == TRIE) {
4743           reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4744           U8*bang=NULL;
4745          
4746           min += trie->minlen;
4747           delta += (trie->maxlen - trie->minlen);
4748           flags &= ~SCF_DO_STCLASS; /* xxx */
4749           if (flags & SCF_DO_SUBSTR) {
4750           SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4751           data->pos_min += trie->minlen;
4752           data->pos_delta += (trie->maxlen - trie->minlen);
4753           if (trie->maxlen != trie->minlen)
4754           data->longest = &(data->longest_float);
4755           }
4756           if (trie->jump) /* no more substrings -- for now /grr*/
4757           flags &= ~SCF_DO_SUBSTR;
4758           }
4759           #endif /* old or new */
4760           #endif /* TRIE_STUDY_OPT */
4761            
4762           /* Else: zero-length, ignore. */
4763 358         scan = regnext(scan);
4764           }
4765 2730         if (frame) {
4766 2470         last = frame->last;
4767 352         scan = frame->next;
4768 240         stopparen = frame->stop;
4769 218         frame = frame->prev;
4770 218         goto fake_study_recurse;
4771           }
4772            
4773           finish:
4774 218         assert(!frame);
4775 218         DEBUG_STUDYDATA("pre-fin:",data,depth);
4776            
4777 218         *scanp = scan;
4778 2644         *deltap = is_inf_internal ? SSize_t_MAX : delta;
4779 196         if (flags & SCF_DO_SUBSTR && is_inf)
4780 2448         data->pos_delta = SSize_t_MAX - data->pos_min;
4781 260         if (is_par > (I32)U8_MAX)
4782           is_par = 0;
4783 260         if (is_par && pars==1 && data) {
4784 14         data->flags |= SF_IN_PAR;
4785 64         data->flags &= ~SF_HAS_PAR;
4786           }
4787 260         else if (pars && data) {
4788 74         data->flags |= SF_HAS_PAR;
4789 74         data->flags &= ~SF_IN_PAR;
4790           }
4791 260         if (flags & SCF_DO_STCLASS_OR)
4792 64         cl_and(data->start_class, and_withp);
4793 260         if (flags & SCF_TRIE_RESTUDY)
4794 70         data->flags |= SCF_TRIE_RESTUDY;
4795          
4796 260         DEBUG_STUDYDATA("post-fin:",data,depth);
4797          
4798 260         return min < stopmin ? min : stopmin;
4799           }
4800            
4801           STATIC U32
4802 156         S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4803           {
4804 156         U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4805            
4806 156         PERL_ARGS_ASSERT_ADD_DATA;
4807            
4808 156         Renewc(RExC_rxi->data,
4809           sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4810           char, struct reg_data);
4811 92         if(count)
4812 16         Renew(RExC_rxi->data->what, count + n, U8);
4813           else
4814 140         Newx(RExC_rxi->data->what, n, U8);
4815 156         RExC_rxi->data->count = count + n;
4816 156         Copy(s, RExC_rxi->data->what + count, n, U8);
4817 156         return count;
4818           }
4819            
4820           /*XXX: todo make this not included in a non debugging perl */
4821           #ifndef PERL_IN_XSUB_RE
4822           void
4823           Perl_reginitcolors(pTHX)
4824           {
4825           dVAR;
4826           const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4827           if (s) {
4828           char *t = savepv(s);
4829           int i = 0;
4830           PL_colors[0] = t;
4831           while (++i < 6) {
4832           t = strchr(t, '\t');
4833           if (t) {
4834           *t = '\0';
4835           PL_colors[i] = ++t;
4836           }
4837           else
4838           PL_colors[i] = t = (char *)"";
4839           }
4840           } else {
4841           int i = 0;
4842           while (i < 6)
4843           PL_colors[i++] = (char *)"";
4844           }
4845           PL_colorset = 1;
4846           }
4847           #endif
4848            
4849            
4850           #ifdef TRIE_STUDY_OPT
4851           #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4852           STMT_START { \
4853           if ( \
4854           (data.flags & SCF_TRIE_RESTUDY) \
4855           && ! restudied++ \
4856           ) { \
4857           dOsomething; \
4858           goto reStudy; \
4859           } \
4860           } STMT_END
4861           #else
4862           #define CHECK_RESTUDY_GOTO_butfirst
4863           #endif
4864            
4865           /*
4866           * pregcomp - compile a regular expression into internal code
4867           *
4868           * Decides which engine's compiler to call based on the hint currently in
4869           * scope
4870           */
4871            
4872           #ifndef PERL_IN_XSUB_RE
4873            
4874           /* return the currently in-scope regex engine (or the default if none) */
4875            
4876           regexp_engine const *
4877           Perl_current_re_engine(pTHX)
4878           {
4879           dVAR;
4880            
4881           if (IN_PERL_COMPILETIME) {
4882           HV * const table = GvHV(PL_hintgv);
4883           SV **ptr;
4884            
4885           if (!table)
4886           return &PL_core_reg_engine;
4887           ptr = hv_fetchs(table, "regcomp", FALSE);
4888           if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4889           return &PL_core_reg_engine;
4890           return INT2PTR(regexp_engine*,SvIV(*ptr));
4891           }
4892           else {
4893           SV *ptr;
4894           if (!PL_curcop->cop_hints_hash)
4895           return &PL_core_reg_engine;
4896           ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4897           if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4898           return &PL_core_reg_engine;
4899           return INT2PTR(regexp_engine*,SvIV(ptr));
4900           }
4901           }
4902            
4903            
4904           REGEXP *
4905           Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4906           {
4907           dVAR;
4908           regexp_engine const *eng = current_re_engine();
4909           GET_RE_DEBUG_FLAGS_DECL;
4910            
4911           PERL_ARGS_ASSERT_PREGCOMP;
4912            
4913           /* Dispatch a request to compile a regexp to correct regexp engine. */
4914           DEBUG_COMPILE_r({
4915           PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4916           PTR2UV(eng));
4917           });
4918           return CALLREGCOMP_ENG(eng, pattern, flags);
4919           }
4920           #endif
4921            
4922           /* public(ish) entry point for the perl core's own regex compiling code.
4923           * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4924           * pattern rather than a list of OPs, and uses the internal engine rather
4925           * than the current one */
4926            
4927           REGEXP *
4928 64         Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4929           {
4930 0         SV *pat = pattern; /* defeat constness! */
4931 0         PERL_ARGS_ASSERT_RE_COMPILE;
4932 64         return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4933           #ifdef PERL_IN_XSUB_RE
4934           &my_reg_engine,
4935           #else
4936           &PL_core_reg_engine,
4937           #endif
4938           NULL, NULL, rx_flags, 0);
4939           }
4940            
4941            
4942           /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4943           * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4944           * point to the realloced string and length.
4945           *
4946           * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4947           * stuff added */
4948            
4949           static void
4950 100         S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4951           char **pat_p, STRLEN *plen_p, int num_code_blocks)
4952           {
4953 44         U8 *const src = (U8*)*pat_p;
4954           U8 *dst;
4955           int n=0;
4956           STRLEN s = 0, d = 0;
4957           bool do_end = 0;
4958 44         GET_RE_DEBUG_FLAGS_DECL;
4959            
4960 92         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4961           "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4962            
4963 210         Newx(dst, *plen_p * 2 + 1, U8);
4964            
4965 380         while (s < *plen_p) {
4966 282         if (NATIVE_IS_INVARIANT(src[s]))
4967 228         dst[d] = src[s];
4968           else {
4969 54         dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
4970 32         dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
4971           }
4972 225         if (n < num_code_blocks) {
4973 22         if (!do_end && pRExC_state->code_blocks[n].start == s) {
4974 10         pRExC_state->code_blocks[n].start = d;
4975 12         assert(dst[d] == '(');
4976           do_end = 1;
4977           }
4978 90         else if (do_end && pRExC_state->code_blocks[n].end == s) {
4979 90         pRExC_state->code_blocks[n].end = d;
4980 90         assert(dst[d] == ')');
4981           do_end = 0;
4982 90         n++;
4983           }
4984           }
4985 291         s++;
4986 282         d++;
4987           }
4988 92         dst[d] = '\0';
4989 92         *plen_p = d;
4990 92         *pat_p = (char*) dst;
4991 92         SAVEFREEPV(*pat_p);
4992 96         RExC_orig_utf8 = RExC_utf8 = 1;
4993 16779664         }
4994            
4995            
4996            
4997           /* S_concat_pat(): concatenate a list of args to the pattern string pat,
4998           * while recording any code block indices, and handling overloading,
4999           * nested qr// objects etc. If pat is null, it will allocate a new
5000           * string, or just return the first arg, if there's only one.
5001           *
5002           * Returns the malloced/updated pat.
5003           * patternp and pat_count is the array of SVs to be concatted;
5004           * oplist is the optional list of ops that generated the SVs;
5005           * recompile_p is a pointer to a boolean that will be set if
5006           * the regex will need to be recompiled.
5007           * delim, if non-null is an SV that will be inserted between each element
5008           */
5009            
5010           static SV*
5011 16779864         S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5012           SV *pat, SV ** const patternp, int pat_count,
5013           OP *oplist, bool *recompile_p, SV *delim)
5014           {
5015           SV **svp;
5016           int n = 0;
5017           bool use_delim = FALSE;
5018           bool alloced = FALSE;
5019            
5020           /* if we know we have at least two args, create an empty string,
5021           * then concatenate args to that. For no args, return an empty string */
5022 1323509         if (!pat && pat_count != 1) {
5023 8832072         pat = newSVpvn("", 0);
5024 8791262         SAVEFREESV(pat);
5025           alloced = TRUE;
5026           }
5027            
5028 555330         for (svp = patternp; svp < patternp + pat_count; svp++) {
5029           SV *sv;
5030           SV *rx = NULL;
5031           STRLEN orig_patlen = 0;
5032           bool code = 0;
5033 554750         SV *msv = use_delim ? delim : *svp;
5034            
5035           /* if we've got a delimiter, we go round the loop twice for each
5036           * svp slot (except the last), using the delimiter the second
5037           * time round */
5038 8237106         if (use_delim) {
5039 8236762         svp--;
5040           use_delim = FALSE;
5041           }
5042 8791512         else if (delim)
5043           use_delim = TRUE;
5044            
5045 17582680         if (SvTYPE(msv) == SVt_PVAV) {
5046           /* we've encountered an interpolated array within
5047           * the pattern, e.g. /...@a..../. Expand the list of elements,
5048           * then recursively append elements.
5049           * The code in this block is based on S_pushav() */
5050            
5051           AV *const av = (AV*)msv;
5052 8791168         const I32 maxarg = AvFILL(av) + 1;
5053           SV **array;
5054            
5055 8791168         if (oplist) {
5056 8791168         assert(oplist->op_type == OP_PADAV
5057           || oplist->op_type == OP_RV2AV);
5058 5508759         oplist = oplist->op_sibling;;
5059           }
5060            
5061 12882224         if (SvRMAGICAL(av)) {
5062           U32 i;
5063            
5064 0         Newx(array, maxarg, SV*);
5065 26813538         SAVEFREEPV(array);
5066 26813538         for (i=0; i < (U32)maxarg; i++) {
5067 26813538         SV ** const svp = av_fetch(av, i, FALSE);
5068 26813538         array[i] = svp ? *svp : &PL_sv_undef;
5069           }
5070           }
5071           else
5072 20210         array = AvARRAY(av);
5073            
5074 20210         pat = S_concat_pat(aTHX_ pRExC_state, pat,
5075           array, maxarg, NULL, recompile_p,
5076           /* $" */
5077 20210         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5078            
5079 20210         continue;
5080           }
5081            
5082            
5083           /* we make the assumption here that each op in the list of
5084           * op_siblings maps to one SV pushed onto the stack,
5085           * except for code blocks, with have both an OP_NULL and
5086           * and OP_CONST.
5087           * This allows us to match up the list of SVs against the
5088           * list of OPs to find the next code block.
5089           *
5090           * Note that PUSHMARK PADSV PADSV ..
5091           * is optimised to
5092           * PADRANGE PADSV PADSV ..
5093           * so the alignment still works. */
5094            
5095 20554         if (oplist) {
5096 20290         if (oplist->op_type == OP_NULL
5097 20242         && (oplist->op_flags & OPf_SPECIAL))
5098           {
5099 20242         assert(n < pRExC_state->num_code_blocks);
5100 20242         pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5101 20242         pRExC_state->code_blocks[n].block = oplist;
5102 20242         pRExC_state->code_blocks[n].src_regex = NULL;
5103 20242         n++;
5104           code = 1;
5105 20242         oplist = oplist->op_sibling; /* skip CONST */
5106 20242         assert(oplist);
5107           }
5108 20290         oplist = oplist->op_sibling;;
5109           }
5110            
5111           /* apply magic and QR overloading to arg */
5112            
5113 20554         SvGETMAGIC(msv);
5114 20554         if (SvROK(msv) && SvAMAGIC(msv)) {
5115 20234         SV *sv = AMG_CALLunary(msv, regexp_amg);
5116 20234         if (sv) {
5117 20210         if (SvROK(sv))
5118 20210         sv = SvRV(sv);
5119 20210         if (SvTYPE(sv) != SVt_REGEXP)
5120 20210         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5121           msv = sv;
5122           }
5123           }
5124            
5125           /* try concatenation overload ... */
5126 20554         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5127           (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5128           {
5129 20210         sv_setsv(pat, sv);
5130           /* overloading involved: all bets are off over literal
5131           * code. Pretend we haven't seen it */
5132 20210         pRExC_state->num_code_blocks -= n;
5133 20210         n = 0;
5134           }
5135           else {
5136           /* ... or failing that, try "" overload */
5137 20554         while (SvAMAGIC(msv)
5138 20210         && (sv = AMG_CALLunary(msv, string_amg))
5139 20210         && sv != msv
5140 20210         && !( SvROK(msv)
5141 20210         && SvROK(sv)
5142 20210         && SvRV(msv) == SvRV(sv))
5143           ) {
5144           msv = sv;
5145 20210         SvGETMAGIC(msv);
5146           }
5147 20554         if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5148 26813632         msv = SvRV(msv);
5149            
5150 26813882         if (pat) {
5151           /* this is a partially unrolled
5152           * sv_catsv_nomg(pat, msv);
5153           * that allows us to adjust code block indices if
5154           * needed */
5155           STRLEN dlen;
5156 26813740         char *dst = SvPV_force_nomg(pat, dlen);
5157 22326156         orig_patlen = dlen;
5158 28280369         if (SvUTF8(msv) && !SvUTF8(pat)) {
5159 1466683         S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5160 4328895         sv_setpvn(pat, dst, dlen);
5161 2896618         SvUTF8_on(pat);
5162           }
5163 11268         sv_catsv_nomg(pat, msv);
5164           rx = msv;
5165           }
5166           else
5167           pat = msv;
5168            
5169 1432621         if (code)
5170 10604         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5171           }
5172            
5173           /* extract any code blocks within any embedded qr//'s */
5174 10916         if (rx && SvTYPE(rx) == SVt_REGEXP
5175 26813570         && RX_ENGINE((REGEXP*)rx)->op_comp)
5176           {
5177            
5178 4477796         RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5179 4328657         if (ri->num_code_blocks) {
5180           int i;
5181           /* the presence of an embedded qr// with code means
5182           * we should always recompile: the text of the
5183           * qr// may not have changed, but it may be a
5184           * different closure than last time */
5185 2896414         *recompile_p = 1;
5186 1453155         Renew(pRExC_state->code_blocks,
5187           pRExC_state->num_code_blocks + ri->num_code_blocks,
5188           struct reg_code_block);
5189 4477788         pRExC_state->num_code_blocks += ri->num_code_blocks;
5190            
5191 4477812         for (i=0; i < ri->num_code_blocks; i++) {
5192           struct reg_code_block *src, *dst;
5193 4477788         STRLEN offset = orig_patlen
5194 3045553         + ReANY((REGEXP *)rx)->pre_prefix;
5195 4328649         assert(n < pRExC_state->num_code_blocks);
5196 2896414         src = &ri->code_blocks[i];
5197 1453155         dst = &pRExC_state->code_blocks[n];
5198 26813562         dst->start = src->start + offset;
5199 10596         dst->end = src->end + offset;
5200 10596         dst->block = src->block;
5201 10620         dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5202           src->src_regex
5203           ? src->src_regex
5204           : (REGEXP*)rx);
5205 26813562         n++;
5206           }
5207           }
5208           }
5209           }
5210           /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5211 26813774         if (alloced)
5212 102         SvSETMAGIC(pat);
5213            
5214 26813774         return pat;
5215           }
5216            
5217            
5218            
5219           /* see if there are any run-time code blocks in the pattern.
5220           * False positives are allowed */
5221            
5222           static bool
5223 13427977         S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5224           char *pat, STRLEN plen)
5225           {
5226           int n = 0;
5227           STRLEN s;
5228            
5229 13428009         for (s = 0; s < plen; s++) {
5230 13428073         if (n < pRExC_state->num_code_blocks
5231 13428025         && s == pRExC_state->code_blocks[n].start)
5232           {
5233 13385597         s = pRExC_state->code_blocks[n].end;
5234 13385597         n++;
5235 12         continue;
5236           }
5237           /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5238           * positives here */
5239 144         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5240 36         (pat[s+2] == '{'
5241 12         || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5242           )
5243           return 1;
5244           }
5245           return 0;
5246           }
5247            
5248           /* Handle run-time code blocks. We will already have compiled any direct
5249           * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5250           * copy of it, but with any literal code blocks blanked out and
5251           * appropriate chars escaped; then feed it into
5252           *
5253           * eval "qr'modified_pattern'"
5254           *
5255           * For example,
5256           *
5257           * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5258           *
5259           * becomes
5260           *
5261           * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5262           *
5263           * After eval_sv()-ing that, grab any new code blocks from the returned qr
5264           * and merge them with any code blocks of the original regexp.
5265           *
5266           * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5267           * instead, just save the qr and return FALSE; this tells our caller that
5268           * the original pattern needs upgrading to utf8.
5269           */
5270            
5271           static bool
5272 24         S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5273           char *pat, STRLEN plen)
5274           {
5275           SV *qr;
5276            
5277 13385609         GET_RE_DEBUG_FLAGS_DECL;
5278            
5279 13385609         if (pRExC_state->runtime_code_qr) {
5280           /* this is the second time we've been called; this should
5281           * only happen if the main pattern got upgraded to utf8
5282           * during compilation; re-use the qr we compiled first time
5283           * round (which should be utf8 too)
5284           */
5285 13385585         qr = pRExC_state->runtime_code_qr;
5286 13385585         pRExC_state->runtime_code_qr = NULL;
5287 13695865         assert(RExC_utf8 && SvUTF8(qr));
5288           }
5289           else {
5290           int n = 0;
5291           STRLEN s;
5292           char *p, *newpat;
5293 13690723         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5294           SV *sv, *qr_ref;
5295 5194         dSP;
5296            
5297           /* determine how many extra chars we need for ' and \ escaping */
5298 13696189         for (s = 0; s < plen; s++) {
5299 9032686         if (pat[s] == '\'' || pat[s] == '\\')
5300 8935362         newlen++;
5301           }
5302            
5303 8650018         Newx(newpat, newlen, char);
5304           p = newpat;
5305 12969417         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5306            
5307 12969657         for (s = 0; s < plen; s++) {
5308 11564764         if (n < pRExC_state->num_code_blocks
5309 4916190         && s == pRExC_state->code_blocks[n].start)
5310           {
5311           /* blank out literal code block */
5312 4916122         assert(pat[s] == '(');
5313 4916194         while (s <= pRExC_state->code_blocks[n].end) {
5314 8779827         *p++ = '_';
5315 6462         s++;
5316           }
5317 8773377         s--;
5318 8779767         n++;
5319 8779767         continue;
5320           }
5321 8779983         if (pat[s] == '\'' || pat[s] == '\\')
5322 8779755         *p++ = '\\';
5323 292         *p++ = pat[s];
5324           }
5325 24         *p++ = '\'';
5326 88         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5327 0         *p++ = 'x';
5328 155164         *p++ = '\0';
5329 8779779         DEBUG_COMPILE_r({
5330           PerlIO_printf(Perl_debug_log,
5331           "%sre-parsing pattern for runtime code:%s %s\n",
5332           PL_colors[4],PL_colors[5],newpat);
5333           });
5334            
5335 8779779         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5336 8779779         Safefree(newpat);
5337            
5338 8779779         ENTER;
5339 8779779         SAVETMPS;
5340 8779779         save_re_context();
5341 8779779         PUSHSTACKi(PERLSI_REQUIRE);
5342           /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5343           * parsing qr''; normally only q'' does this. It also alters
5344           * hints handling */
5345 8779779         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5346 8779779         SvREFCNT_dec_NN(sv);
5347 8779779         SPAGAIN;
5348 8779779         qr_ref = POPs;
5349 8779779         PUTBACK;
5350           {
5351 8779779         SV * const errsv = ERRSV;
5352 8779779         if (SvTRUE_NN(errsv))
5353           {
5354 8779755         Safefree(pRExC_state->code_blocks);
5355           /* use croak_sv ? */
5356 8779755         Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5357           }
5358           }
5359 8779779         assert(SvROK(qr_ref));
5360 8779779         qr = SvRV(qr_ref);
5361 8779803         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5362           /* the leaving below frees the tmp qr_ref.
5363           * Give qr a life of its own */
5364           SvREFCNT_inc(qr);
5365 8779779         POPSTACK;
5366 8779779         FREETMPS;
5367 8779779         LEAVE;
5368            
5369           }
5370            
5371 8779779         if (!RExC_utf8 && SvUTF8(qr)) {
5372           /* first time through; the pattern got upgraded; save the
5373           * qr for the next time through */
5374 8779755         assert(!pRExC_state->runtime_code_qr);
5375 112836         pRExC_state->runtime_code_qr = qr;
5376 112836         return 0;
5377           }
5378            
5379            
5380           /* extract any code blocks within the returned qr// */
5381            
5382            
5383           /* merge the main (r1) and run-time (r2) code blocks into one */
5384           {
5385 112860         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5386           struct reg_code_block *new_block, *dst;
5387           RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5388           int i1 = 0, i2 = 0;
5389            
5390 112860         if (!r2->num_code_blocks) /* we guessed wrong */
5391           {
5392 8779755         SvREFCNT_dec_NN(qr);
5393 310280         return 1;
5394           }
5395            
5396 310304         Newx(new_block,
5397           r1->num_code_blocks + r2->num_code_blocks,
5398           struct reg_code_block);
5399           dst = new_block;
5400            
5401 310364         while ( i1 < r1->num_code_blocks
5402 48         || i2 < r2->num_code_blocks)
5403           {
5404           struct reg_code_block *src;
5405           bool is_qr = 0;
5406            
5407 8466107         if (i1 == r1->num_code_blocks) {
5408 112848         src = &r2->code_blocks[i2++];
5409           is_qr = 1;
5410           }
5411 8466083         else if (i2 == r2->num_code_blocks)
5412 1222622         src = &r1->code_blocks[i1++];
5413 8466095         else if ( r1->code_blocks[i1].start
5414 12         < r2->code_blocks[i2].start)
5415           {
5416 8466083         src = &r1->code_blocks[i1++];
5417 8466083         assert(src->end < r2->code_blocks[i2].start);
5418           }
5419           else {
5420 4944         assert( r1->code_blocks[i1].start
5421           > r2->code_blocks[i2].start);
5422 8466071         src = &r2->code_blocks[i2++];
5423           is_qr = 1;
5424 8466071         assert(src->end < r1->code_blocks[i1].start);
5425           }
5426            
5427 8466107         assert(pat[src->start] == '(');
5428 36         assert(pat[src->end] == ')');
5429 8466107         dst->start = src->start;
5430 8466107         dst->end = src->end;
5431 8466107         dst->block = src->block;
5432 8466107         dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5433 8466119         : src->src_regex;
5434 1846538         dst++;
5435           }
5436 1846526         r1->num_code_blocks += r2->num_code_blocks;
5437 3488637         Safefree(r1->code_blocks);
5438 105424         r1->code_blocks = new_block;
5439           }
5440            
5441 103292         SvREFCNT_dec_NN(qr);
5442 6619593         return 1;
5443           }
5444            
5445            
5446           STATIC bool
5447 8466339         S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5448           SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5449           {
5450           /* This is the common code for setting up the floating and fixed length
5451           * string data extracted from Perl_re_op_compile() below. Returns a boolean
5452           * as to whether succeeded or not */
5453            
5454           I32 t;
5455           SSize_t ml;
5456            
5457 16932410         if (! (longest_length
5458           || (eol /* Can't have SEOL and MULTI */
5459 8466073         && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5460           )
5461           /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5462 8466191         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5463           {
5464           return FALSE;
5465           }
5466            
5467           /* copy the information about the longest from the reg_scan_data
5468           over to the program. */
5469 8466191         if (SvUTF8(sv_longest)) {
5470 8466091         *rx_utf8 = sv_longest;
5471 8466091         *rx_substr = NULL;
5472           } else {
5473 8466171         *rx_substr = sv_longest;
5474 8466171         *rx_utf8 = NULL;
5475           }
5476           /* end_shift is how many chars that must be matched that
5477           follow this item. We calculate it ahead of time as once the
5478           lookbehind offset is added in we lose the ability to correctly
5479           calculate it.*/
5480 8466191         ml = minlen ? *(minlen) : (SSize_t)longest_length;
5481 8466311         *rx_end_shift = ml - offset
5482 8466191         - longest_length + (SvTAIL(sv_longest) != 0)
5483 8466191         + lookbehind;
5484            
5485 682354         t = (eol/* Can't have SEOL and MULTI */
5486 8466073         && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5487 8466191         fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5488            
5489 8466071         return TRUE;
5490           }
5491            
5492           /*
5493           * Perl_re_op_compile - the perl internal RE engine's function to compile a
5494           * regular expression into internal code.
5495           * The pattern may be passed either as:
5496           * a list of SVs (patternp plus pat_count)
5497           * a list of OPs (expr)
5498           * If both are passed, the SV list is used, but the OP list indicates
5499           * which SVs are actually pre-compiled code blocks
5500           *
5501           * The SVs in the list have magic and qr overloading applied to them (and
5502           * the list may be modified in-place with replacement SVs in the latter
5503           * case).
5504           *
5505           * If the pattern hasn't changed from old_re, then old_re will be
5506           * returned.
5507           *
5508           * eng is the current engine. If that engine has an op_comp method, then
5509           * handle directly (i.e. we assume that op_comp was us); otherwise, just
5510           * do the initial concatenation of arguments and pass on to the external
5511           * engine.
5512           *
5513           * If is_bare_re is not null, set it to a boolean indicating whether the
5514           * arg list reduced (after overloading) to a single bare regex which has
5515           * been returned (i.e. /$qr/).
5516           *
5517           * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5518           *
5519           * pm_flags contains the PMf_* flags, typically based on those from the
5520           * pm_flags field of the related PMOP. Currently we're only interested in
5521           * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5522           *
5523           * We can't allocate space until we know how big the compiled form will be,
5524           * but we can't compile it (and thus know how big it is) until we've got a
5525           * place to put the code. So we cheat: we compile it twice, once with code
5526           * generation turned off and size counting turned on, and once "for real".
5527           * This also means that we don't allocate space until we are sure that the
5528           * thing really will compile successfully, and we never have to move the
5529           * code and thus invalidate pointers into it. (Note that it has to be in
5530           * one piece because free() must be able to free it all.) [NB: not true in perl]
5531           *
5532           * Beware that the optimization-preparation code in here knows about some
5533           * of the structure of the compiled regexp. [I'll say.]
5534           */
5535            
5536           REGEXP *
5537 8466307         Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5538           OP *expr, const regexp_engine* eng, REGEXP *old_re,
5539           bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5540           {
5541           dVAR;
5542           REGEXP *rx;
5543           struct regexp *r;
5544           regexp_internal *ri;
5545           STRLEN plen;
5546           char *exp;
5547           regnode *scan;
5548           I32 flags;
5549 1450628         SSize_t minlen = 0;
5550           U32 rx_flags;
5551           SV *pat;
5552           SV *code_blocksv = NULL;
5553           SV** new_patternp = patternp;
5554            
5555           /* these are all flags - maybe they should be turned
5556           * into a single int with different bit masks */
5557           I32 sawlookahead = 0;
5558           I32 sawplus = 0;
5559           I32 sawopen = 0;
5560           I32 sawminmod = 0;
5561            
5562           regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5563 1450628         bool recompile = 0;
5564           bool runtime_code = 0;
5565           scan_data_t data;
5566           RExC_state_t RExC_state;
5567           RExC_state_t * const pRExC_state = &RExC_state;
5568           #ifdef TRIE_STUDY_OPT
5569           int restudied = 0;
5570           RExC_state_t copyRExC_state;
5571           #endif
5572 1450628         GET_RE_DEBUG_FLAGS_DECL;
5573            
5574 8466307         PERL_ARGS_ASSERT_RE_OP_COMPILE;
5575            
5576 4316342         DEBUG_r(if (!PL_colorset) reginitcolors());
5577            
5578           #ifndef PERL_IN_XSUB_RE
5579           /* Initialize these here instead of as-needed, as is quick and avoids
5580           * having to test them each time otherwise */
5581           if (! PL_AboveLatin1) {
5582           PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5583           PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5584           PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5585            
5586           PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5587           = _new_invlist_C_array(L1PosixAlnum_invlist);
5588           PL_Posix_ptrs[_CC_ALPHANUMERIC]
5589           = _new_invlist_C_array(PosixAlnum_invlist);
5590            
5591           PL_L1Posix_ptrs[_CC_ALPHA]
5592           = _new_invlist_C_array(L1PosixAlpha_invlist);
5593           PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5594            
5595           PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5596           PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5597            
5598           /* Cased is the same as Alpha in the ASCII range */
5599           PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5600           PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5601            
5602           PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5603           PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5604            
5605           PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5606           PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5607            
5608           PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5609           PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5610            
5611           PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5612           PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5613            
5614           PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5615           PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5616            
5617           PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5618           PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5619            
5620           PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5621           PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5622           PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5623           PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5624            
5625           PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5626           PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5627            
5628           PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5629            
5630           PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5631           PL_L1Posix_ptrs[_CC_WORDCHAR]
5632           = _new_invlist_C_array(L1PosixWord_invlist);
5633            
5634           PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5635           PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5636            
5637           PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5638           }
5639           #endif
5640            
5641 42330591         pRExC_state->code_blocks = NULL;
5642 33864520         pRExC_state->num_code_blocks = 0;
5643            
5644 3895847         if (is_bare_re)
5645 33864464         *is_bare_re = FALSE;
5646            
5647 8466331         if (expr && (expr->op_type == OP_LIST ||
5648 8466095         (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5649           /* allocate code_blocks if needed */
5650           OP *o;
5651           int ncode = 0;
5652            
5653 16932294         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5654 8466191         if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5655 8466103         ncode++; /* count of DO blocks */
5656 84         if (ncode) {
5657 8466103         pRExC_state->num_code_blocks = ncode;
5658 8466103         Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5659           }
5660           }
5661            
5662 16932378         if (!pat_count) {
5663           /* compile-time pattern with just OP_CONSTs and DO blocks */
5664            
5665           int n;
5666           OP *o;
5667            
5668           /* find how many CONSTs there are */
5669 8466127         assert(expr);
5670           n = 0;
5671 8466127         if (expr->op_type == OP_CONST)
5672           n = 1;
5673           else
5674 8466223         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5675 2722         if (o->op_type == OP_CONST)
5676 2658         n++;
5677           }
5678            
5679           /* fake up an SV array */
5680            
5681 2658         assert(!new_patternp);
5682 2658         Newx(new_patternp, n, SV*);
5683 8466127         SAVEFREEPV(new_patternp);
5684           pat_count = n;
5685            
5686           n = 0;
5687 8466127         if (expr->op_type == OP_CONST)
5688 8466095         new_patternp[n] = cSVOPx_sv(expr);
5689           else
5690 8466223         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5691 8466191         if (o->op_type == OP_CONST)
5692 8466127         new_patternp[n++] = cSVOPo_sv;
5693           }
5694            
5695           }
5696            
5697 8466307         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5698           "Assembling pattern from %d elements%s\n", pat_count,
5699           orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5700            
5701           /* set expr to the first arg op */
5702            
5703 8466307         if (pRExC_state->num_code_blocks
5704 8466103         && expr->op_type != OP_CONST)
5705           {
5706 8466103         expr = cLISTOPx(expr)->op_first;
5707 8466103         assert( expr->op_type == OP_PUSHMARK
5708           || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5709           || expr->op_type == OP_PADRANGE);
5710 8466103         expr = expr->op_sibling;
5711           }
5712            
5713 8466307         pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5714           expr, &recompile, NULL);
5715            
5716           /* handle bare (possibly after overloading) regex: foo =~ $re */
5717           {
5718           SV *re = pat;
5719 8466307         if (SvROK(re))
5720 8466071         re = SvRV(re);
5721 8466307         if (SvTYPE(re) == SVt_REGEXP) {
5722 62         if (is_bare_re)
5723 62         *is_bare_re = TRUE;
5724           SvREFCNT_inc(re);
5725 8465813         Safefree(pRExC_state->code_blocks);
5726 8465813         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5727           "Precompiled pattern%s\n",
5728           orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5729            
5730           return (REGEXP*)re;
5731           }
5732           }
5733            
5734 2776         exp = SvPV_nomg(pat, plen);
5735            
5736 4333987         if (!eng->op_comp) {
5737 8498368         if ((SvUTF8(pat) && IN_BYTES)
5738 8498368         || SvGMAGICAL(pat) || SvAMAGIC(pat))
5739           {
5740           /* make a temporary copy; either to convert to bytes,
5741           * or to avoid repeating get-magic / overloaded stringify */
5742 8498368         pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5743           (IN_BYTES ? 0 : SvUTF8(pat)));
5744           }
5745 8465751         Safefree(pRExC_state->code_blocks);
5746 8465751         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5747           }
5748            
5749           /* ignore the utf8ness if the pattern is 0 length */
5750 32791         RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5751 32791         RExC_uni_semantics = 0;
5752 32791         RExC_contains_locale = 0;
5753 174         pRExC_state->runtime_code_qr = NULL;
5754            
5755 32791         DEBUG_COMPILE_r({
5756           SV *dsv= sv_newmortal();
5757           RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5758           PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5759           PL_colors[4],PL_colors[5],s);
5760           });
5761            
5762           redo_first_pass:
5763           /* we jump here if we upgrade the pattern to utf8 and have to
5764           * recompile */
5765            
5766 32815         if ((pm_flags & PMf_USE_RE_EVAL)
5767           /* this second condition covers the non-regex literal case,
5768           * i.e. $foo =~ '(?{})'. */
5769 8498542         || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5770           )
5771 8498392         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5772            
5773           /* return old regex if pattern hasn't changed */
5774           /* XXX: note in the below we have to check the flags as well as the pattern.
5775           *
5776           * Things get a touch tricky as we have to compare the utf8 flag independently
5777           * from the compile flags.
5778           */
5779            
5780 685962         if ( old_re
5781 8498462         && !recompile
5782 8498446         && !!RX_UTF8(old_re) == !!RExC_utf8
5783 1712960         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5784 8498476         && RX_PRECOMP(old_re)
5785 8498476         && RX_PRELEN(old_re) == plen
5786 8457216         && memEQ(RX_PRECOMP(old_re), exp, plen)
5787 8457120         && !runtime_code /* with runtime code, always recompile */ )
5788           {
5789 8457112         Safefree(pRExC_state->code_blocks);
5790 18342017         return old_re;
5791           }
5792            
5793           rx_flags = orig_rx_flags;
5794            
5795 13459962         if (initial_charset == REGEX_LOCALE_CHARSET) {
5796 13345125         RExC_contains_locale = 1;
5797           }
5798 13174618         else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5799            
5800           /* Set to use unicode semantics if the pattern is in utf8 and has the
5801           * 'depends' charset specified, as it means unicode when utf8 */
5802           set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5803           }
5804            
5805 13061762         RExC_precomp = exp;
5806 13186260         RExC_flags = rx_flags;
5807 5806115         RExC_pm_flags = pm_flags;
5808            
5809 1347418         if (runtime_code) {
5810 1180809         if (TAINTING_get && TAINT_get)
5811 1180785         Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5812            
5813 1347284         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5814           /* whoops, we have a non-utf8 pattern, whilst run-time code
5815           * got compiled as utf8. Try again with a utf8 pattern */
5816 1347260         S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5817           pRExC_state->num_code_blocks);
5818 11574721         goto redo_first_pass;
5819           }
5820           }
5821 4788639         assert(!pRExC_state->runtime_code_qr);
5822            
5823 487292         RExC_sawback = 0;
5824            
5825 6799143         RExC_seen = 0;
5826 26008         RExC_in_lookbehind = 0;
5827 25836         RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5828 25836         RExC_extralen = 0;
5829 15622         RExC_override_recoding = 0;
5830 15622         RExC_in_multi_char_class = 0;
5831            
5832           /* First pass: determine size, legality. */
5833 10372         RExC_parse = exp;
5834 10372         RExC_start = exp;
5835 25836         RExC_end = exp + plen;
5836 25836         RExC_naughty = 0;
5837 25836         RExC_npar = 1;
5838 6760720         RExC_nestroot = 0;
5839 1467975         RExC_size = 0L;
5840 5292903         RExC_emit = &RExC_emit_dummy;
5841 260520         RExC_whilem_seen = 0;
5842 5032541         RExC_open_parens = NULL;
5843 4401362         RExC_close_parens = NULL;
5844 2929895         RExC_opend = NULL;
5845 2977711         RExC_paren_names = NULL;
5846           #ifdef DEBUGGING
5847 2977711         RExC_paren_name_list = NULL;
5848           #endif
5849 2054988         RExC_recurse = NULL;
5850 17720         RExC_recurse_count = 0;
5851 17720         pRExC_state->code_index = 0;
5852            
5853           #if 0 /* REGC() is (currently) a NOP at the first pass.
5854           * Clever compilers notice this and complain. --jhi */
5855           REGC((U8)REG_MAGIC, (char*)RExC_emit);
5856           #endif
5857 17720         DEBUG_PARSE_r(
5858           PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5859           RExC_lastnum=0;
5860           RExC_lastparse=NULL;
5861           );
5862           /* reg may croak on us, not giving us a chance to free
5863           pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5864           need it to survive as long as the regexp (qr/(?{})/).
5865           We must check that code_blocksv is not already set, because we may
5866           have jumped back to restart the sizing pass. */
5867 3029214         if (pRExC_state->code_blocks && !code_blocksv) {
5868 1251520         code_blocksv = newSV_type(SVt_PV);
5869 260060         SAVEFREESV(code_blocksv);
5870 188944         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5871 122602         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5872           }
5873 122692         if (reg(pRExC_state, 0, &flags,1) == NULL) {
5874           /* It's possible to write a regexp in ascii that represents Unicode
5875           codepoints outside of the byte range, such as via \x{100}. If we
5876           detect such a sequence we have to convert the entire pattern to utf8
5877           and then recompile, as our sizing calculation will have been based
5878           on 1 byte == 1 character, but we will need to use utf8 to encode
5879           at least some part of the pattern, and therefore must convert the whole
5880           thing.
5881           -- dmq */
5882 122558         if (flags & RESTART_UTF8) {
5883 122558         S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5884           pRExC_state->num_code_blocks);
5885 122558         goto redo_first_pass;
5886           }
5887 122534         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
5888           }
5889 8457206         if (code_blocksv)
5890 163841         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5891            
5892 163701         DEBUG_PARSE_r({
5893           PerlIO_printf(Perl_debug_log,
5894           "Required size %"IVdf" nodes\n"
5895           "Starting second pass (creation)\n",
5896           (IV)RExC_size);
5897           RExC_lastnum=0;
5898           RExC_lastparse=NULL;
5899           });
5900            
5901           /* The first pass could have found things that force Unicode semantics */
5902 8457206         if ((RExC_utf8 || RExC_uni_semantics)
5903 8457092         && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5904           {
5905           set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5906           }
5907            
5908           /* Small enough for pointer-storage convention?
5909           If extralen==0, this means that we will not need long jumps. */
5910 8457206         if (RExC_size >= 0x10000L && RExC_extralen)
5911 8457072         RExC_size += RExC_extralen;
5912           else
5913 8457206         RExC_extralen = 0;
5914 8457206         if (RExC_whilem_seen > 15)
5915 8457072         RExC_whilem_seen = 15;
5916            
5917           /* Allocate space and zero-initialize. Note, the two step process
5918           of zeroing when in debug mode, thus anything assigned has to
5919           happen after that */
5920 8457206         rx = (REGEXP*) newSV_type(SVt_REGEXP);
5921 8457206         r = ReANY(rx);
5922 8457206         Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5923           char, regexp_internal);
5924 6216215         if ( r == NULL || ri == NULL )
5925 6216081         FAIL("Regexp out of space");
5926           #ifdef DEBUGGING
5927           /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5928 8457206         Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5929           #else
5930           /* bulk initialize base fields with 0. */
5931           Zero(ri, sizeof(regexp_internal), char);
5932           #endif
5933            
5934           /* non-zero initialization begins here */
5935 8457206         RXi_SET( r, ri );
5936 8457184         r->engine= eng;
5937 8429185         r->extflags = rx_flags;
5938 2749792         RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5939            
5940 1307114         if (pm_flags & PMf_IS_QR) {
5941 756375         ri->code_blocks = pRExC_state->code_blocks;
5942 756373         ri->num_code_blocks = pRExC_state->num_code_blocks;
5943           }
5944           else
5945           {
5946           int n;
5947 755647         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5948 8429075         if (pRExC_state->code_blocks[n].src_regex)
5949 8429075         SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5950 10521380         SAVEFREEPV(pRExC_state->code_blocks);
5951           }
5952            
5953           {
5954 4541188         bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5955 2092543         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5956            
5957           /* The caret is output if there are any defaults: if not all the STD
5958           * flags are set, or if no character set specifier is needed */
5959 24727911         bool has_default =
5960 8350711         (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5961 8350711         || ! has_charset);
5962 8350711         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5963 8350711         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5964           >> RXf_PMf_STD_PMMOD_SHIFT);
5965           const char *fptr = STD_PAT_MODS; /*"msix"*/
5966           char *p;
5967           /* Allocate for the worst case, which is all the std flags are turned
5968           * on. If more precision is desired, we could do a population count of
5969           * the flags set. This could be done with a small lookup table, or by
5970           * shifting, masking and adding, or even, when available, assembly
5971           * language for a machine-language population count.
5972           * We never output a minus, as all those are defaults, so are
5973           * covered by the caret */
5974 8350711         const STRLEN wraplen = plen + has_p + has_runon
5975 4113228         + has_default /* If needs a caret */
5976            
5977           /* If needs a character set specifier */
5978 4113228         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5979           + (sizeof(STD_PAT_MODS) - 1)
5980           + (sizeof("(?:)") - 1);
5981            
5982 4113228         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5983 987909         r->xpv_len_u.xpvlenu_pv = p;
5984 4113228         if (RExC_utf8)
5985 4315977         SvFLAGS(rx) |= SVf_UTF8;
5986 8429185         *p++='('; *p++='?';
5987            
5988           /* If a default, cover it using the caret */
5989 29087870         if (has_default) {
5990 8429185         *p++= DEFAULT_PAT_MOD;
5991           }
5992 8429185         if (has_charset) {
5993           STRLEN len;
5994 8429071         const char* const name = get_regex_charset_name(r->extflags, &len);
5995 8429071         Copy(name, p, len, char);
5996 8429071         p += len;
5997           }
5998 8429185         if (has_p)
5999 4678074         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6000           {
6001           char ch;
6002 4678744         while((ch = *fptr++)) {
6003 3751513         if(reganch & 1)
6004 8429055         *p++ = ch;
6005 8429587         reganch >>= 1;
6006           }
6007           }
6008            
6009 2240949         *p++ = ':';
6010 206347         Copy(RExC_precomp, p, plen, char);
6011 8429185         assert ((RX_WRAPPED(rx) - p) < 16);
6012 4018439         r->pre_prefix = p - RX_WRAPPED(rx);
6013 1808186         p += plen;
6014 561398         if (has_runon)
6015 450060         *p++ = '\n';
6016 450194         *p++ = ')';
6017 450194         *p = 0;
6018 450328         SvCUR_set(rx, p - RX_WRAPPED(rx));
6019           }
6020            
6021 450194         r->intflags = 0;
6022 450194         r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6023          
6024 8429185         if (RExC_seen & REG_SEEN_RECURSE) {
6025 4126792         Newxz(RExC_open_parens, RExC_npar,regnode *);
6026 4126792         SAVEFREEPV(RExC_open_parens);
6027 4126792         Newxz(RExC_close_parens,RExC_npar,regnode *);
6028 4126792         SAVEFREEPV(RExC_close_parens);
6029           }
6030            
6031           /* Useful during FAIL. */
6032           #ifdef RE_TRACK_PATTERN_OFFSETS
6033 4126926         Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6034 117512         DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6035           "%s %"UVuf" bytes for offset annotations.\n",
6036           ri->u.offsets ? "Got" : "Couldn't get",
6037           (UV)((2*RExC_size+1) * sizeof(U32))));
6038           #endif
6039 4302393         SetProgLen(ri,RExC_size);
6040 4302393         RExC_rx_sv = rx;
6041 4302393         RExC_rx = r;
6042 4302393         RExC_rxi = ri;
6043            
6044           /* Second pass: emit code. */
6045 4302393         RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6046 8429185         RExC_pm_flags = pm_flags;
6047 6308887         RExC_parse = exp;
6048 6308887         RExC_end = exp + plen;
6049 1676307         RExC_naughty = 0;
6050 41430         RExC_npar = 1;
6051 41430         RExC_emit_start = ri->program;
6052 41430         RExC_emit = ri->program;
6053 41430         RExC_emit_bound = ri->program + RExC_size + 1;
6054 41430         pRExC_state->code_index = 0;
6055            
6056 41430         REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6057 41430         if (reg(pRExC_state, 0, &flags,1) == NULL) {
6058 36678         ReREFCNT_dec(rx);
6059 36678         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6060           }
6061           /* XXXX To minimize changes to RE engine we always allocate
6062           3-units-long substrs field. */
6063 36812         Newx(r->substrs, 1, struct reg_substr_data);
6064 134         if (RExC_recurse_count) {
6065 0         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6066 0         SAVEFREEPV(RExC_recurse);
6067           }
6068            
6069           reStudy:
6070 140         r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6071 140         Zero(r->substrs, 1, struct reg_substr_data);
6072            
6073           #ifdef TRIE_STUDY_OPT
6074 140         if (!restudied) {
6075 134         StructCopy(&zero_scan_data, &data, scan_data_t);
6076 8465863         copyRExC_state = RExC_state;
6077           } else {
6078 8465735         U32 seen=RExC_seen;
6079 7956857         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6080          
6081 8465735         RExC_state = copyRExC_state;
6082 123524         if (seen & REG_TOP_LEVEL_BRANCHES)
6083 8465729         RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6084           else
6085 329634         RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6086 8465735         StructCopy(&zero_scan_data, &data, scan_data_t);
6087           }
6088           #else
6089           StructCopy(&zero_scan_data, &data, scan_data_t);
6090           #endif
6091            
6092           /* Dig out information for optimizations. */
6093 112956         r->extflags = RExC_flags; /* was pm_op */
6094           /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6095          
6096 8465869         if (UTF)
6097 126         SvUTF8_on(rx); /* Unicode in it? */
6098 8465869         ri->regstclass = NULL;
6099 1374         if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6100 1234         r->intflags |= PREGf_NAUGHTY;
6101 8465869         scan = ri->program + 1; /* First BRANCH. */
6102            
6103           /* testing for BRANCH here tells us whether there is "must appear"
6104           data in the pattern. If there is then we can use it for optimisations */
6105 398         if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6106           SSize_t fake;
6107           STRLEN longest_float_length, longest_fixed_length;
6108           struct regnode_charclass_class ch_class; /* pointed to by data */
6109           int stclass_flag;
6110 8465869         SSize_t last_close = 0; /* pointed to by data */
6111 5306         regnode *first= scan;
6112 8465869         regnode *first_next= regnext(first);
6113           /*
6114           * Skip introductions and multiplicators >= 1
6115           * so that we can extract the 'meat' of the pattern that must
6116           * match in the large if() sequence following.
6117           * NOTE that EXACT is NOT covered here, as it is normally
6118           * picked up by the optimiser separately.
6119           *
6120           * This is unfortunate as the optimiser isnt handling lookahead
6121           * properly currently.
6122           *
6123           */
6124 13643         while ((OP(first) == OPEN && (sawopen = 1)) ||
6125           /* An OR of *one* alternative - should not happen now. */
6126 8457205         (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6127           /* for now we can't handle lookbehind IFMATCH*/
6128 8466009         (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6129 8466009         (OP(first) == PLUS) ||
6130 8466009         (OP(first) == MINMOD) ||
6131           /* An {n,m} with n>0 */
6132 8466009         (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6133 39310         (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6134           {
6135           /*
6136           * the only op that could be a regnode is PLUS, all the rest
6137           * will be regnode_1 or regnode_2.
6138           *
6139           * (yves doesn't think this is true)
6140           */
6141 8426569         if (OP(first) == PLUS)
6142           sawplus = 1;
6143           else {
6144 17803         if (OP(first) == MINMOD)
6145           sawminmod = 1;
6146 8408774         first += regarglen[OP(first)];
6147           }
6148 36966         first = NEXTOPER(first);
6149 8371816         first_next= regnext(first);
6150           }
6151            
6152           /* Starting-point info. */
6153           again:
6154 67727         DEBUG_PEEP("first:",first,0);
6155           /* Ignore EXACT as we deal with it later. */
6156 8465869         if (PL_regkind[OP(first)] == EXACT) {
6157 8465847         if (OP(first) == EXACT)
6158           NOOP; /* Empty, get anchored substr later. */
6159           else
6160 4601         ri->regstclass = first;
6161           }
6162           #ifdef TRIE_STCLASS
6163 3324         else if (PL_regkind[OP(first)] == TRIE &&
6164 3302         ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6165 8465733         {
6166           regnode *trie_op;
6167           /* this can happen only on restudy */
6168 17830745         if ( OP(first) == TRIE ) {
6169 8648         struct regnode_1 *trieop = (struct regnode_1 *)
6170           PerlMemShared_calloc(1, sizeof(struct regnode_1));
6171 8648         StructCopy(first,trieop,struct regnode_1);
6172           trie_op=(regnode *)trieop;
6173           } else {
6174 8562         struct regnode_charclass *trieop = (struct regnode_charclass *)
6175           PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6176 82         StructCopy(first,trieop,struct regnode_charclass);
6177           trie_op=(regnode *)trieop;
6178           }
6179 10         OP(trie_op)+=2;
6180 80         make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6181 84         ri->regstclass = trie_op;
6182           }
6183           #endif
6184 58         else if (REGNODE_SIMPLE(OP(first)))
6185 38         ri->regstclass = first;
6186 32         else if (PL_regkind[OP(first)] == BOUND ||
6187           PL_regkind[OP(first)] == NBOUND)
6188 20         ri->regstclass = first;
6189 36         else if (PL_regkind[OP(first)] == BOL) {
6190 4319         r->extflags |= (OP(first) == MBOL
6191           ? RXf_ANCH_MBOL
6192 226         : (OP(first) == SBOL
6193           ? RXf_ANCH_SBOL
6194           : RXf_ANCH_BOL));
6195 226         first = NEXTOPER(first);
6196 56         goto again;
6197           }
6198 186         else if (OP(first) == GPOS) {
6199 170         r->extflags |= RXf_ANCH_GPOS;
6200 113         first = NEXTOPER(first);
6201 8562         goto again;
6202           }
6203 8594         else if ((!sawopen || !RExC_sawback) &&
6204 58         (OP(first) == STAR &&
6205 8562         PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6206 8562         !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6207           {
6208           /* turn .* into ^.* with an implied $*=1 */
6209           const int type =
6210 8562         (OP(NEXTOPER(first)) == REG_ANY)
6211           ? RXf_ANCH_MBOL
6212 8554         : RXf_ANCH_SBOL;
6213 8554         r->extflags |= type;
6214 10146         r->intflags |= PREGf_IMPLICIT;
6215 8782         first = NEXTOPER(first);
6216 8782         goto again;
6217           }
6218 7394         if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6219 7254         && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6220           /* x+ must match at the 1st pos of run of x's */
6221 7254         r->intflags |= PREGf_SKIP;
6222            
6223           /* Scan is after the zeroth branch, first is atomic matcher. */
6224           #ifdef TRIE_STUDY_OPT
6225 7394         DEBUG_PARSE_r(
6226           if (!restudied)
6227           PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6228           (IV)(first - scan + 1))
6229           );
6230           #else
6231           DEBUG_PARSE_r(
6232           PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6233           (IV)(first - scan + 1))
6234           );
6235           #endif
6236            
6237            
6238           /*
6239           * If there's something expensive in the r.e., find the
6240           * longest literal string that must appear and make it the
6241           * regmust. Resolve ties in favor of later strings, since
6242           * the regstart check works with the beginning of the r.e.
6243           * and avoiding duplication strengthens checking. Not a
6244           * strong reason, but sufficient in the absence of others.
6245           * [Now we resolve ties in favor of the earlier string if
6246           * it happens that c_offset_min has been invalidated, since the
6247           * earlier string may buy us something the later one won't.]
6248           */
6249            
6250 1668         data.longest_fixed = newSVpvs("");
6251 144         data.longest_float = newSVpvs("");
6252 1732         data.last_found = newSVpvs("");
6253 208         data.longest = &(data.longest_fixed);
6254 1504         ENTER_with_name("study_chunk");
6255 4441         SAVEFREESV(data.longest_fixed);
6256 180         SAVEFREESV(data.longest_float);
6257 180         SAVEFREESV(data.last_found);
6258 180         first = scan;
6259 156         if (!ri->regstclass) {
6260 156         cl_init(pRExC_state, &ch_class);
6261 156         data.start_class = &ch_class;
6262           stclass_flag = SCF_DO_STCLASS_AND;
6263           } else /* XXXX Check for BOUND? */
6264           stclass_flag = 0;
6265 160         data.last_closep = &last_close;
6266          
6267 170         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6268           &data, -1, NULL, NULL,
6269           SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6270           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6271           0);
6272            
6273            
6274 196         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6275            
6276            
6277 190         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6278 182         && data.last_start_min == 0 && data.last_end > 0
6279 163         && !RExC_seen_zerolen
6280 218         && !(RExC_seen & REG_SEEN_VERBARG)
6281 218         && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6282 218         r->extflags |= RXf_CHECK_ALL;
6283 451         scan_commit(pRExC_state, &data,&minlen,0);
6284            
6285 254         longest_float_length = CHR_SVLEN(data.longest_float);
6286            
6287 368         if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6288 244         && data.offset_fixed == data.offset_float_min
6289 290         && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6290 838         && S_setup_longest (aTHX_ pRExC_state,
6291           data.longest_float,
6292 246         &(r->float_utf8),
6293 246         &(r->float_substr),
6294 246         &(r->float_end_shift),
6295           data.lookbehind_float,
6296           data.offset_float_min,
6297           data.minlen_float,
6298           longest_float_length,
6299 254         cBOOL(data.flags & SF_FL_BEFORE_EOL),
6300 250         cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6301           {
6302 24         r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6303 24         r->float_max_offset = data.offset_float_max;
6304 24         if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6305 16         r->float_max_offset -= data.lookbehind_float;
6306 12         SvREFCNT_inc_simple_void_NN(data.longest_float);
6307           }
6308           else {
6309 138         r->float_substr = r->float_utf8 = NULL;
6310           longest_float_length = 0;
6311           }
6312            
6313 142         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6314            
6315 812         if (S_setup_longest (aTHX_ pRExC_state,
6316           data.longest_fixed,
6317 142         &(r->anchored_utf8),
6318 142         &(r->anchored_substr),
6319 144         &(r->anchored_end_shift),
6320 150         data.lookbehind_fixed,
6321           data.offset_fixed,
6322           data.minlen_fixed,
6323           longest_fixed_length,
6324 150         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6325 150         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6326           {
6327 132         r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6328 132         SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6329           }
6330           else {
6331 78         r->anchored_substr = r->anchored_utf8 = NULL;
6332           longest_fixed_length = 0;
6333           }
6334 170         LEAVE_with_name("study_chunk");
6335            
6336 170         if (ri->regstclass
6337 56         && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6338 54         ri->regstclass = NULL;
6339            
6340 174         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6341 42         && stclass_flag
6342 34         && ! TEST_SSC_EOS(data.start_class)
6343 24         && !cl_is_anything(data.start_class))
6344           {
6345 36         const U32 n = add_data(pRExC_state, 1, "f");
6346 31         OP(data.start_class) = ANYOF_SYNTHETIC;
6347            
6348 16         Newx(RExC_rxi->data->data[n], 1,
6349           struct regnode_charclass_class);
6350 46427967         StructCopy(data.start_class,
6351           (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6352           struct regnode_charclass_class);
6353 69095412         ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6354 46427967         r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6355 46427607         DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6356           regprop(r, sv, (regnode*)data.start_class);
6357           PerlIO_printf(Perl_debug_log,
6358           "synthetic stclass \"%s\".\n",
6359           SvPVX_const(sv));});
6360           }
6361            
6362           /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6363 542         if (longest_fixed_length > longest_float_length) {
6364 522         r->check_end_shift = r->anchored_end_shift;
6365 186         r->check_substr = r->anchored_substr;
6366 186         r->check_utf8 = r->anchored_utf8;
6367 522         r->check_offset_min = r->check_offset_max = r->anchored_offset;
6368 46428033         if (r->extflags & RXf_ANCH_SINGLE)
6369 46427847         r->extflags |= RXf_NOSCAN;
6370           }
6371           else {
6372 46427867         r->check_end_shift = r->float_end_shift;
6373 388         r->check_substr = r->float_substr;
6374 388         r->check_utf8 = r->float_utf8;
6375 388         r->check_offset_min = r->float_min_offset;
6376 46427499         r->check_offset_max = r->float_max_offset;
6377           }
6378 1042         if ((r->check_substr || r->check_utf8) ) {
6379 1026         r->extflags |= RXf_USE_INTUIT;
6380 1026         if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6381 69087923         r->extflags |= RXf_INTUIT_TAIL;
6382           }
6383           /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6384           if ( (STRLEN)minlen < longest_float_length )
6385           minlen= longest_float_length;
6386           if ( (STRLEN)minlen < longest_fixed_length )
6387           minlen= longest_fixed_length;
6388           */
6389           }
6390           else {
6391           /* Several toplevels. Best we can is to set minlen. */
6392           SSize_t fake;
6393           struct regnode_charclass_class ch_class;
6394 65417605         SSize_t last_close = 0;
6395            
6396 42756255         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6397            
6398 42755587         scan = ri->program + 1;
6399 42755587         cl_init(pRExC_state, &ch_class);
6400 42756863         data.start_class = &ch_class;
6401 42756863         data.last_closep = &last_close;
6402            
6403          
6404 42756863         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6405           &data, -1, NULL, NULL,
6406           SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6407           |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6408           0);
6409          
6410 42756863         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6411            
6412 42756863         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6413 63589333         = r->float_substr = r->float_utf8 = NULL;
6414            
6415 58         if (! TEST_SSC_EOS(data.start_class)
6416 16         && !cl_is_anything(data.start_class))
6417           {
6418 42756805         const U32 n = add_data(pRExC_state, 1, "f");
6419 238428         OP(data.start_class) = ANYOF_SYNTHETIC;
6420            
6421 42518435         Newx(RExC_rxi->data->data[n], 1,
6422           struct regnode_charclass_class);
6423 42756863         StructCopy(data.start_class,
6424           (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6425           struct regnode_charclass_class);
6426 2298         ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6427 0         r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6428 0         DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6429           regprop(r, sv, (regnode*)data.start_class);
6430           PerlIO_printf(Perl_debug_log,
6431           "synthetic stclass \"%s\".\n",
6432           SvPVX_const(sv));});
6433           }
6434           }
6435            
6436           /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6437           the "real" pattern. */
6438 134         DEBUG_OPTIMISE_r({
6439           PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6440           (IV)minlen, (IV)r->minlen);
6441           });
6442 134         r->minlenret = minlen;
6443 134         if (r->minlen < minlen)
6444 124         r->minlen = minlen;
6445          
6446 134         if (RExC_seen & REG_SEEN_GPOS)
6447 0         r->extflags |= RXf_GPOS_SEEN;
6448 134         if (RExC_seen & REG_SEEN_LOOKBEHIND)
6449 0         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6450 2432         if (pRExC_state->num_code_blocks)
6451 3671172         r->extflags |= RXf_EVAL_SEEN;
6452 25595668         if (RExC_seen & REG_SEEN_CANY)
6453 294292         r->extflags |= RXf_CANY_SEEN;
6454 294426         if (RExC_seen & REG_SEEN_VERBARG)
6455           {
6456 0         r->intflags |= PREGf_VERBARG_SEEN;
6457 294292         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6458           }
6459 134         if (RExC_seen & REG_SEEN_CUTGROUP)
6460 0         r->intflags |= PREGf_CUTGROUP_SEEN;
6461 134         if (pm_flags & PMf_USE_RE_EVAL)
6462 24         r->intflags |= PREGf_USE_RE_EVAL;
6463 134         if (RExC_paren_names)
6464 0         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6465           else
6466 134         RXp_PAREN_NAMES(r) = NULL;
6467            
6468           {
6469 134         regnode *first = ri->program + 1;
6470 134         U8 fop = OP(first);
6471           regnode *next = NEXTOPER(first);
6472 134         U8 nop = OP(next);
6473            
6474 134         if (PL_regkind[fop] == NOTHING && nop == END)
6475 2         r->extflags |= RXf_NULL;
6476 132         else if (PL_regkind[fop] == BOL && nop == END)
6477 0         r->extflags |= RXf_START_ONLY;
6478 132         else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6479 0         r->extflags |= RXf_WHITE;
6480 132         else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6481 0         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6482            
6483           }
6484           #ifdef DEBUGGING
6485 134         if (RExC_paren_names) {
6486 0         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6487 0         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6488           } else
6489           #endif
6490 134         ri->name_list_idx = 0;
6491            
6492 134         if (RExC_recurse_count) {
6493 0         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6494 0         const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6495 0         ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6496           }
6497           }
6498 134         Newxz(r->offs, RExC_npar, regexp_paren_pair);
6499           /* assume we don't need to swap parens around before we match */
6500            
6501 134         DEBUG_DUMP_r({
6502           PerlIO_printf(Perl_debug_log,"Final program:\n");
6503           regdump(r);
6504           });
6505           #ifdef RE_TRACK_PATTERN_OFFSETS
6506 134         DEBUG_OFFSETS_r(if (ri->u.offsets) {
6507           const STRLEN len = ri->u.offsets[0];
6508           STRLEN i;
6509           GET_RE_DEBUG_FLAGS_DECL;
6510           PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6511           for (i = 1; i <= len; i++) {
6512           if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6513           PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6514           (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6515           }
6516           PerlIO_printf(Perl_debug_log, "\n");
6517           });
6518           #endif
6519            
6520           #ifdef USE_ITHREADS
6521           /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6522           * by setting the regexp SV to readonly-only instead. If the
6523           * pattern's been recompiled, the USEDness should remain. */
6524           if (old_re && SvREADONLY(old_re))
6525           SvREADONLY_on(rx);
6526           #endif
6527           return rx;
6528           }
6529            
6530            
6531           SV*
6532 0         Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6533           const U32 flags)
6534           {
6535 0         PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6536            
6537           PERL_UNUSED_ARG(value);
6538            
6539 0         if (flags & RXapif_FETCH) {
6540 2086952         return reg_named_buff_fetch(rx, key, flags);
6541 2086952         } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6542 39214         Perl_croak_no_modify();
6543           return NULL;
6544 39214         } else if (flags & RXapif_EXISTS) {
6545 39214         return reg_named_buff_exists(rx, key, flags)
6546           ? &PL_sv_yes
6547 38946         : &PL_sv_no;
6548 49992         } else if (flags & RXapif_REGNAMES) {
6549 64429         return reg_named_buff_all(rx, flags);
6550 91290         } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6551 91290         return reg_named_buff_scalar(rx, flags);
6552           } else {
6553 268         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6554           return NULL;
6555           }
6556           }
6557            
6558           SV*
6559 268         Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6560           const U32 flags)
6561           {
6562 38946         PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6563           PERL_UNUSED_ARG(lastkey);
6564            
6565 20306         if (flags & RXapif_FIRSTKEY)
6566 20306         return reg_named_buff_firstkey(rx, flags);
6567 20306         else if (flags & RXapif_NEXTKEY)
6568 8070         return reg_named_buff_nextkey(rx, flags);
6569           else {
6570 8070         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6571           return NULL;
6572           }
6573           }
6574            
6575           SV*
6576 0         Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6577           const U32 flags)
6578           {
6579           AV *retarray = NULL;
6580           SV *ret;
6581 8070         struct regexp *const rx = ReANY(r);
6582            
6583 7958         PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6584            
6585 8070         if (flags & RXapif_ALL)
6586 7958         retarray=newAV();
6587            
6588 8070         if (rx && RXp_PAREN_NAMES(rx)) {
6589 112         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6590 19417         if (he_str) {
6591           IV i;
6592 14672152         SV* sv_dat=HeVAL(he_str);
6593 14672152         I32 *nums=(I32*)SvPVX(sv_dat);
6594 14672152         for ( i=0; i
6595 10708566         if ((I32)(rx->nparens) >= nums[i]
6596 52907588         && rx->offs[nums[i]].start != -1
6597 24765565         && rx->offs[nums[i]].end != -1)
6598           {
6599 19295878         ret = newSVpvs("");
6600 3349824         CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6601 987586         if (!retarray)
6602           return ret;
6603           } else {
6604 11342222         if (retarray)
6605 0         ret = newSVsv(&PL_sv_undef);
6606           }
6607 11342222         if (retarray)
6608 14958468         av_push(retarray, ret);
6609           }
6610 14958468         if (retarray)
6611 14958468         return newRV_noinc(MUTABLE_SV(retarray));
6612           }
6613           }
6614           return NULL;
6615           }
6616            
6617           bool
6618 14958468         Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6619           const U32 flags)
6620           {
6621 14958468         struct regexp *const rx = ReANY(r);
6622            
6623 14958468         PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6624            
6625 14958468         if (rx && RXp_PAREN_NAMES(rx)) {
6626 707352         if (flags & RXapif_ALL) {
6627 707352         return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6628           } else {
6629 707352         SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6630 707352         if (sv) {
6631 707352         SvREFCNT_dec_NN(sv);
6632 707352         return TRUE;
6633           } else {
6634           return FALSE;
6635           }
6636           }
6637           } else {
6638           return FALSE;
6639           }
6640           }
6641            
6642           SV*
6643 0         Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6644           {
6645 707352         struct regexp *const rx = ReANY(r);
6646            
6647 707352         PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6648            
6649 707352         if ( rx && RXp_PAREN_NAMES(rx) ) {
6650 707352         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6651            
6652 707352         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6653           } else {
6654           return FALSE;
6655           }
6656           }
6657            
6658           SV*
6659 1830678         Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6660           {
6661 1830678         struct regexp *const rx = ReANY(r);
6662 1830678         GET_RE_DEBUG_FLAGS_DECL;
6663            
6664 5783079         PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6665            
6666 11342132         if (rx && RXp_PAREN_NAMES(rx)) {
6667 11342132         HV *hv = RXp_PAREN_NAMES(rx);
6668           HE *temphe;
6669 6896898         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6670           IV i;
6671           IV parno = 0;
6672 6896898         SV* sv_dat = HeVAL(temphe);
6673 4445234         I32 *nums = (I32*)SvPVX(sv_dat);
6674 4445234         for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6675 4445234         if ((I32)(rx->lastparen) >= nums[i] &&
6676 0         rx->offs[nums[i]].start != -1 &&
6677 0         rx->offs[nums[i]].end != -1)
6678           {
6679 0         parno = nums[i];
6680 4445234         break;
6681           }
6682           }
6683 4445234         if (parno || flags & RXapif_ALL) {
6684 687818         return newSVhek(HeKEY_hek(temphe));
6685           }
6686           }
6687           }
6688           return NULL;
6689           }
6690            
6691           SV*
6692 687818         Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6693           {
6694           SV *ret;
6695           AV *av;
6696           SSize_t length;
6697 0         struct regexp *const rx = ReANY(r);
6698            
6699 11342132         PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6700            
6701 10654314         if (rx && RXp_PAREN_NAMES(rx)) {
6702 10654314         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6703 1830588         return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6704 1830588         } else if (flags & RXapif_ONE) {
6705 8823726         ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6706 10654314         av = MUTABLE_AV(SvRV(ret));
6707 10654314         length = av_len(av);
6708 10653628         SvREFCNT_dec_NN(ret);
6709 686         return newSViv(length + 1);
6710           } else {
6711 3349824         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6712           return NULL;
6713           }
6714           }
6715           return &PL_sv_undef;
6716           }
6717            
6718           SV*
6719 3349824         Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6720           {
6721 3349824         struct regexp *const rx = ReANY(r);
6722 3349824         AV *av = newAV();
6723            
6724 3349824         PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6725            
6726 2217452         if (rx && RXp_PAREN_NAMES(rx)) {
6727 2129444         HV *hv= RXp_PAREN_NAMES(rx);
6728           HE *temphe;
6729 227640         (void)hv_iterinit(hv);
6730 248976         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6731           IV i;
6732           IV parno = 0;
6733 1132372         SV* sv_dat = HeVAL(temphe);
6734 52594         I32 *nums = (I32*)SvPVX(sv_dat);
6735 1330892         for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6736 1060580         if ((I32)(rx->lastparen) >= nums[i] &&
6737 1060580         rx->offs[nums[i]].start != -1 &&
6738 807960         rx->offs[nums[i]].end != -1)
6739           {
6740 280234         parno = nums[i];
6741 1815029         break;
6742           }
6743           }
6744 20230         if (parno || flags & RXapif_ALL) {
6745 20230         av_push(av, newSVhek(HeKEY_hek(temphe)));
6746           }
6747           }
6748           }
6749            
6750 20230         return newRV_noinc(MUTABLE_SV(av));
6751           }
6752            
6753           void
6754 20230         Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6755           SV * const sv)
6756           {
6757 65613         struct regexp *const rx = ReANY(r);
6758           char *s = NULL;
6759           SSize_t i = 0;
6760           SSize_t s1, t1;
6761           I32 n = paren;
6762            
6763 40362         PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6764          
6765 27610         if ( n == RX_BUFF_IDX_CARET_PREMATCH
6766 27610         || n == RX_BUFF_IDX_CARET_FULLMATCH
6767 25822         || n == RX_BUFF_IDX_CARET_POSTMATCH
6768           )
6769           {
6770 25822         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6771 4602         if (!keepcopy) {
6772           /* on something like
6773           * $r = qr/.../;
6774           * /$qr/p;
6775           * the KEEPCOPY is set on the PMOP rather than the regex */
6776 33972         if (PL_curpm && r == PM_GETRE(PL_curpm))
6777 33972         keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6778           }
6779 976008         if (!keepcopy)
6780           goto ret_undef;
6781           }
6782            
6783 942036         if (!rx->subbeg)
6784           goto ret_undef;
6785            
6786 942036         if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6787           /* no need to distinguish between them any more */
6788           n = RX_BUFF_IDX_FULLMATCH;
6789            
6790 33972         if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6791 3798         && rx->offs[0].start != -1)
6792           {
6793           /* $`, ${^PREMATCH} */
6794 4         i = rx->offs[0].start;
6795 4         s = rx->subbeg;
6796           }
6797           else
6798 30174         if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6799 3633929         && rx->offs[0].end != -1)
6800           {
6801           /* $', ${^POSTMATCH} */
6802 6983254         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6803 285628         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6804           }
6805           else
6806 285520         if ( 0 <= n && n <= (I32)rx->nparens &&
6807 916         (s1 = rx->offs[n].start) != -1 &&
6808 285628         (t1 = rx->offs[n].end) != -1)
6809           {
6810           /* $&, ${^MATCH}, $1 ... */
6811 285520         i = t1 - s1;
6812 285520         s = rx->subbeg + s1 - rx->suboffset;
6813           } else {
6814           goto ret_undef;
6815           }
6816            
6817 95060         assert(s >= rx->subbeg);
6818 3348301         assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
6819 52         if (i >= 0) {
6820           #if NO_TAINT_SUPPORT
6821           sv_setpvn(sv, s, i);
6822           #else
6823 0         const int oldtainted = TAINT_get;
6824 52         TAINT_NOT;
6825 0         sv_setpvn(sv, s, i);
6826 0         TAINT_set(oldtainted);
6827           #endif
6828 0         if ( (rx->extflags & RXf_CANY_SEEN)
6829 0         ? (RXp_MATCH_UTF8(rx)
6830 52         && (!i || is_utf8_string((U8*)s, i)))
6831 0         : (RXp_MATCH_UTF8(rx)) )
6832           {
6833 3348249         SvUTF8_on(sv);
6834           }
6835           else
6836 2         SvUTF8_off(sv);
6837 2         if (TAINTING_get) {
6838 2         if (RXp_MATCH_TAINTED(rx)) {
6839 0         if (SvTYPE(sv) >= SVt_PVMG) {
6840 0         MAGIC* const mg = SvMAGIC(sv);
6841           MAGIC* mgt;
6842 3348249         TAINT;
6843 3348249         SvMAGIC_set(sv, mg->mg_moremagic);
6844 17489700         SvTAINT(sv);
6845 12500446         if ((mgt = SvMAGIC(sv))) {
6846 8162610         mg->mg_moremagic = mgt;
6847 1404116         SvMAGIC_set(sv, mg);
6848           }
6849           } else {
6850 4898992         TAINT;
6851 4898992         SvTAINT(sv);
6852           }
6853           } else
6854 7601454         SvTAINTED_off(sv);
6855           }
6856           } else {
6857           ret_undef:
6858 7601454         sv_setsv(sv,&PL_sv_undef);
6859 12500446         return;
6860           }
6861           }
6862            
6863           void
6864 6504669         Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6865           SV const * const value)
6866           {
6867 5327917         PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6868            
6869           PERL_UNUSED_ARG(rx);
6870           PERL_UNUSED_ARG(paren);
6871           PERL_UNUSED_ARG(value);
6872            
6873 6504669         if (!PL_localizing)
6874 5995777         Perl_croak_no_modify();
6875 5995777         }
6876            
6877           I32
6878 8749986         Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6879           const I32 paren)
6880           {
6881 3348249         struct regexp *const rx = ReANY(r);
6882           I32 i;
6883           I32 s1, t1;
6884            
6885 2861965         PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6886            
6887 508394         if ( paren == RX_BUFF_IDX_CARET_PREMATCH
6888 3348249         || paren == RX_BUFF_IDX_CARET_FULLMATCH
6889 3347751         || paren == RX_BUFF_IDX_CARET_POSTMATCH
6890           )
6891           {
6892 3348249         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6893 3348249         if (!keepcopy) {
6894           /* on something like
6895           * $r = qr/.../;
6896           * /$qr/p;
6897           * the KEEPCOPY is set on the PMOP rather than the regex */
6898 3348249         if (PL_curpm && r == PM_GETRE(PL_curpm))
6899 3347751         keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6900           }
6901 3264485         if (!keepcopy)
6902           goto warn_undef;
6903           }
6904            
6905           /* Some of this code was originally in C in F */
6906 83266         switch (paren) {
6907           case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6908           case RX_BUFF_IDX_PREMATCH: /* $` */
6909 83266         if (rx->offs[0].start != -1) {
6910 3348249         i = rx->offs[0].start;
6911 3348239         if (i > 0) {
6912           s1 = 0;
6913           t1 = i;
6914           goto getlen;
6915           }
6916           }
6917           return 0;
6918            
6919           case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6920           case RX_BUFF_IDX_POSTMATCH: /* $' */
6921 3348249         if (rx->offs[0].end != -1) {
6922 3492349         i = rx->sublen - rx->offs[0].end;
6923 4439715         if (i > 0) {
6924 4439715         s1 = rx->offs[0].end;
6925 8879430         t1 = rx->sublen;
6926 12710         goto getlen;
6927           }
6928           }
6929           return 0;
6930            
6931           default: /* $& / ${^MATCH}, $1, $2, ... */
6932 12710         if (paren <= (I32)rx->nparens &&
6933 4         (s1 = rx->offs[paren].start) != -1 &&
6934 0         (t1 = rx->offs[paren].end) != -1)
6935           {
6936 4         i = t1 - s1;
6937 0         goto getlen;
6938           } else {
6939           warn_undef:
6940 0         if (ckWARN(WARN_UNINITIALIZED))
6941 0         report_uninit((const SV *)sv);
6942           return 0;
6943           }
6944           }
6945           getlen:
6946 0         if (i > 0 && RXp_MATCH_UTF8(rx)) {
6947 0         const char * const s = rx->subbeg - rx->suboffset + s1;
6948           const U8 *ep;
6949           STRLEN el;
6950            
6951 0         i = t1 - s1;
6952 4427005         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6953 3397265         i = el;
6954           }
6955 3274319         return i;
6956           }
6957            
6958           SV*
6959 3274419         Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6960           {
6961 123046         PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6962           PERL_UNUSED_ARG(rx);
6963           if (0)
6964           return NULL;
6965           else
6966 123046         return newSVpvs("Regexp");
6967           }
6968            
6969           /* Scans the name of a named buffer from the pattern.
6970           * If flags is REG_RSN_RETURN_NULL returns null.
6971           * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6972           * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6973           * to the parsed name as looked up in the RExC_paren_names hash.
6974           * If there is an error throws a vFAIL().. type exception.
6975           */
6976            
6977           #define REG_RSN_RETURN_NULL 0
6978           #define REG_RSN_RETURN_NAME 1
6979           #define REG_RSN_RETURN_DATA 2
6980            
6981           STATIC SV*
6982 4427005         S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6983           {
6984 4427005         char *name_start = RExC_parse;
6985            
6986 75944686         PERL_ARGS_ASSERT_REG_SCAN_NAME;
6987            
6988 69347018         if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6989           /* skip IDFIRST by using do...while */
6990 6042912         if (UTF)
6991           do {
6992 1481680         RExC_parse += UTF8SKIP(RExC_parse);
6993 63899850         } while (isWORDCHAR_utf8((U8*)RExC_parse));
6994           else
6995           do {
6996 63899850         RExC_parse++;
6997 5447168         } while (isWORDCHAR(*RExC_parse));
6998           } else {
6999 5447168         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7000 69347018         vFAIL("Group name must start with a non-digit word character");
7001           }
7002 35958424         if ( flags ) {
7003 35958424         SV* sv_name
7004 2579080         = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7005           SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7006 33388594         if ( flags == REG_RSN_RETURN_NAME)
7007           return sv_name;
7008 2192632         else if (flags==REG_RSN_RETURN_DATA) {
7009           HE *he_str = NULL;
7010           SV *sv_dat = NULL;
7011 51496145         if ( ! sv_name ) /* should not happen*/
7012 4427005         Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7013 3773243         if (RExC_paren_names)
7014 1467660         he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7015 4427005         if ( he_str )
7016 1467660         sv_dat = HeVAL(he_str);
7017 4427005         if ( ! sv_dat )
7018 2434830         vFAIL("Reference to nonexistent named group");
7019           return sv_dat;
7020           }
7021           else {
7022 4427005         Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7023           (unsigned long) flags);
7024           }
7025           assert(0); /* NOT REACHED */
7026           }
7027           return NULL;
7028           }
7029            
7030           #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7031           int rem=(int)(RExC_end - RExC_parse); \
7032           int cut; \
7033           int num; \
7034           int iscut=0; \
7035           if (rem>10) { \
7036           rem=10; \
7037           iscut=1; \
7038           } \
7039           cut=10-rem; \
7040           if (RExC_lastparse!=RExC_parse) \
7041           PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7042           rem, RExC_parse, \
7043           cut + 4, \
7044           iscut ? "..." : "<" \
7045           ); \
7046           else \
7047           PerlIO_printf(Perl_debug_log,"%16s",""); \
7048           \
7049           if (SIZE_ONLY) \
7050           num = RExC_size + 1; \
7051           else \
7052           num=REG_NODE_NUM(RExC_emit); \
7053           if (RExC_lastnum!=num) \
7054           PerlIO_printf(Perl_debug_log,"|%4d",num); \
7055           else \
7056           PerlIO_printf(Perl_debug_log,"|%4s",""); \
7057           PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7058           (int)((depth*2)), "", \
7059           (funcname) \
7060           ); \
7061           RExC_lastnum=num; \
7062           RExC_lastparse=RExC_parse; \
7063           })
7064            
7065            
7066            
7067           #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7068           DEBUG_PARSE_MSG((funcname)); \
7069           PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7070           })
7071           #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7072           DEBUG_PARSE_MSG((funcname)); \
7073           PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7074           })
7075            
7076           /* This section of code defines the inversion list object and its methods. The
7077           * interfaces are highly subject to change, so as much as possible is static to
7078           * this file. An inversion list is here implemented as a malloc'd C UV array
7079           * as an SVt_INVLIST scalar.
7080           *
7081           * An inversion list for Unicode is an array of code points, sorted by ordinal
7082           * number. The zeroth element is the first code point in the list. The 1th
7083           * element is the first element beyond that not in the list. In other words,
7084           * the first range is
7085           * invlist[0]..(invlist[1]-1)
7086           * The other ranges follow. Thus every element whose index is divisible by two
7087           * marks the beginning of a range that is in the list, and every element not
7088           * divisible by two marks the beginning of a range not in the list. A single
7089           * element inversion list that contains the single code point N generally
7090           * consists of two elements
7091           * invlist[0] == N
7092           * invlist[1] == N+1
7093           * (The exception is when N is the highest representable value on the
7094           * machine, in which case the list containing just it would be a single
7095           * element, itself. By extension, if the last range in the list extends to
7096           * infinity, then the first element of that range will be in the inversion list
7097           * at a position that is divisible by two, and is the final element in the
7098           * list.)
7099           * Taking the complement (inverting) an inversion list is quite simple, if the
7100           * first element is 0, remove it; otherwise add a 0 element at the beginning.
7101           * This implementation reserves an element at the beginning of each inversion
7102           * list to always contain 0; there is an additional flag in the header which
7103           * indicates if the list begins at the 0, or is offset to begin at the next
7104           * element.
7105           *
7106           * More about inversion lists can be found in "Unicode Demystified"
7107           * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7108           * More will be coming when functionality is added later.
7109           *
7110           * The inversion list data structure is currently implemented as an SV pointing
7111           * to an array of UVs that the SV thinks are bytes. This allows us to have an
7112           * array of UV whose memory management is automatically handled by the existing
7113           * facilities for SV's.
7114           *
7115           * Some of the methods should always be private to the implementation, and some
7116           * should eventually be made public */
7117            
7118           /* The header definitions are in F */
7119            
7120           PERL_STATIC_INLINE UV*
7121           S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7122           {
7123           /* Returns a pointer to the first element in the inversion list's array.
7124           * This is called upon initialization of an inversion list. Where the
7125           * array begins depends on whether the list has the code point U+0000 in it
7126           * or not. The other parameter tells it whether the code that follows this
7127           * call is about to put a 0 in the inversion list or not. The first
7128           * element is either the element reserved for 0, if TRUE, or the element
7129           * after it, if FALSE */
7130            
7131           bool* offset = get_invlist_offset_addr(invlist);
7132           UV* zero_addr = (UV *) SvPVX(invlist);
7133            
7134           PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7135            
7136           /* Must be empty */
7137           assert(! _invlist_len(invlist));
7138            
7139           *zero_addr = 0;
7140            
7141           /* 1^1 = 0; 1^0 = 1 */
7142           *offset = 1 ^ will_have_0;
7143           return zero_addr + *offset;
7144           }
7145            
7146           PERL_STATIC_INLINE UV*
7147 1467664         S_invlist_array(pTHX_ SV* const invlist)
7148           {
7149           /* Returns the pointer to the inversion list's array. Every time the
7150           * length changes, this needs to be called in case malloc or realloc moved
7151           * it */
7152            
7153 813902         PERL_ARGS_ASSERT_INVLIST_ARRAY;
7154            
7155           /* Must not be empty. If these fail, you probably didn't check for
7156           * being non-zero before trying to get the array */
7157 653766         assert(_invlist_len(invlist));
7158            
7159           /* The very first element always contains zero, The array begins either
7160           * there, or if the inversion list is offset, at the element after it.
7161           * The offset header field determines which; it contains 0 or 1 to indicate
7162           * how much additionally to add */
7163 653766         assert(0 == *(SvPVX(invlist)));
7164 4427009         return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7165           }
7166            
7167           PERL_STATIC_INLINE void
7168           S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7169           {
7170           /* Sets the current number of elements stored in the inversion list.
7171           * Updates SvCUR correspondingly */
7172            
7173 3359727         PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7174            
7175 4427005         assert(SvTYPE(invlist) == SVt_INVLIST);
7176            
7177 4433360         SvCUR_set(invlist,
7178           (len == 0)
7179           ? 0
7180           : TO_INTERNAL_SIZE(len + offset));
7181 11342008         assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7182           }
7183            
7184           PERL_STATIC_INLINE IV*
7185           S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7186           {
7187           /* Return the address of the IV that is reserved to hold the cached index
7188           * */
7189            
7190 11342008         PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7191            
7192 3747235         assert(SvTYPE(invlist) == SVt_INVLIST);
7193            
7194 7594773         return &(((XINVLIST*) SvANY(invlist))->prev_index);
7195           }
7196            
7197           PERL_STATIC_INLINE IV
7198           S_invlist_previous_index(pTHX_ SV* const invlist)
7199           {
7200           /* Returns cached index of previous search */
7201            
7202           PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7203            
7204           return *get_invlist_previous_index_addr(invlist);
7205           }
7206            
7207           PERL_STATIC_INLINE void
7208           S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7209           {
7210           /* Caches for later retrieval */
7211            
7212 11342008         PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7213            
7214           assert(index == 0 || index < (int) _invlist_len(invlist));
7215            
7216 7589037         *get_invlist_previous_index_addr(invlist) = index;
7217           }
7218            
7219           PERL_STATIC_INLINE UV
7220           S_invlist_max(pTHX_ SV* const invlist)
7221           {
7222           /* Returns the maximum number of elements storable in the inversion list's
7223           * array, without having to realloc() */
7224            
7225           PERL_ARGS_ASSERT_INVLIST_MAX;
7226            
7227           assert(SvTYPE(invlist) == SVt_INVLIST);
7228            
7229           /* Assumes worst case, in which the 0 element is not counted in the
7230           * inversion list, so subtracts 1 for that */
7231           return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7232           ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7233           : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7234           }
7235            
7236           #ifndef PERL_IN_XSUB_RE
7237           SV*
7238           Perl__new_invlist(pTHX_ IV initial_size)
7239           {
7240            
7241           /* Return a pointer to a newly constructed inversion list, with enough
7242           * space to store 'initial_size' elements. If that number is negative, a
7243           * system default is used instead */
7244            
7245           SV* new_list;
7246            
7247           if (initial_size < 0) {
7248           initial_size = 10;
7249           }
7250            
7251           /* Allocate the initial space */
7252           new_list = newSV_type(SVt_INVLIST);
7253            
7254           /* First 1 is in case the zero element isn't in the list; second 1 is for
7255           * trailing NUL */
7256           SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7257           invlist_set_len(new_list, 0, 0);
7258            
7259           /* Force iterinit() to be used to get iteration to work */
7260           *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7261            
7262           *get_invlist_previous_index_addr(new_list) = 0;
7263            
7264           return new_list;
7265           }
7266           #endif
7267            
7268           STATIC SV*
7269 11327036         S__new_invlist_C_array(pTHX_ const UV* const list)
7270           {
7271           /* Return a pointer to a newly constructed inversion list, initialized to
7272           * point to , which has to be in the exact correct inversion list
7273           * form, including internal fields. Thus this is a dangerous routine that
7274           * should not be used in the wrong hands. The passed in 'list' contains
7275           * several header fields at the beginning that are not part of the
7276           * inversion list body proper */
7277            
7278 8198205         const STRLEN length = (STRLEN) list[0];
7279 8198205         const UV version_id = list[1];
7280 3143803         const bool offset = cBOOL(list[2]);
7281           #define HEADER_LENGTH 3
7282           /* If any of the above changes in any way, you must change HEADER_LENGTH
7283           * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7284           * perl -E 'say int(rand 2**31-1)'
7285           */
7286           #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7287           data structure type, so that one being
7288           passed in can be validated to be an
7289           inversion list of the correct vintage.
7290           */
7291            
7292 3143803         SV* invlist = newSV_type(SVt_INVLIST);
7293            
7294 3143803         PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7295            
7296 3143803         if (version_id != INVLIST_VERSION_ID) {
7297 7306085         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7298           }
7299            
7300           /* The generated array passed in includes header elements that aren't part
7301           * of the list proper, so start it just after them */
7302 1641696         SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7303            
7304 1641696         SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7305           shouldn't touch it */
7306            
7307 980910         *(get_invlist_offset_addr(invlist)) = offset;
7308            
7309           /* The 'length' passed to us is the physical number of elements in the
7310           * inversion list. But if there is an offset the logical number is one
7311           * less than that */
7312 980910         invlist_set_len(invlist, length - offset, offset);
7313            
7314           invlist_set_previous_index(invlist, 0);
7315            
7316           /* Initialize the iteration pointer. */
7317 124         invlist_iterfinish(invlist);
7318            
7319 980972         return invlist;
7320           }
7321            
7322           STATIC void
7323           S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7324           {
7325           /* Grow the maximum size of an inversion list */
7326            
7327           PERL_ARGS_ASSERT_INVLIST_EXTEND;
7328            
7329           assert(SvTYPE(invlist) == SVt_INVLIST);
7330            
7331           /* Add one to account for the zero element at the beginning which may not
7332           * be counted by the calling parameters */
7333           SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7334           }
7335            
7336           PERL_STATIC_INLINE void
7337           S_invlist_trim(pTHX_ SV* const invlist)
7338           {
7339           PERL_ARGS_ASSERT_INVLIST_TRIM;
7340            
7341           assert(SvTYPE(invlist) == SVt_INVLIST);
7342            
7343           /* Change the length of the inversion list to how many entries it currently
7344           * has */
7345           SvPV_shrink_to_cur((SV *) invlist);
7346           }
7347            
7348           #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7349            
7350           STATIC void
7351           S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7352           {
7353           /* Subject to change or removal. Append the range from 'start' to 'end' at
7354           * the end of the inversion list. The range must be above any existing
7355           * ones. */
7356            
7357           UV* array;
7358           UV max = invlist_max(invlist);
7359           UV len = _invlist_len(invlist);
7360           bool offset;
7361            
7362           PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7363            
7364           if (len == 0) { /* Empty lists must be initialized */
7365           offset = start != 0;
7366           array = _invlist_array_init(invlist, ! offset);
7367           }
7368           else {
7369           /* Here, the existing list is non-empty. The current max entry in the
7370           * list is generally the first value not in the set, except when the
7371           * set extends to the end of permissible values, in which case it is
7372           * the first entry in that final set, and so this call is an attempt to
7373           * append out-of-order */
7374            
7375           UV final_element = len - 1;
7376           array = invlist_array(invlist);
7377           if (array[final_element] > start
7378           || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7379           {
7380           Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7381           array[final_element], start,
7382           ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7383           }
7384            
7385           /* Here, it is a legal append. If the new range begins with the first
7386           * value not in the set, it is extending the set, so the new first
7387           * value not in the set is one greater than the newly extended range.
7388           * */
7389           offset = *get_invlist_offset_addr(invlist);
7390           if (array[final_element] == start) {
7391           if (end != UV_MAX) {
7392           array[final_element] = end + 1;
7393           }
7394           else {
7395           /* But if the end is the maximum representable on the machine,
7396           * just let the range that this would extend to have no end */
7397           invlist_set_len(invlist, len - 1, offset);
7398           }
7399           return;
7400           }
7401           }
7402            
7403           /* Here the new range doesn't extend any existing set. Add it */
7404            
7405           len += 2; /* Includes an element each for the start and end of range */
7406            
7407           /* If wll overflow the existing space, extend, which may cause the array to
7408           * be moved */
7409           if (max < len) {
7410           invlist_extend(invlist, len);
7411            
7412           /* Have to set len here to avoid assert failure in invlist_array() */
7413           invlist_set_len(invlist, len, offset);
7414            
7415           array = invlist_array(invlist);
7416           }
7417           else {
7418           invlist_set_len(invlist, len, offset);
7419           }
7420            
7421           /* The next item on the list starts the range, the one after that is
7422           * one past the new range. */
7423           array[len - 2] = start;
7424           if (end != UV_MAX) {
7425           array[len - 1] = end + 1;
7426           }
7427           else {
7428           /* But if the end is the maximum representable on the machine, just let
7429           * the range have no end */
7430           invlist_set_len(invlist, len - 1, offset);
7431           }
7432           }
7433            
7434           #ifndef PERL_IN_XSUB_RE
7435            
7436           IV
7437           Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7438           {
7439           /* Searches the inversion list for the entry that contains the input code
7440           * point . If is not in the list, -1 is returned. Otherwise, the
7441           * return value is the index into the list's array of the range that
7442           * contains */
7443            
7444           IV low = 0;
7445           IV mid;
7446           IV high = _invlist_len(invlist);
7447           const IV highest_element = high - 1;
7448           const UV* array;
7449            
7450           PERL_ARGS_ASSERT__INVLIST_SEARCH;
7451            
7452           /* If list is empty, return failure. */
7453           if (high == 0) {
7454           return -1;
7455           }
7456            
7457           /* (We can't get the array unless we know the list is non-empty) */
7458           array = invlist_array(invlist);
7459            
7460           mid = invlist_previous_index(invlist);
7461           assert(mid >=0 && mid <= highest_element);
7462            
7463           /* contains the cache of the result of the previous call to this
7464           * function (0 the first time). See if this call is for the same result,
7465           * or if it is for mid-1. This is under the theory that calls to this
7466           * function will often be for related code points that are near each other.
7467           * And benchmarks show that caching gives better results. We also test
7468           * here if the code point is within the bounds of the list. These tests
7469           * replace others that would have had to be made anyway to make sure that
7470           * the array bounds were not exceeded, and these give us extra information
7471           * at the same time */
7472           if (cp >= array[mid]) {
7473           if (cp >= array[highest_element]) {
7474           return highest_element;
7475           }
7476            
7477           /* Here, array[mid] <= cp < array[highest_element]. This means that
7478           * the final element is not the answer, so can exclude it; it also
7479           * means that is not the final element, so can refer to 'mid + 1'
7480           * safely */
7481           if (cp < array[mid + 1]) {
7482           return mid;
7483           }
7484           high--;
7485           low = mid + 1;
7486           }
7487           else { /* cp < aray[mid] */
7488           if (cp < array[0]) { /* Fail if outside the array */
7489           return -1;
7490           }
7491           high = mid;
7492           if (cp >= array[mid - 1]) {
7493           goto found_entry;
7494           }
7495           }
7496            
7497           /* Binary search. What we are looking for is such that
7498           * array[i] <= cp < array[i+1]
7499           * The loop below converges on the i+1. Note that there may not be an
7500           * (i+1)th element in the array, and things work nonetheless */
7501           while (low < high) {
7502           mid = (low + high) / 2;
7503           assert(mid <= highest_element);
7504           if (array[mid] <= cp) { /* cp >= array[mid] */
7505           low = mid + 1;
7506            
7507           /* We could do this extra test to exit the loop early.
7508           if (cp < array[low]) {
7509           return mid;
7510           }
7511           */
7512           }
7513           else { /* cp < array[mid] */
7514           high = mid;
7515           }
7516           }
7517            
7518           found_entry:
7519           high--;
7520           invlist_set_previous_index(invlist, high);
7521           return high;
7522           }
7523            
7524           void
7525           Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7526           {
7527           /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7528           * but is used when the swash has an inversion list. This makes this much
7529           * faster, as it uses a binary search instead of a linear one. This is
7530           * intimately tied to that function, and perhaps should be in utf8.c,
7531           * except it is intimately tied to inversion lists as well. It assumes
7532           * that is all 0's on input */
7533            
7534           UV current = start;
7535           const IV len = _invlist_len(invlist);
7536           IV i;
7537           const UV * array;
7538            
7539           PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7540            
7541           if (len == 0) { /* Empty inversion list */
7542           return;
7543           }
7544            
7545           array = invlist_array(invlist);
7546            
7547           /* Find which element it is */
7548           i = _invlist_search(invlist, start);
7549            
7550           /* We populate from to */
7551           while (current < end) {
7552           UV upper;
7553            
7554           /* The inversion list gives the results for every possible code point
7555           * after the first one in the list. Only those ranges whose index is
7556           * even are ones that the inversion list matches. For the odd ones,
7557           * and if the initial code point is not in the list, we have to skip
7558           * forward to the next element */
7559           if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7560           i++;
7561           if (i >= len) { /* Finished if beyond the end of the array */
7562           return;
7563           }
7564           current = array[i];
7565           if (current >= end) { /* Finished if beyond the end of what we
7566           are populating */
7567           if (LIKELY(end < UV_MAX)) {
7568           return;
7569           }
7570            
7571           /* We get here when the upper bound is the maximum
7572           * representable on the machine, and we are looking for just
7573           * that code point. Have to special case it */
7574           i = len;
7575           goto join_end_of_list;
7576           }
7577           }
7578           assert(current >= start);
7579            
7580           /* The current range ends one below the next one, except don't go past
7581           * */
7582           i++;
7583           upper = (i < len && array[i] < end) ? array[i] : end;
7584            
7585           /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7586           * for each code point in it */
7587           for (; current < upper; current++) {
7588           const STRLEN offset = (STRLEN)(current - start);
7589           swatch[offset >> 3] |= 1 << (offset & 7);
7590           }
7591            
7592           join_end_of_list:
7593            
7594           /* Quit if at the end of the list */
7595           if (i >= len) {
7596            
7597           /* But first, have to deal with the highest possible code point on
7598           * the platform. The previous code assumes that is one
7599           * beyond where we want to populate, but that is impossible at the
7600           * platform's infinity, so have to handle it specially */
7601           if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7602           {
7603           const STRLEN offset = (STRLEN)(end - start);
7604           swatch[offset >> 3] |= 1 << (offset & 7);
7605           }
7606           return;
7607           }
7608            
7609           /* Advance to the next range, which will be for code points not in the
7610           * inversion list */
7611           current = array[i];
7612           }
7613            
7614           return;
7615           }
7616            
7617           void
7618           Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
7619           {
7620           /* Take the union of two inversion lists and point to it. *output
7621           * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7622           * the reference count to that list will be decremented. The first list,
7623           * , may be NULL, in which case a copy of the second list is returned.
7624           * If is TRUE, the union is taken of the complement
7625           * (inversion) of instead of b itself.
7626           *
7627           * The basis for this comes from "Unicode Demystified" Chapter 13 by
7628           * Richard Gillam, published by Addison-Wesley, and explained at some
7629           * length there. The preface says to incorporate its examples into your
7630           * code at your own risk.
7631           *
7632           * The algorithm is like a merge sort.
7633           *
7634           * XXX A potential performance improvement is to keep track as we go along
7635           * if only one of the inputs contributes to the result, meaning the other
7636           * is a subset of that one. In that case, we can skip the final copy and
7637           * return the larger of the input lists, but then outside code might need
7638           * to keep track of whether to free the input list or not */
7639            
7640           const UV* array_a; /* a's array */
7641           const UV* array_b;
7642           UV len_a; /* length of a's array */
7643           UV len_b;
7644            
7645           SV* u; /* the resulting union */
7646           UV* array_u;
7647           UV len_u;
7648            
7649           UV i_a = 0; /* current index into a's array */
7650           UV i_b = 0;
7651           UV i_u = 0;
7652            
7653           /* running count, as explained in the algorithm source book; items are
7654           * stopped accumulating and are output when the count changes to/from 0.
7655           * The count is incremented when we start a range that's in the set, and
7656           * decremented when we start a range that's not in the set. So its range
7657           * is 0 to 2. Only when the count is zero is something not in the set.
7658           */
7659           UV count = 0;
7660            
7661           PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7662           assert(a != b);
7663            
7664           /* If either one is empty, the union is the other one */
7665           if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7666           if (*output == a) {
7667           if (a != NULL) {
7668           SvREFCNT_dec_NN(a);
7669           }
7670           }
7671           if (*output != b) {
7672           *output = invlist_clone(b);
7673           if (complement_b) {
7674           _invlist_invert(*output);
7675           }
7676           } /* else *output already = b; */
7677           return;
7678           }
7679           else if ((len_b = _invlist_len(b)) == 0) {
7680           if (*output == b) {
7681           SvREFCNT_dec_NN(b);
7682           }
7683            
7684           /* The complement of an empty list is a list that has everything in it,
7685           * so the union with includes everything too */
7686           if (complement_b) {
7687           if (a == *output) {
7688           SvREFCNT_dec_NN(a);
7689           }
7690           *output = _new_invlist(1);
7691           _append_range_to_invlist(*output, 0, UV_MAX);
7692           }
7693           else if (*output != a) {
7694           *output = invlist_clone(a);
7695           }
7696           /* else *output already = a; */
7697           return;
7698           }
7699            
7700           /* Here both lists exist and are non-empty */
7701           array_a = invlist_array(a);
7702           array_b = invlist_array(b);
7703            
7704           /* If are to take the union of 'a' with the complement of b, set it
7705           * up so are looking at b's complement. */
7706           if (complement_b) {
7707            
7708           /* To complement, we invert: if the first element is 0, remove it. To
7709           * do this, we just pretend the array starts one later */
7710           if (array_b[0] == 0) {
7711           array_b++;
7712           len_b--;
7713           }
7714           else {
7715            
7716           /* But if the first element is not zero, we pretend the list starts
7717           * at the 0 that is always stored immediately before the array. */
7718           array_b--;
7719           len_b++;
7720           }
7721           }
7722            
7723           /* Size the union for the worst case: that the sets are completely
7724           * disjoint */
7725           u = _new_invlist(len_a + len_b);
7726            
7727           /* Will contain U+0000 if either component does */
7728           array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7729           || (len_b > 0 && array_b[0] == 0));
7730            
7731           /* Go through each list item by item, stopping when exhausted one of
7732           * them */
7733           while (i_a < len_a && i_b < len_b) {
7734           UV cp; /* The element to potentially add to the union's array */
7735           bool cp_in_set; /* is it in the the input list's set or not */
7736            
7737           /* We need to take one or the other of the two inputs for the union.
7738           * Since we are merging two sorted lists, we take the smaller of the
7739           * next items. In case of a tie, we take the one that is in its set
7740           * first. If we took one not in the set first, it would decrement the
7741           * count, possibly to 0 which would cause it to be output as ending the
7742           * range, and the next time through we would take the same number, and
7743           * output it again as beginning the next range. By doing it the
7744           * opposite way, there is no possibility that the count will be
7745           * momentarily decremented to 0, and thus the two adjoining ranges will
7746           * be seamlessly merged. (In a tie and both are in the set or both not
7747           * in the set, it doesn't matter which we take first.) */
7748           if (array_a[i_a] < array_b[i_b]
7749           || (array_a[i_a] == array_b[i_b]
7750           && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7751           {
7752           cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7753           cp= array_a[i_a++];
7754           }
7755           else {
7756           cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7757           cp = array_b[i_b++];
7758           }
7759            
7760           /* Here, have chosen which of the two inputs to look at. Only output
7761           * if the running count changes to/from 0, which marks the
7762           * beginning/end of a range in that's in the set */
7763           if (cp_in_set) {
7764           if (count == 0) {
7765           array_u[i_u++] = cp;
7766           }
7767           count++;
7768           }
7769           else {
7770           count--;
7771           if (count == 0) {
7772           array_u[i_u++] = cp;
7773           }
7774           }
7775           }
7776            
7777           /* Here, we are finished going through at least one of the lists, which
7778           * means there is something remaining in at most one. We check if the list
7779           * that hasn't been exhausted is positioned such that we are in the middle
7780           * of a range in its set or not. (i_a and i_b point to the element beyond
7781           * the one we care about.) If in the set, we decrement 'count'; if 0, there
7782           * is potentially more to output.
7783           * There are four cases:
7784           * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7785           * in the union is entirely from the non-exhausted set.
7786           * 2) Both were in their sets, count is 2. Nothing further should
7787           * be output, as everything that remains will be in the exhausted
7788           * list's set, hence in the union; decrementing to 1 but not 0 insures
7789           * that
7790           * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7791           * Nothing further should be output because the union includes
7792           * everything from the exhausted set. Not decrementing ensures that.
7793           * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7794           * decrementing to 0 insures that we look at the remainder of the
7795           * non-exhausted set */
7796           if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7797           || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7798           {
7799           count--;
7800           }
7801            
7802           /* The final length is what we've output so far, plus what else is about to
7803           * be output. (If 'count' is non-zero, then the input list we exhausted
7804           * has everything remaining up to the machine's limit in its set, and hence
7805           * in the union, so there will be no further output. */
7806           len_u = i_u;
7807           if (count == 0) {
7808           /* At most one of the subexpressions will be non-zero */
7809           len_u += (len_a - i_a) + (len_b - i_b);
7810           }
7811            
7812           /* Set result to final length, which can change the pointer to array_u, so
7813           * re-find it */
7814           if (len_u != _invlist_len(u)) {
7815           invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
7816           invlist_trim(u);
7817           array_u = invlist_array(u);
7818           }
7819            
7820           /* When 'count' is 0, the list that was exhausted (if one was shorter than
7821           * the other) ended with everything above it not in its set. That means
7822           * that the remaining part of the union is precisely the same as the
7823           * non-exhausted list, so can just copy it unchanged. (If both list were
7824           * exhausted at the same time, then the operations below will be both 0.)
7825           */
7826           if (count == 0) {
7827           IV copy_count; /* At most one will have a non-zero copy count */
7828           if ((copy_count = len_a - i_a) > 0) {
7829           Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7830           }
7831           else if ((copy_count = len_b - i_b) > 0) {
7832           Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7833           }
7834           }
7835            
7836           /* We may be removing a reference to one of the inputs */
7837           if (a == *output || b == *output) {
7838           assert(! invlist_is_iterating(*output));
7839           SvREFCNT_dec_NN(*output);
7840           }
7841            
7842           *output = u;
7843           return;
7844           }
7845            
7846           void
7847           Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
7848           {
7849           /* Take the intersection of two inversion lists and point to it. *i
7850           * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7851           * the reference count to that list will be decremented.
7852           * If is TRUE, the result will be the intersection of
7853           * and the complement (or inversion) of instead of directly.
7854           *
7855           * The basis for this comes from "Unicode Demystified" Chapter 13 by
7856           * Richard Gillam, published by Addison-Wesley, and explained at some
7857           * length there. The preface says to incorporate its examples into your
7858           * code at your own risk. In fact, it had bugs
7859           *
7860           * The algorithm is like a merge sort, and is essentially the same as the
7861           * union above
7862           */
7863            
7864           const UV* array_a; /* a's array */
7865           const UV* array_b;
7866           UV len_a; /* length of a's array */
7867           UV len_b;
7868            
7869           SV* r; /* the resulting intersection */
7870           UV* array_r;
7871           UV len_r;
7872            
7873           UV i_a = 0; /* current index into a's array */
7874           UV i_b = 0;
7875           UV i_r = 0;
7876            
7877           /* running count, as explained in the algorithm source book; items are
7878           * stopped accumulating and are output when the count changes to/from 2.
7879           * The count is incremented when we start a range that's in the set, and
7880           * decremented when we start a range that's not in the set. So its range
7881           * is 0 to 2. Only when the count is 2 is something in the intersection.
7882           */
7883           UV count = 0;
7884            
7885           PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7886           assert(a != b);
7887            
7888           /* Special case if either one is empty */
7889           len_a = (a == NULL) ? 0 : _invlist_len(a);
7890           if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7891            
7892           if (len_a != 0 && complement_b) {
7893            
7894           /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7895           * be empty. Here, also we are using 'b's complement, which hence
7896           * must be every possible code point. Thus the intersection is
7897           * simply 'a'. */
7898           if (*i != a) {
7899           if (*i == b) {
7900           SvREFCNT_dec_NN(b);
7901           }
7902            
7903           *i = invlist_clone(a);
7904           }
7905           /* else *i is already 'a' */
7906           return;
7907           }
7908            
7909           /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7910           * intersection must be empty */
7911           if (*i == a) {
7912           SvREFCNT_dec_NN(a);
7913           }
7914           else if (*i == b) {
7915           SvREFCNT_dec_NN(b);
7916           }
7917           *i = _new_invlist(0);
7918           return;
7919           }
7920            
7921           /* Here both lists exist and are non-empty */
7922           array_a = invlist_array(a);
7923           array_b = invlist_array(b);
7924            
7925           /* If are to take the intersection of 'a' with the complement of b, set it
7926           * up so are looking at b's complement. */
7927           if (complement_b) {
7928            
7929           /* To complement, we invert: if the first element is 0, remove it. To
7930           * do this, we just pretend the array starts one later */
7931           if (array_b[0] == 0) {
7932           array_b++;
7933           len_b--;
7934           }
7935           else {
7936            
7937           /* But if the first element is not zero, we pretend the list starts
7938           * at the 0 that is always stored immediately before the array. */
7939           array_b--;
7940           len_b++;
7941           }
7942           }
7943            
7944           /* Size the intersection for the worst case: that the intersection ends up
7945           * fragmenting everything to be completely disjoint */
7946           r= _new_invlist(len_a + len_b);
7947            
7948           /* Will contain U+0000 iff both components do */
7949           array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7950           && len_b > 0 && array_b[0] == 0);
7951            
7952           /* Go through each list item by item, stopping when exhausted one of
7953           * them */
7954           while (i_a < len_a && i_b < len_b) {
7955           UV cp; /* The element to potentially add to the intersection's
7956           array */
7957           bool cp_in_set; /* Is it in the input list's set or not */
7958            
7959           /* We need to take one or the other of the two inputs for the
7960           * intersection. Since we are merging two sorted lists, we take the
7961           * smaller of the next items. In case of a tie, we take the one that
7962           * is not in its set first (a difference from the union algorithm). If
7963           * we took one in the set first, it would increment the count, possibly
7964           * to 2 which would cause it to be output as starting a range in the
7965           * intersection, and the next time through we would take that same
7966           * number, and output it again as ending the set. By doing it the
7967           * opposite of this, there is no possibility that the count will be
7968           * momentarily incremented to 2. (In a tie and both are in the set or
7969           * both not in the set, it doesn't matter which we take first.) */
7970           if (array_a[i_a] < array_b[i_b]
7971           || (array_a[i_a] == array_b[i_b]
7972           && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7973           {
7974           cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7975           cp= array_a[i_a++];
7976           }
7977           else {
7978           cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7979           cp= array_b[i_b++];
7980           }
7981            
7982           /* Here, have chosen which of the two inputs to look at. Only output
7983           * if the running count changes to/from 2, which marks the
7984           * beginning/end of a range that's in the intersection */
7985           if (cp_in_set) {
7986           count++;
7987           if (count == 2) {
7988           array_r[i_r++] = cp;
7989           }
7990           }
7991           else {
7992           if (count == 2) {
7993           array_r[i_r++] = cp;
7994           }
7995           count--;
7996           }
7997           }
7998            
7999           /* Here, we are finished going through at least one of the lists, which
8000           * means there is something remaining in at most one. We check if the list
8001           * that has been exhausted is positioned such that we are in the middle
8002           * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8003           * the ones we care about.) There are four cases:
8004           * 1) Both weren't in their sets, count is 0, and remains 0. There's
8005           * nothing left in the intersection.
8006           * 2) Both were in their sets, count is 2 and perhaps is incremented to
8007           * above 2. What should be output is exactly that which is in the
8008           * non-exhausted set, as everything it has is also in the intersection
8009           * set, and everything it doesn't have can't be in the intersection
8010           * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8011           * gets incremented to 2. Like the previous case, the intersection is
8012           * everything that remains in the non-exhausted set.
8013           * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8014           * remains 1. And the intersection has nothing more. */
8015           if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8016           || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8017           {
8018           count++;
8019           }
8020            
8021           /* The final length is what we've output so far plus what else is in the
8022           * intersection. At most one of the subexpressions below will be non-zero */
8023           len_r = i_r;
8024           if (count >= 2) {
8025           len_r += (len_a - i_a) + (len_b - i_b);
8026           }
8027            
8028           /* Set result to final length, which can change the pointer to array_r, so
8029           * re-find it */
8030           if (len_r != _invlist_len(r)) {
8031           invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8032           invlist_trim(r);
8033           array_r = invlist_array(r);
8034           }
8035            
8036           /* Finish outputting any remaining */
8037           if (count >= 2) { /* At most one will have a non-zero copy count */
8038           IV copy_count;
8039           if ((copy_count = len_a - i_a) > 0) {
8040           Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8041           }
8042           else if ((copy_count = len_b - i_b) > 0) {
8043           Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8044           }
8045           }
8046            
8047           /* We may be removing a reference to one of the inputs */
8048           if (a == *i || b == *i) {
8049           assert(! invlist_is_iterating(*i));
8050           SvREFCNT_dec_NN(*i);
8051           }
8052            
8053           *i = r;
8054           return;
8055           }
8056            
8057           SV*
8058           Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8059           {
8060           /* Add the range from 'start' to 'end' inclusive to the inversion list's
8061           * set. A pointer to the inversion list is returned. This may actually be
8062           * a new list, in which case the passed in one has been destroyed. The
8063           * passed in inversion list can be NULL, in which case a new one is created
8064           * with just the one range in it */
8065            
8066           SV* range_invlist;
8067           UV len;
8068            
8069           if (invlist == NULL) {
8070           invlist = _new_invlist(2);
8071           len = 0;
8072           }
8073           else {
8074           len = _invlist_len(invlist);
8075           }
8076            
8077           /* If comes after the final entry actually in the list, can just append it
8078           * to the end, */
8079           if (len == 0
8080           || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8081           && start >= invlist_array(invlist)[len - 1]))
8082           {
8083           _append_range_to_invlist(invlist, start, end);
8084           return invlist;
8085           }
8086            
8087           /* Here, can't just append things, create and return a new inversion list
8088           * which is the union of this range and the existing inversion list */
8089           range_invlist = _new_invlist(2);
8090           _append_range_to_invlist(range_invlist, start, end);
8091            
8092           _invlist_union(invlist, range_invlist, &invlist);
8093            
8094           /* The temporary can be freed */
8095           SvREFCNT_dec_NN(range_invlist);
8096            
8097           return invlist;
8098           }
8099            
8100           #endif
8101            
8102           PERL_STATIC_INLINE SV*
8103 980786         S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8104 94         return _add_range_to_invlist(invlist, cp, cp);
8105           }
8106            
8107           #ifndef PERL_IN_XSUB_RE
8108           void
8109           Perl__invlist_invert(pTHX_ SV* const invlist)
8110           {
8111           /* Complement the input inversion list. This adds a 0 if the list didn't
8112           * have a zero; removes it otherwise. As described above, the data
8113           * structure is set up so that this is very efficient */
8114            
8115           PERL_ARGS_ASSERT__INVLIST_INVERT;
8116            
8117           assert(! invlist_is_iterating(invlist));
8118            
8119           /* The inverse of matching nothing is matching everything */
8120           if (_invlist_len(invlist) == 0) {
8121           _append_range_to_invlist(invlist, 0, UV_MAX);
8122           return;
8123           }
8124            
8125           *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8126           }
8127            
8128           void
8129           Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8130           {
8131           /* Complement the input inversion list (which must be a Unicode property,
8132           * all of which don't match above the Unicode maximum code point.) And
8133           * Perl has chosen to not have the inversion match above that either. This
8134           * adds a 0x110000 if the list didn't end with it, and removes it if it did
8135           */
8136            
8137           UV len;
8138           UV* array;
8139            
8140           PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8141            
8142           _invlist_invert(invlist);
8143            
8144           len = _invlist_len(invlist);
8145            
8146           if (len != 0) { /* If empty do nothing */
8147           array = invlist_array(invlist);
8148           if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8149           /* Add 0x110000. First, grow if necessary */
8150           len++;
8151           if (invlist_max(invlist) < len) {
8152           invlist_extend(invlist, len);
8153           array = invlist_array(invlist);
8154           }
8155           invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8156           array[len - 1] = PERL_UNICODE_MAX + 1;
8157           }
8158           else { /* Remove the 0x110000 */
8159           invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8160           }
8161           }
8162            
8163           return;
8164           }
8165           #endif
8166            
8167           PERL_STATIC_INLINE SV*
8168           S_invlist_clone(pTHX_ SV* const invlist)
8169           {
8170            
8171           /* Return a new inversion list that is a copy of the input one, which is
8172           * unchanged */
8173            
8174           /* Need to allocate extra space to accommodate Perl's addition of a
8175           * trailing NUL to SvPV's, since it thinks they are always strings */
8176           SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8177           STRLEN physical_length = SvCUR(invlist);
8178           bool offset = *(get_invlist_offset_addr(invlist));
8179            
8180           PERL_ARGS_ASSERT_INVLIST_CLONE;
8181            
8182           *(get_invlist_offset_addr(new_invlist)) = offset;
8183           invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8184           Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8185            
8186           return new_invlist;
8187           }
8188            
8189           PERL_STATIC_INLINE STRLEN*
8190 108         S_get_invlist_iter_addr(pTHX_ SV* invlist)
8191           {
8192           /* Return the address of the UV that contains the current iteration
8193           * position */
8194            
8195 108         PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8196            
8197 108         assert(SvTYPE(invlist) == SVt_INVLIST);
8198            
8199 104         return &(((XINVLIST*) SvANY(invlist))->iterator);
8200           }
8201            
8202           PERL_STATIC_INLINE void
8203 94         S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8204           {
8205 94         PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8206            
8207 94         *get_invlist_iter_addr(invlist) = 0;
8208 94         }
8209            
8210           PERL_STATIC_INLINE void
8211 8         S_invlist_iterfinish(pTHX_ SV* invlist)
8212           {
8213           /* Terminate iterator for invlist. This is to catch development errors.
8214           * Any iteration that is interrupted before completed should call this
8215           * function. Functions that add code points anywhere else but to the end
8216           * of an inversion list assert that they are not in the middle of an
8217           * iteration. If they were, the addition would make the iteration
8218           * problematical: if the iteration hadn't reached the place where things
8219           * were being added, it would be ok */
8220            
8221 98         PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8222            
8223 285528         *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8224 285528         }
8225            
8226           STATIC bool
8227 285530         S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8228           {
8229           /* An C call on must be used to set this up.
8230           * This call sets in <*start> and <*end>, the next range in .
8231           * Returns if successful and the next call will return the next
8232           * range; if was already at the end of the list. If the latter,
8233           * <*start> and <*end> are unchanged, and the next call to this function
8234           * will start over at the beginning of the list */
8235            
8236 285530         STRLEN* pos = get_invlist_iter_addr(invlist);
8237 285530         UV len = _invlist_len(invlist);
8238           UV *array;
8239            
8240 285530         PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8241            
8242 285530         if (*pos >= len) {
8243 285526         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8244 47151425         return FALSE;
8245           }
8246            
8247 5770047         array = invlist_array(invlist);
8248            
8249 7121754         *start = array[(*pos)++];
8250            
8251 17722335         if (*pos >= len) {
8252 17722331         *end = UV_MAX;
8253           }
8254           else {
8255 3510147         *end = array[(*pos)++] - 1;
8256           }
8257            
8258           return TRUE;
8259           }
8260            
8261           PERL_STATIC_INLINE bool
8262           S_invlist_is_iterating(pTHX_ SV* const invlist)
8263           {
8264           PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8265            
8266           return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8267           }
8268            
8269           PERL_STATIC_INLINE UV
8270 3510143         S_invlist_highest(pTHX_ SV* const invlist)
8271           {
8272           /* Returns the highest code point that matches an inversion list. This API
8273           * has an ambiguity, as it returns 0 under either the highest is actually
8274           * 0, or if the list is empty. If this distinction matters to you, check
8275           * for emptiness before calling this function */
8276            
8277 14212188         UV len = _invlist_len(invlist);
8278           UV *array;
8279            
8280 14212188         PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8281            
8282 868314         if (len == 0) {
8283           return 0;
8284           }
8285            
8286 15577922         array = invlist_array(invlist);
8287            
8288           /* The last element in the array in the inversion list always starts a
8289           * range that goes to infinity. That range may be for code points that are
8290           * matched in the inversion list, or it may be for ones that aren't
8291           * matched. In the latter case, the highest code point in the set is one
8292           * less than the beginning of this range; otherwise it is the final element
8293           * of this range: infinity */
8294 0         return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8295           ? UV_MAX
8296 931956         : array[len - 1] - 1;
8297           }
8298            
8299           #ifndef PERL_IN_XSUB_RE
8300           SV *
8301           Perl__invlist_contents(pTHX_ SV* const invlist)
8302           {
8303           /* Get the contents of an inversion list into a string SV so that they can
8304           * be printed out. It uses the format traditionally done for debug tracing
8305           */
8306            
8307           UV start, end;
8308           SV* output = newSVpvs("\n");
8309            
8310           PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8311            
8312           assert(! invlist_is_iterating(invlist));
8313            
8314           invlist_iterinit(invlist);
8315           while (invlist_iternext(invlist, &start, &end)) {
8316           if (end == UV_MAX) {
8317           Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8318           }
8319           else if (end != start) {
8320           Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8321           start, end);
8322           }
8323           else {
8324           Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8325           }
8326           }
8327            
8328           return output;
8329           }
8330           #endif
8331            
8332           #ifndef PERL_IN_XSUB_RE
8333           void
8334           Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8335           {
8336           /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
8337           * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
8338           * the string 'indent'. The output looks like this:
8339           [0] 0x000A .. 0x000D
8340           [2] 0x0085
8341           [4] 0x2028 .. 0x2029
8342           [6] 0x3104 .. INFINITY
8343           * This means that the first range of code points matched by the list are
8344           * 0xA through 0xD; the second range contains only the single code point
8345           * 0x85, etc. An inversion list is an array of UVs. Two array elements
8346           * are used to define each range (except if the final range extends to
8347           * infinity, only a single element is needed). The array index of the
8348           * first element for the corresponding range is given in brackets. */
8349            
8350           UV start, end;
8351           STRLEN count = 0;
8352            
8353           PERL_ARGS_ASSERT__INVLIST_DUMP;
8354            
8355           if (invlist_is_iterating(invlist)) {
8356           Perl_dump_indent(aTHX_ level, file,
8357           "%sCan't dump inversion list because is in middle of iterating\n",
8358           indent);
8359           return;
8360           }
8361            
8362           invlist_iterinit(invlist);
8363           while (invlist_iternext(invlist, &start, &end)) {
8364           if (end == UV_MAX) {
8365           Perl_dump_indent(aTHX_ level, file,
8366           "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8367           indent, (UV)count, start);
8368           }
8369           else if (end != start) {
8370           Perl_dump_indent(aTHX_ level, file,
8371           "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8372           indent, (UV)count, start, end);
8373           }
8374           else {
8375           Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8376           indent, (UV)count, start);
8377           }
8378           count += 2;
8379           }
8380           }
8381           #endif
8382            
8383           #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8384           bool
8385           S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8386           {
8387           /* Return a boolean as to if the two passed in inversion lists are
8388           * identical. The final argument, if TRUE, says to take the complement of
8389           * the second inversion list before doing the comparison */
8390            
8391           const UV* array_a = invlist_array(a);
8392           const UV* array_b = invlist_array(b);
8393           UV len_a = _invlist_len(a);
8394           UV len_b = _invlist_len(b);
8395            
8396           UV i = 0; /* current index into the arrays */
8397           bool retval = TRUE; /* Assume are identical until proven otherwise */
8398            
8399           PERL_ARGS_ASSERT__INVLISTEQ;
8400            
8401           /* If are to compare 'a' with the complement of b, set it
8402           * up so are looking at b's complement. */
8403           if (complement_b) {
8404            
8405           /* The complement of nothing is everything, so would have to have
8406           * just one element, starting at zero (ending at infinity) */
8407           if (len_b == 0) {
8408           return (len_a == 1 && array_a[0] == 0);
8409           }
8410           else if (array_b[0] == 0) {
8411            
8412           /* Otherwise, to complement, we invert. Here, the first element is
8413           * 0, just remove it. To do this, we just pretend the array starts
8414           * one later */
8415            
8416           array_b++;
8417           len_b--;
8418           }
8419           else {
8420            
8421           /* But if the first element is not zero, we pretend the list starts
8422           * at the 0 that is always stored immediately before the array. */
8423           array_b--;
8424           len_b++;
8425           }
8426           }
8427            
8428           /* Make sure that the lengths are the same, as well as the final element
8429           * before looping through the remainder. (Thus we test the length, final,
8430           * and first elements right off the bat) */
8431           if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8432           retval = FALSE;
8433           }
8434           else for (i = 0; i < len_a - 1; i++) {
8435           if (array_a[i] != array_b[i]) {
8436           retval = FALSE;
8437           break;
8438           }
8439           }
8440            
8441           return retval;
8442           }
8443           #endif
8444            
8445           #undef HEADER_LENGTH
8446           #undef TO_INTERNAL_SIZE
8447           #undef FROM_INTERNAL_SIZE
8448           #undef INVLIST_VERSION_ID
8449            
8450           /* End of inversion list object */
8451            
8452           STATIC void
8453 932044         S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8454           {
8455           /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8456           * constructs, and updates RExC_flags with them. On input, RExC_parse
8457           * should point to the first flag; it is updated on output to point to the
8458           * final ')' or ':'. There needs to be at least one flag, or this will
8459           * abort */
8460            
8461           /* for (?g), (?gc), and (?o) warnings; warning
8462           about (?c) will warn about (?g) -- japhy */
8463            
8464           #define WASTED_O 0x01
8465           #define WASTED_G 0x02
8466           #define WASTED_C 0x04
8467           #define WASTED_GC (WASTED_G|WASTED_C)
8468           I32 wastedflags = 0x00;
8469 1394422         U32 posflags = 0, negflags = 0;
8470           U32 *flagsp = &posflags;
8471           char has_charset_modifier = '\0';
8472           regex_charset cs;
8473           bool has_use_defaults = FALSE;
8474 932044         const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8475            
8476 88         PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8477            
8478           /* '^' as an initial flag sets certain defaults */
8479 88         if (UCHARAT(RExC_parse) == '^') {
8480 88         RExC_parse++;
8481           has_use_defaults = TRUE;
8482 88         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8483 88         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8484           ? REGEX_UNICODE_CHARSET
8485           : REGEX_DEPENDS_CHARSET);
8486           }
8487            
8488 88         cs = get_regex_charset(RExC_flags);
8489 88         if (cs == REGEX_DEPENDS_CHARSET
8490 88         && (RExC_utf8 || RExC_uni_semantics))
8491           {
8492           cs = REGEX_UNICODE_CHARSET;
8493           }
8494            
8495 96         while (*RExC_parse) {
8496           /* && strchr("iogcmsx", *RExC_parse) */
8497           /* (?g), (?gc) and (?o) are useless here
8498           and must be globally applied -- japhy */
8499 96         switch (*RExC_parse) {
8500            
8501           /* Code for the imsx flags */
8502 8         CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8503            
8504           case LOCALE_PAT_MOD:
8505 0         if (has_charset_modifier) {
8506           goto excess_modifier;
8507           }
8508 0         else if (flagsp == &negflags) {
8509           goto neg_modifier;
8510           }
8511           cs = REGEX_LOCALE_CHARSET;
8512           has_charset_modifier = LOCALE_PAT_MOD;
8513 0         RExC_contains_locale = 1;
8514 0         break;
8515           case UNICODE_PAT_MOD:
8516 0         if (has_charset_modifier) {
8517           goto excess_modifier;
8518           }
8519 0         else if (flagsp == &negflags) {
8520           goto neg_modifier;
8521           }
8522           cs = REGEX_UNICODE_CHARSET;
8523           has_charset_modifier = UNICODE_PAT_MOD;
8524           break;
8525           case ASCII_RESTRICT_PAT_MOD:
8526 0         if (flagsp == &negflags) {
8527           goto neg_modifier;
8528           }
8529 0         if (has_charset_modifier) {
8530 0         if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8531           goto excess_modifier;
8532           }
8533           /* Doubled modifier implies more restricted */
8534           cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8535           }
8536           else {
8537           cs = REGEX_ASCII_RESTRICTED_CHARSET;
8538           }
8539           has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8540           break;
8541           case DEPENDS_PAT_MOD:
8542 4759122         if (has_use_defaults) {
8543           goto fail_modifiers;
8544           }
8545 4759122         else if (flagsp == &negflags) {
8546           goto neg_modifier;
8547           }
8548 4759122         else if (has_charset_modifier) {
8549           goto excess_modifier;
8550           }
8551            
8552           /* The dual charset means unicode semantics if the
8553           * pattern (or target, not known until runtime) are
8554           * utf8, or something in the pattern indicates unicode
8555           * semantics */
8556 4759122         cs = (RExC_utf8 || RExC_uni_semantics)
8557           ? REGEX_UNICODE_CHARSET
8558           : REGEX_DEPENDS_CHARSET;
8559           has_charset_modifier = DEPENDS_PAT_MOD;
8560 1075156         break;
8561           excess_modifier:
8562 1075156         RExC_parse++;
8563 1075156         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8564 4759122         vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8565           }
8566 4759122         else if (has_charset_modifier == *(RExC_parse - 1)) {
8567 3591854         vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8568           }
8569           else {
8570 9979322         vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8571           }
8572           /*NOTREACHED*/
8573           neg_modifier:
8574 9979322         RExC_parse++;
8575 827054         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8576           /*NOTREACHED*/
8577           case ONCE_PAT_MOD: /* 'o' */
8578           case GLOBAL_PAT_MOD: /* 'g' */
8579 786790         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8580 786788         const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8581 786786         if (! (wastedflags & wflagbit) ) {
8582 786786         wastedflags |= wflagbit;
8583           /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8584 1275096         vWARN5(
8585           RExC_parse + 1,
8586           "Useless (%s%c) - %suse /%c modifier",
8587           flagsp == &negflags ? "?-" : "?",
8588           *RExC_parse,
8589           flagsp == &negflags ? "don't " : "",
8590           *RExC_parse
8591           );
8592           }
8593           }
8594           break;
8595            
8596           case CONTINUE_PAT_MOD: /* 'c' */
8597 1275092         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8598 1698322         if (! (wastedflags & WASTED_C) ) {
8599 1698322         wastedflags |= WASTED_GC;
8600           /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8601 593498         vWARN3(
8602           RExC_parse + 1,
8603           "Useless (%sc) - %suse /gc modifier",
8604           flagsp == &negflags ? "?-" : "?",
8605           flagsp == &negflags ? "don't " : ""
8606           );
8607           }
8608           }
8609           break;
8610           case KEEPCOPY_PAT_MOD: /* 'p' */
8611 511642         if (flagsp == &negflags) {
8612 511638         if (SIZE_ONLY)
8613 511638         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8614           } else {
8615 511638         *flagsp |= RXf_PMf_KEEPCOPY;
8616           }
8617           break;
8618           case '-':
8619           /* A flag is a default iff it is following a minus, so
8620           * if there is a minus, it means will be trying to
8621           * re-specify a default which is an error */
8622 511638         if (has_use_defaults || flagsp == &negflags) {
8623           goto fail_modifiers;
8624           }
8625           flagsp = &negflags;
8626           wastedflags = 0; /* reset so (?g-c) warns twice */
8627           break;
8628           case ':':
8629           case ')':
8630 98         RExC_flags |= posflags;
8631 98         RExC_flags &= ~negflags;
8632           set_regex_charset(&RExC_flags, cs);
8633 90         return;
8634           /*NOTREACHED*/
8635           default:
8636           fail_modifiers:
8637 8         RExC_parse++;
8638 2         vFAIL3("Sequence (%.*s...) not recognized",
8639           RExC_parse-seqstart, seqstart);
8640           /*NOTREACHED*/
8641           }
8642            
8643 14         ++RExC_parse;
8644           }
8645           }
8646            
8647           /*
8648           - reg - regular expression, i.e. main body or parenthesized thing
8649           *
8650           * Caller must absorb opening parenthesis.
8651           *
8652           * Combining parenthesis handling with the base level of regular expression
8653           * is a trifle forced, but the need to tie the tails of the branches to what
8654           * follows makes it hard to avoid.
8655           */
8656           #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8657           #ifdef DEBUGGING
8658           #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8659           #else
8660           #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8661           #endif
8662            
8663           /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8664           flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8665           needs to be restarted.
8666           Otherwise would only return NULL if regbranch() returns NULL, which
8667           cannot happen. */
8668           STATIC regnode *
8669 558         S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8670           /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8671           * 2 is like 1, but indicates that nextchar() has been called to advance
8672           * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
8673           * this flag alerts us to the need to check for that */
8674           {
8675           dVAR;
8676           regnode *ret; /* Will be the head of the group. */
8677           regnode *br;
8678           regnode *lastbr;
8679           regnode *ender = NULL;
8680           I32 parno = 0;
8681           I32 flags;
8682 558         U32 oregflags = RExC_flags;
8683           bool have_branch = 0;
8684           bool is_open = 0;
8685           I32 freeze_paren = 0;
8686           I32 after_freeze = 0;
8687            
8688 848         char * parse_start = RExC_parse; /* MJD */
8689 600         char * const oregcomp_parse = RExC_parse;
8690            
8691 600         GET_RE_DEBUG_FLAGS_DECL;
8692            
8693 596         PERL_ARGS_ASSERT_REG;
8694 596         DEBUG_PARSE("reg ");
8695            
8696 688         *flagp = 0; /* Tentatively. */
8697            
8698            
8699           /* Make an OPEN node, if parenthesized. */
8700 576         if (paren) {
8701            
8702           /* Under /x, space and comments can be gobbled up between the '(' and
8703           * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
8704           * intervening space, as the sequence is a token, and a token should be
8705           * indivisible */
8706 284         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8707            
8708 284         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8709 20         char *start_verb = RExC_parse;
8710           STRLEN verb_len = 0;
8711           char *start_arg = NULL;
8712           unsigned char op = 0;
8713           int argok = 1;
8714           int internal_argval = 0; /* internal_argval is only useful if !argok */
8715            
8716 0         if (has_intervening_patws && SIZE_ONLY) {
8717 0         ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8718           }
8719 20         while ( *RExC_parse && *RExC_parse != ')' ) {
8720 3532         if ( *RExC_parse == ':' ) {
8721 4759080         start_arg = RExC_parse + 1;
8722 4759080         break;
8723           }
8724 4759080         RExC_parse++;
8725           }
8726 30         ++start_verb;
8727 30         verb_len = RExC_parse - start_verb;
8728 5220200         if ( start_arg ) {
8729 36693902         RExC_parse++;
8730 36693902         while ( *RExC_parse && *RExC_parse != ')' )
8731 36693902         RExC_parse++;
8732 36693902         if ( *RExC_parse != ')' )
8733 36693902         vFAIL("Unterminated verb pattern argument");
8734 36693902         if ( RExC_parse == start_arg )
8735           start_arg = NULL;
8736           } else {
8737 19448076         if ( *RExC_parse != ')' )
8738 19448076         vFAIL("Unterminated verb pattern");
8739           }
8740          
8741 4380         switch ( *start_verb ) {
8742           case 'A': /* (*ACCEPT) */
8743 4380         if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8744           op = ACCEPT;
8745 2193         internal_argval = RExC_nestroot;
8746           }
8747           break;
8748           case 'C': /* (*COMMIT) */
8749 28824         if ( memEQs(start_verb,verb_len,"COMMIT") )
8750           op = COMMIT;
8751           break;
8752           case 'F': /* (*FAIL) */
8753 24832         if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8754           op = OPFAIL;
8755           argok = 0;
8756           }
8757           break;
8758           case ':': /* (*:NAME) */
8759           case 'M': /* (*MARK:NAME) */
8760 388         if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8761           op = MARKPOINT;
8762           argok = -1;
8763           }
8764           break;
8765           case 'P': /* (*PRUNE) */
8766 388         if ( memEQs(start_verb,verb_len,"PRUNE") )
8767           op = PRUNE;
8768           break;
8769           case 'S': /* (*SKIP) */
8770 24444         if ( memEQs(start_verb,verb_len,"SKIP") )
8771           op = SKIP;
8772           break;
8773           case 'T': /* (*THEN) */
8774           /* [19:06] :: is then */
8775 4380         if ( memEQs(start_verb,verb_len,"THEN") ) {
8776           op = CUTGROUP;
8777 4380         RExC_seen |= REG_SEEN_CUTGROUP;
8778           }
8779           break;
8780           }
8781 4380         if ( ! op ) {
8782 388         RExC_parse++;
8783 1462         vFAIL3("Unknown verb pattern '%.*s'",
8784           verb_len, start_verb);
8785           }
8786 880         if ( argok ) {
8787 388         if ( start_arg && internal_argval ) {
8788 0         vFAIL3("Verb pattern '%.*s' may not have an argument",
8789           verb_len, start_verb);
8790 388         } else if ( argok < 0 && !start_arg ) {
8791 3992         vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8792           verb_len, start_verb);
8793           } else {
8794 0         ret = reganode(pRExC_state, op, internal_argval);
8795 4380         if ( ! internal_argval && ! SIZE_ONLY ) {
8796 232         if (start_arg) {
8797 232         SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8798 108         ARG(ret) = add_data( pRExC_state, 1, "S" );
8799 508         RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8800 224         ret->flags = 0;
8801           } else {
8802 2436         ret->flags = 1;
8803           }
8804           }
8805           }
8806 324         if (!internal_argval)
8807 520         RExC_seen |= REG_SEEN_VERBARG;
8808 520         } else if ( start_arg ) {
8809 4380         vFAIL3("Verb pattern '%.*s' may not have an argument",
8810           verb_len, start_verb);
8811           } else {
8812 28         ret = reg_node(pRExC_state, op);
8813           }
8814 28         nextchar(pRExC_state);
8815 4352         return ret;
8816           }
8817 4108         else if (*RExC_parse == '?') { /* (?...) */
8818           bool is_logical = 0;
8819 252         const char * const seqstart = RExC_parse;
8820 4096         if (has_intervening_patws && SIZE_ONLY) {
8821 0         ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8822           }
8823            
8824 4096         RExC_parse++;
8825 4096         paren = *RExC_parse++;
8826           ret = NULL; /* For look-ahead/behind. */
8827 2058         switch (paren) {
8828            
8829           case 'P': /* (?P...) variants for those used to PCRE/Python */
8830 194         paren = *RExC_parse++;
8831 194         if ( paren == '<') /* (?P<...>) named capture */
8832           goto named_capture;
8833 194         else if (paren == '>') { /* (?P>name) named recursion */
8834           goto named_recursion;
8835           }
8836 194         else if (paren == '=') { /* (?P=...) named backref */
8837           /* this pretty much dupes the code for \k in regatom(), if
8838           you change this make sure you change that */
8839 1612         char* name_start = RExC_parse;
8840           U32 num = 0;
8841 3844         SV *sv_dat = reg_scan_name(pRExC_state,
8842           SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8843 3728         if (RExC_parse == name_start || *RExC_parse != ')')
8844 508         vFAIL2("Sequence %.3s... not terminated",parse_start);
8845            
8846 0         if (!SIZE_ONLY) {
8847 508         num = add_data( pRExC_state, 1, "S" );
8848 4352         RExC_rxi->data->data[num]=(void*)sv_dat;
8849 4352         SvREFCNT_inc_simple_void(sv_dat);
8850           }
8851 19443696         RExC_sawback = 1;
8852 8024360         ret = reganode(pRExC_state,
8853           ((! FOLD)
8854           ? NREF
8855           : (ASCII_FOLD_RESTRICTED)
8856           ? NREFFA
8857           : (AT_LEAST_UNI_SEMANTICS)
8858           ? NREFFU
8859           : (LOC)
8860           ? NREFFL
8861           : NREFF),
8862           num);
8863 8024360         *flagp |= HASWIDTH;
8864            
8865 6         Set_Node_Offset(ret, parse_start+1);
8866 8024360         Set_Node_Cur_Length(ret, parse_start);
8867            
8868 8024360         nextchar(pRExC_state);
8869 8024360         return ret;
8870           }
8871 3686         RExC_parse++;
8872 3686         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8873           /*NOTREACHED*/
8874           case '<': /* (?<...) */
8875 1174         if (*RExC_parse == '!')
8876           paren = ',';
8877 942         else if (*RExC_parse != '=')
8878           named_capture:
8879           { /* (?<...>) */
8880           char *name_start;
8881           SV *svname;
8882           paren= '>';
8883           case '\'': /* (?'...') */
8884 928         name_start= RExC_parse;
8885 928         svname = reg_scan_name(pRExC_state,
8886           SIZE_ONLY ? /* reverse test from the others */
8887           REG_RSN_RETURN_NAME :
8888           REG_RSN_RETURN_NULL);
8889 928         if (RExC_parse == name_start) {
8890 0         RExC_parse++;
8891 928         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8892           /*NOTREACHED*/
8893           }
8894 464         if (*RExC_parse != paren)
8895 464         vFAIL2("Sequence (?%c... not terminated",
8896           paren=='>' ? '<' : paren);
8897 464         if (SIZE_ONLY) {
8898           HE *he_str;
8899           SV *sv_dat = NULL;
8900 928         if (!svname) /* shouldn't happen */
8901 928         Perl_croak(aTHX_
8902           "panic: reg_scan_name returned NULL");
8903 928         if (!RExC_paren_names) {
8904 928         RExC_paren_names= newHV();
8905 928         sv_2mortal(MUTABLE_SV(RExC_paren_names));
8906           #ifdef DEBUGGING
8907 14         RExC_paren_name_list= newAV();
8908 14         sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8909           #endif
8910           }
8911 110910         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8912 47126         if ( he_str )
8913 22962         sv_dat = HeVAL(he_str);
8914 22962         if ( ! sv_dat ) {
8915           /* croak baby croak */
8916 22806         Perl_croak(aTHX_
8917           "panic: paren_name hash element allocation failed");
8918 0         } else if ( SvPOK(sv_dat) ) {
8919           /* (?|...) can mean we have dupes so scan to check
8920           its already been stored. Maybe a flag indicating
8921           we are inside such a construct would be useful,
8922           but the arrays are likely to be quite small, so
8923           for now we punt -- dmq */
8924 0         IV count = SvIV(sv_dat);
8925 22806         I32 *pv = (I32*)SvPVX(sv_dat);
8926           IV i;
8927 0         for ( i = 0 ; i < count ; i++ ) {
8928 22806         if ( pv[i] == RExC_npar ) {
8929           count = 0;
8930           break;
8931           }
8932           }
8933 12236         if ( count ) {
8934 0         pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8935 12236         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8936 10476         pv[count] = RExC_npar;
8937 10476         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8938           }
8939           } else {
8940 12236         (void)SvUPGRADE(sv_dat,SVt_PVNV);
8941 12236         sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8942 12236         SvIOK_on(sv_dat);
8943 12236         SvIV_set(sv_dat, 1);
8944           }
8945           #ifdef DEBUGGING
8946           /* Yes this does cause a memory leak in debugging Perls */
8947 0         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8948 18110         SvREFCNT_dec_NN(svname);
8949           #endif
8950            
8951           /*sv_dump(sv_dat);*/
8952           }
8953 488         nextchar(pRExC_state);
8954           paren = 1;
8955 488         goto capturing_parens;
8956           }
8957 1146         RExC_seen |= REG_SEEN_LOOKBEHIND;
8958 662         RExC_in_lookbehind++;
8959 488         RExC_parse++;
8960           case '=': /* (?=...) */
8961 484         RExC_seen_zerolen++;
8962 484         break;
8963           case '!': /* (?!...) */
8964 484         RExC_seen_zerolen++;
8965 484         if (*RExC_parse == ')') {
8966 17622         ret=reg_node(pRExC_state, OPFAIL);
8967 11748         nextchar(pRExC_state);
8968 11748         return ret;
8969           }
8970           break;
8971           case '|': /* (?|...) */
8972           /* branch reset, behave like a (?:...) except that
8973           buffers in alternations share the same numbers */
8974           paren = ':';
8975 11748         after_freeze = freeze_paren = RExC_npar;
8976 22806         break;
8977           case ':': /* (?:...) */
8978           case '>': /* (?>...) */
8979           break;
8980           case '$': /* (?$...) */
8981           case '@': /* (?@...) */
8982 22806         vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8983           break;
8984           case '#': /* (?#...) */
8985           /* XXX As soon as we disallow separating the '?' and '*' (by
8986           * spaces or (?#...) comment), it is believed that this case
8987           * will be unreachable and can be removed. See
8988           * [perl #117327] */
8989 93452         while (*RExC_parse && *RExC_parse != ')')
8990 93452         RExC_parse++;
8991 93452         if (*RExC_parse != ')')
8992 149934         FAIL("Sequence (?#... not terminated");
8993 149934         nextchar(pRExC_state);
8994 290252         *flagp = TRYAGAIN;
8995 290252         return NULL;
8996           case '0' : /* (?0) */
8997           case 'R' : /* (?R) */
8998 348         if (*RExC_parse != ')')
8999 348         FAIL("Sequence (?R) not terminated");
9000 348         ret = reg_node(pRExC_state, GOSTART);
9001 2676         *flagp |= POSTPONED;
9002 2676         nextchar(pRExC_state);
9003 2         return ret;
9004           /*notreached*/
9005           { /* named and numeric backreferences */
9006           I32 num;
9007           case '&': /* (?&NAME) */
9008 0         parse_start = RExC_parse - 1;
9009           named_recursion:
9010           {
9011 0         SV *sv_dat = reg_scan_name(pRExC_state,
9012           SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9013 0         num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9014           }
9015 0         goto gen_recurse_regop;
9016           assert(0); /* NOT REACHED */
9017           case '+':
9018 0         if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9019 0         RExC_parse++;
9020 0         vFAIL("Illegal pattern");
9021           }
9022           goto parse_recursion;
9023           /* NOT REACHED*/
9024           case '-': /* (?-1) */
9025 128         if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9026 0         RExC_parse--; /* rewind to let it be handled later */
9027 128         goto parse_flags;
9028           }
9029           /*FALLTHROUGH */
9030           case '1': case '2': case '3': case '4': /* (?1) */
9031           case '5': case '6': case '7': case '8': case '9':
9032 128         RExC_parse--;
9033           parse_recursion:
9034 128         num = atoi(RExC_parse);
9035 128         parse_start = RExC_parse - 1; /* MJD */
9036 3516         if (*RExC_parse == '-')
9037 3516         RExC_parse++;
9038 3516         while (isDIGIT(*RExC_parse))
9039 348         RExC_parse++;
9040 0         if (*RExC_parse!=')')
9041 0         vFAIL("Expecting close bracket");
9042            
9043           gen_recurse_regop:
9044 3304         if ( paren == '-' ) {
9045           /*
9046           Diagram of capture buffer numbering.
9047           Top line is the normal capture buffer numbers
9048           Bottom line is the negative indexing as from
9049           the X (the (?-2))
9050            
9051           + 1 2 3 4 5 X 6 7
9052           /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9053           - 5 4 3 2 1 X x x
9054            
9055           */
9056 2944         num = RExC_npar + num;
9057 2944         if (num < 1) {
9058 2732         RExC_parse++;
9059 3080         vFAIL("Reference to nonexistent group");
9060           }
9061 3080         } else if ( paren == '+' ) {
9062 1720         num = RExC_npar + num - 1;
9063           }
9064            
9065 6160         ret = reganode(pRExC_state, GOSUB, num);
9066 3080         if (!SIZE_ONLY) {
9067 3080         if (num > (I32)RExC_rx->nparens) {
9068 0         RExC_parse++;
9069 6596         vFAIL("Reference to nonexistent group");
9070           }
9071 360         ARG2L_SET( ret, RExC_recurse_count++);
9072 360         RExC_emit++;
9073 0         DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9074           "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9075           } else {
9076 0         RExC_size++;
9077           }
9078 6236         RExC_seen |= REG_SEEN_RECURSE;
9079 348         Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9080 6596         Set_Node_Offset(ret, parse_start); /* MJD */
9081            
9082 6596         *flagp |= POSTPONED;
9083 3298         nextchar(pRExC_state);
9084 0         return ret;
9085           } /* named and numeric backreferences */
9086           assert(0); /* NOT REACHED */
9087            
9088           case '?': /* (??...) */
9089           is_logical = 1;
9090 0         if (*RExC_parse != '{') {
9091 3298         RExC_parse++;
9092 3298         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9093           /*NOTREACHED*/
9094           }
9095 3298         *flagp |= POSTPONED;
9096 6596         paren = *RExC_parse++;
9097           /* FALL THROUGH */
9098           case '{': /* (?{...}) */
9099           {
9100           U32 n = 0;
9101           struct reg_code_block *cb;
9102            
9103 6756         RExC_seen_zerolen++;
9104            
9105 6756         if ( !pRExC_state->num_code_blocks
9106 6756         || pRExC_state->code_index >= pRExC_state->num_code_blocks
9107 223756         || pRExC_state->code_blocks[pRExC_state->code_index].start
9108 320         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9109 160         - RExC_start)
9110           ) {
9111 223436         if (RExC_pm_flags & PMf_USE_RE_EVAL)
9112 223436         FAIL("panic: Sequence (?{...}): no code block found\n");
9113 228792         FAIL("Eval-group not allowed at runtime, use re 'eval'");
9114           }
9115           /* this is a pre-compiled code block (?{...}) */
9116 228952         cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9117 228940         RExC_parse = RExC_start + cb->end;
9118 343330         if (!SIZE_ONLY) {
9119 343250         OP *o = cb->block;
9120 228860         if (cb->src_regex) {
9121 72         n = add_data(pRExC_state, 2, "rl");
9122 96         RExC_rxi->data->data[n] =
9123 72         (void*)SvREFCNT_inc((SV*)cb->src_regex);
9124 228816         RExC_rxi->data->data[n+1] = (void*)o;
9125           }
9126           else {
9127 228800         n = add_data(pRExC_state, 1,
9128 228800         (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9129 114366         RExC_rxi->data->data[n] = (void*)o;
9130           }
9131           }
9132 114494         pRExC_state->code_index++;
9133 103446         nextchar(pRExC_state);
9134            
9135 206732         if (is_logical) {
9136           regnode *eval;
9137 103286         ret = reg_node(pRExC_state, LOGICAL);
9138 103286         eval = reganode(pRExC_state, EVAL, n);
9139 11048         if (!SIZE_ONLY) {
9140 11048         ret->flags = 2;
9141           /* for later propagation into (??{}) return value */
9142 11048         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9143           }
9144 228768         REGTAIL(pRExC_state, ret, eval);
9145           /* deal with the length of this later - MJD */
9146 228768         return ret;
9147           }
9148 228928         ret = reganode(pRExC_state, EVAL, n);
9149 223596         Set_Node_Length(ret, RExC_parse - parse_start + 1);
9150 223596         Set_Node_Offset(ret, parse_start);
9151           return ret;
9152           }
9153           case '(': /* (?(?{...})...) and (?(?=...)...) */
9154           {
9155           int is_define= 0;
9156 223436         if (RExC_parse[0] == '?') { /* (?(?...)) */
9157 111668         if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9158 111668         || RExC_parse[1] == '<'
9159 223436         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9160           I32 flag;
9161           regnode *tail;
9162            
9163 223436         ret = reg_node(pRExC_state, LOGICAL);
9164 5332         if (!SIZE_ONLY)
9165 5332         ret->flags = 1;
9166          
9167 6598         tail = reg(pRExC_state, 1, &flag, depth+1);
9168 2188         if (flag & RESTART_UTF8) {
9169 944         *flagp = RESTART_UTF8;
9170 944         return NULL;
9171           }
9172 2188         REGTAIL(pRExC_state, ret, tail);
9173 2188         goto insert_if;
9174           }
9175           }
9176 994         else if ( RExC_parse[0] == '<' /* (?()...) */
9177 2188         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9178           {
9179 2188         char ch = RExC_parse[0] == '<' ? '>' : '\'';
9180 200         char *name_start= RExC_parse++;
9181           U32 num = 0;
9182 200         SV *sv_dat=reg_scan_name(pRExC_state,
9183           SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9184 1988         if (RExC_parse == name_start || *RExC_parse != ch)
9185 1988         vFAIL2("Sequence (?(%c... not terminated",
9186           (ch == '>' ? '<' : ch));
9187 6615         RExC_parse++;
9188 4410         if (!SIZE_ONLY) {
9189 232         num = add_data( pRExC_state, 1, "S" );
9190 232         RExC_rxi->data->data[num]=(void*)sv_dat;
9191 232         SvREFCNT_inc_simple_void(sv_dat);
9192           }
9193 232         ret = reganode(pRExC_state,NGROUPP,num);
9194 0         goto insert_if_check_paren;
9195           }
9196 232         else if (RExC_parse[0] == 'D' &&
9197 232         RExC_parse[1] == 'E' &&
9198 116         RExC_parse[2] == 'F' &&
9199 116         RExC_parse[3] == 'I' &&
9200 116         RExC_parse[4] == 'N' &&
9201 232         RExC_parse[5] == 'E')
9202           {
9203 232         ret = reganode(pRExC_state,DEFINEP,0);
9204 4356         RExC_parse +=6 ;
9205           is_define = 1;
9206 534         goto insert_if_check_paren;
9207           }
9208 534         else if (RExC_parse[0] == 'R') {
9209 534         RExC_parse++;
9210           parno = 0;
9211 534         if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9212 356         parno = atoi(RExC_parse++);
9213 356         while (isDIGIT(*RExC_parse))
9214 356         RExC_parse++;
9215 356         } else if (RExC_parse[0] == '&') {
9216           SV *sv_dat;
9217 3822         RExC_parse++;
9218 696         sv_dat = reg_scan_name(pRExC_state,
9219           SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9220 696         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9221           }
9222 232         ret = reganode(pRExC_state,INSUBP,parno);
9223 348         goto insert_if_check_paren;
9224           }
9225 0         else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9226           /* (?(1)...) */
9227           char c;
9228 464         parno = atoi(RExC_parse++);
9229            
9230 232         while (isDIGIT(*RExC_parse))
9231 232         RExC_parse++;
9232 232         ret = reganode(pRExC_state, GROUPP, parno);
9233            
9234           insert_if_check_paren:
9235 696         if ((c = *nextchar(pRExC_state)) != ')')
9236 696         vFAIL("Switch condition not recognized");
9237           insert_if:
9238 3126         REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9239 3124         br = regbranch(pRExC_state, &flags, 1,depth+1);
9240 4686         if (br == NULL) {
9241 0         if (flags & RESTART_UTF8) {
9242 3124         *flagp = RESTART_UTF8;
9243 4408         return NULL;
9244           }
9245 16         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9246           (UV) flags);
9247           } else
9248 6380         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9249 6380         c = *nextchar(pRExC_state);
9250 6380         if (flags&HASWIDTH)
9251 0         *flagp |= HASWIDTH;
9252 0         if (c == '|') {
9253 0         if (is_define)
9254 0         vFAIL("(?(DEFINE)....) does not allow branches");
9255 6380         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9256 6380         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9257 6380         if (flags & RESTART_UTF8) {
9258 6056         *flagp = RESTART_UTF8;
9259 6380         return NULL;
9260           }
9261 3744         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9262           (UV) flags);
9263           }
9264 0         REGTAIL(pRExC_state, ret, lastbr);
9265 3744         if (flags&HASWIDTH)
9266 3744         *flagp |= HASWIDTH;
9267 0         c = *nextchar(pRExC_state);
9268           }
9269           else
9270           lastbr = NULL;
9271 0         if (c != ')')
9272 0         vFAIL("Switch (?(condition)... contains too many branches");
9273 0         ender = reg_node(pRExC_state, TAIL);
9274 3744         REGTAIL(pRExC_state, br, ender);
9275 3744         if (lastbr) {
9276 3728         REGTAIL(pRExC_state, lastbr, ender);
9277 3744         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9278           }
9279           else
9280 6380         REGTAIL(pRExC_state, ret, ender);
9281 16         RExC_size++; /* XXX WHY do we need this?!!
9282           For large programs it seems to be required
9283           but I can't figure out why. -- dmq*/
9284 6364         return ret;
9285           }
9286           else {
9287 6364         vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9288           }
9289           }
9290           case '[': /* (?[ ... ]) */
9291 6364         return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9292           oregcomp_parse);
9293           case 0:
9294 3728         RExC_parse--; /* for vFAIL to print correctly */
9295 3728         vFAIL("Sequence (? incomplete");
9296           break;
9297           default: /* e.g., (?i) */
9298 2724         --RExC_parse;
9299           parse_flags:
9300 6452         parse_lparen_question_flags(pRExC_state);
9301 6452         if (UCHARAT(RExC_parse) != ':') {
9302 2         nextchar(pRExC_state);
9303 6250         *flagp = TRYAGAIN;
9304 16         return NULL;
9305           }
9306           paren = ':';
9307 104         nextchar(pRExC_state);
9308           ret = NULL;
9309 4756252         goto parse_rest;
9310           } /* end switch */
9311           }
9312           else { /* (...) */
9313           capturing_parens:
9314 4759120         parno = RExC_npar;
9315 4759080         RExC_npar++;
9316          
9317 4752         ret = reganode(pRExC_state, OPEN, parno);
9318 4752         if (!SIZE_ONLY ){
9319 4746         if (!RExC_nestroot)
9320 4754334         RExC_nestroot = parno;
9321 4754334         if (RExC_seen & REG_SEEN_RECURSE
9322 11442142         && !RExC_open_parens[parno-1])
9323           {
9324 11442142         DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9325           "Setting open paren #%"IVdf" to %d\n",
9326           (IV)parno, REG_NODE_NUM(ret)));
9327 11442142         RExC_open_parens[parno-1]= ret;
9328           }
9329           }
9330 11442154         Set_Node_Length(ret, 1); /* MJD */
9331 5579271         Set_Node_Offset(ret, RExC_parse); /* MJD */
9332           is_open = 1;
9333           }
9334           }
9335           else /* ! paren */
9336           ret = NULL;
9337          
9338           parse_rest:
9339           /* Pick up the branches, linking them together. */
9340 4247294         parse_start = RExC_parse; /* MJD */
9341 5579655         br = regbranch(pRExC_state, &flags, 1,depth+1);
9342            
9343           /* branch_len = (paren != 0); */
9344            
9345 5722         if (br == NULL) {
9346 5348         if (flags & RESTART_UTF8) {
9347 36434938         *flagp = RESTART_UTF8;
9348 36429002         return NULL;
9349           }
9350 1014112         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9351           }
9352 1014484         if (*RExC_parse == '|') {
9353 1014128         if (!SIZE_ONLY && RExC_extralen) {
9354 0         reginsert(pRExC_state, BRANCHJ, br, depth+1);
9355           }
9356           else { /* MJD */
9357 35414882         reginsert(pRExC_state, BRANCH, br, depth+1);
9358 2245784         Set_Node_Length(br, paren != 0);
9359 16         Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9360           }
9361           have_branch = 1;
9362 2245784         if (SIZE_ONLY)
9363 2245776         RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9364           }
9365 1126611         else if (paren == ':') {
9366 33169186         *flagp |= flags&SIMPLE;
9367           }
9368 5906420         if (is_open) { /* Starts with OPEN. */
9369 35414878         REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9370           }
9371 11158994         else if (paren != '?') /* Not Conditional */
9372           ret = br;
9373 24256604         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9374           lastbr = br;
9375 35415706         while (*RExC_parse == '|') {
9376 57545076         if (!SIZE_ONLY && RExC_extralen) {
9377 4689456         ender = reganode(pRExC_state, LONGJMP,0);
9378 0         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9379           }
9380 96         if (SIZE_ONLY)
9381 4689504         RExC_extralen += 2; /* Account for LONGJMP. */
9382 2348195         nextchar(pRExC_state);
9383 4689552         if (freeze_paren) {
9384 4689456         if (RExC_npar > after_freeze)
9385 2676         after_freeze = RExC_npar;
9386 2676         RExC_npar = freeze_paren;
9387           }
9388 2772         br = regbranch(pRExC_state, &flags, 0, depth+1);
9389            
9390 4689552         if (br == NULL) {
9391 4689414         if (flags & RESTART_UTF8) {
9392 10         *flagp = RESTART_UTF8;
9393 10         return NULL;
9394           }
9395 10         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9396           }
9397 96         REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9398           lastbr = br;
9399 4689500         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9400           }
9401            
9402 4689776         if (have_branch || paren != ':') {
9403           /* Make a closing node, and hook it on the end. */
9404 35415098         switch (paren) {
9405           case ':':
9406 29508770         ender = reg_node(pRExC_state, TAIL);
9407 964548         break;
9408           case 1: case 2:
9409 964556         ender = reganode(pRExC_state, CLOSE, parno);
9410 11158610         if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9411 11158598         DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9412           "Setting close paren #%"IVdf" to %d\n",
9413           (IV)parno, REG_NODE_NUM(ender)));
9414 5326         RExC_close_parens[parno-1]= ender;
9415 5326         if (RExC_nestroot == parno)
9416 4834         RExC_nestroot = 0;
9417           }
9418 439646         Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9419 453758         Set_Node_Length(ender,1); /* MJD */
9420           break;
9421           case '<':
9422           case ',':
9423           case '=':
9424           case '!':
9425 453746         *flagp &= ~HASWIDTH;
9426           /* FALL THROUGH */
9427           case '>':
9428 16931878         ender = reg_node(pRExC_state, SUCCEED);
9429 16931878         break;
9430           case 0:
9431 8466019         ender = reg_node(pRExC_state, END);
9432 29509034         if (!SIZE_ONLY) {
9433 29508900         assert(!RExC_opend); /* there can only be one! */
9434 1119633         RExC_opend = ender;
9435           }
9436           break;
9437           }
9438 41580         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9439           SV * const mysv_val1=sv_newmortal();
9440           SV * const mysv_val2=sv_newmortal();
9441           DEBUG_PARSE_MSG("lsbr");
9442           regprop(RExC_rx, mysv_val1, lastbr);
9443           regprop(RExC_rx, mysv_val2, ender);
9444           PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9445           SvPV_nolen_const(mysv_val1),
9446           (IV)REG_NODE_NUM(lastbr),
9447           SvPV_nolen_const(mysv_val2),
9448           (IV)REG_NODE_NUM(ender),
9449           (IV)(ender - lastbr)
9450           );
9451           });
9452 6196541         REGTAIL(pRExC_state, lastbr, ender);
9453            
9454 5077042         if (have_branch && !SIZE_ONLY) {
9455           char is_nothing= 1;
9456 5076766         if (depth==1)
9457 3460842         RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9458            
9459           /* Hook the tails of the branches to the closing node. */
9460 3460920         for (br = ret; br; br = regnext(br)) {
9461 1615986         const U8 op = PL_regkind[OP(br)];
9462 70         if (op == BRANCH) {
9463 1119555         REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9464 252         if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9465           is_nothing= 0;
9466           }
9467 210         else if (op == BRANCHJ) {
9468 196         REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9469           /* for now we always disable this optimisation * /
9470           if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9471           */
9472           is_nothing= 0;
9473           }
9474           }
9475 108         if (is_nothing) {
9476 100         br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9477 384         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9478           SV * const mysv_val1=sv_newmortal();
9479           SV * const mysv_val2=sv_newmortal();
9480           DEBUG_PARSE_MSG("NADA");
9481           regprop(RExC_rx, mysv_val1, ret);
9482           regprop(RExC_rx, mysv_val2, ender);
9483           PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9484           SvPV_nolen_const(mysv_val1),
9485           (IV)REG_NODE_NUM(ret),
9486           SvPV_nolen_const(mysv_val2),
9487           (IV)REG_NODE_NUM(ender),
9488           (IV)(ender - ret)
9489           );
9490           });
9491 288         OP(br)= NOTHING;
9492 96         if (OP(ender) == TAIL) {
9493 35414814         NEXT_OFF(br)= 0;
9494 453746         RExC_emit= br + 1;
9495           } else {
9496           regnode *opt;
9497 453746         for ( opt= br + 1; opt < ender ; opt++ )
9498 453746         OP(opt)= OPTIMIZED;
9499 453746         NEXT_OFF(br)= ender - br;
9500           }
9501           }
9502           }
9503           }
9504            
9505           {
9506           const char *p;
9507           static const char parens[] = "=!<,>";
9508            
9509 454118         if (paren && (p = strchr(parens, paren))) {
9510 453746         U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9511 35414814         int flag = (p - parens) > 1;
9512            
9513 18482936         if (paren == '>')
9514           node = SUSPEND, flag = 0;
9515 18482936         reginsert(pRExC_state, node,ret, depth+1);
9516 30         Set_Node_Cur_Length(ret, parse_start);
9517 30         Set_Node_Offset(ret, parse_start + 1);
9518 16931878         ret->flags = flag;
9519 56         REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9520           }
9521           }
9522            
9523           /* Check for proper termination. */
9524 428         if (paren) {
9525           /* restore original flags, but keep (?p) */
9526 160         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9527 104         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9528 35414728         RExC_parse = oregcomp_parse;
9529 93448         vFAIL("Unmatched (");
9530           }
9531           }
9532 35414996         else if (!paren && RExC_parse < RExC_end) {
9533 18610531         if (*RExC_parse == ')') {
9534 41134494         RExC_parse++;
9535 41134494         vFAIL("Unmatched )");
9536           }
9537           else
9538 41134494         FAIL("Junk on end of regexp"); /* "Can't happen". */
9539           assert(0); /* NOTREACHED */
9540           }
9541            
9542 4689828         if (RExC_in_lookbehind) {
9543 0         RExC_in_lookbehind--;
9544           }
9545 4689828         if (after_freeze > RExC_npar)
9546 41134494         RExC_npar = after_freeze;
9547           return(ret);
9548           }
9549            
9550           /*
9551           - regbranch - one alternative of an | operator
9552           *
9553           * Implements the concatenation operator.
9554           *
9555           * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9556           * restarted.
9557           */
9558           STATIC regnode *
9559 2348591         S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9560           {
9561           dVAR;
9562           regnode *ret;
9563           regnode *chain = NULL;
9564           regnode *latest;
9565 41134986         I32 flags = 0, c = 0;
9566 41134986         GET_RE_DEBUG_FLAGS_DECL;
9567            
9568 41134986         PERL_ARGS_ASSERT_REGBRANCH;
9569            
9570 137092971         DEBUG_PARSE("brnc");
9571            
9572 76713332         if (first)
9573           ret = NULL;
9574           else {
9575 76712936         if (!SIZE_ONLY && RExC_extralen)
9576 76706862         ret = reganode(pRExC_state, BRANCHJ,0);
9577           else {
9578 1014574         ret = reg_node(pRExC_state, BRANCH);
9579 452         Set_Node_Length(ret, 1);
9580           }
9581           }
9582            
9583 1014614         if (!first && SIZE_ONLY)
9584 1014170         RExC_extralen += 1; /* BRANCHJ */
9585            
9586 1014614         *flagp = WORST; /* Tentatively. */
9587            
9588 492         RExC_parse--;
9589 75692876         nextchar(pRExC_state);
9590 75694056         while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9591 75693096         flags &= ~TRYAGAIN;
9592 40178458         latest = regpiece(pRExC_state, &flags,depth+1);
9593 35515350         if (latest == NULL) {
9594 35514662         if (flags & TRYAGAIN)
9595 75692562         continue;
9596 40114418         if (flags & RESTART_UTF8) {
9597 164082         *flagp = RESTART_UTF8;
9598 164082         return NULL;
9599           }
9600 40114394         FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
9601           }
9602 31592139         else if (ret == NULL)
9603           ret = latest;
9604 76713528         *flagp |= flags&(HASWIDTH|POSTPONED);
9605 76713528         if (chain == NULL) /* First piece. */
9606 76713300         *flagp |= flags&SPSTART;
9607           else {
9608 76713068         RExC_naughty++;
9609 76707126         REGTAIL(pRExC_state, chain, latest);
9610           }
9611           chain = latest;
9612 1015166         c++;
9613           }
9614 1014946         if (chain == NULL) { /* Loop ran zero times. */
9615 8         chain = reg_node(pRExC_state, NOTHING);
9616 1014486         if (ret == NULL)
9617           ret = chain;
9618           }
9619 75692888         if (c == 1) {
9620 75692724         *flagp |= flags&SIMPLE;
9621           }
9622            
9623           return ret;
9624           }
9625            
9626           /*
9627           - regpiece - something followed by possible [*+?]
9628           *
9629           * Note that the branching code sequences used for ? and the general cases
9630           * of * and + are somewhat optimized: they use the same NOTHING node as
9631           * both the endmarker for their branch list and the body of the last branch.
9632           * It might seem that this node could be dispensed with entirely, but the
9633           * endmarker role is not redundant.
9634           *
9635           * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9636           * TRYAGAIN.
9637           * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9638           * restarted.
9639           */
9640           STATIC regnode *
9641 864014         S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9642           {
9643           dVAR;
9644           regnode *ret;
9645           char op;
9646           char *next;
9647           I32 flags;
9648 3764533         const char * const origparse = RExC_parse;
9649           I32 min;
9650           I32 max = REG_INFTY;
9651           #ifdef RE_TRACK_PATTERN_OFFSETS
9652           char *parse_start;
9653           #endif
9654           const char *maxpos = NULL;
9655            
9656           /* Save the original in case we change the emitted regop to a FAIL. */
9657 2473538         regnode * const orig_emit = RExC_emit;
9658            
9659 800934         GET_RE_DEBUG_FLAGS_DECL;
9660            
9661 2473538         PERL_ARGS_ASSERT_REGPIECE;
9662            
9663 864014         DEBUG_PARSE("piec");
9664            
9665 864014         ret = regatom(pRExC_state, &flags,depth+1);
9666 864014         if (ret == NULL) {
9667 863326         if (flags & (TRYAGAIN|RESTART_UTF8))
9668 863326         *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9669           else
9670 800222         FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
9671 63104         return(NULL);
9672           }
9673            
9674 863990         op = *RExC_parse;
9675            
9676 823670         if (op == '{' && regcurly(RExC_parse, FALSE)) {
9677           maxpos = NULL;
9678           #ifdef RE_TRACK_PATTERN_OFFSETS
9679 2         parse_start = RExC_parse; /* MJD */
9680           #endif
9681 863300         next = RExC_parse + 1;
9682 863300         while (isDIGIT(*next) || *next == ',') {
9683 863300         if (*next == ',') {
9684 132         if (maxpos)
9685           break;
9686           else
9687           maxpos = next;
9688           }
9689 66         next++;
9690           }
9691 66         if (*next == '}') { /* got one */
9692 66         if (!maxpos)
9693           maxpos = next;
9694 132         RExC_parse++;
9695 38999632         min = atoi(RExC_parse);
9696 9368024         if (*maxpos == ',')
9697 3714528         maxpos++;
9698           else
9699 3714528         maxpos = RExC_parse;
9700           max = atoi(maxpos);
9701 5653496         if (!max && *maxpos != '0')
9702           max = REG_INFTY; /* meaning "infinity" */
9703 5653496         else if (max >= REG_INFTY)
9704 5653496         vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9705 5653496         RExC_parse = next;
9706 0         nextchar(pRExC_state);
9707 0         if (max < min) { /* If can't match, warn and optimize to fail
9708           unconditionally */
9709 0         if (SIZE_ONLY) {
9710 5653496         ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9711            
9712           /* We can't back off the size because we have to reserve
9713           * enough space for all the things we are about to throw
9714           * away, but we can shrink it by the ammount we are about
9715           * to re-use here */
9716 5653496         RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9717           }
9718           else {
9719 0         RExC_emit = orig_emit;
9720           }
9721 5653496         ret = reg_node(pRExC_state, OPFAIL);
9722 5653496         return ret;
9723           }
9724            
9725           do_curly:
9726 3241146         if ((flags&SIMPLE)) {
9727 5653496         RExC_naughty += 2 + RExC_naughty / 2;
9728 9368024         reginsert(pRExC_state, CURLY, ret, depth+1);
9729 9368024         Set_Node_Offset(ret, parse_start+1); /* MJD */
9730 2824626         Set_Node_Cur_Length(ret, parse_start);
9731           }
9732           else {
9733 9368024         regnode * const w = reg_node(pRExC_state, WHILEM);
9734            
9735 9367188         w->flags = 0;
9736 9368024         REGTAIL(pRExC_state, ret, w);
9737 4435696         if (!SIZE_ONLY && RExC_extralen) {
9738 4435696         reginsert(pRExC_state, LONGJMP,ret, depth+1);
9739 74829118         reginsert(pRExC_state, NOTHING,ret, depth+1);
9740 58802534         NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9741           }
9742 58802534         reginsert(pRExC_state, CURLYX,ret, depth+1);
9743           /* MJD hk */
9744 16026584         Set_Node_Offset(ret, parse_start+1);
9745 16026584         Set_Node_Length(ret,
9746           op == '{' ? (RExC_parse - parse_start) : 1);
9747            
9748 16026584         if (!SIZE_ONLY && RExC_extralen)
9749 5553704         NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9750 5553704         REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9751 5553704         if (SIZE_ONLY)
9752 10472880         RExC_whilem_seen++, RExC_extralen += 3;
9753 9297354         RExC_naughty += 4 + RExC_naughty; /* compound interest */
9754           }
9755 1968024         ret->flags = 0;
9756            
9757 1968024         if (min > 0)
9758 1968024         *flagp = WORST;
9759 7329330         if (max > 0)
9760 5346952         *flagp |= HASWIDTH;
9761 16889752         if (!SIZE_ONLY) {
9762 80         ARG1_SET(ret, (U16)min);
9763 80         ARG2_SET(ret, (U16)max);
9764           }
9765            
9766           goto nest_check;
9767           }
9768           }
9769            
9770 764         if (!ISMULT1(op)) {
9771 16890436         *flagp = flags;
9772 3258166         return(ret);
9773           }
9774            
9775           #if 0 /* Now runtime fix should be reliable. */
9776            
9777           /* if this is reinstated, don't forget to put this back into perldiag:
9778            
9779           =item Regexp *+ operand could be empty at {#} in regex m/%s/
9780            
9781           (F) The part of the regexp subject to either the * or + quantifier
9782           could match an empty string. The {#} shows in the regular
9783           expression about where the problem was discovered.
9784            
9785           */
9786            
9787           if (!(flags&HASWIDTH) && op != '?')
9788           vFAIL("Regexp *+ operand could be empty");
9789           #endif
9790            
9791           #ifdef RE_TRACK_PATTERN_OFFSETS
9792 3257478         parse_start = RExC_parse;
9793           #endif
9794 3257478         nextchar(pRExC_state);
9795            
9796 13632270         *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9797            
9798 13704         if (op == '*' && (flags&SIMPLE)) {
9799 13704         reginsert(pRExC_state, STAR, ret, depth+1);
9800 13704         ret->flags = 0;
9801 13704         RExC_naughty += 4;
9802           }
9803 13704         else if (op == '*') {
9804           min = 0;
9805           goto do_curly;
9806           }
9807 13704         else if (op == '+' && (flags&SIMPLE)) {
9808 13704         reginsert(pRExC_state, PLUS, ret, depth+1);
9809 16889748         ret->flags = 0;
9810 30         RExC_naughty += 3;
9811           }
9812 30         else if (op == '+') {
9813           min = 1;
9814           goto do_curly;
9815           }
9816 10018         else if (op == '?') {
9817           min = 0; max = 1;
9818           goto do_curly;
9819           }
9820           nest_check:
9821 10018         if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9822 1446         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9823 10741         ckWARN3reg(RExC_parse,
9824           "%.*s matches null string many times",
9825           (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9826           origparse);
9827 10018         (void)ReREFCNT_inc(RExC_rx_sv);
9828           }
9829            
9830 5962         if (RExC_parse < RExC_end && *RExC_parse == '?') {
9831 5962         nextchar(pRExC_state);
9832 1638         reginsert(pRExC_state, MINMOD, ret, depth+1);
9833 14         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9834           }
9835           else
9836 4324         if (RExC_parse < RExC_end && *RExC_parse == '+') {
9837           regnode *ender;
9838 4324         nextchar(pRExC_state);
9839 4324         ender = reg_node(pRExC_state, SUCCEED);
9840 4324         REGTAIL(pRExC_state, ret, ender);
9841 4324         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9842 4324         ret->flags = 0;
9843 4056         ender = reg_node(pRExC_state, TAIL);
9844 28         REGTAIL(pRExC_state, ret, ender);
9845           }
9846            
9847 4028         if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9848 4028         RExC_parse++;
9849 6008         vFAIL("Nested quantifiers");
9850           }
9851            
9852           return(ret);
9853           }
9854            
9855           STATIC bool
9856 3988         S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9857           const bool strict /* Apply stricter parsing rules? */
9858           )
9859           {
9860          
9861           /* This is expected to be called by a parser routine that has recognized '\N'
9862           and needs to handle the rest. RExC_parse is expected to point at the first
9863           char following the N at the time of the call. On successful return,
9864           RExC_parse has been updated to point to just after the sequence identified
9865           by this routine, and <*flagp> has been updated.
9866            
9867           The \N may be inside (indicated by the boolean ) or outside a
9868           character class.
9869            
9870           \N may begin either a named sequence, or if outside a character class, mean
9871           to match a non-newline. For non single-quoted regexes, the tokenizer has
9872           attempted to decide which, and in the case of a named sequence, converted it
9873           into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9874           where c1... are the characters in the sequence. For single-quoted regexes,
9875           the tokenizer passes the \N sequence through unchanged; this code will not
9876           attempt to determine this nor expand those, instead raising a syntax error.
9877           The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9878           or there is no '}', it signals that this \N occurrence means to match a
9879           non-newline.
9880            
9881           Only the \N{U+...} form should occur in a character class, for the same
9882           reason that '.' inside a character class means to just match a period: it
9883           just doesn't make sense.
9884            
9885           The function raises an error (via vFAIL), and doesn't return for various
9886           syntax errors. Otherwise it returns TRUE and sets or on
9887           success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9888           RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9889           only possible if node_p is non-NULL.
9890            
9891            
9892           If is non-null, it means the caller can accept an input sequence
9893           consisting of a just a single code point; <*valuep> is set to that value
9894           if the input is such.
9895            
9896           If is non-null it signifies that the caller can accept any other
9897           legal sequence (i.e., one that isn't just a single code point). <*node_p>
9898           is set as follows:
9899           1) \N means not-a-NL: points to a newly created REG_ANY node;
9900           2) \N{}: points to a new NOTHING node;
9901           3) otherwise: points to a new EXACT node containing the resolved
9902           string.
9903           Note that FALSE is returned for single code point sequences if is
9904           null.
9905           */
9906            
9907           char * endbrace; /* '}' following the name */
9908           char* p;
9909           char *endchar; /* Points to '.' or '}' ending cur char in the input
9910           stream */
9911           bool has_multiple_chars; /* true if the input stream contains a sequence of
9912           more than one character */
9913            
9914 3988         GET_RE_DEBUG_FLAGS_DECL;
9915          
9916 14         PERL_ARGS_ASSERT_GROK_BSLASH_N;
9917            
9918 14         GET_RE_DEBUG_FLAGS;
9919            
9920 4014         assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9921            
9922           /* The [^\n] meaning of \N ignores spaces and comments under the /x
9923           * modifier. The other meaning does not */
9924 26         p = (RExC_flags & RXf_PMf_EXTENDED)
9925 8         ? regwhite( pRExC_state, RExC_parse )
9926 18         : RExC_parse;
9927            
9928           /* Disambiguate between \N meaning a named character versus \N meaning
9929           * [^\n]. The former is assumed when it can't be the latter. */
9930 14         if (*p != '{' || regcurly(p, FALSE)) {
9931 8         RExC_parse = p;
9932 2         if (! node_p) {
9933           /* no bare \N in a charclass */
9934 2         if (in_char_class) {
9935 6         vFAIL("\\N in a character class must be a named character: \\N{...}");
9936           }
9937           return FALSE;
9938           }
9939 20         nextchar(pRExC_state);
9940 20         *node_p = reg_node(pRExC_state, REG_ANY);
9941 3988         *flagp |= HASWIDTH|SIMPLE;
9942 3988         RExC_naughty++;
9943 7976         RExC_parse--;
9944 3988         Set_Node_Length(*node_p, 1); /* MJD */
9945           return TRUE;
9946           }
9947            
9948           /* Here, we have decided it should be a named character or sequence */
9949            
9950           /* The test above made sure that the next real character is a '{', but
9951           * under the /x modifier, it could be separated by space (or a comment and
9952           * \n) and this is not allowed (for consistency with \x{...} and the
9953           * tokenizer handling of \N{NAME}). */
9954 7458         if (*RExC_parse != '{') {
9955 3542         vFAIL("Missing braces on \\N{}");
9956           }
9957            
9958 3542         RExC_parse++; /* Skip past the '{' */
9959            
9960 3542         if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9961 3542         || ! (endbrace == RExC_parse /* nothing between the {} */
9962 3542         || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9963 3514         && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9964           {
9965 70         if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9966 70         vFAIL("\\N{NAME} must be resolved by the lexer");
9967           }
9968            
9969 0         if (endbrace == RExC_parse) { /* empty: \N{} */
9970           bool ret = TRUE;
9971 70         if (node_p) {
9972 28         *node_p = reg_node(pRExC_state,NOTHING);
9973           }
9974 70         else if (in_char_class) {
9975 3472         if (SIZE_ONLY && in_char_class) {
9976 6         if (strict) {
9977 2         RExC_parse++; /* Position after the "}" */
9978 2         vFAIL("Zero length \\N{}");
9979           }
9980           else {
9981 4         ckWARNreg(RExC_parse,
9982           "Ignoring zero length \\N{} in character class");
9983           }
9984           }
9985           ret = FALSE;
9986           }
9987           else {
9988           return FALSE;
9989           }
9990 3470         nextchar(pRExC_state);
9991 446         return ret;
9992           }
9993            
9994 412         RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9995 412         RExC_parse += 2; /* Skip past the 'U+' */
9996            
9997 34         endchar = RExC_parse + strcspn(RExC_parse, ".}");
9998            
9999           /* Code points are separated by dots. If none, there is only one code
10000           * point, and is terminated by the brace */
10001 34         has_multiple_chars = (endchar < endbrace);
10002            
10003 3175         if (valuep && (! has_multiple_chars || in_char_class)) {
10004           /* We only pay attention to the first char of
10005           multichar strings being returned in char classes. I kinda wonder
10006           if this makes sense as it does change the behaviour
10007           from earlier versions, OTOH that behaviour was broken
10008           as well. XXX Solution is to recharacterize as
10009           [rest-of-class]|multi1|multi2... */
10010            
10011 3124         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10012 3124         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10013           | PERL_SCAN_DISALLOW_PREFIX
10014 3124         | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10015            
10016 3124         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10017            
10018           /* The tokenizer should have guaranteed validity, but it's possible to
10019           * bypass it by using single quoting, so check */
10020 6248         if (length_of_hex == 0
10021 34         || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10022           {
10023 34         RExC_parse += length_of_hex; /* Includes all the valid */
10024 34         RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10025 0         ? UTF8SKIP(RExC_parse)
10026           : 1;
10027           /* Guard against malformed utf8 */
10028 34         if (RExC_parse >= endchar) {
10029 34         RExC_parse = endchar;
10030           }
10031 34         vFAIL("Invalid hexadecimal number in \\N{U+...}");
10032           }
10033            
10034 2         if (in_char_class && has_multiple_chars) {
10035 2         if (strict) {
10036 2         RExC_parse = endbrace;
10037 0         vFAIL("\\N{} in character class restricted to one character");
10038           }
10039           else {
10040 32         ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10041           }
10042           }
10043            
10044 32         RExC_parse = endbrace + 1;
10045           }
10046 32         else if (! node_p || ! has_multiple_chars) {
10047            
10048           /* Here, the input is legal, but not according to the caller's
10049           * options. We fail without advancing the parse, so that the
10050           * caller can try again */
10051 32         RExC_parse = p;
10052 4960         return FALSE;
10053           }
10054           else {
10055            
10056           /* What is done here is to convert this to a sub-pattern of the form
10057           * (?:\x{char1}\x{char2}...)
10058           * and then call reg recursively. That way, it retains its atomicness,
10059           * while not having to worry about special handling that some code
10060           * points may have. toke.c has converted the original Unicode values
10061           * to native, so that we can just pass on the hex values unchanged. We
10062           * do have to set a flag to keep recoding from happening in the
10063           * recursion */
10064            
10065 464         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10066           STRLEN len;
10067 464         char *orig_end = RExC_end;
10068           I32 flags;
10069            
10070 464         while (RExC_parse < endbrace) {
10071            
10072           /* Convert to notation the rest of the code understands */
10073 464         sv_catpv(substitute_parse, "\\x{");
10074 464         sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10075 464         sv_catpv(substitute_parse, "}");
10076            
10077           /* Point to the beginning of the next character in the sequence. */
10078 464         RExC_parse = endchar + 1;
10079 464         endchar = RExC_parse + strcspn(RExC_parse, ".}");
10080           }
10081 928         sv_catpv(substitute_parse, ")");
10082            
10083 464         RExC_parse = SvPV(substitute_parse, len);
10084            
10085           /* Don't allow empty number */
10086 0         if (len < 8) {
10087 464         vFAIL("Invalid hexadecimal number in \\N{U+...}");
10088           }
10089 29313254         RExC_end = RExC_parse + len;
10090            
10091           /* The values are Unicode, and therefore not subject to recoding */
10092 16821824         RExC_override_recoding = 1;
10093            
10094 8410912         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10095 2265568         if (flags & RESTART_UTF8) {
10096 8462674         *flagp = RESTART_UTF8;
10097 29003628         return FALSE;
10098           }
10099 29003628         FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10100           (UV) flags);
10101           }
10102 29003628         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10103            
10104 643952         RExC_parse = endbrace;
10105 389050         RExC_end = orig_end;
10106 434032         RExC_override_recoding = 0;
10107            
10108 42406         nextchar(pRExC_state);
10109           }
10110            
10111           return TRUE;
10112           }
10113            
10114            
10115           /*
10116           * reg_recode
10117           *
10118           * It returns the code point in utf8 for the value in *encp.
10119           * value: a code value in the source encoding
10120           * encp: a pointer to an Encode object
10121           *
10122           * If the result from Encode is not a single character,
10123           * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10124           */
10125           STATIC UV
10126 42406         S_reg_recode(pTHX_ const char value, SV **encp)
10127           {
10128 573915         STRLEN numlen = 1;
10129 384530         SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10130 0         const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10131 0         const STRLEN newlen = SvCUR(sv);
10132           UV uv = UNICODE_REPLACEMENT;
10133            
10134 384530         PERL_ARGS_ASSERT_REG_RECODE;
10135            
10136 384530         if (newlen)
10137 0         uv = SvUTF8(sv)
10138 0         ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10139 0         : *(U8*)s;
10140            
10141 29003628         if (!newlen || numlen != newlen) {
10142           uv = UNICODE_REPLACEMENT;
10143 14557868         *encp = NULL;
10144           }
10145 14445760         return uv;
10146           }
10147            
10148           PERL_STATIC_INLINE U8
10149 14446192         S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10150           {
10151           U8 op;
10152            
10153 14446192         PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10154            
10155 322992         if (! FOLD) {
10156           return EXACT;
10157           }
10158            
10159 29003764         op = get_regex_charset(RExC_flags);
10160 29003696         if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10161 14434854         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10162           been, so there is no hole */
10163           }
10164            
10165 183677         return op + EXACTF;
10166           }
10167            
10168           PERL_STATIC_INLINE void
10169 14401972         S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10170           {
10171           /* This knows the details about sizing an EXACTish node, setting flags for
10172           * it (by setting <*flagp>, and potentially populating it with a single
10173           * character.
10174           *
10175           * If (the length in bytes) is non-zero, this function assumes that
10176           * the node has already been populated, and just does the sizing. In this
10177           * case should be the final code point that has already been
10178           * placed into the node. This value will be ignored except that under some
10179           * circumstances <*flagp> is set based on it.
10180           *
10181           * If is zero, the function assumes that the node is to contain only
10182           * the single character given by and calculates what
10183           * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10184           * additionally will populate the node's STRING with , if
10185           * is 0. In both cases <*flagp> is appropriately set
10186           *
10187           * It knows that under FOLD, the Latin Sharp S and UTF characters above
10188           * 255, must be folded (the former only when the rules indicate it can
10189           * match 'ss') */
10190            
10191 29004036         bool len_passed_in = cBOOL(len != 0);
10192           U8 character[UTF8_MAXBYTES_CASE+1];
10193            
10194 76713248         PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10195            
10196 76713248         if (! len_passed_in) {
10197 76712872         if (UTF) {
10198 76712840         if (FOLD && (! LOC || code_point > 255)) {
10199 76715040         _to_uni_fold_flags(code_point,
10200           character,
10201           &len,
10202           FOLD_FLAGS_FULL | ((LOC)
10203           ? FOLD_FLAGS_LOCALE
10204           : (ASCII_FOLD_RESTRICTED)
10205           ? FOLD_FLAGS_NOMIX_ASCII
10206           : 0));
10207           }
10208           else {
10209 76717240         uvchr_to_utf8( character, code_point);
10210 6310938         len = UTF8SKIP(character);
10211           }
10212           }
10213 6311002         else if (! FOLD
10214 6310970         || code_point != LATIN_SMALL_LETTER_SHARP_S
10215 109760         || ASCII_FOLD_RESTRICTED
10216 6201178         || ! AT_LEAST_UNI_SEMANTICS)
10217           {
10218 478512         *character = (U8) code_point;
10219 5722730         len = 1;
10220           }
10221           else {
10222 4163218         *character = 's';
10223 4163218         *(character + 1) = 's';
10224 1943588         len = 2;
10225           }
10226           }
10227            
10228 4163626         if (SIZE_ONLY) {
10229 59804         RExC_size += STR_SZ(len);
10230           }
10231           else {
10232 4103822         RExC_emit += STR_SZ(len);
10233 90444         STR_LEN(node) = len;
10234 4013582         if (! len_passed_in) {
10235 5133946         Copy((char *) character, STRING(node), len, char);
10236           }
10237           }
10238            
10239 5134338         *flagp |= HASWIDTH;
10240            
10241           /* A single character node is SIMPLE, except for the special-cased SHARP S
10242           * under /di. */
10243 3633514         if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10244 1501072         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10245 5133930         || ! FOLD || ! DEPENDS_SEMANTICS))
10246           {
10247 5134178         *flagp |= SIMPLE;
10248           }
10249 5134338         }
10250            
10251           /*
10252           - regatom - the lowest level
10253            
10254           Try to identify anything special at the start of the pattern. If there
10255           is, then handle it as required. This may involve generating a single regop,
10256           such as for an assertion; or it may involve recursing, such as to
10257           handle a () structure.
10258            
10259           If the string doesn't start with something special then we gobble up
10260           as much literal text as we can.
10261            
10262           Once we have been able to handle whatever type of thing started the
10263           sequence, we return.
10264            
10265           Note: we have to be careful with escapes, as they can be both literal
10266           and special, and in the case of \10 and friends, context determines which.
10267            
10268           A summary of the code structure is:
10269            
10270           switch (first_byte) {
10271           cases for each special:
10272           handle this special;
10273           break;
10274           case '\\':
10275           switch (2nd byte) {
10276           cases for each unambiguous special:
10277           handle this special;
10278           break;
10279           cases for each ambigous special/literal:
10280           disambiguate;
10281           if (special) handle here
10282           else goto defchar;
10283           default: // unambiguously literal:
10284           goto defchar;
10285           }
10286           default: // is a literal char
10287           // FALL THROUGH
10288           defchar:
10289           create EXACTish node for literal;
10290           while (more input and node isn't full) {
10291           switch (input_byte) {
10292           cases for each special;
10293           make sure parse pointer is set so that the next call to
10294           regatom will see this special first
10295           goto loopdone; // EXACTish node terminated by prev. char
10296           default:
10297           append char to EXACTISH node;
10298           }
10299           get next input byte;
10300           }
10301           loopdone:
10302           }
10303           return the generated node;
10304            
10305           Specifically there are two separate switches for handling
10306           escape sequences, with the one for handling literal escapes requiring
10307           a dummy entry for all of the special escapes that are actually handled
10308           by the other.
10309            
10310           Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10311           TRYAGAIN.
10312           Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10313           restarted.
10314           Otherwise does not return NULL.
10315           */
10316            
10317           STATIC regnode *
10318 7827542         S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10319           {
10320           dVAR;
10321 7827542         regnode *ret = NULL;
10322 7827392         I32 flags = 0;
10323 800         char *parse_start = RExC_parse;
10324           U8 op;
10325           int invert = 0;
10326            
10327 800         GET_RE_DEBUG_FLAGS_DECL;
10328            
10329 7827304         *flagp = WORST; /* Tentatively. */
10330            
10331 117096         DEBUG_PARSE("atom");
10332            
10333 712         PERL_ARGS_ASSERT_REGATOM;
10334            
10335           tryagain:
10336 7710920         switch ((U8)*RExC_parse) {
10337           case '^':
10338 7710208         RExC_seen_zerolen++;
10339 18922790         nextchar(pRExC_state);
10340 18922790         if (RExC_flags & RXf_PMf_MULTILINE)
10341 18919996         ret = reg_node(pRExC_state, MBOL);
10342 592196         else if (RExC_flags & RXf_PMf_SINGLELINE)
10343 4740         ret = reg_node(pRExC_state, SBOL);
10344           else
10345 340         ret = reg_node(pRExC_state, BOL);
10346 340         Set_Node_Length(ret, 1); /* MJD */
10347           break;
10348           case '$':
10349 587460         nextchar(pRExC_state);
10350 587460         if (*RExC_parse)
10351 587456         RExC_seen_zerolen++;
10352 4         if (RExC_flags & RXf_PMf_MULTILINE)
10353 18327800         ret = reg_node(pRExC_state, MEOL);
10354 18327804         else if (RExC_flags & RXf_PMf_SINGLELINE)
10355 16         ret = reg_node(pRExC_state, SEOL);
10356           else
10357 20         ret = reg_node(pRExC_state, EOL);
10358 20         Set_Node_Length(ret, 1); /* MJD */
10359           break;
10360           case '.':
10361 8         nextchar(pRExC_state);
10362 21320         if (RExC_flags & RXf_PMf_SINGLELINE)
10363 21312         ret = reg_node(pRExC_state, SANY);
10364           else
10365 21320         ret = reg_node(pRExC_state, REG_ANY);
10366 2322         *flagp |= HASWIDTH|SIMPLE;
10367 2322         RExC_naughty++;
10368 18803348         Set_Node_Length(ret, 1); /* MJD */
10369           break;
10370           case '[':
10371           {
10372 156956         char * const oregcomp_parse = ++RExC_parse;
10373 156956         ret = regclass(pRExC_state, flagp,depth+1,
10374           FALSE, /* means parse the whole char class */
10375           TRUE, /* allow multi-char folds */
10376           FALSE, /* don't silence non-portable warnings. */
10377           NULL);
10378 156956         if (*RExC_parse != ']') {
10379 156920         RExC_parse = oregcomp_parse;
10380 247160         vFAIL("Unmatched [");
10381           }
10382 247196         if (ret == NULL) {
10383 247160         if (*flagp & RESTART_UTF8)
10384           return NULL;
10385 247160         FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10386           (UV) *flagp);
10387           }
10388 56         nextchar(pRExC_state);
10389 56         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10390           break;
10391           }
10392           case '(':
10393 284         nextchar(pRExC_state);
10394 284         ret = reg(pRExC_state, 2, &flags,depth+1);
10395 284         if (ret == NULL) {
10396 32616         if (flags & TRYAGAIN) {
10397 32616         if (RExC_parse == RExC_end) {
10398           /* Make parent create an empty node if needed. */
10399 32616         *flagp |= TRYAGAIN;
10400 32616         return(NULL);
10401           }
10402           goto tryagain;
10403           }
10404 904084         if (flags & RESTART_UTF8) {
10405 904084         *flagp = RESTART_UTF8;
10406 904084         return NULL;
10407           }
10408 904084         FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10409           }
10410 584         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10411 584         break;
10412           case '|':
10413           case ')':
10414 320         if (flags & TRYAGAIN) {
10415 320         *flagp |= TRYAGAIN;
10416 11212         return NULL;
10417           }
10418 11212         vFAIL("Internal urp");
10419           /* Supposed to be caught earlier. */
10420           break;
10421           case '{':
10422 118900         if (!regcurly(RExC_parse, FALSE)) {
10423 1049454         RExC_parse++;
10424 1049454         goto defchar;
10425           }
10426           /* FALL THROUGH */
10427           case '?':
10428           case '+':
10429           case '*':
10430 2098908         RExC_parse++;
10431 1049454         vFAIL("Quantifier follows nothing");
10432           break;
10433           case '\\':
10434           /* Special Escapes
10435            
10436           This switch handles escape sequences that resolve to some kind
10437           of special regop and not to literal text. Escape sequnces that
10438           resolve to literal text are handled below in the switch marked
10439           "Literal Escapes".
10440            
10441           Every entry in this switch *must* have a corresponding entry
10442           in the literal escape switch. However, the opposite is not
10443           required, as the default for this switch is to jump to the
10444           literal text handling code.
10445           */
10446 1049530         switch ((U8)*++RExC_parse) {
10447           U8 arg;
10448           /* Special Escapes */
10449           case 'A':
10450 2098908         RExC_seen_zerolen++;
10451 1049454         ret = reg_node(pRExC_state, SBOL);
10452 1049454         *flagp |= SIMPLE;
10453 6         goto finish_meta_pat;
10454           case 'G':
10455 5768         ret = reg_node(pRExC_state, GPOS);
10456 5768         RExC_seen |= REG_SEEN_GPOS;
10457 11536         *flagp |= SIMPLE;
10458 5768         goto finish_meta_pat;
10459           case 'K':
10460 5768         RExC_seen_zerolen++;
10461 11536         ret = reg_node(pRExC_state, KEEPS);
10462 5768         *flagp |= SIMPLE;
10463           /* XXX:dmq : disabling in-place substitution seems to
10464           * be necessary here to avoid cases of memory corruption, as
10465           * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10466           */
10467 5768         RExC_seen |= REG_SEEN_LOOKBEHIND;
10468 37124         goto finish_meta_pat;
10469           case 'Z':
10470 43196         ret = reg_node(pRExC_state, SEOL);
10471 43196         *flagp |= SIMPLE;
10472 109796         RExC_seen_zerolen++; /* Do not optimize RE away */
10473 5524732         goto finish_meta_pat;
10474           case 'z':
10475 2762366         ret = reg_node(pRExC_state, EOS);
10476 2767498         *flagp |= SIMPLE;
10477 398334         RExC_seen_zerolen++; /* Do not optimize RE away */
10478 2767498         goto finish_meta_pat;
10479           case 'C':
10480 2767498         ret = reg_node(pRExC_state, CANY);
10481 1383749         RExC_seen |= REG_SEEN_CANY;
10482 2767498         *flagp |= HASWIDTH|SIMPLE;
10483 5218248         goto finish_meta_pat;
10484           case 'X':
10485 5218248         ret = reg_node(pRExC_state, CLUMP);
10486 410346         *flagp |= HASWIDTH;
10487 410346         goto finish_meta_pat;
10488            
10489           case 'W':
10490           invert = 1;
10491           /* FALLTHROUGH */
10492           case 'w':
10493           arg = ANYOF_WORDCHAR;
10494           goto join_posix;
10495            
10496           case 'b':
10497 410340         RExC_seen_zerolen++;
10498 0         RExC_seen |= REG_SEEN_LOOKBEHIND;
10499 410340         op = BOUND + get_regex_charset(RExC_flags);
10500 410340         if (op > BOUNDA) { /* /aa is same as /a */
10501           op = BOUNDA;
10502           }
10503 410340         ret = reg_node(pRExC_state, op);
10504 4774         FLAGS(ret) = get_regex_charset(RExC_flags);
10505 4774         *flagp |= SIMPLE;
10506 410         if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10507 408         ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10508           }
10509           goto finish_meta_pat;
10510           case 'B':
10511 408         RExC_seen_zerolen++;
10512 11346         RExC_seen |= REG_SEEN_LOOKBEHIND;
10513 11346         op = NBOUND + get_regex_charset(RExC_flags);
10514 2         if (op > NBOUNDA) { /* /aa is same as /a */
10515           op = NBOUNDA;
10516           }
10517 2         ret = reg_node(pRExC_state, op);
10518 11344         FLAGS(ret) = get_regex_charset(RExC_flags);
10519 11344         *flagp |= SIMPLE;
10520 11120         if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10521 11120         ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10522           }
10523           goto finish_meta_pat;
10524            
10525           case 'D':
10526           invert = 1;
10527           /* FALLTHROUGH */
10528           case 'd':
10529           arg = ANYOF_DIGIT;
10530           goto join_posix;
10531            
10532           case 'R':
10533 0         ret = reg_node(pRExC_state, LNBREAK);
10534 11120         *flagp |= HASWIDTH|SIMPLE;
10535 5504         goto finish_meta_pat;
10536            
10537           case 'H':
10538           invert = 1;
10539           /* FALLTHROUGH */
10540           case 'h':
10541           arg = ANYOF_BLANK;
10542           op = POSIXU;
10543           goto join_posix_op_known;
10544            
10545           case 'V':
10546           invert = 1;
10547           /* FALLTHROUGH */
10548           case 'v':
10549           arg = ANYOF_VERTWS;
10550           op = POSIXU;
10551           goto join_posix_op_known;
10552            
10553           case 'S':
10554           invert = 1;
10555           /* FALLTHROUGH */
10556           case 's':
10557           arg = ANYOF_SPACE;
10558            
10559           join_posix:
10560            
10561 5504         op = POSIXD + get_regex_charset(RExC_flags);
10562 5504         if (op > POSIXA) { /* /aa is same as /a */
10563           op = POSIXA;
10564           }
10565            
10566           join_posix_op_known:
10567            
10568 11120         if (invert) {
10569 19800         op += NPOSIXD - POSIXD;
10570           }
10571            
10572 11120         ret = reg_node(pRExC_state, op);
10573 11120         if (! SIZE_ONLY) {
10574 11120         FLAGS(ret) = namedclass_to_classnum(arg);
10575           }
10576            
10577 47912         *flagp |= HASWIDTH|SIMPLE;
10578           /* FALL THROUGH */
10579            
10580           finish_meta_pat:
10581 47912         nextchar(pRExC_state);
10582 1700         Set_Node_Length(ret, 2); /* MJD */
10583           break;
10584           case 'p':
10585           case 'P':
10586           {
10587           #ifdef DEBUGGING
10588 1700         char* parse_start = RExC_parse - 2;
10589           #endif
10590            
10591 788         RExC_parse--;
10592            
10593 1700         ret = regclass(pRExC_state, flagp,depth+1,
10594           TRUE, /* means just parse this element */
10595           FALSE, /* don't allow multi-char folds */
10596           FALSE, /* don't silence non-portable warnings.
10597           It would be a bug if these returned
10598           non-portables */
10599           NULL);
10600           /* regclass() can only return RESTART_UTF8 if multi-char folds
10601           are allowed. */
10602 684         if (!ret)
10603 1700         FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10604           (UV) *flagp);
10605            
10606 580         RExC_parse--;
10607            
10608 580         Set_Node_Offset(ret, parse_start + 2);
10609 580         Set_Node_Cur_Length(ret, parse_start);
10610 47332         nextchar(pRExC_state);
10611           }
10612 47332         break;
10613           case 'N':
10614           /* Handle \N and \N{NAME} with multiple code points here and not
10615           * below because it can be multicharacter. join_exact() will join
10616           * them up later on. Also this makes sure that things like
10617           * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10618           * The options to the grok function call causes it to fail if the
10619           * sequence is just a single code point. We then go treat it as
10620           * just another character in the current EXACT node, and hence it
10621           * gets uniform treatment with all the other characters. The
10622           * special treatment for quantifiers is not needed for such single
10623           * character sequences */
10624 58         ++RExC_parse;
10625 56         if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10626           FALSE /* not strict */ )) {
10627 2         if (*flagp & RESTART_UTF8)
10628           return NULL;
10629 47274         RExC_parse--;
10630 656         goto defchar;
10631           }
10632           break;
10633           case 'k': /* Handle \k and \k'NAME' */
10634           parse_named_seq:
10635           {
10636 656         char ch= RExC_parse[1];
10637 28         if (ch != '<' && ch != '\'' && ch != '{') {
10638 47246         RExC_parse++;
10639 85784         vFAIL2("Sequence %.2s... not terminated",parse_start);
10640           } else {
10641           /* this pretty much dupes the code for (?P=...) in reg(), if
10642           you change this make sure you change that */
10643 43056         char* name_start = (RExC_parse += 2);
10644           U32 num = 0;
10645 42728         SV *sv_dat = reg_scan_name(pRExC_state,
10646           SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10647 166         ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10648 2         if (RExC_parse == name_start || *RExC_parse != ch)
10649 164         vFAIL2("Sequence %.3s... not terminated",parse_start);
10650            
10651 42726         if (!SIZE_ONLY) {
10652 21363         num = add_data( pRExC_state, 1, "S" );
10653 120         RExC_rxi->data->data[num]=(void*)sv_dat;
10654 42606         SvREFCNT_inc_simple_void(sv_dat);
10655           }
10656            
10657 52344         RExC_sawback = 1;
10658 42606         ret = reganode(pRExC_state,
10659           ((! FOLD)
10660           ? NREF
10661           : (ASCII_FOLD_RESTRICTED)
10662           ? NREFFA
10663           : (AT_LEAST_UNI_SEMANTICS)
10664           ? NREFFU
10665           : (LOC)
10666           ? NREFFL
10667           : NREFF),
10668           num);
10669 42606         *flagp |= HASWIDTH;
10670            
10671           /* override incorrect value set in reganode MJD */
10672 42606         Set_Node_Offset(ret, parse_start+1);
10673 42606         Set_Node_Cur_Length(ret, parse_start);
10674 42         nextchar(pRExC_state);
10675            
10676           }
10677 2         break;
10678           }
10679           case 'g':
10680           case '1': case '2': case '3': case '4':
10681           case '5': case '6': case '7': case '8': case '9':
10682           {
10683           I32 num;
10684 101836         bool isg = *RExC_parse == 'g';
10685           bool isrel = 0;
10686           bool hasbrace = 0;
10687 0         if (isg) {
10688 15532552         RExC_parse++;
10689 28670082         if (*RExC_parse == '{') {
10690 28670082         RExC_parse++;
10691           hasbrace = 1;
10692           }
10693 28670082         if (*RExC_parse == '-') {
10694 28670082         RExC_parse++;
10695           isrel = 1;
10696           }
10697 32548546         if (hasbrace && !isDIGIT(*RExC_parse)) {
10698 150672493         if (isrel) RExC_parse--;
10699 204049195         RExC_parse -= 2;
10700 107861312         goto parse_named_seq;
10701           } }
10702 130149556         num = atoi(RExC_parse);
10703 130149556         if (isg && num == 0) {
10704 11881294         if (*RExC_parse == '0') {
10705 130149556         vFAIL("Reference to invalid group 0");
10706           }
10707           else {
10708 24485600         vFAIL("Unterminated \\g... pattern");
10709           }
10710           }
10711 1208504         if (isrel) {
10712 1208504         num = RExC_npar - num;
10713 6341112         if (num < 1)
10714 6341112         vFAIL("Reference to nonexistent or unclosed group");
10715           }
10716 6341112         if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
10717           /* Probably a character specified in octal, e.g. \35 */
10718           goto defchar;
10719           else {
10720           #ifdef RE_TRACK_PATTERN_OFFSETS
10721 4718         char * const parse_start = RExC_parse - 1; /* MJD */
10722           #endif
10723 4718         while (isDIGIT(*RExC_parse))
10724 1632         RExC_parse++;
10725 0         if (hasbrace) {
10726 1632         if (*RExC_parse != '}')
10727 1632         vFAIL("Unterminated \\g{...} pattern");
10728 3002         RExC_parse++;
10729           }
10730 3002         if (!SIZE_ONLY) {
10731 390         if (num > (I32)RExC_rx->nparens)
10732 40684         vFAIL("Reference to nonexistent group");
10733           }
10734 40684         RExC_sawback = 1;
10735 40684         ret = reganode(pRExC_state,
10736           ((! FOLD)
10737           ? REF
10738           : (ASCII_FOLD_RESTRICTED)
10739           ? REFFA
10740           : (AT_LEAST_UNI_SEMANTICS)
10741           ? REFFU
10742           : (LOC)
10743           ? REFFL
10744           : REFF),
10745           num);
10746 102352         *flagp |= HASWIDTH;
10747            
10748           /* override incorrect value set in reganode MJD */
10749 102352         Set_Node_Offset(ret, parse_start+1);
10750 102352         Set_Node_Cur_Length(ret, parse_start);
10751 19276         RExC_parse--;
10752 19276         nextchar(pRExC_state);
10753           }
10754           }
10755 19276         break;
10756           case '\0':
10757 10560         if (RExC_parse >= RExC_end)
10758 10560         FAIL("Trailing \\");
10759           /* FALL THROUGH */
10760           default:
10761           /* Do not generate "unrecognized" warnings here, we fall
10762           back into the quick-grab loop below */
10763 10636         parse_start--;
10764 10464         goto defchar;
10765           }
10766           break;
10767            
10768           case '#':
10769 10388         if (RExC_flags & RXf_PMf_EXTENDED) {
10770 10388         if ( reg_skipcomment( pRExC_state ) )
10771           goto tryagain;
10772           }
10773           /* FALL THROUGH */
10774            
10775           default:
10776            
10777 782         parse_start = RExC_parse - 1;
10778            
10779 782         RExC_parse++;
10780            
10781           defchar: {
10782           STRLEN len = 0;
10783 410         UV ender = 0;
10784           char *p;
10785           char *s;
10786           #define MAX_NODE_STRING_SIZE 127
10787           char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10788           char *s0;
10789           U8 upper_parse = MAX_NODE_STRING_SIZE;
10790           STRLEN foldlen;
10791 410         U8 node_type = compute_EXACTish(pRExC_state);
10792           bool next_is_quantifier;
10793           char * oldp = NULL;
10794            
10795           /* We can convert EXACTF nodes to EXACTFU if they contain only
10796           * characters that match identically regardless of the target
10797           * string's UTF8ness. The reason to do this is that EXACTF is not
10798           * trie-able, EXACTFU is. (We don't need to figure this out until
10799           * pass 2) */
10800 848         bool maybe_exactfu = node_type == EXACTF && PASS2;
10801            
10802           /* If a folding node contains only code points that don't
10803           * participate in folds, it can be changed into an EXACT node,
10804           * which allows the optimizer more things to look for */
10805           bool maybe_exact;
10806            
10807 848         ret = reg_node(pRExC_state, node_type);
10808            
10809           /* In pass1, folded, we use a temporary buffer instead of the
10810           * actual node, as the node doesn't exist yet */
10811 848         s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10812            
10813           s0 = s;
10814            
10815           reparse:
10816            
10817           /* We do the EXACTFish to EXACT node only if folding, and not if in
10818           * locale, as whether a character folds or not isn't known until
10819           * runtime. (And we don't need to figure this out until pass 2) */
10820 800         maybe_exact = FOLD && ! LOC && PASS2;
10821            
10822           /* XXX The node can hold up to 255 bytes, yet this only goes to
10823           * 127. I (khw) do not know why. Keeping it somewhat less than
10824           * 255 allows us to not have to worry about overflow due to
10825           * converting to utf8 and fold expansion, but that value is
10826           * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10827           * split up by this limit into a single one using the real max of
10828           * 255. Even at 127, this breaks under rare circumstances. If
10829           * folding, we do not want to split a node at a character that is a
10830           * non-final in a multi-char fold, as an input string could just
10831           * happen to want to match across the node boundary. The join
10832           * would solve that problem if the join actually happens. But a
10833           * series of more than two nodes in a row each of 127 would cause
10834           * the first join to succeed to get to 254, but then there wouldn't
10835           * be room for the next one, which could at be one of those split
10836           * multi-char folds. I don't know of any fool-proof solution. One
10837           * could back off to end with only a code point that isn't such a
10838           * non-final, but it is possible for there not to be any in the
10839           * entire node. */
10840 1886380         for (p = RExC_parse - 1;
10841 1887200         len < upper_parse && p < RExC_end;
10842 1885580         len++)
10843           {
10844 1136         oldp = p;
10845            
10846 1136         if (RExC_flags & RXf_PMf_EXTENDED)
10847 1884756         p = regwhite( pRExC_state, p );
10848 1885888         switch ((U8)*p) {
10849           case '^':
10850           case '$':
10851           case '.':
10852           case '[':
10853           case '(':
10854           case ')':
10855           case '|':
10856           goto loopdone;
10857           case '\\':
10858           /* Literal Escapes Switch
10859            
10860           This switch is meant to handle escape sequences that
10861           resolve to a literal character.
10862            
10863           Every escape sequence that represents something
10864           else, like an assertion or a char class, is handled
10865           in the switch marked 'Special Escapes' above in this
10866           routine, but also has an entry here as anything that
10867           isn't explicitly mentioned here will be treated as
10868           an unescaped equivalent literal.
10869           */
10870            
10871 1884584         switch ((U8)*++p) {
10872           /* These are all the special escapes. */
10873           case 'A': /* Start assertion */
10874           case 'b': case 'B': /* Word-boundary assertion*/
10875           case 'C': /* Single char !DANGEROUS! */
10876           case 'd': case 'D': /* digit class */
10877           case 'g': case 'G': /* generic-backref, pos assertion */
10878           case 'h': case 'H': /* HORIZWS */
10879           case 'k': case 'K': /* named backref, keep marker */
10880           case 'p': case 'P': /* Unicode property */
10881           case 'R': /* LNBREAK */
10882           case 's': case 'S': /* space class */
10883           case 'v': case 'V': /* VERTWS */
10884           case 'w': case 'W': /* word class */
10885           case 'X': /* eXtended Unicode "combining character sequence" */
10886           case 'z': case 'Z': /* End of line/string assertion */
10887 991212         --p;
10888 37652         goto loopdone;
10889            
10890           /* Anything after here is an escape that resolves to a
10891           literal. (Except digits, which may or may not)
10892           */
10893           case 'n':
10894 37652         ender = '\n';
10895 37624         p++;
10896 64         break;
10897           case 'N': /* Handle a single-code point named character. */
10898           /* The options cause it to fail if a multiple code
10899           * point sequence. Handle those in the switch() above
10900           * */
10901 64         RExC_parse = p + 1;
10902 22296         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10903           flagp, depth, FALSE,
10904           FALSE /* not strict */ ))
10905           {
10906 1764         if (*flagp & RESTART_UTF8)
10907 1764         FAIL("panic: grok_bslash_N set RESTART_UTF8");
10908 65914         RExC_parse = p = oldp;
10909 65914         goto loopdone;
10910           }
10911 65914         p = RExC_parse;
10912 65914         if (ender > 0xff) {
10913 514         REQUIRE_UTF8;
10914           }
10915           break;
10916           case 'r':
10917 65764         ender = '\r';
10918 65764         p++;
10919 32882         break;
10920           case 't':
10921 9700         ender = '\t';
10922 3414         p++;
10923 4         break;
10924           case 'f':
10925 65764         ender = '\f';
10926 300         p++;
10927 300         break;
10928           case 'e':
10929 300         ender = ASCII_TO_NATIVE('\033');
10930 300         p++;
10931 0         break;
10932           case 'a':
10933 300         ender = '\a';
10934 80         p++;
10935 0         break;
10936           case 'o':
10937           {
10938           UV result;
10939           const char* error_msg;
10940            
10941 14757394         bool valid = grok_bslash_o(&p,
10942           &result,
10943           &error_msg,
10944           TRUE, /* out warnings */
10945           FALSE, /* not strict */
10946           TRUE, /* Output warnings
10947           for non-
10948           portables */
10949           UTF);
10950 32         if (! valid) {
10951 32         RExC_parse = p; /* going to die anyway; point
10952           to exact spot of failure */
10953 89353930         vFAIL(error_msg);
10954           }
10955 44592865         ender = result;
10956 2233732         if (PL_encoding && ender < 0x100) {
10957           goto recode_encoding;
10958           }
10959 2233654         if (ender > 0xff) {
10960 4         REQUIRE_UTF8;
10961           }
10962           break;
10963           }
10964           case 'x':
10965           {
10966 104295288         UV result = UV_MAX; /* initialize to erroneous
10967           value */
10968           const char* error_msg;
10969            
10970 183964         bool valid = grok_bslash_x(&p,
10971           &result,
10972           &error_msg,
10973           TRUE, /* out warnings */
10974           FALSE, /* not strict */
10975           TRUE, /* Output warnings
10976           for non-
10977           portables */
10978           UTF);
10979 183964         if (! valid) {
10980 103927400         RExC_parse = p; /* going to die anyway; point
10981           to exact spot of failure */
10982 112317160         vFAIL(error_msg);
10983           }
10984 8621502         ender = result;
10985            
10986 112317200         if (PL_encoding && ender < 0x100) {
10987           goto recode_encoding;
10988           }
10989 932708         if (ender > 0xff) {
10990 932708         REQUIRE_UTF8;
10991           }
10992           break;
10993           }
10994           case 'c':
10995 111384492         p++;
10996 100926454         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10997 264068         break;
10998           case '8': case '9': /* must be a backreference */
10999 264068         --p;
11000 264068         goto loopdone;
11001           case '1': case '2': case '3':case '4':
11002           case '5': case '6': case '7':
11003           /* When we parse backslash escapes there is ambiguity
11004           * between backreferences and octal escapes. Any escape
11005           * from \1 - \9 is a backreference, any multi-digit
11006           * escape which does not start with 0 and which when
11007           * evaluated as decimal could refer to an already
11008           * parsed capture buffer is a backslash. Anything else
11009           * is octal.
11010           *
11011           * Note this implies that \118 could be interpreted as
11012           * 118 OR as "\11" . "8" depending on whether there
11013           * were 118 capture buffers defined already in the
11014           * pattern. */
11015 264068         if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
11016           { /* Not to be treated as an octal constant, go
11017           find backref */
11018 264068         --p;
11019 100662386         goto loopdone;
11020           }
11021           case '0':
11022           {
11023 10458038         I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11024 2056604         STRLEN numlen = 3;
11025 7137372         ender = grok_oct(p, &numlen, &flags, NULL);
11026 2858452         if (ender > 0xff) {
11027 1250202         REQUIRE_UTF8;
11028           }
11029 1238714         p += numlen;
11030 1220638         if (SIZE_ONLY /* like \08, \178 */
11031 942760         && numlen < 3
11032 147310         && p < RExC_end
11033 7137372         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11034           {
11035 4980999         reg_warn_non_literal_string(
11036           p + 1,
11037           form_short_octal_warning(p, numlen));
11038           }
11039           }
11040 473660         if (PL_encoding && ender < 0x100)
11041           goto recode_encoding;
11042           break;
11043           recode_encoding:
11044 439560         if (! RExC_override_recoding) {
11045 439560         SV* enc = PL_encoding;
11046 34100         ender = reg_recode((const char)(U8)ender, &enc);
11047 34100         if (!enc && SIZE_ONLY)
11048 34100         ckWARNreg(p, "Invalid escape in the specified encoding");
11049 5636952         REQUIRE_UTF8;
11050           }
11051           break;
11052           case '\0':
11053 2847006         if (p >= RExC_end)
11054 1324192         FAIL("Trailing \\");
11055           /* FALL THROUGH */
11056           default:
11057 1017300         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11058           /* Include any { following the alpha to emphasize
11059           * that it could be part of an escape at some point
11060           * in the future */
11061 40         int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11062 40         ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11063           }
11064           goto normal_default;
11065           } /* End of switch on '\' */
11066           break;
11067           default: /* A literal character */
11068            
11069 788         if (! SIZE_ONLY
11070 408         && RExC_flags & RXf_PMf_EXTENDED
11071 1017244         && ckWARN_d(WARN_DEPRECATED)
11072 2847006         && is_PATWS_non_low(p, UTF))
11073           {
11074 3320666         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11075           "Escape literal pattern white space under /x");
11076           }
11077            
11078           normal_default:
11079 3321510         if (UTF8_IS_START(*p) && UTF) {
11080           STRLEN numlen;
11081 111384532         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11082           &numlen, UTF8_ALLOW_DEFAULT);
11083 3523220         p += numlen;
11084           }
11085           else
11086 3523944         ender = (U8) *p++;
11087           break;
11088           } /* End of switch on the literal */
11089            
11090           /* Here, have looked at the literal character and
11091           * contains its ordinal,

points to the character after it

11092           */
11093            
11094 6382706         if ( RExC_flags & RXf_PMf_EXTENDED)
11095 3616         p = regwhite( pRExC_state, p );
11096            
11097           /* If the next thing is a quantifier, it applies to this
11098           * character only, which means that this character has to be in
11099           * its own node and can't just be appended to the string in an
11100           * existing node, so if there are already other characters in
11101           * the node, close the node with just them, and set up to do
11102           * this character again next time through, when it will be the
11103           * only thing in its new node */
11104 6220         if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11105           {
11106 117048         p = oldp;
11107 2700         goto loopdone;
11108           }
11109            
11110 1736         if (! FOLD) {
11111 594         if (UTF) {
11112 972         const STRLEN unilen = reguni(pRExC_state, ender, s);
11113 1478         if (unilen > 0) {
11114 1020         s += unilen;
11115 1430         len += unilen;
11116           }
11117            
11118           /* The loop increments each time, as all but this
11119           * path (and one other) through it add a single byte to
11120           * the EXACTish node. But this one has changed len to
11121           * be the correct final value, so subtract one to
11122           * cancel out the increment that follows */
11123 972         len--;
11124           }
11125           else {
11126 584         REGC((char)ender, s++);
11127           }
11128           }
11129 228         else /* FOLD */ if (! ( UTF
11130           /* See comments for join_exact() as to why we fold this
11131           * non-UTF at compile time */
11132           || (node_type == EXACTFU
11133 0         && ender == LATIN_SMALL_LETTER_SHARP_S)))
11134           {
11135 276         if (IS_IN_SOME_FOLD_L1(ender)) {
11136           maybe_exact = FALSE;
11137            
11138           /* See if the character's fold differs between /d and
11139           * /u. This includes the multi-char fold SHARP S to
11140           * 'ss' */
11141 220         if (maybe_exactfu
11142 1002         && (PL_fold[ender] != PL_fold_latin1[ender]
11143 3702         || ender == LATIN_SMALL_LETTER_SHARP_S
11144 986         || (len > 0
11145 2796         && isARG2_lower_or_UPPER_ARG1('s', ender)
11146 74         && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11147           {
11148           maybe_exactfu = FALSE;
11149           }
11150           }
11151 244         *(s++) = (char) ender;
11152           }
11153           else { /* UTF */
11154            
11155           /* Prime the casefolded buffer. Locale rules, which apply
11156           * only to code points < 256, aren't known until execution,
11157           * so for them, just output the original character using
11158           * utf8. If we start to fold non-UTF patterns, be sure to
11159           * update join_exact() */
11160 16         if (LOC && ender < 256) {
11161 48         if (NATIVE_IS_INVARIANT(ender)) {
11162 48         *s = (U8) ender;
11163 28359676         foldlen = 1;
11164           } else {
11165 0         *s = UTF8_TWO_BYTE_HI(ender);
11166 28359676         *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11167 7556628         foldlen = 2;
11168           }
11169           }
11170           else {
11171 2192202         UV folded = _to_uni_fold_flags(
11172           ender,
11173           (U8 *) s,
11174           &foldlen,
11175           FOLD_FLAGS_FULL
11176           | ((LOC) ? FOLD_FLAGS_LOCALE
11177           : (ASCII_FOLD_RESTRICTED)
11178           ? FOLD_FLAGS_NOMIX_ASCII
11179           : 0)
11180           );
11181            
11182           /* If this node only contains non-folding code points
11183           * so far, see if this new one is also non-folding */
11184 5364426         if (maybe_exact) {
11185 308424         if (folded != ender) {
11186           maybe_exact = FALSE;
11187           }
11188           else {
11189           /* Here the fold is the original; we have
11190           * to check further to see if anything
11191           * folds to it */
11192 28359676         if (! PL_utf8_foldable) {
11193 28359676         SV* swash = swash_init("utf8",
11194           "_Perl_Any_Folds",
11195           &PL_sv_undef, 1, 0);
11196 28359676         PL_utf8_foldable =
11197 28359648         _get_swash_invlist(swash);
11198 28359648         SvREFCNT_dec_NN(swash);
11199           }
11200 0         if (_invlist_contains_cp(PL_utf8_foldable,
11201           ender))
11202           {
11203           maybe_exact = FALSE;
11204           }
11205           }
11206           }
11207 76199659         ender = folded;
11208           }
11209 20504202         s += foldlen;
11210            
11211           /* The loop increments each time, as all but this
11212           * path (and one other) through it add a single byte to the
11213           * EXACTish node. But this one has changed len to be the
11214           * correct final value, so subtract one to cancel out the
11215           * increment that follows */
11216 20504202         len += foldlen - 1;
11217           }
11218            
11219 34974671         if (next_is_quantifier) {
11220            
11221           /* Here, the next input is a quantifier, and to get here,
11222           * the current character is the only one in the node.
11223           * Also, here doesn't include the final byte for this
11224           * character */
11225 24920534         len++;
11226 4415500         goto loopdone;
11227           }
11228            
11229           } /* End of loop through literal characters */
11230            
11231           /* Here we have either exhausted the input or ran out of room in
11232           * the node. (If we encountered a character that can't be in the
11233           * node, transfer is made directly to , and so we
11234           * wouldn't have fallen off the end of the loop.) In the latter
11235           * case, we artificially have to split the node into two, because
11236           * we just don't have enough space to hold everything. This
11237           * creates a problem if the final character participates in a
11238           * multi-character fold in the non-final position, as a match that
11239           * should have occurred won't, due to the way nodes are matched,
11240           * and our artificial boundary. So back off until we find a non-
11241           * problematic character -- one that isn't at the beginning or
11242           * middle of such a fold. (Either it doesn't participate in any
11243           * folds, or appears only in the final position of all the folds it
11244           * does participate in.) A better solution with far fewer false
11245           * positives, and that would fill the nodes more completely, would
11246           * be to actually have available all the multi-character folds to
11247           * test against, and to back-off only far enough to be sure that
11248           * this node isn't ending with a partial one. is set
11249           * further below (if we need to reparse the node) to include just
11250           * up through that final non-problematic character that this code
11251           * identifies, so when it is set to less than the full node, we can
11252           * skip the rest of this */
11253 20505122         if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11254            
11255           const STRLEN full_len = len;
11256            
11257 644176         assert(len >= MAX_NODE_STRING_SIZE);
11258            
11259           /* Here, points to the final byte of the final character.
11260           * Look backwards through the string until find a non-
11261           * problematic character */
11262            
11263 618672         if (! UTF) {
11264            
11265           /* These two have no multi-char folds to non-UTF characters
11266           */
11267 25608         if (ASCII_FOLD_RESTRICTED || LOC) {
11268           goto loopdone;
11269           }
11270            
11271 2237886         while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11272 20504202         len = s - s0 + 1;
11273           }
11274           else {
11275 48952         if (! PL_NonL1NonFinalFold) {
11276 48952         PL_NonL1NonFinalFold = _new_invlist_C_array(
11277           NonL1_Perl_Non_Final_Folds_invlist);
11278           }
11279            
11280           /* Point to the first byte of the final character */
11281 121586         s = (char *) utf8_hop((U8 *) s, -1);
11282            
11283 97098         while (s >= s0) { /* Search backwards until find
11284           non-problematic char */
11285 48148         if (UTF8_IS_INVARIANT(*s)) {
11286            
11287           /* There are no ascii characters that participate
11288           * in multi-char folds under /aa. In EBCDIC, the
11289           * non-ascii invariants are all control characters,
11290           * so don't ever participate in any folds. */
11291 48950         if (ASCII_FOLD_RESTRICTED
11292 220         || ! IS_NON_FINAL_FOLD(*s))
11293           {
11294           break;
11295           }
11296           }
11297 220         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11298            
11299           /* No Latin1 characters participate in multi-char
11300           * folds under /l */
11301 212         if (LOC
11302 10         || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11303           *s, *(s+1))))
11304           {
11305           break;
11306           }
11307           }
11308 24080         else if (! _invlist_contains_cp(
11309           PL_NonL1NonFinalFold,
11310           valid_utf8_to_uvchr((U8 *) s, NULL)))
11311           {
11312           break;
11313           }
11314            
11315           /* Here, the current character is problematic in that
11316           * it does occur in the non-final position of some
11317           * fold, so try the character before it, but have to
11318           * special case the very first byte in the string, so
11319           * we don't read outside the string */
11320 48952         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11321           } /* End of loop backwards through the string */
11322            
11323           /* If there were only problematic characters in the string,
11324           * will point to before s0, in which case the length
11325           * should be 0, otherwise include the length of the
11326           * non-problematic character just found */
11327 89316         len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11328           }
11329            
11330           /* Here, have found the final character, if any, that is
11331           * non-problematic as far as ending the node without splitting
11332           * it across a potential multi-char fold. contains the
11333           * number of bytes in the node up-to and including that
11334           * character, or is 0 if there is no such character, meaning
11335           * the whole node contains only problematic characters. In
11336           * this case, give up and just take the node as-is. We can't
11337           * do any better */
11338 30248         if (len == 0) {
11339           len = full_len;
11340            
11341           /* If the node ends in an 's' we make sure it stays EXACTF,
11342           * as if it turns into an EXACTFU, it could later get
11343           * joined with another 's' that would then wrongly match
11344           * the sharp s */
11345 59544         if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11346           {
11347           maybe_exactfu = FALSE;
11348           }
11349           } else {
11350            
11351           /* Here, the node does contain some characters that aren't
11352           * problematic. If one such is the final character in the
11353           * node, we are done */
11354 59544         if (len == full_len) {
11355           goto loopdone;
11356           }
11357 400740         else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11358            
11359           /* If the final character is problematic, but the
11360           * penultimate is not, back-off that last character to
11361           * later start a new node with it */
11362 341196         p = oldp;
11363 59544         goto loopdone;
11364           }
11365            
11366           /* Here, the final non-problematic character is earlier
11367           * in the input than the penultimate character. What we do
11368           * is reparse from the beginning, going up only as far as
11369           * this final ok one, thus guaranteeing that the node ends
11370           * in an acceptable character. The reason we reparse is
11371           * that we know how far in the character is, but we don't
11372           * know how to correlate its position with the input parse.
11373           * An alternate implementation would be to build that
11374           * correlation as we go along during the original parse,
11375           * but that would entail extra work for every node, whereas
11376           * this code gets executed only when the string is too
11377           * large for the node, and the final two characters are
11378           * problematic, an infrequent occurrence. Yet another
11379           * possible strategy would be to save the tail of the
11380           * string, and the next time regatom is called, initialize
11381           * with that. The problem with this is that unless you
11382           * back off one more character, you won't be guaranteed
11383           * regatom will get called again, unless regbranch,
11384           * regpiece ... are also changed. If you do back off that
11385           * extra character, so that there is input guaranteed to
11386           * force calling regatom, you can't handle the case where
11387           * just the first character in the node is acceptable. I
11388           * (khw) decided to try this method which doesn't have that
11389           * pitfall; if performance issues are found, we can do a
11390           * combination of the current approach plus that one */
11391 556         upper_parse = len;
11392           len = 0;
11393           s = s0;
11394 4         goto reparse;
11395           }
11396           } /* End of verifying node ends with an appropriate char */
11397            
11398           loopdone: /* Jumped to when encounters something that shouldn't be in
11399           the node */
11400            
11401           /* I (khw) don't know if you can get here with zero length, but the
11402           * old code handled this situation by creating a zero-length EXACT
11403           * node. Might as well be NOTHING instead */
11404 400         if (len == 0) {
11405 20         OP(ret) = NOTHING;
11406           }
11407           else {
11408 380         if (FOLD) {
11409           /* If 'maybe_exact' is still set here, means there are no
11410           * code points in the node that participate in folds;
11411           * similarly for 'maybe_exactfu' and code points that match
11412           * differently depending on UTF8ness of the target string
11413           * */
11414 620         if (maybe_exact) {
11415 58988         OP(ret) = EXACT;
11416           }
11417 59056         else if (maybe_exactfu) {
11418 58878         OP(ret) = EXACTFU;
11419           }
11420           }
11421 59220         alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11422           }
11423            
11424 59220         RExC_parse = p - 1;
11425 59200         Set_Node_Cur_Length(ret, parse_start);
11426 59200         nextchar(pRExC_state);
11427           {
11428           /* len is STRLEN which is unsigned, need to copy to signed */
11429 59200         IV iv = len;
11430 1350         if (iv < 0)
11431 56376         vFAIL("Internal disaster");
11432           }
11433            
11434           } /* End of label 'defchar:' */
11435           break;
11436           } /* End of giant switch on input character */
11437            
11438 1820         return(ret);
11439           }
11440            
11441           STATIC char *
11442 696         S_regwhite( RExC_state_t *pRExC_state, char *p )
11443           {
11444 824         const char *e = RExC_end;
11445            
11446 468         PERL_ARGS_ASSERT_REGWHITE;
11447            
11448 440         while (p < e) {
11449 5668         if (isSPACE(*p))
11450 952         ++p;
11451 3624         else if (*p == '#') {
11452           bool ended = 0;
11453           do {
11454 1764         if (*p++ == '\n') {
11455           ended = 1;
11456           break;
11457           }
11458 1860         } while (p < e);
11459 1860         if (!ended)
11460 42572         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11461           }
11462           else
11463           break;
11464           }
11465 41540         return p;
11466           }
11467            
11468           STATIC char *
11469 1124         S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11470           {
11471           /* Returns the next non-pattern-white space, non-comment character (the
11472           * latter only if 'recognize_comment is true) in the string p, which is
11473           * ended by RExC_end. If there is no line break ending a comment,
11474           * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11475 1412         const char *e = RExC_end;
11476            
11477 58824         PERL_ARGS_ASSERT_REGPATWS;
11478            
11479 64         while (p < e) {
11480           STRLEN len;
11481 58760         if ((len = is_PATWS_safe(p, e, UTF))) {
11482 43664         p += len;
11483           }
11484 20         else if (recognize_comment && *p == '#') {
11485           bool ended = 0;
11486           do {
11487 10         p++;
11488 0         if (is_LNBREAK_safe(p, e, UTF)) {
11489           ended = 1;
11490           break;
11491           }
11492 10         } while (p < e);
11493 144         if (!ended)
11494 4         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11495           }
11496           else
11497           break;
11498           }
11499 140         return p;
11500           }
11501            
11502           /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11503           Character classes ([:foo:]) can also be negated ([:^foo:]).
11504           Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11505           Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11506           but trigger failures because they are currently unimplemented. */
11507            
11508           #define POSIXCC_DONE(c) ((c) == ':')
11509           #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11510           #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11511            
11512           PERL_STATIC_INLINE I32
11513           S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11514           {
11515           dVAR;
11516           I32 namedclass = OOB_NAMEDCLASS;
11517            
11518 6138         PERL_ARGS_ASSERT_REGPPOSIXCC;
11519            
11520 6138         if (value == '[' && RExC_parse + 1 < RExC_end &&
11521           /* I smell either [: or [= or [. -- POSIX has been here, right? */
11522 6138         POSIXCC(UCHARAT(RExC_parse)))
11523           {
11524 6138         const char c = UCHARAT(RExC_parse);
11525 34         char* const s = RExC_parse++;
11526            
11527 98         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11528 34         RExC_parse++;
11529 34         if (RExC_parse == RExC_end) {
11530 30         if (strict) {
11531            
11532           /* Try to give a better location for the error (than the end of
11533           * the string) by looking for the matching ']' */
11534 2         RExC_parse = s;
11535 4         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11536 3071         RExC_parse++;
11537           }
11538 4         vFAIL2("Unmatched '%c' in POSIX class", c);
11539           }
11540           /* Grandfather lone [:, [=, [. */
11541 6         RExC_parse = s;
11542           }
11543           else {
11544 6260         const char* const t = RExC_parse++; /* skip over the c */
11545 6260         assert(*t == c);
11546            
11547 9390         if (UCHARAT(RExC_parse) == ']') {
11548 2         const char *posixcc = s + 1;
11549 6258         RExC_parse++; /* skip over the ending ] */
11550            
11551 6258         if (*s == ':') {
11552 3162         const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11553 3162         const I32 skip = t - posixcc;
11554            
11555           /* Initially switch on the length of the name. */
11556 8607         switch (skip) {
11557           case 4:
11558 7008         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11559           this is the Perl \w
11560           */
11561           namedclass = ANYOF_WORDCHAR;
11562           break;
11563           case 5:
11564           /* Names all of length 5. */
11565           /* alnum alpha ascii blank cntrl digit graph lower
11566           print punct space upper */
11567           /* Offset 4 gives the best switch position. */
11568 7008         switch (posixcc[4]) {
11569           case 'a':
11570 7008         if (memEQ(posixcc, "alph", 4)) /* alpha */
11571           namedclass = ANYOF_ALPHA;
11572           break;
11573           case 'e':
11574 28         if (memEQ(posixcc, "spac", 4)) /* space */
11575           namedclass = ANYOF_PSXSPC;
11576           break;
11577           case 'h':
11578 138         if (memEQ(posixcc, "grap", 4)) /* graph */
11579           namedclass = ANYOF_GRAPH;
11580           break;
11581           case 'i':
11582 138         if (memEQ(posixcc, "asci", 4)) /* ascii */
11583           namedclass = ANYOF_ASCII;
11584           break;
11585           case 'k':
11586 3104         if (memEQ(posixcc, "blan", 4)) /* blank */
11587           namedclass = ANYOF_BLANK;
11588           break;
11589           case 'l':
11590 3104         if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11591           namedclass = ANYOF_CNTRL;
11592           break;
11593           case 'm':
11594 3086         if (memEQ(posixcc, "alnu", 4)) /* alnum */
11595           namedclass = ANYOF_ALPHANUMERIC;
11596           break;
11597           case 'r':
11598 3104         if (memEQ(posixcc, "lowe", 4)) /* lower */
11599 0         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11600 3050         else if (memEQ(posixcc, "uppe", 4)) /* upper */
11601 12         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11602           break;
11603           case 't':
11604 3050         if (memEQ(posixcc, "digi", 4)) /* digit */
11605           namedclass = ANYOF_DIGIT;
11606 3050         else if (memEQ(posixcc, "prin", 4)) /* print */
11607           namedclass = ANYOF_PRINT;
11608 3104         else if (memEQ(posixcc, "punc", 4)) /* punct */
11609           namedclass = ANYOF_PUNCT;
11610           break;
11611           }
11612           break;
11613           case 6:
11614 3090         if (memEQ(posixcc, "xdigit", 6))
11615           namedclass = ANYOF_XDIGIT;
11616           break;
11617           }
11618            
11619 3090         if (namedclass == OOB_NAMEDCLASS)
11620 3088         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11621           t - s - 1, s + 1);
11622            
11623           /* The #defines are structured so each complement is +1 to
11624           * the normal one */
11625 3086         if (complement) {
11626 3086         namedclass++;
11627           }
11628 3086         assert (posixcc[skip] == ':');
11629 3086         assert (posixcc[skip+1] == ']');
11630 3864         } else if (!SIZE_ONLY) {
11631           /* [[=foo=]] and [[.foo.]] are still future. */
11632            
11633           /* adjust RExC_parse so the warning shows after
11634           the class closes */
11635 22         while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11636 3096         RExC_parse++;
11637 7910         vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11638           }
11639           } else {
11640           /* Maternal grandfather:
11641           * "[:" ending in ":" but not in ":]" */
11642 6362         if (strict) {
11643 6362         vFAIL("Unmatched '[' in POSIX class");
11644           }
11645            
11646           /* Grandfather lone [:, [=, [. */
11647 6362         RExC_parse = s;
11648           }
11649           }
11650           }
11651            
11652           return namedclass;
11653           }
11654            
11655           STATIC bool
11656 6362         S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11657           {
11658           /* This applies some heuristics at the current parse position (which should
11659           * be at a '[') to see if what follows might be intended to be a [:posix:]
11660           * class. It returns true if it really is a posix class, of course, but it
11661           * also can return true if it thinks that what was intended was a posix
11662           * class that didn't quite make it.
11663           *
11664           * It will return true for
11665           * [:alphanumerics:
11666           * [:alphanumerics] (as long as the ] isn't followed immediately by a
11667           * ')' indicating the end of the (?[
11668           * [:any garbage including %^&$ punctuation:]
11669           *
11670           * This is designed to be called only from S_handle_regex_sets; it could be
11671           * easily adapted to be called from the spot at the beginning of regclass()
11672           * that checks to see in a normal bracketed class if the surrounding []
11673           * have been omitted ([:word:] instead of [[:word:]]). But doing so would
11674           * change long-standing behavior, so I (khw) didn't do that */
11675 0         char* p = RExC_parse + 1;
11676 6362         char first_char = *p;
11677            
11678 3320         PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11679            
11680 14         assert(*(p - 1) == '[');
11681            
11682 14         if (! POSIXCC(first_char)) {
11683           return FALSE;
11684           }
11685            
11686 14         p++;
11687 14         while (p < RExC_end && isWORDCHAR(*p)) p++;
11688            
11689 14         if (p >= RExC_end) {
11690           return FALSE;
11691           }
11692            
11693 14         if (p - RExC_parse > 2 /* Got at least 1 word character */
11694 12         && (*p == first_char
11695 10         || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11696           {
11697           return TRUE;
11698           }
11699            
11700 10         p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11701            
11702 10         return (p
11703 10         && p - RExC_parse > 2 /* [:] evaluates to colon;
11704           [::] is a bad posix class. */
11705 10         && first_char == *(p - 1));
11706           }
11707            
11708           STATIC regnode *
11709 2         S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11710           char * const oregcomp_parse)
11711           {
11712           /* Handle the (?[...]) construct to do set operations */
11713            
11714           U8 curchar;
11715           UV start, end; /* End points of code point ranges */
11716           SV* result_string;
11717           char *save_end, *save_parse;
11718           SV* final;
11719           STRLEN len;
11720           regnode* node;
11721           AV* stack;
11722 0         const bool save_fold = FOLD;
11723            
11724 2         GET_RE_DEBUG_FLAGS_DECL;
11725            
11726 0         PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11727            
11728 2         if (LOC) {
11729 10         vFAIL("(?[...]) not valid in locale");
11730           }
11731 10         RExC_uni_semantics = 1;
11732            
11733           /* This will return only an ANYOF regnode, or (unlikely) something smaller
11734           * (such as EXACT). Thus we can skip most everything if just sizing. We
11735           * call regclass to handle '[]' so as to not have to reinvent its parsing
11736           * rules here (throwing away the size it computes each time). And, we exit
11737           * upon an unescaped ']' that isn't one ending a regclass. To do both
11738           * these things, we need to realize that something preceded by a backslash
11739           * is escaped, so we have to keep track of backslashes */
11740 10         if (SIZE_ONLY) {
11741           UV depth = 0; /* how many nested (?[...]) constructs */
11742            
11743 10         Perl_ck_warner_d(aTHX_
11744           packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11745           "The regex_sets feature is experimental" REPORT_LOCATION,
11746 10         (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11747            
11748 2         while (RExC_parse < RExC_end) {
11749 2         SV* current = NULL;
11750 116         RExC_parse = regpatws(pRExC_state, RExC_parse,
11751           TRUE); /* means recognize comments */
11752 0         switch (*RExC_parse) {
11753           case '?':
11754 96         if (RExC_parse[1] == '[') depth++, RExC_parse++;
11755           /* FALL THROUGH */
11756           default:
11757           break;
11758           case '\\':
11759           /* Skip the next byte (which could cause us to end up in
11760           * the middle of a UTF-8 character, but since none of those
11761           * are confusable with anything we currently handle in this
11762           * switch (invariants all), it's safe. We'll just hit the
11763           * default: case next time and keep on incrementing until
11764           * we find one of the invariants we do handle. */
11765 96         RExC_parse++;
11766 3034         break;
11767           case '[':
11768           {
11769           /* If this looks like it is a [:posix:] class, leave the
11770           * parse pointer at the '[' to fool regclass() into
11771           * thinking it is part of a '[[:posix:]]'. That function
11772           * will use strict checking to force a syntax error if it
11773           * doesn't work out to a legitimate class */
11774 3034         bool is_posix_class
11775           = could_it_be_a_POSIX_class(pRExC_state);
11776 3022         if (! is_posix_class) {
11777 3034         RExC_parse++;
11778           }
11779            
11780           /* regclass() can only return RESTART_UTF8 if multi-char
11781           folds are allowed. */
11782 0         if (!regclass(pRExC_state, flagp,depth+1,
11783           is_posix_class, /* parse the whole char
11784           class only if not a
11785           posix class */
11786           FALSE, /* don't allow multi-char folds */
11787           TRUE, /* silence non-portable warnings. */
11788           ¤t))
11789 3024         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11790           (UV) *flagp);
11791            
11792           /* function call leaves parse pointing to the ']', except
11793           * if we faked it */
11794 12         if (is_posix_class) {
11795 86         RExC_parse--;
11796           }
11797            
11798 84         SvREFCNT_dec(current); /* In case it returned something */
11799 84         break;
11800           }
11801            
11802           case ']':
11803 4         if (depth--) break;
11804 4         RExC_parse++;
11805 82         if (RExC_parse < RExC_end
11806 82         && *RExC_parse == ')')
11807           {
11808 10         node = reganode(pRExC_state, ANYOF, 0);
11809 10         RExC_size += ANYOF_SKIP;
11810 30         nextchar(pRExC_state);
11811 20         Set_Node_Length(node,
11812           RExC_parse - oregcomp_parse + 1); /* MJD */
11813           return node;
11814           }
11815           goto no_close;
11816           }
11817 20         RExC_parse++;
11818           }
11819            
11820           no_close:
11821 2         FAIL("Syntax error in (?[...])");
11822           }
11823            
11824           /* Pass 2 only after this. Everything in this construct is a
11825           * metacharacter. Operands begin with either a '\' (for an escape
11826           * sequence), or a '[' for a bracketed character class. Any other
11827           * character should be an operator, or parenthesis for grouping. Both
11828           * types of operands are handled by calling regclass() to parse them. It
11829           * is called with a parameter to indicate to return the computed inversion
11830           * list. The parsing here is implemented via a stack. Each entry on the
11831           * stack is a single character representing one of the operators, or the
11832           * '('; or else a pointer to an operand inversion list. */
11833            
11834           #define IS_OPERAND(a) (! SvIOK(a))
11835            
11836           /* The stack starts empty. It is a syntax error if the first thing parsed
11837           * is a binary operator; everything else is pushed on the stack. When an
11838           * operand is parsed, the top of the stack is examined. If it is a binary
11839           * operator, the item before it should be an operand, and both are replaced
11840           * by the result of doing that operation on the new operand and the one on
11841           * the stack. Thus a sequence of binary operands is reduced to a single
11842           * one before the next one is parsed.
11843           *
11844           * A unary operator may immediately follow a binary in the input, for
11845           * example
11846           * [a] + ! [b]
11847           * When an operand is parsed and the top of the stack is a unary operator,
11848           * the operation is performed, and then the stack is rechecked to see if
11849           * this new operand is part of a binary operation; if so, it is handled as
11850           * above.
11851           *
11852           * A '(' is simply pushed on the stack; it is valid only if the stack is
11853           * empty, or the top element of the stack is an operator or another '('
11854           * (for which the parenthesized expression will become an operand). By the
11855           * time the corresponding ')' is parsed everything in between should have
11856           * been parsed and evaluated to a single operand (or else is a syntax
11857           * error), and is handled as a regular operand */
11858            
11859 2         sv_2mortal((SV *)(stack = newAV()));
11860            
11861 28         while (RExC_parse < RExC_end) {
11862 28         I32 top_index = av_tindex(stack);
11863           SV** top_ptr;
11864 50         SV* current = NULL;
11865            
11866           /* Skip white space */
11867 28         RExC_parse = regpatws(pRExC_state, RExC_parse,
11868           TRUE); /* means recognize comments */
11869 28         if (RExC_parse >= RExC_end) {
11870 22         Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11871           }
11872 22         if ((curchar = UCHARAT(RExC_parse)) == ']') {
11873           break;
11874           }
11875            
11876 22         switch (curchar) {
11877            
11878           case '?':
11879 6         if (av_tindex(stack) >= 0 /* This makes sure that we can
11880           safely subtract 1 from
11881           RExC_parse in the next clause.
11882           If we have something on the
11883           stack, we have parsed something
11884           */
11885 6         && UCHARAT(RExC_parse - 1) == '('
11886 6         && RExC_parse < RExC_end)
11887           {
11888           /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11889           * This happens when we have some thing like
11890           *
11891           * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11892           * ...
11893           * qr/(?[ \p{Digit} & $thai_or_lao ])/;
11894           *
11895           * Here we would be handling the interpolated
11896           * '$thai_or_lao'. We handle this by a recursive call to
11897           * ourselves which returns the inversion list the
11898           * interpolated expression evaluates to. We use the flags
11899           * from the interpolated pattern. */
11900 22         U32 save_flags = RExC_flags;
11901 1587         const char * const save_parse = ++RExC_parse;
11902            
11903 3162         parse_lparen_question_flags(pRExC_state);
11904            
11905 3056         if (RExC_parse == save_parse /* Makes sure there was at
11906           least one flag (or this
11907           embedding wasn't compiled)
11908           */
11909 106         || RExC_parse >= RExC_end - 4
11910 106         || UCHARAT(RExC_parse) != ':'
11911 6         || UCHARAT(++RExC_parse) != '('
11912 6         || UCHARAT(++RExC_parse) != '?'
11913 6         || UCHARAT(++RExC_parse) != '[')
11914           {
11915            
11916           /* In combination with the above, this moves the
11917           * pointer to the point just after the first erroneous
11918           * character (or if there are no flags, to where they
11919           * should have been) */
11920 100         if (RExC_parse >= RExC_end - 4) {
11921 100         RExC_parse = RExC_end;
11922           }
11923 22         else if (RExC_parse != save_parse) {
11924 22         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11925           }
11926 10         vFAIL("Expecting '(?flags:(?[...'");
11927           }
11928 10         RExC_parse++;
11929 10         (void) handle_regex_sets(pRExC_state, ¤t, flagp,
11930           depth+1, oregcomp_parse);
11931            
11932           /* Here, 'current' contains the embedded expression's
11933           * inversion list, and RExC_parse points to the trailing
11934           * ']'; the next character should be the ')' which will be
11935           * paired with the '(' that has been put on the stack, so
11936           * the whole embedded expression reduces to '(operand)' */
11937 10         RExC_parse++;
11938            
11939 18         RExC_flags = save_flags;
11940 18         goto handle_operand;
11941           }
11942           /* FALL THROUGH */
11943            
11944           default:
11945 18         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11946 18         vFAIL("Unexpected character");
11947            
11948           case '\\':
11949           /* regclass() can only return RESTART_UTF8 if multi-char
11950           folds are allowed. */
11951 32         if (!regclass(pRExC_state, flagp,depth+1,
11952           TRUE, /* means parse just the next thing */
11953           FALSE, /* don't allow multi-char folds */
11954           FALSE, /* don't silence non-portable warnings. */
11955           ¤t))
11956 32         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11957           (UV) *flagp);
11958           /* regclass() will return with parsing just the \ sequence,
11959           * leaving the parse pointer at the next thing to parse */
11960 32         RExC_parse--;
11961 32         goto handle_operand;
11962            
11963           case '[': /* Is a bracketed character class */
11964           {
11965 8         bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11966            
11967 8         if (! is_posix_class) {
11968 8         RExC_parse++;
11969           }
11970            
11971           /* regclass() can only return RESTART_UTF8 if multi-char
11972           folds are allowed. */
11973 8         if(!regclass(pRExC_state, flagp,depth+1,
11974           is_posix_class, /* parse the whole char class
11975           only if not a posix class */
11976           FALSE, /* don't allow multi-char folds */
11977           FALSE, /* don't silence non-portable warnings. */
11978           ¤t))
11979 10         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11980           (UV) *flagp);
11981           /* function call leaves parse pointing to the ']', except if we
11982           * faked it */
11983 10         if (is_posix_class) {
11984 10         RExC_parse--;
11985           }
11986            
11987           goto handle_operand;
11988           }
11989            
11990           case '&':
11991           case '|':
11992           case '+':
11993           case '-':
11994           case '^':
11995 10         if (top_index < 0
11996 10         || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11997 10         || ! IS_OPERAND(*top_ptr))
11998           {
11999 10         RExC_parse++;
12000 10         vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12001           }
12002 10         av_push(stack, newSVuv(curchar));
12003 10         break;
12004            
12005           case '!':
12006 10         av_push(stack, newSVuv(curchar));
12007 10         break;
12008            
12009           case '(':
12010 0         if (top_index >= 0) {
12011 90         top_ptr = av_fetch(stack, top_index, FALSE);
12012 90         assert(top_ptr);
12013 3266         if (IS_OPERAND(*top_ptr)) {
12014 3042         RExC_parse++;
12015 3036         vFAIL("Unexpected '(' with no preceding operator");
12016           }
12017           }
12018 3036         av_push(stack, newSVuv(curchar));
12019 3034         break;
12020            
12021           case ')':
12022           {
12023           SV* lparen;
12024 8         if (top_index < 1
12025 3034         || ! (current = av_pop(stack))
12026 10         || ! IS_OPERAND(current)
12027 10         || ! (lparen = av_pop(stack))
12028 3024         || IS_OPERAND(lparen)
12029 188566         || SvUV(lparen) != '(')
12030           {
12031 184030         SvREFCNT_dec(current);
12032 65238         RExC_parse++;
12033 151411         vFAIL("Unexpected ')'");
12034           }
12035 3024         top_index -= 2;
12036 3024         SvREFCNT_dec_NN(lparen);
12037            
12038           /* FALL THROUGH */
12039           }
12040            
12041           handle_operand:
12042            
12043           /* Here, we have an operand to process, in 'current' */
12044            
12045 3024         if (top_index < 0) { /* Just push if stack is empty */
12046 3024         av_push(stack, current);
12047           }
12048           else {
12049 3024         SV* top = av_pop(stack);
12050           SV *prev = NULL;
12051           char current_operator;
12052            
12053 3024         if (IS_OPERAND(top)) {
12054 3024         SvREFCNT_dec_NN(top);
12055 0         SvREFCNT_dec_NN(current);
12056 3024         vFAIL("Operand with no preceding operator");
12057           }
12058 376         current_operator = (char) SvUV(top);
12059 3024         switch (current_operator) {
12060           case '(': /* Push the '(' back on followed by the new
12061           operand */
12062 3024         av_push(stack, top);
12063 3024         av_push(stack, current);
12064           SvREFCNT_inc(top); /* Counters the '_dec' done
12065           just after the 'break', so
12066           it doesn't get wrongly freed
12067           */
12068           break;
12069            
12070           case '!':
12071 3024         _invlist_invert(current);
12072            
12073           /* Unlike binary operators, the top of the stack,
12074           * now that this unary one has been popped off, may
12075           * legally be an operator, and we now have operand
12076           * for it. */
12077 3024         top_index--;
12078 4572         SvREFCNT_dec_NN(top);
12079 8246454         goto handle_operand;
12080            
12081           case '&':
12082 8246454         prev = av_pop(stack);
12083 8246454         _invlist_intersection(prev,
12084           current,
12085           ¤t);
12086 8246454         av_push(stack, current);
12087 8246454         break;
12088            
12089           case '|':
12090           case '+':
12091 8246454         prev = av_pop(stack);
12092 8246454         _invlist_union(prev, current, ¤t);
12093 8246454         av_push(stack, current);
12094 8246454         break;
12095            
12096           case '-':
12097 8246454         prev = av_pop(stack);;
12098 8246454         _invlist_subtract(prev, current, ¤t);
12099 8246454         av_push(stack, current);
12100 8246454         break;
12101            
12102           case '^': /* The union minus the intersection */
12103           {
12104 8246454         SV* i = NULL;
12105 4179995         SV* u = NULL;
12106           SV* element;
12107            
12108 4066459         prev = av_pop(stack);
12109 4066459         _invlist_union(prev, current, &u);
12110 6062609         _invlist_intersection(prev, current, &i);
12111           /* _invlist_subtract will overwrite current
12112           without freeing what it already contains */
12113 277276         element = current;
12114 4066459         _invlist_subtract(u, i, ¤t);
12115 4066459         av_push(stack, current);
12116 4066459         SvREFCNT_dec_NN(i);
12117 8246454         SvREFCNT_dec_NN(u);
12118 6254         SvREFCNT_dec_NN(element);
12119 8246454         break;
12120           }
12121            
12122           default:
12123 2126224         Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12124           }
12125 2126224         SvREFCNT_dec_NN(top);
12126 2126224         SvREFCNT_dec(prev);
12127           }
12128           }
12129            
12130 756         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12131           }
12132            
12133 8246454         if (av_tindex(stack) < 0 /* Was empty */
12134 144220         || ((final = av_pop(stack)) == NULL)
12135 144220         || ! IS_OPERAND(final)
12136 226130         || av_tindex(stack) >= 0) /* More left on stack */
12137           {
12138 13400         vFAIL("Incomplete expression within '(?[ ])'");
12139           }
12140            
12141           /* Here, 'final' is the resultant inversion list from evaluating the
12142           * expression. Return it if so requested */
12143 144220         if (return_invlist) {
12144 40         *return_invlist = final;
12145 40         return END;
12146           }
12147            
12148           /* Otherwise generate a resultant node, based on 'final'. regclass() is
12149           * expecting a string of ranges and individual code points */
12150 36         invlist_iterinit(final);
12151 8246450         result_string = newSVpvs("");
12152 410492         while (invlist_iternext(final, &start, &end)) {
12153 8246450         if (start == end) {
12154 27335836         Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12155           }
12156           else {
12157 26922254         Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12158           start, end);
12159           }
12160           }
12161            
12162 17330         save_parse = RExC_parse;
12163 26922254         RExC_parse = SvPV(result_string, len);
12164 19146320         save_end = RExC_end;
12165 19146320         RExC_end = RExC_parse + len;
12166            
12167           /* We turn off folding around the call, as the class we have constructed
12168           * already has all folding taken into consideration, and we don't want
12169           * regclass() to add to that */
12170 17203734         RExC_flags &= ~RXf_PMf_FOLD;
12171           /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12172           */
12173 17203734         node = regclass(pRExC_state, flagp,depth+1,
12174           FALSE, /* means parse the whole char class */
12175           FALSE, /* don't allow multi-char folds */
12176           TRUE, /* silence non-portable warnings. The above may very
12177           well have generated non-portable code points, but
12178           they're valid on this machine */
12179           NULL);
12180 19146320         if (!node)
12181 1458856         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12182           PTR2UV(flagp));
12183 1458856         if (save_fold) {
12184 17687464         RExC_flags |= RXf_PMf_FOLD;
12185           }
12186 19146320         RExC_parse = save_parse + 1;
12187 126354         RExC_end = save_end;
12188 126354         SvREFCNT_dec_NN(final);
12189 89234         SvREFCNT_dec_NN(result_string);
12190            
12191 19086776         nextchar(pRExC_state);
12192 7711468         Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12193           return node;
12194           }
12195           #undef IS_OPERAND
12196            
12197           /* The names of properties whose definitions are not known at compile time are
12198           * stored in this SV, after a constant heading. So if the length has been
12199           * changed since initialization, then there is a run-time definition. */
12200           #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12201            
12202           STATIC regnode *
12203 898384         S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12204           const bool stop_at_1, /* Just parse the next thing, don't
12205           look for a full character class */
12206           bool allow_multi_folds,
12207           const bool silence_non_portable, /* Don't output warnings
12208           about too large
12209           characters */
12210           SV** ret_invlist) /* Return an inversion list, not a node */
12211           {
12212           /* parse a bracketed class specification. Most of these will produce an
12213           * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12214           * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
12215           * under /i with multi-character folds: it will be rewritten following the
12216           * paradigm of this example, where the s are characters which
12217           * fold to multiple character sequences:
12218           * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12219           * gets effectively rewritten as:
12220           * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12221           * reg() gets called (recursively) on the rewritten version, and this
12222           * function will return what it constructs. (Actually the s
12223           * aren't physically removed from the [abcdefghi], it's just that they are
12224           * ignored in the recursion by means of a flag:
12225           * .)
12226           *
12227           * ANYOF nodes contain a bit map for the first 256 characters, with the
12228           * corresponding bit set if that character is in the list. For characters
12229           * above 255, a range list or swash is used. There are extra bits for \w,
12230           * etc. in locale ANYOFs, as what these match is not determinable at
12231           * compile time
12232           *
12233           * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12234           * to be restarted. This can only happen if ret_invlist is non-NULL.
12235           */
12236            
12237           dVAR;
12238           UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12239           IV range = 0;
12240 898384         UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12241           regnode *ret;
12242           STRLEN numlen;
12243           IV namedclass = OOB_NAMEDCLASS;
12244           char *rangebegin = NULL;
12245           bool need_class = 0;
12246           SV *listsv = NULL;
12247           STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12248           than just initialized. */
12249 6813156         SV* properties = NULL; /* Code points that match \p{} \P{} */
12250 7711504         SV* posixes = NULL; /* Code points that match classes like, [:word:],
12251           extended beyond the Latin1 range */
12252           UV element_count = 0; /* Number of distinct elements in the class.
12253           Optimizations may be possible if this is tiny */
12254           AV * multi_char_matches = NULL; /* Code points that fold to more than one
12255           character; used under /i */
12256           UV n;
12257 34172         char * stop_ptr = RExC_end; /* where to stop parsing */
12258 37870         const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12259           space? */
12260 13044         const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12261            
12262           /* Unicode properties are stored in a swash; this holds the current one
12263           * being parsed. If this swash is the only above-latin1 component of the
12264           * character class, an optimization is to pass it directly on to the
12265           * execution engine. Otherwise, it is set to NULL to indicate that there
12266           * are other things in the class that have to be dealt with at execution
12267           * time */
12268           SV* swash = NULL; /* Code points that match \p{} \P{} */
12269            
12270           /* Set if a component of this character class is user-defined; just passed
12271           * on to the engine */
12272           bool has_user_defined_property = FALSE;
12273            
12274           /* inversion list of code points this node matches only when the target
12275           * string is in UTF-8. (Because is under /d) */
12276 51872         SV* depends_list = NULL;
12277            
12278           /* inversion list of code points this node matches. For much of the
12279           * function, it includes only those that match regardless of the utf8ness
12280           * of the target string */
12281 560         SV* cp_list = NULL;
12282            
12283           #ifdef EBCDIC
12284           /* In a range, counts how many 0-2 of the ends of it came from literals,
12285           * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
12286           UV literal_endpoint = 0;
12287           #endif
12288           bool invert = FALSE; /* Is this class to be complemented */
12289            
12290           /* Is there any thing like \W or [:^digit:] that matches above the legal
12291           * Unicode range? */
12292           bool runtime_posix_matches_above_Unicode = FALSE;
12293            
12294 52         regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12295           case we need to change the emitted regop to an EXACT. */
12296 48         const char * orig_parse = RExC_parse;
12297 176         const SSize_t orig_size = RExC_size;
12298 432         GET_RE_DEBUG_FLAGS_DECL;
12299            
12300 562         PERL_ARGS_ASSERT_REGCLASS;
12301           #ifndef DEBUGGING
12302           PERL_UNUSED_ARG(depth);
12303           #endif
12304            
12305 48         DEBUG_PARSE("clas");
12306            
12307           /* Assume we are going to generate an ANYOF node. */
12308 36         ret = reganode(pRExC_state, ANYOF, 0);
12309            
12310 411620         if (SIZE_ONLY) {
12311 411602         RExC_size += ANYOF_SKIP;
12312           listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12313           }
12314           else {
12315 20         ANYOF_FLAGS(ret) = 0;
12316            
12317 411600         RExC_emit += ANYOF_SKIP;
12318 411412         if (LOC) {
12319 411376         ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12320           }
12321 411394         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12322 22         initial_listsv_len = SvCUR(listsv);
12323 433624         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
12324           }
12325            
12326 22270         if (skip_white) {
12327 411372         RExC_parse = regpatws(pRExC_state, RExC_parse,
12328           FALSE /* means don't recognize comments */);
12329           }
12330            
12331 38         if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12332 411370         RExC_parse++;
12333           invert = TRUE;
12334           allow_multi_folds = FALSE;
12335 617089         RExC_naughty++;
12336 34         if (skip_white) {
12337 206         RExC_parse = regpatws(pRExC_state, RExC_parse,
12338           FALSE /* means don't recognize comments */);
12339           }
12340           }
12341            
12342           /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12343 411612         if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12344 205804         const char *s = RExC_parse;
12345 85732         const char c = *s++;
12346            
12347 85732         while (isWORDCHAR(*s))
12348 85732         s++;
12349 137402         if (*s && c == *s && s[1] == ']') {
12350 8804         SAVEFREESV(RExC_rx_sv);
12351 8804         ckWARN3reg(s+2,
12352           "POSIX syntax [%c %c] belongs inside character classes",
12353           c, c);
12354 205804         (void)ReREFCNT_inc(RExC_rx_sv);
12355           }
12356           }
12357            
12358           /* If the caller wants us to just parse a single element, accomplish this
12359           * by faking the loop ending condition */
12360 308742         if (stop_at_1 && RExC_end > RExC_parse) {
12361 205804         stop_ptr = RExC_parse + 1;
12362           }
12363            
12364           /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12365 205840         if (UCHARAT(RExC_parse) == ']')
12366           goto charclassloop;
12367            
12368           parseit:
12369           while (1) {
12370 205880         if (RExC_parse >= stop_ptr) {
12371           break;
12372           }
12373            
12374 92         if (skip_white) {
12375 205804         RExC_parse = regpatws(pRExC_state, RExC_parse,
12376           FALSE /* means don't recognize comments */);
12377           }
12378            
12379 205880         if (UCHARAT(RExC_parse) == ']') {
12380           break;
12381           }
12382            
12383           charclassloop:
12384            
12385           namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12386 25172         save_value = value;
12387           save_prevvalue = prevvalue;
12388            
12389 40         if (!range) {
12390 25172         rangebegin = RExC_parse;
12391 44         element_count++;
12392           }
12393 44         if (UTF) {
12394 25128         value = utf8n_to_uvchr((U8*)RExC_parse,
12395           RExC_end - RExC_parse,
12396           &numlen, UTF8_ALLOW_DEFAULT);
12397 25128         RExC_parse += numlen;
12398           }
12399           else
12400 25168         value = UCHARAT(RExC_parse++);
12401            
12402 180712         if (value == '['
12403 180672         && RExC_parse < RExC_end
12404 180672         && POSIXCC(UCHARAT(RExC_parse)))
12405           {
12406 86590         namedclass = regpposixcc(pRExC_state, value, strict);
12407           }
12408 86630         else if (value == '\\') {
12409 94082         if (UTF) {
12410 205800         value = utf8n_to_uvchr((U8*)RExC_parse,
12411           RExC_end - RExC_parse,
12412           &numlen, UTF8_ALLOW_DEFAULT);
12413 411572         RExC_parse += numlen;
12414           }
12415           else
12416 411572         value = UCHARAT(RExC_parse++);
12417            
12418           /* Some compilers cannot handle switching on 64-bit integer
12419           * values, therefore value cannot be an UV. Yes, this will
12420           * be a problem later if we want switch on Unicode.
12421           * A similar issue a little bit later when switching on
12422           * namedclass. --jhi */
12423            
12424           /* If the \ is escaping white space when white space is being
12425           * skipped, it means that that white space is wanted literally, and
12426           * is already in 'value'. Otherwise, need to translate the escape
12427           * into what it signifies. */
12428 411572         if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12429            
12430           case 'w': namedclass = ANYOF_WORDCHAR; break;
12431 65956         case 'W': namedclass = ANYOF_NWORDCHAR; break;
12432 37916         case 's': namedclass = ANYOF_SPACE; break;
12433 29748         case 'S': namedclass = ANYOF_NSPACE; break;
12434 21052         case 'd': namedclass = ANYOF_DIGIT; break;
12435 21548         case 'D': namedclass = ANYOF_NDIGIT; break;
12436 10504         case 'v': namedclass = ANYOF_VERTWS; break;
12437 10376         case 'V': namedclass = ANYOF_NVERTWS; break;
12438 398         case 'h': namedclass = ANYOF_HORIZWS; break;
12439 398         case 'H': namedclass = ANYOF_NHORIZWS; break;
12440           case 'N': /* Handle \N{NAME} in class */
12441           {
12442           /* We only pay attention to the first char of
12443           multichar strings being returned. I kinda wonder
12444           if this makes sense as it does change the behaviour
12445           from earlier versions, OTOH that behaviour was broken
12446           as well. */
12447 398         if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12448           TRUE, /* => charclass */
12449           strict))
12450           {
12451 14         if (*flagp & RESTART_UTF8)
12452 384         FAIL("panic: grok_bslash_N set RESTART_UTF8");
12453           goto parseit;
12454           }
12455           }
12456           break;
12457           case 'p':
12458           case 'P':
12459           {
12460           char *e;
12461            
12462           /* We will handle any undefined properties ourselves */
12463 4029980         U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12464            
12465 4029980         if (RExC_parse >= RExC_end)
12466 4029980         vFAIL2("Empty \\%c{}", (U8)value);
12467 10         if (*RExC_parse == '{') {
12468 4029970         const U8 c = (U8)value;
12469 143538         e = strchr(RExC_parse++, '}');
12470 143538         if (!e)
12471 153174         vFAIL2("Missing right brace on \\%c{}", c);
12472 153174         while (isSPACE(UCHARAT(RExC_parse)))
12473 153174         RExC_parse++;
12474 153174         if (e == RExC_parse)
12475 153174         vFAIL2("Empty \\%c{}", c);
12476 50522         n = e - RExC_parse;
12477 10         while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12478 10         n--;
12479           }
12480           else {
12481 50512         e = RExC_parse;
12482           n = 1;
12483           }
12484 25256         if (!SIZE_ONLY) {
12485           SV* invlist;
12486           char* name;
12487            
12488 25256         if (UCHARAT(RExC_parse) == '^') {
12489 25256         RExC_parse++;
12490 16         n--;
12491           /* toggle. (The rhs xor gets the single bit that
12492           * differs between P and p; the other xor inverts just
12493           * that bit) */
12494 4         value ^= 'P' ^ 'p';
12495            
12496 4         while (isSPACE(UCHARAT(RExC_parse))) {
12497 4         RExC_parse++;
12498 153164         n--;
12499           }
12500           }
12501           /* Try to get the definition of the property into
12502           * . If /i is in effect, the effective property
12503           * will have its name be <__NAME_i>. The design is
12504           * discussed in commit
12505           * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12506 164         Newx(name, n + sizeof("_i__\n"), char);
12507            
12508 164         sprintf(name, "%s%.*s%s\n",
12509 164         (FOLD) ? "__" : "",
12510           (int)n,
12511           RExC_parse,
12512 164         (FOLD) ? "_i" : ""
12513           );
12514            
12515           /* Look up the property name, and get its swash and
12516           * inversion list, if the property is found */
12517 0         if (swash) {
12518 0         SvREFCNT_dec_NN(swash);
12519           }
12520 0         swash = _core_swash_init("utf8", name, &PL_sv_undef,
12521           1, /* binary */
12522           0, /* not tr/// */
12523           NULL, /* No inversion list */
12524           &swash_init_flags
12525           );
12526 0         if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12527 2519612         if (swash) {
12528 424         SvREFCNT_dec_NN(swash);
12529           swash = NULL;
12530           }
12531            
12532           /* Here didn't find it. It could be a user-defined
12533           * property that will be available at run-time. If we
12534           * accept only compile-time properties, is an error;
12535           * otherwise add it to the list for run-time look up */
12536 4         if (ret_invlist) {
12537 420         RExC_parse = e + 1;
12538 420         vFAIL3("Property '%.*s' is unknown", (int) n, name);
12539           }
12540 416         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12541 28485191         (value == 'p' ? '+' : '!'),
12542           name);
12543           has_user_defined_property = TRUE;
12544            
12545           /* We don't know yet, so have to assume that the
12546           * property could match something in the Latin1 range,
12547           * hence something that isn't utf8. Note that this
12548           * would cause things in to match
12549           * inappropriately, except that any \p{}, including
12550           * this one forces Unicode semantics, which means there
12551           * is */
12552 878488         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12553           }
12554           else {
12555            
12556           /* Here, did get the swash and its inversion list. If
12557           * the swash is from a user-defined property, then this
12558           * whole character class should be regarded as such */
12559 877984         has_user_defined_property =
12560           (swash_init_flags
12561 877984         & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12562            
12563           /* Invert if asking for the complement */
12564 1344         if (value == 'P') {
12565 672         _invlist_union_complement_2nd(properties,
12566           invlist,
12567           &properties);
12568            
12569           /* The swash can't be used as-is, because we've
12570           * inverted things; delay removing it to here after
12571           * have copied its invlist above */
12572 672         SvREFCNT_dec_NN(swash);
12573           swash = NULL;
12574           }
12575           else {
12576 672         _invlist_union(properties, invlist, &properties);
12577           }
12578           }
12579 1344         Safefree(name);
12580           }
12581 19146126         RExC_parse = e + 1;
12582           namedclass = ANYOF_UNIPROP; /* no official name, but it's
12583           named */
12584            
12585           /* \p means they want Unicode semantics */
12586 725880         RExC_uni_semantics = 1;
12587           }
12588 376         break;
12589 282         case 'n': value = '\n'; break;
12590 188         case 'r': value = '\r'; break;
12591 188         case 't': value = '\t'; break;
12592 8         case 'f': value = '\f'; break;
12593 180         case 'b': value = '\b'; break;
12594 180         case 'e': value = ASCII_TO_NATIVE('\033');break;
12595 176         case 'a': value = '\a'; break;
12596           case 'o':
12597 176         RExC_parse--; /* function expects to be pointed at the 'o' */
12598           {
12599           const char* error_msg;
12600 176         bool valid = grok_bslash_o(&RExC_parse,
12601           &value,
12602           &error_msg,
12603           SIZE_ONLY, /* warnings in pass
12604           1 only */
12605           strict,
12606           silence_non_portable,
12607           UTF);
12608 364         if (! valid) {
12609 725868         vFAIL(error_msg);
12610           }
12611           }
12612 362946         if (PL_encoding && value < 0x100) {
12613           goto recode_encoding;
12614           }
12615           break;
12616           case 'x':
12617 362946         RExC_parse--; /* function expects to be pointed at the 'x' */
12618           {
12619           const char* error_msg;
12620 206080         bool valid = grok_bslash_x(&RExC_parse,
12621           &value,
12622           &error_msg,
12623           TRUE, /* Output warnings */
12624           strict,
12625           silence_non_portable,
12626           UTF);
12627 282         if (! valid) {
12628 282         vFAIL(error_msg);
12629           }
12630           }
12631 156866         if (PL_encoding && value < 0x100)
12632           goto recode_encoding;
12633           break;
12634           case 'c':
12635 351         value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12636 0         break;
12637           case '0': case '1': case '2': case '3': case '4':
12638           case '5': case '6': case '7':
12639           {
12640           /* Take 1-3 octal digits */
12641 234         I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12642 156632         numlen = (strict) ? 4 : 3;
12643 156632         value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12644 156632         RExC_parse += numlen;
12645 127100         if (numlen != 3) {
12646 127100         if (strict) {
12647 39         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12648 26         vFAIL("Need exactly 3 octal digits");
12649           }
12650 127100         else if (! SIZE_ONLY /* like \08, \178 */
12651 124366         && numlen < 3
12652 129171         && RExC_parse < RExC_end
12653 85686         && isDIGIT(*RExC_parse)
12654 129171         && ckWARN(WARN_REGEXP))
12655           {
12656 228         SAVEFREESV(RExC_rx_sv);
12657 128829         reg_warn_non_literal_string(
12658           RExC_parse + 1,
12659           form_short_octal_warning(RExC_parse, numlen));
12660 56838         (void)ReREFCNT_inc(RExC_rx_sv);
12661           }
12662           }
12663 456         if (PL_encoding && value < 0x100)
12664           goto recode_encoding;
12665           break;
12666           }
12667           recode_encoding:
12668 37556         if (! RExC_override_recoding) {
12669 56154         SV* enc = PL_encoding;
12670 140         value = reg_recode((const char)(U8)value, &enc);
12671 37416         if (!enc) {
12672 37416         if (strict) {
12673 37416         vFAIL("Invalid escape in the specified encoding");
12674           }
12675 37416         else if (SIZE_ONLY) {
12676 0         ckWARNreg(RExC_parse,
12677           "Invalid escape in the specified encoding");
12678           }
12679           }
12680           break;
12681           }
12682           default:
12683           /* Allow \_ to not give an error */
12684 0         if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12685 55944         if (strict) {
12686 36080         vFAIL2("Unrecognized escape \\%c in character class",
12687           (int)value);
12688           }
12689           else {
12690 2734         SAVEFREESV(RExC_rx_sv);
12691 142         ckWARN2reg(RExC_parse,
12692           "Unrecognized escape \\%c in character class passed through",
12693           (int)value);
12694 32266         (void)ReREFCNT_inc(RExC_rx_sv);
12695           }
12696           }
12697           break;
12698           } /* End of switch on char following backslash */
12699           } /* end of handling backslash escape sequences */
12700           #ifdef EBCDIC
12701           else
12702           literal_endpoint++;
12703           #endif
12704            
12705           /* Here, we have the current token in 'value' */
12706            
12707           /* What matches in a locale is not known until runtime. This includes
12708           * what the Posix classes (like \w, [:space:]) match. Room must be
12709           * reserved (one time per class) to store such classes, either if Perl
12710           * is compiled so that locale nodes always should have this space, or
12711           * if there is such class info to be stored. The space will contain a
12712           * bit for each named class that is to be matched against. This isn't
12713           * needed for \p{} and pseudo-classes, as they are not affected by
12714           * locale, and hence are dealt with separately */
12715 36389         if (LOC
12716 35955         && ! need_class
12717 236         && (ANYOF_LOCALE == ANYOF_CLASS
12718 236         || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12719           {
12720           need_class = 1;
12721 236         if (SIZE_ONLY) {
12722 236         RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12723           }
12724           else {
12725 0         RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12726 0         ANYOF_CLASS_ZERO(ret);
12727           }
12728 236         ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12729           }
12730            
12731 11230         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12732            
12733           /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
12734           * literal, as is the character that began the false range, i.e.
12735           * the 'a' in the examples */
12736 10962         if (range) {
12737 10962         if (!SIZE_ONLY) {
12738 5664         const int w = (RExC_parse >= rangebegin)
12739 152         ? RExC_parse - rangebegin
12740           : 0;
12741 152         if (strict) {
12742 152         vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12743           }
12744           else {
12745 152         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12746 0         ckWARN4reg(RExC_parse,
12747           "False [] range \"%*.*s\"",
12748           w, w, rangebegin);
12749 0         (void)ReREFCNT_inc(RExC_rx_sv);
12750 152         cp_list = add_cp_to_invlist(cp_list, '-');
12751 362946         cp_list = add_cp_to_invlist(cp_list, prevvalue);
12752           }
12753           }
12754            
12755           range = 0; /* this was not a true range */
12756 18783168         element_count += 2; /* So counts for three values */
12757           }
12758            
12759 10682         if (! SIZE_ONLY) {
12760 18783168         U8 classnum = namedclass_to_classnum(namedclass);
12761 1942210         if (namedclass >= ANYOF_MAX) { /* If a special class */
12762 30         if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12763            
12764           /* Here, should be \h, \H, \v, or \V. Neither /d nor
12765           * /l make a difference in what these match. There
12766           * would be problems if these characters had folds
12767           * other than themselves, as cp_list is subject to
12768           * folding. */
12769 30         if (classnum != _CC_VERTSPACE) {
12770 16840958         assert( namedclass == ANYOF_HORIZWS
12771           || namedclass == ANYOF_NHORIZWS);
12772            
12773           /* It turns out that \h is just a synonym for
12774           * XPosixBlank */
12775           classnum = _CC_BLANK;
12776           }
12777            
12778 16840958         _invlist_union_maybe_complement_2nd(
12779           cp_list,
12780           PL_XPosix_ptrs[classnum],
12781           cBOOL(namedclass % 2), /* Complement if odd
12782           (NHORIZWS, NVERTWS)
12783           */
12784           &cp_list);
12785           }
12786           }
12787 16635728         else if (classnum == _CC_ASCII) {
12788           #ifdef HAS_ISASCII
12789 2008838         if (LOC) {
12790 2008838         ANYOF_CLASS_SET(ret, namedclass);
12791           }
12792           else
12793           #endif /* Not isascii(); just use the hard-coded definition for it */
12794 560         _invlist_union_maybe_complement_2nd(
12795           posixes,
12796           PL_ASCII,
12797           cBOOL(namedclass % 2), /* Complement if odd
12798           (NASCII) */
12799           &posixes);
12800           }
12801           else { /* Garden variety class */
12802            
12803           /* The ascii range inversion list */
12804 560         SV* ascii_source = PL_Posix_ptrs[classnum];
12805            
12806           /* The full Latin1 range inversion list */
12807 2008838         SV* l1_source = PL_L1Posix_ptrs[classnum];
12808            
12809           /* This code is structured into two major clauses. The
12810           * first is for classes whose complete definitions may not
12811           * already be known. It not, the Latin1 definition
12812           * (guaranteed to already known) is used plus code is
12813           * generated to load the rest at run-time (only if needed).
12814           * If the complete definition is known, it drops down to
12815           * the second clause, where the complete definition is
12816           * known */
12817            
12818 1942950         if (classnum < _FIRST_NON_SWASH_CC) {
12819            
12820           /* Here, the class has a swash, which may or not
12821           * already be loaded */
12822            
12823           /* The name of the property to use to match the full
12824           * eXtended Unicode range swash for this character
12825           * class */
12826 1942950         const char *Xname = swash_property_names[classnum];
12827            
12828           /* If returning the inversion list, we can't defer
12829           * getting this until runtime */
12830 364         if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
12831 42         PL_utf8_swash_ptrs[classnum] =
12832 28         _core_swash_init("utf8", Xname, &PL_sv_undef,
12833           1, /* binary */
12834           0, /* not tr/// */
12835           NULL, /* No inversion list */
12836           NULL /* No flags */
12837           );
12838 28         assert(PL_utf8_swash_ptrs[classnum]);
12839           }
12840 28         if ( ! PL_utf8_swash_ptrs[classnum]) {
12841 12         if (namedclass % 2 == 0) { /* A non-complemented
12842           class */
12843           /* If not /a matching, there are code points we
12844           * don't know at compile time. Arrange for the
12845           * unknown matches to be loaded at run-time, if
12846           * needed */
12847 16         if (! AT_LEAST_ASCII_RESTRICTED) {
12848 352         Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12849           Xname);
12850           }
12851 0         if (LOC) { /* Under locale, set run-time
12852           lookup */
12853 352         ANYOF_CLASS_SET(ret, namedclass);
12854           }
12855           else {
12856           /* Add the current class's code points to
12857           * the running total */
12858 1942938         _invlist_union(posixes,
12859           (AT_LEAST_ASCII_RESTRICTED)
12860           ? ascii_source
12861           : l1_source,
12862           &posixes);
12863           }
12864           }
12865           else { /* A complemented class */
12866 16840188         if (AT_LEAST_ASCII_RESTRICTED) {
12867           /* Under /a should match everything above
12868           * ASCII, plus the complement of the set's
12869           * ASCII matches */
12870 1324676         _invlist_union_complement_2nd(posixes,
12871           ascii_source,
12872           &posixes);
12873           }
12874           else {
12875           /* Arrange for the unknown matches to be
12876           * loaded at run-time, if needed */
12877 16840188         Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12878           Xname);
12879           runtime_posix_matches_above_Unicode = TRUE;
12880 3145228         if (LOC) {
12881 2885488         ANYOF_CLASS_SET(ret, namedclass);
12882           }
12883           else {
12884            
12885           /* We want to match everything in
12886           * Latin1, except those things that
12887           * l1_source matches */
12888 1684768         SV* scratch_list = NULL;
12889 887984         _invlist_subtract(PL_Latin1, l1_source,
12890           &scratch_list);
12891            
12892           /* Add the list from this class to the
12893           * running total */
12894 726704         if (! posixes) {
12895 523064         posixes = scratch_list;
12896           }
12897           else {
12898 523064         _invlist_union(posixes,
12899           scratch_list,
12900           &posixes);
12901 523064         SvREFCNT_dec_NN(scratch_list);
12902           }
12903 523064         if (DEPENDS_SEMANTICS) {
12904           ANYOF_FLAGS(ret)
12905 523064         |= ANYOF_NON_UTF8_LATIN1_ALL;
12906           }
12907           }
12908           }
12909           }
12910           goto namedclass_done;
12911           }
12912            
12913           /* Here, there is a swash loaded for the class. If no
12914           * inversion list for it yet, get it */
12915 523064         if (! PL_XPosix_ptrs[classnum]) {
12916           PL_XPosix_ptrs[classnum]
12917 0         = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12918           }
12919           }
12920            
12921           /* Here there is an inversion list already loaded for the
12922           * entire class */
12923            
12924 0         if (namedclass % 2 == 0) { /* A non-complemented class,
12925           like ANYOF_PUNCT */
12926 523064         if (! LOC) {
12927           /* For non-locale, just add it to any existing list
12928           * */
12929 523064         _invlist_union(posixes,
12930           (AT_LEAST_ASCII_RESTRICTED)
12931           ? ascii_source
12932           : PL_XPosix_ptrs[classnum],
12933           &posixes);
12934           }
12935           else { /* Locale */
12936 523064         SV* scratch_list = NULL;
12937            
12938           /* For above Latin1 code points, we use the full
12939           * Unicode range */
12940 726704         _invlist_intersection(PL_AboveLatin1,
12941           PL_XPosix_ptrs[classnum],
12942           &scratch_list);
12943           /* And set the output to it, adding instead if
12944           * there already is an output. Checking if
12945           * 'posixes' is NULL first saves an extra clone.
12946           * Its reference count will be decremented at the
12947           * next union, etc, or if this is the only
12948           * instance, at the end of the routine */
12949 726704         if (! posixes) {
12950 726704         posixes = scratch_list;
12951           }
12952           else {
12953 16113484         _invlist_union(posixes, scratch_list, &posixes);
12954 7938490         SvREFCNT_dec_NN(scratch_list);
12955           }
12956            
12957           #ifndef HAS_ISBLANK
12958           if (namedclass != ANYOF_BLANK) {
12959           #endif
12960           /* Set this class in the node for runtime
12961           * matching */
12962 8246214         ANYOF_CLASS_SET(ret, namedclass);
12963           #ifndef HAS_ISBLANK
12964           }
12965           else {
12966           /* No isblank(), use the hard-coded ASCII-range
12967           * blanks, adding them to the running total. */
12968            
12969           _invlist_union(posixes, ascii_source, &posixes);
12970           }
12971           #endif
12972           }
12973           }
12974           else { /* A complemented class, like ANYOF_NPUNCT */
12975 523064         if (! LOC) {
12976 523064         _invlist_union_complement_2nd(
12977           posixes,
12978           (AT_LEAST_ASCII_RESTRICTED)
12979           ? ascii_source
12980           : PL_XPosix_ptrs[classnum],
12981           &posixes);
12982           /* Under /d, everything in the upper half of the
12983           * Latin1 range matches this complement */
12984 523064         if (DEPENDS_SEMANTICS) {
12985 1615508         ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12986           }
12987           }
12988           else { /* Locale */
12989 1092444         SV* scratch_list = NULL;
12990 523064         _invlist_subtract(PL_AboveLatin1,
12991           PL_XPosix_ptrs[classnum],
12992           &scratch_list);
12993 1307660         if (! posixes) {
12994 523064         posixes = scratch_list;
12995           }
12996           else {
12997 0         _invlist_union(posixes, scratch_list, &posixes);
12998 523064         SvREFCNT_dec_NN(scratch_list);
12999           }
13000           #ifndef HAS_ISBLANK
13001           if (namedclass != ANYOF_NBLANK) {
13002           #endif
13003 523064         ANYOF_CLASS_SET(ret, namedclass);
13004           #ifndef HAS_ISBLANK
13005           }
13006           else {
13007           /* Get the list of all code points in Latin1
13008           * that are not ASCII blanks, and add them to
13009           * the running total */
13010           _invlist_subtract(PL_Latin1, ascii_source,
13011           &scratch_list);
13012           _invlist_union(posixes, scratch_list, &posixes);
13013           SvREFCNT_dec_NN(scratch_list);
13014           }
13015           #endif
13016           }
13017           }
13018           }
13019           namedclass_done:
13020 261722         continue; /* Go get next character */
13021           }
13022           } /* end of namedclass \blah */
13023            
13024           /* Here, we have a single value. If 'range' is set, it is the ending
13025           * of a range--check its validity. Later, we will handle each
13026           * individual code point in the range. If 'range' isn't set, this
13027           * could be the beginning of a range, so check for that by looking
13028           * ahead to see if the next real character to be processed is the range
13029           * indicator--the minus sign */
13030            
13031 261762         if (skip_white) {
13032 261722         RExC_parse = regpatws(pRExC_state, RExC_parse,
13033           FALSE /* means don't recognize comments */);
13034           }
13035            
13036 523104         if (range) {
13037 523064         if (prevvalue > value) /* b-a */ {
13038 523064         const int w = RExC_parse - rangebegin;
13039 523064         Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
13040           range = 0; /* not a valid range */
13041           }
13042           }
13043           else {
13044 523104         prevvalue = value; /* save the beginning of the potential range */
13045 523104         if (! stop_at_1 /* Can't be a range if parsing just one thing */
13046 523104         && *RExC_parse == '-')
13047           {
13048 523064         char* next_char_ptr = RExC_parse + 1;
13049 523064         if (skip_white) { /* Get the next real char after the '-' */
13050 523064         next_char_ptr = regpatws(pRExC_state,
13051 523064         RExC_parse + 1,
13052           FALSE); /* means don't recognize
13053           comments */
13054           }
13055            
13056           /* If the '-' is at the end of the class (just before the ']',
13057           * it is a literal minus; otherwise it is a range */
13058 523064         if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13059 7723150         RExC_parse = next_char_ptr;
13060            
13061           /* a bad range like \w-, [:word:]- ? */
13062 3307274         if (namedclass > OOB_NAMEDCLASS) {
13063 482500         if (strict || ckWARN(WARN_REGEXP)) {
13064 472         const int w =
13065 630         RExC_parse >= rangebegin ?
13066 420         RExC_parse - rangebegin : 0;
13067 2124         if (strict) {
13068 0         vFAIL4("False [] range \"%*.*s\"",
13069           w, w, rangebegin);
13070           }
13071           else {
13072 0         vWARN4(RExC_parse,
13073           "False [] range \"%*.*s\"",
13074           w, w, rangebegin);
13075           }
13076           }
13077 0         if (!SIZE_ONLY) {
13078 142264         cp_list = add_cp_to_invlist(cp_list, '-');
13079           }
13080 71132         element_count++;
13081           } else
13082           range = 1; /* yeah, it's a range! */
13083 72048         continue; /* but do it the next time */
13084           }
13085           }
13086           }
13087            
13088           /* Here, is the beginning of the range, if any; or
13089           * if not */
13090            
13091           /* non-Latin1 code point implies unicode semantics. Must be set in
13092           * pass1 so is there for the whole of pass 2 */
13093 50496         if (value > 255) {
13094 50456         RExC_uni_semantics = 1;
13095           }
13096            
13097           /* Ready to process either the single value, or the completed range.
13098           * For single-valued non-inverted ranges, we consider the possibility
13099           * of multi-char folds. (We made a conscious decision to not do this
13100           * for the other cases because it can often lead to non-intuitive
13101           * results. For example, you have the peculiar case that:
13102           * "s s" =~ /^[^\xDF]+$/i => Y
13103           * "ss" =~ /^[^\xDF]+$/i => N
13104           *
13105           * See [perl #89750] */
13106 72088         if (FOLD && allow_multi_folds && value == prevvalue) {
13107 72048         if (value == LATIN_SMALL_LETTER_SHARP_S
13108 2824774         || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13109           value)))
13110           {
13111           /* Here is indeed a multi-char fold. Get what it is */
13112            
13113           U8 foldbuf[UTF8_MAXBYTES_CASE];
13114           STRLEN foldlen;
13115            
13116 1921808         UV folded = _to_uni_fold_flags(
13117           value,
13118           foldbuf,
13119           &foldlen,
13120           FOLD_FLAGS_FULL
13121           | ((LOC) ? FOLD_FLAGS_LOCALE
13122           : (ASCII_FOLD_RESTRICTED)
13123           ? FOLD_FLAGS_NOMIX_ASCII
13124           : 0)
13125           );
13126            
13127           /* Here, should be the first character of the
13128           * multi-char fold of , with containing the
13129           * whole thing. But, if this fold is not allowed (because of
13130           * the flags), will be the same as , and should
13131           * be processed like any other character, so skip the special
13132           * handling */
13133 1613808         if (folded != value) {
13134            
13135           /* Skip if we are recursed, currently parsing the class
13136           * again. Otherwise add this character to the list of
13137           * multi-char folds. */
13138 20436         if (! RExC_in_multi_char_class) {
13139           AV** this_array_ptr;
13140           AV* this_array;
13141 20436         STRLEN cp_count = utf8_length(foldbuf,
13142           foldbuf + foldlen);
13143 840176         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13144            
13145 1335729         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13146            
13147            
13148 902966         if (! multi_char_matches) {
13149 307656         multi_char_matches = newAV();
13150           }
13151            
13152           /* is actually an array of arrays.
13153           * There will be one or two top-level elements: [2],
13154           * and/or [3]. The [2] element is an array, each
13155           * element thereof is a character which folds to TWO
13156           * characters; [3] is for folds to THREE characters.
13157           * (Unicode guarantees a maximum of 3 characters in any
13158           * fold.) When we rewrite the character class below,
13159           * we will do so such that the longest folds are
13160           * written first, so that it prefers the longest
13161           * matching strings first. This is done even if it
13162           * turns out that any quantifier is non-greedy, out of
13163           * programmer laziness. Tom Christiansen has agreed
13164           * that this is ok. This makes the test for the
13165           * ligature 'ffi' come before the test for 'ff' */
13166 3307274         if (av_exists(multi_char_matches, cp_count)) {
13167 1013840         this_array_ptr = (AV**) av_fetch(multi_char_matches,
13168           cp_count, FALSE);
13169 1013840         this_array = *this_array_ptr;
13170           }
13171           else {
13172 1013840         this_array = newAV();
13173 757749         av_store(multi_char_matches, cp_count,
13174           (SV*) this_array);
13175           }
13176 455842         av_push(this_array, multi_fold);
13177           }
13178            
13179           /* This element should not be processed further in this
13180           * class */
13181 507114         element_count--;
13182 507114         value = save_value;
13183           prevvalue = save_prevvalue;
13184 175116         continue;
13185           }
13186           }
13187           }
13188            
13189           /* Deal with this element of the class */
13190 25336         if (! SIZE_ONLY) {
13191           #ifndef EBCDIC
13192 1013860         cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13193           #else
13194           SV* this_range = _new_invlist(1);
13195           _append_range_to_invlist(this_range, prevvalue, value);
13196            
13197           /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13198           * If this range was specified using something like 'i-j', we want
13199           * to include only the 'i' and the 'j', and not anything in
13200           * between, so exclude non-ASCII, non-alphabetics from it.
13201           * However, if the range was specified with something like
13202           * [\x89-\x91] or [\x89-j], all code points within it should be
13203           * included. literal_endpoint==2 means both ends of the range used
13204           * a literal character, not \x{foo} */
13205           if (literal_endpoint == 2
13206           && ((prevvalue >= 'a' && value <= 'z')
13207           || (prevvalue >= 'A' && value <= 'Z')))
13208           {
13209           _invlist_intersection(this_range, PL_ASCII,
13210           &this_range);
13211           _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13212           &this_range);
13213           }
13214           _invlist_union(cp_list, this_range, &cp_list);
13215           literal_endpoint = 0;
13216           #endif
13217           }
13218            
13219           range = 0; /* this range (if it was one) is done now */
13220           } /* End of loop through all the text within the brackets */
13221            
13222           /* If anything in the class expands to more than one character, we have to
13223           * deal with them by building up a substitute parse string, and recursively
13224           * calling reg() on it, instead of proceeding */
13225 1013876         if (multi_char_matches) {
13226 350232         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13227           I32 cp_count;
13228           STRLEN len;
13229 175116         char *save_end = RExC_end;
13230 350232         char *save_parse = RExC_parse;
13231           bool first_time = TRUE; /* First multi-char occurrence doesn't get
13232           a "|" */
13233           I32 reg_flags;
13234            
13235 663608         assert(! invert);
13236           #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
13237           because too confusing */
13238           if (invert) {
13239           sv_catpv(substitute_parse, "(?:");
13240           }
13241           #endif
13242            
13243           /* Look at the longest folds first */
13244 643172         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13245            
13246 1013840         if (av_exists(multi_char_matches, cp_count)) {
13247           AV** this_array_ptr;
13248           SV* this_sequence;
13249            
13250 1013840         this_array_ptr = (AV**) av_fetch(multi_char_matches,
13251           cp_count, FALSE);
13252 1013840         while ((this_sequence = av_pop(*this_array_ptr)) !=
13253           &PL_sv_undef)
13254           {
13255 1013840         if (! first_time) {
13256 6709310         sv_catpv(substitute_parse, "|");
13257           }
13258           first_time = FALSE;
13259            
13260 3355953         sv_catpv(substitute_parse, SvPVX(this_sequence));
13261           }
13262           }
13263           }
13264            
13265           /* If the character class contains anything else besides these
13266           * multi-character folds, have to include it in recursive parsing */
13267 931452         if (element_count) {
13268 931452         sv_catpv(substitute_parse, "|[");
13269 594446         sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13270 337006         sv_catpv(substitute_parse, "]");
13271           }
13272            
13273 18         sv_catpv(substitute_parse, ")");
13274           #if 0
13275           if (invert) {
13276           /* This is a way to get the parse to skip forward a whole named
13277           * sequence instead of matching the 2nd character when it fails the
13278           * first */
13279           sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13280           }
13281           #endif
13282            
13283 18         RExC_parse = SvPV(substitute_parse, len);
13284 18         RExC_end = RExC_parse + len;
13285 337006         RExC_in_multi_char_class = 1;
13286 36         RExC_emit = (regnode *)orig_emit;
13287            
13288 0         ret = reg(pRExC_state, 1, ®_flags, depth+1);
13289            
13290 24         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13291            
13292 18         RExC_parse = save_parse;
13293 24         RExC_end = save_end;
13294 24         RExC_in_multi_char_class = 0;
13295 337006         SvREFCNT_dec_NN(multi_char_matches);
13296 931452         return ret;
13297           }
13298            
13299           /* If the character class contains only a single element, it may be
13300           * optimizable into another node type which is smaller and runs faster.
13301           * Check if this is the case for this class */
13302 1785706         if (element_count == 1 && ! ret_invlist) {
13303           U8 op = END;
13304           U8 arg = 0;
13305            
13306 1280279         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13307           [:digit:] or \p{foo} */
13308            
13309           /* All named classes are mapped into POSIXish nodes, with its FLAG
13310           * argument giving which class it is */
13311 93600         switch ((I32)namedclass) {
13312           case ANYOF_UNIPROP:
13313           break;
13314            
13315           /* These don't depend on the charset modifiers. They always
13316           * match under /u rules */
13317           case ANYOF_NHORIZWS:
13318           case ANYOF_HORIZWS:
13319 1967138         namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13320           /* FALLTHROUGH */
13321            
13322           case ANYOF_NVERTWS:
13323           case ANYOF_VERTWS:
13324           op = POSIXU;
13325           goto join_posix;
13326            
13327           /* The actual POSIXish node for all the rest depends on the
13328           * charset modifier. The ones in the first set depend only on
13329           * ASCII or, if available on this platform, locale */
13330           case ANYOF_ASCII:
13331           case ANYOF_NASCII:
13332           #ifdef HAS_ISASCII
13333 1112920         op = (LOC) ? POSIXL : POSIXA;
13334           #else
13335           op = POSIXA;
13336           #endif
13337 785108         goto join_posix;
13338            
13339           case ANYOF_NCASED:
13340           case ANYOF_LOWER:
13341           case ANYOF_NLOWER:
13342           case ANYOF_UPPER:
13343           case ANYOF_NUPPER:
13344           /* under /a could be alpha */
13345 879713         if (FOLD) {
13346 745616         if (ASCII_RESTRICTED) {
13347 745616         namedclass = ANYOF_ALPHA + (namedclass % 2);
13348           }
13349 39488         else if (! LOC) {
13350           break;
13351           }
13352           }
13353           /* FALLTHROUGH */
13354            
13355           /* The rest have more possibilities depending on the charset.
13356           * We take advantage of the enum ordering of the charset
13357           * modifiers to get the exact node type, */
13358           default:
13359 39488         op = POSIXD + get_regex_charset(RExC_flags);
13360 785108         if (op > POSIXA) { /* /aa is same as /a */
13361           op = POSIXA;
13362           }
13363           #ifndef HAS_ISBLANK
13364           if (op == POSIXL
13365           && (namedclass == ANYOF_BLANK
13366           || namedclass == ANYOF_NBLANK))
13367           {
13368           op = POSIXA;
13369           }
13370           #endif
13371            
13372           join_posix:
13373           /* The odd numbered ones are the complements of the
13374           * next-lower even number one */
13375 677123         if (namedclass % 2 == 1) {
13376 462730         invert = ! invert;
13377 57268         namedclass--;
13378           }
13379 57268         arg = namedclass_to_classnum(namedclass);
13380 57268         break;
13381           }
13382           }
13383 40240         else if (value == prevvalue) {
13384            
13385           /* Here, the class consists of just a single code point */
13386            
13387 40240         if (invert) {
13388 34566         if (! LOC && value == '\n') {
13389           op = REG_ANY; /* Optimize [^\n] */
13390 34566         *flagp |= HASWIDTH|SIMPLE;
13391 34566         RExC_naughty++;
13392           }
13393           }
13394 77810         else if (value < 256 || UTF) {
13395            
13396           /* Optimize a single value into an EXACTish node, but not if it
13397           * would require converting the pattern to UTF-8. */
13398 77810         op = compute_EXACTish(pRExC_state);
13399           }
13400           } /* Otherwise is a range */
13401 77778         else if (! LOC) { /* locale could vary these */
13402 13030         if (prevvalue == '0') {
13403 13030         if (value == '9') {
13404           arg = _CC_DIGIT;
13405           op = POSIXA;
13406           }
13407           }
13408           }
13409            
13410           /* Here, we have changed away from its initial value iff we found
13411           * an optimization */
13412 11562         if (op != END) {
13413            
13414           /* Throw away this ANYOF regnode, and emit the calculated one,
13415           * which should correspond to the beginning, not current, state of
13416           * the parse */
13417 11562         const char * cur_parse = RExC_parse;
13418 32         RExC_parse = (char *)orig_parse;
13419 32         if ( SIZE_ONLY) {
13420 785140         if (! LOC) {
13421            
13422           /* To get locale nodes to not use the full ANYOF size would
13423           * require moving the code above that writes the portions
13424           * of it that aren't in other nodes to after this point.
13425           * e.g. ANYOF_CLASS_SET */
13426 614360         RExC_size = orig_size;
13427           }
13428           }
13429           else {
13430 327828         RExC_emit = (regnode *)orig_emit;
13431 214308         if (PL_regkind[op] == POSIXD) {
13432 813106         if (invert) {
13433 598814         op += NPOSIXD - POSIXD;
13434           }
13435           }
13436           }
13437            
13438 598846         ret = reg_node(pRExC_state, op);
13439            
13440 32         if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13441 598814         if (! SIZE_ONLY) {
13442 898221         FLAGS(ret) = arg;
13443           }
13444 53760         *flagp |= HASWIDTH|SIMPLE;
13445           }
13446 898253         else if (PL_regkind[op] == EXACT) {
13447 11552         alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13448           }
13449            
13450 643055         RExC_parse = (char *) cur_parse;
13451            
13452 587326         SvREFCNT_dec(posixes);
13453 32         SvREFCNT_dec(cp_list);
13454 931484         return ret;
13455           }
13456           }
13457            
13458 3355957         if (SIZE_ONLY)
13459           return ret;
13460           /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13461            
13462           /* If folding, we calculate all characters that could fold to or from the
13463           * ones already on the list */
13464 157112         if (FOLD && cp_list) {
13465           UV start, end; /* End points of code point ranges */
13466            
13467 7694         SV* fold_intersection = NULL;
13468            
13469           /* If the highest code point is within Latin1, we can use the
13470           * compiled-in Alphas list, and not have to go out to disk. This
13471           * yields two false positives, the masculine and feminine ordinal
13472           * indicators, which are weeded out below using the
13473           * IS_IN_SOME_FOLD_L1() macro */
13474 6814         if (invlist_highest(cp_list) < 256) {
13475 6814         _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13476           &fold_intersection);
13477           }
13478           else {
13479            
13480           /* Here, there are non-Latin1 code points, so we will have to go
13481           * fetch the list of all the characters that participate in folds
13482           */
13483 880         if (! PL_utf8_foldable) {
13484 98006         SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13485           &PL_sv_undef, 1, 0);
13486 98006         PL_utf8_foldable = _get_swash_invlist(swash);
13487 98006         SvREFCNT_dec_NN(swash);
13488           }
13489            
13490           /* This is a hash that for a particular fold gives all characters
13491           * that are involved in it */
13492 98006         if (! PL_utf8_foldclosures) {
13493            
13494           /* If we were unable to find any folds, then we likely won't be
13495           * able to find the closures. So just create an empty list.
13496           * Folding will effectively be restricted to the non-Unicode
13497           * rules hard-coded into Perl. (This case happens legitimately
13498           * during compilation of Perl itself before the Unicode tables
13499           * are generated) */
13500 98006         if (_invlist_len(PL_utf8_foldable) == 0) {
13501 83188         PL_utf8_foldclosures = newHV();
13502           }
13503           else {
13504           /* If the folds haven't been read in, call a fold function
13505           * to force that */
13506 83188         if (! PL_utf8_tofold) {
13507           U8 dummy[UTF8_MAXBYTES_CASE+1];
13508            
13509           /* This string is just a short named one above \xff */
13510 14818         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13511 98006         assert(PL_utf8_tofold); /* Verify that worked */
13512           }
13513 0         PL_utf8_foldclosures =
13514 0         _swash_inversion_hash(PL_utf8_tofold);
13515           }
13516           }
13517            
13518           /* Only the characters in this class that participate in folds need
13519           * be checked. Get the intersection of this class and all the
13520           * possible characters that are foldable. This can quickly narrow
13521           * down a large class */
13522 98006         _invlist_intersection(PL_utf8_foldable, cp_list,
13523           &fold_intersection);
13524           }
13525            
13526           /* Now look at the foldable characters in this class individually */
13527 3355953         invlist_iterinit(fold_intersection);
13528 180652         while (invlist_iternext(fold_intersection, &start, &end)) {
13529           UV j;
13530            
13531           /* Locale folding for Latin1 characters is deferred until runtime */
13532 180652         if (LOC && start < 256) {
13533 506         start = 256;
13534           }
13535            
13536           /* Look at every character in the range */
13537 504         for (j = start; j <= end; j++) {
13538            
13539           U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13540           STRLEN foldlen;
13541           SV** listp;
13542            
13543 504         if (j < 256) {
13544            
13545           /* We have the latin1 folding rules hard-coded here so that
13546           * an innocent-looking character class, like /[ks]/i won't
13547           * have to go out to disk to find the possible matches.
13548           * XXX It would be better to generate these via regen, in
13549           * case a new version of the Unicode standard adds new
13550           * mappings, though that is not really likely, and may be
13551           * caught by the default: case of the switch below. */
13552            
13553 504         if (IS_IN_SOME_FOLD_L1(j)) {
13554            
13555           /* ASCII is always matched; non-ASCII is matched only
13556           * under Unicode rules */
13557 0         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13558 0         cp_list =
13559 504         add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13560           }
13561           else {
13562 506         depends_list =
13563 506         add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13564           }
13565           }
13566            
13567 180146         if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13568 180652         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13569           {
13570           /* Certain Latin1 characters have matches outside
13571           * Latin1. To get here, is one of those
13572           * characters. None of these matches is valid for
13573           * ASCII characters under /aa, which is why the 'if'
13574           * just above excludes those. These matches only
13575           * happen when the target string is utf8. The code
13576           * below adds the single fold closures for to the
13577           * inversion list. */
13578 180170         switch (j) {
13579           case 'k':
13580           case 'K':
13581 3355953         cp_list =
13582 1565073         add_cp_to_invlist(cp_list, KELVIN_SIGN);
13583 949782         break;
13584           case 's':
13585           case 'S':
13586 885746         cp_list = add_cp_to_invlist(cp_list,
13587           LATIN_SMALL_LETTER_LONG_S);
13588 885746         break;
13589           case MICRO_SIGN:
13590 885746         cp_list = add_cp_to_invlist(cp_list,
13591           GREEK_CAPITAL_LETTER_MU);
13592 64         cp_list = add_cp_to_invlist(cp_list,
13593           GREEK_SMALL_LETTER_MU);
13594 3355953         break;
13595           case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13596           case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13597 3120         cp_list =
13598 3120         add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13599 3120         break;
13600           case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13601 0         cp_list = add_cp_to_invlist(cp_list,
13602           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13603 3120         break;
13604           case LATIN_SMALL_LETTER_SHARP_S:
13605 3814846         cp_list = add_cp_to_invlist(cp_list,
13606           LATIN_CAPITAL_LETTER_SHARP_S);
13607 195360         break;
13608           case 'F': case 'f':
13609           case 'I': case 'i':
13610           case 'L': case 'l':
13611           case 'T': case 't':
13612           case 'A': case 'a':
13613           case 'H': case 'h':
13614           case 'J': case 'j':
13615           case 'N': case 'n':
13616           case 'W': case 'w':
13617           case 'Y': case 'y':
13618           /* These all are targets of multi-character
13619           * folds from code points that require UTF8 to
13620           * express, so they can't match unless the
13621           * target string is in UTF-8, so no action here
13622           * is necessary, as regexec.c properly handles
13623           * the general case for UTF-8 matching and
13624           * multi-char folds */
13625           break;
13626           default:
13627           /* Use deprecated warning to increase the
13628           * chances of this being output */
13629 3352833         ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13630 3327701         break;
13631           }
13632           }
13633 3164785         continue;
13634           }
13635            
13636           /* Here is an above Latin1 character. We don't have the rules
13637           * hard-coded for it. First, get its fold. This is the simple
13638           * fold, as the multi-character folds have been handled earlier
13639           * and separated out */
13640 3091451         _to_uni_fold_flags(j, foldbuf, &foldlen,
13641           ((LOC)
13642           ? FOLD_FLAGS_LOCALE
13643           : (ASCII_FOLD_RESTRICTED)
13644           ? FOLD_FLAGS_NOMIX_ASCII
13645           : 0));
13646            
13647           /* Single character fold of above Latin1. Add everything in
13648           * its fold closure to the list that this node should match.
13649           * The fold closures data structure is a hash with the keys
13650           * being the UTF-8 of every character that is folded to, like
13651           * 'k', and the values each an array of all code points that
13652           * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
13653           * Multi-character folds are not included */
13654 3090783         if ((listp = hv_fetch(PL_utf8_foldclosures,
13655           (char *) foldbuf, foldlen, FALSE)))
13656           {
13657 3088039         AV* list = (AV*) *listp;
13658           IV k;
13659 3088039         for (k = 0; k <= av_len(list); k++) {
13660 3088039         SV** c_p = av_fetch(list, k, FALSE);
13661           UV c;
13662 124         if (c_p == NULL) {
13663 3087915         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13664           }
13665 1505627         c = SvUV(*c_p);
13666            
13667           /* /aa doesn't allow folds between ASCII and non-; /l
13668           * doesn't allow them between above and below 256 */
13669 54984         if ((ASCII_FOLD_RESTRICTED
13670 780         && (isASCII(c) != isASCII(j)))
13671 780         || (LOC && c < 256)) {
13672 0         continue;
13673           }
13674            
13675           /* Folds involving non-ascii Latin1 characters
13676           * under /d are added to a separate list */
13677 0         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13678           {
13679 0         cp_list = add_cp_to_invlist(cp_list, c);
13680           }
13681           else {
13682 0         depends_list = add_cp_to_invlist(depends_list, c);
13683           }
13684           }
13685           }
13686           }
13687           }
13688 0         SvREFCNT_dec_NN(fold_intersection);
13689           }
13690            
13691           /* And combine the result (if any) with any inversion list from posix
13692           * classes. The lists are kept separate up to now because we don't want to
13693           * fold the classes (folding of those is automatically handled by the swash
13694           * fetching code) */
13695 2         if (posixes) {
13696 0         if (! DEPENDS_SEMANTICS) {
13697 0         if (cp_list) {
13698 0         _invlist_union(cp_list, posixes, &cp_list);
13699 1582288         SvREFCNT_dec_NN(posixes);
13700           }
13701           else {
13702 984682         cp_list = posixes;
13703           }
13704           }
13705           else {
13706           /* Under /d, we put into a separate list the Latin1 things that
13707           * match only when the target string is utf8 */
13708 166         SV* nonascii_but_latin1_properties = NULL;
13709 166         _invlist_intersection(posixes, PL_Latin1,
13710           &nonascii_but_latin1_properties);
13711 984516         _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13712           &nonascii_but_latin1_properties);
13713 3280         _invlist_subtract(posixes, nonascii_but_latin1_properties,
13714           &posixes);
13715 3250         if (cp_list) {
13716 272         _invlist_union(cp_list, posixes, &cp_list);
13717 272         SvREFCNT_dec_NN(posixes);
13718           }
13719           else {
13720 3088039         cp_list = posixes;
13721           }
13722            
13723 3088039         if (depends_list) {
13724 1342         _invlist_union(depends_list, nonascii_but_latin1_properties,
13725           &depends_list);
13726 1342         SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13727           }
13728           else {
13729 1342         depends_list = nonascii_but_latin1_properties;
13730           }
13731           }
13732           }
13733            
13734           /* And combine the result (if any) with any inversion list from properties.
13735           * The lists are kept separate up to now so that we can distinguish the two
13736           * in regards to matching above-Unicode. A run-time warning is generated
13737           * if a Unicode property is matched against a non-Unicode code point. But,
13738           * we allow user-defined properties to match anything, without any warning,
13739           * and we also suppress the warning if there is a portion of the character
13740           * class that isn't a Unicode property, and which matches above Unicode, \W
13741           * or [\x{110000}] for example.
13742           * (Note that in this case, unlike the Posix one above, there is no
13743           * , because having a Unicode property forces Unicode
13744           * semantics */
13745 1344         if (properties) {
13746 1342         bool warn_super = ! has_user_defined_property;
13747 780         if (cp_list) {
13748            
13749           /* If it matters to the final outcome, see if a non-property
13750           * component of the class matches above Unicode. If so, the
13751           * warning gets suppressed. This is true even if just a single
13752           * such code point is specified, as though not strictly correct if
13753           * another such code point is matched against, the fact that they
13754           * are using above-Unicode code points indicates they should know
13755           * the issues involved */
13756 1342         if (warn_super) {
13757 1342         bool non_prop_matches_above_Unicode =
13758           runtime_posix_matches_above_Unicode
13759 3351491         | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13760 3351491         if (invert) {
13761 3326359         non_prop_matches_above_Unicode =
13762 11152661         ! non_prop_matches_above_Unicode;
13763           }
13764 8632132         warn_super = ! non_prop_matches_above_Unicode;
13765           }
13766            
13767 653708         _invlist_union(properties, cp_list, &cp_list);
13768 8632132         SvREFCNT_dec_NN(properties);
13769           }
13770           else {
13771 7826302         cp_list = properties;
13772           }
13773            
13774 276181514         if (warn_super) {
13775 268355212         OP(ret) = ANYOF_WARN_SUPER;
13776           }
13777           }
13778            
13779           /* Here, we have calculated what code points should be in the character
13780           * class.
13781           *
13782           * Now we can see about various optimizations. Fold calculation (which we
13783           * did above) needs to take place before inversion. Otherwise /[^k]/i
13784           * would invert to include K, which under /i would match k, which it
13785           * shouldn't. Therefore we can't invert folded locale now, as it won't be
13786           * folded until runtime */
13787            
13788           /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13789           * at compile time. Besides not inverting folded locale now, we can't
13790           * invert if there are things such as \w, which aren't known until runtime
13791           * */
13792 268355214         if (invert
13793 3326359         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13794 3326359         && ! depends_list
13795 3176295         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13796           {
13797 4954799         _invlist_invert(cp_list);
13798            
13799           /* Any swash can't be used as-is, because we've inverted things */
13800 1865159         if (swash) {
13801 1865159         SvREFCNT_dec_NN(swash);
13802           swash = NULL;
13803           }
13804            
13805           /* Clear the invert flag since have just done it here */
13806           invert = FALSE;
13807           }
13808            
13809 3351493         if (ret_invlist) {
13810 162916         *ret_invlist = cp_list;
13811 3351491         SvREFCNT_dec(swash);
13812            
13813           /* Discard the generated node */
13814 137370         if (SIZE_ONLY) {
13815 98250         RExC_size = orig_size;
13816           }
13817           else {
13818 98250         RExC_emit = orig_emit;
13819           }
13820           return orig_emit;
13821           }
13822            
13823           /* If we didn't do folding, it's because some information isn't available
13824           * until runtime; set the run-time fold flag for these. (We don't have to
13825           * worry about properties folding, as that is taken care of by the swash
13826           * fetching) */
13827 39122         if (FOLD && LOC)
13828           {
13829 3351491         ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13830           }
13831            
13832           /* Some character classes are equivalent to other nodes. Such nodes take
13833           * up less room and generally fewer operations to execute than ANYOF nodes.
13834           * Above, we checked for and optimized into some such equivalents for
13835           * certain common classes that are easy to test. Getting to this point in
13836           * the code means that the class didn't get optimized there. Since this
13837           * code is only executed in Pass 2, it is too late to save space--it has
13838           * been allocated in Pass 1, and currently isn't given back. But turning
13839           * things into an EXACTish node can allow the optimizer to join it to any
13840           * adjacent such nodes. And if the class is equivalent to things like /./,
13841           * expensive run-time swashes can be avoided. Now that we have more
13842           * complete information, we can find things necessarily missed by the
13843           * earlier code. I (khw) am not sure how much to look for here. It would
13844           * be easy, but perhaps too slow, to check any candidates against all the
13845           * node types they could possibly match using _invlistEQ(). */
13846            
13847 278         if (cp_list
13848 3351493         && ! invert
13849 1851173         && ! depends_list
13850 1824923         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13851 1526572         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13852           {
13853           UV start, end;
13854           U8 op = END; /* The optimzation node-type */
13855 2338313         const char * cur_parse= RExC_parse;
13856            
13857 1526572         invlist_iterinit(cp_list);
13858 90254         if (! invlist_iternext(cp_list, &start, &end)) {
13859            
13860           /* Here, the list is empty. This happens, for example, when a
13861           * Unicode property is the only thing in the character class, and
13862           * it doesn't match anything. (perluniprops.pod notes such
13863           * properties) */
13864           op = OPFAIL;
13865 90252         *flagp |= HASWIDTH|SIMPLE;
13866           }
13867 1436320         else if (start == end) { /* The range is a single code point */
13868 1436318         if (! invlist_iternext(cp_list, &start, &end)
13869            
13870           /* Don't do this optimization if it would require changing
13871           * the pattern to UTF-8 */
13872 1410068         && (start < 256 || UTF))
13873           {
13874           /* Here, the list contains a single code point. Can optimize
13875           * into an EXACT node */
13876            
13877 1410068         value = start;
13878            
13879 1526570         if (! FOLD) {
13880           op = EXACT;
13881           }
13882 1526570         else if (LOC) {
13883            
13884           /* A locale node under folding with one code point can be
13885           * an EXACTFL, as its fold won't be calculated until
13886           * runtime */
13887           op = EXACTFL;
13888           }
13889           else {
13890            
13891           /* Here, we are generally folding, but there is only one
13892           * code point to match. If we have to, we use an EXACT
13893           * node, but it would be better for joining with adjacent
13894           * nodes in the optimization pass if we used the same
13895           * EXACTFish node that any such are likely to be. We can
13896           * do this iff the code point doesn't participate in any
13897           * folds. For example, an EXACTF of a colon is the same as
13898           * an EXACT one, since nothing folds to or from a colon. */
13899 1526570         if (value < 256) {
13900 1526570         if (IS_IN_SOME_FOLD_L1(value)) {
13901           op = EXACT;
13902           }
13903           }
13904           else {
13905 3351491         if (! PL_utf8_foldable) {
13906 5838272         SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13907           &PL_sv_undef, 1, 0);
13908 11470056         PL_utf8_foldable = _get_swash_invlist(swash);
13909 12016300         SvREFCNT_dec_NN(swash);
13910           }
13911 1073048         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13912           op = EXACT;
13913           }
13914           }
13915            
13916           /* If we haven't found the node type, above, it means we
13917           * can use the prevailing one */
13918 0         if (op == END) {
13919 165799008         op = compute_EXACTish(pRExC_state);
13920           }
13921           }
13922           }
13923           }
13924 196100742         else if (start == 0) {
13925 225688232         if (end == UV_MAX) {
13926           op = SANY;
13927 190061120         *flagp |= HASWIDTH|SIMPLE;
13928 22476186         RExC_naughty++;
13929           }
13930 8990800         else if (end == '\n' - 1
13931 6808         && invlist_iternext(cp_list, &start, &end)
13932 5856         && start == '\n' + 1 && end == UV_MAX)
13933           {
13934           op = REG_ANY;
13935 28         *flagp |= HASWIDTH|SIMPLE;
13936 5828         RExC_naughty++;
13937           }
13938           }
13939 954         invlist_iterfinish(cp_list);
13940            
13941 954         if (op != END) {
13942 225687252         RExC_parse = (char *)orig_parse;
13943 81662828         RExC_emit = (regnode *)orig_emit;
13944            
13945 58815224         ret = reg_node(pRExC_state, op);
13946            
13947 58815224         RExC_parse = (char *)cur_parse;
13948            
13949 22847604         if (PL_regkind[op] == EXACT) {
13950 1073048         alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13951           }
13952            
13953 1073048         SvREFCNT_dec_NN(cp_list);
13954 165798980         return ret;
13955           }
13956           }
13957            
13958           /* Here, contains all the code points we can determine at
13959           * compile time that match under all conditions. Go through it, and
13960           * for things that belong in the bitmap, put them there, and delete from
13961           * . While we are at it, see if everything above 255 is in the
13962           * list, and if so, set a flag to speed up execution */
13963 85740864         ANYOF_BITMAP_ZERO(ret);
13964 85740864         if (cp_list) {
13965            
13966           /* This gets set if we actually need to modify things */
13967           bool change_invlist = FALSE;
13968            
13969           UV start, end;
13970            
13971           /* Start looking through */
13972 85740864         invlist_iterinit(cp_list);
13973 44009311         while (invlist_iternext(cp_list, &start, &end)) {
13974           UV high;
13975           int i;
13976            
13977 44009307         if (end == UV_MAX && start <= 256) {
13978 41731557         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13979           }
13980            
13981           /* Quit if are above what we should change */
13982 2         if (start > 255) {
13983           break;
13984           }
13985            
13986           change_invlist = TRUE;
13987            
13988           /* Set all the bits in the range, up to the max that we are doing */
13989 41731559         high = (end < 255) ? end : 255;
13990 41731563         for (i = start; i <= (int) high; i++) {
13991 41731561         if (! ANYOF_BITMAP_TEST(ret, i)) {
13992 64131094         ANYOF_BITMAP_SET(ret, i);
13993           }
13994           }
13995           }
13996 31165056         invlist_iterfinish(cp_list);
13997            
13998           /* Done with loop; remove any code points that are in the bitmap from
13999           * */
14000 31165056         if (change_invlist) {
14001 31165056         _invlist_subtract(cp_list, PL_Latin1, &cp_list);
14002           }
14003            
14004           /* If have completely emptied it, remove it completely */
14005 15782902         if (_invlist_len(cp_list) == 0) {
14006 15782902         SvREFCNT_dec_NN(cp_list);
14007 15382156         cp_list = NULL;
14008           }
14009           }
14010            
14011 2         if (invert) {
14012 15382154         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14013           }
14014            
14015           /* Here, the bitmap has been populated with all the Latin1 code points that
14016           * always match. Can now add to the overall list those that match only
14017           * when the target string is UTF-8 (). */
14018 15382156         if (depends_list) {
14019 15382154         if (cp_list) {
14020 23385730         _invlist_union(cp_list, depends_list, &cp_list);
14021 264068         SvREFCNT_dec_NN(depends_list);
14022           }
14023           else {
14024 22860448         cp_list = depends_list;
14025           }
14026           }
14027            
14028           /* If there is a swash and more than one element, we can't use the swash in
14029           * the optimization below. */
14030 22860450         if (swash && element_count > 1) {
14031 22860448         SvREFCNT_dec_NN(swash);
14032           swash = NULL;
14033           }
14034            
14035 22860450         if (! cp_list
14036 11448702         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14037           {
14038 28493730         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
14039           }
14040           else {
14041           /* av[0] stores the character class description in its textual form:
14042           * used later (regexec.c:Perl_regclass_swash()) to initialize the
14043           * appropriate swash, and is also useful for dumping the regnode.
14044           * av[1] if NULL, is a placeholder to later contain the swash computed
14045           * from av[0]. But if no further computation need be done, the
14046           * swash is stored there now.
14047           * av[2] stores the cp_list inversion list for use in addition or
14048           * instead of av[0]; used only if av[1] is NULL
14049           * av[3] is set if any component of the class is from a user-defined
14050           * property; used only if av[1] is NULL */
14051 11411748         AV * const av = newAV();
14052           SV *rv;
14053            
14054 11411748         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14055           ? SvREFCNT_inc(listsv) : &PL_sv_undef);
14056 11411748         if (swash) {
14057 11411748         av_store(av, 1, swash);
14058 5826930         SvREFCNT_dec_NN(cp_list);
14059           }
14060           else {
14061 29986         av_store(av, 1, NULL);
14062 2754         if (cp_list) {
14063 29986         av_store(av, 2, cp_list);
14064 2754         av_store(av, 3, newSVuv(has_user_defined_property));
14065           }
14066           }
14067            
14068 78024679         rv = newRV_noinc(MUTABLE_SV(av));
14069 66612931         n = add_data(pRExC_state, 1, "s");
14070 11411748         RExC_rxi->data->data[n] = (void*)rv;
14071 11411748         ARG_SET(ret, n);
14072           }
14073            
14074 11411750         *flagp |= HASWIDTH|SIMPLE;
14075 99636294         return ret;
14076           }
14077           #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14078            
14079            
14080           /* reg_skipcomment()
14081            
14082           Absorbs an /x style # comments from the input stream.
14083           Returns true if there is more text remaining in the stream.
14084           Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14085           terminates the pattern without including a newline.
14086            
14087           Note its the callers responsibility to ensure that we are
14088           actually in /x mode
14089            
14090           */
14091            
14092           STATIC bool
14093 99636292         S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14094           {
14095           bool ended = 0;
14096            
14097 99636292         PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14098            
14099 102950058         while (RExC_parse < RExC_end)
14100 102950058         if (*RExC_parse++ == '\n') {
14101           ended = 1;
14102           break;
14103           }
14104 50675954         if (!ended) {
14105           /* we ran off the end of the pattern without ending
14106           the comment, so we have to add an \n when wrapping */
14107 241953         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14108 50434001         return 0;
14109           } else
14110           return 1;
14111           }
14112            
14113           /* nextchar()
14114            
14115           Advances the parse position, and optionally absorbs
14116           "whitespace" from the inputstream.
14117            
14118           Without /x "whitespace" means (?#...) style comments only,
14119           with /x this means (?#...) and # comments and whitespace proper.
14120            
14121           Returns the RExC_parse point from BEFORE the scan occurs.
14122            
14123           This is the /x friendly way of saying RExC_parse++.
14124           */
14125            
14126           STATIC char*
14127 1628         S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14128           {
14129 1628         char* const retval = RExC_parse++;
14130            
14131 1628         PERL_ARGS_ASSERT_NEXTCHAR;
14132            
14133           for (;;) {
14134 1628         if (RExC_end - RExC_parse >= 3
14135 2534004         && *RExC_parse == '('
14136 2533104         && RExC_parse[1] == '?'
14137 300         && RExC_parse[2] == '#')
14138           {
14139 0         while (*RExC_parse != ')') {
14140 0         if (RExC_parse == RExC_end)
14141 20183530         FAIL("Sequence (?#... not terminated");
14142 20183530         RExC_parse++;
14143           }
14144 15671000         RExC_parse++;
14145 6763936         continue;
14146           }
14147 4514158         if (RExC_flags & RXf_PMf_EXTENDED) {
14148 4512530         if (isSPACE(*RExC_parse)) {
14149 20183530         RExC_parse++;
14150 20183210         continue;
14151           }
14152 20183210         else if (*RExC_parse == '#') {
14153 20183210         if ( reg_skipcomment( pRExC_state ) )
14154 20183210         continue;
14155           }
14156           }
14157 20184838         return retval;
14158           }
14159           }
14160            
14161           /*
14162           - reg_node - emit a node
14163           */
14164           STATIC regnode * /* Location. */
14165 34696613         S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14166           {
14167           dVAR;
14168           regnode *ptr;
14169 20184350         regnode * const ret = RExC_emit;
14170 20184350         GET_RE_DEBUG_FLAGS_DECL;
14171            
14172 20184350         PERL_ARGS_ASSERT_REG_NODE;
14173            
14174 20184350         if (SIZE_ONLY) {
14175           SIZE_ALIGN(RExC_size);
14176 20183952         RExC_size += 1;
14177 15723825         return(ret);
14178           }
14179 15723801         if (RExC_emit >= RExC_emit_bound)
14180 15723403         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14181           op, RExC_emit, RExC_emit_bound);
14182            
14183 15723735         NODE_ALIGN_FILL(ret);
14184           ptr = ret;
14185 464         FILL_ADVANCE_NODE(ptr, op);
14186           #ifdef RE_TRACK_PATTERN_OFFSETS
14187 464         if (RExC_offsets) { /* MJD */
14188 408         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14189           "reg_node", __LINE__,
14190           PL_reg_name[op],
14191           (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14192           ? "Overwriting end of array!\n" : "OK",
14193           (UV)(RExC_emit - RExC_emit_start),
14194           (UV)(RExC_parse - RExC_start),
14195           (UV)RExC_offsets[0]));
14196 408         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14197           }
14198           #endif
14199 408         RExC_emit = ptr;
14200 408         return(ret);
14201           }
14202            
14203           /*
14204           - reganode - emit a node with an argument
14205           */
14206           STATIC regnode * /* Location. */
14207 230         S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14208           {
14209           dVAR;
14210           regnode *ptr;
14211 230         regnode * const ret = RExC_emit;
14212 15723623         GET_RE_DEBUG_FLAGS_DECL;
14213            
14214 15723623         PERL_ARGS_ASSERT_REGANODE;
14215            
14216 15723623         if (SIZE_ONLY) {
14217           SIZE_ALIGN(RExC_size);
14218 15723513         RExC_size += 2;
14219           /*
14220           We can't do this:
14221          
14222           assert(2==regarglen[op]+1);
14223            
14224           Anything larger than this has to allocate the extra amount.
14225           If we changed this to be:
14226          
14227           RExC_size += (1 + regarglen[op]);
14228          
14229           then it wouldn't matter. Its not clear what side effect
14230           might come from that so its not done so far.
14231           -- dmq
14232           */
14233 15723513         return(ret);
14234           }
14235 15723513         if (RExC_emit >= RExC_emit_bound)
14236 15723403         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14237           op, RExC_emit, RExC_emit_bound);
14238            
14239 15723513         NODE_ALIGN_FILL(ret);
14240           ptr = ret;
14241 15723513         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14242           #ifdef RE_TRACK_PATTERN_OFFSETS
14243 15723513         if (RExC_offsets) { /* MJD */
14244 15723513         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14245           "reganode",
14246           __LINE__,
14247           PL_reg_name[op],
14248           (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14249           "Overwriting end of array!\n" : "OK",
14250           (UV)(RExC_emit - RExC_emit_start),
14251           (UV)(RExC_parse - RExC_start),
14252           (UV)RExC_offsets[0]));
14253 15723513         Set_Cur_Node_Offset;
14254           }
14255           #endif
14256 15723513         RExC_emit = ptr;
14257 15723513         return(ret);
14258           }
14259            
14260           /*
14261           - reguni - emit (if appropriate) a Unicode character
14262           */
14263           STATIC STRLEN
14264 15723459         S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14265           {
14266           dVAR;
14267            
14268 15723459         PERL_ARGS_ASSERT_REGUNI;
14269            
14270 23386600         return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14271           }
14272            
14273           /*
14274           - reginsert - insert an operator in front of already-emitted operand
14275           *
14276           * Means relocating the operand.
14277           */
14278           STATIC void
14279 15723419         S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14280           {
14281           dVAR;
14282           regnode *src;
14283           regnode *dst;
14284           regnode *place;
14285 15723419         const int offset = regarglen[(U8)op];
14286 4512440         const int size = NODE_STEP_REGNODE + offset;
14287 4512440         GET_RE_DEBUG_FLAGS_DECL;
14288            
14289 4512440         PERL_ARGS_ASSERT_REGINSERT;
14290           PERL_UNUSED_ARG(depth);
14291           /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14292 5266         DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14293 3606         if (SIZE_ONLY) {
14294 3328         RExC_size += size;
14295 4512448         return;
14296           }
14297            
14298 1293102         src = RExC_emit;
14299 4207675         RExC_emit += size;
14300 2268574         dst = RExC_emit;
14301 1586628         if (RExC_open_parens) {
14302           int paren;
14303           /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14304 1928673         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14305 210870         if ( RExC_open_parens[paren] >= opnd ) {
14306           /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14307 210870         RExC_open_parens[paren] += size;
14308           } else {
14309           /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14310           }
14311 12076         if ( RExC_close_parens[paren] >= opnd ) {
14312           /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14313 12076         RExC_close_parens[paren] += size;
14314           } else {
14315           /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14316           }
14317           }
14318           }
14319            
14320 12100         while (src > opnd) {
14321 12092         StructCopy(--src, --dst, regnode);
14322           #ifdef RE_TRACK_PATTERN_OFFSETS
14323 12092         if (RExC_offsets) { /* MJD 20010112 */
14324 12092         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14325           "reg_insert",
14326           __LINE__,
14327           PL_reg_name[op],
14328           (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14329           ? "Overwriting end of array!\n" : "OK",
14330           (UV)(src - RExC_emit_start),
14331           (UV)(dst - RExC_emit_start),
14332           (UV)RExC_offsets[0]));
14333 12092         Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14334 355330         Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14335           }
14336           #endif
14337           }
14338          
14339            
14340           place = opnd; /* Op node, where operand used to be. */
14341           #ifdef RE_TRACK_PATTERN_OFFSETS
14342 355322         if (RExC_offsets) { /* MJD */
14343 355322         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14344           "reginsert",
14345           __LINE__,
14346           PL_reg_name[op],
14347           (UV)(place - RExC_emit_start) > RExC_offsets[0]
14348           ? "Overwriting end of array!\n" : "OK",
14349           (UV)(place - RExC_emit_start),
14350           (UV)(RExC_parse - RExC_start),
14351           (UV)RExC_offsets[0]));
14352 355322         Set_Node_Offset(place, RExC_parse);
14353 355322         Set_Node_Length(place, 1);
14354           }
14355           #endif
14356 355322         src = NEXTOPER(place);
14357 355322         FILL_ADVANCE_NODE(place, op);
14358 248418         Zero(src, offset, regnode);
14359           }
14360            
14361           /*
14362           - regtail - set the next-pointer at the end of a node chain of p to val.
14363           - SEE ALSO: regtail_study
14364           */
14365           /* TODO: All three parms should be const */
14366           STATIC void
14367 355934         S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14368           {
14369           dVAR;
14370           regnode *scan;
14371 127682         GET_RE_DEBUG_FLAGS_DECL;
14372            
14373 355934         PERL_ARGS_ASSERT_REGTAIL;
14374           #ifndef DEBUGGING
14375           PERL_UNUSED_ARG(depth);
14376           #endif
14377            
14378 355934         if (SIZE_ONLY)
14379 620         return;
14380            
14381           /* Find last node. */
14382           scan = p;
14383           for (;;) {
14384 1293620         regnode * const temp = regnext(scan);
14385 1293620         DEBUG_PARSE_r({
14386           SV * const mysv=sv_newmortal();
14387           DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14388           regprop(RExC_rx, mysv, scan);
14389           PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14390           SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14391           (temp == NULL ? "->" : ""),
14392           (temp == NULL ? PL_reg_name[OP(val)] : "")
14393           );
14394           });
14395 4512950         if (temp == NULL)
14396           break;
14397           scan = temp;
14398           }
14399            
14400 4512734         if (reg_off_by_arg[OP(scan)]) {
14401 264077073         ARG_SET(scan, val - scan);
14402           }
14403           else {
14404 264077383         NEXT_OFF(scan) = val - scan;
14405           }
14406           }
14407            
14408           #ifdef DEBUGGING
14409           /*
14410           - regtail_study - set the next-pointer at the end of a node chain of p to val.
14411           - Look for optimizable sequences at the same time.
14412           - currently only looks for EXACT chains.
14413            
14414           This is experimental code. The idea is to use this routine to perform
14415           in place optimizations on branches and groups as they are constructed,
14416           with the long term intention of removing optimization from study_chunk so
14417           that it is purely analytical.
14418            
14419           Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14420           to control which is which.
14421            
14422           */
14423           /* TODO: All four parms should be const */
14424            
14425           STATIC U8
14426 264077129         S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14427           {
14428           dVAR;
14429           regnode *scan;
14430           U8 exact = PSEUDO;
14431           #ifdef EXPERIMENTAL_INPLACESCAN
14432           I32 min = 0;
14433           #endif
14434 56         GET_RE_DEBUG_FLAGS_DECL;
14435            
14436 264077129         PERL_ARGS_ASSERT_REGTAIL_STUDY;
14437            
14438            
14439 264077129         if (SIZE_ONLY)
14440           return exact;
14441            
14442           /* Find last node. */
14443            
14444           scan = p;
14445           for (;;) {
14446 233538108         regnode * const temp = regnext(scan);
14447           #ifdef EXPERIMENTAL_INPLACESCAN
14448           if (PL_regkind[OP(scan)] == EXACT) {
14449           bool has_exactf_sharp_s; /* Unexamined in this routine */
14450           if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14451           return EXACT;
14452           }
14453           #endif
14454 298         if ( exact ) {
14455 298         switch (OP(scan)) {
14456           case EXACT:
14457           case EXACTF:
14458           case EXACTFA_NO_TRIE:
14459           case EXACTFA:
14460           case EXACTFU:
14461           case EXACTFU_SS:
14462           case EXACTFL:
14463 296         if( exact == PSEUDO )
14464 294         exact= OP(scan);
14465 2         else if ( exact != OP(scan) )
14466           exact= 0;
14467           case NOTHING:
14468           break;
14469           default:
14470           exact= 0;
14471           }
14472           }
14473 298         DEBUG_PARSE_r({
14474           SV * const mysv=sv_newmortal();
14475           DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14476           regprop(RExC_rx, mysv, scan);
14477           PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14478           SvPV_nolen_const(mysv),
14479           REG_NODE_NUM(scan),
14480           PL_reg_name[exact]);
14481           });
14482 58         if (temp == NULL)
14483           break;
14484           scan = temp;
14485           }
14486 296         DEBUG_PARSE_r({
14487           SV * const mysv_val=sv_newmortal();
14488           DEBUG_PARSE_MSG("");
14489           regprop(RExC_rx, mysv_val, val);
14490           PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14491           SvPV_nolen_const(mysv_val),
14492           (IV)REG_NODE_NUM(val),
14493           (IV)(val - scan)
14494           );
14495           });
14496 296         if (reg_off_by_arg[OP(scan)]) {
14497 240         ARG_SET(scan, val - scan);
14498           }
14499           else {
14500 296         NEXT_OFF(scan) = val - scan;
14501           }
14502            
14503           return exact;
14504           }
14505           #endif
14506            
14507           /*
14508           - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14509           */
14510           #ifdef DEBUGGING
14511            
14512           static void
14513 240         S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
14514           {
14515           int bit;
14516           int set=0;
14517            
14518 240         for (bit=0; bit<32; bit++) {
14519 240         if (flags & (1<
14520 240         if (!set++ && lead)
14521 240         PerlIO_printf(Perl_debug_log, "%s",lead);
14522 0         PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
14523           }
14524           }
14525 240         if (lead) {
14526 240         if (set)
14527 240         PerlIO_printf(Perl_debug_log, "\n");
14528           else
14529 529658         PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14530           }
14531 529658         }
14532            
14533           static void
14534 226366         S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14535           {
14536           int bit;
14537           int set=0;
14538           regex_charset cs;
14539            
14540 226366         for (bit=0; bit<32; bit++) {
14541 447208         if (flags & (1<
14542 220842         if ((1<
14543 147228         continue;
14544           }
14545 147228         if (!set++ && lead)
14546 147228         PerlIO_printf(Perl_debug_log, "%s",lead);
14547 146950         PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14548           }
14549           }
14550 146950         if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14551 146950         if (!set++ && lead) {
14552 123588305         PerlIO_printf(Perl_debug_log, "%s",lead);
14553           }
14554 0         switch (cs) {
14555           case REGEX_UNICODE_CHARSET:
14556 0         PerlIO_printf(Perl_debug_log, "UNICODE");
14557 0         break;
14558           case REGEX_LOCALE_CHARSET:
14559 0         PerlIO_printf(Perl_debug_log, "LOCALE");
14560 0         break;
14561           case REGEX_ASCII_RESTRICTED_CHARSET:
14562 0         PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14563 0         break;
14564           case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14565 0         PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14566 0         break;
14567           default:
14568 0         PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14569 0         break;
14570           }
14571           }
14572 0         if (lead) {
14573 0         if (set)
14574 0         PerlIO_printf(Perl_debug_log, "\n");
14575           else
14576 0         PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14577           }
14578 0         }
14579           #endif
14580            
14581           void
14582 132         Perl_regdump(pTHX_ const regexp *r)
14583           {
14584           #ifdef DEBUGGING
14585           dVAR;
14586 132         SV * const sv = sv_newmortal();
14587 132         SV *dsv= sv_newmortal();
14588 132         RXi_GET_DECL(r,ri);
14589 132         GET_RE_DEBUG_FLAGS_DECL;
14590            
14591 132         PERL_ARGS_ASSERT_REGDUMP;
14592            
14593 132         (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14594            
14595           /* Header fields of interest. */
14596 132         if (r->anchored_substr) {
14597 96         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14598           RE_SV_DUMPLEN(r->anchored_substr), 30);
14599 192         PerlIO_printf(Perl_debug_log,
14600           "anchored %s%s at %"IVdf" ",
14601 96         s, RE_SV_TAIL(r->anchored_substr),
14602 96         (IV)r->anchored_offset);
14603 36         } else if (r->anchored_utf8) {
14604 20         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14605           RE_SV_DUMPLEN(r->anchored_utf8), 30);
14606 40         PerlIO_printf(Perl_debug_log,
14607           "anchored utf8 %s%s at %"IVdf" ",
14608 20         s, RE_SV_TAIL(r->anchored_utf8),
14609 20         (IV)r->anchored_offset);
14610           }
14611 132         if (r->float_substr) {
14612 4         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14613           RE_SV_DUMPLEN(r->float_substr), 30);
14614 12         PerlIO_printf(Perl_debug_log,
14615           "floating %s%s at %"IVdf"..%"UVuf" ",
14616 4         s, RE_SV_TAIL(r->float_substr),
14617 8         (IV)r->float_min_offset, (UV)r->float_max_offset);
14618 128         } else if (r->float_utf8) {
14619 0         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14620           RE_SV_DUMPLEN(r->float_utf8), 30);
14621 0         PerlIO_printf(Perl_debug_log,
14622           "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14623 0         s, RE_SV_TAIL(r->float_utf8),
14624 0         (IV)r->float_min_offset, (UV)r->float_max_offset);
14625           }
14626 132         if (r->check_substr || r->check_utf8)
14627 236         PerlIO_printf(Perl_debug_log,
14628           (const char *)
14629 118         (r->check_substr == r->float_substr
14630 24         && r->check_utf8 == r->float_utf8
14631           ? "(checking floating" : "(checking anchored"));
14632 132         if (r->extflags & RXf_NOSCAN)
14633 0         PerlIO_printf(Perl_debug_log, " noscan");
14634 132         if (r->extflags & RXf_CHECK_ALL)
14635 48         PerlIO_printf(Perl_debug_log, " isall");
14636 132         if (r->check_substr || r->check_utf8)
14637 118         PerlIO_printf(Perl_debug_log, ") ");
14638            
14639 132         if (ri->regstclass) {
14640 6         regprop(r, sv, ri->regstclass);
14641 6         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14642           }
14643 132         if (r->extflags & RXf_ANCH) {
14644 0         PerlIO_printf(Perl_debug_log, "anchored");
14645 0         if (r->extflags & RXf_ANCH_BOL)
14646 0         PerlIO_printf(Perl_debug_log, "(BOL)");
14647 0         if (r->extflags & RXf_ANCH_MBOL)
14648 0         PerlIO_printf(Perl_debug_log, "(MBOL)");
14649 0         if (r->extflags & RXf_ANCH_SBOL)
14650 0         PerlIO_printf(Perl_debug_log, "(SBOL)");
14651 0         if (r->extflags & RXf_ANCH_GPOS)
14652 0         PerlIO_printf(Perl_debug_log, "(GPOS)");
14653 0         PerlIO_putc(Perl_debug_log, ' ');
14654           }
14655 132         if (r->extflags & RXf_GPOS_SEEN)
14656 0         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14657 132         if (r->intflags & PREGf_SKIP)
14658 0         PerlIO_printf(Perl_debug_log, "plus ");
14659 132         if (r->intflags & PREGf_IMPLICIT)
14660 0         PerlIO_printf(Perl_debug_log, "implicit ");
14661 132         PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14662 132         if (r->extflags & RXf_EVAL_SEEN)
14663 68         PerlIO_printf(Perl_debug_log, "with eval ");
14664 132         PerlIO_printf(Perl_debug_log, "\n");
14665 132         DEBUG_FLAGS_r({
14666           regdump_extflags("r->extflags: ",r->extflags);
14667           regdump_intflags("r->intflags: ",r->intflags);
14668           });
14669           #else
14670           PERL_ARGS_ASSERT_REGDUMP;
14671           PERL_UNUSED_CONTEXT;
14672           PERL_UNUSED_ARG(r);
14673           #endif /* DEBUGGING */
14674 132         }
14675            
14676           /*
14677           - regprop - printable representation of opcode
14678           */
14679           #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14680           STMT_START { \
14681           if (do_sep) { \
14682           Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14683           if (flags & ANYOF_INVERT) \
14684           /*make sure the invert info is in each */ \
14685           sv_catpvs(sv, "^"); \
14686           do_sep = 0; \
14687           } \
14688           } STMT_END
14689            
14690           void
14691 684         Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14692           {
14693           #ifdef DEBUGGING
14694           dVAR;
14695           int k;
14696            
14697           /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14698           static const char * const anyofs[] = {
14699           #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14700           || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
14701           || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
14702           || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
14703           || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
14704           || _CC_VERTSPACE != 16
14705           #error Need to adjust order of anyofs[]
14706           #endif
14707           "[\\w]",
14708           "[\\W]",
14709           "[\\d]",
14710           "[\\D]",
14711           "[:alpha:]",
14712           "[:^alpha:]",
14713           "[:lower:]",
14714           "[:^lower:]",
14715           "[:upper:]",
14716           "[:^upper:]",
14717           "[:punct:]",
14718           "[:^punct:]",
14719           "[:print:]",
14720           "[:^print:]",
14721           "[:alnum:]",
14722           "[:^alnum:]",
14723           "[:graph:]",
14724           "[:^graph:]",
14725           "[:cased:]",
14726           "[:^cased:]",
14727           "[\\s]",
14728           "[\\S]",
14729           "[:blank:]",
14730           "[:^blank:]",
14731           "[:xdigit:]",
14732           "[:^xdigit:]",
14733           "[:space:]",
14734           "[:^space:]",
14735           "[:cntrl:]",
14736           "[:^cntrl:]",
14737           "[:ascii:]",
14738           "[:^ascii:]",
14739           "[\\v]",
14740           "[\\V]"
14741           };
14742 684         RXi_GET_DECL(prog,progi);
14743 684         GET_RE_DEBUG_FLAGS_DECL;
14744          
14745 684         PERL_ARGS_ASSERT_REGPROP;
14746            
14747 684         sv_setpvs(sv, "");
14748            
14749 684         if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
14750           /* It would be nice to FAIL() here, but this may be called from
14751           regexec.c, and it would be hard to supply pRExC_state. */
14752 0         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14753 684         sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14754            
14755 684         k = PL_regkind[OP(o)];
14756            
14757 684         if (k == EXACT) {
14758 216         sv_catpvs(sv, " ");
14759           /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14760           * is a crude hack but it may be the best for now since
14761           * we have no flag "this EXACTish node was UTF-8"
14762           * --jhi */
14763 216         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14764           PERL_PV_ESCAPE_UNI_DETECT |
14765           PERL_PV_ESCAPE_NONASCII |
14766           PERL_PV_PRETTY_ELLIPSES |
14767           PERL_PV_PRETTY_LTGT |
14768           PERL_PV_PRETTY_NOCLEAR
14769           );
14770 468         } else if (k == TRIE) {
14771           /* print the details of the trie in dumpuntil instead, as
14772           * progi->data isn't available here */
14773 22         const char op = OP(o);
14774 22         const U32 n = ARG(o);
14775           const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14776 22         (reg_ac_data *)progi->data->data[n] :
14777           NULL;
14778 22         const reg_trie_data * const trie
14779 22         = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14780          
14781 22         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14782 22         DEBUG_TRIE_COMPILE_r(
14783           Perl_sv_catpvf(aTHX_ sv,
14784           "",
14785           (UV)trie->startstate,
14786           (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14787           (UV)trie->wordcount,
14788           (UV)trie->minlen,
14789           (UV)trie->maxlen,
14790           (UV)TRIE_CHARCOUNT(trie),
14791           (UV)trie->uniquecharcount
14792           )
14793           );
14794 22         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14795 14         sv_catpvs(sv, "[");
14796 14         (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
14797           ? ANYOF_BITMAP(o)
14798           : TRIE_BITMAP(trie));
14799 14         sv_catpvs(sv, "]");
14800           }
14801          
14802 446         } else if (k == CURLY) {
14803 0         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14804 0         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14805 0         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14806           }
14807 446         else if (k == WHILEM && o->flags) /* Ordinal/of */
14808 0         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14809 446         else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14810 66         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14811 66         if ( RXp_PAREN_NAMES(prog) ) {
14812 0         if ( k != REF || (OP(o) < NREF)) {
14813 0         AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14814 0         SV **name= av_fetch(list, ARG(o), 0 );
14815 0         if (name)
14816 0         Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14817           }
14818           else {
14819 0         AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14820 0         SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14821 0         I32 *nums=(I32*)SvPVX(sv_dat);
14822 0         SV **name= av_fetch(list, nums[0], 0 );
14823           I32 n;
14824 0         if (name) {
14825 0         for ( n=0; n
14826 0         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14827 0         (n ? "," : ""), (IV)nums[n]);
14828           }
14829 0         Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14830           }
14831           }
14832           }
14833 380         } else if (k == GOSUB)
14834 0         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14835 380         else if (k == VERB) {
14836 0         if (!o->flags)
14837 0         Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14838 0         SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14839 380         } else if (k == LOGICAL)
14840 0         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14841 380         else if (k == ANYOF) {
14842 2         const U8 flags = ANYOF_FLAGS(o);
14843           int do_sep = 0;
14844            
14845            
14846 2         if (flags & ANYOF_LOCALE)
14847 0         sv_catpvs(sv, "{loc}");
14848 2         if (flags & ANYOF_LOC_FOLD)
14849 0         sv_catpvs(sv, "{i}");
14850 2         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14851 2         if (flags & ANYOF_INVERT)
14852 0         sv_catpvs(sv, "^");
14853            
14854           /* output what the standard cp 0-255 bitmap matches */
14855 2         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
14856          
14857 2         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14858           /* output any special charclass tests (used entirely under use locale) */
14859 2         if (ANYOF_CLASS_TEST_ANY_SET(o)) {
14860           int i;
14861 0         for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
14862 0         if (ANYOF_CLASS_TEST(o,i)) {
14863 0         sv_catpv(sv, anyofs[i]);
14864           do_sep = 1;
14865           }
14866           }
14867           }
14868          
14869 2         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14870          
14871 2         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14872 0         sv_catpvs(sv, "{non-utf8-latin1-all}");
14873           }
14874            
14875           /* output information about the unicode matching */
14876 2         if (flags & ANYOF_UNICODE_ALL)
14877 0         sv_catpvs(sv, "{unicode_all}");
14878 2         else if (ANYOF_NONBITMAP(o)) {
14879           SV *lv; /* Set if there is something outside the bit map. */
14880           bool byte_output = FALSE; /* If something in the bitmap has been
14881           output */
14882            
14883 0         if (flags & ANYOF_NONBITMAP_NON_UTF8) {
14884 0         sv_catpvs(sv, "{outside bitmap}");
14885           }
14886           else {
14887 0         sv_catpvs(sv, "{utf8}");
14888           }
14889            
14890           /* Get the stuff that wasn't in the bitmap */
14891 0         (void) regclass_swash(prog, o, FALSE, &lv, NULL);
14892 0         if (lv && lv != &PL_sv_undef) {
14893 0         char *s = savesvpv(lv);
14894           char * const origs = s;
14895            
14896 0         while (*s && *s != '\n')
14897 0         s++;
14898            
14899 0         if (*s == '\n') {
14900 0         const char * const t = ++s;
14901            
14902           if (byte_output) {
14903           sv_catpvs(sv, " ");
14904           }
14905            
14906 0         while (*s) {
14907 0         if (*s == '\n') {
14908            
14909           /* Truncate very long output */
14910 0         if (s - origs > 256) {
14911 0         Perl_sv_catpvf(aTHX_ sv,
14912           "%.*s...",
14913 0         (int) (s - origs - 1),
14914           t);
14915 0         goto out_dump;
14916           }
14917 0         *s = ' ';
14918           }
14919 0         else if (*s == '\t') {
14920 0         *s = '-';
14921           }
14922 0         s++;
14923           }
14924 0         if (s[-1] == ' ')
14925 0         s[-1] = 0;
14926            
14927 0         sv_catpv(sv, t);
14928           }
14929            
14930           out_dump:
14931            
14932 0         Safefree(origs);
14933 0         SvREFCNT_dec_NN(lv);
14934           }
14935           }
14936            
14937 2         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14938           }
14939 378         else if (k == POSIXD || k == NPOSIXD) {
14940 0         U8 index = FLAGS(o) * 2;
14941 0         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14942 0         Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14943           }
14944           else {
14945 0         sv_catpv(sv, anyofs[index]);
14946           }
14947           }
14948 378         else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14949 0         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14950           #else
14951           PERL_UNUSED_CONTEXT;
14952           PERL_UNUSED_ARG(sv);
14953           PERL_UNUSED_ARG(o);
14954           PERL_UNUSED_ARG(prog);
14955           #endif /* DEBUGGING */
14956 684         }
14957            
14958           SV *
14959 0         Perl_re_intuit_string(pTHX_ REGEXP * const r)
14960           { /* Assume that RE_INTUIT is set */
14961           dVAR;
14962 0         struct regexp *const prog = ReANY(r);
14963 0         GET_RE_DEBUG_FLAGS_DECL;
14964            
14965 0         PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14966           PERL_UNUSED_CONTEXT;
14967            
14968 0         DEBUG_COMPILE_r(
14969           {
14970           const char * const s = SvPV_nolen_const(prog->check_substr
14971           ? prog->check_substr : prog->check_utf8);
14972            
14973           if (!PL_colorset) reginitcolors();
14974           PerlIO_printf(Perl_debug_log,
14975           "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14976           PL_colors[4],
14977           prog->check_substr ? "" : "utf8 ",
14978           PL_colors[5],PL_colors[0],
14979           s,
14980           PL_colors[1],
14981           (strlen(s) > 60 ? "..." : ""));
14982           } );
14983            
14984 0         return prog->check_substr ? prog->check_substr : prog->check_utf8;
14985           }
14986            
14987           /*
14988           pregfree()
14989          
14990           handles refcounting and freeing the perl core regexp structure. When
14991           it is necessary to actually free the structure the first thing it
14992           does is call the 'free' method of the regexp_engine associated to
14993           the regexp, allowing the handling of the void *pprivate; member
14994           first. (This routine is not overridable by extensions, which is why
14995           the extensions free is called first.)
14996          
14997           See regdupe and regdupe_internal if you change anything here.
14998           */
14999           #ifndef PERL_IN_XSUB_RE
15000           void
15001           Perl_pregfree(pTHX_ REGEXP *r)
15002           {
15003           SvREFCNT_dec(r);
15004           }
15005            
15006           void
15007           Perl_pregfree2(pTHX_ REGEXP *rx)
15008           {
15009           dVAR;
15010           struct regexp *const r = ReANY(rx);
15011           GET_RE_DEBUG_FLAGS_DECL;
15012            
15013           PERL_ARGS_ASSERT_PREGFREE2;
15014            
15015           if (r->mother_re) {
15016           ReREFCNT_dec(r->mother_re);
15017           } else {
15018           CALLREGFREE_PVT(rx); /* free the private data */
15019           SvREFCNT_dec(RXp_PAREN_NAMES(r));
15020           Safefree(r->xpv_len_u.xpvlenu_pv);
15021           }
15022           if (r->substrs) {
15023           SvREFCNT_dec(r->anchored_substr);
15024           SvREFCNT_dec(r->anchored_utf8);
15025           SvREFCNT_dec(r->float_substr);
15026           SvREFCNT_dec(r->float_utf8);
15027           Safefree(r->substrs);
15028           }
15029           RX_MATCH_COPY_FREE(rx);
15030           #ifdef PERL_ANY_COW
15031           SvREFCNT_dec(r->saved_copy);
15032           #endif
15033           Safefree(r->offs);
15034           SvREFCNT_dec(r->qr_anoncv);
15035           rx->sv_u.svu_rx = 0;
15036           }
15037            
15038           /* reg_temp_copy()
15039          
15040           This is a hacky workaround to the structural issue of match results
15041           being stored in the regexp structure which is in turn stored in
15042           PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15043           could be PL_curpm in multiple contexts, and could require multiple
15044           result sets being associated with the pattern simultaneously, such
15045           as when doing a recursive match with (??{$qr})
15046          
15047           The solution is to make a lightweight copy of the regexp structure
15048           when a qr// is returned from the code executed by (??{$qr}) this
15049           lightweight copy doesn't actually own any of its data except for
15050           the starp/end and the actual regexp structure itself.
15051          
15052           */
15053          
15054          
15055           REGEXP *
15056           Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15057           {
15058           struct regexp *ret;
15059           struct regexp *const r = ReANY(rx);
15060           const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15061            
15062           PERL_ARGS_ASSERT_REG_TEMP_COPY;
15063            
15064           if (!ret_x)
15065           ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15066           else {
15067           SvOK_off((SV *)ret_x);
15068           if (islv) {
15069           /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15070           to the regexp. (For SVt_REGEXPs, sv_upgrade has already
15071           made both spots point to the same regexp body.) */
15072           REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15073           assert(!SvPVX(ret_x));
15074           ret_x->sv_u.svu_rx = temp->sv_any;
15075           temp->sv_any = NULL;
15076           SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15077           SvREFCNT_dec_NN(temp);
15078           /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15079           ing below will not set it. */
15080           SvCUR_set(ret_x, SvCUR(rx));
15081           }
15082           }
15083           /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15084           sv_force_normal(sv) is called. */
15085           SvFAKE_on(ret_x);
15086           ret = ReANY(ret_x);
15087          
15088           SvFLAGS(ret_x) |= SvUTF8(rx);
15089           /* We share the same string buffer as the original regexp, on which we
15090           hold a reference count, incremented when mother_re is set below.
15091           The string pointer is copied here, being part of the regexp struct.
15092           */
15093           memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15094           sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15095           if (r->offs) {
15096           const I32 npar = r->nparens+1;
15097           Newx(ret->offs, npar, regexp_paren_pair);
15098           Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15099           }
15100           if (r->substrs) {
15101           Newx(ret->substrs, 1, struct reg_substr_data);
15102           StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15103            
15104           SvREFCNT_inc_void(ret->anchored_substr);
15105           SvREFCNT_inc_void(ret->anchored_utf8);
15106           SvREFCNT_inc_void(ret->float_substr);
15107           SvREFCNT_inc_void(ret->float_utf8);
15108            
15109           /* check_substr and check_utf8, if non-NULL, point to either their
15110           anchored or float namesakes, and don't hold a second reference. */
15111           }
15112           RX_MATCH_COPIED_off(ret_x);
15113           #ifdef PERL_ANY_COW
15114           ret->saved_copy = NULL;
15115           #endif
15116           ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15117           SvREFCNT_inc_void(ret->qr_anoncv);
15118          
15119           return ret_x;
15120           }
15121           #endif
15122            
15123           /* regfree_internal()
15124            
15125           Free the private data in a regexp. This is overloadable by
15126           extensions. Perl takes care of the regexp structure in pregfree(),
15127           this covers the *pprivate pointer which technically perl doesn't
15128           know about, however of course we have to handle the
15129           regexp_internal structure when no extension is in use.
15130          
15131           Note this is called before freeing anything in the regexp
15132           structure.
15133           */
15134          
15135           void
15136 106         Perl_regfree_internal(pTHX_ REGEXP * const rx)
15137           {
15138           dVAR;
15139 106         struct regexp *const r = ReANY(rx);
15140 106         RXi_GET_DECL(r,ri);
15141 106         GET_RE_DEBUG_FLAGS_DECL;
15142            
15143 106         PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15144            
15145 418         DEBUG_COMPILE_r({
15146           if (!PL_colorset)
15147           reginitcolors();
15148           {
15149           SV *dsv= sv_newmortal();
15150           RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15151           dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15152           PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15153           PL_colors[4],PL_colors[5],s);
15154           }
15155           });
15156           #ifdef RE_TRACK_PATTERN_OFFSETS
15157 106         if (ri->u.offsets)
15158 106         Safefree(ri->u.offsets); /* 20010421 MJD */
15159           #endif
15160 106         if (ri->code_blocks) {
15161           int n;
15162 32         for (n = 0; n < ri->num_code_blocks; n++)
15163 32         SvREFCNT_dec(ri->code_blocks[n].src_regex);
15164 28         Safefree(ri->code_blocks);
15165           }
15166            
15167 106         if (ri->data) {
15168 48         int n = ri->data->count;
15169            
15170 212         while (--n >= 0) {
15171           /* If you add a ->what type here, update the comment in regcomp.h */
15172 116         switch (ri->data->what[n]) {
15173           case 'a':
15174           case 'r':
15175           case 's':
15176           case 'S':
15177           case 'u':
15178 56         SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15179 56         break;
15180           case 'f':
15181 0         Safefree(ri->data->data[n]);
15182 0         break;
15183           case 'l':
15184           case 'L':
15185           break;
15186           case 'T':
15187           { /* Aho Corasick add-on structure for a trie node.
15188           Used in stclass optimization only */
15189           U32 refcount;
15190 4         reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15191           OP_REFCNT_LOCK;
15192 4         refcount = --aho->refcount;
15193           OP_REFCNT_UNLOCK;
15194 4         if ( !refcount ) {
15195 4         PerlMemShared_free(aho->states);
15196 4         PerlMemShared_free(aho->fail);
15197           /* do this last!!!! */
15198 4         PerlMemShared_free(ri->data->data[n]);
15199 4         PerlMemShared_free(ri->regstclass);
15200           }
15201           }
15202           break;
15203           case 't':
15204           {
15205           /* trie structure. */
15206           U32 refcount;
15207 8         reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15208           OP_REFCNT_LOCK;
15209 8         refcount = --trie->refcount;
15210           OP_REFCNT_UNLOCK;
15211 8         if ( !refcount ) {
15212 8         PerlMemShared_free(trie->charmap);
15213 8         PerlMemShared_free(trie->states);
15214 8         PerlMemShared_free(trie->trans);
15215 8         if (trie->bitmap)
15216 4         PerlMemShared_free(trie->bitmap);
15217 8         if (trie->jump)
15218 0         PerlMemShared_free(trie->jump);
15219 8         PerlMemShared_free(trie->wordinfo);
15220           /* do this last!!!! */
15221 8         PerlMemShared_free(ri->data->data[n]);
15222           }
15223           }
15224           break;
15225           default:
15226 0         Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15227           }
15228           }
15229 48         Safefree(ri->data->what);
15230 48         Safefree(ri->data);
15231           }
15232            
15233 106         Safefree(ri);
15234 106         }
15235            
15236           #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15237           #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15238           #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15239            
15240           /*
15241           re_dup - duplicate a regexp.
15242          
15243           This routine is expected to clone a given regexp structure. It is only
15244           compiled under USE_ITHREADS.
15245            
15246           After all of the core data stored in struct regexp is duplicated
15247           the regexp_engine.dupe method is used to copy any private data
15248           stored in the *pprivate pointer. This allows extensions to handle
15249           any duplication it needs to do.
15250            
15251           See pregfree() and regfree_internal() if you change anything here.
15252           */
15253           #if defined(USE_ITHREADS)
15254           #ifndef PERL_IN_XSUB_RE
15255           void
15256           Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15257           {
15258           dVAR;
15259           I32 npar;
15260           const struct regexp *r = ReANY(sstr);
15261           struct regexp *ret = ReANY(dstr);
15262          
15263           PERL_ARGS_ASSERT_RE_DUP_GUTS;
15264            
15265           npar = r->nparens+1;
15266           Newx(ret->offs, npar, regexp_paren_pair);
15267           Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15268            
15269           if (ret->substrs) {
15270           /* Do it this way to avoid reading from *r after the StructCopy().
15271           That way, if any of the sv_dup_inc()s dislodge *r from the L1
15272           cache, it doesn't matter. */
15273           const bool anchored = r->check_substr
15274           ? r->check_substr == r->anchored_substr
15275           : r->check_utf8 == r->anchored_utf8;
15276           Newx(ret->substrs, 1, struct reg_substr_data);
15277           StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15278            
15279           ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15280           ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15281           ret->float_substr = sv_dup_inc(ret->float_substr, param);
15282           ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15283            
15284           /* check_substr and check_utf8, if non-NULL, point to either their
15285           anchored or float namesakes, and don't hold a second reference. */
15286            
15287           if (ret->check_substr) {
15288           if (anchored) {
15289           assert(r->check_utf8 == r->anchored_utf8);
15290           ret->check_substr = ret->anchored_substr;
15291           ret->check_utf8 = ret->anchored_utf8;
15292           } else {
15293           assert(r->check_substr == r->float_substr);
15294           assert(r->check_utf8 == r->float_utf8);
15295           ret->check_substr = ret->float_substr;
15296           ret->check_utf8 = ret->float_utf8;
15297           }
15298           } else if (ret->check_utf8) {
15299           if (anchored) {
15300           ret->check_utf8 = ret->anchored_utf8;
15301           } else {
15302           ret->check_utf8 = ret->float_utf8;
15303           }
15304           }
15305           }
15306            
15307           RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15308           ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15309            
15310           if (ret->pprivate)
15311           RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15312            
15313           if (RX_MATCH_COPIED(dstr))
15314           ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15315           else
15316           ret->subbeg = NULL;
15317           #ifdef PERL_ANY_COW
15318           ret->saved_copy = NULL;
15319           #endif
15320            
15321           /* Whether mother_re be set or no, we need to copy the string. We
15322           cannot refrain from copying it when the storage points directly to
15323           our mother regexp, because that's
15324           1: a buffer in a different thread
15325           2: something we no longer hold a reference on
15326           so we need to copy it locally. */
15327           RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15328           ret->mother_re = NULL;
15329           }
15330           #endif /* PERL_IN_XSUB_RE */
15331            
15332           /*
15333           regdupe_internal()
15334          
15335           This is the internal complement to regdupe() which is used to copy
15336           the structure pointed to by the *pprivate pointer in the regexp.
15337           This is the core version of the extension overridable cloning hook.
15338           The regexp structure being duplicated will be copied by perl prior
15339           to this and will be provided as the regexp *r argument, however
15340           with the /old/ structures pprivate pointer value. Thus this routine
15341           may override any copying normally done by perl.
15342          
15343           It returns a pointer to the new regexp_internal structure.
15344           */
15345            
15346           void *
15347           Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15348           {
15349           dVAR;
15350           struct regexp *const r = ReANY(rx);
15351           regexp_internal *reti;
15352           int len;
15353           RXi_GET_DECL(r,ri);
15354            
15355           PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15356          
15357           len = ProgLen(ri);
15358          
15359           Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15360           Copy(ri->program, reti->program, len+1, regnode);
15361            
15362           reti->num_code_blocks = ri->num_code_blocks;
15363           if (ri->code_blocks) {
15364           int n;
15365           Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15366           struct reg_code_block);
15367           Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15368           struct reg_code_block);
15369           for (n = 0; n < ri->num_code_blocks; n++)
15370           reti->code_blocks[n].src_regex = (REGEXP*)
15371           sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15372           }
15373           else
15374           reti->code_blocks = NULL;
15375            
15376           reti->regstclass = NULL;
15377            
15378           if (ri->data) {
15379           struct reg_data *d;
15380           const int count = ri->data->count;
15381           int i;
15382            
15383           Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15384           char, struct reg_data);
15385           Newx(d->what, count, U8);
15386            
15387           d->count = count;
15388           for (i = 0; i < count; i++) {
15389           d->what[i] = ri->data->what[i];
15390           switch (d->what[i]) {
15391           /* see also regcomp.h and regfree_internal() */
15392           case 'a': /* actually an AV, but the dup function is identical. */
15393           case 'r':
15394           case 's':
15395           case 'S':
15396           case 'u': /* actually an HV, but the dup function is identical. */
15397           d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15398           break;
15399           case 'f':
15400           /* This is cheating. */
15401           Newx(d->data[i], 1, struct regnode_charclass_class);
15402           StructCopy(ri->data->data[i], d->data[i],
15403           struct regnode_charclass_class);
15404           reti->regstclass = (regnode*)d->data[i];
15405           break;
15406           case 'T':
15407           /* Trie stclasses are readonly and can thus be shared
15408           * without duplication. We free the stclass in pregfree
15409           * when the corresponding reg_ac_data struct is freed.
15410           */
15411           reti->regstclass= ri->regstclass;
15412           /* Fall through */
15413           case 't':
15414           OP_REFCNT_LOCK;
15415           ((reg_trie_data*)ri->data->data[i])->refcount++;
15416           OP_REFCNT_UNLOCK;
15417           /* Fall through */
15418           case 'l':
15419           case 'L':
15420           d->data[i] = ri->data->data[i];
15421           break;
15422           default:
15423           Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15424           }
15425           }
15426            
15427           reti->data = d;
15428           }
15429           else
15430           reti->data = NULL;
15431            
15432           reti->name_list_idx = ri->name_list_idx;
15433            
15434           #ifdef RE_TRACK_PATTERN_OFFSETS
15435           if (ri->u.offsets) {
15436           Newx(reti->u.offsets, 2*len+1, U32);
15437           Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15438           }
15439           #else
15440           SetProgLen(reti,len);
15441           #endif
15442            
15443           return (void*)reti;
15444           }
15445            
15446           #endif /* USE_ITHREADS */
15447            
15448           #ifndef PERL_IN_XSUB_RE
15449            
15450           /*
15451           - regnext - dig the "next" pointer out of a node
15452           */
15453           regnode *
15454           Perl_regnext(pTHX_ regnode *p)
15455           {
15456           dVAR;
15457           I32 offset;
15458            
15459           if (!p)
15460           return(NULL);
15461            
15462           if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
15463           Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15464           }
15465            
15466           offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15467           if (offset == 0)
15468           return(NULL);
15469            
15470           return(p+offset);
15471           }
15472           #endif
15473            
15474           STATIC void
15475 0         S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15476           {
15477           va_list args;
15478 0         STRLEN l1 = strlen(pat1);
15479 0         STRLEN l2 = strlen(pat2);
15480           char buf[512];
15481           SV *msv;
15482           const char *message;
15483            
15484 0         PERL_ARGS_ASSERT_RE_CROAK2;
15485            
15486 0         if (l1 > 510)
15487 0         l1 = 510;
15488 0         if (l1 + l2 > 510)
15489 0         l2 = 510 - l1;
15490 0         Copy(pat1, buf, l1 , char);
15491 0         Copy(pat2, buf + l1, l2 , char);
15492 0         buf[l1 + l2] = '\n';
15493 0         buf[l1 + l2 + 1] = '\0';
15494           #ifdef I_STDARG
15495           /* ANSI variant takes additional second argument */
15496 0         va_start(args, pat2);
15497           #else
15498           va_start(args);
15499           #endif
15500 0         msv = vmess(buf, &args);
15501 0         va_end(args);
15502 0         message = SvPV_const(msv,l1);
15503 0         if (l1 > 512)
15504 0         l1 = 512;
15505 0         Copy(message, buf, l1 , char);
15506 0         buf[l1-1] = '\0'; /* Overwrite \n */
15507 0         Perl_croak(aTHX_ "%s", buf);
15508           }
15509            
15510           /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
15511            
15512           #ifndef PERL_IN_XSUB_RE
15513           void
15514           Perl_save_re_context(pTHX)
15515           {
15516           dVAR;
15517            
15518           /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15519           if (PL_curpm) {
15520           const REGEXP * const rx = PM_GETRE(PL_curpm);
15521           if (rx) {
15522           U32 i;
15523           for (i = 1; i <= RX_NPARENS(rx); i++) {
15524           char digits[TYPE_CHARS(long)];
15525           const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15526           GV *const *const gvp
15527           = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15528            
15529           if (gvp) {
15530           GV * const gv = *gvp;
15531           if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15532           save_scalar(gv);
15533           }
15534           }
15535           }
15536           }
15537           }
15538           #endif
15539            
15540           #ifdef DEBUGGING
15541            
15542           STATIC void
15543 40         S_put_byte(pTHX_ SV *sv, int c)
15544           {
15545 40         PERL_ARGS_ASSERT_PUT_BYTE;
15546            
15547           /* Our definition of isPRINT() ignores locales, so only bytes that are
15548           not part of UTF-8 are considered printable. I assume that the same
15549           holds for UTF-EBCDIC.
15550           Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15551           which Wikipedia says:
15552            
15553           EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15554           ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15555           identical, to the ASCII delete (DEL) or rubout control character. ...
15556           it is typically mapped to hexadecimal code 9F, in order to provide a
15557           unique character mapping in both directions)
15558            
15559           So the old condition can be simplified to !isPRINT(c) */
15560 40         if (!isPRINT(c)) {
15561 0         switch (c) {
15562 0         case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
15563 0         case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
15564 0         case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
15565 0         case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
15566 0         case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
15567            
15568           default:
15569 0         Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15570 0         break;
15571           }
15572           }
15573           else {
15574 40         const char string = c;
15575 40         if (c == '-' || c == ']' || c == '\\' || c == '^')
15576 0         sv_catpvs(sv, "\\");
15577 40         sv_catpvn(sv, &string, 1);
15578           }
15579 40         }
15580            
15581           STATIC bool
15582 16         S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
15583           {
15584           /* Appends to 'sv' a displayable version of the innards of the bracketed
15585           * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
15586           * output anything */
15587            
15588           int i;
15589           int rangestart = -1;
15590           bool has_output_anything = FALSE;
15591            
15592 16         PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
15593            
15594 4112         for (i = 0; i <= 256; i++) {
15595 4112         if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
15596 60         if (rangestart == -1)
15597           rangestart = i;
15598 4052         } else if (rangestart != -1) {
15599 30         int j = i - 1;
15600 30         if (i <= rangestart + 3) { /* Individual chars in short ranges */
15601 24         for (; rangestart < i; rangestart++)
15602 24         put_byte(sv, rangestart);
15603           }
15604 8         else if ( j > 255
15605 8         || ! isALPHANUMERIC(rangestart)
15606 8         || ! isALPHANUMERIC(j)
15607 8         || isDIGIT(rangestart) != isDIGIT(j)
15608 8         || isUPPER(rangestart) != isUPPER(j)
15609 8         || isLOWER(rangestart) != isLOWER(j)
15610            
15611           /* This final test should get optimized out except
15612           * on EBCDIC platforms, where it causes ranges that
15613           * cross discontinuities like i/j to be shown as hex
15614           * instead of the misleading, e.g. H-K (since that
15615           * range includes more than H, I, J, K). */
15616 8         || (j - rangestart)
15617           != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
15618           {
15619 0         Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
15620           rangestart,
15621           (j < 256) ? j : 255);
15622           }
15623           else { /* Here, the ends of the range are both digits, or both
15624           uppercase, or both lowercase; and there's no
15625           discontinuity in the range (which could happen on EBCDIC
15626           platforms) */
15627 8         put_byte(sv, rangestart);
15628 8         sv_catpvs(sv, "-");
15629 8         put_byte(sv, j);
15630           }
15631           rangestart = -1;
15632           has_output_anything = TRUE;
15633           }
15634           }
15635            
15636 16         return has_output_anything;
15637           }
15638            
15639           #define CLEAR_OPTSTART \
15640           if (optstart) STMT_START { \
15641           DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15642           optstart=NULL; \
15643           } STMT_END
15644            
15645           #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15646            
15647           STATIC const regnode *
15648 132         S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15649           const regnode *last, const regnode *plast,
15650           SV* sv, I32 indent, U32 depth)
15651           {
15652           dVAR;
15653           U8 op = PSEUDO; /* Arbitrary non-END op. */
15654           const regnode *next;
15655           const regnode *optstart= NULL;
15656          
15657 132         RXi_GET_DECL(r,ri);
15658 132         GET_RE_DEBUG_FLAGS_DECL;
15659            
15660 132         PERL_ARGS_ASSERT_DUMPUNTIL;
15661            
15662           #ifdef DEBUG_DUMPUNTIL
15663           PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15664           last ? last-start : 0,plast ? plast-start : 0);
15665           #endif
15666          
15667 132         if (plast && plast < last)
15668           last= plast;
15669            
15670 546         while (PL_regkind[op] != END && (!last || node < last)) {
15671           /* While that wasn't END last time... */
15672           NODE_ALIGN(node);
15673 414         op = OP(node);
15674 414         if (op == CLOSE || op == WHILEM)
15675 6         indent--;
15676 414         next = regnext((regnode *)node);
15677            
15678           /* Where, what. */
15679 414         if (OP(node) == OPTIMIZED) {
15680 50         if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15681           optstart = node;
15682           else
15683           goto after_print;
15684           } else
15685 364         CLEAR_OPTSTART;
15686            
15687 364         regprop(r, sv, node);
15688 364         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15689 364         (int)(2*indent + 1), "", SvPVX_const(sv));
15690          
15691 364         if (OP(node) != OPTIMIZED) {
15692 364         if (next == NULL) /* Next ptr. */
15693 132         PerlIO_printf(Perl_debug_log, " (0)");
15694 232         else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15695 0         PerlIO_printf(Perl_debug_log, " (FAIL)");
15696           else
15697 232         PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15698 364         (void)PerlIO_putc(Perl_debug_log, '\n');
15699           }
15700          
15701           after_print:
15702 414         if (PL_regkind[(U8)op] == BRANCHJ) {
15703 0         assert(next);
15704           {
15705 0         const regnode *nnode = (OP(next) == LONGJMP
15706           ? regnext((regnode *)next)
15707 0         : next);
15708 0         if (last && nnode > last)
15709           nnode = last;
15710 0         DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15711           }
15712           }
15713 414         else if (PL_regkind[(U8)op] == BRANCH) {
15714 0         assert(next);
15715 0         DUMPUNTIL(NEXTOPER(node), next);
15716           }
15717 414         else if ( PL_regkind[(U8)op] == TRIE ) {
15718           const regnode *this_trie = node;
15719 8         const char op = OP(node);
15720 8         const U32 n = ARG(node);
15721           const reg_ac_data * const ac = op>=AHOCORASICK ?
15722 8         (reg_ac_data *)ri->data->data[n] :
15723           NULL;
15724 8         const reg_trie_data * const trie =
15725 8         (reg_trie_data*)ri->data->data[optrie];
15726           #ifdef DEBUGGING
15727 8         AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15728           #endif
15729           const regnode *nextbranch= NULL;
15730           I32 word_idx;
15731 8         sv_setpvs(sv, "");
15732 64         for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15733 56         SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15734            
15735 112         PerlIO_printf(Perl_debug_log, "%*s%s ",
15736           (int)(2*(indent+3)), "",
15737 56         elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15738           PL_colors[0], PL_colors[1],
15739           (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15740           PERL_PV_PRETTY_ELLIPSES |
15741           PERL_PV_PRETTY_LTGT
15742           )
15743           : "???"
15744           );
15745 56         if (trie->jump) {
15746 0         U16 dist= trie->jump[word_idx+1];
15747 0         PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15748 0         (UV)((dist ? this_trie + dist : next) - start));
15749 0         if (dist) {
15750 0         if (!nextbranch)
15751 0         nextbranch= this_trie + trie->jump[0];
15752 0         DUMPUNTIL(this_trie + dist, nextbranch);
15753           }
15754 0         if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15755 0         nextbranch= regnext((regnode *)nextbranch);
15756           } else {
15757 56         PerlIO_printf(Perl_debug_log, "\n");
15758           }
15759           }
15760 8         if (last && next > last)
15761           node= last;
15762           else
15763           node= next;
15764           }
15765 406         else if ( op == CURLY ) { /* "next" might be very big: optimizer */
15766 0         DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15767           NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15768           }
15769 406         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15770 0         assert(next);
15771 0         DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15772           }
15773 406         else if ( op == PLUS || op == STAR) {
15774 0         DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15775           }
15776 406         else if (PL_regkind[(U8)op] == ANYOF) {
15777           /* arglen 1 + class block */
15778 2         node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15779           ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15780 2         node = NEXTOPER(node);
15781           }
15782 404         else if (PL_regkind[(U8)op] == EXACT) {
15783           /* Literal string, where present. */
15784 124         node += NODE_SZ_STR(node) - 1;
15785 124         node = NEXTOPER(node);
15786           }
15787           else {
15788 280         node = NEXTOPER(node);
15789 280         node += regarglen[(U8)op];
15790           }
15791 414         if (op == CURLYX || op == OPEN)
15792 6         indent++;
15793           }
15794 132         CLEAR_OPTSTART;
15795           #ifdef DEBUG_DUMPUNTIL
15796           PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15797           #endif
15798 132         return node;
15799 1402         }
15800            
15801           #endif /* DEBUGGING */
15802            
15803           /*
15804           * Local variables:
15805           * c-indentation-style: bsd
15806           * c-basic-offset: 4
15807           * indent-tabs-mode: nil
15808           * End:
15809           *
15810           * ex: set ts=8 sts=4 sw=4 et:
15811           */