File Coverage

ext/re/re_comp.c
Criterion Covered Total %
statement 4613 5001 92.2
branch n/a
condition n/a
subroutine n/a
total 4613 5001 92.2


line stmt bran cond sub time code
1           /* regcomp.c
2           */
3            
4           /*
5           * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6           *
7           * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8           */
9            
10           /* This file contains functions for compiling a regular expression. See
11           * also regexec.c which funnily enough, contains functions for executing
12           * a regular expression.
13           *
14           * This file is also copied at build time to ext/re/re_comp.c, where
15           * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16           * This causes the main functions to be compiled under new names and with
17           * debugging support added, which makes "use re 'debug'" work.
18           */
19            
20           /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21           * confused with the original package (see point 3 below). Thanks, Henry!
22           */
23            
24           /* Additional note: this code is very heavily munged from Henry's version
25           * in places. In some spots I've traded clarity for efficiency, so don't
26           * blame Henry for some of the lack of readability.
27           */
28            
29           /* The names of the functions have been changed from regcomp and
30           * regexec to pregcomp and pregexec in order to avoid conflicts
31           * with the POSIX routines of the same names.
32           */
33            
34           #ifdef PERL_EXT_RE_BUILD
35           #include "re_top.h"
36           #endif
37            
38           /*
39           * pregcomp and pregexec -- regsub and regerror are not used in perl
40           *
41           * Copyright (c) 1986 by University of Toronto.
42           * Written by Henry Spencer. Not derived from licensed software.
43           *
44           * Permission is granted to anyone to use this software for any
45           * purpose on any computer system, and to redistribute it freely,
46           * subject to the following restrictions:
47           *
48           * 1. The author is not responsible for the consequences of use of
49           * this software, no matter how awful, even if they arise
50           * from defects in it.
51           *
52           * 2. The origin of this software must not be misrepresented, either
53           * by explicit claim or by omission.
54           *
55           * 3. Altered versions must be plainly marked as such, and must not
56           * be misrepresented as being the original software.
57           *
58           *
59           **** Alterations to Henry's code are...
60           ****
61           **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62           **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63           **** by Larry Wall and others
64           ****
65           **** You may distribute under the terms of either the GNU General Public
66           **** License or the Artistic License, as specified in the README file.
67            
68           *
69           * Beware that some of this code is subtly aware of the way operator
70           * precedence is structured in regular expressions. Serious changes in
71           * regular-expression syntax might require a total rethink.
72           */
73           #include "EXTERN.h"
74           #define PERL_IN_REGCOMP_C
75           #include "perl.h"
76            
77           #ifndef PERL_IN_XSUB_RE
78           # include "INTERN.h"
79           #endif
80            
81           #define REG_COMP_C
82           #ifdef PERL_IN_XSUB_RE
83           # include "re_comp.h"
84           extern const struct regexp_engine my_reg_engine;
85           #else
86           # include "regcomp.h"
87           #endif
88            
89           #include "dquote_static.c"
90           #include "charclass_invlists.h"
91           #include "inline_invlist.c"
92           #include "unicode_constants.h"
93            
94           #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95           #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96           #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
97            
98           #ifdef op
99           #undef op
100           #endif /* op */
101            
102           #ifdef MSDOS
103           # if defined(BUGGY_MSC6)
104           /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
105           # pragma optimize("a",off)
106           /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
107           # pragma optimize("w",on )
108           # endif /* BUGGY_MSC6 */
109           #endif /* MSDOS */
110            
111           #ifndef STATIC
112           #define STATIC static
113           #endif
114            
115            
116           typedef struct RExC_state_t {
117           U32 flags; /* RXf_* are we folding, multilining? */
118           U32 pm_flags; /* PMf_* stuff from the calling PMOP */
119           char *precomp; /* uncompiled string. */
120           REGEXP *rx_sv; /* The SV that is the regexp. */
121           regexp *rx; /* perl core regexp structure */
122           regexp_internal *rxi; /* internal data for regexp object pprivate field */
123           char *start; /* Start of input for compile */
124           char *end; /* End of input for compile */
125           char *parse; /* Input-scan pointer. */
126           SSize_t whilem_seen; /* number of WHILEM in this expr */
127           regnode *emit_start; /* Start of emitted-code area */
128           regnode *emit_bound; /* First regnode outside of the allocated space */
129           regnode *emit; /* Code-emit pointer; if = &emit_dummy,
130           implies compiling, so don't emit */
131           regnode emit_dummy; /* placeholder for emit to point to */
132           I32 naughty; /* How bad is this pattern? */
133           I32 sawback; /* Did we see \1, ...? */
134           U32 seen;
135           SSize_t size; /* Code size. */
136           I32 npar; /* Capture buffer count, (OPEN). */
137           I32 cpar; /* Capture buffer count, (CLOSE). */
138           I32 nestroot; /* root parens we are in - used by accept */
139           I32 extralen;
140           I32 seen_zerolen;
141           regnode **open_parens; /* pointers to open parens */
142           regnode **close_parens; /* pointers to close parens */
143           regnode *opend; /* END node in program */
144           I32 utf8; /* whether the pattern is utf8 or not */
145           I32 orig_utf8; /* whether the pattern was originally in utf8 */
146           /* XXX use this for future optimisation of case
147           * where pattern must be upgraded to utf8. */
148           I32 uni_semantics; /* If a d charset modifier should use unicode
149           rules, even if the pattern is not in
150           utf8 */
151           HV *paren_names; /* Paren names */
152          
153           regnode **recurse; /* Recurse regops */
154           I32 recurse_count; /* Number of recurse regops */
155           I32 in_lookbehind;
156           I32 contains_locale;
157           I32 override_recoding;
158           I32 in_multi_char_class;
159           struct reg_code_block *code_blocks; /* positions of literal (?{})
160           within pattern */
161           int num_code_blocks; /* size of code_blocks[] */
162           int code_index; /* next code_blocks[] slot */
163           #if ADD_TO_REGEXEC
164           char *starttry; /* -Dr: where regtry was called. */
165           #define RExC_starttry (pRExC_state->starttry)
166           #endif
167           SV *runtime_code_qr; /* qr with the runtime code blocks */
168           #ifdef DEBUGGING
169           const char *lastparse;
170           I32 lastnum;
171           AV *paren_name_list; /* idx -> name */
172           #define RExC_lastparse (pRExC_state->lastparse)
173           #define RExC_lastnum (pRExC_state->lastnum)
174           #define RExC_paren_name_list (pRExC_state->paren_name_list)
175           #endif
176           } RExC_state_t;
177            
178           #define RExC_flags (pRExC_state->flags)
179           #define RExC_pm_flags (pRExC_state->pm_flags)
180           #define RExC_precomp (pRExC_state->precomp)
181           #define RExC_rx_sv (pRExC_state->rx_sv)
182           #define RExC_rx (pRExC_state->rx)
183           #define RExC_rxi (pRExC_state->rxi)
184           #define RExC_start (pRExC_state->start)
185           #define RExC_end (pRExC_state->end)
186           #define RExC_parse (pRExC_state->parse)
187           #define RExC_whilem_seen (pRExC_state->whilem_seen)
188           #ifdef RE_TRACK_PATTERN_OFFSETS
189           #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
190           #endif
191           #define RExC_emit (pRExC_state->emit)
192           #define RExC_emit_dummy (pRExC_state->emit_dummy)
193           #define RExC_emit_start (pRExC_state->emit_start)
194           #define RExC_emit_bound (pRExC_state->emit_bound)
195           #define RExC_naughty (pRExC_state->naughty)
196           #define RExC_sawback (pRExC_state->sawback)
197           #define RExC_seen (pRExC_state->seen)
198           #define RExC_size (pRExC_state->size)
199           #define RExC_npar (pRExC_state->npar)
200           #define RExC_nestroot (pRExC_state->nestroot)
201           #define RExC_extralen (pRExC_state->extralen)
202           #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
203           #define RExC_utf8 (pRExC_state->utf8)
204           #define RExC_uni_semantics (pRExC_state->uni_semantics)
205           #define RExC_orig_utf8 (pRExC_state->orig_utf8)
206           #define RExC_open_parens (pRExC_state->open_parens)
207           #define RExC_close_parens (pRExC_state->close_parens)
208           #define RExC_opend (pRExC_state->opend)
209           #define RExC_paren_names (pRExC_state->paren_names)
210           #define RExC_recurse (pRExC_state->recurse)
211           #define RExC_recurse_count (pRExC_state->recurse_count)
212           #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
213           #define RExC_contains_locale (pRExC_state->contains_locale)
214           #define RExC_override_recoding (pRExC_state->override_recoding)
215           #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
216            
217            
218           #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
219           #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
220           ((*s) == '{' && regcurly(s, FALSE)))
221            
222           #ifdef SPSTART
223           #undef SPSTART /* dratted cpp namespace... */
224           #endif
225           /*
226           * Flags to be passed up and down.
227           */
228           #define WORST 0 /* Worst case. */
229           #define HASWIDTH 0x01 /* Known to match non-null strings. */
230            
231           /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
232           * character. (There needs to be a case: in the switch statement in regexec.c
233           * for any node marked SIMPLE.) Note that this is not the same thing as
234           * REGNODE_SIMPLE */
235           #define SIMPLE 0x02
236           #define SPSTART 0x04 /* Starts with * or + */
237           #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
238           #define TRYAGAIN 0x10 /* Weeded out a declaration. */
239           #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
240            
241           #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
242            
243           /* whether trie related optimizations are enabled */
244           #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
245           #define TRIE_STUDY_OPT
246           #define FULL_TRIE_STUDY
247           #define TRIE_STCLASS
248           #endif
249            
250            
251            
252           #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
253           #define PBITVAL(paren) (1 << ((paren) & 7))
254           #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
255           #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
256           #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
257            
258           #define REQUIRE_UTF8 STMT_START { \
259           if (!UTF) { \
260           *flagp = RESTART_UTF8; \
261           return NULL; \
262           } \
263           } STMT_END
264            
265           /* This converts the named class defined in regcomp.h to its equivalent class
266           * number defined in handy.h. */
267           #define namedclass_to_classnum(class) ((int) ((class) / 2))
268           #define classnum_to_namedclass(classnum) ((classnum) * 2)
269            
270           /* About scan_data_t.
271            
272           During optimisation we recurse through the regexp program performing
273           various inplace (keyhole style) optimisations. In addition study_chunk
274           and scan_commit populate this data structure with information about
275           what strings MUST appear in the pattern. We look for the longest
276           string that must appear at a fixed location, and we look for the
277           longest string that may appear at a floating location. So for instance
278           in the pattern:
279          
280           /FOO[xX]A.*B[xX]BAR/
281          
282           Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
283           strings (because they follow a .* construct). study_chunk will identify
284           both FOO and BAR as being the longest fixed and floating strings respectively.
285          
286           The strings can be composites, for instance
287          
288           /(f)(o)(o)/
289          
290           will result in a composite fixed substring 'foo'.
291          
292           For each string some basic information is maintained:
293          
294           - offset or min_offset
295           This is the position the string must appear at, or not before.
296           It also implicitly (when combined with minlenp) tells us how many
297           characters must match before the string we are searching for.
298           Likewise when combined with minlenp and the length of the string it
299           tells us how many characters must appear after the string we have
300           found.
301          
302           - max_offset
303           Only used for floating strings. This is the rightmost point that
304           the string can appear at. If set to SSize_t_MAX it indicates that the
305           string can occur infinitely far to the right.
306          
307           - minlenp
308           A pointer to the minimum number of characters of the pattern that the
309           string was found inside. This is important as in the case of positive
310           lookahead or positive lookbehind we can have multiple patterns
311           involved. Consider
312          
313           /(?=FOO).*F/
314          
315           The minimum length of the pattern overall is 3, the minimum length
316           of the lookahead part is 3, but the minimum length of the part that
317           will actually match is 1. So 'FOO's minimum length is 3, but the
318           minimum length for the F is 1. This is important as the minimum length
319           is used to determine offsets in front of and behind the string being
320           looked for. Since strings can be composites this is the length of the
321           pattern at the time it was committed with a scan_commit. Note that
322           the length is calculated by study_chunk, so that the minimum lengths
323           are not known until the full pattern has been compiled, thus the
324           pointer to the value.
325          
326           - lookbehind
327          
328           In the case of lookbehind the string being searched for can be
329           offset past the start point of the final matching string.
330           If this value was just blithely removed from the min_offset it would
331           invalidate some of the calculations for how many chars must match
332           before or after (as they are derived from min_offset and minlen and
333           the length of the string being searched for).
334           When the final pattern is compiled and the data is moved from the
335           scan_data_t structure into the regexp structure the information
336           about lookbehind is factored in, with the information that would
337           have been lost precalculated in the end_shift field for the
338           associated string.
339            
340           The fields pos_min and pos_delta are used to store the minimum offset
341           and the delta to the maximum offset at the current point in the pattern.
342            
343           */
344            
345           typedef struct scan_data_t {
346           /*I32 len_min; unused */
347           /*I32 len_delta; unused */
348           SSize_t pos_min;
349           SSize_t pos_delta;
350           SV *last_found;
351           SSize_t last_end; /* min value, <0 unless valid. */
352           SSize_t last_start_min;
353           SSize_t last_start_max;
354           SV **longest; /* Either &l_fixed, or &l_float. */
355           SV *longest_fixed; /* longest fixed string found in pattern */
356           SSize_t offset_fixed; /* offset where it starts */
357           SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
358           I32 lookbehind_fixed; /* is the position of the string modfied by LB */
359           SV *longest_float; /* longest floating string found in pattern */
360           SSize_t offset_float_min; /* earliest point in string it can appear */
361           SSize_t offset_float_max; /* latest point in string it can appear */
362           SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
363           SSize_t lookbehind_float; /* is the pos of the string modified by LB */
364           I32 flags;
365           I32 whilem_c;
366           SSize_t *last_closep;
367           struct regnode_charclass_class *start_class;
368           } scan_data_t;
369            
370           /* The below is perhaps overboard, but this allows us to save a test at the
371           * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
372           * and 'a' differ by a single bit; the same with the upper and lower case of
373           * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
374           * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
375           * then inverts it to form a mask, with just a single 0, in the bit position
376           * where the upper- and lowercase differ. XXX There are about 40 other
377           * instances in the Perl core where this micro-optimization could be used.
378           * Should decide if maintenance cost is worse, before changing those
379           *
380           * Returns a boolean as to whether or not 'v' is either a lowercase or
381           * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
382           * compile-time constant, the generated code is better than some optimizing
383           * compilers figure out, amounting to a mask and test. The results are
384           * meaningless if 'c' is not one of [A-Za-z] */
385           #define isARG2_lower_or_UPPER_ARG1(c, v) \
386           (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
387            
388           /*
389           * Forward declarations for pregcomp()'s friends.
390           */
391            
392           static const scan_data_t zero_scan_data =
393           { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
394            
395           #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
396           #define SF_BEFORE_SEOL 0x0001
397           #define SF_BEFORE_MEOL 0x0002
398           #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
399           #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
400            
401           #ifdef NO_UNARY_PLUS
402           # define SF_FIX_SHIFT_EOL (0+2)
403           # define SF_FL_SHIFT_EOL (0+4)
404           #else
405           # define SF_FIX_SHIFT_EOL (+2)
406           # define SF_FL_SHIFT_EOL (+4)
407           #endif
408            
409           #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
410           #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
411            
412           #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
413           #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
414           #define SF_IS_INF 0x0040
415           #define SF_HAS_PAR 0x0080
416           #define SF_IN_PAR 0x0100
417           #define SF_HAS_EVAL 0x0200
418           #define SCF_DO_SUBSTR 0x0400
419           #define SCF_DO_STCLASS_AND 0x0800
420           #define SCF_DO_STCLASS_OR 0x1000
421           #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
422           #define SCF_WHILEM_VISITED_POS 0x2000
423            
424           #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
425           #define SCF_SEEN_ACCEPT 0x8000
426           #define SCF_TRIE_DOING_RESTUDY 0x10000
427            
428           #define UTF cBOOL(RExC_utf8)
429            
430           /* The enums for all these are ordered so things work out correctly */
431           #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
432           #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
433           #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
434           #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
435           #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
436           #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
437           #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
438            
439           #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
440            
441           #define OOB_NAMEDCLASS -1
442            
443           /* There is no code point that is out-of-bounds, so this is problematic. But
444           * its only current use is to initialize a variable that is always set before
445           * looked at. */
446           #define OOB_UNICODE 0xDEADBEEF
447            
448           #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
449           #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
450            
451            
452           /* length of regex to show in messages that don't mark a position within */
453           #define RegexLengthToShowInErrorMessages 127
454            
455           /*
456           * If MARKER[12] are adjusted, be sure to adjust the constants at the top
457           * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
458           * op/pragma/warn/regcomp.
459           */
460           #define MARKER1 "<-- HERE" /* marker as it appears in the description */
461           #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
462            
463           #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
464            
465           /*
466           * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
467           * arg. Show regex, up to a maximum length. If it's too long, chop and add
468           * "...".
469           */
470           #define _FAIL(code) STMT_START { \
471           const char *ellipses = ""; \
472           IV len = RExC_end - RExC_precomp; \
473           \
474           if (!SIZE_ONLY) \
475           SAVEFREESV(RExC_rx_sv); \
476           if (len > RegexLengthToShowInErrorMessages) { \
477           /* chop 10 shorter than the max, to ensure meaning of "..." */ \
478           len = RegexLengthToShowInErrorMessages - 10; \
479           ellipses = "..."; \
480           } \
481           code; \
482           } STMT_END
483            
484           #define FAIL(msg) _FAIL( \
485           Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
486           msg, (int)len, RExC_precomp, ellipses))
487            
488           #define FAIL2(msg,arg) _FAIL( \
489           Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
490           arg, (int)len, RExC_precomp, ellipses))
491            
492           /*
493           * Simple_vFAIL -- like FAIL, but marks the current location in the scan
494           */
495           #define Simple_vFAIL(m) STMT_START { \
496           const IV offset = RExC_parse - RExC_precomp; \
497           Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
498           m, (int)offset, RExC_precomp, RExC_precomp + offset); \
499           } STMT_END
500            
501           /*
502           * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
503           */
504           #define vFAIL(m) STMT_START { \
505           if (!SIZE_ONLY) \
506           SAVEFREESV(RExC_rx_sv); \
507           Simple_vFAIL(m); \
508           } STMT_END
509            
510           /*
511           * Like Simple_vFAIL(), but accepts two arguments.
512           */
513           #define Simple_vFAIL2(m,a1) STMT_START { \
514           const IV offset = RExC_parse - RExC_precomp; \
515           S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
516           (int)offset, RExC_precomp, RExC_precomp + offset); \
517           } STMT_END
518            
519           /*
520           * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
521           */
522           #define vFAIL2(m,a1) STMT_START { \
523           if (!SIZE_ONLY) \
524           SAVEFREESV(RExC_rx_sv); \
525           Simple_vFAIL2(m, a1); \
526           } STMT_END
527            
528            
529           /*
530           * Like Simple_vFAIL(), but accepts three arguments.
531           */
532           #define Simple_vFAIL3(m, a1, a2) STMT_START { \
533           const IV offset = RExC_parse - RExC_precomp; \
534           S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
535           (int)offset, RExC_precomp, RExC_precomp + offset); \
536           } STMT_END
537            
538           /*
539           * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
540           */
541           #define vFAIL3(m,a1,a2) STMT_START { \
542           if (!SIZE_ONLY) \
543           SAVEFREESV(RExC_rx_sv); \
544           Simple_vFAIL3(m, a1, a2); \
545           } STMT_END
546            
547           /*
548           * Like Simple_vFAIL(), but accepts four arguments.
549           */
550           #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
551           const IV offset = RExC_parse - RExC_precomp; \
552           S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
553           (int)offset, RExC_precomp, RExC_precomp + offset); \
554           } STMT_END
555            
556           #define vFAIL4(m,a1,a2,a3) STMT_START { \
557           if (!SIZE_ONLY) \
558           SAVEFREESV(RExC_rx_sv); \
559           Simple_vFAIL4(m, a1, a2, a3); \
560           } STMT_END
561            
562           /* m is not necessarily a "literal string", in this macro */
563           #define reg_warn_non_literal_string(loc, m) STMT_START { \
564           const IV offset = loc - RExC_precomp; \
565           Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
566           m, (int)offset, RExC_precomp, RExC_precomp + offset); \
567           } STMT_END
568            
569           #define ckWARNreg(loc,m) STMT_START { \
570           const IV offset = loc - RExC_precomp; \
571           Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
572           (int)offset, RExC_precomp, RExC_precomp + offset); \
573           } STMT_END
574            
575           #define vWARN_dep(loc, m) STMT_START { \
576           const IV offset = loc - RExC_precomp; \
577           Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
578           (int)offset, RExC_precomp, RExC_precomp + offset); \
579           } STMT_END
580            
581           #define ckWARNdep(loc,m) STMT_START { \
582           const IV offset = loc - RExC_precomp; \
583           Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
584           m REPORT_LOCATION, \
585           (int)offset, RExC_precomp, RExC_precomp + offset); \
586           } STMT_END
587            
588           #define ckWARNregdep(loc,m) STMT_START { \
589           const IV offset = loc - RExC_precomp; \
590           Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
591           m REPORT_LOCATION, \
592           (int)offset, RExC_precomp, RExC_precomp + offset); \
593           } STMT_END
594            
595           #define ckWARN2reg_d(loc,m, a1) STMT_START { \
596           const IV offset = loc - RExC_precomp; \
597           Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
598           m REPORT_LOCATION, \
599           a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
600           } STMT_END
601            
602           #define ckWARN2reg(loc, m, a1) STMT_START { \
603           const IV offset = loc - RExC_precomp; \
604           Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
605           a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
606           } STMT_END
607            
608           #define vWARN3(loc, m, a1, a2) STMT_START { \
609           const IV offset = loc - RExC_precomp; \
610           Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
611           a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
612           } STMT_END
613            
614           #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
615           const IV offset = loc - RExC_precomp; \
616           Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
617           a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
618           } STMT_END
619            
620           #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
621           const IV offset = loc - RExC_precomp; \
622           Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
623           a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
624           } STMT_END
625            
626           #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
627           const IV offset = loc - RExC_precomp; \
628           Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
629           a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
630           } STMT_END
631            
632           #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
633           const IV offset = loc - RExC_precomp; \
634           Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
635           a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
636           } STMT_END
637            
638            
639           /* Allow for side effects in s */
640           #define REGC(c,s) STMT_START { \
641           if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
642           } STMT_END
643            
644           /* Macros for recording node offsets. 20001227 mjd@plover.com
645           * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
646           * element 2*n-1 of the array. Element #2n holds the byte length node #n.
647           * Element 0 holds the number n.
648           * Position is 1 indexed.
649           */
650           #ifndef RE_TRACK_PATTERN_OFFSETS
651           #define Set_Node_Offset_To_R(node,byte)
652           #define Set_Node_Offset(node,byte)
653           #define Set_Cur_Node_Offset
654           #define Set_Node_Length_To_R(node,len)
655           #define Set_Node_Length(node,len)
656           #define Set_Node_Cur_Length(node,start)
657           #define Node_Offset(n)
658           #define Node_Length(n)
659           #define Set_Node_Offset_Length(node,offset,len)
660           #define ProgLen(ri) ri->u.proglen
661           #define SetProgLen(ri,x) ri->u.proglen = x
662           #else
663           #define ProgLen(ri) ri->u.offsets[0]
664           #define SetProgLen(ri,x) ri->u.offsets[0] = x
665           #define Set_Node_Offset_To_R(node,byte) STMT_START { \
666           if (! SIZE_ONLY) { \
667           MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
668           __LINE__, (int)(node), (int)(byte))); \
669           if((node) < 0) { \
670           Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
671           } else { \
672           RExC_offsets[2*(node)-1] = (byte); \
673           } \
674           } \
675           } STMT_END
676            
677           #define Set_Node_Offset(node,byte) \
678           Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
679           #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
680            
681           #define Set_Node_Length_To_R(node,len) STMT_START { \
682           if (! SIZE_ONLY) { \
683           MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
684           __LINE__, (int)(node), (int)(len))); \
685           if((node) < 0) { \
686           Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
687           } else { \
688           RExC_offsets[2*(node)] = (len); \
689           } \
690           } \
691           } STMT_END
692            
693           #define Set_Node_Length(node,len) \
694           Set_Node_Length_To_R((node)-RExC_emit_start, len)
695           #define Set_Node_Cur_Length(node, start) \
696           Set_Node_Length(node, RExC_parse - start)
697            
698           /* Get offsets and lengths */
699           #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
700           #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
701            
702           #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
703           Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
704           Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
705           } STMT_END
706           #endif
707            
708           #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
709           #define EXPERIMENTAL_INPLACESCAN
710           #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
711            
712           #define DEBUG_STUDYDATA(str,data,depth) \
713           DEBUG_OPTIMISE_MORE_r(if(data){ \
714           PerlIO_printf(Perl_debug_log, \
715           "%*s" str "Pos:%"IVdf"/%"IVdf \
716           " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
717           (int)(depth)*2, "", \
718           (IV)((data)->pos_min), \
719           (IV)((data)->pos_delta), \
720           (UV)((data)->flags), \
721           (IV)((data)->whilem_c), \
722           (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
723           is_inf ? "INF " : "" \
724           ); \
725           if ((data)->last_found) \
726           PerlIO_printf(Perl_debug_log, \
727           "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
728           " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
729           SvPVX_const((data)->last_found), \
730           (IV)((data)->last_end), \
731           (IV)((data)->last_start_min), \
732           (IV)((data)->last_start_max), \
733           ((data)->longest && \
734           (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
735           SvPVX_const((data)->longest_fixed), \
736           (IV)((data)->offset_fixed), \
737           ((data)->longest && \
738           (data)->longest==&((data)->longest_float)) ? "*" : "", \
739           SvPVX_const((data)->longest_float), \
740           (IV)((data)->offset_float_min), \
741           (IV)((data)->offset_float_max) \
742           ); \
743           PerlIO_printf(Perl_debug_log,"\n"); \
744           });
745            
746           /* Mark that we cannot extend a found fixed substring at this point.
747           Update the longest found anchored substring and the longest found
748           floating substrings if needed. */
749            
750           STATIC void
751 24403259         S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
752           SSize_t *minlenp, int is_inf)
753           {
754 24403259         const STRLEN l = CHR_SVLEN(data->last_found);
755 24403259         const STRLEN old_l = CHR_SVLEN(*data->longest);
756 24403259         GET_RE_DEBUG_FLAGS_DECL;
757            
758 9017128         PERL_ARGS_ASSERT_SCAN_COMMIT;
759            
760 9017128         if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
761 4708910         SvSetMagicSV(*data->longest, data->last_found);
762 4708910         if (*data->longest == data->longest_fixed) {
763 519868         data->offset_fixed = l ? data->last_start_min : data->pos_min;
764 4189152         if (data->flags & SF_BEFORE_EOL)
765           data->flags
766 4708788         |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
767           else
768 4708904         data->flags &= ~SF_FIX_BEFORE_EOL;
769 4308292         data->minlen_fixed=minlenp;
770 4308292         data->lookbehind_fixed=0;
771           }
772           else { /* *data->longest == data->longest_float */
773 4308182         data->offset_float_min = l ? data->last_start_min : data->pos_min;
774 1181149         data->offset_float_max = (l
775           ? data->last_start_max
776 4308182         : (data->pos_delta == SSize_t_MAX
777           ? SSize_t_MAX
778 2937189         : data->pos_min + data->pos_delta));
779 1371253         if (is_inf
780 4308182         || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
781 1693699         data->offset_float_max = SSize_t_MAX;
782 2614483         if (data->flags & SF_BEFORE_EOL)
783           data->flags
784 4308180         |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
785           else
786 4308178         data->flags &= ~SF_FL_BEFORE_EOL;
787 24403101         data->minlen_float=minlenp;
788 24403101         data->lookbehind_float=0;
789           }
790           }
791 24403259         SvCUR_set(data->last_found, 0);
792           {
793 1237694         SV * const sv = data->last_found;
794 1237694         if (SvUTF8(sv) && SvMAGICAL(sv)) {
795 1237550         MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
796 24403115         if (mg)
797 24403115         mg->mg_len = 0;
798           }
799           }
800 24403259         data->last_end = -1;
801 11174063         data->flags &= ~SF_BEFORE_EOL;
802 11174063         DEBUG_STUDYDATA("commit: ",data,0);
803 11174063         }
804            
805           /* These macros set, clear and test whether the synthetic start class ('ssc',
806           * given by the parameter) matches an empty string (EOS). This uses the
807           * 'next_off' field in the node, to save a bit in the flags field. The ssc
808           * stands alone, so there is never a next_off, so this field is otherwise
809           * unused. The EOS information is used only for compilation, but theoretically
810           * it could be passed on to the execution code. This could be used to store
811           * more than one bit of information, but only this one is currently used. */
812           #define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
813           #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
814           #define TEST_SSC_EOS(node) cBOOL((node)->next_off)
815            
816           /* Can match anything (initialization) */
817           STATIC void
818 11174083         S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
819           {
820 11174083         PERL_ARGS_ASSERT_CL_ANYTHING;
821            
822 322450         ANYOF_BITMAP_SETALL(cl);
823 322450         cl->flags = ANYOF_UNICODE_ALL;
824 10851817         SET_SSC_EOS(cl);
825            
826           /* If any portion of the regex is to operate under locale rules,
827           * initialization includes it. The reason this isn't done for all regexes
828           * is that the optimizer was written under the assumption that locale was
829           * all-or-nothing. Given the complexity and lack of documentation in the
830           * optimizer, and that there are inadequate test cases for locale, so many
831           * parts of it may not work properly, it is safest to avoid locale unless
832           * necessary. */
833 11174083         if (RExC_contains_locale) {
834 561264         ANYOF_CLASS_SETALL(cl); /* /l uses class */
835 9000016         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
836           }
837           else {
838 8472778         ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
839           }
840 527606         }
841            
842           /* Can match anything (initialization) */
843           STATIC int
844 289256         S_cl_is_anything(const struct regnode_charclass_class *cl)
845           {
846           int value;
847            
848 323810         PERL_ARGS_ASSERT_CL_IS_ANYTHING;
849            
850 11157999         for (value = 0; value < ANYOF_MAX; value += 2)
851 11157999         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
852           return 1;
853 11157999         if (!(cl->flags & ANYOF_UNICODE_ALL))
854           return 0;
855 11157999         if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
856           return 0;
857 11157999         return 1;
858           }
859            
860           /* Can match anything (initialization) */
861           STATIC void
862 2996302         S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
863           {
864 2996302         PERL_ARGS_ASSERT_CL_INIT;
865            
866           Zero(cl, 1, struct regnode_charclass_class);
867 2897626         cl->type = ANYOF;
868 2765006         cl_anything(pRExC_state, cl);
869 2765006         ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
870 2765006         }
871            
872           /* These two functions currently do the exact same thing */
873           #define cl_init_zero cl_init
874            
875           /* 'AND' a given class with another one. Can create false positives. 'cl'
876           * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
877           * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
878           STATIC void
879 2764828         S_cl_and(struct regnode_charclass_class *cl,
880           const struct regnode_charclass_class *and_with)
881           {
882 476246         PERL_ARGS_ASSERT_CL_AND;
883            
884 468742         assert(PL_regkind[and_with->type] == ANYOF);
885            
886           /* I (khw) am not sure all these restrictions are necessary XXX */
887 89415577         if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
888 88005574         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
889 2996124         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
890 47614         && !(and_with->flags & ANYOF_LOC_FOLD)
891 47614         && !(cl->flags & ANYOF_LOC_FOLD)) {
892           int i;
893            
894 47614         if (and_with->flags & ANYOF_INVERT)
895 47608         for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
896 0         cl->bitmap[i] &= ~and_with->bitmap[i];
897           else
898 47800         for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
899 3578         cl->bitmap[i] &= and_with->bitmap[i];
900           } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
901            
902 2948516         if (and_with->flags & ANYOF_INVERT) {
903            
904           /* Here, the and'ed node is inverted. Get the AND of the flags that
905           * aren't affected by the inversion. Those that are affected are
906           * handled individually below */
907 2389512         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
908 489192         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
909 489192         cl->flags |= affected_flags;
910            
911           /* We currently don't know how to deal with things that aren't in the
912           * bitmap, but we know that the intersection is no greater than what
913           * is already in cl, so let there be false positives that get sorted
914           * out after the synthetic start class succeeds, and the node is
915           * matched for real. */
916            
917           /* The inversion of these two flags indicate that the resulting
918           * intersection doesn't have them */
919 558998         if (and_with->flags & ANYOF_UNICODE_ALL) {
920 558996         cl->flags &= ~ANYOF_UNICODE_ALL;
921           }
922 558996         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
923 558996         cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
924           }
925           }
926           else { /* and'd node is not inverted */
927           U8 outside_bitmap_but_not_utf8; /* Temp variable */
928            
929 2948516         if (! ANYOF_NONBITMAP(and_with)) {
930            
931           /* Here 'and_with' doesn't match anything outside the bitmap
932           * (except possibly ANYOF_UNICODE_ALL), which means the
933           * intersection can't either, except for ANYOF_UNICODE_ALL, in
934           * which case we don't know what the intersection is, but it's no
935           * greater than what cl already has, so can just leave it alone,
936           * with possible false positives */
937 2948516         if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
938 2948510         ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
939 2996118         cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
940           }
941           }
942 2760550         else if (! ANYOF_NONBITMAP(cl)) {
943            
944           /* Here, 'and_with' does match something outside the bitmap, and cl
945           * doesn't have a list of things to match outside the bitmap. If
946           * cl can match all code points above 255, the intersection will
947           * be those above-255 code points that 'and_with' matches. If cl
948           * can't match all Unicode code points, it means that it can't
949           * match anything outside the bitmap (since the 'if' that got us
950           * into this block tested for that), so we leave the bitmap empty.
951           */
952 2760550         if (cl->flags & ANYOF_UNICODE_ALL) {
953 14920         ARG_SET(cl, ARG(and_with));
954            
955           /* and_with's ARG may match things that don't require UTF8.
956           * And now cl's will too, in spite of this being an 'and'. See
957           * the comments below about the kludge */
958 14920         cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
959           }
960           }
961           else {
962           /* Here, both 'and_with' and cl match something outside the
963           * bitmap. Currently we do not do the intersection, so just match
964           * whatever cl had at the beginning. */
965           }
966            
967            
968           /* Take the intersection of the two sets of flags. However, the
969           * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
970           * kludge around the fact that this flag is not treated like the others
971           * which are initialized in cl_anything(). The way the optimizer works
972           * is that the synthetic start class (SSC) is initialized to match
973           * anything, and then the first time a real node is encountered, its
974           * values are AND'd with the SSC's with the result being the values of
975           * the real node. However, there are paths through the optimizer where
976           * the AND never gets called, so those initialized bits are set
977           * inappropriately, which is not usually a big deal, as they just cause
978           * false positives in the SSC, which will just mean a probably
979           * imperceptible slow down in execution. However this bit has a
980           * higher false positive consequence in that it can cause utf8.pm,
981           * utf8_heavy.pl ... to be loaded when not necessary, which is a much
982           * bigger slowdown and also causes significant extra memory to be used.
983           * In order to prevent this, the code now takes a different tack. The
984           * bit isn't set unless some part of the regular expression needs it,
985           * but once set it won't get cleared. This means that these extra
986           * modules won't get loaded unless there was some path through the
987           * pattern that would have required them anyway, and so any false
988           * positives that occur by not ANDing them out when they could be
989           * aren't as severe as they would be if we treated this bit like all
990           * the others */
991 6         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
992           & ANYOF_NONBITMAP_NON_UTF8;
993 6         cl->flags &= and_with->flags;
994 6         cl->flags |= outside_bitmap_but_not_utf8;
995           }
996 6         }
997            
998           /* 'OR' a given class with another one. Can create false positives. 'cl'
999           * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
1000           * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
1001           STATIC void
1002 46         S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
1003           {
1004 46         PERL_ARGS_ASSERT_CL_OR;
1005            
1006 14966         if (or_with->flags & ANYOF_INVERT) {
1007            
1008           /* Here, the or'd node is to be inverted. This means we take the
1009           * complement of everything not in the bitmap, but currently we don't
1010           * know what that is, so give up and match anything */
1011 2745630         if (ANYOF_NONBITMAP(or_with)) {
1012 2745630         cl_anything(pRExC_state, cl);
1013           }
1014           /* We do not use
1015           * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
1016           * <= (B1 | !B2) | (CL1 | !CL2)
1017           * which is wasteful if CL2 is small, but we ignore CL2:
1018           * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
1019           * XXXX Can we handle case-fold? Unclear:
1020           * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
1021           * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1022           */
1023 1408133         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1024 89256193         && !(or_with->flags & ANYOF_LOC_FOLD)
1025 87860160         && !(cl->flags & ANYOF_LOC_FOLD) ) {
1026           int i;
1027            
1028 2745630         for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1029 24200         cl->bitmap[i] |= ~or_with->bitmap[i];
1030           } /* XXXX: logic is complicated otherwise */
1031           else {
1032 0         cl_anything(pRExC_state, cl);
1033           }
1034            
1035           /* And, we can just take the union of the flags that aren't affected
1036           * by the inversion */
1037 2745630         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1038            
1039           /* For the remaining flags:
1040           ANYOF_UNICODE_ALL and inverted means to not match anything above
1041           255, which means that the union with cl should just be
1042           what cl has in it, so can ignore this flag
1043           ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1044           is (ASCII) 127-255 to match them, but then invert that, so
1045           the union with cl should just be what cl has in it, so can
1046           ignore this flag
1047           */
1048           } else { /* 'or_with' is not inverted */
1049           /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1050 140606         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1051 140192         && (!(or_with->flags & ANYOF_LOC_FOLD)
1052 414         || (cl->flags & ANYOF_LOC_FOLD)) ) {
1053           int i;
1054            
1055           /* OR char bitmap and class bitmap separately */
1056 1886         for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1057 1472         cl->bitmap[i] |= or_with->bitmap[i];
1058 460         if (or_with->flags & ANYOF_CLASS) {
1059 2745630         ANYOF_CLASS_OR(or_with, cl);
1060           }
1061           }
1062           else { /* XXXX: logic is complicated, leave it along for a moment. */
1063 2760550         cl_anything(pRExC_state, cl);
1064           }
1065            
1066 630257         if (ANYOF_NONBITMAP(or_with)) {
1067            
1068           /* Use the added node's outside-the-bit-map match if there isn't a
1069           * conflict. If there is a conflict (both nodes match something
1070           * outside the bitmap, but what they match outside is not the same
1071           * pointer, and hence not easily compared until XXX we extend
1072           * inversion lists this far), give up and allow the start class to
1073           * match everything outside the bitmap. If that stuff is all above
1074           * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1075 630211         if (! ANYOF_NONBITMAP(cl)) {
1076 630211         ARG_SET(cl, ARG(or_with));
1077           }
1078 630211         else if (ARG(cl) != ARG(or_with)) {
1079            
1080 630211         if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1081 630211         cl_anything(pRExC_state, cl);
1082           }
1083           else {
1084 630211         cl->flags |= ANYOF_UNICODE_ALL;
1085           }
1086           }
1087           }
1088            
1089           /* Take the union */
1090 92314         cl->flags |= or_with->flags;
1091           }
1092 46         }
1093            
1094           #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1095           #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1096           #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1097           #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1098            
1099            
1100           #ifdef DEBUGGING
1101           /*
1102           dump_trie(trie,widecharmap,revcharmap)
1103           dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1104           dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1105            
1106           These routines dump out a trie in a somewhat readable format.
1107           The _interim_ variants are used for debugging the interim
1108           tables that are used to generate the final compressed
1109           representation which is what dump_trie expects.
1110            
1111           Part of the reason for their existence is to provide a form
1112           of documentation as to how the different representations function.
1113            
1114           */
1115            
1116           /*
1117           Dumps the final compressed table form of the trie to Perl_debug_log.
1118           Used for debugging make_trie().
1119           */
1120            
1121           STATIC void
1122 6         S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1123           AV *revcharmap, U32 depth)
1124           {
1125           U32 state;
1126 6         SV *sv=sv_newmortal();
1127 630217         int colwidth= widecharmap ? 6 : 4;
1128           U16 word;
1129 630217         GET_RE_DEBUG_FLAGS_DECL;
1130            
1131 630217         PERL_ARGS_ASSERT_DUMP_TRIE;
1132            
1133 630217         PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1134 630217         (int)depth * 2 + 2,"",
1135           "Match","Base","Ofs" );
1136            
1137 630311         for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1138 630305         SV ** const tmp = av_fetch( revcharmap, state, 0);
1139 538037         if ( tmp ) {
1140 630681         PerlIO_printf( Perl_debug_log, "%*s",
1141           colwidth,
1142 630681         pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1143           PL_colors[0], PL_colors[1],
1144           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1145           PERL_PV_ESCAPE_FIRSTCHAR
1146           )
1147           );
1148           }
1149           }
1150 630217         PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1151 6         (int)depth * 2 + 2,"");
1152            
1153 630311         for( state = 0 ; state < trie->uniquecharcount ; state++ )
1154 259262         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1155 2557967         PerlIO_printf( Perl_debug_log, "\n");
1156            
1157 1927846         for( state = 1 ; state < trie->statecount ; state++ ) {
1158 1927840         const U32 base = trie->states[ state ].trans.base;
1159            
1160 1927840         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1161            
1162 1927840         if ( trie->states[ state ].wordnum ) {
1163 1927800         PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1164           } else {
1165 5728         PerlIO_printf( Perl_debug_log, "%6s", "" );
1166           }
1167            
1168 5778         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1169            
1170 142         if ( base ) {
1171           U32 ofs = 0;
1172            
1173 842         while( ( base + ofs < trie->uniquecharcount ) ||
1174 358         ( base + ofs - trie->uniquecharcount < trie->lasttrans
1175 5942         && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1176 6074         ofs++;
1177            
1178 1922160         PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1179            
1180 1722742         for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1181 1723784         if ( ( base + ofs >= trie->uniquecharcount ) &&
1182 978878         ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1183 7175658         trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1184           {
1185 6198054         PerlIO_printf( Perl_debug_log, "%*"UVXf,
1186           colwidth,
1187 6198054         (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1188           } else {
1189 6198948         PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1190           }
1191           }
1192            
1193 287120         PerlIO_printf( Perl_debug_log, "]");
1194            
1195           }
1196 287164         PerlIO_printf( Perl_debug_log, "\n" );
1197           }
1198 118829         PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1199 79262         for (word=1; word <= trie->wordcount; word++) {
1200 207954         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1201 75074         (int)word, (int)(trie->wordinfo[word].prev),
1202 132880         (int)(trie->wordinfo[word].len));
1203           }
1204 132836         PerlIO_printf(Perl_debug_log, "\n" );
1205 5910800         }
1206           /*
1207           Dumps a fully constructed but uncompressed trie in list form.
1208           List tries normally only are used for construction when the number of
1209           possible chars (trie->uniquecharcount) is very high.
1210           Used for debugging make_trie().
1211           */
1212           STATIC void
1213 5910794         S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1214           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1215           U32 depth)
1216           {
1217           U32 state;
1218 5682834         SV *sv=sv_newmortal();
1219 227960         int colwidth= widecharmap ? 6 : 4;
1220 14588         GET_RE_DEBUG_FLAGS_DECL;
1221            
1222 213372         PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1223            
1224           /* print out the table precompression. */
1225 213372         PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1226 6197868         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1227           "------:-----+-----------------\n" );
1228          
1229 6050074         for( state=1 ; state < next_alloc ; state ++ ) {
1230           U16 charid;
1231          
1232 367052         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1233 367052         (int)depth * 2 + 2,"", (UV)state );
1234 258210         if ( ! trie->states[ state ].wordnum ) {
1235 258210         PerlIO_printf( Perl_debug_log, "%5s| ","");
1236           } else {
1237 6050074         PerlIO_printf( Perl_debug_log, "W%4x| ",
1238 3437251         trie->states[ state ].wordnum
1239           );
1240           }
1241 3437251         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1242 6050074         SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1243 1642398         if ( tmp ) {
1244 1642398         PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1245           colwidth,
1246 1642398         pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1247           PL_colors[0], PL_colors[1],
1248           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1249           PERL_PV_ESCAPE_FIRSTCHAR
1250           ) ,
1251 1642314         TRIE_LIST_ITEM(state,charid).forid,
1252 1220         (UV)TRIE_LIST_ITEM(state,charid).newstate
1253           );
1254 147794         if (!(charid % 10))
1255 35058         PerlIO_printf(Perl_debug_log, "\n%*s| ",
1256 147794         (int)((depth * 2) + 14), "");
1257           }
1258           }
1259 147794         PerlIO_printf( Perl_debug_log, "\n");
1260           }
1261 0         }
1262            
1263           /*
1264           Dumps a fully constructed but uncompressed trie in table form.
1265           This is the normal DFA style state transition table, with a few
1266           twists to facilitate compression later.
1267           Used for debugging make_trie().
1268           */
1269           STATIC void
1270 147794         S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1271           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1272           U32 depth)
1273           {
1274           U32 state;
1275           U16 charid;
1276 141522         SV *sv=sv_newmortal();
1277 141522         int colwidth= widecharmap ? 6 : 4;
1278 1922114         GET_RE_DEBUG_FLAGS_DECL;
1279            
1280 630211         PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1281          
1282           /*
1283           print out the table precompression so that we can do a visual check
1284           that they are identical.
1285           */
1286          
1287 630211         PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1288            
1289 1291903         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 259772         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1291 1032131         if ( tmp ) {
1292 175469         PerlIO_printf( Perl_debug_log, "%*s",
1293           colwidth,
1294 630211         pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1295           PL_colors[0], PL_colors[1],
1296           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1297           PERL_PV_ESCAPE_FIRSTCHAR
1298           )
1299           );
1300           }
1301           }
1302            
1303 630211         PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1304            
1305 630211         for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1306 4         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1307           }
1308            
1309 4         PerlIO_printf( Perl_debug_log, "\n" );
1310            
1311 4         for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1312            
1313 79008         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1314 79004         (int)depth * 2 + 2,"",
1315 79004         (UV)TRIE_NODENUM( state ) );
1316            
1317 79004         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1318 79004         UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1319 0         if (v)
1320 0         PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1321           else
1322 0         PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1323           }
1324 0         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1325 79004         PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1326           } else {
1327 197606         PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1328 79068         trie->states[ TRIE_NODENUM( state ) ].wordnum );
1329           }
1330           }
1331 79068         }
1332            
1333           #endif
1334            
1335            
1336           /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1337           startbranch: the first branch in the whole branch sequence
1338           first : start branch of sequence of branch-exact nodes.
1339           May be the same as startbranch
1340           last : Thing following the last branch.
1341           May be the same as tail.
1342           tail : item following the branch sequence
1343           count : words in the sequence
1344           flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1345           depth : indent depth
1346            
1347           Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1348            
1349           A trie is an N'ary tree where the branches are determined by digital
1350           decomposition of the key. IE, at the root node you look up the 1st character and
1351           follow that branch repeat until you find the end of the branches. Nodes can be
1352           marked as "accepting" meaning they represent a complete word. Eg:
1353            
1354           /he|she|his|hers/
1355            
1356           would convert into the following structure. Numbers represent states, letters
1357           following numbers represent valid transitions on the letter from that state, if
1358           the number is in square brackets it represents an accepting state, otherwise it
1359           will be in parenthesis.
1360            
1361           +-h->+-e->[3]-+-r->(8)-+-s->[9]
1362           | |
1363           | (2)
1364           | |
1365           (1) +-i->(6)-+-s->[7]
1366           |
1367           +-s->(3)-+-h->(4)-+-e->[5]
1368            
1369           Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1370            
1371           This shows that when matching against the string 'hers' we will begin at state 1
1372           read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1373           then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1374           is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1375           single traverse. We store a mapping from accepting to state to which word was
1376           matched, and then when we have multiple possibilities we try to complete the
1377           rest of the regex in the order in which they occured in the alternation.
1378            
1379           The only prior NFA like behaviour that would be changed by the TRIE support is
1380           the silent ignoring of duplicate alternations which are of the form:
1381            
1382           / (DUPE|DUPE) X? (?{ ... }) Y /x
1383            
1384           Thus EVAL blocks following a trie may be called a different number of times with
1385           and without the optimisation. With the optimisations dupes will be silently
1386           ignored. This inconsistent behaviour of EVAL type nodes is well established as
1387           the following demonstrates:
1388            
1389           'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1390            
1391           which prints out 'word' three times, but
1392            
1393           'words'=~/(word|word|word)(?{ print $1 })S/
1394            
1395           which doesnt print it out at all. This is due to other optimisations kicking in.
1396            
1397           Example of what happens on a structural level:
1398            
1399           The regexp /(ac|ad|ab)+/ will produce the following debug output:
1400            
1401           1: CURLYM[1] {1,32767}(18)
1402           5: BRANCH(8)
1403           6: EXACT (16)
1404           8: BRANCH(11)
1405           9: EXACT (16)
1406           11: BRANCH(14)
1407           12: EXACT (16)
1408           16: SUCCEED(0)
1409           17: NOTHING(18)
1410           18: END(0)
1411            
1412           This would be optimizable with startbranch=5, first=5, last=16, tail=16
1413           and should turn into:
1414            
1415           1: CURLYM[1] {1,32767}(18)
1416           5: TRIE(16)
1417           [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1418          
1419          
1420          
1421           16: SUCCEED(0)
1422           17: NOTHING(18)
1423           18: END(0)
1424            
1425           Cases where tail != last would be like /(?foo|bar)baz/:
1426            
1427           1: BRANCH(4)
1428           2: EXACT (8)
1429           4: BRANCH(7)
1430           5: EXACT (8)
1431           7: TAIL(8)
1432           8: EXACT (10)
1433           10: END(0)
1434            
1435           which would be optimizable with startbranch=1, first=1, last=7, tail=8
1436           and would end up looking like:
1437            
1438           1: TRIE(8)
1439           [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1440          
1441          
1442           7: TAIL(8)
1443           8: EXACT (10)
1444           10: END(0)
1445            
1446           d = uvchr_to_utf8_flags(d, uv, 0);
1447            
1448           is the recommended Unicode-aware way of saying
1449            
1450           *(d++) = uv;
1451           */
1452            
1453           #define TRIE_STORE_REVCHAR(val) \
1454           STMT_START { \
1455           if (UTF) { \
1456           SV *zlopp = newSV(7); /* XXX: optimize me */ \
1457           unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1458           unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1459           SvCUR_set(zlopp, kapow - flrbbbbb); \
1460           SvPOK_on(zlopp); \
1461           SvUTF8_on(zlopp); \
1462           av_push(revcharmap, zlopp); \
1463           } else { \
1464           char ooooff = (char)val; \
1465           av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1466           } \
1467           } STMT_END
1468            
1469           /* This gets the next character from the input, folding it if not already
1470           * folded. */
1471           #define TRIE_READ_CHAR STMT_START { \
1472           wordlen++; \
1473           if ( UTF ) { \
1474           /* if it is UTF then it is either already folded, or does not need \
1475           * folding */ \
1476           uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1477           } \
1478           else if (folder == PL_fold_latin1) { \
1479           /* This folder implies Unicode rules, which in the range expressible \
1480           * by not UTF is the lower case, with the two exceptions, one of \
1481           * which should have been taken care of before calling this */ \
1482           assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1483           uvc = toLOWER_L1(*uc); \
1484           if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1485           len = 1; \
1486           } else { \
1487           /* raw data, will be folded later if needed */ \
1488           uvc = (U32)*uc; \
1489           len = 1; \
1490           } \
1491           } STMT_END
1492            
1493            
1494            
1495           #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1496           if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1497           U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1498           Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1499           } \
1500           TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1501           TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1502           TRIE_LIST_CUR( state )++; \
1503           } STMT_END
1504            
1505           #define TRIE_LIST_NEW(state) STMT_START { \
1506           Newxz( trie->states[ state ].trans.list, \
1507           4, reg_trie_trans_le ); \
1508           TRIE_LIST_CUR( state ) = 1; \
1509           TRIE_LIST_LEN( state ) = 4; \
1510           } STMT_END
1511            
1512           #define TRIE_HANDLE_WORD(state) STMT_START { \
1513           U16 dupe= trie->states[ state ].wordnum; \
1514           regnode * const noper_next = regnext( noper ); \
1515           \
1516           DEBUG_r({ \
1517           /* store the word for dumping */ \
1518           SV* tmp; \
1519           if (OP(noper) != NOTHING) \
1520           tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1521           else \
1522           tmp = newSVpvn_utf8( "", 0, UTF ); \
1523           av_push( trie_words, tmp ); \
1524           }); \
1525           \
1526           curword++; \
1527           trie->wordinfo[curword].prev = 0; \
1528           trie->wordinfo[curword].len = wordlen; \
1529           trie->wordinfo[curword].accept = state; \
1530           \
1531           if ( noper_next < tail ) { \
1532           if (!trie->jump) \
1533           trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1534           trie->jump[curword] = (U16)(noper_next - convert); \
1535           if (!jumper) \
1536           jumper = noper_next; \
1537           if (!nextbranch) \
1538           nextbranch= regnext(cur); \
1539           } \
1540           \
1541           if ( dupe ) { \
1542           /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1543           /* chain, so that when the bits of chain are later */\
1544           /* linked together, the dups appear in the chain */\
1545           trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1546           trie->wordinfo[dupe].prev = curword; \
1547           } else { \
1548           /* we haven't inserted this word yet. */ \
1549           trie->states[ state ].wordnum = curword; \
1550           } \
1551           } STMT_END
1552            
1553            
1554           #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1555           ( ( base + charid >= ucharcount \
1556           && base + charid < ubound \
1557           && state == trie->trans[ base - ucharcount + charid ].check \
1558           && trie->trans[ base - ucharcount + charid ].next ) \
1559           ? trie->trans[ base - ucharcount + charid ].next \
1560           : ( state==1 ? special : 0 ) \
1561           )
1562            
1563           #define MADE_TRIE 1
1564           #define MADE_JUMP_TRIE 2
1565           #define MADE_EXACT_TRIE 4
1566            
1567           STATIC I32
1568 96         S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1569           {
1570           dVAR;
1571           /* first pass, loop through and scan words */
1572           reg_trie_data *trie;
1573           HV *widecharmap = NULL;
1574 78988         AV *revcharmap = newAV();
1575           regnode *cur;
1576 78988         const U32 uniflags = UTF8_ALLOW_DEFAULT;
1577 78988         STRLEN len = 0;
1578 79076         UV uvc = 0;
1579           U16 curword = 0;
1580           U32 next_alloc = 0;
1581           regnode *jumper = NULL;
1582           regnode *nextbranch = NULL;
1583           regnode *convert = NULL;
1584           U32 *prev_states; /* temp array mapping each state to previous one */
1585           /* we just use folder as a flag in utf8 */
1586           const U8 * folder = NULL;
1587            
1588           #ifdef DEBUGGING
1589 79076         const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1590           AV *trie_words = NULL;
1591           /* along with revcharmap, this only used during construction but both are
1592           * useful during debugging so we store them in the struct when debugging.
1593           */
1594           #else
1595           const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1596           STRLEN trie_charcount=0;
1597           #endif
1598           SV *re_trie_maxbuff;
1599 79076         GET_RE_DEBUG_FLAGS_DECL;
1600            
1601 72         PERL_ARGS_ASSERT_MAKE_TRIE;
1602           #ifndef DEBUGGING
1603           PERL_UNUSED_ARG(depth);
1604           #endif
1605            
1606 780243576         switch (flags) {
1607           case EXACT: break;
1608           case EXACTFA:
1609           case EXACTFU_SS:
1610 780164502         case EXACTFU: folder = PL_fold_latin1; break;
1611 0         case EXACTF: folder = PL_fold; break;
1612 0         case EXACTFL: folder = PL_fold_locale; break;
1613 79068         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1614           }
1615            
1616 79076         trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1617 79076         trie->refcount = 1;
1618 79076         trie->startstate = 1;
1619 79076         trie->wordcount = word_count;
1620 8         RExC_rxi->data->data[ data_slot ] = (void*)trie;
1621 79012         trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1622 12         if (flags == EXACT)
1623 10         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1624 12         trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1625           trie->wordcount+1, sizeof(reg_trie_wordinfo));
1626            
1627 12         DEBUG_r({
1628           trie_words = newAV();
1629           });
1630            
1631 12         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1632 79084         if (!SvIOK(re_trie_maxbuff)) {
1633 79072         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1634           }
1635 76         DEBUG_TRIE_COMPILE_r({
1636           PerlIO_printf( Perl_debug_log,
1637           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1638           (int)depth * 2 + 2, "",
1639           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1640           REG_NODE_NUM(last), REG_NODE_NUM(tail),
1641           (int)depth);
1642           });
1643          
1644           /* Find the node we are going to overwrite */
1645 79076         if ( first == startbranch && OP( last ) != BRANCH ) {
1646           /* whole branch chain */
1647           convert = first;
1648           } else {
1649           /* branch sub-chain */
1650 79000         convert = NEXTOPER( first );
1651           }
1652          
1653           /* -- First loop and Setup --
1654            
1655           We first traverse the branches and scan each word to determine if it
1656           contains widechars, and how many unique chars there are, this is
1657           important as we have to build a table with at least as many columns as we
1658           have unique chars.
1659            
1660           We use an array of integers to represent the character codes 0..255
1661           (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1662           native representation of the character value as the key and IV's for the
1663           coded index.
1664            
1665           *TODO* If we keep track of how many times each character is used we can
1666           remap the columns so that the table compression later on is more
1667           efficient in terms of memory by ensuring the most common value is in the
1668           middle and the least common are on the outside. IMO this would be better
1669           than a most to least common mapping as theres a decent chance the most
1670           common letter will share a node with the least common, meaning the node
1671           will not be compressible. With a middle is most common approach the worst
1672           case is when we have the least common nodes twice.
1673            
1674           */
1675            
1676 79064         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1677 79056         regnode *noper = NEXTOPER( cur );
1678 124         const U8 *uc = (U8*)STRING( noper );
1679 56         const U8 *e = uc + STR_LEN( noper );
1680           STRLEN foldlen = 0;
1681           U32 wordlen = 0; /* required init */
1682           STRLEN minbytes = 0;
1683           STRLEN maxbytes = 0;
1684 56         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1685            
1686 56         if (OP(noper) == NOTHING) {
1687 2         regnode *noper_next= regnext(noper);
1688 70         if (noper_next != tail && OP(noper_next) == flags) {
1689           noper = noper_next;
1690 68         uc= (U8*)STRING(noper);
1691 79064         e= uc + STR_LEN(noper);
1692 79060         trie->minlen= STR_LEN(noper);
1693           } else {
1694 30         trie->minlen= 0;
1695 30         continue;
1696           }
1697           }
1698            
1699 82         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1700 54         TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1701           regardless of encoding */
1702 90         if (OP( noper ) == EXACTFU_SS) {
1703           /* false positives are ok, so just set this */
1704 36         TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
1705           }
1706           }
1707 220         for ( ; uc < e ; uc += len ) {
1708 220         TRIE_CHARCOUNT(trie)++;
1709 79190         TRIE_READ_CHAR;
1710            
1711           /* Acummulate to the current values, the range in the number of
1712           * bytes that this character could match. The max is presumed to
1713           * be the same as the folded input (which TRIE_READ_CHAR returns),
1714           * except that when this is not in UTF-8, it could be matched
1715           * against a string which is UTF-8, and the variant characters
1716           * could be 2 bytes instead of the 1 here. Likewise, for the
1717           * minimum number of bytes when not folded. When folding, the min
1718           * is assumed to be 1 byte could fold to match the single character
1719           * here, or in the case of a multi-char fold, 1 byte can fold to
1720           * the whole sequence. 'foldlen' is used to denote whether we are
1721           * in such a sequence, skipping the min setting if so. XXX TODO
1722           * Use the exact list of what folds to each character, from
1723           * PL_utf8_foldclosures */
1724 79188         if (UTF) {
1725 79004         maxbytes += UTF8SKIP(uc);
1726 79004         if (! folder) {
1727           /* A non-UTF-8 string could be 1 byte to match our 2 */
1728 4         minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
1729           ? 1
1730 68         : UTF8SKIP(uc);
1731           }
1732           else {
1733 79072         if (foldlen) {
1734 4         foldlen -= UTF8SKIP(uc);
1735           }
1736           else {
1737 630207         foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
1738 630207         minbytes++;
1739           }
1740           }
1741           }
1742           else {
1743 630391         maxbytes += (UNI_IS_INVARIANT(*uc))
1744           ? 1
1745           : 2;
1746 630391         if (! folder) {
1747 630283         minbytes++;
1748           }
1749           else {
1750 2479061         if (foldlen) {
1751 1848746         foldlen--;
1752           }
1753           else {
1754 1848854         foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
1755 1848854         minbytes++;
1756           }
1757           }
1758           }
1759 1848930         if ( uvc < 256 ) {
1760 5872         if ( folder ) {
1761 5796         U8 folded= folder[ (U8) uvc ];
1762 160         if ( !trie->charmap[ folded ] ) {
1763 88         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1764 1848782         TRIE_STORE_REVCHAR( folded );
1765           }
1766           }
1767 10937261         if ( !trie->charmap[ uvc ] ) {
1768 6118870         trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1769 6118870         TRIE_STORE_REVCHAR( uvc );
1770           }
1771 6050170         if ( set_bit ) {
1772           /* store the codepoint in the bitmap, and its folded
1773           * equivalent. */
1774 68840         TRIE_BITMAP_SET(trie, uvc);
1775            
1776           /* store the folded codepoint */
1777 68840         if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1778            
1779 6118826         if ( !UTF ) {
1780           /* store first byte of utf8 representation of
1781           variant codepoints */
1782 6118826         if (! NATIVE_IS_INVARIANT(uvc)) {
1783 6118800         TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1784           }
1785           }
1786           set_bit = 0; /* We've done our bit :-) */
1787           }
1788           } else {
1789           SV** svpp;
1790 5825601         if ( !widecharmap )
1791 5825601         widecharmap = newHV();
1792            
1793 5825601         svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1794            
1795 5825601         if ( !svpp )
1796 5825601         Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1797            
1798 6118800         if ( !SvTRUE( *svpp ) ) {
1799 0         sv_setiv( *svpp, ++trie->uniquecharcount );
1800 1848746         TRIE_STORE_REVCHAR(uvc);
1801           }
1802           }
1803           }
1804 1848800         if( cur == first ) {
1805 630215         trie->minlen = minbytes;
1806 630215         trie->maxlen = maxbytes;
1807 7086061         } else if (minbytes < trie->minlen) {
1808 6455810         trie->minlen = minbytes;
1809 6455852         } else if (maxbytes > trie->maxlen) {
1810 6455810         trie->maxlen = maxbytes;
1811           }
1812           } /* end first pass */
1813 6455816         DEBUG_TRIE_COMPILE_r(
1814           PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1815           (int)depth * 2 + 2,"",
1816           ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1817           (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1818           (int)trie->minlen, (int)trie->maxlen )
1819           );
1820            
1821           /*
1822           We now know what we are dealing with in terms of unique chars and
1823           string sizes so we can calculate how much memory a naive
1824           representation using a flat table will take. If it's over a reasonable
1825           limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1826           conservative but potentially much slower representation using an array
1827           of lists.
1828            
1829           At the end we convert both representations into the same compressed
1830           form that will be used in regexec.c for matching with. The latter
1831           is a form that cannot be used to construct with but has memory
1832           properties similar to the list form and access properties similar
1833           to the table form making it both suitable for fast searches and
1834           small enough that its feasable to store for the duration of a program.
1835            
1836           See the comment in the code where the compressed table is produced
1837           inplace from the flat tabe representation for an explanation of how
1838           the compression works.
1839            
1840           */
1841            
1842            
1843 30925548         Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1844 28538549         prev_states[1] = 0;
1845            
1846 7486107         if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1847           /*
1848           Second Pass -- Array Of Lists Representation
1849            
1850           Each state will be represented by a list of charid:state records
1851           (reg_trie_trans_le) the first such element holds the CUR and LEN
1852           points of the allocated array. (See defines above).
1853            
1854           We build the initial structure using the lists, and then convert
1855           it into the compressed table form which allows faster lookups
1856           (but cant be modified once converted).
1857           */
1858            
1859           STRLEN transcount = 1;
1860            
1861 5825601         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1862           "%*sCompiling trie using list compiler\n",
1863           (int)depth * 2 + 2, ""));
1864            
1865 3434215         trie->states = (reg_trie_state *)
1866 2964532         PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1867           sizeof(reg_trie_state) );
1868 4068809         TRIE_LIST_NEW(1);
1869           next_alloc = 2;
1870            
1871 4068809         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1872            
1873 4068809         regnode *noper = NEXTOPER( cur );
1874 4068809         U8 *uc = (U8*)STRING( noper );
1875 1756792         const U8 *e = uc + STR_LEN( noper );
1876           U32 state = 1; /* required init */
1877           U16 charid = 0; /* sanity init */
1878           U32 wordlen = 0; /* required init */
1879            
1880 3417290         if (OP(noper) == NOTHING) {
1881 682779         regnode *noper_next= regnext(noper);
1882 3417290         if (noper_next != tail && OP(noper_next) == flags) {
1883           noper = noper_next;
1884 3417290         uc= (U8*)STRING(noper);
1885 3417290         e= uc + STR_LEN(noper);
1886           }
1887           }
1888            
1889 630207         if (OP(noper) != NOTHING) {
1890 630207         for ( ; uc < e ; uc += len ) {
1891            
1892 630207         TRIE_READ_CHAR;
1893            
1894 630211         if ( uvc < 256 ) {
1895 630211         charid = trie->charmap[ uvc ];
1896           } else {
1897 630211         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1898 630211         if ( !svpp ) {
1899           charid = 0;
1900           } else {
1901 259168         charid=(U16)SvIV( *svpp );
1902           }
1903           }
1904           /* charid is now 0 if we dont know the char read, or nonzero if we do */
1905 630211         if ( charid ) {
1906            
1907           U16 check;
1908           U32 newstate = 0;
1909            
1910 630211         charid--;
1911 193429         if ( !trie->states[ state ].trans.list ) {
1912 364702         TRIE_LIST_NEW( state );
1913           }
1914 364702         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1915 1237152         if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1916 2062410         newstate = TRIE_LIST_ITEM( state, check ).newstate;
1917 2062404         break;
1918           }
1919           }
1920 1387048         if ( ! newstate ) {
1921 706821         newstate = next_alloc++;
1922 348827         prev_states[newstate] = state;
1923 348827         TRIE_LIST_PUSH( state, charid, newstate );
1924 348827         transcount++;
1925           }
1926           state = newstate;
1927           } else {
1928 13335         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1929           }
1930           }
1931           }
1932 12815         TRIE_HANDLE_WORD(state);
1933            
1934           } /* end second pass */
1935            
1936           /* next alloc is the NEXT state to be allocated */
1937 12815         trie->statecount = next_alloc;
1938 11429         trie->states = (reg_trie_state *)
1939 11429         PerlMemShared_realloc( trie->states,
1940           next_alloc
1941           * sizeof(reg_trie_state) );
1942            
1943           /* and now dump it out before we compress it */
1944 11429         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1945           revcharmap, next_alloc,
1946           depth+1)
1947           );
1948            
1949 11429         trie->trans = (reg_trie_trans *)
1950 0         PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1951           {
1952           U32 state;
1953           U32 tp = 0;
1954           U32 zp = 0;
1955            
1956            
1957 13335         for( state=1 ; state < next_alloc ; state ++ ) {
1958           U32 base=0;
1959            
1960           /*
1961           DEBUG_TRIE_COMPILE_MORE_r(
1962           PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1963           );
1964           */
1965            
1966 13335         if (trie->states[state].trans.list) {
1967 0         U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1968           U16 maxid=minid;
1969           U16 idx;
1970            
1971 371329         for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1972 364702         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1973 16395         if ( forid < minid ) {
1974           minid=forid;
1975 16395         } else if ( forid > maxid ) {
1976           maxid=forid;
1977           }
1978           }
1979 16395         if ( transcount < tp + maxid - minid + 1) {
1980 12817         transcount *= 2;
1981 12817         trie->trans = (reg_trie_trans *)
1982 12817         PerlMemShared_realloc( trie->trans,
1983           transcount
1984           * sizeof(reg_trie_trans) );
1985 16395         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1986           }
1987 40630         base = trie->uniquecharcount + tp - minid;
1988 16397         if ( maxid == minid ) {
1989           U32 set = 0;
1990 348309         for ( ; zp < tp ; zp++ ) {
1991 348309         if ( ! trie->trans[ zp ].next ) {
1992 12817         base = trie->uniquecharcount + zp - minid;
1993 12817         trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1994 12817         trie->trans[ zp ].check = state;
1995           set = 1;
1996 12817         break;
1997           }
1998           }
1999 12817         if ( !set ) {
2000 12817         trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2001 2         trie->trans[ tp ].check = state;
2002 630211         tp++;
2003           zp = tp;
2004           }
2005           } else {
2006 630211         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2007 630209         const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2008 630209         trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2009 630209         trie->trans[ tid ].check = state;
2010           }
2011 191162         tp += ( maxid - minid + 1 );
2012           }
2013 630209         Safefree(trie->states[ state ].trans.list);
2014           }
2015           /*
2016           DEBUG_TRIE_COMPILE_MORE_r(
2017           PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2018           );
2019           */
2020 623187         trie->states[ state ].trans.base=base;
2021           }
2022 531169         trie->lasttrans = tp + 1;
2023           }
2024           } else {
2025           /*
2026           Second Pass -- Flat Table Representation.
2027            
2028           we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
2029           We know that we will need Charcount+1 trans at most to store the data
2030           (one row per char at worst case) So we preallocate both structures
2031           assuming worst case.
2032            
2033           We then construct the trie using only the .next slots of the entry
2034           structs.
2035            
2036           We use the .check field of the first entry of the node temporarily to
2037           make compression both faster and easier by keeping track of how many non
2038           zero fields are in the node.
2039            
2040           Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2041           transition.
2042            
2043           There are two terms at use here: state as a TRIE_NODEIDX() which is a
2044           number representing the first entry of the node, and state as a
2045           TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2046           TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2047           are 2 entrys per node. eg:
2048            
2049           A B A B
2050           1. 2 4 1. 3 7
2051           2. 0 3 3. 0 5
2052           3. 0 0 5. 0 0
2053           4. 0 0 7. 0 0
2054            
2055           The table is internally in the right hand, idx form. However as we also
2056           have to deal with the states array which is indexed by nodenum we have to
2057           use TRIE_NODENUM() to convert.
2058            
2059           */
2060 90126         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2061           "%*sCompiling trie using table compiler\n",
2062           (int)depth * 2 + 2, ""));
2063            
2064 90126         trie->trans = (reg_trie_trans *)
2065 90126         PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2066           * trie->uniquecharcount + 1,
2067           sizeof(reg_trie_trans) );
2068 90126         trie->states = (reg_trie_state *)
2069 540099         PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2070           sizeof(reg_trie_state) );
2071 630218         next_alloc = trie->uniquecharcount + 1;
2072            
2073            
2074 2247419         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2075            
2076 1927806         regnode *noper = NEXTOPER( cur );
2077 5990         const U8 *uc = (U8*)STRING( noper );
2078 1921872         const U8 *e = uc + STR_LEN( noper );
2079            
2080           U32 state = 1; /* required init */
2081            
2082           U16 charid = 0; /* sanity init */
2083           U32 accept_state = 0; /* sanity init */
2084            
2085           U32 wordlen = 0; /* required init */
2086            
2087 8727781         if (OP(noper) == NOTHING) {
2088 7782298         regnode *noper_next= regnext(noper);
2089 7782298         if (noper_next != tail && OP(noper_next) == flags) {
2090           noper = noper_next;
2091 6010852         uc= (U8*)STRING(noper);
2092 6010852         e= uc + STR_LEN(noper);
2093           }
2094           }
2095            
2096 1921872         if ( OP(noper) != NOTHING ) {
2097 630633         for ( ; uc < e ; uc += len ) {
2098            
2099 630395         TRIE_READ_CHAR;
2100            
2101 630395         if ( uvc < 256 ) {
2102 941001         charid = trie->charmap[ uvc ];
2103           } else {
2104 630211         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2105 25678         charid = svpp ? (U16)SvIV(*svpp) : 0;
2106           }
2107 25862         if ( charid ) {
2108 25862         charid--;
2109 25862         if ( !trie->trans[ state + charid ].next ) {
2110 25778         trie->trans[ state + charid ].next = next_alloc;
2111 25778         trie->trans[ state ].check++;
2112 25778         prev_states[TRIE_NODENUM(next_alloc)]
2113 25778         = TRIE_NODENUM(state);
2114 25778         next_alloc += trie->uniquecharcount;
2115           }
2116 25862         state = trie->trans[ state + charid ].next;
2117           } else {
2118 25678         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2119           }
2120           /* charid is now 0 if we dont know the char read, or nonzero if we do */
2121           }
2122           }
2123 25734         accept_state = TRIE_NODENUM( state );
2124 25734         TRIE_HANDLE_WORD(accept_state);
2125            
2126           } /* end second pass */
2127            
2128           /* and now dump it out before we compress it */
2129 25686         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2130           revcharmap,
2131           next_alloc, depth+1));
2132            
2133           {
2134           /*
2135           * Inplace compress the table.*
2136            
2137           For sparse data sets the table constructed by the trie algorithm will
2138           be mostly 0/FAIL transitions or to put it another way mostly empty.
2139           (Note that leaf nodes will not contain any transitions.)
2140            
2141           This algorithm compresses the tables by eliminating most such
2142           transitions, at the cost of a modest bit of extra work during lookup:
2143            
2144           - Each states[] entry contains a .base field which indicates the
2145           index in the state[] array wheres its transition data is stored.
2146            
2147           - If .base is 0 there are no valid transitions from that node.
2148            
2149           - If .base is nonzero then charid is added to it to find an entry in
2150           the trans array.
2151            
2152           -If trans[states[state].base+charid].check!=state then the
2153           transition is taken to be a 0/Fail transition. Thus if there are fail
2154           transitions at the front of the node then the .base offset will point
2155           somewhere inside the previous nodes data (or maybe even into a node
2156           even earlier), but the .check field determines if the transition is
2157           valid.
2158            
2159           XXX - wrong maybe?
2160           The following process inplace converts the table to the compressed
2161           table: We first do not compress the root node 1,and mark all its
2162           .check pointers as 1 and set its .base pointer as 1 as well. This
2163           allows us to do a DFA construction from the compressed table later,
2164           and ensures that any .base pointers we calculate later are greater
2165           than 0.
2166            
2167           - We set 'pos' to indicate the first entry of the second node.
2168            
2169           - We then iterate over the columns of the node, finding the first and
2170           last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2171           and set the .check pointers accordingly, and advance pos
2172           appropriately and repreat for the next node. Note that when we copy
2173           the next pointers we have to convert them from the original
2174           NODEIDX form to NODENUM form as the former is not valid post
2175           compression.
2176            
2177           - If a node has no transitions used we mark its base as 0 and do not
2178           advance the pos pointer.
2179            
2180           - If a node only has one transition we use a second pointer into the
2181           structure to fill in allocated fail transitions from other states.
2182           This pointer is independent of the main pointer and scans forward
2183           looking for null transitions that are allocated to a state. When it
2184           finds one it writes the single transition into the "hole". If the
2185           pointer doesnt find one the single transition is appended as normal.
2186            
2187           - Once compressed we can Renew/realloc the structures to release the
2188           excess space.
2189            
2190           See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2191           specifically Fig 3.47 and the associated pseudocode.
2192            
2193           demq
2194           */
2195 25686         const U32 laststate = TRIE_NODENUM( next_alloc );
2196           U32 state, charid;
2197           U32 pos = 0, zp=0;
2198 25686         trie->statecount = laststate;
2199            
2200 25794         for ( state = 1 ; state < laststate ; state++ ) {
2201           U8 flag = 0;
2202 25786         const U32 stateidx = TRIE_NODEIDX( state );
2203 25786         const U32 o_used = trie->trans[ stateidx ].check;
2204 330712         U32 used = trie->trans[ stateidx ].check;
2205 305034         trie->trans[ stateidx ].check = 0;
2206            
2207 305684         for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2208 145700         if ( flag || trie->trans[ stateidx + charid ].next ) {
2209 145242         if ( trie->trans[ stateidx + charid ].next ) {
2210 436852         if (o_used == 1) {
2211 411092         for ( ; zp < pos ; zp++ ) {
2212 411122         if ( ! trie->trans[ zp ].next ) {
2213           break;
2214           }
2215           }
2216 1566276129         trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2217 1565852036         trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2218 1565852036         trie->trans[ zp ].check = state;
2219 310038         if ( ++zp > pos ) pos = zp;
2220           break;
2221           }
2222 310058         used--;
2223           }
2224 310190         if ( !flag ) {
2225           flag = 1;
2226 266082         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2227           }
2228 266256         trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2229 266256         trie->trans[ pos ].check = state;
2230 216         pos++;
2231           }
2232           }
2233           }
2234 266072         trie->lasttrans = pos + 1;
2235 25686         trie->states = (reg_trie_state *)
2236 25686         PerlMemShared_realloc( trie->states, laststate
2237           * sizeof(reg_trie_state) );
2238 25686         DEBUG_TRIE_COMPILE_MORE_r(
2239           PerlIO_printf( Perl_debug_log,
2240           "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2241           (int)depth * 2 + 2,"",
2242           (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2243           (IV)next_alloc,
2244           (IV)pos,
2245           ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2246           );
2247            
2248           } /* end table compress */
2249           }
2250 14511669         DEBUG_TRIE_COMPILE_MORE_r(
2251           PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2252           (int)depth * 2 + 2, "",
2253           (UV)trie->statecount,
2254           (UV)trie->lasttrans)
2255           );
2256           /* resize the trans array to remove unused space */
2257 14511669         trie->trans = (reg_trie_trans *)
2258 14511669         PerlMemShared_realloc( trie->trans, trie->lasttrans
2259           * sizeof(reg_trie_trans) );
2260            
2261           { /* Modify the program and insert the new TRIE node */
2262 22779147         U8 nodetype =(U8)(flags & 0xFF);
2263           char *str=NULL;
2264          
2265           #ifdef DEBUGGING
2266           regnode *optimize = NULL;
2267           #ifdef RE_TRACK_PATTERN_OFFSETS
2268            
2269           U32 mjd_offset = 0;
2270           U32 mjd_nodelen = 0;
2271           #endif /* RE_TRACK_PATTERN_OFFSETS */
2272           #endif /* DEBUGGING */
2273           /*
2274           This means we convert either the first branch or the first Exact,
2275           depending on whether the thing following (in 'last') is a branch
2276           or not and whther first is the startbranch (ie is it a sub part of
2277           the alternation or is it the whole thing.)
2278           Assuming its a sub part we convert the EXACT otherwise we convert
2279           the whole branch sequence, including the first.
2280           */
2281           /* Find the node we are going to overwrite */
2282 13896921         if ( first != startbranch || OP( last ) == BRANCH ) {
2283           /* branch sub-chain */
2284 12795297         NEXT_OFF( first ) = (U16)(last - first);
2285           #ifdef RE_TRACK_PATTERN_OFFSETS
2286 1111076         DEBUG_r({
2287           mjd_offset= Node_Offset((convert));
2288           mjd_nodelen= Node_Length((convert));
2289           });
2290           #endif
2291           /* whole branch chain */
2292           }
2293           #ifdef RE_TRACK_PATTERN_OFFSETS
2294           else {
2295 1111084         DEBUG_r({
2296           const regnode *nop = NEXTOPER( convert );
2297           mjd_offset= Node_Offset((nop));
2298           mjd_nodelen= Node_Length((nop));
2299           });
2300           }
2301 1111084         DEBUG_OPTIMISE_r(
2302           PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2303           (int)depth * 2 + 2, "",
2304           (UV)mjd_offset, (UV)mjd_nodelen)
2305           );
2306           #endif
2307           /* But first we check to see if there is a common prefix we can
2308           split out as an EXACT and put in front of the TRIE node. */
2309 1111084         trie->startstate= 1;
2310 1101624         if ( trie->bitmap && !widecharmap && !trie->jump ) {
2311           U32 state;
2312 1101622         for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2313           U32 ofs = 0;
2314           I32 idx = -1;
2315           U32 count = 0;
2316 1101628         const U32 base = trie->states[ state ].trans.base;
2317            
2318 9472         if ( trie->states[state].wordnum )
2319           count = 1;
2320            
2321 9488         for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2322 9556         if ( ( base + ofs >= trie->uniquecharcount ) &&
2323 9556         ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2324 9050         trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2325           {
2326 9012         if ( ++count > 1 ) {
2327 9002         SV **tmp = av_fetch( revcharmap, ofs, 0);
2328 9002         const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2329 14511677         if ( state == 1 ) break;
2330 14511673         if ( count == 2 ) {
2331 14511663         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2332 1832660         DEBUG_OPTIMISE_r(
2333           PerlIO_printf(Perl_debug_log,
2334           "%*sNew Start State=%"UVuf" Class: [",
2335           (int)depth * 2 + 2, "",
2336           (UV)state));
2337 1832660         if (idx >= 0) {
2338 1832660         SV ** const tmp = av_fetch( revcharmap, idx, 0);
2339 1327376         const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2340            
2341 529534         TRIE_BITMAP_SET(trie,*ch);
2342 529534         if ( folder )
2343 303114         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2344 303116         DEBUG_OPTIMISE_r(
2345           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2346           );
2347           }
2348           }
2349 226430         TRIE_BITMAP_SET(trie,*ch);
2350 48448         if ( folder )
2351 47878         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2352 46774         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2353           }
2354 46784         idx = ofs;
2355           }
2356           }
2357 46774         if ( count == 1 ) {
2358 179662         SV **tmp = av_fetch( revcharmap, idx, 0);
2359           STRLEN len;
2360 269490         char *ch = SvPV( *tmp, len );
2361 179662         DEBUG_OPTIMISE_r({
2362           SV *sv=sv_newmortal();
2363           PerlIO_printf( Perl_debug_log,
2364           "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2365           (int)depth * 2 + 2, "",
2366           (UV)state, (UV)idx,
2367           pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2368           PL_colors[0], PL_colors[1],
2369           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2370           PERL_PV_ESCAPE_FIRSTCHAR
2371           )
2372           );
2373           });
2374 158640         if ( state==1 ) {
2375 137614         OP( convert ) = nodetype;
2376 137614         str=STRING(convert);
2377 104372         STR_LEN(convert)=0;
2378           }
2379 73492         STR_LEN(convert) += len;
2380 11178         while (len--)
2381 11166         *str++ = *ch++;
2382           } else {
2383           #ifdef DEBUGGING
2384 62332         if (state>1)
2385 62328         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2386           #endif
2387           break;
2388           }
2389           }
2390 372401         trie->prefixlen = (state-1);
2391 1034822         if (str) {
2392 100918         regnode *n = convert+NODE_SZ_STR(convert);
2393 64648         NEXT_OFF(convert) = NODE_SZ_STR(convert);
2394 17018         trie->startstate = state;
2395 17018         trie->minlen -= (state - 1);
2396 17018         trie->maxlen -= (state - 1);
2397           #ifdef DEBUGGING
2398           /* At least the UNICOS C compiler choked on this
2399           * being argument to DEBUG_r(), so let's just have
2400           * it right here. */
2401           if (
2402           #ifdef PERL_EXT_RE_BUILD
2403           1
2404           #else
2405           DEBUG_r_TEST
2406           #endif
2407           ) {
2408           regnode *fix = convert;
2409 47632         U32 word = trie->wordcount;
2410 47632         mjd_nodelen++;
2411 981532         Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2412 526450         while( ++fix < n ) {
2413 1994023         Set_Node_Offset_Length(fix, 0, 0);
2414           }
2415 1224706         while (word--) {
2416 1224704         SV ** const tmp = av_fetch( trie_words, word, 0 );
2417 1172530         if (tmp) {
2418 18088         if ( STR_LEN(convert) <= SvCUR(*tmp) )
2419 1172530         sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2420           else
2421 1172518         sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2422           }
2423           }
2424           }
2425           #endif
2426 52176         if (trie->maxlen) {
2427           convert = n;
2428           } else {
2429 49618         NEXT_OFF(convert) = (U16)(tail - convert);
2430           DEBUG_r(optimize= n);
2431           }
2432           }
2433           }
2434 46010         if (!jumper)
2435           jumper = last;
2436 41578         if ( trie->maxlen ) {
2437 39484         NEXT_OFF( convert ) = (U16)(tail - convert);
2438 52182         ARG_SET( convert, data_slot );
2439           /* Store the offset to the first unabsorbed branch in
2440           jump[0], which is otherwise unused by the jump logic.
2441           We use this when dumping a trie and during optimisation. */
2442 703241         if (trie->jump)
2443 14511661         trie->jump[0] = (U16)(nextbranch - convert);
2444          
2445           /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2446           * and there is a bitmap
2447           * and the first "jump target" node we found leaves enough room
2448           * then convert the TRIE node into a TRIEC node, with the bitmap
2449           * embedded inline in the opcode - this is hypothetically faster.
2450           */
2451 17472848         if ( !trie->states[trie->startstate].wordnum
2452 17472846         && trie->bitmap
2453 17472844         && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2454           {
2455 17472842         OP( convert ) = TRIEC;
2456 17472842         Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2457 21805354         PerlMemShared_free(trie->bitmap);
2458 9141715         trie->bitmap= NULL;
2459           } else
2460 643351         OP( convert ) = TRIE;
2461            
2462           /* store the type in the flags */
2463 68931023         convert->flags = nodetype;
2464 53853883         DEBUG_r({
2465           optimize = convert
2466           + NODE_STEP_REGNODE
2467           + regarglen[ OP( convert ) ];
2468           });
2469           /* XXX We really should free up the resource in trie now,
2470           as we won't use them - (which resources?) dmq */
2471           }
2472           /* needed for dumping*/
2473 53853883         DEBUG_r(if (optimize) {
2474           regnode *opt = convert;
2475            
2476           while ( ++opt < optimize) {
2477           Set_Node_Offset_Length(opt,0,0);
2478           }
2479           /*
2480           Try to clean up some of the debris left after the
2481           optimisation.
2482           */
2483           while( optimize < jumper ) {
2484           mjd_nodelen += Node_Length((optimize));
2485           OP( optimize ) = OPTIMIZED;
2486           Set_Node_Offset_Length(optimize,0,0);
2487           optimize++;
2488           }
2489           Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2490           });
2491           } /* end node insert */
2492            
2493           /* Finish populating the prev field of the wordinfo array. Walk back
2494           * from each accept state until we find another accept state, and if
2495           * so, point the first word's .prev field at the second word. If the
2496           * second already has a .prev field set, stop now. This will be the
2497           * case either if we've already processed that word's accept state,
2498           * or that state had multiple words, and the overspill words were
2499           * already linked up earlier.
2500           */
2501           {
2502           U16 word;
2503           U32 state;
2504           U16 prev;
2505            
2506 53853931         for (word=1; word <= trie->wordcount; word++) {
2507           prev = 0;
2508 53853931         if (trie->wordinfo[word].prev)
2509 102878242         continue;
2510 51439177         state = trie->wordinfo[word].accept;
2511 51439377         while (state) {
2512 51439321         state = prev_states[state];
2513 51439321         if (!state)
2514           break;
2515 77481289         prev = trie->states[state].wordnum;
2516 46795196         if (prev)
2517           break;
2518           }
2519 46051257         trie->wordinfo[word].prev = prev;
2520           }
2521 747025         Safefree(prev_states);
2522           }
2523            
2524            
2525           /* and now dump out the compressed format */
2526 747025         DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2527            
2528 51439129         RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2529           #ifdef DEBUGGING
2530 238911         RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2531 51200226         RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2532           #else
2533           SvREFCNT_dec_NN(revcharmap);
2534           #endif
2535 53853891         return trie->jump
2536           ? MADE_JUMP_TRIE
2537 52728374         : trie->startstate>1
2538           ? MADE_EXACT_TRIE
2539           : MADE_TRIE;
2540           }
2541            
2542           STATIC void
2543 1129043         S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2544           {
2545           /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2546            
2547           This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2548           "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2549           ISBN 0-201-10088-6
2550            
2551           We find the fail state for each state in the trie, this state is the longest proper
2552           suffix of the current state's 'word' that is also a proper prefix of another word in our
2553           trie. State 1 represents the word '' and is thus the default fail state. This allows
2554           the DFA not to have to restart after its tried and failed a word at a given point, it
2555           simply continues as though it had been matching the other word in the first place.
2556           Consider
2557           'abcdgu'=~/abcdefg|cdgu/
2558           When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2559           fail, which would bring us to the state representing 'd' in the second word where we would
2560           try 'g' and succeed, proceeding to match 'cdgu'.
2561           */
2562           /* add a fail transition */
2563 1129043         const U32 trie_offset = ARG(source);
2564 1129043         reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2565           U32 *q;
2566 1129043         const U32 ucharcount = trie->uniquecharcount;
2567 1129043         const U32 numstates = trie->statecount;
2568 723937         const U32 ubound = trie->lasttrans + ucharcount;
2569           U32 q_read = 0;
2570           U32 q_write = 0;
2571           U32 charid;
2572 1129043         U32 base = trie->states[ 1 ].trans.base;
2573           U32 *fail;
2574           reg_ac_data *aho;
2575 929480         const U32 data_slot = add_data( pRExC_state, 1, "T" );
2576 4607645         GET_RE_DEBUG_FLAGS_DECL;
2577            
2578 3478606         PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2579           #ifndef DEBUGGING
2580           PERL_UNUSED_ARG(depth);
2581           #endif
2582            
2583            
2584 3478606         ARG_SET( stclass, data_slot );
2585 3478606         aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2586 3468846         RExC_rxi->data->data[ data_slot ] = (void*)aho;
2587 3468846         aho->trie=trie_offset;
2588 9764         aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2589 3478606         Copy( trie->states, aho->states, numstates, reg_trie_state );
2590 3478606         Newxz( q, numstates, U32);
2591 3478606         aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2592 3478606         aho->refcount = 1;
2593 5398         fail = aho->fail;
2594           /* initialize fail[0..1] to be 1 so that we always have
2595           a valid final fail state */
2596 3478606         fail[ 0 ] = fail[ 1 ] = 1;
2597            
2598 2227890         for ( charid = 0; charid < ucharcount ; charid++ ) {
2599 2227886         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2600 3478684         if ( newstate ) {
2601 3413278         q[ q_write ] = newstate;
2602           /* set to point at the root */
2603 3478608         fail[ q[ q_write++ ] ]=1;
2604           }
2605           }
2606 3478678         while ( q_read < q_write) {
2607 3478674         const U32 cur = q[ q_read++ % numstates ];
2608 3130034         base = trie->states[ cur ].trans.base;
2609            
2610 1334762         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2611 3480658         const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2612 3480658         if (ch_state) {
2613           U32 fail_state = cur;
2614           U32 fail_base;
2615           do {
2616 182392         fail_state = fail[ fail_state ];
2617 3478668         fail_base = aho->states[ fail_state ].trans.base;
2618 182         } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2619            
2620 182         fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2621 182         fail[ ch_state ] = fail_state;
2622 182         if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2623           {
2624 116         aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2625           }
2626 3478668         q[ q_write++ % numstates] = ch_state;
2627           }
2628           }
2629           }
2630           /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2631           when we fail in state 1, this allows us to use the
2632           charclass scan to find a valid start char. This is based on the principle
2633           that theres a good chance the string being searched contains lots of stuff
2634           that cant be a start char.
2635           */
2636 3468846         fail[ 0 ] = fail[ 1 ] = 0;
2637 210992         DEBUG_TRIE_COMPILE_r({
2638           PerlIO_printf(Perl_debug_log,
2639           "%*sStclass Failtable (%"UVuf" states): 0",
2640           (int)(depth * 2), "", (UV)numstates
2641           );
2642           for( q_read=1; q_read
2643           PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2644           }
2645           PerlIO_printf(Perl_debug_log, "\n");
2646           });
2647 3468846         Safefree(q);
2648           /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2649 3478606         }
2650            
2651            
2652           /*
2653           * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2654           * These need to be revisited when a newer toolchain becomes available.
2655           */
2656           #if defined(__sparc64__) && defined(__GNUC__)
2657           # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2658           # undef SPARC64_GCC_WORKAROUND
2659           # define SPARC64_GCC_WORKAROUND 1
2660           # endif
2661           #endif
2662            
2663           #define DEBUG_PEEP(str,scan,depth) \
2664           DEBUG_OPTIMISE_r({if (scan){ \
2665           SV * const mysv=sv_newmortal(); \
2666           regnode *Next = regnext(scan); \
2667           regprop(RExC_rx, mysv, scan); \
2668           PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2669           (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2670           Next ? (REG_NODE_NUM(Next)) : 0 ); \
2671           }});
2672            
2673            
2674           /* The below joins as many adjacent EXACTish nodes as possible into a single
2675           * one. The regop may be changed if the node(s) contain certain sequences that
2676           * require special handling. The joining is only done if:
2677           * 1) there is room in the current conglomerated node to entirely contain the
2678           * next one.
2679           * 2) they are the exact same node type
2680           *
2681           * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2682           * these get optimized out
2683           *
2684           * If a node is to match under /i (folded), the number of characters it matches
2685           * can be different than its character length if it contains a multi-character
2686           * fold. *min_subtract is set to the total delta of the input nodes.
2687           *
2688           * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2689           * and contains LATIN SMALL LETTER SHARP S
2690           *
2691           * This is as good a place as any to discuss the design of handling these
2692           * multi-character fold sequences. It's been wrong in Perl for a very long
2693           * time. There are three code points in Unicode whose multi-character folds
2694           * were long ago discovered to mess things up. The previous designs for
2695           * dealing with these involved assigning a special node for them. This
2696           * approach doesn't work, as evidenced by this example:
2697           * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2698           * Both these fold to "sss", but if the pattern is parsed to create a node that
2699           * would match just the \xDF, it won't be able to handle the case where a
2700           * successful match would have to cross the node's boundary. The new approach
2701           * that hopefully generally solves the problem generates an EXACTFU_SS node
2702           * that is "sss".
2703           *
2704           * It turns out that there are problems with all multi-character folds, and not
2705           * just these three. Now the code is general, for all such cases. The
2706           * approach taken is:
2707           * 1) This routine examines each EXACTFish node that could contain multi-
2708           * character fold sequences. It returns in *min_subtract how much to
2709           * subtract from the the actual length of the string to get a real minimum
2710           * match length; it is 0 if there are no multi-char folds. This delta is
2711           * used by the caller to adjust the min length of the match, and the delta
2712           * between min and max, so that the optimizer doesn't reject these
2713           * possibilities based on size constraints.
2714           * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2715           * is used for an EXACTFU node that contains at least one "ss" sequence in
2716           * it. For non-UTF-8 patterns and strings, this is the only case where
2717           * there is a possible fold length change. That means that a regular
2718           * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2719           * with length changes, and so can be processed faster. regexec.c takes
2720           * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2721           * pre-folded by regcomp.c. This saves effort in regex matching.
2722           * However, the pre-folding isn't done for non-UTF8 patterns because the
2723           * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2724           * down by forcing the pattern into UTF8 unless necessary. Also what
2725           * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2726           * possibilities for the non-UTF8 patterns are quite simple, except for
2727           * the sharp s. All the ones that don't involve a UTF-8 target string are
2728           * members of a fold-pair, and arrays are set up for all of them so that
2729           * the other member of the pair can be found quickly. Code elsewhere in
2730           * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2731           * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2732           * described in the next item.
2733           * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
2734           * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2735           * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
2736           * (probably unwittingly, in Perl_regexec_flags()) makes is that a
2737           * character in the pattern corresponds to at most a single character in
2738           * the target string. (And I do mean character, and not byte here, unlike
2739           * other parts of the documentation that have never been updated to
2740           * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
2741           * two character string 'ss'; in EXACTFA nodes it can match
2742           * "\x{17F}\x{17F}". These violate the assumption, and they are the only
2743           * instances where it is violated. I'm reluctant to try to change the
2744           * assumption, as the code involved is impenetrable to me (khw), so
2745           * instead the code here punts. This routine examines (when the pattern
2746           * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2747           * boolean indicating whether or not the node contains a sharp s. When it
2748           * is true, the caller sets a flag that later causes the optimizer in this
2749           * file to not set values for the floating and fixed string lengths, and
2750           * thus avoids the optimizer code in regexec.c that makes the invalid
2751           * assumption. Thus, there is no optimization based on string lengths for
2752           * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2753           * (The reason the assumption is wrong only in these two cases is that all
2754           * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
2755           * other folds to their expanded versions. We can't prefold sharp s to
2756           * 'ss' in EXACTF nodes because we don't know at compile time if it
2757           * actually matches 'ss' or not. It will match iff the target string is
2758           * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
2759           * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
2760           * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
2761           * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
2762           * require the pattern to be forced into UTF-8, the overhead of which we
2763           * want to avoid.)
2764           *
2765           * Similarly, the code that generates tries doesn't currently handle
2766           * not-already-folded multi-char folds, and it looks like a pain to change
2767           * that. Therefore, trie generation of EXACTFA nodes with the sharp s
2768           * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
2769           * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
2770           * using /iaa matching will be doing so almost entirely with ASCII
2771           * strings, so this should rarely be encountered in practice */
2772            
2773           #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2774           if (PL_regkind[OP(scan)] == EXACT) \
2775           join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2776            
2777           STATIC U32
2778 2862741         S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2779           /* Merge several consecutive EXACTish nodes into one. */
2780 1129217         regnode *n = regnext(scan);
2781           U32 stringok = 1;
2782 1129217         regnode *next = scan + NODE_SZ_STR(scan);
2783           U32 merged = 0;
2784           U32 stopnow = 0;
2785           #ifdef DEBUGGING
2786           regnode *stop = scan;
2787 723995         GET_RE_DEBUG_FLAGS_DECL;
2788           #else
2789           PERL_UNUSED_ARG(depth);
2790           #endif
2791            
2792 723995         PERL_ARGS_ASSERT_JOIN_EXACT;
2793           #ifndef EXPERIMENTAL_INPLACESCAN
2794           PERL_UNUSED_ARG(flags);
2795           PERL_UNUSED_ARG(val);
2796           #endif
2797 122836         DEBUG_PEEP("join",scan,depth);
2798            
2799           /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2800           * EXACT ones that are mergeable to the current one. */
2801 601377         while (n
2802 724035         && (PL_regkind[OP(n)] == NOTHING
2803 650941         || (stringok && OP(n) == OP(scan)))
2804 1129079         && NEXT_OFF(n)
2805 1129079         && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2806           {
2807          
2808 974993         if (OP(n) == TAIL || n > next)
2809           stringok = 0;
2810 969927         if (PL_regkind[OP(n)] == NOTHING) {
2811 1129051         DEBUG_PEEP("skip:",n,depth);
2812 136894         NEXT_OFF(scan) += NEXT_OFF(n);
2813 136894         next = n + NODE_STEP_REGNODE;
2814           #ifdef DEBUGGING
2815 126964         if (stringok)
2816           stop = n;
2817           #endif
2818 126964         n = regnext(n);
2819           }
2820 992185         else if (stringok) {
2821 586219         const unsigned int oldl = STR_LEN(scan);
2822 413109         regnode * const nnext = regnext(n);
2823            
2824           /* XXX I (khw) kind of doubt that this works on platforms where
2825           * U8_MAX is above 255 because of lots of other assumptions */
2826           /* Don't join if the sum can't fit into a single node */
2827 413109         if (oldl + STR_LEN(n) > U8_MAX)
2828           break;
2829          
2830 173138         DEBUG_PEEP("merg",n,depth);
2831 173138         merged++;
2832            
2833 173138         NEXT_OFF(scan) += NEXT_OFF(n);
2834 173138         STR_LEN(scan) += STR_LEN(n);
2835 173138         next = n + NODE_SZ_STR(n);
2836           /* Now we can overwrite *n : */
2837 173138         Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2838           #ifdef DEBUGGING
2839 1129067         stop = next - 1;
2840           #endif
2841           n = nnext;
2842           if (stopnow) break;
2843           }
2844            
2845           #ifdef EXPERIMENTAL_INPLACESCAN
2846           if (flags && !NEXT_OFF(n)) {
2847           DEBUG_PEEP("atch", val, depth);
2848           if (reg_off_by_arg[OP(n)]) {
2849           ARG_SET(n, val - n);
2850           }
2851           else {
2852           NEXT_OFF(n) = val - n;
2853           }
2854           stopnow = 1;
2855           }
2856           #endif
2857           }
2858            
2859 1125687         *min_subtract = 0;
2860 1011573         *has_exactf_sharp_s = FALSE;
2861            
2862           /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2863           * can now analyze for sequences of problematic code points. (Prior to
2864           * this final joining, sequences could have been split over boundaries, and
2865           * hence missed). The sequences only happen in folding, hence for any
2866           * non-EXACT EXACTish node */
2867 1011573         if (OP(scan) != EXACT) {
2868 18868         const U8 * const s0 = (U8*) STRING(scan);
2869           const U8 * s = s0;
2870 1125543         const U8 * const s_end = s0 + STR_LEN(scan);
2871            
2872           /* One pass is made over the node's string looking for all the
2873           * possibilities. to avoid some tests in the loop, there are two main
2874           * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2875           * non-UTF-8 */
2876 1122151         if (UTF) {
2877            
2878           /* Examine the string for a multi-character fold sequence. UTF-8
2879           * patterns have all characters pre-folded by the time this code is
2880           * executed */
2881 2170020         while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2882           length sequence we are looking for is 2 */
2883           {
2884           int count = 0;
2885 494764         int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2886 4033070         if (! len) { /* Not a multi-char fold: get next char */
2887 3464092         s += UTF8SKIP(s);
2888 3464092         continue;
2889           }
2890            
2891           /* Nodes with 'ss' require special handling, except for EXACTFL
2892           * and EXACTFA-ish for which there is no multi-char fold to
2893           * this */
2894 3464092         if (len == 2 && *s == 's' && *(s+1) == 's'
2895 3464092         && OP(scan) != EXACTFL
2896 2206944         && OP(scan) != EXACTFA
2897 2206944         && OP(scan) != EXACTFA_NO_TRIE)
2898           {
2899           count = 2;
2900 2206944         OP(scan) = EXACTFU_SS;
2901 2200916         s += 2;
2902           }
2903           else { /* Here is a generic multi-char fold. */
2904 1297931         const U8* multi_end = s + len;
2905            
2906           /* Count how many characters in it. In the case of /l and
2907           * /aa, no folds which contain ASCII code points are
2908           * allowed, so check for those, and skip if found. (In
2909           * EXACTFL, no folds are allowed to any Latin1 code point,
2910           * not just ASCII. But there aren't any of these
2911           * currently, nor ever likely, so don't take the time to
2912           * test for them. The code that generates the
2913           * is_MULTI_foo() macros croaks should one actually get put
2914           * into Unicode .) */
2915 1297931         if (OP(scan) != EXACTFL
2916 376         && OP(scan) != EXACTFA
2917 376         && OP(scan) != EXACTFA_NO_TRIE)
2918           {
2919 376         count = utf8_length(s, multi_end);
2920 376         s = multi_end;
2921           }
2922           else {
2923 376         while (s < multi_end) {
2924 324         if (isASCII(*s)) {
2925 1297555         s++;
2926 1297931         goto next_iteration;
2927           }
2928           else {
2929 1297875         s += UTF8SKIP(s);
2930           }
2931 2166161         count++;
2932           }
2933           }
2934           }
2935            
2936           /* The delta is how long the sequence is minus 1 (1 is how long
2937           * the character that folds to the sequence is) */
2938 111824         *min_subtract += count - 1;
2939           next_iteration: ;
2940           }
2941           }
2942 111854         else if (OP(scan) == EXACTFA) {
2943            
2944           /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
2945           * fold to the ASCII range (and there are no existing ones in the
2946           * upper latin1 range). But, as outlined in the comments preceding
2947           * this function, we need to flag any occurrences of the sharp s.
2948           * This character forbids trie formation (because of added
2949           * complexity) */
2950 2166161         while (s < s_end) {
2951 1257148         if (*s == LATIN_SMALL_LETTER_SHARP_S) {
2952 1122117         OP(scan) = EXACTFA_NO_TRIE;
2953 518391         *has_exactf_sharp_s = TRUE;
2954 518391         break;
2955           }
2956 773267         s++;
2957 518391         continue;
2958           }
2959           }
2960 512773         else if (OP(scan) != EXACTFL) {
2961            
2962           /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
2963           * multi-char folds that are all Latin1. (This code knows that
2964           * there are no current multi-char folds possible with EXACTFL,
2965           * relying on fold_grind.t to catch any errors if the very unlikely
2966           * event happens that some get added in future Unicode versions.)
2967           * As explained in the comments preceding this function, we look
2968           * also for the sharp s in EXACTF nodes; it can be in the final
2969           * position. Otherwise we can stop looking 1 byte earlier because
2970           * have to find at least two characters for a multi-fold */
2971 32651         const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2972            
2973 32765         while (s < upper) {
2974 32697         int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2975 23193         if (! len) { /* Not a multi-char fold. */
2976 80         if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2977           {
2978 0         *has_exactf_sharp_s = TRUE;
2979           }
2980 80         s++;
2981 80         continue;
2982           }
2983            
2984 0         if (len == 2
2985 0         && isARG2_lower_or_UPPER_ARG1('s', *s)
2986 0         && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
2987           {
2988            
2989           /* EXACTF nodes need to know that the minimum length
2990           * changed so that a sharp s in the string can match this
2991           * ss in the pattern, but they remain EXACTF nodes, as they
2992           * won't match this unless the target string is is UTF-8,
2993           * which we don't know until runtime */
2994 0         if (OP(scan) != EXACTF) {
2995 1129039         OP(scan) = EXACTFU_SS;
2996           }
2997           }
2998            
2999 52724836         *min_subtract += len - 1;
3000 21780         s += len;
3001           }
3002           }
3003           }
3004            
3005           #ifdef DEBUGGING
3006           /* Allow dumping but overwriting the collection of skipped
3007           * ops and/or strings with fake optimized ops */
3008 6470         n = scan + NODE_SZ_STR(scan);
3009 6574         while (n <= stop) {
3010 6218         OP(n) = OPTIMIZED;
3011 6218         FLAGS(n) = 0;
3012 6218         NEXT_OFF(n) = 0;
3013 182         n++;
3014           }
3015           #endif
3016 306         DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3017 6470         return stopnow;
3018           }
3019            
3020           /* REx optimizer. Converts nodes into quicker variants "in place".
3021           Finds fixed substrings. */
3022            
3023           /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3024           to the position after last scanned or to NULL. */
3025            
3026           #define INIT_AND_WITHP \
3027           assert(!and_withp); \
3028           Newx(and_withp,1,struct regnode_charclass_class); \
3029           SAVEFREEPV(and_withp)
3030            
3031           /* this is a chain of data about sub patterns we are processing that
3032           need to be handled separately/specially in study_chunk. Its so
3033           we can simulate recursion without losing state. */
3034           struct scan_frame;
3035           typedef struct scan_frame {
3036           regnode *last; /* last node to process in this frame */
3037           regnode *next; /* next node to process when last is reached */
3038           struct scan_frame *prev; /*previous frame*/
3039           I32 stop; /* what stopparen do we use */
3040           } scan_frame;
3041            
3042            
3043           #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3044            
3045           STATIC SSize_t
3046 3306         S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3047           SSize_t *minlenp, SSize_t *deltap,
3048           regnode *last,
3049           scan_data_t *data,
3050           I32 stopparen,
3051           U8* recursed,
3052           struct regnode_charclass_class *and_withp,
3053           U32 flags, U32 depth)
3054           /* scanp: Start here (read-write). */
3055           /* deltap: Write maxlen-minlen here. */
3056           /* last: Stop before this one. */
3057           /* data: string data about the pattern */
3058           /* stopparen: treat close N as END */
3059           /* recursed: which subroutines have we recursed into */
3060           /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3061           {
3062           dVAR;
3063           /* There must be at least this number of characters to match */
3064           SSize_t min = 0;
3065           I32 pars = 0, code;
3066 3306         regnode *scan = *scanp, *next;
3067           SSize_t delta = 0;
3068 6488         int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3069           int is_inf_internal = 0; /* The studied chunk is infinite */
3070 4366         I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3071           scan_data_t data_fake;
3072           SV *re_trie_maxbuff = NULL;
3073 4366         regnode *first_non_open = scan;
3074           SSize_t stopmin = SSize_t_MAX;
3075           scan_frame *frame = NULL;
3076 2318         GET_RE_DEBUG_FLAGS_DECL;
3077            
3078 196         PERL_ARGS_ASSERT_STUDY_CHUNK;
3079            
3080           #ifdef DEBUGGING
3081 196         StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3082           #endif
3083            
3084 2318         if ( depth == 0 ) {
3085 168         while (first_non_open && OP(first_non_open) == OPEN)
3086 2130         first_non_open=regnext(first_non_open);
3087           }
3088            
3089            
3090           fake_study_recurse:
3091 15992         while ( scan && OP(scan) != END && scan < last ){
3092 15796         UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3093           node length to get a real minimum (because
3094           the folded version may be shorter) */
3095 15796         bool has_exactf_sharp_s = FALSE;
3096           /* Peephole optimizer: */
3097 22088         DEBUG_STUDYDATA("Peep:", data,depth);
3098 19966         DEBUG_PEEP("Peep",scan,depth);
3099            
3100           /* Its not clear to khw or hv why this is done here, and not in the
3101           * clauses that deal with EXACT nodes. khw's guess is that it's
3102           * because of a previous design */
3103 19966         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3104            
3105           /* Follow the next-chain of the current node and optimize
3106           away all the NOTHINGs from it. */
3107 19966         if (OP(scan) != CURLYX) {
3108 20274         const int max = (reg_off_by_arg[OP(scan)]
3109 19966         ? I32_MAX
3110           /* I32 may be smaller than U16 on CRAYs! */
3111 19966         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3112 19966         int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3113           int noff;
3114 52703364         regnode *n = scan;
3115            
3116           /* Skip NOTHING and LONGJMP. */
3117 12679619         while ((n = regnext(n))
3118 12679311         && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3119 928774         || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3120 928466         && off + noff < max)
3121 928466         off += noff;
3122 11750845         if (reg_off_by_arg[OP(scan)])
3123 12679003         ARG(scan) = off;
3124           else
3125 12679311         NEXT_OFF(scan) = off;
3126           }
3127            
3128            
3129            
3130           /* The principal pseudo-switch. Cannot be a switch, since we
3131           look into several different things. */
3132 7981987         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3133 3494223         || OP(scan) == IFTHEN) {
3134 3493931         next = regnext(scan);
3135 3493931         code = OP(scan);
3136           /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3137            
3138 7981687         if (OP(next) == code || code == IFTHEN) {
3139           /* NOTE - There is similar code to this block below for handling
3140           TRIE nodes on a re-study. If you change stuff here check there
3141           too. */
3142           SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3143           struct regnode_charclass_class accum;
3144 7981687         regnode * const startbranch=scan;
3145            
3146 511202         if (flags & SCF_DO_SUBSTR)
3147 7981687         SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3148 4457770         if (flags & SCF_DO_STCLASS)
3149 8112063         cl_init_zero(pRExC_state, &accum);
3150            
3151 7981743         while (OP(scan) == code) {
3152           SSize_t deltanext, minnext, fake;
3153           I32 f = 0;
3154           struct regnode_charclass_class this_class;
3155            
3156 129794         num++;
3157 7981735         data_fake.flags = 0;
3158 7981735         if (data) {
3159 7981735         data_fake.whilem_c = data->whilem_c;
3160 12679059         data_fake.last_closep = data->last_closep;
3161           }
3162           else
3163 9116401         data_fake.last_closep = &fake;
3164            
3165 6053568         data_fake.pos_delta = delta;
3166 6049736         next = regnext(scan);
3167 56         scan = NEXTOPER(scan);
3168 40798         if (code != BRANCH)
3169 6134996         scan = NEXTOPER(scan);
3170 6135052         if (flags & SCF_DO_STCLASS) {
3171 6135042         cl_init(pRExC_state, &this_class);
3172 6053558         data_fake.start_class = &this_class;
3173           f = SCF_DO_STCLASS_AND;
3174           }
3175 81540         if (flags & SCF_WHILEM_VISITED_POS)
3176 20900702         f |= SCF_WHILEM_VISITED_POS;
3177            
3178           /* we suppose the run is continuous, last=next...*/
3179 20859960         minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3180           next, &data_fake,
3181           stopparen, recursed, NULL, f,depth+1);
3182 2363092         if (min1 > minnext)
3183           min1 = minnext;
3184 6135052         if (deltanext == SSize_t_MAX) {
3185           is_inf = is_inf_internal = 1;
3186           max1 = SSize_t_MAX;
3187 6135052         } else if (max1 < minnext + deltanext)
3188 6053522         max1 = minnext + deltanext;
3189 6544063         scan = next;
3190 432984         if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3191 432848         pars++;
3192 136         if (data_fake.flags & SCF_SEEN_ACCEPT) {
3193 432928         if ( stopmin > minnext)
3194 432928         stopmin = min + min1;
3195 12679003         flags &= ~SCF_DO_SUBSTR;
3196 40024053         if (data)
3197 1832658         data->flags |= SCF_SEEN_ACCEPT;
3198           }
3199 1832714         if (data) {
3200 1832714         if (data_fake.flags & SF_HAS_EVAL)
3201 1018818         data->flags |= SF_HAS_EVAL;
3202 1832714         data->whilem_c = data_fake.whilem_c;
3203           }
3204 797898         if (flags & SCF_DO_STCLASS)
3205 797888         cl_or(pRExC_state, &accum, &this_class);
3206           }
3207 797850         if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3208           min1 = 0;
3209 1832666         if (flags & SCF_DO_SUBSTR) {
3210 35100         data->pos_min += min1;
3211 1832666         if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3212 1832658         data->pos_delta = SSize_t_MAX;
3213           else
3214 1832666         data->pos_delta += max1 - min1;
3215 1018826         if (max1 != min1 || is_inf)
3216 1018822         data->longest = &(data->longest_float);
3217           }
3218 8         min += min1;
3219 1018826         if (delta == SSize_t_MAX
3220 1018826         || SSize_t_MAX - delta - (max1 - min1) < 0)
3221           delta = SSize_t_MAX;
3222           else
3223 127476         delta += max1 - min1;
3224 1832666         if (flags & SCF_DO_STCLASS_OR) {
3225 482397         cl_or(pRExC_state, data->start_class, &accum);
3226 274802         if (min1) {
3227 229976         cl_and(data->start_class, and_withp);
3228 35727         flags &= ~SCF_DO_STCLASS;
3229           }
3230           }
3231 346264         else if (flags & SCF_DO_STCLASS_AND) {
3232 346262         if (min1) {
3233 346262         cl_and(data->start_class, &accum);
3234 274808         flags &= ~SCF_DO_STCLASS;
3235           }
3236           else {
3237           /* Switch to OR mode: cache the old value of
3238           * data->start_class */
3239 274802         INIT_AND_WITHP;
3240 274802         StructCopy(data->start_class, and_withp,
3241           struct regnode_charclass_class);
3242 44826         flags &= ~SCF_DO_STCLASS_AND;
3243 229976         StructCopy(&accum, data->start_class,
3244           struct regnode_charclass_class);
3245 229976         flags |= SCF_DO_STCLASS_OR;
3246 202190         SET_SSC_EOS(data->start_class);
3247           }
3248           }
3249            
3250 48912         if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3251           /* demq.
3252            
3253           Assuming this was/is a branch we are dealing with: 'scan' now
3254           points at the item that follows the branch sequence, whatever
3255           it is. We now start at the beginning of the sequence and look
3256           for subsequences of
3257            
3258           BRANCH->EXACT=>x1
3259           BRANCH->EXACT=>x2
3260           tail
3261            
3262           which would be constructed from a pattern like /A|LIST|OF|WORDS/
3263            
3264           If we can find such a subsequence we need to turn the first
3265           element into a trie and then add the subsequent branch exact
3266           strings to the trie.
3267            
3268           We have two cases
3269            
3270           1. patterns where the whole set of branches can be converted.
3271            
3272           2. patterns where only a subset can be converted.
3273            
3274           In case 1 we can replace the whole set with a single regop
3275           for the trie. In case 2 we need to keep the start and end
3276           branches so
3277            
3278           'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3279           becomes BRANCH TRIE; BRANCH X;
3280            
3281           There is an additional case, that being where there is a
3282           common prefix, which gets split out into an EXACT like node
3283           preceding the TRIE node.
3284            
3285           If x(1..n)==tail then we can do a simple trie, if not we make
3286           a "jump" trie, such that when we match the appropriate word
3287           we "jump" to the appropriate tail node. Essentially we turn
3288           a nested if into a case structure of sorts.
3289            
3290           */
3291            
3292           int made=0;
3293 153294         if (!re_trie_maxbuff) {
3294 6912         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3295 6912         if (!SvIOK(re_trie_maxbuff))
3296 71454         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3297           }
3298 18327959         if ( SvIV(re_trie_maxbuff)>=0 ) {
3299           regnode *cur;
3300           regnode *first = (regnode *)NULL;
3301           regnode *last = (regnode *)NULL;
3302 18292232         regnode *tail = scan;
3303           U8 trietype = 0;
3304           U32 count=0;
3305            
3306           #ifdef DEBUGGING
3307 2072174         SV * const mysv = sv_newmortal(); /* for dumping */
3308           #endif
3309           /* var tail is used because there may be a TAIL
3310           regop in the way. Ie, the exacts will point to the
3311           thing following the TAIL, but the last branch will
3312           point at the TAIL. So we advance tail. If we
3313           have nested (?:) we may have to move through several
3314           tails.
3315           */
3316            
3317 1486420         while ( OP( tail ) == TAIL ) {
3318           /* this is the TAIL generated by (?:) */
3319 59536         tail = regnext( tail );
3320           }
3321            
3322          
3323 39232         DEBUG_TRIE_COMPILE_r({
3324           regprop(RExC_rx, mysv, tail );
3325           PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3326           (int)depth * 2 + 2, "",
3327           "Looking for TRIE'able sequences. Tail node is: ",
3328           SvPV_nolen_const( mysv )
3329           );
3330           });
3331          
3332           /*
3333            
3334           Step through the branches
3335           cur represents each branch,
3336           noper is the first thing to be matched as part of that branch
3337           noper_next is the regnext() of that node.
3338            
3339           We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3340           via a "jump trie" but we also support building with NOJUMPTRIE,
3341           which restricts the trie logic to structures like /FOO|BAR/.
3342            
3343           If noper is a trieable nodetype then the branch is a possible optimization
3344           target. If we are building under NOJUMPTRIE then we require that noper_next
3345           is the same as scan (our current position in the regex program).
3346            
3347           Once we have two or more consecutive such branches we can create a
3348           trie of the EXACT's contents and stitch it in place into the program.
3349            
3350           If the sequence represents all of the branches in the alternation we
3351           replace the entire thing with a single TRIE node.
3352            
3353           Otherwise when it is a subsequence we need to stitch it in place and
3354           replace only the relevant branches. This means the first branch has
3355           to remain as it is used by the alternation logic, and its next pointer,
3356           and needs to be repointed at the item on the branch chain following
3357           the last branch we have optimized away.
3358            
3359           This could be either a BRANCH, in which case the subsequence is internal,
3360           or it could be the item following the branch sequence in which case the
3361           subsequence is at the end (which does not necessarily mean the first node
3362           is the start of the alternation).
3363            
3364           TRIE_TYPE(X) is a define which maps the optype to a trietype.
3365            
3366           optype | trietype
3367           ----------------+-----------
3368           NOTHING | NOTHING
3369           EXACT | EXACT
3370           EXACTFU | EXACTFU
3371           EXACTFU_SS | EXACTFU
3372           EXACTFA | EXACTFA
3373            
3374            
3375           */
3376           #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3377           ( EXACT == (X) ) ? EXACT : \
3378           ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3379           ( EXACTFA == (X) ) ? EXACTFA : \
3380           0 )
3381            
3382           /* dont use tail as the end marker for this traverse */
3383 37628         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3384 37620         regnode * const noper = NEXTOPER( cur );
3385 56         U8 noper_type = OP( noper );
3386 56         U8 noper_trietype = TRIE_TYPE( noper_type );
3387           #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3388 56         regnode * const noper_next = regnext( noper );
3389 56         U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3390 56         U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3391           #endif
3392            
3393 56         DEBUG_TRIE_COMPILE_r({
3394           regprop(RExC_rx, mysv, cur);
3395           PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3396           (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3397            
3398           regprop(RExC_rx, mysv, noper);
3399           PerlIO_printf( Perl_debug_log, " -> %s",
3400           SvPV_nolen_const(mysv));
3401            
3402           if ( noper_next ) {
3403           regprop(RExC_rx, mysv, noper_next );
3404           PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3405           SvPV_nolen_const(mysv));
3406           }
3407           PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3408           REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3409           PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3410           );
3411           });
3412            
3413           /* Is noper a trieable nodetype that can be merged with the
3414           * current trie (if there is one)? */
3415 56         if ( noper_trietype
3416 56         &&
3417           (
3418 39280         ( noper_trietype == NOTHING)
3419 59590         || ( trietype == NOTHING )
3420 1832712         || ( trietype == noper_trietype )
3421           )
3422           #ifdef NOJUMPTRIE
3423           && noper_next == tail
3424           #endif
3425 38191443         && count < U16_MAX)
3426           {
3427           /* Handle mergable triable node
3428           * Either we are the first node in a new trieable sequence,
3429           * in which case we do some bookkeeping, otherwise we update
3430           * the end pointer. */
3431 10896442         if ( !first ) {
3432           first = cur;
3433 10896394         if ( noper_trietype == NOTHING ) {
3434           #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3435           regnode * const noper_next = regnext( noper );
3436           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3437           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3438           #endif
3439            
3440 10896394         if ( noper_next_trietype ) {
3441           trietype = noper_next_trietype;
3442 2414754         } else if (noper_next_type) {
3443           /* a NOTHING regop is 1 regop wide. We need at least two
3444           * for a trie so we can't merge this in */
3445           first = NULL;
3446           }
3447           } else {
3448           trietype = noper_trietype;
3449           }
3450           } else {
3451 2414802         if ( trietype == NOTHING )
3452           trietype = noper_trietype;
3453           last = cur;
3454           }
3455 1144186         if (first)
3456 963624         count++;
3457           } /* end handle mergable triable node */
3458           else {
3459           /* handle unmergable node -
3460           * noper may either be a triable node which can not be tried
3461           * together with the current trie, or a non triable node */
3462 963584         if ( last ) {
3463           /* If last is set and trietype is not NOTHING then we have found
3464           * at least two triable branch sequences in a row of a similar
3465           * trietype so we can turn them into a trie. If/when we
3466           * allow NOTHING to start a trie sequence this condition will be
3467           * required, and it isn't expensive so we leave it in for now. */
3468 296026         if ( trietype && trietype != NOTHING )
3469 296026         make_trie( pRExC_state,
3470           startbranch, first, cur, tail, count,
3471           trietype, depth+1 );
3472           last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3473           }
3474 296034         if ( noper_trietype
3475           #ifdef NOJUMPTRIE
3476           && noper_next == tail
3477           #endif
3478           ){
3479           /* noper is triable, so we can start a new trie sequence */
3480           count = 1;
3481           first = cur;
3482           trietype = noper_trietype;
3483 848112         } else if (first) {
3484           /* if we already saw a first but the current node is not triable then we have
3485           * to reset the first information. */
3486           count = 0;
3487           first = NULL;
3488           trietype = 0;
3489           }
3490           } /* end handle unmergable node */
3491           } /* loop over branches */
3492 667558         DEBUG_TRIE_COMPILE_r({
3493           regprop(RExC_rx, mysv, cur);
3494           PerlIO_printf( Perl_debug_log,
3495           "%*s- %s (%d) \n", (int)depth * 2 + 2,
3496           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3497            
3498           });
3499 848120         if ( last && trietype ) {
3500 3711545         if ( trietype != NOTHING ) {
3501           /* the last branch of the sequence was part of a trie,
3502           * so we have to construct it here outside of the loop
3503           */
3504 418977         made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3505           #ifdef TRIE_STUDY_OPT
3506 418985         if ( ((made == MADE_EXACT_TRIE &&
3507 418977         startbranch == first)
3508 3292574         || ( first_non_open == first )) &&
3509           depth==0 ) {
3510 3292574         flags |= SCF_TRIE_RESTUDY;
3511 2975736         if ( startbranch == first
3512 2975736         && scan == tail )
3513           {
3514 4441028         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3515           }
3516           }
3517           #endif
3518           } else {
3519           /* at this point we know whatever we have is a NOTHING sequence/branch
3520           * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3521           */
3522 806         if ( startbranch == first ) {
3523           regnode *opt;
3524           /* the entire thing is a NOTHING sequence, something like this:
3525           * (?:|) So we can turn it into a plain NOTHING op. */
3526 4440386         DEBUG_TRIE_COMPILE_r({
3527           regprop(RExC_rx, mysv, cur);
3528           PerlIO_printf( Perl_debug_log,
3529           "%*s- %s (%d) \n", (int)depth * 2 + 2,
3530           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3531            
3532           });
3533 4440386         OP(startbranch)= NOTHING;
3534 4441024         NEXT_OFF(startbranch)= tail - startbranch;
3535 4441024         for ( opt= startbranch + 1; opt < tail ; opt++ )
3536 2414754         OP(opt)= OPTIMIZED;
3537           }
3538           }
3539           } /* end if ( last) */
3540           } /* TRIE_MAXBUF is non zero */
3541          
3542           } /* do trie */
3543          
3544           }
3545 2414754         else if ( code == BRANCHJ ) { /* single branch is optimized. */
3546 4441024         scan = NEXTOPER(NEXTOPER(scan));
3547           } else /* single branch is optimized. */
3548 4441024         scan = NEXTOPER(scan);
3549 5156027         continue;
3550 3965620         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3551           scan_frame *newframe = NULL;
3552           I32 paren;
3553           regnode *start;
3554           regnode *end;
3555            
3556 3965320         if (OP(scan) != SUSPEND) {
3557           /* set the pointer */
3558 5156019         if (OP(scan) == GOSUB) {
3559 5155657         paren = ARG(scan);
3560 5155657         RExC_recurse[ARG2L(scan)] = scan;
3561 5155657         start = RExC_open_parens[paren-1];
3562 860149         end = RExC_close_parens[paren-1];
3563           } else {
3564           paren = 0;
3565 5156019         start = RExC_rxi->program + 1;
3566 1939909         end = RExC_opend;
3567           }
3568 1939909         if (!recursed) {
3569 1939909         Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3570 1939909         SAVEFREEPV(recursed);
3571           }
3572 1939909         if (!PAREN_TEST(recursed,paren+1)) {
3573 5156019         PAREN_SET(recursed,paren+1);
3574 426292         Newx(newframe,1,scan_frame);
3575           } else {
3576 5156019         if (flags & SCF_DO_SUBSTR) {
3577 5156019         SCAN_COMMIT(pRExC_state,data,minlenp);
3578 1939909         data->longest = &(data->longest_float);
3579           }
3580           is_inf = is_inf_internal = 1;
3581 5156019         if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3582 3606443         cl_anything(pRExC_state, data->start_class);
3583 245366         flags &= ~SCF_DO_STCLASS;
3584           }
3585           } else {
3586 3361077         Newx(newframe,1,scan_frame);
3587           paren = stopparen;
3588 985875         start = scan+2;
3589 985875         end = regnext(scan);
3590           }
3591 985875         if (newframe) {
3592 985875         assert(start);
3593 985875         assert(end);
3594 985875         SAVEFREEPV(newframe);
3595 1549576         newframe->next = regnext(scan);
3596 61936         newframe->last = last;
3597 61936         newframe->stop = stopparen;
3598 1487640         newframe->prev = frame;
3599            
3600           frame = newframe;
3601 646732         scan = start;
3602           stopparen = paren;
3603           last = end;
3604            
3605 1549576         continue;
3606           }
3607           }
3608 5156319         else if (OP(scan) == EXACT) {
3609 2741119         SSize_t l = STR_LEN(scan);
3610           UV uc;
3611 5156163         if (UTF) {
3612 5154387         const U8 * const s = (U8*)STRING(scan);
3613 2482219         uc = utf8_to_uvchr_buf(s, s + l, NULL);
3614 7076         l = utf8_length(s, s + l);
3615           } else {
3616 338         uc = *((U8*)STRING(scan));
3617           }
3618 166         min += l;
3619 166         if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3620           /* The code below prefers earlier match for fixed
3621           offset, later match for variable offset. */
3622 136         if (data->last_end == -1) { /* Update the start info. */
3623 5156017         data->last_start_min = data->pos_min;
3624 7688130         data->last_start_max = is_inf
3625 4320789         ? SSize_t_MAX : data->pos_min + data->pos_delta;
3626           }
3627 5156133         sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3628 5156133         if (UTF)
3629 2014207         SvUTF8_on(data->last_found);
3630           {
3631 5156133         SV * const sv = data->last_found;
3632 2414888         MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3633 1179612         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3634 478270         if (mg && mg->mg_len >= 0)
3635 478152         mg->mg_len += utf8_length((U8*)STRING(scan),
3636           (U8*)STRING(scan)+STR_LEN(scan));
3637           }
3638 478270         data->last_end = data->pos_min + l;
3639 282985         data->pos_min += l; /* As in the first entry. */
3640 182450         data->flags &= ~SF_BEFORE_EOL;
3641           }
3642 412500         if (flags & SCF_DO_STCLASS_AND) {
3643           /* Check whether it is compatible with what we know already! */
3644           int compat = 1;
3645            
3646            
3647           /* If compatible, we or it in below. It is compatible if is
3648           * in the bitmp and either 1) its bit or its fold is set, or 2)
3649           * it's for a locale. Even if there isn't unicode semantics
3650           * here, at runtime there may be because of matching against a
3651           * utf8 string, so accept a possible false positive for
3652           * latin1-range folds */
3653 412612         if (uc >= 0x100 ||
3654 412478         (!(data->start_class->flags & ANYOF_LOCALE)
3655 470         && !ANYOF_BITMAP_TEST(data->start_class, uc)
3656 348         && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3657 412356         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3658           )
3659           {
3660           compat = 0;
3661           }
3662 412490         ANYOF_CLASS_ZERO(data->start_class);
3663 412490         ANYOF_BITMAP_ZERO(data->start_class);
3664 5156149         if (compat)
3665 2002516         ANYOF_BITMAP_SET(data->start_class, uc);
3666 1453778         else if (uc >= 0x100) {
3667           int i;
3668            
3669           /* Some Unicode code points fold to the Latin1 range; as
3670           * XXX temporary code, instead of figuring out if this is
3671           * one, just assume it is and set all the start class bits
3672           * that could be some such above 255 code point's fold
3673           * which will generate fals positives. As the code
3674           * elsewhere that does compute the fold settles down, it
3675           * can be extracted out and re-used here */
3676 1456838         for (i = 0; i < 256; i++){
3677 527818         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3678 515052         ANYOF_BITMAP_SET(data->start_class, i);
3679           }
3680           }
3681           }
3682 514838         CLEAR_SSC_EOS(data->start_class);
3683 657695         if (uc < 0x100)
3684 794420         data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3685           }
3686 514714         else if (flags & SCF_DO_STCLASS_OR) {
3687           /* false positive possible if the class is case-folded */
3688 630904         if (uc < 0x100)
3689 116200         ANYOF_BITMAP_SET(data->start_class, uc);
3690           else
3691 116200         data->start_class->flags |= ANYOF_UNICODE_ALL;
3692 116200         CLEAR_SSC_EOS(data->start_class);
3693 348         cl_and(data->start_class, and_withp);
3694           }
3695 492         flags &= ~SCF_DO_STCLASS;
3696           }
3697 116356         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3698 116234         SSize_t l = STR_LEN(scan);
3699 116234         UV uc = *((U8*)STRING(scan));
3700            
3701           /* Search for fixed substrings supports EXACT only. */
3702 398538         if (flags & SCF_DO_SUBSTR) {
3703 4641317         assert(data);
3704 1487696         SCAN_COMMIT(pRExC_state, data, minlenp);
3705           }
3706 1487724         if (UTF) {
3707 709004         const U8 * const s = (U8 *)STRING(scan);
3708 708790         uc = utf8_to_uvchr_buf(s, s + l, NULL);
3709 708790         l = utf8_length(s, s + l);
3710           }
3711 34         if (has_exactf_sharp_s) {
3712 1062285         RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3713           }
3714 708824         min += l - min_subtract;
3715 5156049         assert (min >= 0);
3716 631579         delta += min_subtract;
3717 5156049         if (flags & SCF_DO_SUBSTR) {
3718 3965322         data->pos_min += l - min_subtract;
3719 3965322         if (data->pos_min < 0) {
3720 118234         data->pos_min = 0;
3721           }
3722 118240         data->pos_delta += min_subtract;
3723 118240         if (min_subtract) {
3724 118234         data->longest = &(data->longest_float);
3725           }
3726           }
3727 118268         if (flags & SCF_DO_STCLASS_AND) {
3728           /* Check whether it is compatible with what we know already! */
3729           int compat = 1;
3730 3272         if (uc >= 0x100 ||
3731 118262         (!(data->start_class->flags & ANYOF_LOCALE)
3732 118262         && !ANYOF_BITMAP_TEST(data->start_class, uc)
3733 118234         && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3734           {
3735           compat = 0;
3736           }
3737 111634         ANYOF_CLASS_ZERO(data->start_class);
3738 14296         ANYOF_BITMAP_ZERO(data->start_class);
3739 14296         if (compat) {
3740 14296         ANYOF_BITMAP_SET(data->start_class, uc);
3741 14296         CLEAR_SSC_EOS(data->start_class);
3742 14296         if (OP(scan) == EXACTFL) {
3743           /* XXX This set is probably no longer necessary, and
3744           * probably wrong as LOCALE now is on in the initial
3745           * state */
3746 14268         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3747           }
3748           else {
3749            
3750           /* Also set the other member of the fold pair. In case
3751           * that unicode semantics is called for at runtime, use
3752           * the full latin1 fold. (Can't do this for locale,
3753           * because not known until runtime) */
3754 6988         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3755            
3756           /* All other (EXACTFL handled above) folds except under
3757           * /iaa that include s, S, and sharp_s also may include
3758           * the others */
3759 14296         if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
3760           {
3761 14296         if (uc == 's' || uc == 'S') {
3762 0         ANYOF_BITMAP_SET(data->start_class,
3763           LATIN_SMALL_LETTER_SHARP_S);
3764           }
3765 14296         else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3766 6628         ANYOF_BITMAP_SET(data->start_class, 's');
3767 9942         ANYOF_BITMAP_SET(data->start_class, 'S');
3768           }
3769           }
3770           }
3771           }
3772 6628         else if (uc >= 0x100) {
3773           int i;
3774 3965316         for (i = 0; i < 256; i++){
3775 5570619         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3776 3277556         ANYOF_BITMAP_SET(data->start_class, i);
3777           }
3778           }
3779           }
3780           }
3781 794028         else if (flags & SCF_DO_STCLASS_OR) {
3782 6283198         if (data->start_class->flags & ANYOF_LOC_FOLD) {
3783           /* false positive possible if the class is case-folded.
3784           Assume that the locale settings are the same... */
3785 6342588         if (uc < 0x100) {
3786 3965316         ANYOF_BITMAP_SET(data->start_class, uc);
3787 3943698         if (OP(scan) != EXACTFL) {
3788            
3789           /* And set the other member of the fold pair, but
3790           * can't do that in locale because not known until
3791           * run-time */
3792 3943698         ANYOF_BITMAP_SET(data->start_class,
3793           PL_fold_latin1[uc]);
3794            
3795           /* All folds except under /iaa that include s, S,
3796           * and sharp_s also may include the others */
3797 113028         if (OP(scan) != EXACTFA
3798 57382         && OP(scan) != EXACTFA_NO_TRIE)
3799           {
3800 116236         if (uc == 's' || uc == 'S') {
3801 113028         ANYOF_BITMAP_SET(data->start_class,
3802           LATIN_SMALL_LETTER_SHARP_S);
3803           }
3804 3208         else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3805 113028         ANYOF_BITMAP_SET(data->start_class, 's');
3806 113028         ANYOF_BITMAP_SET(data->start_class, 'S');
3807           }
3808           }
3809           }
3810           }
3811 113028         CLEAR_SSC_EOS(data->start_class);
3812           }
3813 113028         cl_and(data->start_class, and_withp);
3814           }
3815 127210         flags &= ~SCF_DO_STCLASS;
3816           }
3817 14270         else if (REGNODE_VARIES(OP(scan))) {
3818           SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
3819 14148         I32 fl = 0, f = flags;
3820 3943698         regnode * const oscan = scan;
3821           struct regnode_charclass_class this_class;
3822           struct regnode_charclass_class *oclass = NULL;
3823           I32 next_is_eval = 0;
3824            
3825 3965316         switch (PL_regkind[OP(scan)]) {
3826           case WHILEM: /* End of (?:...)* . */
3827 5156015         scan = NEXTOPER(scan);
3828 672         goto finish;
3829           case PLUS:
3830 8448583         if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3831 7893119         next = NEXTOPER(scan);
3832 932226         if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3833           mincount = 1;
3834           maxcount = REG_INFTY;
3835 932226         next = regnext(scan);
3836 8448583         scan = NEXTOPER(scan);
3837 33053         goto do_curly;
3838           }
3839           }
3840 25923         if (flags & SCF_DO_SUBSTR)
3841 25923         data->pos_min++;
3842 33053         min++;
3843           /* Fall through. */
3844           case STAR:
3845 360         if (flags & SCF_DO_STCLASS) {
3846           mincount = 0;
3847           maxcount = REG_INFTY;
3848 33053         next = regnext(scan);
3849 27295001         scan = NEXTOPER(scan);
3850 18188         goto do_curly;
3851           }
3852           is_inf = is_inf_internal = 1;
3853 17490         scan = regnext(scan);
3854 17490         if (flags & SCF_DO_SUBSTR) {
3855 4486185         SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3856 4477440         data->longest = &(data->longest_float);
3857           }
3858           goto optimize_curly_tail;
3859           case CURLY:
3860 4389990         if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3861 0         && (scan->flags == stopparen))
3862           {
3863           mincount = 1;
3864           maxcount = 1;
3865           } else {
3866 0         mincount = ARG1(scan);
3867 0         maxcount = ARG2(scan);
3868           }
3869 17490         next = regnext(scan);
3870 0         if (OP(scan) == CURLYX) {
3871 17490         I32 lp = (data ? *(data->last_closep) : 0);
3872 18188         scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3873           }
3874 18188         scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3875 18188         next_is_eval = (OP(scan) == EVAL);
3876           do_curly:
3877 1116         if (flags & SCF_DO_SUBSTR) {
3878 1116         if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3879 1116         pos_before = data->pos_min;
3880           }
3881 1116         if (data) {
3882 27276813         fl = data->flags;
3883 4266835         data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3884 2184871         if (is_inf)
3885 2184871         data->flags |= SF_IS_INF;
3886           }
3887 4266835         if (flags & SCF_DO_STCLASS) {
3888 4266835         cl_init(pRExC_state, &this_class);
3889 1456087         oclass = data->start_class;
3890 1456087         data->start_class = &this_class;
3891 167964         f |= SCF_DO_STCLASS_AND;
3892 0         f &= ~SCF_DO_STCLASS_OR;
3893           }
3894           /* Exclude from super-linear cache processing any {n,m}
3895           regops for which the combination of input pos and regex
3896           pos is not enough information to determine if a match
3897           will be possible.
3898            
3899           For example, in the regex /foo(bar\s*){4,8}baz/ with the
3900           regex pos at the \s*, the prospects for a match depend not
3901           only on the input position but also on how many (bar\s*)
3902           repeats into the {4,8} we are. */
3903 279325         if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3904 111361         f &= ~SCF_WHILEM_VISITED_POS;
3905            
3906           /* This will finish on WHILEM, setting scan, or on NULL: */
3907 442         minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3908           last, data, stopparen, recursed, NULL,
3909           (mincount == 0
3910           ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3911            
3912 28         if (flags & SCF_DO_STCLASS)
3913 442         data->start_class = oclass;
3914 111361         if (mincount == 0 || minnext == 0) {
3915 110947         if (flags & SCF_DO_STCLASS_OR) {
3916 856944         cl_or(pRExC_state, data->start_class, &this_class);
3917           }
3918 768424         else if (flags & SCF_DO_STCLASS_AND) {
3919           /* Switch to OR mode: cache the old value of
3920           * data->start_class */
3921 88520         INIT_AND_WITHP;
3922 294204         StructCopy(data->start_class, and_withp,
3923           struct regnode_charclass_class);
3924 294204         flags &= ~SCF_DO_STCLASS_AND;
3925 291516         StructCopy(&this_class, data->start_class,
3926           struct regnode_charclass_class);
3927 290952         flags |= SCF_DO_STCLASS_OR;
3928 74774664         SET_SSC_EOS(data->start_class);
3929           }
3930           } else { /* Non-zero len */
3931 74483712         if (flags & SCF_DO_STCLASS_OR) {
3932 64631706         cl_or(pRExC_state, data->start_class, &this_class);
3933 2688         cl_and(data->start_class, and_withp);
3934           }
3935 58         else if (flags & SCF_DO_STCLASS_AND)
3936 674595         cl_and(data->start_class, &this_class);
3937 673280         flags &= ~SCF_DO_STCLASS;
3938           }
3939 59340         if (!scan) /* It was not CURLYX, but CURLY. */
3940 25614         scan = next;
3941 25614         if (!(flags & SCF_TRIE_DOING_RESTUDY)
3942           /* ? quantifier ok, except for (?{ ... }) */
3943 22470         && (next_is_eval || !(mincount == 0 && maxcount == 1))
3944 22470         && (minnext == 0) && (deltanext == 0)
3945 5774790         && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3946 5752320         && maxcount <= REG_INFTY/3) /* Complement check for big count */
3947           {
3948           /* Fatal warnings may leak the regexp without this: */
3949 1325032         SAVEFREESV(RExC_rx_sv);
3950 3144         ckWARNreg(RExC_parse,
3951           "Quantifier unexpected on zero-length expression");
3952 0         (void)ReREFCNT_inc(RExC_rx_sv);
3953           }
3954            
3955 806616         min += minnext * mincount;
3956 804864         is_inf_internal |= deltanext == SSize_t_MAX
3957 765560         || (maxcount == REG_INFTY && minnext + deltanext > 0);
3958 3144         is_inf |= is_inf_internal;
3959 3144         if (is_inf)
3960           delta = SSize_t_MAX;
3961           else
3962 1456087         delta += (minnext + deltanext) * maxcount - minnext * mincount;
3963            
3964           /* Try powerful optimization CURLYX => CURLYN. */
3965 94794         if ( OP(oscan) == CURLYX && data
3966           && data->flags & SF_IN_PAR
3967 1456087         && !(data->flags & SF_HAS_EVAL)
3968 23009978         && !deltanext && minnext == 1 ) {
3969           /* Try to optimize to CURLYN. */
3970 4741990         regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3971           regnode * const nxt1 = nxt;
3972           #ifdef DEBUGGING
3973           regnode *nxt2;
3974           #endif
3975            
3976           /* Skip open. */
3977 2370995         nxt = regnext(nxt);
3978 2370995         if (!REGNODE_SIMPLE(OP(nxt))
3979 20638983         && !(PL_regkind[OP(nxt)] == EXACT
3980 219885         && STR_LEN(nxt) == 1))
3981           goto nogo;
3982           #ifdef DEBUGGING
3983           nxt2 = nxt;
3984           #endif
3985 219885         nxt = regnext(nxt);
3986 219885         if (OP(nxt) != CLOSE)
3987           goto nogo;
3988 144920         if (RExC_open_parens) {
3989 71920         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3990 0         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3991           }
3992           /* Now we know that nxt2 is the only contents: */
3993 0         oscan->flags = (U8)ARG(nxt);
3994 0         OP(oscan) = CURLYN;
3995 0         OP(nxt1) = NOTHING; /* was OPEN. */
3996            
3997           #ifdef DEBUGGING
3998 0         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3999 0         NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4000 0         NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4001 0         OP(nxt) = OPTIMIZED; /* was CLOSE. */
4002 219885         OP(nxt + 1) = OPTIMIZED; /* was count. */
4003 219885         NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4004           #endif
4005           }
4006           nogo:
4007            
4008           /* Try optimization CURLYX => CURLYM. */
4009 219885         if ( OP(oscan) == CURLYX && data
4010           && !(data->flags & SF_HAS_PAR)
4011 219885         && !(data->flags & SF_HAS_EVAL)
4012 219885         && !deltanext /* atom is fixed width */
4013 0         && minnext != 0 /* CURLYM can't handle zero width */
4014 219885         && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4015           ) {
4016           /* XXXX How to optimize if data == 0? */
4017           /* Optimize to a simpler form. */
4018 219885         regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4019           regnode *nxt2;
4020            
4021 64270         OP(oscan) = CURLYM;
4022 9444         while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4023 9444         && (OP(nxt2) != WHILEM))
4024           nxt = nxt2;
4025 219885         OP(nxt2) = SUCCEED; /* Whas WHILEM */
4026           /* Need to optimize away parenths. */
4027 217387         if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4028           /* Set the parenth number. */
4029 219885         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4030            
4031 219885         oscan->flags = (U8)ARG(nxt);
4032 219885         if (RExC_open_parens) {
4033 219885         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4034 46724         RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4035           }
4036 16         OP(nxt1) = OPTIMIZED; /* was OPEN. */
4037 46708         OP(nxt) = OPTIMIZED; /* was CLOSE. */
4038            
4039           #ifdef DEBUGGING
4040 2         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4041 46706         OP(nxt + 1) = OPTIMIZED; /* was count. */
4042 219867         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4043 219867         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4044           #endif
4045           #if 0
4046           while ( nxt1 && (OP(nxt1) != WHILEM)) {
4047           regnode *nnxt = regnext(nxt1);
4048           if (nnxt == nxt) {
4049           if (reg_off_by_arg[OP(nxt1)])
4050           ARG_SET(nxt1, nxt2 - nxt1);
4051           else if (nxt2 - nxt1 < U16_MAX)
4052           NEXT_OFF(nxt1) = nxt2 - nxt1;
4053           else
4054           OP(nxt) = NOTHING; /* Cannot beautify */
4055           }
4056           nxt1 = nnxt;
4057           }
4058           #endif
4059           /* Optimize again: */
4060 1634         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4061           NULL, stopparen, recursed, NULL, 0,depth+1);
4062           }
4063           else
4064 219867         oscan->flags = 0;
4065           }
4066 0         else if ((OP(oscan) == CURLYX)
4067 219867         && (flags & SCF_WHILEM_VISITED_POS)
4068           /* See the comment on a similar expression above.
4069           However, this time it's not a subexpression
4070           we care about, but the expression itself. */
4071 219867         && (maxcount == REG_INFTY)
4072 9444         && data && ++data->whilem_c < 16) {
4073           /* This stays as CURLYX, we can put the count/of pair. */
4074           /* Find WHILEM (as in regexec.c) */
4075 178         regnode *nxt = oscan + NEXT_OFF(oscan);
4076            
4077 9266         if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4078 9266         nxt += ARG(nxt);
4079 9266         PREVOPER(nxt)->flags = (U8)(data->whilem_c
4080 9266         | (RExC_whilem_seen << 4)); /* On WHILEM */
4081           }
4082 20419098         if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4083 5604918         pars++;
4084 5601450         if (flags & SCF_DO_SUBSTR) {
4085           SV *last_str = NULL;
4086 14814180         int counted = mincount != 0;
4087            
4088 5602032         if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4089           #if defined(SPARC64_GCC_WORKAROUND)
4090           SSize_t b = 0;
4091           STRLEN l = 0;
4092           const char *s = NULL;
4093           SSize_t old = 0;
4094            
4095           if (pos_before >= data->last_start_min)
4096           b = pos_before;
4097           else
4098           b = data->last_start_min;
4099            
4100           l = 0;
4101           s = SvPV_const(data->last_found, l);
4102           old = b - data->last_start_min;
4103            
4104           #else
4105 5601450         SSize_t b = pos_before >= data->last_start_min
4106 2388919         ? pos_before : data->last_start_min;
4107           STRLEN l;
4108 2388919         const char * const s = SvPV_const(data->last_found, l);
4109 5601450         SSize_t old = b - data->last_start_min;
4110           #endif
4111            
4112 5601450         if (UTF)
4113 9212148         old = utf8_hop((U8*)s, old) - (U8*)s;
4114 114456         l -= old;
4115           /* Get the added string: */
4116 114456         last_str = newSVpvn_utf8(s + old, l, UTF);
4117 9097692         if (deltanext == 0 && pos_before == b) {
4118           /* What was added is a constant string */
4119 746         if (mincount > 1) {
4120 506         SvGROW(last_str, (mincount * l) + 1);
4121 506         repeatcpy(SvPVX(last_str) + l,
4122 746         SvPVX_const(last_str), l, mincount - 1);
4123 116         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4124           /* Add additional parts. */
4125 116         SvCUR_set(data->last_found,
4126           SvCUR(data->last_found) - l);
4127 9096946         sv_catsv(data->last_found, last_str);
4128           {
4129 111760         SV * sv = data->last_found;
4130           MAGIC *mg =
4131 2856         SvUTF8(sv) && SvMAGICAL(sv) ?
4132 2856         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4133 111760         if (mg && mg->mg_len >= 0)
4134 158         mg->mg_len += CHR_SVLEN(last_str) - l;
4135           }
4136 111760         data->last_end += l * (mincount - 1);
4137           }
4138           } else {
4139           /* start offset must point into the last copy */
4140 8985186         data->last_start_min += minnext * (mincount - 1);
4141 184878         data->last_start_max += is_inf ? SSize_t_MAX
4142 184878         : (maxcount - 1) * (minnext + data->pos_delta);
4143           }
4144           }
4145           /* It is counted once already... */
4146 123612         data->pos_min += minnext * (mincount - counted);
4147           #if 0
4148           PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4149           " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4150           " maxcount=%"UVdf" mincount=%"UVdf"\n",
4151           (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4152           (UV)mincount);
4153           if (deltanext != SSize_t_MAX)
4154           PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4155           (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4156           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4157           #endif
4158 28         if (deltanext == SSize_t_MAX ||
4159 123612         -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4160 60         data->pos_delta = SSize_t_MAX;
4161           else
4162 0         data->pos_delta += - counted * deltanext +
4163 0         (minnext + deltanext) * maxcount - minnext * mincount;
4164 8861574         if (mincount != maxcount) {
4165           /* Cannot extend fixed substrings found inside
4166           the group. */
4167 36473         SCAN_COMMIT(pRExC_state,data,minlenp);
4168 36473         if (mincount && last_str) {
4169 36473         SV * const sv = data->last_found;
4170 36473         MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4171 32863         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4172            
4173 36473         if (mg)
4174 214         mg->mg_len = -1;
4175 36473         sv_setsv(sv, last_str);
4176 34725         data->last_end = data->pos_min;
4177 34725         data->last_start_min =
4178 6016         data->pos_min - CHR_SVLEN(last_str);
4179 5142         data->last_start_max = is_inf
4180           ? SSize_t_MAX
4181 5142         : data->pos_min + data->pos_delta
4182 5142         - CHR_SVLEN(last_str);
4183           }
4184 4470         data->longest = &(data->longest_float);
4185           }
4186 4470         SvREFCNT_dec(last_str);
4187           }
4188 672         if (data && (fl & SF_HAS_EVAL))
4189 5142         data->flags |= SF_HAS_EVAL;
4190           optimize_curly_tail:
4191 5142         if (OP(oscan) != CURLYX) {
4192 0         while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4193 0         && NEXT_OFF(next))
4194 5142         NEXT_OFF(oscan) += NEXT_OFF(next);
4195           }
4196 4470         continue;
4197           default: /* REF, and CLUMP only? */
4198 5142         if (flags & SCF_DO_SUBSTR) {
4199 3766         SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4200 1748         data->longest = &(data->longest_float);
4201           }
4202           is_inf = is_inf_internal = 1;
4203 3766         if (flags & SCF_DO_STCLASS_OR)
4204 3766         cl_anything(pRExC_state, data->start_class);
4205 5142         flags &= ~SCF_DO_STCLASS;
4206           break;
4207           }
4208           }
4209 3100         else if (OP(scan) == LNBREAK) {
4210 5142         if (flags & SCF_DO_STCLASS) {
4211           int value = 0;
4212 2298         CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4213 5142         if (flags & SCF_DO_STCLASS_AND) {
4214 3970         for (value = 0; value < 256; value++)
4215 1440         if (!is_VERTWS_cp(value))
4216 5142         ANYOF_BITMAP_CLEAR(data->start_class, value);
4217           }
4218           else {
4219 834         for (value = 0; value < 256; value++)
4220 5142         if (is_VERTWS_cp(value))
4221 0         ANYOF_BITMAP_SET(data->start_class, value);
4222           }
4223 0         if (flags & SCF_DO_STCLASS_OR)
4224 0         cl_and(data->start_class, and_withp);
4225 0         flags &= ~SCF_DO_STCLASS;
4226           }
4227 0         min++;
4228 5142         delta++; /* Because of the 2 char string cr-lf */
4229 4470         if (flags & SCF_DO_SUBSTR) {
4230 24         SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4231 4470         data->pos_min += 1;
4232 5142         data->pos_delta += 1;
4233 0         data->longest = &(data->longest_float);
4234           }
4235           }
4236 36595         else if (REGNODE_SIMPLE(OP(scan))) {
4237           int value = 0;
4238            
4239 32869         if (flags & SCF_DO_SUBSTR) {
4240 32869         SCAN_COMMIT(pRExC_state,data,minlenp);
4241 32869         data->pos_min++;
4242           }
4243 30019         min++;
4244 36479         if (flags & SCF_DO_STCLASS) {
4245           int loop_max = 256;
4246 36473         CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4247            
4248           /* Some of the logic below assumes that switching
4249           locale on will only add false positives. */
4250 36473         switch (PL_regkind[OP(scan)]) {
4251           U8 classnum;
4252            
4253           case SANY:
4254           default:
4255           #ifdef DEBUGGING
4256 42         Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4257           #endif
4258           do_default:
4259 42         if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4260 42         cl_anything(pRExC_state, data->start_class);
4261           break;
4262           case REG_ANY:
4263 42         if (OP(scan) == SANY)
4264           goto do_default;
4265 36431         if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4266 172         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4267 0         || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4268 0         cl_anything(pRExC_state, data->start_class);
4269           }
4270 172         if (flags & SCF_DO_STCLASS_AND || !value)
4271 172         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4272           break;
4273           case ANYOF:
4274 172         if (flags & SCF_DO_STCLASS_AND)
4275 172         cl_and(data->start_class,
4276           (struct regnode_charclass_class*)scan);
4277           else
4278 172         cl_or(pRExC_state, data->start_class,
4279           (struct regnode_charclass_class*)scan);
4280           break;
4281           case POSIXA:
4282           loop_max = 128;
4283           /* FALL THROUGH */
4284           case POSIXL:
4285           case POSIXD:
4286           case POSIXU:
4287 172         classnum = FLAGS(scan);
4288 36473         if (flags & SCF_DO_STCLASS_AND) {
4289 36473         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4290 46720021         ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4291 15077722         for (value = 0; value < loop_max; value++) {
4292 19658         if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4293 19658         ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4294           }
4295           }
4296           }
4297           }
4298           else {
4299 19658         if (data->start_class->flags & ANYOF_LOCALE) {
4300 19658         ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4301           }
4302           else {
4303            
4304           /* Even if under locale, set the bits for non-locale
4305           * in case it isn't a true locale-node. This will
4306           * create false positives if it truly is locale */
4307 19658         for (value = 0; value < loop_max; value++) {
4308 17472818         if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4309 17472818         ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4310           }
4311           }
4312           }
4313           }
4314           break;
4315           case NPOSIXA:
4316           loop_max = 128;
4317           /* FALL THROUGH */
4318           case NPOSIXL:
4319           case NPOSIXU:
4320           case NPOSIXD:
4321 17472818         classnum = FLAGS(scan);
4322 4343511         if (flags & SCF_DO_STCLASS_AND) {
4323 17472818         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4324 17472818         ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4325 1306136         for (value = 0; value < loop_max; value++) {
4326 1306136         if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4327 16166682         ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4328           }
4329           }
4330           }
4331           }
4332           else {
4333 3838307         if (data->start_class->flags & ANYOF_LOCALE) {
4334 3838307         ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4335           }
4336           else {
4337            
4338           /* Even if under locale, set the bits for non-locale in
4339           * case it isn't a true locale-node. This will create
4340           * false positives if it truly is locale */
4341 17472818         for (value = 0; value < loop_max; value++) {
4342 382429         if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4343 17472818         ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4344           }
4345           }
4346 33895         if (PL_regkind[OP(scan)] == NPOSIXD) {
4347 17472818         data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4348           }
4349           }
4350           }
4351           break;
4352           }
4353 2753131         if (flags & SCF_DO_STCLASS_OR)
4354 2753131         cl_and(data->start_class, and_withp);
4355 2753131         flags &= ~SCF_DO_STCLASS;
4356           }
4357           }
4358 2753247         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4359 666260         data->flags |= (OP(scan) == MEOL
4360           ? SF_BEFORE_MEOL
4361 2086883         : SF_BEFORE_SEOL);
4362 2753135         SCAN_COMMIT(pRExC_state, data, minlenp);
4363            
4364           }
4365 2753243         else if ( PL_regkind[OP(scan)] == BRANCHJ
4366           /* Lookbehind, or need to calculate parens/evals/stclass: */
4367 2753131         && (scan->flags || data || (flags & SCF_DO_STCLASS))
4368 48         && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4369 48         if ( OP(scan) == UNLESSM &&
4370 48         scan->flags == 0 &&
4371 0         OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4372 0         OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4373           ) {
4374           regnode *opt;
4375 0         regnode *upto= regnext(scan);
4376 0         DEBUG_PARSE_r({
4377           SV * const mysv_val=sv_newmortal();
4378           DEBUG_STUDYDATA("OPFAIL",data,depth);
4379            
4380           /*DEBUG_PARSE_MSG("opfail");*/
4381           regprop(RExC_rx, mysv_val, upto);
4382           PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4383           SvPV_nolen_const(mysv_val),
4384           (IV)REG_NODE_NUM(upto),
4385           (IV)(upto - scan)
4386           );
4387           });
4388 0         OP(scan) = OPFAIL;
4389 0         NEXT_OFF(scan) = upto - scan;
4390 0         for (opt= scan + 1; opt < upto ; opt++)
4391 0         OP(opt) = OPTIMIZED;
4392 336         scan= upto;
4393 288         continue;
4394           }
4395           if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4396           || OP(scan) == UNLESSM )
4397           {
4398           /* Negative Lookahead/lookbehind
4399           In this case we can't do fixed string optimisation.
4400           */
4401            
4402 48         SSize_t deltanext, minnext, fake = 0;
4403           regnode *nscan;
4404           struct regnode_charclass_class intrnl;
4405           int f = 0;
4406            
4407 48         data_fake.flags = 0;
4408 4931834         if (data) {
4409 4931834         data_fake.whilem_c = data->whilem_c;
4410 4477820         data_fake.last_closep = data->last_closep;
4411           }
4412           else
4413 4477820         data_fake.last_closep = &fake;
4414 4477814         data_fake.pos_delta = delta;
4415 4477814         if ( flags & SCF_DO_STCLASS && !scan->flags
4416 56         && OP(scan) == IFMATCH ) { /* Lookahead */
4417 454014         cl_init(pRExC_state, &intrnl);
4418 50512         data_fake.start_class = &intrnl;
4419           f |= SCF_DO_STCLASS_AND;
4420           }
4421 50512         if (flags & SCF_WHILEM_VISITED_POS)
4422 2547655         f |= SCF_WHILEM_VISITED_POS;
4423 0         next = regnext(scan);
4424 0         nscan = NEXTOPER(NEXTOPER(scan));
4425 0         minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4426           last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4427 0         if (scan->flags) {
4428 0         if (deltanext) {
4429 0         FAIL("Variable length lookbehind not implemented");
4430           }
4431 312412         else if (minnext > (I32)U8_MAX) {
4432 312412         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4433           }
4434 312412         scan->flags = (U8)minnext;
4435           }
4436 7322298         if (data) {
4437 6853680         if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4438 6853676         pars++;
4439 4         if (data_fake.flags & SF_HAS_EVAL)
4440 4         data->flags |= SF_HAS_EVAL;
4441 6853680         data->whilem_c = data_fake.whilem_c;
4442           }
4443 3482         if (f & SCF_DO_STCLASS_AND) {
4444 202         if (flags & SCF_DO_STCLASS_OR) {
4445           /* OR before, AND after: ideally we would recurse with
4446           * data_fake to get the AND applied by study of the
4447           * remainder of the pattern, and then derecurse;
4448           * *** HACK *** for now just treat as "no information".
4449           * See [perl #56690].
4450           */
4451 202         cl_init(pRExC_state, data->start_class);
4452           } else {
4453           /* AND before and after: combine and continue */
4454 3280         const int was = TEST_SSC_EOS(data->start_class);
4455            
4456 202         cl_and(data->start_class, &intrnl);
4457 202         if (was)
4458 6853680         SET_SSC_EOS(data->start_class);
4459           }
4460           }
4461           }
4462           #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4463           else {
4464           /* Positive Lookahead/lookbehind
4465           In this case we can do fixed string optimisation,
4466           but we must be careful about it. Note in the case of
4467           lookbehind the positions will be offset by the minimum
4468