File Coverage

ext/re/re_exec.c
Criterion Covered Total %
statement 2080 2333 89.2
branch n/a
condition n/a
subroutine n/a
total 2080 2333 89.2


line stmt bran cond sub time code
1           /* regexec.c
2           */
3            
4           /*
5           * One Ring to rule them all, One Ring to find them
6           &
7           * [p.v of _The Lord of the Rings_, opening poem]
8           * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9           * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10           */
11            
12           /* This file contains functions for executing a regular expression. See
13           * also regcomp.c which funnily enough, contains functions for compiling
14           * a regular expression.
15           *
16           * This file is also copied at build time to ext/re/re_exec.c, where
17           * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18           * This causes the main functions to be compiled under new names and with
19           * debugging support added, which makes "use re 'debug'" work.
20           */
21            
22           /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23           * confused with the original package (see point 3 below). Thanks, Henry!
24           */
25            
26           /* Additional note: this code is very heavily munged from Henry's version
27           * in places. In some spots I've traded clarity for efficiency, so don't
28           * blame Henry for some of the lack of readability.
29           */
30            
31           /* The names of the functions have been changed from regcomp and
32           * regexec to pregcomp and pregexec in order to avoid conflicts
33           * with the POSIX routines of the same names.
34           */
35            
36           #ifdef PERL_EXT_RE_BUILD
37           #include "re_top.h"
38           #endif
39            
40           /* At least one required character in the target string is expressible only in
41           * UTF-8. */
42           static const char* const non_utf8_target_but_utf8_required
43           = "Can't match, because target string needs to be in UTF-8\n";
44            
45           #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46           DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47           goto target; \
48           } STMT_END
49            
50           /*
51           * pregcomp and pregexec -- regsub and regerror are not used in perl
52           *
53           * Copyright (c) 1986 by University of Toronto.
54           * Written by Henry Spencer. Not derived from licensed software.
55           *
56           * Permission is granted to anyone to use this software for any
57           * purpose on any computer system, and to redistribute it freely,
58           * subject to the following restrictions:
59           *
60           * 1. The author is not responsible for the consequences of use of
61           * this software, no matter how awful, even if they arise
62           * from defects in it.
63           *
64           * 2. The origin of this software must not be misrepresented, either
65           * by explicit claim or by omission.
66           *
67           * 3. Altered versions must be plainly marked as such, and must not
68           * be misrepresented as being the original software.
69           *
70           **** Alterations to Henry's code are...
71           ****
72           **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73           **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74           **** by Larry Wall and others
75           ****
76           **** You may distribute under the terms of either the GNU General Public
77           **** License or the Artistic License, as specified in the README file.
78           *
79           * Beware that some of this code is subtly aware of the way operator
80           * precedence is structured in regular expressions. Serious changes in
81           * regular-expression syntax might require a total rethink.
82           */
83           #include "EXTERN.h"
84           #define PERL_IN_REGEXEC_C
85           #include "perl.h"
86            
87           #ifdef PERL_IN_XSUB_RE
88           # include "re_comp.h"
89           #else
90           # include "regcomp.h"
91           #endif
92            
93           #include "inline_invlist.c"
94           #include "unicode_constants.h"
95            
96           #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97            
98           #ifndef STATIC
99           #define STATIC static
100           #endif
101            
102           /* Valid for non-utf8 strings: avoids the reginclass
103           * call if there are no complications: i.e., if everything matchable is
104           * straight forward in the bitmap */
105           #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
106           : ANYOF_BITMAP_TEST(p,*(c)))
107            
108           /*
109           * Forwards.
110           */
111            
112           #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
113           #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
114            
115           #define HOPc(pos,off) \
116           (char *)(reginfo->is_utf8_target \
117           ? reghop3((U8*)pos, off, \
118           (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
119           : (U8*)(pos + off))
120           #define HOPBACKc(pos, off) \
121           (char*)(reginfo->is_utf8_target \
122           ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
123           : (pos - off >= reginfo->strbeg) \
124           ? (U8*)pos - off \
125           : NULL)
126            
127           #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
128           #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
129            
130            
131           #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
132           #define NEXTCHR_IS_EOS (nextchr < 0)
133            
134           #define SET_nextchr \
135           nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
136            
137           #define SET_locinput(p) \
138           locinput = (p); \
139           SET_nextchr
140            
141            
142           #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
143           if (!swash_ptr) { \
144           U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
145           swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
146           1, 0, NULL, &flags); \
147           assert(swash_ptr); \
148           } \
149           } STMT_END
150            
151           /* If in debug mode, we test that a known character properly matches */
152           #ifdef DEBUGGING
153           # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
154           property_name, \
155           utf8_char_in_property) \
156           LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
157           assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
158           #else
159           # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
160           property_name, \
161           utf8_char_in_property) \
162           LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
163           #endif
164            
165           #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
166           PL_utf8_swash_ptrs[_CC_WORDCHAR], \
167           swash_property_names[_CC_WORDCHAR], \
168           LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
169            
170           #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
171           STMT_START { \
172           LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
173           "_X_regular_begin", \
174           LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
175           LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
176           "_X_extend", \
177           COMBINING_GRAVE_ACCENT_UTF8); \
178           } STMT_END
179            
180           #define PLACEHOLDER /* Something for the preprocessor to grab onto */
181           /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
182            
183           /* for use after a quantifier and before an EXACT-like node -- japhy */
184           /* it would be nice to rework regcomp.sym to generate this stuff. sigh
185           *
186           * NOTE that *nothing* that affects backtracking should be in here, specifically
187           * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
188           * node that is in between two EXACT like nodes when ascertaining what the required
189           * "follow" character is. This should probably be moved to regex compile time
190           * although it may be done at run time beause of the REF possibility - more
191           * investigation required. -- demerphq
192           */
193           #define JUMPABLE(rn) ( \
194           OP(rn) == OPEN || \
195           (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
196           OP(rn) == EVAL || \
197           OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
198           OP(rn) == PLUS || OP(rn) == MINMOD || \
199           OP(rn) == KEEPS || \
200           (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
201           )
202           #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
203            
204           #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
205            
206           #if 0
207           /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
208           we don't need this definition. */
209           #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
210           #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
211           #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
212            
213           #else
214           /* ... so we use this as its faster. */
215           #define IS_TEXT(rn) ( OP(rn)==EXACT )
216           #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
217           #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
218           #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
219            
220           #endif
221            
222           /*
223           Search for mandatory following text node; for lookahead, the text must
224           follow but for lookbehind (rn->flags != 0) we skip to the next step.
225           */
226           #define FIND_NEXT_IMPT(rn) STMT_START { \
227           while (JUMPABLE(rn)) { \
228           const OPCODE type = OP(rn); \
229           if (type == SUSPEND || PL_regkind[type] == CURLY) \
230           rn = NEXTOPER(NEXTOPER(rn)); \
231           else if (type == PLUS) \
232           rn = NEXTOPER(rn); \
233           else if (type == IFMATCH) \
234           rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
235           else rn += NEXT_OFF(rn); \
236           } \
237           } STMT_END
238            
239           /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
240           * These are for the pre-composed Hangul syllables, which are all in a
241           * contiguous block and arranged there in such a way so as to facilitate
242           * alorithmic determination of their characteristics. As such, they don't need
243           * a swash, but can be determined by simple arithmetic. Almost all are
244           * GCB=LVT, but every 28th one is a GCB=LV */
245           #define SBASE 0xAC00 /* Start of block */
246           #define SCount 11172 /* Length of block */
247           #define TCount 28
248            
249           #define SLAB_FIRST(s) (&(s)->states[0])
250           #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
251            
252           static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
253           static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
254           static regmatch_state * S_push_slab(pTHX);
255            
256           #define REGCP_PAREN_ELEMS 3
257           #define REGCP_OTHER_ELEMS 3
258           #define REGCP_FRAME_ELEMS 1
259           /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
260           * are needed for the regexp context stack bookkeeping. */
261            
262           STATIC CHECKPOINT
263 67399426         S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
264           {
265           dVAR;
266 67399426         const int retval = PL_savestack_ix;
267 67399426         const int paren_elems_to_push =
268 67399426         (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
269 67399426         const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
270 67399426         const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
271           I32 p;
272 67399426         GET_RE_DEBUG_FLAGS_DECL;
273            
274 48         PERL_ARGS_ASSERT_REGCPPUSH;
275            
276 67399426         if (paren_elems_to_push < 0)
277 0         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
278           paren_elems_to_push);
279            
280 67399426         if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
281 233812454         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
282           " out of range (%lu-%ld)",
283           total_elems,
284           (unsigned long)maxopenparen,
285           (long)parenfloor);
286            
287 166413124         SSGROW(total_elems + REGCP_FRAME_ELEMS);
288          
289 166413124         DEBUG_BUFFERS_r(
290           if ((int)maxopenparen > (int)parenfloor)
291           PerlIO_printf(Perl_debug_log,
292           "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
293           PTR2UV(rex),
294           PTR2UV(rex->offs)
295           );
296           );
297 166413124         for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
298           /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
299 67399378         SSPUSHIV(rex->offs[p].end);
300 67399378         SSPUSHIV(rex->offs[p].start);
301 67399378         SSPUSHINT(rex->offs[p].start_tmp);
302 67399378         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
303           " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
304           (UV)p,
305           (IV)rex->offs[p].start,
306           (IV)rex->offs[p].start_tmp,
307           (IV)rex->offs[p].end
308           ));
309           }
310           /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
311 67399426         SSPUSHINT(maxopenparen);
312 59469062         SSPUSHINT(rex->lastparen);
313 59469062         SSPUSHINT(rex->lastcloseparen);
314 59469062         SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
315            
316 59469062         return retval;
317           }
318            
319           /* These are needed since we do not localize EVAL nodes: */
320           #define REGCP_SET(cp) \
321           DEBUG_STATE_r( \
322           PerlIO_printf(Perl_debug_log, \
323           " Setting an EVAL scope, savestack=%"IVdf"\n", \
324           (IV)PL_savestack_ix)); \
325           cp = PL_savestack_ix
326            
327           #define REGCP_UNWIND(cp) \
328           DEBUG_STATE_r( \
329           if (cp != PL_savestack_ix) \
330           PerlIO_printf(Perl_debug_log, \
331           " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
332           (IV)(cp), (IV)PL_savestack_ix)); \
333           regcpblow(cp)
334            
335           #define UNWIND_PAREN(lp, lcp) \
336           for (n = rex->lastparen; n > lp; n--) \
337           rex->offs[n].end = -1; \
338           rex->lastparen = n; \
339           rex->lastcloseparen = lcp;
340            
341            
342           STATIC void
343 59469062         S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
344           {
345           dVAR;
346           UV i;
347           U32 paren;
348 59469062         GET_RE_DEBUG_FLAGS_DECL;
349            
350 59469062         PERL_ARGS_ASSERT_REGCPPOP;
351            
352           /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
353 59469062         i = SSPOPUV;
354 228338814         assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
355 168869800         i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
356 168869800         rex->lastcloseparen = SSPOPINT;
357 168869800         rex->lastparen = SSPOPINT;
358 168869800         *maxopenparen_p = SSPOPINT;
359            
360 17893784         i -= REGCP_OTHER_ELEMS;
361           /* Now restore the parentheses context. */
362 168869800         DEBUG_BUFFERS_r(
363           if (i || rex->lastparen + 1 <= rex->nparens)
364           PerlIO_printf(Perl_debug_log,
365           "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
366           PTR2UV(rex),
367           PTR2UV(rex->offs)
368           );
369           );
370 471769998         paren = *maxopenparen_p;
371 412300984         for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
372           SSize_t tmps;
373 261324920         rex->offs[paren].start_tmp = SSPOPINT;
374 412300936         rex->offs[paren].start = SSPOPIV;
375 59469014         tmps = SSPOPIV;
376 1877336         if (paren <= rex->lastparen)
377 1877336         rex->offs[paren].end = tmps;
378 1877336         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
379           " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
380           (UV)paren,
381           (IV)rex->offs[paren].start,
382           (IV)rex->offs[paren].start_tmp,
383           (IV)rex->offs[paren].end,
384           (paren > rex->lastparen ? "(skipped)" : ""));
385           );
386 943578         paren--;
387           }
388           #if 1
389           /* It would seem that the similar code in regtry()
390           * already takes care of this, and in fact it is in
391           * a better location to since this code can #if 0-ed out
392           * but the code in regtry() is needed or otherwise tests
393           * requiring null fields (pat.t#187 and split.t#{13,14}
394           * (as of patchlevel 7877) will fail. Then again,
395           * this code seems to be necessary or otherwise
396           * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397           * --jhi updated by dapm */
398 5574         for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399 5526         if (i > *maxopenparen_p)
400 120         rex->offs[i].start = -1;
401 120         rex->offs[i].end = -1;
402 0         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
403           " \\%"UVuf": %s ..-1 undeffing\n",
404           (UV)i,
405           (i > *maxopenparen_p) ? "-1" : " "
406           ));
407           }
408           #endif
409 168         }
410            
411           /* restore the parens and associated vars at savestack position ix,
412           * but without popping the stack */
413            
414           STATIC void
415           S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
416           {
417 48         I32 tmpix = PL_savestack_ix;
418 48         PL_savestack_ix = ix;
419 168         regcppop(rex, maxopenparen_p);
420 260         PL_savestack_ix = tmpix;
421           }
422            
423           #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
424            
425           STATIC bool
426 120         S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
427           {
428           /* Returns a boolean as to whether or not 'character' is a member of the
429           * Posix character class given by 'classnum' that should be equivalent to a
430           * value in the typedef '_char_class_number'.
431           *
432           * Ideally this could be replaced by a just an array of function pointers
433           * to the C library functions that implement the macros this calls.
434           * However, to compile, the precise function signatures are required, and
435           * these may vary from platform to to platform. To avoid having to figure
436           * out what those all are on each platform, I (khw) am using this method,
437           * which adds an extra layer of function call overhead (unless the C
438           * optimizer strips it away). But we don't particularly care about
439           * performance with locales anyway. */
440            
441 120         switch ((_char_class_number) classnum) {
442 632         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
443 120         case _CC_ENUM_ALPHA: return isALPHA_LC(character);
444 120         case _CC_ENUM_ASCII: return isASCII_LC(character);
445 2744         case _CC_ENUM_BLANK: return isBLANK_LC(character);
446 120         case _CC_ENUM_CASED: return isLOWER_LC(character)
447 798         || isUPPER_LC(character);
448 60         case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
449 2763         case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
450 224         case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
451 224         case _CC_ENUM_LOWER: return isLOWER_LC(character);
452 84         case _CC_ENUM_PRINT: return isPRINT_LC(character);
453 140         case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
454 56         case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
455 84         case _CC_ENUM_SPACE: return isSPACE_LC(character);
456 56         case _CC_ENUM_UPPER: return isUPPER_LC(character);
457 0         case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
458 0         case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
459           default: /* VERTSPACE should never occur in locales */
460 56         Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
461           }
462            
463           assert(0); /* NOTREACHED */
464           return FALSE;
465           }
466            
467           STATIC bool
468 28         S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
469           {
470           /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
471           * 'character' is a member of the Posix character class given by 'classnum'
472           * that should be equivalent to a value in the typedef
473           * '_char_class_number'.
474           *
475           * This just calls isFOO_lc on the code point for the character if it is in
476           * the range 0-255. Outside that range, all characters avoid Unicode
477           * rules, ignoring any locale. So use the Unicode function if this class
478           * requires a swash, and use the Unicode macro otherwise. */
479            
480 28         PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
481            
482 0         if (UTF8_IS_INVARIANT(*character)) {
483 0         return isFOO_lc(classnum, *character);
484           }
485 112         else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
486 0         return isFOO_lc(classnum,
487           TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
488           }
489            
490 0         if (classnum < _FIRST_NON_SWASH_CC) {
491            
492           /* Initialize the swash unless done already */
493 0         if (! PL_utf8_swash_ptrs[classnum]) {
494 137167339         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
495 137167339         PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
496           swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
497           }
498            
499 137167339         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
500           character,
501           TRUE /* is UTF */ ));
502           }
503            
504 137167339         switch ((_char_class_number) classnum) {
505           case _CC_ENUM_SPACE:
506 137167339         case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
507            
508 137166981         case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
509 137166981         case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
510 137166981         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
511           default: return 0; /* Things like CNTRL are always
512           below 256 */
513           }
514            
515           assert(0); /* NOTREACHED */
516           return FALSE;
517           }
518            
519           /*
520           * pregexec and friends
521           */
522            
523           #ifndef PERL_IN_XSUB_RE
524           /*
525           - pregexec - match a regexp against a string
526           */
527           I32
528           Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
529           char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
530           /* stringarg: the point in the string at which to begin matching */
531           /* strend: pointer to null at end of string */
532           /* strbeg: real beginning of string */
533           /* minend: end of match must be >= minend bytes after stringarg. */
534           /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
535           * itself is accessed via the pointers above */
536           /* nosave: For optimizations. */
537           {
538           PERL_ARGS_ASSERT_PREGEXEC;
539            
540           return
541           regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
542           nosave ? 0 : REXEC_COPY_STR);
543           }
544           #endif
545            
546           /*
547           * Need to implement the following flags for reg_anch:
548           *
549           * USE_INTUIT_NOML - Useful to call re_intuit_start() first
550           * USE_INTUIT_ML
551           * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
552           * INTUIT_AUTORITATIVE_ML
553           * INTUIT_ONCE_NOML - Intuit can match in one location only.
554           * INTUIT_ONCE_ML
555           *
556           * Another flag for this function: SECOND_TIME (so that float substrs
557           * with giant delta may be not rechecked).
558           */
559            
560           /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
561           Otherwise, only SvCUR(sv) is used to get strbeg. */
562            
563           /* XXXX Some places assume that there is a fixed substring.
564           An update may be needed if optimizer marks as "INTUITable"
565           RExen without fixed substrings. Similarly, it is assumed that
566           lengths of all the strings are no more than minlen, thus they
567           cannot come from lookahead.
568           (Or minlen should take into account lookahead.)
569           NOTE: Some of this comment is not correct. minlen does now take account
570           of lookahead/behind. Further research is required. -- demerphq
571            
572           */
573            
574           /* A failure to find a constant substring means that there is no need to make
575           an expensive call to REx engine, thus we celebrate a failure. Similarly,
576           finding a substring too deep into the string means that fewer calls to
577           regtry() should be needed.
578            
579           REx compiler's optimizer found 4 possible hints:
580           a) Anchored substring;
581           b) Fixed substring;
582           c) Whether we are anchored (beginning-of-line or \G);
583           d) First node (of those at offset 0) which may distinguish positions;
584           We use a)b)d) and multiline-part of c), and try to find a position in the
585           string which does not contradict any of them.
586           */
587            
588           /* Most of decisions we do here should have been done at compile time.
589           The nodes of the REx which we used for the search should have been
590           deleted from the finite automaton. */
591            
592           /* args:
593           * rx: the regex to match against
594           * sv: the SV being matched: only used for utf8 flag; the string
595           * itself is accessed via the pointers below. Note that on
596           * something like an overloaded SV, SvPOK(sv) may be false
597           * and the string pointers may point to something unrelated to
598           * the SV itself.
599           * strbeg: real beginning of string
600           * strpos: the point in the string at which to begin matching
601           * strend: pointer to the byte following the last char of the string
602           * flags currently unused; set to 0
603           * data: currently unused; set to NULL
604           */
605            
606           char *
607 137167073         Perl_re_intuit_start(pTHX_
608           REGEXP * const rx,
609           SV *sv,
610           const char * const strbeg,
611           char *strpos,
612           char *strend,
613           const U32 flags,
614           re_scream_pos_data *data)
615           {
616           dVAR;
617 137167073         struct regexp *const prog = ReANY(rx);
618           SSize_t start_shift = 0;
619           /* Should be nonnegative! */
620           SSize_t end_shift = 0;
621           char *s;
622           SV *check;
623           char *t;
624 137167073         const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
625           I32 ml_anch;
626           char *other_last = NULL; /* other substr checked before this */
627           char *check_at = NULL; /* check substr found at this pos */
628           char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
629 137167073         const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
630 137167073         RXi_GET_DECL(prog,progi);
631           regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
632           regmatch_info *const reginfo = ®info_buf;
633           #ifdef DEBUGGING
634           const char * const i_strpos = strpos;
635           #endif
636 1143398         GET_RE_DEBUG_FLAGS_DECL;
637            
638 495102         PERL_ARGS_ASSERT_RE_INTUIT_START;
639           PERL_UNUSED_ARG(flags);
640           PERL_UNUSED_ARG(data);
641            
642           /* CHR_DIST() would be more correct here but it makes things slow. */
643 1143398         if (prog->minlen > strend - strpos) {
644 136023675         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
645           "String too short... [re_intuit_start]\n"));
646           goto fail;
647           }
648            
649 106664         reginfo->is_utf8_target = cBOOL(utf8_target);
650 136023739         reginfo->info_aux = NULL;
651 137167045         reginfo->strbeg = strbeg;
652 72855894         reginfo->strend = strend;
653 106369435         reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
654 70019930         reginfo->intuit = 1;
655           /* not actually used within intuit, but zero for safety anyway */
656 70019930         reginfo->poscache_maxiter = 0;
657            
658 71770960         if (utf8_target) {
659 95011864         if (!prog->check_utf8 && prog->check_substr)
660 63616418         to_utf8_substr(prog);
661 63616278         check = prog->check_utf8;
662           } else {
663 39943195         if (!prog->check_substr && prog->check_utf8) {
664 39943131         if (! to_byte_substr(prog)) {
665 39942795         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
666           }
667           }
668 39942831         check = prog->check_substr;
669           }
670 416596         if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */
671 416532         && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */
672           {
673 74482         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
674 70404         || ( (prog->extflags & RXf_ANCH_BOL)
675 70404         && !multiline ) ); /* Check after \n? */
676            
677 63438         if (!ml_anch) {
678 10906         if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
679 39526235         && (strpos != strbeg)) {
680 7547081         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
681           goto fail;
682           }
683 6422740         if (prog->check_offset_min == prog->check_offset_max
684 31827961         && !(prog->extflags & RXf_CANY_SEEN)
685 31827961         && ! multiline) /* /m can cause \n's to match that aren't
686           accounted for in the string max length.
687           See [perl #115242] */
688           {
689           /* Substring at constant offset from beg-of-str... */
690           SSize_t slen;
691            
692 31827961         s = HOP3c(strpos, prog->check_offset_min, strend);
693          
694 35137806         if (SvTAIL(check)) {
695 23673511         slen = SvCUR(check); /* >= 1 */
696            
697 23673511         if ( strend - s > slen || strend - s < slen - 1
698 23673511         || (strend - s == slen && strend[-1] != '\n')) {
699 65396085         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
700           goto fail_finish;
701           }
702           /* Now should match s[0..slen-2] */
703 65706360         slen--;
704 98104332         if (slen && (*SvPVX_const(check) != *s
705 0         || (slen > 1
706 0         && memNE(SvPVX_const(check), s, slen)))) {
707           report_neq:
708 98104332         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
709           goto fail_finish;
710           }
711           }
712 62         else if (*SvPVX_const(check) != *s
713 62         || ((slen = SvCUR(check)) > 1
714 98104270         && memNE(SvPVX_const(check), s, slen)))
715           goto report_neq;
716           check_at = s;
717           goto success_at_start;
718           }
719           }
720           /* Match is anchored, but substr is not anchored wrt beg-of-str. */
721           s = strpos;
722 98104270         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
723 98104332         end_shift = prog->check_end_shift;
724          
725 98104332         if (!ml_anch) {
726 84090637         const SSize_t end = prog->check_offset_max + CHR_SVLEN(check)
727 46088044         - (SvTAIL(check) != 0);
728 5113531         const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
729            
730 5113531         if (end_shift < eshift)
731           end_shift = eshift;
732           }
733           }
734           else { /* Can match at random position */
735           ml_anch = 0;
736           s = strpos;
737 1764544         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
738 1764544         end_shift = prog->check_end_shift;
739          
740           /* end shift should be non negative here */
741           }
742            
743           #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
744           if (end_shift < 0)
745           Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
746           (IV)end_shift, RX_PRECOMP(prog));
747           #endif
748            
749           restart:
750           /* Find a possible match in the region s..strend by looking for
751           the "check" substring in the region corrected by start/end_shift. */
752          
753           {
754           SSize_t srch_start_shift = start_shift;
755           SSize_t srch_end_shift = end_shift;
756           U8* start_point;
757           U8* end_point;
758 1764544         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
759 43140         srch_end_shift -= ((strbeg - s) - srch_start_shift);
760 15764         srch_start_shift = strbeg - s;
761           }
762 873639         DEBUG_OPTIMISE_MORE_r({
763           PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
764           (IV)prog->check_offset_min,
765           (IV)srch_start_shift,
766           (IV)srch_end_shift,
767           (IV)prog->check_end_shift);
768           });
769          
770 1764544         if (prog->extflags & RXf_CANY_SEEN) {
771 1764480         start_point= (U8*)(s + srch_start_shift);
772 1764480         end_point= (U8*)(strend - srch_end_shift);
773           } else {
774 1764544         start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
775 1764544         end_point= HOP3(strend, -srch_end_shift, strbeg);
776           }
777 1764544         DEBUG_OPTIMISE_MORE_r({
778           PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
779           (int)(end_point - start_point),
780           (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
781           start_point);
782           });
783            
784 1764544         s = fbm_instr( start_point, end_point,
785           check, multiline ? FBMrf_MULTILINE : 0);
786           }
787           /* Update the count-of-usability, remove useless subpatterns,
788           unshift s. */
789            
790 1764544         DEBUG_EXECUTE_r({
791           RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
792           SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
793           PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
794           (s ? "Found" : "Did not find"),
795           (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
796           ? "anchored" : "floating"),
797           quoted,
798           RE_SV_TAIL(check),
799           (s ? " at offset " : "...\n") );
800           });
801            
802 160862         if (!s)
803           goto fail_finish;
804           /* Finish the diagnostic message */
805 142276         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
806            
807           /* XXX dmq: first branch is for positive lookbehind...
808           Our check string is offset from the beginning of the pattern.
809           So we need to do any stclass tests offset forward from that
810           point. I think. :-(
811           */
812          
813          
814          
815           check_at=s;
816          
817            
818           /* Got a candidate. Check MBOL anchoring, and the *other* substr.
819           Start with the other substr.
820           XXXX no SCREAM optimization yet - and a very coarse implementation
821           XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
822           *always* match. Probably should be marked during compile...
823           Probably it is right to do no SCREAM here...
824           */
825            
826 49209943         if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
827 1603750         : (prog->float_substr && prog->anchored_substr))
828           {
829           /* Take into account the "other" substring. */
830           /* XXXX May be hopelessly wrong for UTF... */
831 1603684         if (!other_last)
832           other_last = strpos;
833 1603684         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
834           do_other_anchored:
835           {
836 3353297         char * const last = HOP3c(s, -start_shift, strbeg);
837           char *last1, *last2;
838           char * const saved_s = s;
839           SV* must;
840            
841 3353297         t = s - prog->check_offset_max;
842 3353297         if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
843 868338         && (!utf8_target
844 3353295         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
845 3353295         && t > strpos)))
846           NOOP;
847           else
848           t = strpos;
849 3353297         t = HOP3c(t, prog->anchored_offset, strend);
850 3353297         if (t < other_last) /* These positions already checked */
851           t = other_last;
852 3353297         last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
853 3353297         if (last < last1)
854           last1 = last;
855           /* XXXX It is not documented what units *_offsets are in.
856           We assume bytes, but this is clearly wrong.
857           Meaning this code needs to be carefully reviewed for errors.
858           dmq.
859           */
860          
861           /* On end-of-str: see comment below. */
862 630018         must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
863 620552         if (must == &PL_sv_undef) {
864           s = (char*)NULL;
865 2723279         DEBUG_r(must = prog->anchored_utf8); /* for debug */
866           }
867           else
868 29032180         s = fbm_instr(
869           (unsigned char*)t,
870           HOP3(HOP3(last1, prog->anchored_offset, strend)
871           + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
872           must,
873           multiline ? FBMrf_MULTILINE : 0
874           );
875 29032180         DEBUG_EXECUTE_r({
876           RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
877           SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
878           PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
879           (s ? "Found" : "Contradicts"),
880           quoted, RE_SV_TAIL(must));
881           });
882          
883          
884 5812949         if (!s) {
885 83286         if (last1 >= last2) {
886 71884         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
887           ", giving up...\n"));
888           goto fail_finish;
889           }
890 9041640         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
891           ", trying floating at offset %ld...\n",
892           (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
893 11988574         other_last = HOP3c(last1, prog->anchored_offset+1, strend);
894 11986670         s = HOP3c(last, 1, strend);
895           goto restart;
896           }
897           else {
898 446794         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
899           (long)(s - i_strpos)));
900 365348         t = HOP3c(s, -prog->anchored_offset, strbeg);
901 36         other_last = HOP3c(s, 1, strend);
902           s = saved_s;
903 36         if (t == strpos)
904           goto try_at_start;
905           goto try_at_offset;
906           }
907           }
908           }
909           else { /* Take into account the floating substring. */
910           char *last, *last1;
911           char * const saved_s = s;
912           SV* must;
913            
914 365312         t = HOP3c(s, -start_shift, strbeg);
915           last1 = last =
916 365312         HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
917 81446         if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
918 81446         last = HOP3c(t, prog->float_max_offset, strend);
919 11539878         s = HOP3c(t, prog->float_min_offset, strend);
920 9354398         if (s < other_last)
921           s = other_last;
922           /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
923 24371921         must = utf8_target ? prog->float_utf8 : prog->float_substr;
924           /* fbm_instr() takes into account exact value of end-of-str
925           if the check is SvTAIL(ed). Since false positives are OK,
926           and end-of-str is not later than strend we are OK. */
927 441177         if (must == &PL_sv_undef) {
928           s = (char*)NULL;
929 29159732         DEBUG_r(must = prog->float_utf8); /* for debug message */
930           }
931           else
932 19063114         s = fbm_instr((unsigned char*)s,
933           (unsigned char*)last + SvCUR(must)
934           - (SvTAIL(must)!=0),
935           must, multiline ? FBMrf_MULTILINE : 0);
936 333948         DEBUG_EXECUTE_r({
937           RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
938           SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
939           PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
940           (s ? "Found" : "Contradicts"),
941           quoted, RE_SV_TAIL(must));
942           });
943 333948         if (!s) {
944 930         if (last1 == last) {
945 9197609         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
946           ", giving up...\n"));
947           goto fail_finish;
948           }
949 9197609         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
950           ", trying anchored starting at offset %ld...\n",
951           (long)(saved_s + 1 - i_strpos)));
952           other_last = last;
953 1903176         s = HOP3c(t, 1, strend);
954           goto restart;
955           }
956           else {
957 194602         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
958           (long)(s - i_strpos)));
959           other_last = s; /* Fix this later. --Hugo */
960           s = saved_s;
961 194602         if (t == strpos)
962           goto try_at_start;
963           goto try_at_offset;
964           }
965           }
966           }
967            
968          
969 194664         t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
970          
971 194664         DEBUG_OPTIMISE_MORE_r(
972           PerlIO_printf(Perl_debug_log,
973           "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
974           (IV)prog->check_offset_min,
975           (IV)prog->check_offset_max,
976           (IV)(s-strpos),
977           (IV)(t-strpos),
978           (IV)(t-s),
979           (IV)(strend-strpos)
980           )
981           );
982            
983 194664         if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
984 48         && (!utf8_target
985 14874731         || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
986 38514130         && t > strpos)))
987           {
988           /* Fixed substring is found far enough so that the match
989           cannot start at strpos. */
990           try_at_offset:
991 4936068         if (ml_anch && t[-1] != '\n') {
992           /* Eventually fbm_*() should handle this, but often
993           anchored_offset is not 0, so this check will not be wasted. */
994           /* XXXX In the code below we prefer to look for "^" even in
995           presence of anchored substrings. And we search even
996           beyond the found float position. These pessimizations
997           are historical artefacts only. */
998           find_anchor:
999 5101775         while (t < strend - prog->minlen) {
1000 131458         if (*t == '\n') {
1001 68510         if (t < check_at - prog->check_offset_min) {
1002 62948         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1003           /* Since we moved from the found position,
1004           we definitely contradict the found anchored
1005           substr. Due to the above check we do not
1006           contradict "check" substr.
1007           Thus we can arrive here only if check substr
1008           is float. Redo checking for "other"=="fixed".
1009           */
1010 4936062         strpos = t + 1;
1011 253464         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1012           PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1013           goto do_other_anchored;
1014           }
1015           /* We don't contradict the found floating substring. */
1016           /* XXXX Why not check for STCLASS? */
1017 4682598         s = t + 1;
1018 4644528         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1019           PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1020           goto set_useful;
1021           }
1022           /* Position contradicts check-string */
1023           /* XXXX probably better to look for check-string
1024           than for "\n", so one should lower the limit for t? */
1025 4936062         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1026           PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1027 4936062         other_last = strpos = s = t + 1;
1028 4936062         goto restart;
1029           }
1030 295992         t++;
1031           }
1032 194332         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1033           PL_colors[0], PL_colors[1]));
1034           goto fail_finish;
1035           }
1036           else {
1037 178632         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1038           PL_colors[0], PL_colors[1]));
1039           }
1040           s = t;
1041           set_useful:
1042 178632         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1043           }
1044           else {
1045           /* The found string does not prohibit matching at strpos,
1046           - no optimization of calling REx engine can be performed,
1047           unless it was an MBOL and we are not after MBOL,
1048           or a future STCLASS check will fail this. */
1049           try_at_start:
1050           /* Even in this situation we may use MBOL flag if strpos is offset
1051           wrt the start of the string. */
1052 12898         if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1053           /* May be due to an implicit anchor of m{.*foo} */
1054 119994         && !(prog->intflags & PREGf_IMPLICIT))
1055           {
1056           t = strpos;
1057           goto find_anchor;
1058           }
1059 120052         DEBUG_EXECUTE_r( if (ml_anch)
1060           PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1061           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1062           );
1063           success_at_start:
1064 36136         if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1065 4424         && (utf8_target ? (
1066 4210         prog->check_utf8 /* Could be deleted already */
1067 165786         && --BmUSEFUL(prog->check_utf8) < 0
1068 54422         && (prog->check_utf8 == prog->float_utf8)
1069           ) : (
1070 54480         prog->check_substr /* Could be deleted already */
1071 111422         && --BmUSEFUL(prog->check_substr) < 0
1072 111462         && (prog->check_substr == prog->float_substr)
1073           )))
1074           {
1075           /* If flags & SOMETHING - do not do it many times on the same match */
1076 111462         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1077           /* XXX Does the destruction order has to change with utf8_target? */
1078 38218138         SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1079 98747393         SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1080 118156009         prog->check_substr = prog->check_utf8 = NULL; /* disable */
1081 26704655         prog->float_substr = prog->float_utf8 = NULL; /* clear */
1082           check = NULL; /* abort */
1083           s = strpos;
1084           /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1085           see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1086 26704655         if (prog->intflags & PREGf_IMPLICIT)
1087 26704655         prog->extflags &= ~RXf_ANCH_MBOL;
1088           /* XXXX This is a remnant of the old implementation. It
1089           looks wasteful, since now INTUIT can use many
1090           other heuristics. */
1091 26704655         prog->extflags &= ~RXf_USE_INTUIT;
1092           /* XXXX What other flags might need to be cleared in this branch? */
1093           }
1094           else
1095           s = strpos;
1096           }
1097            
1098           /* Last resort... */
1099           /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1100           /* trie stclasses are too expensive to use here, we are better off to
1101           leave it to regmatch itself */
1102 26704719         if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1103           /* minlen == 0 is possible if regstclass is \b or \B,
1104           and the fixed substr is ''$.
1105           Since minlen is already taken into account, s+1 is before strend;
1106           accidentally, minlen >= 1 guaranties no false positives at s + 1
1107           even for \b or \B. But (minlen? 1 : 0) below assumes that
1108           regstclass does not come from lookahead... */
1109           /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1110           This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1111 26704655         const U8* const str = (U8*)STRING(progi->regstclass);
1112           /* XXX this value could be pre-computed */
1113 15725172         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1114 1134185         ? (reginfo->is_utf8_pat
1115 87529661         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1116 76         : STR_LEN(progi->regstclass))
1117           : 1);
1118           char * endpos;
1119 15198         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1120 5528         endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1121 146108         else if (prog->float_substr || prog->float_utf8)
1122 13040         endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1123           else
1124           endpos= strend;
1125          
1126 446420         if (checked_upto < s)
1127           checked_upto = s;
1128 72536         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1129           (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1130            
1131           t = s;
1132 72536         s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1133           reginfo);
1134 403254         if (s) {
1135           checked_upto = s;
1136           } else {
1137           #ifdef DEBUGGING
1138           const char *what = NULL;
1139           #endif
1140 403254         if (endpos == strend) {
1141 403254         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1142           "Could not match STCLASS...\n") );
1143           goto fail;
1144           }
1145 403254         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1146           "This position contradicts STCLASS...\n") );
1147 0         if ((prog->extflags & RXf_ANCH) && !ml_anch)
1148           goto fail;
1149 403254         checked_upto = HOPBACKc(endpos, start_shift);
1150 403254         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1151           (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1152           /* Contradict one of substrings */
1153 403254         if (prog->anchored_substr || prog->anchored_utf8) {
1154 119372         if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1155 2223869         DEBUG_EXECUTE_r( what = "anchored" );
1156           hop_and_restart:
1157 223040         s = HOP3c(t, 1, strend);
1158 223040         if (s + start_shift + end_shift > strend) {
1159           /* XXXX Should be taken into account earlier? */
1160 223040         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1161           "Could not match STCLASS...\n") );
1162           goto fail;
1163           }
1164 223040         if (!check)
1165           goto giveup;
1166 223040         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1167           "Looking for %s substr starting at offset %ld...\n",
1168           what, (long)(s + start_shift - i_strpos)) );
1169           goto restart;
1170           }
1171           /* Have both, check_string is floating */
1172 223040         if (t + start_shift >= check_at) /* Contradicts floating=check */
1173           goto retry_floating_check;
1174           /* Recheck anchored substring, but not floating... */
1175           s = check_at;
1176 223040         if (!check)
1177           goto giveup;
1178 223040         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1179           "Looking for anchored substr starting at offset %ld...\n",
1180           (long)(other_last - i_strpos)) );
1181           goto do_other_anchored;
1182           }
1183           /* Another way we could have checked stclass at the
1184           current position only: */
1185 111520         if (ml_anch) {
1186 365392         s = t = t + 1;
1187 311256         if (!check)
1188           goto giveup;
1189 311256         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1190           "Looking for /%s^%s/m starting at offset %ld...\n",
1191           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1192           goto try_at_offset;
1193           }
1194 218992         if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1195           goto fail;
1196           /* Check is floating substring. */
1197           retry_floating_check:
1198 142352         t = check_at - start_shift;
1199 32         DEBUG_EXECUTE_r( what = "floating" );
1200           goto hop_and_restart;
1201           }
1202 36         if (t != s) {
1203 32         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1204           "By STCLASS: moving %ld --> %ld\n",
1205           (long)(t - i_strpos), (long)(s - i_strpos))
1206           );
1207           }
1208           else {
1209 44         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1210           "Does not contradict STCLASS...\n");
1211           );
1212           }
1213           }
1214           giveup:
1215 625788         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1216           PL_colors[4], (check ? "Guessed" : "Giving up"),
1217           PL_colors[5], (long)(s - i_strpos)) );
1218           return s;
1219            
1220           fail_finish: /* Substring not found */
1221 91406         if (prog->check_substr || prog->check_utf8) /* could be removed already */
1222 573         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1223           fail:
1224 116         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1225           PL_colors[4], PL_colors[5]));
1226           return NULL;
1227           }
1228            
1229           #define DECL_TRIE_TYPE(scan) \
1230           const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1231           trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
1232           trie_type = ((scan->flags == EXACT) \
1233           ? (utf8_target ? trie_utf8 : trie_plain) \
1234           : (scan->flags == EXACTFA) \
1235           ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1236           : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1237            
1238           #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1239           STMT_START { \
1240           STRLEN skiplen; \
1241           U8 flags = FOLD_FLAGS_FULL; \
1242           switch (trie_type) { \
1243           case trie_utf8_exactfa_fold: \
1244           flags |= FOLD_FLAGS_NOMIX_ASCII; \
1245           /* FALL THROUGH */ \
1246           case trie_utf8_fold: \
1247           if ( foldlen>0 ) { \
1248           uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1249           foldlen -= len; \
1250           uscan += len; \
1251           len=0; \
1252           } else { \
1253           uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL); \
1254           len = UTF8SKIP(uc); \
1255           skiplen = UNISKIP( uvc ); \
1256           foldlen -= skiplen; \
1257           uscan = foldbuf + skiplen; \
1258           } \
1259           break; \
1260           case trie_latin_utf8_exactfa_fold: \
1261           flags |= FOLD_FLAGS_NOMIX_ASCII; \
1262           /* FALL THROUGH */ \
1263           case trie_latin_utf8_fold: \
1264           if ( foldlen>0 ) { \
1265           uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1266           foldlen -= len; \
1267           uscan += len; \
1268           len=0; \
1269           } else { \
1270           len = 1; \
1271           uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1272           skiplen = UNISKIP( uvc ); \
1273           foldlen -= skiplen; \
1274           uscan = foldbuf + skiplen; \
1275           } \
1276           break; \
1277           case trie_utf8: \
1278           uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1279           break; \
1280           case trie_plain: \
1281           uvc = (UV)*uc; \
1282           len = 1; \
1283           } \
1284           if (uvc < 256) { \
1285           charid = trie->charmap[ uvc ]; \
1286           } \
1287           else { \
1288           charid = 0; \
1289           if (widecharmap) { \
1290           SV** const svpp = hv_fetch(widecharmap, \
1291           (char*)&uvc, sizeof(UV), 0); \
1292           if (svpp) \
1293           charid = (U16)SvIV(*svpp); \
1294           } \
1295           } \
1296           } STMT_END
1297            
1298           #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1299           STMT_START { \
1300           while (s <= e) { \
1301           if ( (CoNd) \
1302           && (ln == 1 || folder(s, pat_string, ln)) \
1303           && (reginfo->intuit || regtry(reginfo, &s)) )\
1304           goto got_it; \
1305           s++; \
1306           } \
1307           } STMT_END
1308            
1309           #define REXEC_FBC_UTF8_SCAN(CoDe) \
1310           STMT_START { \
1311           while (s < strend) { \
1312           CoDe \
1313           s += UTF8SKIP(s); \
1314           } \
1315           } STMT_END
1316            
1317           #define REXEC_FBC_SCAN(CoDe) \
1318           STMT_START { \
1319           while (s < strend) { \
1320           CoDe \
1321           s++; \
1322           } \
1323           } STMT_END
1324            
1325           #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1326           REXEC_FBC_UTF8_SCAN( \
1327           if (CoNd) { \
1328           if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1329           goto got_it; \
1330           else \
1331           tmp = doevery; \
1332           } \
1333           else \
1334           tmp = 1; \
1335           )
1336            
1337           #define REXEC_FBC_CLASS_SCAN(CoNd) \
1338           REXEC_FBC_SCAN( \
1339           if (CoNd) { \
1340           if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1341           goto got_it; \
1342           else \
1343           tmp = doevery; \
1344           } \
1345           else \
1346           tmp = 1; \
1347           )
1348            
1349           #define REXEC_FBC_TRYIT \
1350           if ((reginfo->intuit || regtry(reginfo, &s))) \
1351           goto got_it
1352            
1353           #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1354           if (utf8_target) { \
1355           REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1356           } \
1357           else { \
1358           REXEC_FBC_CLASS_SCAN(CoNd); \
1359           }
1360          
1361           #define DUMP_EXEC_POS(li,s,doutf8) \
1362           dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1363           startpos, doutf8)
1364            
1365            
1366           #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1367           tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1368           tmp = TEST_NON_UTF8(tmp); \
1369           REXEC_FBC_UTF8_SCAN( \
1370           if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1371           tmp = !tmp; \
1372           IF_SUCCESS; \
1373           } \
1374           else { \
1375           IF_FAIL; \
1376           } \
1377           ); \
1378            
1379           #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1380           if (s == reginfo->strbeg) { \
1381           tmp = '\n'; \
1382           } \
1383           else { \
1384           U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1385           tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1386           } \
1387           tmp = TeSt1_UtF8; \
1388           LOAD_UTF8_CHARCLASS_ALNUM(); \
1389           REXEC_FBC_UTF8_SCAN( \
1390           if (tmp == ! (TeSt2_UtF8)) { \
1391           tmp = !tmp; \
1392           IF_SUCCESS; \
1393           } \
1394           else { \
1395           IF_FAIL; \
1396           } \
1397           ); \
1398            
1399           /* The only difference between the BOUND and NBOUND cases is that
1400           * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1401           * NBOUND. This is accomplished by passing it in either the if or else clause,
1402           * with the other one being empty */
1403           #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1404           FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1405            
1406           #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1407           FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1408            
1409           #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1410           FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1411            
1412           #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1413           FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1414            
1415            
1416           /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1417           * be passed in completely with the variable name being tested, which isn't
1418           * such a clean interface, but this is easier to read than it was before. We
1419           * are looking for the boundary (or non-boundary between a word and non-word
1420           * character. The utf8 and non-utf8 cases have the same logic, but the details
1421           * must be different. Find the "wordness" of the character just prior to this
1422           * one, and compare it with the wordness of this one. If they differ, we have
1423           * a boundary. At the beginning of the string, pretend that the previous
1424           * character was a new-line */
1425           #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1426           if (utf8_target) { \
1427           UTF8_CODE \
1428           } \
1429           else { /* Not utf8 */ \
1430           tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1431           tmp = TEST_NON_UTF8(tmp); \
1432           REXEC_FBC_SCAN( \
1433           if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1434           tmp = !tmp; \
1435           IF_SUCCESS; \
1436           } \
1437           else { \
1438           IF_FAIL; \
1439           } \
1440           ); \
1441           } \
1442           if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
1443           goto got_it;
1444            
1445           /* We know what class REx starts with. Try to find this position... */
1446           /* if reginfo->intuit, its a dryrun */
1447           /* annoyingly all the vars in this routine have different names from their counterparts
1448           in regmatch. /grrr */
1449            
1450           STATIC char *
1451 68         S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1452           const char *strend, regmatch_info *reginfo)
1453           {
1454           dVAR;
1455 44         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1456           char *pat_string; /* The pattern's exactish string */
1457           char *pat_end; /* ptr to end char of pat_string */
1458           re_fold_t folder; /* Function for computing non-utf8 folds */
1459           const U8 *fold_array; /* array for folding ords < 256 */
1460           STRLEN ln;
1461           STRLEN lnc;
1462           U8 c1;
1463           U8 c2;
1464           char *e;
1465           I32 tmp = 1; /* Scratch variable? */
1466 49         const bool utf8_target = reginfo->is_utf8_target;
1467           UV utf8_fold_flags = 0;
1468 880         const bool is_utf8_pat = reginfo->is_utf8_pat;
1469           bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1470           with a result inverts that result, as 0^1 =
1471           1 and 1^1 = 0 */
1472           _char_class_number classnum;
1473            
1474 3264183         RXi_GET_DECL(prog,progi);
1475            
1476 9215480         PERL_ARGS_ASSERT_FIND_BYCLASS;
1477            
1478           /* We know what class it must start with. */
1479 145182         switch (OP(c)) {
1480           case ANYOF:
1481           case ANYOF_SYNTHETIC:
1482           case ANYOF_WARN_SUPER:
1483 3512         if (utf8_target) {
1484 83952175         REXEC_FBC_UTF8_CLASS_SCAN(
1485           reginclass(prog, c, (U8*)s, utf8_target));
1486           }
1487           else {
1488 192180         REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1489           }
1490           break;
1491           case CANY:
1492 92212         REXEC_FBC_SCAN(
1493           if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1494           goto got_it;
1495           else
1496           tmp = doevery;
1497           );
1498           break;
1499            
1500           case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1501 439544         assert(! is_utf8_pat);
1502           /* FALL THROUGH */
1503           case EXACTFA:
1504 439544         if (is_utf8_pat || utf8_target) {
1505           utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1506           goto do_exactf_utf8;
1507           }
1508           fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1509           folder = foldEQ_latin1; /* /a, except the sharp s one which */
1510           goto do_exactf_non_utf8; /* isn't dealt with by these */
1511            
1512           case EXACTF: /* This node only generated for non-utf8 patterns */
1513 375864         assert(! is_utf8_pat);
1514 354020         if (utf8_target) {
1515           utf8_fold_flags = 0;
1516           goto do_exactf_utf8;
1517           }
1518           fold_array = PL_fold;
1519           folder = foldEQ;
1520           goto do_exactf_non_utf8;
1521            
1522           case EXACTFL:
1523 149322         if (is_utf8_pat || utf8_target) {
1524           utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1525           goto do_exactf_utf8;
1526           }
1527           fold_array = PL_fold_locale;
1528           folder = foldEQ_locale;
1529           goto do_exactf_non_utf8;
1530            
1531           case EXACTFU_SS:
1532 128470         if (is_utf8_pat) {
1533           utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1534           }
1535           goto do_exactf_utf8;
1536            
1537           case EXACTFU:
1538 141610         if (is_utf8_pat || utf8_target) {
1539 20850         utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1540 14284         goto do_exactf_utf8;
1541           }
1542            
1543           /* Any 'ss' in the pattern should have been replaced by regcomp,
1544           * so we don't have to worry here about this single special case
1545           * in the Latin1 range */
1546           fold_array = PL_fold_latin1;
1547           folder = foldEQ_latin1;
1548            
1549           /* FALL THROUGH */
1550            
1551           do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1552           are no glitches with fold-length differences
1553           between the target string and pattern */
1554            
1555           /* The idea in the non-utf8 EXACTF* cases is to first find the
1556           * first character of the EXACTF* node and then, if necessary,
1557           * case-insensitively compare the full text of the node. c1 is the
1558           * first character. c2 is its fold. This logic will not work for
1559           * Unicode semantics and the german sharp ss, which hence should
1560           * not be compiled into a node that gets here. */
1561 136874         pat_string = STRING(c);
1562 200554         ln = STR_LEN(c); /* length to match in octets/bytes */
1563            
1564           /* We know that we have to match at least 'ln' bytes (which is the
1565           * same as characters, since not utf8). If we have to match 3
1566           * characters, and there are only 2 availabe, we know without
1567           * trying that it will fail; so don't start a match past the
1568           * required minimum number from the far end */
1569 169585         e = HOP3c(strend, -((SSize_t)ln), s);
1570            
1571 10745         if (reginfo->intuit && e < s) {
1572 4554         e = s; /* Due to minlen logic of intuit() */
1573           }
1574            
1575 6470         c1 = *pat_string;
1576 145308         c2 = fold_array[c1];
1577 0         if (c1 == c2) { /* If char and fold are the same */
1578 204698         REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1579           }
1580           else {
1581 822         REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1582           }
1583           break;
1584            
1585           do_exactf_utf8:
1586           {
1587           unsigned expansion;
1588            
1589           /* If one of the operands is in utf8, we can't use the simpler folding
1590           * above, due to the fact that many different characters can have the
1591           * same fold, or portion of a fold, or different- length fold */
1592 102760         pat_string = STRING(c);
1593 822         ln = STR_LEN(c); /* length to match in octets/bytes */
1594 6322109         pat_end = pat_string + ln;
1595           lnc = is_utf8_pat /* length to match in characters */
1596           ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1597 125358         : ln;
1598            
1599           /* We have 'lnc' characters to match in the pattern, but because of
1600           * multi-character folding, each character in the target can match
1601           * up to 3 characters (Unicode guarantees it will never exceed
1602           * this) if it is utf8-encoded; and up to 2 if not (based on the
1603           * fact that the Latin 1 folds are already determined, and the
1604           * only multi-char fold in that range is the sharp-s folding to
1605           * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1606           * string character. Adjust lnc accordingly, rounding up, so that
1607           * if we need to match at least 4+1/3 chars, that really is 5. */
1608 125358         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1609 125358         lnc = (lnc + expansion - 1) / expansion;
1610            
1611           /* As in the non-UTF8 case, if we have to match 3 characters, and
1612           * only 2 are left, it's guaranteed to fail, so don't start a
1613           * match that would require us to go beyond the end of the string
1614           */
1615 125358         e = HOP3c(strend, -((SSize_t)lnc), s);
1616            
1617 125358         if (reginfo->intuit && e < s) {
1618 125358         e = s; /* Due to minlen logic of intuit() */
1619           }
1620            
1621           /* XXX Note that we could recalculate e to stop the loop earlier,
1622           * as the worst case expansion above will rarely be met, and as we
1623           * go along we would usually find that e moves further to the left.
1624           * This would happen only after we reached the point in the loop
1625           * where if there were no expansion we should fail. Unclear if
1626           * worth the expense */
1627            
1628 125358         while (s <= e) {
1629 125358         char *my_strend= (char *)strend;
1630 125358         if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1631           pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1632 125358         && (reginfo->intuit || regtry(reginfo, &s)) )
1633           {
1634           goto got_it;
1635           }
1636 125358         s += (utf8_target) ? UTF8SKIP(s) : 1;
1637           }
1638           break;
1639           }
1640           case BOUNDL:
1641 125358         RXp_MATCH_TAINTED_on(prog);
1642 125358         FBC_BOUND(isWORDCHAR_LC,
1643           isWORDCHAR_LC_uvchr(tmp),
1644           isWORDCHAR_LC_utf8((U8*)s));
1645           break;
1646           case NBOUNDL:
1647 125358         RXp_MATCH_TAINTED_on(prog);
1648 123128         FBC_NBOUND(isWORDCHAR_LC,
1649           isWORDCHAR_LC_uvchr(tmp),
1650           isWORDCHAR_LC_utf8((U8*)s));
1651           break;
1652           case BOUND:
1653 116676         FBC_BOUND(isWORDCHAR,
1654           isWORDCHAR_uni(tmp),
1655           cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1656           break;
1657           case BOUNDA:
1658 116349         FBC_BOUND_NOLOAD(isWORDCHAR_A,
1659           isWORDCHAR_A(tmp),
1660           isWORDCHAR_A((U8*)s));
1661           break;
1662           case NBOUND:
1663 9336         FBC_NBOUND(isWORDCHAR,
1664           isWORDCHAR_uni(tmp),
1665           cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1666           break;
1667           case NBOUNDA:
1668 126450         FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1669           isWORDCHAR_A(tmp),
1670           isWORDCHAR_A((U8*)s));
1671           break;
1672           case BOUNDU:
1673 126350         FBC_BOUND(isWORDCHAR_L1,
1674           isWORDCHAR_uni(tmp),
1675           cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1676           break;
1677           case NBOUNDU:
1678 126350         FBC_NBOUND(isWORDCHAR_L1,
1679           isWORDCHAR_uni(tmp),
1680           cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1681           break;
1682           case LNBREAK:
1683 126350         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1684           is_LNBREAK_latin1_safe(s, strend)
1685           );
1686           break;
1687            
1688           /* The argument to all the POSIX node types is the class number to pass to
1689           * _generic_isCC() to build a mask for searching in PL_charclass[] */
1690            
1691           case NPOSIXL:
1692           to_complement = 1;
1693           /* FALLTHROUGH */
1694            
1695           case POSIXL:
1696 126350         RXp_MATCH_TAINTED_on(prog);
1697 126350         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1698           to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1699           break;
1700            
1701           case NPOSIXD:
1702           to_complement = 1;
1703           /* FALLTHROUGH */
1704            
1705           case POSIXD:
1706 402979         if (utf8_target) {
1707           goto posix_utf8;
1708           }
1709           goto posixa;
1710            
1711           case NPOSIXA:
1712 339804         if (utf8_target) {
1713           /* The complement of something that matches only ASCII matches all
1714           * UTF-8 variant code points, plus everything in ASCII that isn't
1715           * in the class */
1716 339804         REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1717           || ! _generic_isCC_A(*s, FLAGS(c)));
1718           break;
1719           }
1720            
1721           to_complement = 1;
1722           /* FALLTHROUGH */
1723            
1724           case POSIXA:
1725           posixa:
1726           /* Don't need to worry about utf8, as it can match only a single
1727           * byte invariant character. */
1728 171840         REXEC_FBC_CLASS_SCAN(
1729           to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1730           break;
1731            
1732           case NPOSIXU:
1733           to_complement = 1;
1734           /* FALLTHROUGH */
1735            
1736           case POSIXU:
1737 147204         if (! utf8_target) {
1738 2364         REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1739           FLAGS(c))));
1740           }
1741           else {
1742            
1743           posix_utf8:
1744 1548         classnum = (_char_class_number) FLAGS(c);
1745 4893310         if (classnum < _FIRST_NON_SWASH_CC) {
1746 4746922         while (s < strend) {
1747            
1748           /* We avoid loading in the swash as long as possible, but
1749           * should we have to, we jump to a separate loop. This
1750           * extra 'if' statement is what keeps this code from being
1751           * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1752 147204         if (UTF8_IS_ABOVE_LATIN1(*s)) {
1753           goto found_above_latin1;
1754           }
1755 171840         if ((UTF8_IS_INVARIANT(*s)
1756 258628         && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1757           classnum)))
1758 45630         || (UTF8_IS_DOWNGRADEABLE_START(*s)
1759 45630         && to_complement ^ cBOOL(
1760           _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1761           *(s + 1)),
1762           classnum))))
1763           {
1764 45630         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1765           goto got_it;
1766           else {
1767           tmp = doevery;
1768           }
1769           }
1770           else {
1771           tmp = 1;
1772           }
1773 258628         s += UTF8SKIP(s);
1774           }
1775           }
1776 258628         else switch (classnum) { /* These classes are implemented as
1777           macros */
1778           case _CC_ENUM_SPACE: /* XXX would require separate code if we
1779           revert the change of \v matching this */
1780           /* FALL THROUGH */
1781            
1782           case _CC_ENUM_PSXSPC:
1783 253708         REXEC_FBC_UTF8_CLASS_SCAN(
1784           to_complement ^ cBOOL(isSPACE_utf8(s)));
1785           break;
1786            
1787           case _CC_ENUM_BLANK:
1788 131774         REXEC_FBC_UTF8_CLASS_SCAN(
1789           to_complement ^ cBOOL(isBLANK_utf8(s)));
1790           break;
1791            
1792           case _CC_ENUM_XDIGIT:
1793 289274         REXEC_FBC_UTF8_CLASS_SCAN(
1794           to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1795           break;
1796            
1797           case _CC_ENUM_VERTSPACE:
1798 289274         REXEC_FBC_UTF8_CLASS_SCAN(
1799           to_complement ^ cBOOL(isVERTWS_utf8(s)));
1800           break;
1801            
1802           case _CC_ENUM_CNTRL:
1803 334902         REXEC_FBC_UTF8_CLASS_SCAN(
1804           to_complement ^ cBOOL(isCNTRL_utf8(s)));
1805           break;
1806            
1807           default:
1808 361480         Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1809           assert(0); /* NOTREACHED */
1810           }
1811           }
1812           break;
1813            
1814           found_above_latin1: /* Here we have to load a swash to get the result
1815           for the current code point */
1816 180740         if (! PL_utf8_swash_ptrs[classnum]) {
1817 180736         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1818 180654         PL_utf8_swash_ptrs[classnum] =
1819 172262         _core_swash_init("utf8", swash_property_names[classnum],
1820           &PL_sv_undef, 1, 0, NULL, &flags);
1821           }
1822            
1823           /* This is a copy of the loop above for swash classes, though using the
1824           * FBC macro instead of being expanded out. Since we've loaded the
1825           * swash, we don't have to check for that each time through the loop */
1826 76568         REXEC_FBC_UTF8_CLASS_SCAN(
1827           to_complement ^ cBOOL(_generic_utf8(
1828           classnum,
1829           s,
1830           swash_fetch(PL_utf8_swash_ptrs[classnum],
1831           (U8 *) s, TRUE))));
1832           break;
1833            
1834           case AHOCORASICKC:
1835           case AHOCORASICK:
1836           {
1837 76572         DECL_TRIE_TYPE(c);
1838           /* what trie are we using right now */
1839 258632         reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1840 258632         reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1841 91172         HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1842            
1843 129728         const char *last_start = strend - trie->minlen;
1844           #ifdef DEBUGGING
1845 126354         const char *real_start = s;
1846           #endif
1847 44746         STRLEN maxlen = trie->maxlen;
1848           SV *sv_points;
1849           U8 **points; /* map of where we were in the input string
1850           when reading a given char. For ASCII this
1851           is unnecessary overhead as the relationship
1852           is always 1:1, but for Unicode, especially
1853           case folded Unicode this is not true. */
1854           U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1855           U8 *bitmap=NULL;
1856            
1857            
1858 44746         GET_RE_DEBUG_FLAGS_DECL;
1859            
1860           /* We can't just allocate points here. We need to wrap it in
1861           * an SV so it gets freed properly if there is a croak while
1862           * running the match */
1863 126354         ENTER;
1864 45178         SAVETMPS;
1865 45178         sv_points=newSV(maxlen * sizeof(U8 *));
1866 44086         SvCUR_set(sv_points,
1867           maxlen * sizeof(U8 *));
1868 44086         SvPOK_on(sv_points);
1869 44086         sv_2mortal(sv_points);
1870 1096         points=(U8**)SvPV_nolen(sv_points );
1871 81280         if ( trie_type != trie_utf8_fold
1872 81280         && (trie->bitmap || OP(c)==AHOCORASICKC) )
1873           {
1874 81278         if (trie->bitmap)
1875 2         bitmap=(U8*)trie->bitmap;
1876           else
1877 19919356         bitmap=(U8*)ANYOF_BITMAP(c);
1878           }
1879           /* this is the Aho-Corasick algorithm modified a touch
1880           to include special handling for long "unknown char" sequences.
1881           The basic idea being that we use AC as long as we are dealing
1882           with a possible matching char, when we encounter an unknown char
1883           (and we have not encountered an accepting state) we scan forward
1884           until we find a legal starting char.
1885           AC matching is basically that of trie matching, except that when
1886           we encounter a failing transition, we fall back to the current
1887           states "fail state", and try the current char again, a process
1888           we repeat until we reach the root state, state 1, or a legal
1889           transition. If we fail on the root state then we can either
1890           terminate if we have reached an accepting state previously, or
1891           restart the entire process from the beginning if we have not.
1892            
1893           */
1894 98669747         while (s <= last_start) {
1895 98669747         const U32 uniflags = UTF8_ALLOW_DEFAULT;
1896 96472125         U8 *uc = (U8*)s;
1897           U16 charid = 0;
1898           U32 base = 1;
1899           U32 state = 1;
1900 96104099         UV uvc = 0;
1901 90639355         STRLEN len = 0;
1902 90639355         STRLEN foldlen = 0;
1903           U8 *uscan = (U8*)NULL;
1904           U8 *leftmost = NULL;
1905           #ifdef DEBUGGING
1906           U32 accepted_word= 0;
1907           #endif
1908           U32 pointpos = 0;
1909            
1910 90639371         while ( state && uc <= (U8*)strend ) {
1911           int failed=0;
1912 74096421         U32 word = aho->states[ state ].wordnum;
1913            
1914 35475987         if( state==1 ) {
1915 56         if ( bitmap ) {
1916 54         DEBUG_TRIE_EXECUTE_r(
1917           if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1918           dump_exec_pos( (char *)uc, c, strend, real_start,
1919           (char *)uc, utf8_target );
1920           PerlIO_printf( Perl_debug_log,
1921           " Scanning for legal start char...\n");
1922           }
1923           );
1924 145899439         if (utf8_target) {
1925 60628124         while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1926 96104095         uc += UTF8SKIP(uc);
1927           }
1928           } else {
1929 96104137         while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1930 96104133         uc++;
1931           }
1932           }
1933 96104099         s= (char *)uc;
1934           }
1935 368032         if (uc >(U8*)last_start) break;
1936           }
1937            
1938 368040         if ( word ) {
1939 301308         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1940 22254         if (!leftmost || lpos < leftmost) {
1941           DEBUG_r(accepted_word=word);
1942           leftmost= lpos;
1943           }
1944 22254         if (base==0) break;
1945            
1946           }
1947 66734         points[pointpos++ % maxlen]= uc;
1948 368040         if (foldlen || uc < (U8*)strend) {
1949 368038         REXEC_TRIE_READ_CHAR(trie_type, trie,
1950           widecharmap, uc,
1951           uscan, len, uvc, charid, foldlen,
1952           foldbuf, uniflags);
1953 368038         DEBUG_TRIE_EXECUTE_r({
1954           dump_exec_pos( (char *)uc, c, strend,
1955           real_start, s, utf8_target);
1956           PerlIO_printf(Perl_debug_log,
1957           " Charid:%3u CP:%4"UVxf" ",
1958           charid, uvc);
1959           });
1960           }
1961           else {
1962 368028         len = 0;
1963           charid = 0;
1964           }
1965            
1966            
1967           do {
1968           #ifdef DEBUGGING
1969 368042         word = aho->states[ state ].wordnum;
1970           #endif
1971 96472137         base = aho->states[ state ].trans.base;
1972            
1973 96472137         DEBUG_TRIE_EXECUTE_r({
1974           if (failed)
1975           dump_exec_pos( (char *)uc, c, strend, real_start,
1976           s, utf8_target );
1977           PerlIO_printf( Perl_debug_log,
1978           "%sState: %4"UVxf", word=%"UVxf,
1979           failed ? " Fail transition to " : "",
1980           (UV)state, (UV)word);
1981           });
1982 16         if ( base ) {
1983           U32 tmp;
1984           I32 offset;
1985 24         if (charid &&
1986 20         ( ((offset = base + charid
1987 3296443         - 1 - trie->uniquecharcount)) >= 0)
1988 2197632         && ((U32)offset < trie->lasttrans)
1989 2197632         && trie->trans[offset].check == state
1990 2197632         && (tmp=trie->trans[offset].next))
1991           {
1992 2197632         DEBUG_TRIE_EXECUTE_r(
1993           PerlIO_printf( Perl_debug_log," - legal\n"));
1994           state = tmp;
1995 98669753         break;
1996           }
1997           else {
1998 263904222         DEBUG_TRIE_EXECUTE_r(
1999           PerlIO_printf( Perl_debug_log," - fail\n"));
2000           failed = 1;
2001 263904222         state = aho->fail[state];
2002           }
2003           }
2004           else {
2005           /* we must be accepting here */
2006 263904220         DEBUG_TRIE_EXECUTE_r(
2007           PerlIO_printf( Perl_debug_log," - accepting\n"));
2008           failed = 1;
2009           break;
2010           }
2011 263904222         } while(state);
2012 14         uc += len;
2013 263904232         if (failed) {
2014 3606940         if (leftmost)
2015           break;
2016 3606938         if (!state) state = 1;
2017           }
2018           }
2019 7170110         if ( aho->states[ state ].wordnum ) {
2020 2026462         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2021 6357987         if (!leftmost || lpos < leftmost) {
2022 3606936         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2023           leftmost = lpos;
2024           }
2025           }
2026 3155006         if (leftmost) {
2027 3155004         s = (char*)leftmost;
2028 3155004         DEBUG_TRIE_EXECUTE_r({
2029           PerlIO_printf(
2030           Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2031           (UV)accepted_word, (IV)(s - real_start)
2032           );
2033           });
2034 451936         if (reginfo->intuit || regtry(reginfo, &s)) {
2035 62         FREETMPS;
2036 30         LEAVE;
2037 451876         goto got_it;
2038           }
2039 263904190         s = HOPc(s,1);
2040 263904190         DEBUG_TRIE_EXECUTE_r({
2041           PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2042           });
2043           } else {
2044 252038470         DEBUG_TRIE_EXECUTE_r(
2045           PerlIO_printf( Perl_debug_log,"No match.\n"));
2046           break;
2047           }
2048           }
2049 252038470         FREETMPS;
2050 252038470         LEAVE;
2051           }
2052 136791179         break;
2053           default:
2054 136791177         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2055           break;
2056           }
2057           return 0;
2058           got_it:
2059 136791179         return s;
2060           }
2061            
2062           /* set RX_SAVED_COPY, RX_SUBBEG etc.
2063           * flags have same meanings as with regexec_flags() */
2064            
2065           static void
2066 37852370         S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2067           char *strbeg,
2068           char *strend,
2069           SV *sv,
2070           U32 flags,
2071           bool utf8_target)
2072           {
2073 2754166         struct regexp *const prog = ReANY(rx);
2074            
2075 1836252         if (flags & REXEC_COPY_STR) {
2076           #ifdef PERL_ANY_COW
2077 1836252         if (SvCANCOW(sv)) {
2078 2754166         if (DEBUG_C_TEST) {
2079 1836188         PerlIO_printf(Perl_debug_log,
2080           "Copy on write: regexp capture, type %d\n",
2081 3672376         (int) SvTYPE(sv));
2082           }
2083           /* Create a new COW SV to share the match string and store
2084           * in saved_copy, unless the current COW SV in saved_copy
2085           * is valid and suitable for our purpose */
2086 55148         if (( prog->saved_copy
2087 1891276         && SvIsCOW(prog->saved_copy)
2088 1836192         && SvPOKp(prog->saved_copy)
2089           && SvIsCOW(sv)
2090 1670720         && SvPOKp(sv)
2091 151263413         && SvPVX(sv) == SvPVX(prog->saved_copy)))
2092           {
2093           /* just reuse saved_copy SV */
2094 151263413         if (RXp_MATCH_COPIED(prog)) {
2095 151263341         Safefree(prog->subbeg);
2096 0         RXp_MATCH_COPIED_off(prog);
2097           }
2098           }
2099           else {
2100           /* create new COW SV to share string */
2101 151263461         RX_MATCH_COPY_FREE(rx);
2102 151263401         prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2103           }
2104 151263405         prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2105 151263405         assert (SvPOKp(prog->saved_copy));
2106 151263405         prog->sublen = strend - strbeg;
2107 151263405         prog->suboffset = 0;
2108 151263405         prog->subcoffset = 0;
2109           } else
2110           #endif
2111           {
2112           SSize_t min = 0;
2113 151263341         SSize_t max = strend - strbeg;
2114           SSize_t sublen;
2115            
2116           if ( (flags & REXEC_COPY_SKIP_POST)
2117           && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2118           && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2119           ) { /* don't copy $' part of string */
2120           U32 n = 0;
2121           max = -1;
2122           /* calculate the right-most part of the string covered
2123           * by a capture. Due to look-ahead, this may be to
2124           * the right of $&, so we have to scan all captures */
2125           while (n <= prog->lastparen) {
2126           if (prog->offs[n].end > max)
2127           max = prog->offs[n].end;
2128           n++;
2129           }
2130           if (max == -1)
2131           max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2132           ? prog->offs[0].start
2133           : 0;
2134           assert(max >= 0 && max <= strend - strbeg);
2135           }
2136            
2137           if ( (flags & REXEC_COPY_SKIP_PRE)
2138           && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2139           && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2140           ) { /* don't copy $` part of string */
2141           U32 n = 0;
2142           min = max;
2143           /* calculate the left-most part of the string covered
2144           * by a capture. Due to look-behind, this may be to
2145           * the left of $&, so we have to scan all captures */
2146           while (min && n <= prog->lastparen) {
2147           if ( prog->offs[n].start != -1
2148           && prog->offs[n].start < min)
2149           {
2150           min = prog->offs[n].start;
2151           }
2152           n++;
2153           }
2154           if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2155           && min > prog->offs[0].end
2156           )
2157           min = prog->offs[0].end;
2158            
2159           }
2160            
2161 151263341         assert(min >= 0 && min <= max && min <= strend - strbeg);
2162           sublen = max - min;
2163            
2164 151263341         if (RX_MATCH_COPIED(rx)) {
2165 151263341         if (sublen > prog->sublen)
2166 151263341         prog->subbeg =
2167 14         (char*)saferealloc(prog->subbeg, sublen+1);
2168           }
2169           else
2170 14         prog->subbeg = (char*)safemalloc(sublen+1);
2171 14         Copy(strbeg + min, prog->subbeg, sublen, char);
2172 151263341         prog->subbeg[sublen] = '\0';
2173 151263341         prog->suboffset = min;
2174 19956         prog->sublen = sublen;
2175 19956         RX_MATCH_COPIED_on(rx);
2176           }
2177 20020         prog->subcoffset = prog->suboffset;
2178 20020         if (prog->suboffset && utf8_target) {
2179           /* Convert byte offset to chars.
2180           * XXX ideally should only compute this if @-/@+
2181           * has been seen, a la PL_sawampersand ??? */
2182            
2183           /* If there's a direct correspondence between the
2184           * string which we're matching and the original SV,
2185           * then we can use the utf8 len cache associated with
2186           * the SV. In particular, it means that under //g,
2187           * sv_pos_b2u() will use the previously cached
2188           * position to speed up working out the new length of
2189           * subcoffset, rather than counting from the start of
2190           * the string each time. This stops
2191           * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2192           * from going quadratic */
2193 151263341         if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2194 151263341         prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2195           SV_GMAGIC|SV_CONST_RETURN);
2196           else
2197 453901797         prog->subcoffset = utf8_length((U8*)strbeg,
2198           (U8*)(strbeg+prog->suboffset));
2199           }
2200           }
2201           else {
2202 302638456         RX_MATCH_COPY_FREE(rx);
2203 151263341         prog->subbeg = strbeg;
2204 151375115         prog->suboffset = 0;
2205 111774         prog->subcoffset = 0;
2206 223548         prog->sublen = strend - strbeg;
2207           }
2208 111838         }
2209            
2210            
2211            
2212            
2213           /*
2214           - regexec_flags - match a regexp against a string
2215           */
2216           I32
2217 302638550         Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2218           char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2219           /* stringarg: the point in the string at which to begin matching */
2220           /* strend: pointer to null at end of string */
2221           /* strbeg: real beginning of string */
2222           /* minend: end of match must be >= minend bytes after stringarg. */
2223           /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2224           * itself is accessed via the pointers above */
2225           /* data: May be used for some additional optimizations.
2226           Currently unused. */
2227           /* flags: For optimizations. See REXEC_* in regexp.h */
2228            
2229           {
2230           dVAR;
2231 94         struct regexp *const prog = ReANY(rx);
2232           char *s;
2233           regnode *c;
2234           char *startpos;
2235           SSize_t minlen; /* must match at least this many chars */
2236           SSize_t dontbother = 0; /* how many characters not to try at end */
2237 151263435         const bool utf8_target = cBOOL(DO_UTF8(sv));
2238           I32 multiline;
2239 151263435         RXi_GET_DECL(prog,progi);
2240           regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2241           regmatch_info *const reginfo = ®info_buf;
2242           regexp_paren_pair *swap = NULL;
2243           I32 oldsave;
2244 151263435         GET_RE_DEBUG_FLAGS_DECL;
2245            
2246 151263435         PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2247           PERL_UNUSED_ARG(data);
2248            
2249           /* Be paranoid... */
2250 151263435         if (prog == NULL || stringarg == NULL) {
2251 111774         Perl_croak(aTHX_ "NULL regexp parameter");
2252           return 0;
2253           }
2254            
2255 151151661         DEBUG_EXECUTE_r(
2256           debug_start_match(rx, utf8_target, stringarg, strend,
2257           "Matching");
2258           );
2259            
2260           startpos = stringarg;
2261            
2262 151263435         if (prog->extflags & RXf_GPOS_SEEN) {
2263           MAGIC *mg;
2264            
2265           /* set reginfo->ganch, the position where \G can match */
2266            
2267 39700526         reginfo->ganch =
2268 59537470         (flags & REXEC_IGNOREPOS)
2269           ? stringarg /* use start pos rather than pos() */
2270 151263341         : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2271           /* Defined pos(): */
2272 92429392         ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2273 24225610         : strbeg; /* pos() not defined; use start of string */
2274            
2275 23306660         DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2276           "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg));
2277            
2278           /* in the presence of \G, we may need to start looking earlier in
2279           * the string than the suggested start point of stringarg:
2280           * if gofs->prog is set, then that's a known, fixed minimum
2281           * offset, such as
2282           * /..\G/: gofs = 2
2283           * /ab|c\G/: gofs = 1
2284           * or if the minimum offset isn't known, then we have to go back
2285           * to the start of the string, e.g. /w+\G/
2286           */
2287            
2288 919224         if (prog->extflags & RXf_ANCH_GPOS) {
2289 511034         startpos = reginfo->ganch - prog->gofs;
2290 919224         if (startpos <
2291 919224         ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2292           {
2293 486108         DEBUG_r(PerlIO_printf(Perl_debug_log,
2294           "fail: ganch-gofs before earliest possible start\n"));
2295 3882         return 0;
2296           }
2297           }
2298 4296         else if (prog->gofs) {
2299 596         if (startpos - prog->gofs < strbeg)
2300           startpos = strbeg;
2301           else
2302 596         startpos -= prog->gofs;
2303           }
2304 596         else if (prog->extflags & RXf_GPOS_FLOAT)
2305           startpos = strbeg;
2306           }
2307            
2308 690         minlen = prog->minlen;
2309 94         if ((startpos + minlen) > strend || startpos < strbeg) {
2310 482226         DEBUG_r(PerlIO_printf(Perl_debug_log,
2311           "Regex match can't succeed, so not even tried\n"));
2312 837760         return 0;
2313           }
2314            
2315           /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2316           * which will call destuctors to reset PL_regmatch_state, free higher
2317           * PL_regmatch_slabs, and clean up regmatch_info_aux and
2318           * regmatch_info_aux_eval */
2319            
2320 375688         oldsave = PL_savestack_ix;
2321            
2322 375660         s = startpos;
2323            
2324 375660         if ((prog->extflags & RXf_USE_INTUIT)
2325 375658         && !(flags & REXEC_CHECKED))
2326           {
2327 92         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2328           flags, NULL);
2329 433208         if (!s)
2330           return 0;
2331            
2332 206         if (prog->extflags & RXf_CHECK_ALL) {
2333           /* we can match based purely on the result of INTUIT.
2334           * Set up captures etc just for $& and $-[0]
2335           * (an intuit-only match wont have $1,$2,..) */
2336 16010172         assert(!prog->nparens);
2337            
2338           /* s/// doesn't like it if $& is earlier than where we asked it to
2339           * start searching (which can happen on something like /.\G/) */
2340 15856296         if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2341 790218         && (s < stringarg))
2342           {
2343           /* this should only be possible under \G */
2344 58833949         assert(prog->extflags & RXf_GPOS_SEEN);
2345 2031764         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2346           "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2347           goto phooey;
2348           }
2349            
2350           /* match via INTUIT shouldn't have any captures.
2351           * Let @-, @+, $^N know */
2352 56802207         prog->lastparen = prog->lastcloseparen = 0;
2353 106286         RX_MATCH_UTF8_set(rx, utf8_target);
2354 382         prog->offs[0].start = s - strbeg;
2355 44         prog->offs[0].end = utf8_target
2356 360         ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2357 2984         : s - strbeg + prog->minlenret;
2358 105904         if ( !(flags & REXEC_NOT_FIRST) )
2359 22         S_reg_set_capture_string(aTHX_ rx,
2360           strbeg, strend,
2361           sv, flags, utf8_target);
2362            
2363           return 1;
2364           }
2365           }
2366            
2367 105926         multiline = prog->extflags & RXf_PMf_MULTILINE;
2368          
2369 804182         if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2370 56695943         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2371           "String too short [regexec_flags]...\n"));
2372           goto phooey;
2373           }
2374          
2375           /* Check validity of program. */
2376 48208691         if (UCHARAT(progi->program) != REG_MAGIC) {
2377 48095871         Perl_croak(aTHX_ "corrupted regexp program");
2378           }
2379            
2380 4550440         RX_MATCH_TAINTED_off(rx);
2381            
2382 8746328         reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2383 8600116         reginfo->intuit = 0;
2384 226380         reginfo->is_utf8_target = cBOOL(utf8_target);
2385 7842         reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2386 226380         reginfo->warned = FALSE;
2387 8373780         reginfo->strbeg = strbeg;
2388 44         reginfo->sv = sv;
2389 8373780         reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2390 8600116         reginfo->strend = strend;
2391           /* see how far we have to get to not match where we matched before */
2392 146256         reginfo->till = stringarg + minend;
2393            
2394 18832         if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2395           /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2396           S_cleanup_regmatch_info_aux has executed (registered by
2397           SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2398           magic belonging to this SV.
2399           Not newSVsv, either, as it does not COW.
2400           */
2401 0         reginfo->sv = newSV(0);
2402 18788         sv_setsv(reginfo->sv, sv);
2403 127424         SAVEFREESV(reginfo->sv);
2404           }
2405            
2406           /* reserve next 2 or 3 slots in PL_regmatch_state:
2407           * slot N+0: may currently be in use: skip it
2408           * slot N+1: use for regmatch_info_aux struct
2409           * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2410           * slot N+3: ready for use by regmatch()
2411           */
2412            
2413           {
2414           regmatch_state *old_regmatch_state;
2415           regmatch_slab *old_regmatch_slab;
2416 44         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2417            
2418           /* on first ever match, allocate first slab */
2419 127468         if (!PL_regmatch_slab) {
2420 146212         Newx(PL_regmatch_slab, 1, regmatch_slab);
2421 146212         PL_regmatch_slab->prev = NULL;
2422 8746284         PL_regmatch_slab->next = NULL;
2423 8746284         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2424           }
2425            
2426 8746328         old_regmatch_state = PL_regmatch_state;
2427 7700524         old_regmatch_slab = PL_regmatch_slab;
2428            
2429 1045972         for (i=0; i <= max; i++) {
2430 8746408         if (i == 1)
2431 39691835         reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2432 17384374         else if (i ==2)
2433 16753958         reginfo->info_aux_eval =
2434 16625990         reginfo->info_aux->info_aux_eval =
2435 16625954         &(PL_regmatch_state->u.info_aux_eval);
2436            
2437 128128         if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2438 128004         PL_regmatch_state = S_push_slab(aTHX);
2439           }
2440            
2441           /* note initial PL_regmatch_state position; at end of match we'll
2442           * pop back to there and free any higher slabs */
2443            
2444 128048         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2445 128048         reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2446 16753966         reginfo->info_aux->poscache = NULL;
2447            
2448 317622         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2449            
2450 317622         if ((prog->extflags & RXf_EVAL_SEEN))
2451 142560         S_setup_eval_state(aTHX_ reginfo);
2452           else
2453 137748         reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2454           }
2455            
2456           /* If there is a "must appear" string, look for it. */
2457            
2458 137784         if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2459           /* We have to be careful. If the previous successful match
2460           was from this regex we don't want a subsequent partially
2461           successful match to clobber the old results.
2462           So when we detect this possibility we add a swap buffer
2463           to the re, and switch the buffer each match. If we fail,
2464           we switch it back; otherwise we leave it swapped.
2465           */
2466 4820         swap = prog->offs;
2467           /* do we need a save destructor here for eval dies? */
2468 25836332         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2469 16716196         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2470           "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2471           PTR2UV(prog),
2472           PTR2UV(swap),
2473           PTR2UV(prog->offs)
2474           ));
2475           }
2476            
2477           /* Simplest case: anchored match need be tried only once. */
2478           /* [unless only anchor is BOL and multiline is set] */
2479 9326306         if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2480 47949659         if (s == startpos && regtry(reginfo, &s))
2481           goto got_it;
2482 21768593         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2483 21766999         || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2484           {
2485           char *end;
2486            
2487 21766999         if (minlen)
2488 21146737         dontbother = minlen - 1;
2489 21768593         end = HOP3c(strend, -dontbother, strbeg) - 1;
2490           /* for multiline we only have to try after newlines */
2491 26181066         if (prog->check_substr || prog->check_utf8) {
2492           /* because of the goto we can not easily reuse the macros for bifurcating the
2493           unicode/non-unicode match modes here like we do elsewhere - demerphq */
2494 2506358         if (utf8_target) {
2495 31358         if (s == startpos)
2496           goto after_try_utf8;
2497           while (1) {
2498 0         if (regtry(reginfo, &s)) {
2499           goto got_it;
2500           }
2501           after_try_utf8:
2502 31358         if (s > end) {
2503           goto phooey;
2504           }
2505 2475000         if (prog->extflags & RXf_USE_INTUIT) {
2506 0         s = re_intuit_start(rx, sv, strbeg,
2507           s + UTF8SKIP(s), strend, flags, NULL);
2508 2475000         if (!s) {
2509           goto phooey;
2510           }
2511           }
2512           else {
2513 2506358         s += UTF8SKIP(s);
2514           }
2515           }
2516           } /* end search for check string in unicode */
2517           else {
2518 2506358         if (s == startpos) {
2519           goto after_try_latin;
2520           }
2521           while (1) {
2522 2084514         if (regtry(reginfo, &s)) {
2523           goto got_it;
2524           }
2525           after_try_latin:
2526 2084514         if (s > end) {
2527           goto phooey;
2528           }
2529 2084514         if (prog->extflags & RXf_USE_INTUIT) {
2530 2084514         s = re_intuit_start(rx, sv, strbeg,
2531           s + 1, strend, flags, NULL);
2532 0         if (!s) {
2533           goto phooey;
2534           }
2535           }
2536           else {
2537 0         s++;
2538           }
2539           }
2540           } /* end search for check string in latin*/
2541           } /* end search for check string */
2542           else { /* search for newline */
2543 0         if (s > startpos) {
2544           /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2545 421844         s--;
2546           }
2547           /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2548 421844         while (s <= end) { /* note it could be possible to match at the end of the string */
2549 2506358         if (*s++ == '\n') { /* don't need PL_utf8skip here */
2550 2506358         if (regtry(reginfo, &s))
2551           goto got_it;
2552           }
2553           }
2554           } /* end search for newline */
2555           } /* end anchored/multiline check string search */
2556           goto phooey;
2557 26181110         } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2558           {
2559           /* For anchored \G, the only position it can match from is
2560           * (ganch-gofs); we already set startpos to this above; if intuit
2561           * moved us on from there, we can't possibly succeed */
2562 22860482         assert(startpos == reginfo->ganch - prog->gofs);
2563 26181066         if (s == startpos && regtry(reginfo, &s))
2564           goto got_it;
2565           goto phooey;
2566           }
2567            
2568           /* Messy cases: unanchored match. */
2569 26181110         if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2570           /* we have /x+whatever/ */
2571           /* it must be a one character string (XXXX Except is_utf8_pat?) */
2572           char ch;
2573           #ifdef DEBUGGING
2574           int did_match = 0;
2575           #endif
2576 6475186         if (utf8_target) {
2577 264368         if (! prog->anchored_utf8) {
2578 242168         to_utf8_substr(prog);
2579           }
2580 10217631         ch = SvPVX_const(prog->anchored_utf8)[0];
2581 98256438         REXEC_FBC_SCAN(
2582           if (*s == ch) {
2583           DEBUG_EXECUTE_r( did_match = 1 );
2584           if (regtry(reginfo, &s)) goto got_it;
2585           s += UTF8SKIP(s);
2586           while (s < strend && *s == ch)
2587           s += UTF8SKIP(s);
2588           }
2589           );
2590            
2591           }
2592           else {
2593 81066906         if (! prog->anchored_substr) {
2594 110461603         if (! to_byte_substr(prog)) {
2595 12784408         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2596           }
2597           }
2598 110461599         ch = SvPVX_const(prog->anchored_substr)[0];
2599 110461599         REXEC_FBC_SCAN(
2600           if (*s == ch) {
2601           DEBUG_EXECUTE_r( did_match = 1 );
2602           if (regtry(reginfo, &s)) goto got_it;
2603           s++;
2604           while (s < strend && *s == ch)
2605           s++;
2606           }
2607           );
2608           }
2609 110461599         DEBUG_EXECUTE_r(if (!did_match)
2610           PerlIO_printf(Perl_debug_log,
2611           "Did not find anchored character...\n")
2612           );
2613           }
2614 11166         else if (prog->anchored_substr != NULL
2615 165476303         || prog->anchored_utf8 != NULL
2616 110461603         || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2617 96999029         && prog->float_max_offset < strend - s)) {
2618           SV *must;
2619           SSize_t back_max;
2620           SSize_t back_min;
2621           char *last;
2622           char *last1; /* Last position checked before */
2623           #ifdef DEBUGGING
2624           int did_match = 0;
2625           #endif
2626 40776694         if (prog->anchored_substr || prog->anchored_utf8) {
2627 40776694         if (utf8_target) {
2628 7496494         if (! prog->anchored_utf8) {
2629 136168564         to_utf8_substr(prog);
2630           }
2631 229782217         must = prog->anchored_utf8;
2632           }
2633           else {
2634 229782257         if (! prog->anchored_substr) {
2635 229782217         if (! to_byte_substr(prog)) {
2636 229782217         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2637           }
2638           }
2639 229782257         must = prog->anchored_substr;
2640           }
2641 229782257         back_max = back_min = prog->anchored_offset;
2642           } else {
2643 229782217         if (utf8_target) {
2644 229782217         if (! prog->float_utf8) {
2645 111403381         to_utf8_substr(prog);
2646           }
2647 382741242         must = prog->float_utf8;
2648           }
2649           else {
2650 271337861         if (! prog->float_substr) {
2651 271337861         if (! to_byte_substr(prog)) {
2652 271337861         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2653           }
2654           }
2655 229782217         must = prog->float_substr;
2656           }
2657 229782217         back_max = prog->float_max_offset;
2658 229782191         back_min = prog->float_min_offset;
2659           }
2660          
2661 110461643         if (back_min<0) {
2662           last = strend;
2663           } else {
2664 110461643         last = HOP3c(strend, /* Cannot start after this */
2665           -(SSize_t)(CHR_SVLEN(must)
2666           - (SvTAIL(must) != 0) + back_min), strbeg);
2667           }
2668 119320628         if (s > reginfo->strbeg)
2669 280         last1 = HOPc(s, -1);
2670           else
2671 174767529         last1 = s - 1; /* bogus */
2672            
2673           /* XXXX check_substr already used to find "s", can optimize if
2674           check_substr==must. */
2675           dontbother = 0;
2676 1889308         strend = HOPc(strend, -dontbother);
2677 1889388         while ( (s <= last) &&
2678 1889308         (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2679           (unsigned char*)strend, must,
2680           multiline ? FBMrf_MULTILINE : 0)) ) {
2681 514668         DEBUG_EXECUTE_r( did_match = 1 );
2682 514668         if (HOPc(s, -back_max) > last1) {
2683 514668         last1 = HOPc(s, -back_min);
2684 514668         s = HOPc(s, -back_max);
2685           }
2686           else {
2687 1889268         char * const t = (last1 >= reginfo->strbeg)
2688 1889268         ? HOPc(last1, 1) : last1 + 1;
2689            
2690 9148         last1 = HOPc(s, -back_min);
2691 9148         s = t;
2692           }
2693 9188         if (utf8_target) {
2694 9148         while (s <= last1) {
2695 14022         if (regtry(reginfo, &s))
2696           goto got_it;
2697 13672         if (s >= last1) {
2698 8848         s++; /* to break out of outer loop */
2699 8748         break;
2700           }
2701 108870617         s += UTF8SKIP(s);
2702           }
2703           }
2704           else {
2705 108870657         while (s <= last1) {
2706 108870657         if (regtry(reginfo, &s))
2707           goto got_it;
2708 108870617         s++;
2709           }
2710           }
2711           }
2712 108870617         DEBUG_EXECUTE_r(if (!did_match) {
2713           RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2714           SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2715           PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2716           ((must == prog->anchored_substr || must == prog->anchored_utf8)
2717           ? "anchored" : "floating"),
2718           quoted, RE_SV_TAIL(must));
2719           });
2720           goto phooey;
2721           }
2722 107917905         else if ( (c = progi->regstclass) ) {
2723 107847143         if (minlen) {
2724 70766         const OPCODE op = OP(progi->regstclass);
2725           /* don't bother with what can't match */
2726 49494         if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2727 49490         strend = HOPc(strend, -(minlen - 1));
2728           }
2729 21276         DEBUG_EXECUTE_r({
2730           SV * const prop = sv_newmortal();
2731           regprop(prog, prop, c);
2732           {
2733           RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2734           s,strend-s,60);
2735           PerlIO_printf(Perl_debug_log,
2736           "Matching stclass %.*s against %s (%d bytes)\n",
2737           (int)SvCUR(prop), SvPVX_const(prop),
2738           quoted, (int)(strend - s));
2739           }
2740           });
2741 2700901         if (find_byclass(prog, c, s, strend, reginfo))
2742           goto got_it;
2743 168923         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2744           }
2745           else {
2746           dontbother = 0;
2747 940040         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2748           /* Trim the end. */
2749           char *last= NULL;
2750           SV* float_real;
2751           STRLEN len;
2752           const char *little;
2753            
2754 992639         if (utf8_target) {
2755 935396         if (! prog->float_utf8) {
2756 935396         to_utf8_substr(prog);
2757           }
2758 42454         float_real = prog->float_utf8;
2759           }
2760           else {
2761 0         if (! prog->float_substr) {
2762 0         if (! to_byte_substr(prog)) {
2763 0         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2764           }
2765           }
2766 42454         float_real = prog->float_substr;
2767           }
2768            
2769 18652         little = SvPV_const(float_real, len);
2770 18652         if (SvTAIL(float_real)) {
2771           /* This means that float_real contains an artificial \n on
2772           * the end due to the presence of something like this:
2773           * /foo$/ where we can match both "foo" and "foo\n" at the
2774           * end of the string. So we have to compare the end of the
2775           * string first against the float_real without the \n and
2776           * then against the full float_real with the string. We
2777           * have to watch out for cases where the string might be
2778           * smaller than the float_real or the float_real without
2779           * the \n. */
2780 8416         char *checkpos= strend - len;
2781 8416         DEBUG_OPTIMISE_r(
2782           PerlIO_printf(Perl_debug_log,
2783           "%sChecking for float_real.%s\n",
2784           PL_colors[4], PL_colors[5]));
2785 0         if (checkpos + 1 < strbeg) {
2786           /* can't match, even if we remove the trailing \n
2787           * string is too short to match */
2788 8416         DEBUG_EXECUTE_r(
2789           PerlIO_printf(Perl_debug_log,
2790           "%sString shorter than required trailing substring, cannot match.%s\n",
2791           PL_colors[4], PL_colors[5]));
2792           goto phooey;
2793 8416         } else if (memEQ(checkpos + 1, little, len - 1)) {
2794           /* can match, the end of the string matches without the
2795           * "\n" */
2796 8416         last = checkpos + 1;
2797 0         } else if (checkpos < strbeg) {
2798           /* cant match, string is too short when the "\n" is
2799           * included */
2800 8416         DEBUG_EXECUTE_r(
2801           PerlIO_printf(Perl_debug_log,
2802           "%sString does not contain required trailing substring, cannot match.%s\n",
2803           PL_colors[4], PL_colors[5]));
2804           goto phooey;
2805 8416         } else if (!multiline) {
2806           /* non multiline match, so compare with the "\n" at the
2807           * end of the string */
2808 0         if (memEQ(checkpos, little, len)) {
2809           last= checkpos;
2810           } else {
2811 0         DEBUG_EXECUTE_r(
2812           PerlIO_printf(Perl_debug_log,
2813           "%sString does not contain required trailing substring, cannot match.%s\n",
2814           PL_colors[4], PL_colors[5]));
2815           goto phooey;
2816           }
2817           } else {
2818           /* multiline match, so we have to search for a place
2819           * where the full string is located */
2820           goto find_last;
2821           }
2822           } else {
2823           find_last:
2824 0         if (len)
2825 0         last = rninstr(s, strend, little, little + len);
2826           else
2827           last = strend; /* matching "$" */
2828           }
2829 0         if (!last) {
2830           /* at one point this block contained a comment which was
2831           * probably incorrect, which said that this was a "should not
2832           * happen" case. Even if it was true when it was written I am
2833           * pretty sure it is not anymore, so I have removed the comment
2834           * and replaced it with this one. Yves */
2835 892942         DEBUG_EXECUTE_r(
2836           PerlIO_printf(Perl_debug_log,
2837           "String does not contain required substring, cannot match.\n"
2838           ));
2839           goto phooey;
2840           }
2841 192182         dontbother = strend - last + prog->float_min_offset;
2842           }
2843 93166         if (minlen && (dontbother < minlen))
2844 142572         dontbother = minlen - 1;
2845 142572         strend -= dontbother; /* this one's always in bytes! */
2846           /* We don't know much -- general case. */
2847 13336         if (utf8_target) {
2848           for (;;) {
2849 62454         if (regtry(reginfo, &s))
2850           goto got_it;
2851 830488         if (s >= strend)
2852           break;
2853 159122         s += UTF8SKIP(s);
2854 0         };
2855           }
2856           else {
2857           do {
2858 7908         if (regtry(reginfo, &s))
2859           goto got_it;
2860 5516         } while (s++ < strend);
2861           }
2862           }
2863            
2864           /* Failure. */
2865           goto phooey;
2866            
2867           got_it:
2868           /* s/// doesn't like it if $& is earlier than where we asked it to
2869           * start searching (which can happen on something like /.\G/) */
2870 42         if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2871 665850         && (prog->offs[0].start < stringarg - strbeg))
2872           {
2873           /* this should only be possible under \G */
2874 0         assert(prog->extflags & RXf_GPOS_SEEN);
2875 0         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2876           "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2877           goto phooey;
2878           }
2879            
2880 108870659         DEBUG_BUFFERS_r(
2881           if (swap)
2882           PerlIO_printf(Perl_debug_log,
2883           "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2884           PTR2UV(prog),
2885           PTR2UV(swap)
2886           );
2887           );
2888 86308         Safefree(swap);
2889            
2890           /* clean up; this will trigger destructors that will free all slabs
2891           * above the current one, and cleanup the regmatch_info_aux
2892           * and regmatch_info_aux_eval sructs */
2893            
2894 108784393         LEAVE_SCOPE(oldsave);
2895            
2896 290470         if (RXp_PAREN_NAMES(prog))
2897 240938         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2898            
2899 241022         RX_MATCH_UTF8_set(rx, utf8_target);
2900            
2901           /* make sure $`, $&, $', and $digit will work later */
2902 290470         if ( !(flags & REXEC_NOT_FIRST) )
2903 435684         S_reg_set_capture_string(aTHX_ rx,
2904           strbeg, reginfo->strend,
2905           sv, flags, utf8_target);
2906            
2907           return 1;
2908            
2909           phooey:
2910 240726         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2911           PL_colors[4], PL_colors[5]));
2912            
2913           /* clean up; this will trigger destructors that will free all slabs
2914           * above the current one, and cleanup the regmatch_info_aux
2915           * and regmatch_info_aux_eval sructs */
2916            
2917 410792         LEAVE_SCOPE(oldsave);
2918            
2919 49706         if (swap) {
2920           /* we failed :-( roll it back */
2921 108493923         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2922           "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2923           PTR2UV(prog),
2924           PTR2UV(prog->offs),
2925           PTR2UV(swap)
2926           ));
2927 10084         Safefree(prog->offs);
2928 0         prog->offs = swap;
2929           }
2930           return 0;
2931           }
2932            
2933            
2934           /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
2935           * Do inc before dec, in case old and new rex are the same */
2936           #define SET_reg_curpm(Re2) \
2937           if (reginfo->info_aux_eval) { \
2938           (void)ReREFCNT_inc(Re2); \
2939           ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2940           PM_SETRE((PL_reg_curpm), (Re2)); \
2941           }
2942            
2943            
2944           /*
2945           - regtry - try match at specific point
2946           */
2947           STATIC I32 /* 0 failure, 1 success */
2948 108483881         S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2949           {
2950           dVAR;
2951           CHECKPOINT lastcp;
2952 108677270         REGEXP *const rx = reginfo->prog;
2953 229782259         regexp *const prog = ReANY(rx);
2954           SSize_t result;
2955 229782259         RXi_GET_DECL(prog,progi);
2956 229782259         GET_RE_DEBUG_FLAGS_DECL;
2957            
2958 229782259         PERL_ARGS_ASSERT_REGTRY;
2959            
2960 229782259         reginfo->cutpoint=NULL;
2961            
2962 229782259         prog->offs[0].start = *startposp - reginfo->strbeg;
2963 229782259         prog->lastparen = 0;
2964 229782259         prog->lastcloseparen = 0;
2965            
2966           /* XXXX What this code is doing here?!!! There should be no need
2967           to do this again and again, prog->lastparen should take care of
2968           this! --ilya*/
2969            
2970           /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2971           * Actually, the code in regcppop() (which Ilya may be meaning by
2972           * prog->lastparen), is not needed at all by the test suite
2973           * (op/regexp, op/pat, op/split), but that code is needed otherwise
2974           * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2975           * Meanwhile, this code *is* needed for the
2976           * above-mentioned test suite tests to succeed. The common theme
2977           * on those tests seems to be returning null fields from matches.
2978           * --jhi updated by dapm */
2979           #if 1
2980 229782259         if (prog->nparens) {
2981 229782221         regexp_paren_pair *pp = prog->offs;
2982           I32 i;
2983 229782225         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2984 229782221         ++pp;
2985 1749205844         pp->start = -1;
2986 1634564290         pp->end = -1;
2987           }
2988           }
2989           #endif
2990 1634564328         REGCP_SET(lastcp);
2991 1909591423         result = regmatch(reginfo, *startposp, progi->program + 1);
2992 2184219214         if (result != -1) {
2993 2184219214         prog->offs[0].end = result;
2994 151487096         return 1;
2995           }
2996 5549990         if (reginfo->cutpoint)
2997 5267184         *startposp= reginfo->cutpoint;
2998 1310327         REGCP_UNWIND(lastcp);
2999           return 0;
3000           }
3001            
3002            
3003           #define sayYES goto yes
3004           #define sayNO goto no
3005           #define sayNO_SILENT goto no_silent
3006            
3007           /* we dont use STMT_START/END here because it leads to
3008           "unreachable code" warnings, which are bogus, but distracting. */
3009           #define CACHEsayNO \
3010           if (ST.cache_mask) \
3011           reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3012           sayNO
3013            
3014           /* this is used to determine how far from the left messages like
3015           'failed...' are printed. It should be set such that messages
3016           are inline with the regop output that created them.
3017           */
3018           #define REPORT_CODE_OFF 32
3019            
3020            
3021           #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3022           #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
3023           #define CHRTEST_NOT_A_CP_1 -999
3024           #define CHRTEST_NOT_A_CP_2 -998
3025            
3026           /* grab a new slab and return the first slot in it */
3027            
3028           STATIC regmatch_state *
3029 12804828         S_push_slab(pTHX)
3030           {
3031           #if PERL_VERSION < 9 && !defined(PERL_CORE)
3032           dMY_CXT;
3033           #endif
3034 304         regmatch_slab *s = PL_regmatch_slab->next;
3035 304         if (!s) {
3036 304         Newx(s, 1, regmatch_slab);
3037 0         s->prev = PL_regmatch_slab;
3038 0         s->next = NULL;
3039 8866578         PL_regmatch_slab->next = s;
3040           }
3041 34844664         PL_regmatch_slab = s;
3042 11191980         return SLAB_FIRST(s);
3043           }
3044            
3045            
3046           /* push a new state then goto it */
3047            
3048           #define PUSH_STATE_GOTO(state, node, input) \
3049           pushinput = input; \
3050           scan = node; \
3051           st->resume_state = state; \
3052           goto push_state;
3053            
3054           /* push a new state with success backtracking, then goto it */
3055            
3056           #define PUSH_YES_STATE_GOTO(state, node, input) \
3057           pushinput = input; \
3058           scan = node; \
3059           st->resume_state = state; \
3060           goto push_yes_state;
3061            
3062            
3063            
3064            
3065           /*
3066            
3067           regmatch() - main matching routine
3068            
3069           This is basically one big switch statement in a loop. We execute an op,
3070           set 'next' to point the next op, and continue. If we come to a point which
3071           we may need to backtrack to on failure such as (A|B|C), we push a
3072           backtrack state onto the backtrack stack. On failure, we pop the top
3073           state, and re-enter the loop at the state indicated. If there are no more
3074           states to pop, we return failure.
3075            
3076           Sometimes we also need to backtrack on success; for example /A+/, where
3077           after successfully matching one A, we need to go back and try to
3078           match another one; similarly for lookahead assertions: if the assertion
3079           completes successfully, we backtrack to the state just before the assertion
3080           and then carry on. In these cases, the pushed state is marked as
3081           'backtrack on success too'. This marking is in fact done by a chain of
3082           pointers, each pointing to the previous 'yes' state. On success, we pop to
3083           the nearest yes state, discarding any intermediate failure-only states.
3084           Sometimes a yes state is pushed just to force some cleanup code to be
3085           called at the end of a successful match or submatch; e.g. (??{$re}) uses
3086           it to free the inner regex.
3087            
3088           Note that failure backtracking rewinds the cursor position, while
3089           success backtracking leaves it alone.
3090            
3091           A pattern is complete when the END op is executed, while a subpattern
3092           such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3093           ops trigger the "pop to last yes state if any, otherwise return true"
3094           behaviour.
3095            
3096           A common convention in this function is to use A and B to refer to the two
3097           subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3098           the subpattern to be matched possibly multiple times, while B is the entire
3099           rest of the pattern. Variable and state names reflect this convention.
3100            
3101           The states in the main switch are the union of ops and failure/success of
3102           substates associated with with that op. For example, IFMATCH is the op
3103           that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3104           'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3105           successfully matched A and IFMATCH_A_fail is a state saying that we have
3106           just failed to match A. Resume states always come in pairs. The backtrack
3107           state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3108           at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3109           on success or failure.
3110            
3111           The struct that holds a backtracking state is actually a big union, with
3112           one variant for each major type of op. The variable st points to the
3113           top-most backtrack struct. To make the code clearer, within each
3114           block of code we #define ST to alias the relevant union.
3115            
3116           Here's a concrete example of a (vastly oversimplified) IFMATCH
3117           implementation:
3118            
3119           switch (state) {
3120           ....
3121            
3122           #define ST st->u.ifmatch
3123            
3124           case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3125           ST.foo = ...; // some state we wish to save
3126           ...
3127           // push a yes backtrack state with a resume value of
3128           // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3129           // first node of A:
3130           PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3131           // NOTREACHED
3132            
3133           case IFMATCH_A: // we have successfully executed A; now continue with B
3134           next = B;
3135           bar = ST.foo; // do something with the preserved value
3136           break;
3137            
3138           case IFMATCH_A_fail: // A failed, so the assertion failed
3139           ...; // do some housekeeping, then ...
3140           sayNO; // propagate the failure
3141            
3142           #undef ST
3143            
3144           ...
3145           }
3146            
3147           For any old-timers reading this who are familiar with the old recursive
3148           approach, the code above is equivalent to:
3149            
3150           case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3151           {
3152           int foo = ...
3153           ...
3154           if (regmatch(A)) {
3155           next = B;
3156           bar = foo;
3157           break;
3158           }
3159           ...; // do some housekeeping, then ...
3160           sayNO; // propagate the failure
3161           }
3162            
3163           The topmost backtrack state, pointed to by st, is usually free. If you
3164           want to claim it, populate any ST.foo fields in it with values you wish to
3165           save, then do one of
3166            
3167           PUSH_STATE_GOTO(resume_state, node, newinput);
3168           PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3169            
3170           which sets that backtrack state's resume value to 'resume_state', pushes a
3171           new free entry to the top of the backtrack stack, then goes to 'node'.
3172           On backtracking, the free slot is popped, and the saved state becomes the
3173           new free state. An ST.foo field in this new top state can be temporarily
3174           accessed to retrieve values, but once the main loop is re-entered, it
3175           becomes available for reuse.
3176            
3177           Note that the depth of the backtrack stack constantly increases during the
3178           left-to-right execution of the pattern, rather than going up and down with
3179           the pattern nesting. For example the stack is at its maximum at Z at the
3180           end of the pattern, rather than at X in the following:
3181            
3182           /(((X)+)+)+....(Y)+....Z/
3183            
3184           The only exceptions to this are lookahead/behind assertions and the cut,
3185           (?>A), which pop all the backtrack states associated with A before
3186           continuing.
3187          
3188           Backtrack state structs are allocated in slabs of about 4K in size.
3189           PL_regmatch_state and st always point to the currently active state,
3190           and PL_regmatch_slab points to the slab currently containing
3191           PL_regmatch_state. The first time regmatch() is called, the first slab is
3192           allocated, and is never freed until interpreter destruction. When the slab
3193           is full, a new one is allocated and chained to the end. At exit from
3194           regmatch(), slabs allocated since entry are freed.
3195            
3196           */
3197          
3198            
3199           #define DEBUG_STATE_pp(pp) \
3200           DEBUG_STATE_r({ \
3201           DUMP_EXEC_POS(locinput, scan, utf8_target); \
3202           PerlIO_printf(Perl_debug_log, \
3203           " %*s"pp" %s%s%s%s%s\n", \
3204           depth*2, "", \
3205           PL_reg_name[st->resume_state], \
3206           ((st==yes_state||st==mark_state) ? "[" : ""), \
3207           ((st==yes_state) ? "Y" : ""), \
3208           ((st==mark_state) ? "M" : ""), \
3209           ((st==yes_state||st==mark_state) ? "]" : "") \
3210           ); \
3211           });
3212            
3213            
3214           #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3215            
3216           #ifdef DEBUGGING
3217            
3218           STATIC void
3219 6828698         S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3220           const char *start, const char *end, const char *blurb)
3221           {
3222 1122616         const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3223            
3224 248         PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3225            
3226 244         if (!PL_colorset)
3227 152         reginitcolors();
3228           {
3229 24455914         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3230           RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3231          
3232 10937458         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3233           start, end - start, 60);
3234          
3235 45403611         PerlIO_printf(Perl_debug_log,
3236           "%s%s REx%s %s against %s\n",
3237           PL_colors[4], blurb, PL_colors[5], s0, s1);
3238          
3239 45403611         if (utf8_target||utf8_pat)
3240 45403573         PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3241           utf8_pat ? "pattern" : "",
3242 45403545         utf8_pat && utf8_target ? " and " : "",
3243           utf8_target ? "string" : ""
3244           );
3245           }
3246 45403611         }
3247            
3248           STATIC void
3249 45403697         S_dump_exec_pos(pTHX_ const char *locinput,
3250           const regnode *scan,
3251           const char *loc_regeol,
3252           const char *loc_bostr,
3253           const char *loc_reg_starttry,
3254           const bool utf8_target)
3255           {
3256 42833481         const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3257 38333452         const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3258 40024         int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3259           /* The part of the string before starttry has one color
3260           (pref0_len chars), between starttry and current
3261           position another one (pref_len - pref0_len chars),
3262           after the current position the third one.
3263           We assume that pref0_len <= pref_len, otherwise we
3264           decrease pref0_len. */
3265 7070561         int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3266 7070561         ? (5 + taill) - l : locinput - loc_bostr;
3267           int pref0_len;
3268            
3269 7070561         PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3270            
3271 7070561         while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3272 7070381         pref_len++;
3273 7070561         pref0_len = pref_len - (locinput - loc_reg_starttry);
3274 7070561         if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3275 46821545         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3276 36216692         ? (5 + taill) - pref_len : loc_regeol - locinput);
3277 36216870         while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3278 36216690         l--;
3279 36216870         if (pref0_len < 0)
3280           pref0_len = 0;
3281 4016683         if (pref0_len > pref_len)
3282           pref0_len = pref_len;
3283           {
3284 3951475         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3285            
3286 11228         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3287           (locinput - pref_len),pref0_len, 60, 4, 5);
3288          
3289 11228         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3290           (locinput - pref_len + pref0_len),
3291           pref_len - pref0_len, 60, 2, 3);
3292          
3293 3940427         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3294           locinput, loc_regeol - locinput, 10, 0, 1);
3295            
3296 3940427         const STRLEN tlen=len0+len1+len2;
3297 4016863         PerlIO_printf(Perl_debug_log,
3298           "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3299           (IV)(locinput - loc_bostr),
3300           len0, s0,
3301           len1, s1,
3302           (docolor ? "" : "> <"),
3303           len2, s2,
3304 4003609         (int)(tlen > 19 ? 0 : 19 - tlen),
3305           "");
3306           }
3307 4016683         }
3308            
3309           #endif
3310            
3311           /* reg_check_named_buff_matched()
3312           * Checks to see if a named buffer has matched. The data array of
3313           * buffer numbers corresponding to the buffer is expected to reside
3314           * in the regexp->data->data array in the slot stored in the ARG() of
3315           * node involved. Note that this routine doesn't actually care about the
3316           * name, that information is not preserved from compilation to execution.
3317           * Returns the index of the leftmost defined buffer with the given name
3318           * or 0 if non of the buffers matched.
3319           */
3320           STATIC I32
3321 68422759         S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3322           {
3323           I32 n;
3324 32206069         RXi_GET_DECL(rex,rexi);
3325 32206069         SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3326 32206069         I32 *nums=(I32*)SvPVX(sv_dat);
3327            
3328 13056         PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3329            
3330 47273609         for ( n=0; n
3331 30142261         if ((I32)rex->lastparen >= nums[n] &&
3332 30142261         rex->offs[nums[n]].end != -1)
3333           {
3334 30142137         return nums[n];
3335           }
3336           }
3337           return 0;
3338           }
3339            
3340            
3341           static bool
3342 30129601         S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3343           U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3344           {
3345           /* This function determines if there are one or two characters that match
3346           * the first character of the passed-in EXACTish node , and if
3347           * so, returns them in the passed-in pointers.
3348           *
3349           * If it determines that no possible character in the target string can
3350           * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3351           * the first character in requires UTF-8 to represent, and the
3352           * target string isn't in UTF-8.)
3353           *
3354           * If there are more than two characters that could match the beginning of
3355           * , or if more context is required to determine a match or not,
3356           * it sets both * and * to CHRTEST_VOID.
3357           *
3358           * The motiviation behind this function is to allow the caller to set up
3359           * tight loops for matching. If is of type EXACT, there is
3360           * only one possible character that can match its first character, and so
3361           * the situation is quite simple. But things get much more complicated if
3362           * folding is involved. It may be that the first character of an EXACTFish
3363           * node doesn't participate in any possible fold, e.g., punctuation, so it
3364           * can be matched only by itself. The vast majority of characters that are
3365           * in folds match just two things, their lower and upper-case equivalents.
3366           * But not all are like that; some have multiple possible matches, or match
3367           * sequences of more than one character. This function sorts all that out.
3368           *
3369           * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3370           * loop of trying to match A*, we know we can't exit where the thing
3371           * following it isn't a B. And something can't be a B unless it is the
3372           * beginning of B. By putting a quick test for that beginning in a tight
3373           * loop, we can rule out things that can't possibly be B without having to
3374           * break out of the loop, thus avoiding work. Similarly, if A is a single
3375           * character, we can make a tight loop matching A*, using the outputs of
3376           * this function.
3377           *
3378           * If the target string to match isn't in UTF-8, and there aren't
3379           * complications which require CHRTEST_VOID, * and * are set to
3380           * the one or two possible octets (which are characters in this situation)
3381           * that can match. In all cases, if there is only one character that can
3382           * match, * and * will be identical.
3383           *
3384           * If the target string is in UTF-8, the buffers pointed to by
3385           * and will contain the one or two UTF-8 sequences of bytes that
3386           * can match the beginning of . They should be declared with at
3387           * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3388           * undefined what these contain.) If one or both of the buffers are
3389           * invariant under UTF-8, *, and * will also be set to the
3390           * corresponding invariant. If variant, the corresponding * and/or
3391           * * will be set to a negative number(s) that shouldn't match any code
3392           * point (unless inappropriately coerced to unsigned). * will equal
3393           * * if and only if and are the same. */
3394            
3395 29148265         const bool utf8_target = reginfo->is_utf8_target;
3396            
3397           UV c1 = CHRTEST_NOT_A_CP_1;
3398           UV c2 = CHRTEST_NOT_A_CP_2;
3399           bool use_chrtest_void = FALSE;
3400 32206069         const bool is_utf8_pat = reginfo->is_utf8_pat;
3401            
3402           /* Used when we have both utf8 input and utf8 output, to avoid converting
3403           * to/from code points */
3404           bool utf8_has_been_setup = FALSE;
3405            
3406           dVAR;
3407            
3408 7070381         U8 *pat = (U8*)STRING(text_node);
3409            
3410 3951295         if (OP(text_node) == EXACT) {
3411            
3412           /* In an exact node, only one thing can be matched, that first
3413           * character. If both the pat and the target are UTF-8, we can just
3414           * copy the input to the output, avoiding finding the code point of
3415           * that character */
3416 9967643         if (!is_utf8_pat) {
3417 4041419         c2 = c1 = *pat;
3418           }
3419 4041419         else if (utf8_target) {
3420 3951295         Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3421 3951295         Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3422           utf8_has_been_setup = TRUE;
3423           }
3424           else {
3425 34150         c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3426           }
3427           }
3428           else /* an EXACTFish node */
3429 21244         if ((is_utf8_pat
3430 21457         && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3431           pat + STR_LEN(text_node)))
3432 34150         || (!is_utf8_pat
3433 34150         && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3434           pat + STR_LEN(text_node))))
3435           {
3436           /* Multi-character folds require more context to sort out. Also
3437           * PL_utf8_foldclosures used below doesn't handle them, so have to be
3438           * handled outside this routine */
3439           use_chrtest_void = TRUE;
3440           }
3441           else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3442 34150         c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3443 34150         if (c1 > 256) {
3444           /* Load the folds hash, if not already done */
3445           SV** listp;
3446 154956         if (! PL_utf8_foldclosures) {
3447 120806         if (! PL_utf8_tofold) {
3448           U8 dummy[UTF8_MAXBYTES_CASE+1];
3449            
3450           /* Force loading this by folding an above-Latin1 char */
3451 34150         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3452 3985445         assert(PL_utf8_tofold); /* Verify that worked */
3453           }
3454 3985445         PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3455           }
3456            
3457           /* The fold closures data structure is a hash with the keys being
3458           * the UTF-8 of every character that is folded to, like 'k', and
3459           * the values each an array of all code points that fold to its
3460           * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3461           * not included */
3462 264046         if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3463           (char *) pat,
3464           UTF8SKIP(pat),
3465           FALSE))))
3466           {
3467           /* Not found in the hash, therefore there are no folds
3468           * containing it, so there is only a single character that
3469           * could match */
3470           c2 = c1;
3471           }
3472           else { /* Does participate in folds */
3473 264046         AV* list = (AV*) *listp;
3474 264046         if (av_len(list) != 1) {
3475            
3476           /* If there aren't exactly two folds to this, it is outside
3477           * the scope of this function */
3478           use_chrtest_void = TRUE;
3479           }
3480           else { /* There are two. Get them */
3481 3985445         SV** c_p = av_fetch(list, 0, FALSE);
3482 3985445         if (c_p == NULL) {
3483 7970890         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3484           }
3485 3985445         c1 = SvUV(*c_p);
3486            
3487 3985445         c_p = av_fetch(list, 1, FALSE);
3488 3985445         if (c_p == NULL) {
3489 22112         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3490           }
3491 11064         c2 = SvUV(*c_p);
3492            
3493           /* Folds that cross the 255/256 boundary are forbidden if
3494           * EXACTFL, or EXACTFA and one is ASCIII. Since the
3495           * pattern character is above 256, and its only other match
3496           * is below 256, the only legal match will be to itself.
3497           * We have thrown away the original, so have to compute
3498           * which is the one above 255 */
3499 9104         if ((c1 < 256) != (c2 < 256)) {
3500 9104         if (OP(text_node) == EXACTFL
3501 1960         || ((OP(text_node) == EXACTFA
3502 1960         || OP(text_node) == EXACTFA_NO_TRIE)
3503 11064         && (isASCII(c1) || isASCII(c2))))
3504           {
3505 35192         if (c1 < 256) {
3506           c1 = c2;
3507           }
3508           else {
3509           c2 = c1;
3510           }
3511           }
3512           }
3513           }
3514           }
3515           }
3516           else /* Here, c1 is < 255 */
3517 24120         if (utf8_target
3518 13072         && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3519 13072         && OP(text_node) != EXACTFL
3520 13072         && ((OP(text_node) != EXACTFA
3521 3974397         && OP(text_node) != EXACTFA_NO_TRIE)
3522 98906         || ! isASCII(c1)))
3523           {
3524           /* Here, there could be something above Latin1 in the target which
3525           * folds to this character in the pattern. All such cases except
3526           * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3527           * involved in their folds, so are outside the scope of this
3528           * function */
3529 14142         if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3530           c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3531           }
3532           else {
3533           use_chrtest_void = TRUE;
3534           }
3535           }
3536           else { /* Here nothing above Latin1 can fold to the pattern character */
3537 3889633         switch (OP(text_node)) {
3538            
3539           case EXACTFL: /* /l rules */
3540 5977449         c2 = PL_fold_locale[c1];
3541 182238         break;
3542            
3543           case EXACTF: /* This node only generated for non-utf8
3544           patterns */
3545 3803207         assert(! is_utf8_pat);
3546 3985445         if (! utf8_target) { /* /d rules */
3547 89066         c2 = PL_fold[c1];
3548 3896379         break;
3549           }
3550           /* FALLTHROUGH */
3551           /* /u rules for all these. This happens to work for
3552           * EXACTFA as nothing in Latin1 folds to ASCII */
3553           case EXACTFA_NO_TRIE: /* This node only generated for
3554           non-utf8 patterns */
3555 254384000         assert(! is_utf8_pat);
3556           /* FALL THROUGH */
3557           case EXACTFA:
3558           case EXACTFU_SS:
3559           case EXACTFU:
3560 254384000         c2 = PL_fold_latin1[c1];
3561 254384000         break;
3562            
3563           default:
3564 839620         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3565           assert(0); /* NOTREACHED */
3566           }
3567           }
3568           }
3569            
3570           /* Here have figured things out. Set up the returns */
3571 839620         if (use_chrtest_void) {
3572 1251868         *c2p = *c1p = CHRTEST_VOID;
3573           }
3574 855546         else if (utf8_target) {
3575 802728         if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3576 796324         uvchr_to_utf8(c1_utf8, c1);
3577 791964         uvchr_to_utf8(c2_utf8, c2);
3578           }
3579            
3580           /* Invariants are stored in both the utf8 and byte outputs; Use
3581           * negative numbers otherwise for the byte ones. Make sure that the
3582           * byte ones are the same iff the utf8 ones are the same */
3583 558280         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3584 4360         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3585 1404         ? *c2_utf8
3586 559684         : (c1 == c2)
3587           ? CHRTEST_NOT_A_CP_1
3588 474421         : CHRTEST_NOT_A_CP_2;
3589           }
3590 148864         else if (c1 > 255) {
3591 135068         if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3592           can represent */
3593           return FALSE;
3594           }
3595            
3596 135048         *c1p = *c2p = c2; /* c2 is the only representable value */
3597           }
3598           else { /* c1 is representable; see about c2 */
3599 132538         *c1p = c1;
3600 127090         *c2p = (c2 < 256) ? c2 : c1;
3601           }
3602            
3603           return TRUE;
3604           }
3605            
3606           /* returns -1 on failure, $+[0] on success */
3607           STATIC SSize_t
3608 2552         S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3609           {
3610           #if PERL_VERSION < 9 && !defined(PERL_CORE)
3611           dMY_CXT;
3612           #endif
3613           dVAR;
3614 1776         const bool utf8_target = reginfo->is_utf8_target;
3615 128866         const U32 uniflags = UTF8_ALLOW_DEFAULT;
3616 253544422         REGEXP *rex_sv = reginfo->prog;
3617 249016736         regexp *rex = ReANY(rex_sv);
3618 170452682         RXi_GET_DECL(rex,rexi);
3619           /* the current state. This is a cached copy of PL_regmatch_state */
3620           regmatch_state *st;
3621           /* cache heavy used fields of st in registers */
3622           regnode *scan;
3623           regnode *next;
3624           U32 n = 0; /* general value; init to avoid compiler warning */
3625           SSize_t ln = 0; /* len or last; init to avoid compiler warning */
3626           char *locinput = startpos;
3627           char *pushinput; /* where to continue after a PUSH */
3628           I32 nextchr; /* is always set to UCHARAT(locinput) */
3629            
3630           bool result = 0; /* return value of S_regmatch */
3631           int depth = 0; /* depth of backtrack stack */
3632           U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3633 120297742         const U32 max_nochange_depth =
3634           (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3635 813210         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3636           regmatch_state *yes_state = NULL; /* state to pop to on success of
3637           subpattern */
3638           /* mark_state piggy backs on the yes_state logic so that when we unwind
3639           the stack on success we can update the mark_state as we go */
3640           regmatch_state *mark_state = NULL; /* last mark state we have seen */
3641           regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3642           struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3643           U32 state_num;
3644           bool no_final = 0; /* prevent failure from backtracking? */
3645           bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3646           char *startpoint = locinput;
3647           SV *popmark = NULL; /* are we looking for a mark? */
3648           SV *sv_commit = NULL; /* last mark name seen in failure */
3649           SV *sv_yes_mark = NULL; /* last mark name we have seen
3650           during a successful match */
3651           U32 lastopen = 0; /* last open we saw */
3652 406626         bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3653 848506         SV* const oreplsv = GvSV(PL_replgv);
3654           /* these three flags are set by various ops to signal information to
3655           * the very next op. They have a useful lifetime of exactly one loop
3656           * iteration, and are not preserved or restored by state pushes/pops
3657           */
3658           bool sw = 0; /* the condition value in (?(cond)a|b) */
3659           bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3660           int logical = 0; /* the following EVAL is:
3661           0: (?{...})
3662           1: (?(?{...})X|Y)
3663           2: (??{...})
3664           or the following IFMATCH/UNLESSM is:
3665           false: plain (?=foo)
3666           true: used as a condition: (?(?=foo))
3667           */
3668           PAD* last_pad = NULL;
3669           dMULTICALL;
3670           I32 gimme = G_SCALAR;
3671           CV *caller_cv = NULL; /* who called us */
3672           CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3673           CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3674 872782         U32 maxopenparen = 0; /* max '(' index seen so far */
3675           int to_complement; /* Invert the result? */
3676           _char_class_number classnum;
3677 1411512         bool is_utf8_pat = reginfo->is_utf8_pat;
3678            
3679           #ifdef DEBUGGING
3680 1411512         GET_RE_DEBUG_FLAGS_DECL;
3681           #endif
3682            
3683           /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3684           multicall_oldcatch = 0;
3685           multicall_cv = NULL;
3686           cx = NULL;
3687           PERL_UNUSED_VAR(multicall_cop);
3688           PERL_UNUSED_VAR(newsp);
3689            
3690            
3691 1411512         PERL_ARGS_ASSERT_REGMATCH;
3692            
3693 920324         DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3694           PerlIO_printf(Perl_debug_log,"regmatch start\n");
3695           }));
3696            
3697 920324         st = PL_regmatch_state;
3698            
3699           /* Note that nextchr is a byte even in UTF */
3700           SET_nextchr;
3701           scan = prog;
3702 462520         while (scan != NULL) {
3703            
3704 462478         DEBUG_EXECUTE_r( {
3705           SV * const prop = sv_newmortal();
3706           regnode *rnext=regnext(scan);
3707           DUMP_EXEC_POS( locinput, scan, utf8_target );
3708           regprop(rex, prop, scan);
3709          
3710           PerlIO_printf(Perl_debug_log,
3711           "%3"IVdf":%*s%s(%"IVdf")\n",
3712           (IV)(scan - rexi->program), depth*2, "",
3713           SvPVX_const(prop),
3714           (PL_regkind[OP(scan)] == END || !rnext) ?
3715           0 : (IV)(rnext - rexi->program));
3716           });
3717            
3718 735670         next = scan + NEXT_OFF(scan);
3719 491336         if (next == scan)
3720           next = NULL;
3721 366988         state_num = OP(scan);
3722            
3723           reenter_switch:
3724           to_complement = 0;
3725            
3726 201142         SET_nextchr;
3727 172098         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3728            
3729 161836         switch (state_num) {
3730           case BOL: /* /^../ */
3731 161688         if (locinput == reginfo->strbeg)
3732           break;
3733           sayNO;
3734            
3735           case MBOL: /* /^../m */
3736 498         if (locinput == reginfo->strbeg ||
3737 12146608         (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3738           {
3739           break;
3740           }
3741           sayNO;
3742            
3743           case SBOL: /* /^../s */
3744 5450         if (locinput == reginfo->strbeg)
3745           break;
3746           sayNO;
3747            
3748           case GPOS: /* \G */
3749 5222         if (locinput == reginfo->ganch)
3750           break;
3751           sayNO;
3752            
3753           case KEEPS: /* \K */
3754           /* update the startpoint */
3755 4218         st->u.keeper.val = rex->offs[0].start;
3756 4002         rex->offs[0].start = locinput - reginfo->strbeg;
3757 4002         PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3758           assert(0); /*NOTREACHED*/
3759           case KEEPS_next_fail:
3760           /* rollback the start point change */
3761 4002         rex->offs[0].start = st->u.keeper.val;
3762 4218         sayNO_SILENT;
3763           assert(0); /*NOTREACHED*/
3764            
3765           case EOL: /* /..$/ */
3766           goto seol;
3767            
3768           case MEOL: /* /..$/m */
3769 3990         if (!NEXTCHR_IS_EOS && nextchr != '\n')
3770           sayNO;
3771           break;
3772            
3773           case SEOL: /* /..$/s */
3774           seol:
3775 3992         if (!NEXTCHR_IS_EOS && nextchr != '\n')
3776           sayNO;
3777 3776         if (reginfo->strend - locinput > 1)
3778           sayNO;
3779           break;
3780            
3781           case EOS: /* \z */
3782 3774         if (!NEXTCHR_IS_EOS)
3783           sayNO;
3784           break;
3785            
3786           case SANY: /* /./s */
3787 228         if (NEXTCHR_IS_EOS)
3788           sayNO;
3789           goto increment_locinput;
3790            
3791           case CANY: /* \C */
3792 228         if (NEXTCHR_IS_EOS)
3793           sayNO;
3794 17744483         locinput++;
3795 11204186         break;
3796            
3797           case REG_ANY: /* /./ */
3798 12142390         if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3799           sayNO;
3800           goto increment_locinput;
3801            
3802            
3803           #undef ST
3804           #define ST st->u.trie
3805           case TRIEC: /* (ab|cd) with known charclass */
3806           /* In this case the charclass data is available inline so
3807           we can fail fast without a lot of extra overhead.
3808           */
3809 1170         if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3810 1168         DEBUG_EXECUTE_r(
3811           PerlIO_printf(Perl_debug_log,
3812           "%*s %sfailed to match trie start class...%s\n",
3813           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3814           );
3815           sayNO_SILENT;
3816           assert(0); /* NOTREACHED */
3817           }
3818           /* FALL THROUGH */
3819           case TRIE: /* (ab|cd) */
3820           /* the basic plan of execution of the trie is:
3821           * At the beginning, run though all the states, and
3822           * find the longest-matching word. Also remember the position
3823           * of the shortest matching word. For example, this pattern:
3824           * 1 2 3 4 5
3825           * ab|a|x|abcd|abc
3826           * when matched against the string "abcde", will generate
3827           * accept states for all words except 3, with the longest
3828           * matching word being 4, and the shortest being 2 (with
3829           * the position being after char 1 of the string).
3830           *
3831           * Then for each matching word, in word order (i.e. 1,2,4,5),
3832           * we run the remainder of the pattern; on each try setting
3833           * the current position to the character following the word,
3834           * returning to try the next word on failure.
3835           *
3836           * We avoid having to build a list of words at runtime by
3837           * using a compile-time structure, wordinfo[].prev, which
3838           * gives, for each word, the previous accepting word (if any).
3839           * In the case above it would contain the mappings 1->2, 2->0,
3840           * 3->0, 4->5, 5->1. We can use this table to generate, from
3841           * the longest word (4 above), a list of all words, by
3842           * following the list of prev pointers; this gives us the
3843           * unordered list 4,5,1,2. Then given the current word we have
3844           * just tried, we can go through the list and find the
3845           * next-biggest word to try (so if we just failed on word 2,
3846           * the next in the list is 4).
3847           *
3848           * Since at runtime we don't record the matching position in
3849           * the string for each word, we have to work that out for
3850           * each word we're about to process. The wordinfo table holds
3851           * the character length of each word; given that we recorded
3852           * at the start: the position of the shortest word and its
3853           * length in chars, we just need to move the pointer the
3854           * difference between the two char lengths. Depending on
3855           * Unicode status and folding, that's cheap or expensive.
3856           *
3857           * This algorithm is optimised for the case where are only a
3858           * small number of accept states, i.e. 0,1, or maybe 2.
3859           * With lots of accepts states, and having to try all of them,
3860           * it becomes quadratic on number of accept states to find all
3861           * the next words.
3862           */
3863            
3864           {
3865           /* what type of TRIE am I? (utf8 makes this contextual) */
3866 1174         DECL_TRIE_TYPE(scan);
3867            
3868           /* what trie are we using right now */
3869 110         reg_trie_data * const trie
3870 110         = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3871 110         HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3872 10947424         U32 state = trie->startstate;
3873            
3874 10947424         if ( trie->bitmap
3875 10947420         && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3876           {
3877 1193702         if (trie->states[ state ].wordnum) {
3878 1193702         DEBUG_EXECUTE_r(
3879           PerlIO_printf(Perl_debug_log,
3880           "%*s %smatched empty string...%s\n",
3881           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3882           );
3883 1193702         if (!trie->jump)
3884           break;
3885           } else {
3886 0         DEBUG_EXECUTE_r(
3887           PerlIO_printf(Perl_debug_log,
3888           "%*s %sfailed to match trie start class...%s\n",
3889           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3890           );
3891           sayNO_SILENT;
3892           }
3893           }
3894            
3895           {
3896           U8 *uc = ( U8* )locinput;
3897            
3898 12146612         STRLEN len = 0;
3899 58775536         STRLEN foldlen = 0;
3900           U8 *uscan = (U8*)NULL;
3901           U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3902           U32 charcount = 0; /* how many input chars we have matched */
3903           U32 accepted = 0; /* have we seen any accepting states? */
3904            
3905 57344290         ST.jump = trie->jump;
3906 826002         ST.me = scan;
3907 522784         ST.firstpos = NULL;
3908 56518292         ST.longfold = FALSE; /* char longer if folded => it's harder */
3909 27052025         ST.nextword = 0;
3910            
3911           /* fully traverse the TRIE; note the position of the
3912           shortest accept state and the wordnum of the longest
3913           accept state */
3914            
3915 112         while ( state && uc <= (U8*)(reginfo->strend) ) {
3916 194         U32 base = trie->states[ state ].trans.base;
3917 104         UV uvc = 0;
3918           U16 charid = 0;
3919           U16 wordnum;
3920 94         wordnum = trie->states[ state ].wordnum;
3921            
3922 24         if (wordnum) { /* it's an accept state */
3923 8         if (!accepted) {
3924           accepted = 1;
3925           /* record first match position */
3926 88         if (ST.longfold) {
3927 1250552         ST.firstpos = (U8*)locinput;
3928 7708673         ST.firstchars = 0;
3929           }
3930           else {
3931 182216         ST.firstpos = uc;
3932 182216         ST.firstchars = charcount;
3933           }
3934           }
3935 7839793         if (!ST.nextword || wordnum < ST.nextword)
3936 7664131         ST.nextword = wordnum;
3937 5121993         ST.topword = wordnum;
3938           }
3939            
3940 5141532         DEBUG_TRIE_EXECUTE_r({
3941           DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3942           PerlIO_printf( Perl_debug_log,
3943           "%*s %sState: %4"UVxf" Accepted: %c ",
3944           2+depth * 2, "", PL_colors[4],
3945           (UV)state, (accepted ? 'Y' : 'N'));
3946           });
3947            
3948           /* read a char and goto next state */
3949 293500         if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3950           I32 offset;
3951 290924         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3952           uscan, len, uvc, charid, foldlen,
3953           foldbuf, uniflags);
3954 159530         charcount++;
3955 54556         if (foldlen>0)
3956 131394         ST.longfold = TRUE;
3957 24908         if (charid &&
3958 18676         ( ((offset =
3959 11412         base + charid - 1 - trie->uniquecharcount)) >= 0)
3960            
3961 118960         && ((U32)offset < trie->lasttrans)
3962 118960         && trie->trans[offset].check == state)
3963           {
3964 79528         state = trie->trans[offset].next;
3965           }
3966           else {
3967           state = 0;
3968           }
3969 46         uc += len;
3970            
3971           }
3972           else {
3973           state = 0;
3974           }
3975 50         DEBUG_TRIE_EXECUTE_r(
3976           PerlIO_printf( Perl_debug_log,
3977           "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3978           charid, uvc, (UV)state, PL_colors[5] );
3979           );
3980           }
3981 79522         if (!accepted)
3982           sayNO;
3983            
3984           /* calculate total number of accept states */
3985           {
3986 79522         U16 w = ST.topword;
3987           accepted = 0;
3988 39444         while (w) {
3989 39260         w = trie->wordinfo[w].prev;
3990 39260         accepted++;
3991           }
3992 120         ST.accepted = accepted;
3993           }
3994            
3995 120         DEBUG_EXECUTE_r(
3996           PerlIO_printf( Perl_debug_log,
3997           "%*s %sgot %"IVdf" possible matches%s\n",
3998           REPORT_CODE_OFF + depth * 2, "",
3999           PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4000           );
4001           goto trie_first_try; /* jump into the fail handler */
4002           }}
4003           assert(0); /* NOTREACHED */
4004            
4005           case TRIE_next_fail: /* we failed - try next alternative */
4006           {
4007           U8 *uc;
4008 4         if ( ST.jump) {
4009 4         REGCP_UNWIND(ST.cp);
4010 104         UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4011           }
4012 71         if (!--ST.accepted) {
4013 4         DEBUG_EXECUTE_r({
4014           PerlIO_printf( Perl_debug_log,
4015           "%*s %sTRIE failed...%s\n",
4016           REPORT_CODE_OFF+depth*2, "",
4017           PL_colors[4],
4018           PL_colors[5] );
4019           });
4020           sayNO_SILENT;
4021           }
4022           {
4023           /* Find next-highest word to process. Note that this code
4024           * is O(N^2) per trie run (O(N) per branch), so keep tight */
4025           U16 min = 0;
4026           U16 word;
4027 118698         U16 const nextword = ST.nextword;
4028 4379828         reg_trie_wordinfo * const wordinfo
4029 4067818         = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4030 3731350         for (word=ST.topword; word; word=wordinfo[word].prev) {
4031 3731350         if (word > nextword && (!min || word < min))
4032           min = word;
4033           }
4034 16         ST.nextword = min;
4035           }
4036            
4037           trie_first_try:
4038 12         if (do_cutgroup) {
4039           do_cutgroup = 0;
4040           no_final = 0;
4041           }
4042            
4043 6         if ( ST.jump) {
4044 336468         ST.lastparen = rex->lastparen;
4045 56         ST.lastcloseparen = rex->lastcloseparen;
4046 2         REGCP_SET(ST.cp);
4047           }
4048            
4049           /* find start char of end of current word */
4050           {
4051           U32 chars; /* how many chars to skip */
4052 336470         reg_trie_data * const trie
4053 336470         = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4054            
4055 335027         assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4056           >= ST.firstchars);
4057 1788         chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4058 1110         - ST.firstchars;
4059 1110         uc = ST.firstpos;
4060            
4061 678         if (ST.longfold) {
4062           /* the hard option - fold each char in turn and find
4063           * its folded length (which may be different */
4064           U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4065           STRLEN foldlen;
4066           STRLEN len;
4067           UV uvc;
4068           U8 *uscan;
4069            
4070 132         while (chars) {
4071 224         if (utf8_target) {
4072 88         uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4073           uniflags);
4074 26         uc += len;
4075           }
4076           else {
4077 542         uvc = *uc;
4078 104         uc++;
4079           }
4080 158         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4081           uscan = foldbuf;
4082 56         while (foldlen) {
4083 2         if (!--chars)
4084           break;
4085 552         uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4086           uniflags);
4087 496         uscan += len;
4088 114         foldlen -= len;
4089           }
4090           }
4091           }
4092           else {
4093 442         if (utf8_target)
4094 382         while (chars--)
4095 446         uc += UTF8SKIP(uc);
4096           else
4097 228         uc += chars;
4098           }
4099           }
4100            
4101 232         scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4102 224         ? ST.jump[ST.nextword]
4103 227         : NEXT_OFF(ST.me));
4104            
4105 226         DEBUG_EXECUTE_r({
4106           PerlIO_printf( Perl_debug_log,
4107           "%*s %sTRIE matched word #%d, continuing%s\n",
4108           REPORT_CODE_OFF+depth*2, "",
4109           PL_colors[4],
4110           ST.nextword,
4111           PL_colors[5]
4112           );
4113           });
4114            
4115 341         if (ST.accepted > 1 || has_cutgroup) {
4116 112         PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4117           assert(0); /* NOTREACHED */
4118           }
4119           /* only one choice left - just continue */
4120 8         DEBUG_EXECUTE_r({
4121           AV *const trie_words
4122           = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4123           SV ** const tmp = av_fetch( trie_words,
4124           ST.nextword-1, 0 );
4125           SV *sv= tmp ? sv_newmortal() : NULL;
4126            
4127           PerlIO_printf( Perl_debug_log,
4128           "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4129           REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4130           ST.nextword,
4131           tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4132           PL_colors[0], PL_colors[1],
4133           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4134           )
4135           : "not compiled under -Dr",
4136           PL_colors[5] );
4137           });
4138            
4139           locinput = (char*)uc;
4140 344         continue; /* execute rest of RE */
4141           assert(0); /* NOTREACHED */
4142           }
4143           #undef ST
4144            
4145           case EXACT: { /* /abc/ */
4146 206         char *s = STRING(scan);
4147 48         ln = STR_LEN(scan);
4148 338178         if (utf8_target != is_utf8_pat) {
4149           /* The target and the pattern have differing utf8ness. */
4150           char *l = locinput;
4151 328218         const char * const e = s + ln;
4152            
4153 2776         if (utf8_target) {
4154           /* The target is utf8, the pattern is not utf8.
4155           * Above-Latin1 code points can't match the pattern;
4156           * invariants match exactly, and the other Latin1 ones need
4157           * to be downgraded to a single byte in order to do the
4158           * comparison. (If we could be confident that the target
4159           * is not malformed, this could be refactored to have fewer
4160           * tests by just assuming that if the first bytes match, it
4161           * is an invariant, but there are tests in the test suite
4162           * dealing with (??{...}) which violate this) */
4163 336468         while (s < e) {
4164 852         if (l >= reginfo->strend
4165 426         || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4166           {
4167           sayNO;
4168           }
4169 1746         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4170 3091         if (*l != *s) {
4171           sayNO;
4172           }
4173 8648         l++;
4174           }
4175           else {
4176 8648         if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4177           {
4178           sayNO;
4179           }
4180 852         l += 2;
4181           }
4182 426         s++;
4183           }
4184           }
4185           else {
4186           /* The target is not utf8, the pattern is utf8. */
4187 1778         while (s < e) {
4188 61845         if (l >= reginfo->strend
4189 122465         || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4190           {
4191           sayNO;
4192           }
4193 122465         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4194 131113         if (*s != *l) {
4195           sayNO;
4196           }
4197 131113         s++;
4198           }
4199           else {
4200 131113         if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4201           {
4202           sayNO;
4203           }
4204 128813         s += 2;
4205           }
4206 127283         l++;
4207           }
4208           }
4209           locinput = l;
4210           }
4211           else {
4212           /* The target and the pattern have the same utf8ness. */
4213           /* Inline the first character, for speed. */
4214 127325         if (reginfo->strend - locinput < ln
4215 5930         || UCHARAT(s) != nextchr
4216 5430         || (ln > 1 && memNE(s, locinput, ln)))
4217           {
4218           sayNO;
4219           }
4220 5430         locinput += ln;
4221           }
4222           break;
4223           }
4224            
4225           case EXACTFL: { /* /abc/il */
4226           re_fold_t folder;
4227           const U8 * fold_array;
4228           const char * s;
4229           U32 fold_utf8_flags;
4230            
4231 4132         RX_MATCH_TAINTED_on(reginfo->prog);
4232           folder = foldEQ_locale;
4233           fold_array = PL_fold_locale;
4234           fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4235 4132         goto do_exactf;
4236            
4237           case EXACTFU_SS: /* /\x{df}/iu */
4238           case EXACTFU: /* /abc/iu */
4239           folder = foldEQ_latin1;
4240           fold_array = PL_fold_latin1;
4241 180062         fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4242 157351         goto do_exactf;
4243            
4244           case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
4245           patterns */
4246 334         assert(! is_utf8_pat);
4247           /* FALL THROUGH */
4248           case EXACTFA: /* /abc/iaa */
4249           folder = foldEQ_latin1;
4250           fold_array = PL_fold_latin1;
4251           fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4252           goto do_exactf;
4253            
4254           case EXACTF: /* /abc/i This node only generated for
4255           non-utf8 patterns */
4256 334         assert(! is_utf8_pat);
4257           folder = foldEQ;
4258           fold_array = PL_fold;
4259           fold_utf8_flags = 0;
4260            
4261           do_exactf:
4262 22869         s = STRING(scan);
4263 22869         ln = STR_LEN(scan);
4264            
4265 21966         if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4266           /* Either target or the pattern are utf8, or has the issue where
4267           * the fold lengths may differ. */
4268           const char * const l = locinput;
4269 3750         char *e = reginfo->strend;
4270            
4271 102         if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
4272           l, &e, 0, utf8_target, fold_utf8_flags))
4273           {
4274           sayNO;
4275           }
4276 19935         locinput = e;
4277 19935         break;
4278           }
4279            
4280           /* Neither the target nor the pattern are utf8 */
4281 2997858         if (UCHARAT(s) != nextchr
4282 2290614         && !NEXTCHR_IS_EOS
4283 0         && UCHARAT(s) != fold_array[nextchr])
4284           {
4285           sayNO;
4286           }
4287 2290614         if (reginfo->strend - locinput < ln)
4288           sayNO;
4289 0         if (ln > 1 && ! folder(s, locinput, ln))
4290           sayNO;
4291 2997858         locinput += ln;
4292 2997242         break;
4293           }
4294            
4295           /* XXX Could improve efficiency by separating these all out using a
4296           * macro or in-line function. At that point regcomp.c would no longer
4297           * have to set the FLAGS fields of these */
4298           case BOUNDL: /* /\b/l */
4299           case NBOUNDL: /* /\B/l */
4300 2997242         RX_MATCH_TAINTED_on(reginfo->prog);
4301           /* FALL THROUGH */
4302           case BOUND: /* /\b/ */
4303           case BOUNDU: /* /\b/u */
4304           case BOUNDA: /* /\b/a */
4305           case NBOUND: /* /\B/ */
4306           case NBOUNDU: /* /\B/u */
4307           case NBOUNDA: /* /\B/a */
4308           /* was last char in word? */
4309 616         if (utf8_target
4310 616         && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4311 1235174         && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4312           {
4313 1906         if (locinput == reginfo->strbeg)
4314           ln = '\n';
4315           else {
4316 0         const U8 * const r =
4317 1235174         reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4318            
4319 1235174         ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4320           }
4321 1235174         if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4322 1235174         ln = isWORDCHAR_uni(ln);
4323 1235174         if (NEXTCHR_IS_EOS)
4324           n = 0;
4325           else {
4326 1235174         LOAD_UTF8_CHARCLASS_ALNUM();
4327 35934         n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4328           utf8_target);
4329           }
4330           }
4331           else {
4332 1235174         ln = isWORDCHAR_LC_uvchr(ln);
4333 1235174         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4334           }
4335           }
4336           else {
4337            
4338           /* Here the string isn't utf8, or is utf8 and only ascii
4339           * characters are to match \w. In the latter case looking at
4340           * the byte just prior to the current one may be just the final
4341           * byte of a multi-byte character. This is ok. There are two
4342           * cases:
4343           * 1) it is a single byte character, and then the test is doing
4344           * just what it's supposed to.
4345           * 2) it is a multi-byte character, in which case the final
4346           * byte is never mistakable for ASCII, and so the test
4347           * will say it is not a word character, which is the
4348           * correct answer. */
4349 100768         ln = (locinput != reginfo->strbeg) ?
4350 50384         UCHARAT(locinput - 1) : '\n';
4351 1184790         switch (FLAGS(scan)) {
4352           case REGEX_UNICODE_CHARSET:
4353 1026212         ln = isWORDCHAR_L1(ln);
4354 158578         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4355 158578         break;
4356           case REGEX_LOCALE_CHARSET:
4357 1235174         ln = isWORDCHAR_LC(ln);
4358 47410         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4359 47410         break;
4360           case REGEX_DEPENDS_CHARSET:
4361 47410         ln = isWORDCHAR(ln);
4362 44505         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4363 120933         break;
4364           case REGEX_ASCII_RESTRICTED_CHARSET:
4365           case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4366 1235174         ln = isWORDCHAR_A(ln);
4367 1235174         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4368 1235174         break;
4369           default:
4370 207512         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4371           break;
4372           }
4373           }
4374           /* Note requires that all BOUNDs be lower than all NBOUNDs in
4375           * regcomp.sym */
4376 1027662         if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4377           sayNO;
4378           break;
4379            
4380           case ANYOF: /* /[abc]/ */
4381           case ANYOF_WARN_SUPER:
4382 1027662         if (NEXTCHR_IS_EOS)
4383           sayNO;
4384 1235174         if (utf8_target) {
4385 1234540         if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4386           sayNO;
4387 1235174         locinput += UTF8SKIP(locinput);
4388           }
4389           else {
4390 1235174         if (!REGINCLASS(rex, scan, (U8*)locinput))
4391           sayNO;
4392 1235174         locinput++;
4393           }
4394           break;
4395            
4396           /* The argument (FLAGS) to all the POSIX node types is the class number
4397           * */
4398            
4399           case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4400           to_complement = 1;
4401           /* FALLTHROUGH */
4402            
4403           case POSIXL: /* \w or [:punct:] etc. under /l */
4404 1235174         if (NEXTCHR_IS_EOS)
4405           sayNO;
4406            
4407           /* The locale hasn't influenced the outcome before this, so defer
4408           * tainting until now */
4409 1235174         RX_MATCH_TAINTED_on(reginfo->prog);
4410            
4411           /* Use isFOO_lc() for characters within Latin1. (Note that
4412           * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4413           * wouldn't be invariant) */
4414 32         if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4415 32         if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4416           sayNO;
4417           }
4418           }
4419 1235174         else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4420 1235174         if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4421           (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4422           *(locinput + 1))))))
4423           {
4424           sayNO;
4425           }
4426           }
4427           else { /* Here, must be an above Latin-1 code point */
4428           goto utf8_posix_not_eos;
4429           }
4430            
4431           /* Here, must be utf8 */
4432 1235174         locinput += UTF8SKIP(locinput);
4433 1235148         break;
4434            
4435           case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4436           to_complement = 1;
4437           /* FALLTHROUGH */
4438            
4439           case POSIXD: /* \w or [:punct:] etc. under /d */
4440 1235148         if (utf8_target) {
4441           goto utf8_posix;
4442           }
4443           goto posixa;
4444            
4445           case NPOSIXA: /* \W or [:^punct:] etc. under /a */
4446            
4447 2         if (NEXTCHR_IS_EOS) {
4448           sayNO;
4449           }
4450            
4451           /* All UTF-8 variants match */
4452 1235146         if (! UTF8_IS_INVARIANT(nextchr)) {
4453           goto increment_locinput;
4454           }
4455            
4456           to_complement = 1;
4457           /* FALLTHROUGH */
4458            
4459           case POSIXA: /* \w or [:punct:] etc. under /a */
4460            
4461           posixa:
4462           /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4463           * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4464           * character is a single byte */
4465            
4466 1235146         if (NEXTCHR_IS_EOS
4467 1235148         || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4468           FLAGS(scan)))))
4469           {
4470           sayNO;
4471           }
4472            
4473           /* Here we are either not in utf8, or we matched a utf8-invariant,
4474           * so the next char is the next byte */
4475 1235148         locinput++;
4476 1016512         break;
4477            
4478           case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4479           to_complement = 1;
4480           /* FALLTHROUGH */
4481            
4482           case POSIXU: /* \w or [:punct:] etc. under /u */
4483           utf8_posix:
4484 218636         if (NEXTCHR_IS_EOS) {
4485           sayNO;
4486           }
4487           utf8_posix_not_eos:
4488            
4489           /* Use _generic_isCC() for characters within Latin1. (Note that
4490           * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4491           * wouldn't be invariant) */
4492 506         if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4493 218130         if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4494           FLAGS(scan)))))
4495           {
4496           sayNO;
4497           }
4498 218130         locinput++;
4499           }
4500 218130         else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4501 208310         if (! (to_complement
4502 218130         ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4503           *(locinput + 1)),
4504           FLAGS(scan)))))
4505           {
4506           sayNO;
4507           }
4508 9820         locinput += 2;
4509           }
4510           else { /* Handle above Latin-1 code points */
4511 860         classnum = (_char_class_number) FLAGS(scan);
4512 860         if (classnum < _FIRST_NON_SWASH_CC) {
4513            
4514           /* Here, uses a swash to find such code points. Load if if
4515           * not done already */
4516 0         if (! PL_utf8_swash_ptrs[classnum]) {
4517 218130         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4518           PL_utf8_swash_ptrs[classnum]
4519 9820         = _core_swash_init("utf8",
4520           swash_property_names[classnum],
4521           &PL_sv_undef, 1, 0, NULL, &flags);
4522           }
4523 9820         if (! (to_complement
4524 1235148         ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4525           (U8 *) locinput, TRUE))))
4526           {
4527           sayNO;
4528           }
4529           }
4530           else { /* Here, uses macros to find above Latin-1 code points */
4531 1235148         switch (classnum) {
4532           case _CC_ENUM_SPACE: /* XXX would require separate
4533           code if we revert the change
4534           of \v matching this */
4535           case _CC_ENUM_PSXSPC:
4536 1235148         if (! (to_complement
4537 1235148         ^ cBOOL(is_XPERLSPACE_high(locinput))))
4538           {
4539           sayNO;
4540           }
4541           break;
4542           case _CC_ENUM_BLANK:
4543 218130         if (! (to_complement
4544 208310         ^ cBOOL(is_HORIZWS_high(locinput))))
4545           {
4546           sayNO;
4547           }
4548           break;
4549           case _CC_ENUM_XDIGIT:
4550 9820         if (! (to_complement
4551 0         ^ cBOOL(is_XDIGIT_high(locinput))))
4552           {
4553           sayNO;
4554           }
4555           break;
4556           case _CC_ENUM_VERTSPACE:
4557 0         if (! (to_complement
4558 9820         ^ cBOOL(is_VERTWS_high(locinput))))
4559           {
4560           sayNO;
4561           }
4562           break;
4563           default: /* The rest, e.g. [:cntrl:], can't match
4564           above Latin1 */
4565 14730         if (! to_complement) {
4566           sayNO;
4567           }
4568           break;
4569           }
4570           }
4571 9820         locinput += UTF8SKIP(locinput);
4572           }
4573           break;
4574            
4575           case CLUMP: /* Match \X: logical Unicode character. This is defined as
4576           a Unicode extended Grapheme Cluster */
4577           /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4578           extended Grapheme Cluster is:
4579            
4580           CR LF
4581           | Prepend* Begin Extend*
4582           | .
4583            
4584           Begin is: ( Special_Begin | ! Control )
4585           Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4586           Extend is: ( Grapheme_Extend | Spacing_Mark )
4587           Control is: [ GCB_Control | CR | LF ]
4588           Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4589            
4590           If we create a 'Regular_Begin' = Begin - Special_Begin, then
4591           we can rewrite
4592            
4593           Begin is ( Regular_Begin + Special Begin )
4594            
4595           It turns out that 98.4% of all Unicode code points match
4596           Regular_Begin. Doing it this way eliminates a table match in
4597           the previous implementation for almost all Unicode code points.
4598            
4599           There is a subtlety with Prepend* which showed up in testing.
4600           Note that the Begin, and only the Begin is required in:
4601           | Prepend* Begin Extend*
4602           Also, Begin contains '! Control'. A Prepend must be a
4603           '! Control', which means it must also be a Begin. What it
4604           comes down to is that if we match Prepend* and then find no
4605           suitable Begin afterwards, that if we backtrack the last
4606           Prepend, that one will be a suitable Begin.
4607           */
4608            
4609 14730         if (NEXTCHR_IS_EOS)
4610           sayNO;
4611 9820         if (! utf8_target) {
4612            
4613           /* Match either CR LF or '.', as all the other possibilities
4614           * require utf8 */
4615 0         locinput++; /* Match the . or CR */
4616 218130         if (nextchr == '\r' /* And if it was CR, and the next is LF,
4617           match the LF */
4618 218130         && locinput < reginfo->strend
4619 218130         && UCHARAT(locinput) == '\n')
4620           {
4621 218130         locinput++;
4622           }
4623           }
4624           else {
4625            
4626           /* Utf8: See if is ( CR LF ); already know that locinput <
4627           * reginfo->strend, so locinput+1 is in bounds */
4628 218130         if ( nextchr == '\r' && locinput+1 < reginfo->strend
4629 218130         && UCHARAT(locinput + 1) == '\n')
4630           {
4631 218130         locinput += 2;
4632           }
4633           else {
4634           STRLEN len;
4635            
4636           /* In case have to backtrack to beginning, then match '.' */
4637           char *starting = locinput;
4638            
4639           /* In case have to backtrack the last prepend */
4640           char *previous_prepend = NULL;
4641            
4642 218130         LOAD_UTF8_CHARCLASS_GCB();
4643            
4644           /* Match (prepend)* */
4645           while (locinput < reginfo->strend
4646           && (len = is_GCB_Prepend_utf8(locinput)))
4647           {
4648           previous_prepend = locinput;
4649           locinput += len;
4650           }
4651            
4652           /* As noted above, if we matched a prepend character, but
4653           * the next thing won't match, back off the last prepend we
4654           * matched, as it is guaranteed to match the begin */
4655           if (previous_prepend
4656           && (locinput >= reginfo->strend
4657           || (! swash_fetch(PL_utf8_X_regular_begin,
4658           (U8*)locinput, utf8_target)
4659           && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4660           )
4661           {
4662           locinput = previous_prepend;
4663           }
4664            
4665           /* Note that here we know reginfo->strend > locinput, as we
4666           * tested that upon input to this switch case, and if we
4667           * moved locinput forward, we tested the result just above
4668           * and it either passed, or we backed off so that it will
4669           * now pass */
4670 218130         if (swash_fetch(PL_utf8_X_regular_begin,
4671           (U8*)locinput, utf8_target)) {
4672 3215988         locinput += UTF8SKIP(locinput);
4673           }
4674 3215988         else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4675            
4676           /* Here did not match the required 'Begin' in the
4677           * second term. So just match the very first
4678           * character, the '.' of the final term of the regex */
4679 3215988         locinput = starting + UTF8SKIP(starting);
4680 3215988         goto exit_utf8;
4681           } else {
4682            
4683           /* Here is a special begin. It can be composed of
4684           * several individual characters. One possibility is
4685           * RI+ */
4686 3215988         if ((len = is_GCB_RI_utf8(locinput))) {
4687 3215988         locinput += len;
4688 3215988         while (locinput < reginfo->strend
4689 3215988         && (len = is_GCB_RI_utf8(locinput)))
4690           {
4691 3215988         locinput += len;
4692           }
4693 3434148         } else if ((len = is_GCB_T_utf8(locinput))) {
4694           /* Another possibility is T+ */
4695 3215988         locinput += len;
4696 3215988         while (locinput < reginfo->strend
4697 3215988         && (len = is_GCB_T_utf8(locinput)))
4698           {
4699 378504         locinput += len;
4700           }
4701           } else {
4702            
4703           /* Here, neither RI+ nor T+; must be some other
4704           * Hangul. That means it is one of the others: L,
4705           * LV, LVT or V, and matches:
4706           * L* (L | LVT T* | V * V* T* | LV V* T*) */
4707            
4708           /* Match L* */
4709 378504         while (locinput < reginfo->strend
4710 561556         && (len = is_GCB_L_utf8(locinput)))
4711           {
4712 378504         locinput += len;
4713           }
4714            
4715           /* Here, have exhausted L*. If the next character
4716           * is not an LV, LVT nor V, it means we had to have
4717           * at least one L, so matches L+ in the original
4718           * equation, we have a complete hangul syllable.
4719           * Are done. */
4720            
4721 378504         if (locinput < reginfo->strend
4722 378504         && is_GCB_LV_LVT_V_utf8(locinput))
4723           {
4724           /* Otherwise keep going. Must be LV, LVT or V.
4725           * See if LVT, by first ruling out V, then LV */
4726 378504         if (! is_GCB_V_utf8(locinput)
4727           /* All but every TCount one is LV */
4728 378504         && (valid_utf8_to_uvchr((U8 *) locinput,
4729           NULL)
4730 378504         - SBASE)
4731 26438         % TCount != 0)
4732           {
4733 3469852         locinput += UTF8SKIP(locinput);
4734           } else {
4735            
4736           /* Must be V or LV. Take it, then match
4737           * V* */
4738 3469852         locinput += UTF8SKIP(locinput);
4739 3602116         while (locinput < reginfo->strend
4740 3469852         && (len = is_GCB_V_utf8(locinput)))
4741           {
4742 3469852         locinput += len;
4743           }
4744           }
4745            
4746           /* And any of LV, LVT, or V can be followed
4747           * by T* */
4748 3469852         while (locinput < reginfo->strend
4749 3469852         && (len = is_GCB_T_utf8(locinput)))
4750           {
4751 3469852         locinput += len;
4752           }
4753           }
4754           }
4755           }
4756            
4757           /* Match any extender */
4758 3469852         while (locinput < reginfo->strend
4759 3469852         && swash_fetch(PL_utf8_X_extend,
4760           (U8*)locinput, utf8_target))
4761           {
4762 1989360         locinput += UTF8SKIP(locinput);
4763           }
4764           }
4765           exit_utf8:
4766 149852615         if (locinput > reginfo->strend) sayNO;
4767           }
4768           break;
4769          
4770           case NREFFL: /* /\g{name}/il */
4771           { /* The capture buffer cases. The ones beginning with N for the
4772           named buffers just convert to the equivalent numbered and
4773           pretend they were called as the corresponding numbered buffer
4774           op. */
4775           /* don't initialize these in the declaration, it makes C++
4776           unhappy */
4777           const char *s;
4778           char type;
4779           re_fold_t folder;
4780           const U8 *fold_array;
4781           UV utf8_fold_flags;
4782            
4783 149852615         RX_MATCH_TAINTED_on(reginfo->prog);
4784           folder = foldEQ_locale;
4785           fold_array = PL_fold_locale;
4786           type = REFFL;
4787           utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4788 149852615         goto do_nref;
4789            
4790           case NREFFA: /* /\g{name}/iaa */
4791           folder = foldEQ_latin1;
4792           fold_array = PL_fold_latin1;
4793           type = REFFA;
4794           utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4795           goto do_nref;
4796            
4797           case NREFFU: /* /\g{name}/iu */
4798           folder = foldEQ_latin1;
4799           fold_array = PL_fold_latin1;
4800           type = REFFU;
4801           utf8_fold_flags = 0;
4802 143530275         goto do_nref;
4803            
4804           case NREFF: /* /\g{name}/i */
4805           folder = foldEQ;
4806           fold_array = PL_fold;
4807           type = REFF;
4808           utf8_fold_flags = 0;
4809 149852615         goto do_nref;
4810            
4811           case NREF: /* /\g{name}/ */
4812           type = REF;
4813           folder = NULL;
4814           fold_array = NULL;
4815           utf8_fold_flags = 0;
4816           do_nref:
4817            
4818           /* For the named back references, find the corresponding buffer
4819           * number */
4820 82383719         n = reg_check_named_buff_matched(rex,scan);
4821            
4822 82383719         if ( ! n ) {
4823           sayNO;
4824           }
4825           goto do_nref_ref_common;
4826            
4827           case REFFL: /* /\1/il */
4828 82383719         RX_MATCH_TAINTED_on(reginfo->prog);
4829           folder = foldEQ_locale;
4830           fold_array = PL_fold_locale;
4831           utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4832 74406611         goto do_ref;
4833            
4834           case REFFA: /* /\1/iaa */
4835           folder = foldEQ_latin1;
4836           fold_array = PL_fold_latin1;
4837           utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4838           goto do_ref;
4839            
4840           case REFFU: /* /\1/iu */
4841           folder = foldEQ_latin1;
4842           fold_array = PL_fold_latin1;
4843           utf8_fold_flags = 0;
4844 82383719         goto do_ref;
4845            
4846           case REFF: /* /\1/i */
4847           folder = foldEQ;
4848           fold_array = PL_fold;
4849           utf8_fold_flags = 0;
4850 82383719         goto do_ref;
4851            
4852           case REF: /* /\1/ */
4853           folder = NULL;
4854           fold_array = NULL;
4855           utf8_fold_flags = 0;
4856            
4857           do_ref:
4858 50         type = OP(scan);
4859 200         n = ARG(scan); /* which paren pair */
4860            
4861           do_nref_ref_common:
4862 200         ln = rex->offs[n].start;
4863 150         reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
4864 200         if (rex->lastparen < n || ln == -1)
4865           sayNO; /* Do not match unless seen CLOSEn. */
4866 100         if (ln == rex->offs[n].end)
4867           break;
4868            
4869 100         s = reginfo->strbeg + ln;
4870 100         if (type != REF /* REF can do byte comparison */
4871 100         && (utf8_target || type == REFFU))
4872           { /* XXX handle REFFL better */
4873 50         char * limit = reginfo->strend;
4874            
4875           /* This call case insensitively compares the entire buffer
4876           * at s, with the current input starting at locinput, but
4877           * not going off the end given by reginfo->strend, and
4878           * returns in upon success, how much of the
4879           * current input was matched */
4880 100         if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4881           locinput, &limit, 0, utf8_target, utf8_fold_flags))
4882           {
4883           sayNO;
4884           }
4885 100         locinput = limit;
4886 0         break;
4887           }
4888            
4889           /* Not utf8: Inline the first character, for speed. */
4890 2546         if (!NEXTCHR_IS_EOS &&
4891 2546         UCHARAT(s) != nextchr &&
4892 2546         (type == REF ||
4893 500         UCHARAT(s) != fold_array[nextchr]))
4894           sayNO;
4895 500         ln = rex->offs[n].end - ln;
4896 600         if (locinput + ln > reginfo->strend)
4897           sayNO;
4898 600         if (ln > 1 && (type == REF
4899 600         ? memNE(s, locinput, ln)
4900 176         : ! folder(s, locinput, ln)))
4901           sayNO;
4902 4978         locinput += ln;
4903 4978         break;
4904           }
4905            
4906           case NOTHING: /* null op; e.g. the 'nothing' following
4907           * the '*' in m{(a+|b)*}' */
4908           break;
4909           case TAIL: /* placeholder while compiling (A|B|C) */
4910           break;
4911            
4912           case BACK: /* ??? doesn't appear to be used ??? */
4913           break;
4914            
4915           #undef ST
4916           #define ST st->u.eval
4917           {
4918           SV *ret;
4919           REGEXP *re_sv;
4920           regexp *re;
4921           regexp_internal *rei;
4922           regnode *startpoint;
4923            
4924           case GOSTART: /* (?R) */
4925           case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4926 2200         if (cur_eval && cur_eval->locinput==locinput) {
4927 2778         if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4928 2778         Perl_croak(aTHX_ "Infinite recursion in regex");
4929 1952         if ( ++nochange_depth > max_nochange_depth )
4930 219286         Perl_croak(aTHX_
4931           "Pattern subroutine nesting without pos change"
4932           " exceeded limit in regex");
4933           } else {
4934           nochange_depth = 0;
4935           }
4936           re_sv = rex_sv;
4937           re = rex;
4938           rei = rexi;
4939 219286         if (OP(scan)==GOSUB) {
4940 47784842         startpoint = scan + ARG2L(scan);
4941 47784842         ST.close_paren = ARG(scan);
4942           } else {
4943 0         startpoint = rei->program+1;
4944 47784842         ST.close_paren = 0;
4945           }
4946           goto eval_recurse_doit;
4947           assert(0); /* NOTREACHED */
4948            
4949           case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4950 7819878         if (cur_eval && cur_eval->locinput==locinput) {
4951 47784842         if ( ++nochange_depth > max_nochange_depth )
4952 47784842         Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4953           } else {
4954           nochange_depth = 0;
4955           }
4956           {
4957           /* execute the code in the {...} */
4958            
4959 47784890         dSP;
4960           IV before;
4961 47784890         OP * const oop = PL_op;
4962 47784890         COP * const ocurcop = PL_curcop;
4963           OP *nop;
4964           CV *newcv;
4965            
4966           /* save *all* paren positions */
4967 47784890         regcppush(rex, 0, maxopenparen);
4968 47784890         REGCP_SET(runops_cp);
4969            
4970 47784890         if (!caller_cv)
4971 47784878         caller_cv = find_runcv(NULL);
4972            
4973 7117694         n = ARG(scan);
4974            
4975 7117694         if (rexi->data->what[n] == 'r') { /* code from an external qr */
4976 40667292         newcv = (ReANY(
4977           (REGEXP*)(rexi->data->data[n])
4978           ))->qr_anoncv
4979           ;
4980 40667244         nop = (OP*)rexi->data->data[n+1];
4981           }
4982 40667196         else if (rexi->data->what[n] == 'l') { /* literal code */
4983           newcv = caller_cv;
4984 65757916         nop = (OP*)rexi->data->data[n];
4985 65757916         assert(CvDEPTH(newcv));
4986           }
4987           else {
4988           /* literal with own CV */
4989 65757916         assert(rexi->data->what[n] == 'L');
4990 65757916         newcv = rex->qr_anoncv;
4991 65757916         nop = (OP*)rexi->data->data[n];
4992           }
4993            
4994           /* normally if we're about to execute code from the same
4995           * CV that we used previously, we just use the existing
4996           * CX stack entry. However, its possible that in the
4997           * meantime we may have backtracked, popped from the save
4998           * stack, and undone the SAVECOMPPAD(s) associated with
4999           * PUSH_MULTICALL; in which case PL_comppad no longer
5000           * points to newcv's pad. */
5001 65757964         if (newcv != last_pushed_cv || PL_comppad != last_pad)
5002 65757964         {
5003 65757964         U8 flags = (CXp_SUB_RE |
5004           ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
5005 16837620         if (last_pushed_cv) {
5006 16837656         CHANGE_MULTICALL_FLAGS(newcv, flags);
5007           }
5008           else {
5009 16837788         PUSH_MULTICALL_FLAGS(newcv, flags);
5010           }
5011           last_pushed_cv = newcv;
5012           }
5013           else {
5014           /* these assignments are just to silence compiler
5015           * warnings */
5016           multicall_cop = NULL;
5017           newsp = NULL;
5018           }
5019 16837620         last_pad = PL_comppad;
5020            
5021           /* the initial nextstate you would normally execute
5022           * at the start of an eval (which would cause error
5023           * messages to come from the eval), may be optimised
5024           * away from the execution path in the regex code blocks;
5025           * so manually set PL_curcop to it initially */
5026           {
5027 48920392         OP *o = cUNOPx(nop)->op_first;
5028 47746874         assert(o->op_type == OP_NULL);
5029 10637254         if (o->op_targ == OP_SCOPE) {
5030 4089468         o = cUNOPo->op_first;
5031           }
5032           else {
5033 2044710         assert(o->op_targ == OP_LEAVE);
5034 2044710         o = cUNOPo->op_first;
5035 0         assert(o->op_type == OP_ENTER);
5036 2044710         o = o->op_sibling;
5037           }
5038            
5039 10637254         if (o->op_type != OP_STUB) {
5040 3148         assert( o->op_type == OP_NEXTSTATE
5041           || o->op_type == OP_DBSTATE
5042           || (o->op_type == OP_NULL
5043           && ( o->op_targ == OP_NEXTSTATE
5044           || o->op_targ == OP_DBSTATE
5045           )
5046           )
5047           );
5048 3148         PL_curcop = (COP*)o;
5049           }
5050           }
5051 3148         nop = nop->op_next;
5052            
5053 48         DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5054           " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5055            
5056 48         rex->offs[0].end = locinput - reginfo->strbeg;
5057 48         if (reginfo->info_aux_eval->pos_magic)
5058 48         MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5059           reginfo->sv, reginfo->strbeg,
5060           locinput - reginfo->strbeg);
5061            
5062 3148         if (sv_yes_mark) {
5063 3100         SV *sv_mrk = get_sv("REGMARK", 1);
5064 10637206         sv_setsv(sv_mrk, sv_yes_mark);
5065           }
5066            
5067           /* we don't use MULTICALL here as we want to call the
5068           * first op of the block of interest, rather than the
5069           * first op of the sub */
5070 334798         before = (IV)(SP-PL_stack_base);
5071 669548         PL_op = nop;
5072 334798         CALLRUNOPS(aTHX); /* Scalar context. */
5073 334798         SPAGAIN;
5074 334798         if ((IV)(SP-PL_stack_base) == before)
5075 334750         ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
5076           else {
5077 334798         ret = POPs;
5078 29764         PUTBACK;
5079           }
5080            
5081           /* before restoring everything, evaluate the returned
5082           * value, so that 'uninit' warnings don't use the wrong
5083           * PL_op or pad. Also need to process any magic vars
5084           * (e.g. $1) *before* parentheses are restored */
5085            
5086 29764         PL_op = NULL;
5087            
5088           re_sv = NULL;
5089 47441840         if (logical == 0) /* (?{})/ */
5090 1290690         sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5091 1290642         else if (logical == 1) { /* /(?(?{...})X|Y)/ */
5092 1290642         sw = cBOOL(SvTRUE(ret));
5093           logical = 0;
5094           }
5095           else { /* /(??{}) */
5096           /* if its overloaded, let the regex compiler handle
5097           * it; otherwise extract regex, or stringify */
5098 1290642         if (!SvAMAGIC(ret)) {
5099 1290642         SV *sv = ret;
5100 46151150         if (SvROK(sv))
5101 43391304         sv = SvRV(sv);
5102 43391304         if (SvTYPE(sv) == SVt_REGEXP)
5103           re_sv = (REGEXP*) sv;
5104 43391304         else if (SvSMAGICAL(sv)) {
5105 43391304         MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5106 7117646         if (mg)
5107 7117646         re_sv = (REGEXP *) mg->mg_obj;
5108           }
5109            
5110           /* force any magic, undef warnings here */
5111 34139456         if (!re_sv) {
5112 34139456         ret = sv_mortalcopy(ret);
5113 34139456         (void) SvPV_force_nolen(ret);
5114           }
5115           }
5116            
5117           }
5118            
5119           /* *** Note that at this point we don't restore
5120           * PL_comppad, (or pop the CxSUB) on the assumption it may
5121           * be used again soon. This is safe as long as nothing
5122           * in the regexp code uses the pad ! */
5123 34139504         PL_op = oop;
5124 15541786         PL_curcop = ocurcop;
5125           S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5126 15541786         PL_curpm = PL_reg_curpm;
5127            
5128 15541786         if (logical != 2)
5129           break;
5130           }
5131            
5132           /* only /(??{})/ from now on */
5133           logical = 0;
5134           {
5135           /* extract RE object from returned value; compiling if
5136           * necessary */
5137            
5138 15541738         if (re_sv) {
5139 15541738         re_sv = reg_temp_copy(NULL, re_sv);
5140           }
5141           else {
5142           U32 pm_flags = 0;
5143            
5144 37306592         if (SvUTF8(ret) && IN_BYTES) {
5145           /* In use 'bytes': make a copy of the octet
5146           * sequence, but without the flag on */
5147           STRLEN len;
5148 37306592         const char *const p = SvPV(ret, len);
5149 41239956         ret = newSVpvn_flags(p, len, SVs_TEMP);
5150           }
5151 2         if (rex->intflags & PREGf_USE_RE_EVAL)
5152           pm_flags |= PMf_USE_RE_EVAL;
5153            
5154           /* if we got here, it should be an engine which
5155           * supports compiling code blocks and stuff */
5156 2         assert(rex->engine && rex->engine->op_comp);
5157 2         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5158 2         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5159           rex->engine, NULL, NULL,
5160           /* copy /msix etc to inner pattern */
5161 41239956         scan->flags,
5162           pm_flags);
5163            
5164 41239956         if (!(SvFLAGS(ret)
5165 41239956         & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5166           | SVs_GMG))) {
5167           /* This isn't a first class regexp. Instead, it's
5168           caching a regexp onto an existing, Perl visible
5169           scalar. */
5170 1273496         sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5171           }
5172           /* safe to do now that any $1 etc has been
5173           * interpolated into the new pattern string and
5174           * compiled */
5175           S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5176           }
5177 1273496         SAVEFREESV(re_sv);
5178 1273496         re = ReANY(re_sv);
5179           }
5180 1273496         RXp_MATCH_COPIED_off(re);
5181 477166         re->subbeg = rex->subbeg;
5182 0         re->sublen = rex->sublen;
5183 0         re->suboffset = rex->suboffset;
5184 0         re->subcoffset = rex->subcoffset;
5185 0         rei = RXi_GET(re);
5186 477166         DEBUG_EXECUTE_r(
5187           debug_start_match(re_sv, utf8_target, locinput,
5188           reginfo->strend, "Matching embedded");
5189           );
5190 477166         startpoint = rei->program + 1;
5191 796330         ST.close_paren = 0; /* only used for GOSUB */
5192            
5193           eval_recurse_doit: /* Share code with GOSUB below this line */
5194           /* run the pattern returned from (??{...}) */
5195            
5196           /* Save *all* the positions. */
5197 796330         ST.cp = regcppush(rex, 0, maxopenparen);
5198 796330         REGCP_SET(ST.lastcp);
5199          
5200 796330         re->lastparen = 0;
5201 0         re->lastcloseparen = 0;
5202            
5203 0         maxopenparen = 0;
5204            
5205           /* invalidate the S-L poscache. We're now executing a
5206           * different set of WHILEM ops (and their associated
5207           * indexes) against the same string, so the bits in the
5208           * cache are meaningless. Setting maxiter to zero forces
5209           * the cache to be invalidated and zeroed before reuse.
5210           * XXX This is too dramatic a measure. Ideally we should
5211           * save the old cache and restore when running the outer
5212           * pattern again */
5213 0         reginfo->poscache_maxiter = 0;
5214            
5215 248917174         is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5216            
5217 248917174         ST.prev_rex = rex_sv;
5218 248917174         ST.prev_curlyx = cur_curlyx;
5219           rex_sv = re_sv;
5220 248917174         SET_reg_curpm(rex_sv);
5221           rex = re;
5222           rexi = rei;
5223           cur_curlyx = NULL;
5224 248917174         ST.B = next;
5225 248917174         ST.prev_eval = cur_eval;
5226           cur_eval = st;
5227           /* now continue from first node in postoned RE */
5228 0         PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5229           assert(0); /* NOTREACHED */
5230           }
5231            
5232           case EVAL_AB: /* cleanup after a successful (??{A})B */
5233           /* note: this is called twice; first after popping B, then A */
5234 248917174         rex_sv = ST.prev_rex;
5235 648         is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5236 0         SET_reg_curpm(rex_sv);
5237 432         rex = ReANY(rex_sv);
5238 228         rexi = RXi_GET(rex);
5239 0         regcpblow(ST.cp);
5240 237460570         cur_eval = ST.prev_eval;
5241 237460570         cur_curlyx = ST.prev_curlyx;
5242            
5243           /* Invalidate cache. See "invalidate" comment above. */
5244 245974287         reginfo->poscache_maxiter = 0;
5245 237460570         if ( nochange_depth )
5246 237460570         nochange_depth--;
5247           sayYES;
5248            
5249            
5250           case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5251           /* note: this is called twice; first after popping B, then A */
5252 146639292         rex_sv = ST.prev_rex;
5253 8171312         is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5254 15384612         SET_reg_curpm(rex_sv);
5255 15384612         rex = ReANY(rex_sv);
5256 15384612         rexi = RXi_GET(rex);
5257            
5258 15384612         REGCP_UNWIND(ST.lastcp);
5259 15384612         regcppop(rex, &maxopenparen);
5260 1971074         cur_eval = ST.prev_eval;
5261 1971074         cur_curlyx = ST.prev_curlyx;
5262           /* Invalidate cache. See "invalidate" comment above. */
5263 1951236         reginfo->poscache_maxiter = 0;
5264 1971074         if ( nochange_depth )
5265 15384612         nochange_depth--;
5266           sayNO_SILENT;
5267           #undef ST
5268            
5269           case OPEN: /* ( */
5270 15384616         n = ARG(scan); /* which paren pair */
5271 15384616         rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5272 15384616         if (n > maxopenparen)
5273 15384616         maxopenparen = n;
5274 15384616         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5275           "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5276           PTR2UV(rex),
5277           PTR2UV(rex->offs),
5278           (UV)n,
5279           (IV)rex->offs[n].start_tmp,
5280           (UV)maxopenparen
5281           ));
5282           lastopen = n;
5283 15384616         break;
5284            
5285           /* XXX really need to log other places start/end are set too */
5286           #define CLOSE_CAPTURE \
5287           rex->offs[n].start = rex->offs[n].start_tmp; \
5288           rex->offs[n].end = locinput - reginfo->strbeg; \
5289           DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5290           "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5291           PTR2UV(rex), \
5292           PTR2UV(rex->offs), \
5293           (UV)n, \
5294           (IV)rex->offs[n].start, \
5295           (IV)rex->offs[n].end \
5296           ))
5297            
5298           case CLOSE: /* ) */
5299 15384616         n = ARG(scan); /* which paren pair */
5300 21528948         CLOSE_CAPTURE;
5301 6178950         if (n > rex->lastparen)
5302 6178950         rex->lastparen = n;
5303 475098         rex->lastcloseparen = n;
5304 44070         if (cur_eval && cur_eval->u.eval.close_paren == n) {
5305           goto fake_end;
5306           }
5307           break;
5308            
5309           case ACCEPT: /* (*ACCEPT) */
5310 130009         if (ARG(scan)){
5311           regnode *cursor;
5312 63910         for (cursor=scan;
5313 63910         cursor && OP(cursor)!=END;
5314 431028         cursor=regnext(cursor))
5315           {
5316 475094         if ( OP(cursor)==CLOSE ){
5317 0         n = ARG(cursor);
5318 6219608         if ( n <= lastopen ) {
5319 81324         CLOSE_CAPTURE;
5320 6178696         if (n > rex->lastparen)
5321 6178696         rex->lastparen = n;
5322 15349998         rex->lastcloseparen = n;
5323 15349998         if ( n == ARG(scan) || (cur_eval &&
5324 15116219         cur_eval->u.eval.close_paren == n))
5325           break;
5326           }
5327           }
5328           }
5329           }
5330           goto fake_end;
5331           /*NOTREACHED*/
5332            
5333           case GROUPP: /* (?(1)) */
5334 432650         n = ARG(scan); /* which paren pair */
5335 17713074         sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5336 14940102         break;
5337            
5338           case NGROUPP: /* (?()) */
5339           /* reg_check_named_buff_matched returns 0 for no match */
5340 14940102         sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5341 14326810         break;
5342            
5343           case INSUBP: /* (?(R)) */
5344 14326810         n = ARG(scan);
5345 29084276         sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5346 14326810         break;
5347            
5348           case DEFINEP: /* (?(DEFINE)) */
5349           sw = 0;
5350 3997672         break;
5351            
5352           case IFTHEN: /* (?(cond)A|B) */
5353 3997672         reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5354 17710626         if (sw)
5355 2119552         next = NEXTOPER(NEXTOPER(scan));
5356           else {
5357 3348         next = scan + ARG(scan);
5358 2916         if (OP(next) == IFTHEN) /* Fake one. */
5359 2116204         next = NEXTOPER(NEXTOPER(next));
5360           }
5361           break;
5362            
5363           case LOGICAL: /* modifier for EVAL and IFMATCH */
5364 16958982         logical = scan->flags;
5365 1882854         break;
5366            
5367           /*******************************************************************
5368            
5369           The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5370           pattern, where A and B are subpatterns. (For simple A, CURLYM or
5371           STAR/PLUS/CURLY/CURLYN are used instead.)
5372            
5373           A*B is compiled as
5374            
5375           On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5376           state, which contains the current count, initialised to -1. It also sets
5377           cur_curlyx to point to this state, with any previous value saved in the
5378           state block.
5379            
5380           CURLYX then jumps straight to the WHILEM op, rather than executing A,
5381           since the pattern may possibly match zero times (i.e. it's a while {} loop
5382           rather than a do {} while loop).
5383            
5384           Each entry to WHILEM represents a successful match of A. The count in the
5385           CURLYX block is incremented, another WHILEM state is pushed, and execution
5386           passes to A or B depending on greediness and the current count.
5387            
5388           For example, if matching against the string a1a2a3b (where the aN are
5389           substrings that match /A/), then the match progresses as follows: (the
5390           pushed states are interspersed with the bits of strings matched so far):
5391            
5392          
5393          
5394           a1
5395           a1 a2
5396           a1 a2 a3
5397           a1 a2 a3 b
5398            
5399           (Contrast this with something like CURLYM, which maintains only a single
5400           backtrack state:
5401            
5402           a1
5403           a1 a2
5404           a1 a2 a3
5405           a1 a2 a3 b
5406           )
5407            
5408           Each WHILEM state block marks a point to backtrack to upon partial failure
5409           of A or B, and also contains some minor state data related to that
5410           iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5411           overall state, such as the count, and pointers to the A and B ops.
5412            
5413           This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5414           must always point to the *current* CURLYX block, the rules are:
5415            
5416           When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5417           and set cur_curlyx to point the new block.
5418            
5419           When popping the CURLYX block after a successful or unsuccessful match,
5420           restore the previous cur_curlyx.
5421            
5422           When WHILEM is about to execute B, save the current cur_curlyx, and set it
5423           to the outer one saved in the CURLYX block.
5424            
5425           When popping the WHILEM block after a successful or unsuccessful B match,
5426           restore the previous cur_curlyx.
5427            
5428           Here's an example for the pattern (AI* BI)*BO
5429           I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5430            
5431           cur_
5432           curlyx backtrack stack
5433           ------ ---------------
5434           NULL
5435           CO
5436           CI ai
5437           CO ai bi
5438           NULL ai bi bo
5439            
5440           At this point the pattern succeeds, and we work back down the stack to
5441           clean up, restoring as we go:
5442            
5443           CO ai bi
5444           CI ai
5445           CO
5446           NULL
5447            
5448           *******************************************************************/
5449            
5450           #define ST st->u.curlyx
5451            
5452           case CURLYX: /* start of /A*B/ (for complex A) */
5453           {
5454           /* No need to save/restore up to this paren */
5455 1882854         I32 parenfloor = scan->flags;
5456          
5457 110032         assert(next); /* keep Coverity happy */
5458 110032         if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5459 110032         next += ARG(next);
5460            
5461           /* XXXX Probably it is better to teach regpush to support
5462           parenfloor > maxopenparen ... */
5463 110032         if (parenfloor > (I32)rex->lastparen)
5464 109686         parenfloor = rex->lastparen; /* Pessimization... */
5465            
5466 110032         ST.prev_curlyx= cur_curlyx;
5467           cur_curlyx = st;
5468 1772822         ST.cp = PL_savestack_ix;
5469            
5470           /* these fields contain the state of the current curly.
5471           * they are accessed by subsequent WHILEMs */
5472 1882854         ST.parenfloor = parenfloor;
5473 0         ST.me = scan;
5474 0         ST.B = next;
5475 16958982         ST.minmod = minmod;
5476           minmod = 0;
5477 15807628         ST.count = -1; /* this will be updated by WHILEM */
5478 21728853         ST.lastloc = NULL; /* this will be updated by WHILEM */
5479            
5480 15807628         PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5481           assert(0); /* NOTREACHED */
5482           }
5483            
5484           case CURLYX_end: /* just finished matching all of A*B */
5485 144980         cur_curlyx = ST.prev_curlyx;
5486 144980         sayYES;
5487           assert(0); /* NOTREACHED */
5488            
5489           case CURLYX_end_fail: /* just failed to match all of A*B */
5490 15662648         regcpblow(ST.cp);
5491 2635444         cur_curlyx = ST.prev_curlyx;
5492 2635444         sayNO;
5493           assert(0); /* NOTREACHED */
5494            
5495            
5496           #undef ST
5497           #define ST st->u.whilem
5498            
5499           case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5500           {
5501           /* see the discussion above about CURLYX/WHILEM */
5502           I32 n;
5503 104255967         int min = ARG1(cur_curlyx->u.curlyx.me);
5504 104255967         int max = ARG2(cur_curlyx->u.curlyx.me);
5505 104255967         regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5506            
5507 104255967         assert(cur_curlyx); /* keep Coverity happy */
5508 104255967         n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5509 46879280         ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5510 46879280         ST.cache_offset = 0;
5511 46879280         ST.cache_mask = 0;
5512          
5513            
5514 46879280         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5515           "%*s whilem: matched %ld out of %d..%d\n",
5516           REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5517           );
5518            
5519           /* First just match a string of min A's. */
5520            
5521 46879280         if (n < min) {
5522 593850         ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5523           maxopenparen);
5524 593850         cur_curlyx->u.curlyx.lastloc = locinput;
5525 593850         REGCP_SET(ST.lastcp);
5526            
5527 593850         PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5528           assert(0); /* NOTREACHED */
5529           }
5530            
5531           /* If degenerate A matches "", assume A done. */
5532            
5533 438074         if (locinput == cur_curlyx->u.curlyx.lastloc) {
5534 593850         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5535           "%*s whilem: empty match detected, trying continuation...\n",
5536           REPORT_CODE_OFF+depth*2, "")
5537           );
5538           goto do_whilem_B_max;
5539           }
5540            
5541           /* super-linear cache processing.
5542           *
5543           * The idea here is that for certain types of CURLYX/WHILEM -
5544           * principally those whose upper bound is infinity (and
5545           * excluding regexes that have things like \1 and other very
5546           * non-regular expresssiony things), then if a pattern like
5547           * /....A*.../ fails and we backtrack to the WHILEM, then we
5548           * make a note that this particular WHILEM op was at string
5549           * position 47 (say) when the rest of pattern failed. Then, if
5550           * we ever find ourselves back at that WHILEM, and at string
5551           * position 47 again, we can just fail immediately rather than
5552           * running the rest of the pattern again.
5553           *
5554           * This is very handy when patterns start to go
5555           * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5556           * with a combinatorial explosion of backtracking.
5557           *
5558           * The cache is implemented as a bit array, with one bit per
5559           * string byte position per WHILEM op (up to 16) - so its
5560           * between 0.25 and 2x the string size.
5561           *
5562           * To avoid allocating a poscache buffer every time, we do an
5563           * initially countdown; only after we have executed a WHILEM
5564           * op (string-length x #WHILEMs) times do we allocate the
5565           * cache.
5566           *
5567           * The top 4 bits of scan->flags byte say how many different
5568           * relevant CURLLYX/WHILEM op pairs there are, while the
5569           * bottom 4-bits is the identifying index number of this
5570           * WHILEM.
5571           */
5572            
5573 593850         if (scan->flags) {
5574            
5575 594100         if (!reginfo->poscache_maxiter) {
5576           /* start the countdown: Postpone detection until we
5577           * know the match is not *that* much linear. */
5578           reginfo->poscache_maxiter
5579 500         = (reginfo->strend - reginfo->strbeg + 1)
5580 500         * (scan->flags>>4);
5581           /* possible overflow for long strings and many CURLYX's */
5582 500         if (reginfo->poscache_maxiter < 0)
5583 593850         reginfo->poscache_maxiter = I32_MAX;
5584 593850         reginfo->poscache_iter = reginfo->poscache_maxiter;
5585           }
5586            
5587 37202816         if (reginfo->poscache_iter-- == 0) {
5588           /* initialise cache */
5589 37202816         const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
5590 37202816         regmatch_info_aux *const aux = reginfo->info_aux;
5591 37202816         if (aux->poscache) {
5592 188931913         if ((SSize_t)reginfo->poscache_size < size) {
5593 59466306         Renew(aux->poscache, size, char);
5594 129465607         reginfo->poscache_size = size;
5595           }
5596 66929416         Zero(aux->poscache, size, char);
5597           }
5598           else {
5599 129465607         reginfo->poscache_size = size;
5600 22838179         Newxz(aux->poscache, size, char);
5601           }
5602 106627428         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5603           "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5604           PL_colors[4], PL_colors[5])
5605           );
5606           }
5607            
5608 16383         if (reginfo->poscache_iter < 0) {
5609           /* have we already failed at this position? */
5610           SSize_t offset, mask;
5611            
5612 212496611         reginfo->poscache_iter = -1; /* stop eventual underflow */
5613 213222090         offset = (scan->flags & 0xf) - 1
5614 188930225         + (locinput - reginfo->strbeg)
5615 188930225         * (scan->flags>>4);
5616 188930225         mask = 1 << (offset % 8);
5617 5695768         offset /= 8;
5618 5830660         if (reginfo->info_aux->poscache[offset] & mask) {
5619 134892         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5620           "%*s whilem: (cache) already tried at this position...\n",
5621           REPORT_CODE_OFF+depth*2, "")
5622           );
5623           sayNO; /* cache records failure */
5624           }
5625 134892         ST.cache_offset = offset;
5626 5682418         ST.cache_mask = mask;
5627           }
5628           }
5629            
5630           /* Prefer B over A for minimal matching. */
5631            
5632 5682418         if (cur_curlyx->u.curlyx.minmod) {
5633 5682418         ST.save_curlyx = cur_curlyx;
5634 5682418         cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5635 2811375         ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5636           maxopenparen);
5637 2811375         REGCP_SET(ST.lastcp);
5638 2808393         PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5639           locinput);
5640           assert(0); /* NOTREACHED */
5641           }
5642            
5643           /* Prefer A over B for maximal matching. */
5644            
5645 2808393         if (n < max) { /* More greed allowed? */
5646 2868         ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5647           maxopenparen);
5648 1026         cur_curlyx->u.curlyx.lastloc = locinput;
5649 2982         REGCP_SET(ST.lastcp);
5650 2172         PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5651           assert(0); /* NOTREACHED */
5652           }
5653           goto do_whilem_B_max;
5654           }
5655           assert(0); /* NOTREACHED */
5656            
5657           case WHILEM_B_min: /* just matched B in a minimal match */
5658           case WHILEM_B_max: /* just matched B in a maximal match */
5659 4998         cur_curlyx = ST.save_curlyx;
5660 3162         sayYES;
5661           assert(0); /* NOTREACHED */
5662            
5663           case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5664 1740         cur_curlyx = ST.save_curlyx;
5665 810         cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5666 810         cur_curlyx->u.curlyx.count--;
5667 358         CACHEsayNO;
5668           assert(0); /* NOTREACHED */
5669            
5670           case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5671           /* FALL THROUGH */
5672           case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5673 183234457         REGCP_UNWIND(ST.lastcp);
5674 183234457         regcppop(rex, &maxopenparen);
5675 183234457         cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5676 160374317         cur_curlyx->u.curlyx.count--;
5677 160374317         CACHEsayNO;
5678           assert(0); /* NOTREACHED */
5679            
5680           case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5681 90106554         REGCP_UNWIND(ST.lastcp);
5682 4426240         regcppop(rex, &maxopenparen); /* Restore some previous $s? */
5683 4426240         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5684           "%*s whilem: failed, trying continuation...\n",
5685           REPORT_CODE_OFF+depth*2, "")
5686           );
5687           do_whilem_B_max:
5688 56540         if (cur_curlyx->u.curlyx.count >= REG_INFTY
5689 160374317         && ckWARN(WARN_REGEXP)
5690 160374317         && !reginfo->warned)
5691           {
5692 27186         reginfo->warned = TRUE;
5693 27186         Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5694           "Complex regular subexpression recursion limit (%d) "
5695           "exceeded",
5696           REG_INFTY - 1);
5697           }
5698            
5699           /* now try B */
5700 0         ST.save_curlyx = cur_curlyx;
5701 27186         cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5702 27186         PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5703           locinput);
5704           assert(0); /* NOTREACHED */
5705            
5706           case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5707 504         cur_curlyx = ST.save_curlyx;
5708 26682         REGCP_UNWIND(ST.lastcp);
5709 27186         regcppop(rex, &maxopenparen);
5710            
5711 2838561         if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5712           /* Maximum greed exceeded */
5713 4518         if (cur_curlyx->u.curlyx.count >= REG_INFTY
5714 4518         && ckWARN(WARN_REGEXP)
5715 15798         && !reginfo->warned)
5716           {
5717 13896         reginfo->warned = TRUE;
5718 13144         Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5719           "Complex regular subexpression recursion "
5720           "limit (%d) exceeded",
5721           REG_INFTY - 1);
5722           }
5723 13144         cur_curlyx->u.curlyx.count--;
5724 3364         CACHEsayNO;
5725           }
5726            
5727 2564         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5728           "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5729           );
5730           /* Try grabbing another A and see if it helps. */
5731 2396         cur_curlyx->u.curlyx.lastloc = locinput;
5732 1500         ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5733           maxopenparen);
5734 1500         REGCP_SET(ST.lastcp);
5735 2834043         PUSH_STATE_GOTO(WHILEM_A_min,
5736           /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5737           locinput);
5738           assert(0); /* NOTREACHED */
5739            
5740           #undef ST
5741           #define ST st->u.branch
5742            
5743           case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5744 58182923         next = scan + ARG(scan);
5745 38788433         if (next == scan)
5746           next = NULL;
5747 35957336         scan = NEXTOPER(scan);
5748           /* FALL THROUGH */
5749            
5750           case BRANCH: /* /(...|A|...)/ */
5751 2912         scan = NEXTOPER(scan); /* scan now points to inner node */
5752 2786         ST.lastparen = rex->lastparen;
5753 2576         ST.lastcloseparen = rex->lastcloseparen;
5754 1496         ST.next_branch = next;
5755 2834043         REGCP_SET(ST.cp);
5756            
5757           /* Now go into the branch */
5758 2838561         if (has_cutgroup) {
5759 2834203         PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5760           } else {
5761 2274643         PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5762           }
5763           assert(0); /* NOTREACHED */
5764            
5765           case CUTGROUP: /* /(*THEN)/ */
5766 2274643         sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5767 2274643         MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5768 2818339         PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5769           assert(0); /* NOTREACHED */
5770            
5771           case CUTGROUP_next_fail:
5772           do_cutgroup = 1;
5773           no_final = 1;
5774 2818339         if (st->u.mark.mark_name)
5775 0         sv_commit = st->u.mark.mark_name;
5776           sayNO;
5777           assert(0); /* NOTREACHED */
5778            
5779           case BRANCH_next:
5780           sayYES;
5781           assert(0); /* NOTREACHED */
5782            
5783           case BRANCH_next_fail: /* that branch failed; try the next, if any */
5784 2818339         if (do_cutgroup) {
5785           do_cutgroup = 0;
5786           no_final = 0;
5787           }
5788 6118662         REGCP_UNWIND(ST.cp);
5789 6118662         UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5790 0         scan = ST.next_branch;
5791           /* no more branches? */
5792 6118662         if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5793 6118662         DEBUG_EXECUTE_r({
5794           PerlIO_printf( Perl_debug_log,
5795           "%*s %sBRANCH failed...%s\n",
5796           REPORT_CODE_OFF+depth*2, "",
5797           PL_colors[4],
5798           PL_colors[5] );
5799           });
5800           sayNO_SILENT;
5801           }
5802 3509270         continue; /* execute next BRANCH[J] op */
5803           assert(0); /* NOTREACHED */
5804          
5805           case MINMOD: /* next op will be non-greedy, e.g. A*? */
5806           minmod = 1;
5807 3509270         break;
5808            
5809           #undef ST
5810           #define ST st->u.curlym
5811            
5812           case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5813            
5814           /* This is an optimisation of CURLYX that enables us to push
5815           * only a single backtracking state, no matter how many matches
5816           * there are in {m,n}. It relies on the pattern being constant
5817           * length, with no parens to influence future backrefs
5818           */
5819            
5820 3509270         ST.me = scan;
5821 0         scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5822            
5823 6380021         ST.lastparen = rex->lastparen;
5824 6380021         ST.lastcloseparen = rex->lastcloseparen;
5825            
5826           /* if paren positive, emulate an OPEN/CLOSE around A */
5827 0         if (ST.me->flags) {
5828 6380021         U32 paren = ST.me->flags;
5829 529450262         if (paren > maxopenparen)
5830 4174         maxopenparen = paren;
5831 529447875         scan += NEXT_OFF(scan); /* Skip former OPEN. */
5832           }
5833 529447875         ST.A = scan;
5834 410544422         ST.B = next;
5835 20439         ST.alen = 0;
5836 13626         ST.count = 0;
5837 615424876         ST.minmod = minmod;
5838           minmod = 0;
5839 410530796         ST.c1 = CHRTEST_UNINIT;
5840 529447875         REGCP_SET(ST.cp);
5841            
5842 195992257         if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5843           goto curlym_do_B;
5844            
5845           curlym_do_A: /* execute the A in /A{m,n}B/ */
5846 195992257         PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5847           assert(0); /* NOTREACHED */
5848            
5849           case CURLYM_A: /* we've just matched an A */
5850 439292424         ST.count++;
5851           /* after first match, determine A's length: u.curlym.alen */
5852 439292424         if (ST.count == 1) {
5853 259827         if (reginfo->is_utf8_target) {
5854 439292424         char *s = st->locinput;
5855 369073858         while (s < locinput) {
5856 112441273         ST.alen++;
5857 632368         s += UTF8SKIP(s);
5858           }
5859           }
5860           else {
5861 632368         ST.alen = locinput - st->locinput;
5862           }
5863 632368         if (ST.alen == 0)
5864 632368         ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5865           }
5866 729524         DEBUG_EXECUTE_r(
5867           PerlIO_printf(Perl_debug_log,
5868           "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5869           (int)(REPORT_CODE_OFF+(depth*2)), "",
5870           (IV) ST.count, (IV)ST.alen)
5871           );
5872            
5873 632368         if (cur_eval && cur_eval->u.eval.close_paren &&
5874 632368         cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5875           goto fake_end;
5876          
5877           {
5878 632368         I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5879 632368         if ( max == REG_INFTY || ST.count < max )
5880           goto curlym_do_A; /* try to match another A */
5881           }
5882           goto curlym_do_B; /* try to match B */
5883            
5884           case CURLYM_A_fail: /* just failed to match an A */
5885 632368         REGCP_UNWIND(ST.cp);
5886            
5887 632368         if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5888 632368         || (cur_eval && cur_eval->u.eval.close_paren &&
5889 261636         cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5890           sayNO;
5891            
5892           curlym_do_B: /* execute the B in /A{m,n}B/ */
5893 632368         if (ST.c1 == CHRTEST_UNINIT) {
5894           /* calculate c1 and c2 for possible match of 1st char
5895           * following curly */
5896 111808905         ST.c1 = ST.c2 = CHRTEST_VOID;
5897 840636         if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5898 840636         regnode *text_node = ST.B;
5899 3509278         if (! HAS_TEXT(text_node))
5900 3509278         FIND_NEXT_IMPT(text_node);
5901           /* this used to be
5902          
5903           (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5904          
5905           But the former is redundant in light of the latter.
5906          
5907           if this changes back then the macro for
5908           IS_TEXT and friends need to change.
5909           */
5910 11840382         if (PL_regkind[OP(text_node)] == EXACT) {
5911 15349660         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5912 8764976         text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5913           reginfo))
5914           {
5915           sayNO;
5916           }
5917           }
5918           }
5919           }
5920            
5921 8764976         DEBUG_EXECUTE_r(
5922           PerlIO_printf(Perl_debug_log,
5923           "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5924           (int)(REPORT_CODE_OFF+(depth*2)),
5925           "", (IV)ST.count)
5926           );
5927 362910         if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5928 0         if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5929 362910         if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5930 70974         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5931           {
5932           /* simulate B failing */
5933 70974         DEBUG_OPTIMISE_r(
5934           PerlIO_printf(Perl_debug_log,
5935           "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
5936           (int)(REPORT_CODE_OFF+(depth*2)),"",
5937           valid_utf8_to_uvchr((U8 *) locinput, NULL),
5938           valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5939           valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5940           );
5941           state_num = CURLYM_B_fail;
5942           goto reenter_switch;
5943           }
5944           }
5945 15827386         else if (nextchr != ST.c1 && nextchr != ST.c2) {
5946           /* simulate B failing */
5947 15827386         DEBUG_OPTIMISE_r(
5948           PerlIO_printf(Perl_debug_log,
5949           "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
5950           (int)(REPORT_CODE_OFF+(depth*2)),"",
5951           (int) nextchr, ST.c1, ST.c2)
5952           );
5953           state_num = CURLYM_B_fail;
5954           goto reenter_switch;
5955           }
5956           }
5957            
5958 15827386         if (ST.me->flags) {
5959           /* emulate CLOSE: mark current A as captured */
5960 10422058         I32 paren = ST.me->flags;
5961 15827386         if (ST.count) {
5962 650         rex->offs[paren].start
5963 15826736         = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5964 7031634         rex->offs[paren].end = locinput - reginfo->strbeg;
5965 6516736         if ((U32)paren > rex->lastparen)
5966 7031634         rex->lastparen = paren;
5967 7031634         rex->lastcloseparen = paren;
5968           }
5969           else
5970 7031634         rex->offs[paren].end = -1;
5971 0         if (cur_eval && cur_eval->u.eval.close_paren &&
5972 0         cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5973           {
5974 54         if (ST.count)
5975           goto fake_end;
5976           else
5977           sayNO;
5978           }
5979           }
5980          
5981 894         PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5982           assert(0); /* NOTREACHED */
5983            
5984           case CURLYM_B_fail: /* just failed to match a B */
5985 22         REGCP_UNWIND(ST.cp);
5986 1233         UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5987 278         if (ST.minmod) {
5988 278         I32 max = ARG2(ST.me);
5989 278         if (max != REG_INFTY && ST.count == max)
5990           sayNO;
5991           goto curlym_do_A; /* try to match a further A */
5992           }
5993           /* backtrack one A */
5994 278         if (ST.count == ARG1(ST.me) /* min */)
5995           sayNO;
5996 278         ST.count--;
5997 20         SET_locinput(HOPc(locinput, -ST.alen));
5998           goto curlym_do_B; /* try to match B */
5999            
6000           #undef ST
6001           #define ST st->u.curly
6002            
6003           #define CURLY_SETPAREN(paren, success) \
6004           if (paren) { \
6005           if (success) { \
6006           rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6007           rex->offs[paren].end = locinput - reginfo->strbeg; \
6008           if (paren > rex->lastparen) \
6009           rex->lastparen = paren; \
6010           rex->lastcloseparen = paren; \
6011           } \
6012           else { \
6013           rex->offs[paren].end = -1; \
6014           rex->lastparen = ST.lastparen; \
6015           rex->lastcloseparen = ST.lastcloseparen; \
6016           } \
6017           }
6018            
6019           case STAR: /* /A*B/ where A is width 1 char */
6020 20         ST.paren = 0;
6021 258         ST.min = 0;
6022 164         ST.max = REG_INFTY;
6023 154         scan = NEXTOPER(scan);
6024 164         goto repeat;
6025            
6026           case PLUS: /* /A+B/ where A is width 1 char */
6027 258         ST.paren = 0;
6028 258         ST.min = 1;
6029 258         ST.max = REG_INFTY;
6030 252         scan = NEXTOPER(scan);
6031 84         goto repeat;
6032            
6033           case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
6034 84         ST.paren = scan->flags; /* Which paren to set */
6035 84         ST.lastparen = rex->lastparen;
6036 168         ST.lastcloseparen = rex->lastcloseparen;
6037 268         if (ST.paren > maxopenparen)
6038 184         maxopenparen = ST.paren;
6039 168         ST.min = ARG1(scan); /* min to match */
6040 168         ST.max = ARG2(scan); /* max to match */
6041 16         if (cur_eval && cur_eval->u.eval.close_paren &&
6042 244         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6043 164         ST.min=1;
6044 80         ST.max=1;
6045           }
6046 80         scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6047 80         goto repeat;
6048            
6049           case CURLY: /* /A{m,n}B/ where A is width 1 char */
6050 14         ST.paren = 0;
6051 46422         ST.min = ARG1(scan); /* min to match */
6052 882         ST.max = ARG2(scan); /* max to match */
6053 882         scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6054           repeat:
6055           /*
6056           * Lookahead to avoid useless match attempts
6057           * when we know what character comes next.
6058           *
6059           * Used to only do .*x and .*?x, but now it allows
6060           * for )'s, ('s and (?{ ... })'s to be in the way
6061           * of the quantifier and the EXACT-like node. -- japhy
6062           */
6063            
6064 0         assert(ST.min <= ST.max);
6065 0         if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6066 0         ST.c1 = ST.c2 = CHRTEST_VOID;
6067           }
6068           else {
6069           regnode *text_node = next;
6070            
6071 25264060         if (! HAS_TEXT(text_node))
6072 6363152         FIND_NEXT_IMPT(text_node);
6073            
6074 6363152         if (! HAS_TEXT(text_node))
6075 18900908         ST.c1 = ST.c2 = CHRTEST_VOID;
6076           else {
6077 583511433         if ( PL_regkind[OP(text_node)] != EXACT ) {
6078 131520404         ST.c1 = ST.c2 = CHRTEST_VOID;
6079           }
6080           else {
6081          
6082           /* Currently we only get here when
6083          
6084           PL_rekind[OP(text_node)] == EXACT
6085          
6086           if this changes back then the macro for IS_TEXT and
6087           friends need to change. */
6088 663703331         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6089 663703331         text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6090           reginfo))
6091           {
6092           sayNO;
6093           }
6094           }
6095           }
6096           }
6097            
6098 663703331         ST.A = scan;
6099 663703331         ST.B = next;
6100 1889268         if (minmod) {
6101 663703331         char *li = locinput;
6102           minmod = 0;
6103 1036151107         if (ST.min &&
6104 0         regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6105 136660069         < ST.min)
6106           sayNO;
6107 26376090         SET_locinput(li);
6108 26213858         ST.count = ST.min;
6109 177624         REGCP_SET(ST.cp);
6110 177624         if (ST.c1 == CHRTEST_VOID)
6111           goto curly_try_B_min;
6112            
6113 177624         ST.oldloc = locinput;
6114            
6115           /* set ST.maxpos to the furthest point along the
6116           * string that could possibly match */
6117 26198466         if (ST.max == REG_INFTY) {
6118 26198466         ST.maxpos = reginfo->strend - 1;
6119 26198466         if (utf8_target)
6120 26198466         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6121 376         ST.maxpos--;
6122           }
6123 26198466         else if (utf8_target) {
6124 26198466         int m = ST.max - ST.min;
6125 110461603         for (ST.maxpos = locinput;
6126 10232         m >0 && ST.maxpos < reginfo->strend; m--)
6127 1690         ST.maxpos += UTF8SKIP(ST.maxpos);
6128           }
6129           else {
6130 642025740         ST.maxpos = locinput + ST.max - ST.min;
6131 1526         if (ST.maxpos >= reginfo->strend)
6132 642024214         ST.maxpos = reginfo->strend - 1;
6133           }
6134           goto curly_try_B_min_known;
6135            
6136           }
6137           else {
6138           /* avoid taking address of locinput, so it can remain
6139           * a register var */
6140 522704776         char *li = locinput;
6141 522704776         ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6142 1710774         if (ST.count < ST.min)
6143           sayNO;
6144 1710774         SET_locinput(li);
6145 522704776         if ((ST.count > ST.min)
6146 522704776         && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6147           {
6148           /* A{m,n} must come at the end of the string, there's
6149           * no point in backing off ... */
6150 522704776         ST.min = ST.count;
6151           /* ...except that $ and \Z can match before *and* after
6152           newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6153           We may back off by one in this case. */
6154 522704776         if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6155 105321938         ST.min--;
6156           }
6157 522704776         REGCP_SET(ST.cp);
6158 522704776         goto curly_try_B_max;
6159           }
6160           assert(0); /* NOTREACHED */
6161            
6162            
6163           case CURLY_B_min_known_fail:
6164           /* failed to find B in a non-greedy match where c1,c2 valid */
6165            
6166 229782191         REGCP_UNWIND(ST.cp);
6167 1786         if (ST.paren) {
6168 1786         UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6169           }
6170           /* Couldn't or didn't -- move forward. */
6171 1786         ST.oldloc = locinput;
6172 544         if (utf8_target)
6173 1242         locinput += UTF8SKIP(locinput);
6174           else
6175 1786         locinput++;
6176 1786         ST.count++;
6177           curly_try_B_min_known:
6178           /* find the next place where 'B' could work, then call B */
6179           {
6180           int n;
6181 229782191         if (utf8_target) {
6182 53862         n = (ST.oldloc == locinput) ? 0 : 1;
6183 229782191         if (ST.c1 == ST.c2) {
6184           /* set n to utf8_distance(oldloc, locinput) */
6185 191762654         while (locinput <= ST.maxpos
6186 191762654         && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6187           {
6188 191762654         locinput += UTF8SKIP(locinput);
6189 191762654         n++;
6190           }
6191           }
6192           else {
6193           /* set n to utf8_distance(oldloc, locinput) */
6194 191762654         while (locinput <= ST.maxpos
6195 45772085         && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6196 43693479         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6197           {
6198 191762654         locinput += UTF8SKIP(locinput);
6199 59601419         n++;
6200           }
6201           }
6202           }
6203           else { /* Not utf8_target */
6204 235980         if (ST.c1 == ST.c2) {
6205 193688         while (locinput <= ST.maxpos &&
6206 193688         UCHARAT(locinput) != ST.c1)
6207 718968725         locinput++;
6208           }
6209           else {
6210 659409598         while (locinput <= ST.maxpos
6211 3434907         && UCHARAT(locinput) != ST.c1
6212 15508         && UCHARAT(locinput) != ST.c2)
6213 14482         locinput++;
6214           }
6215 14482         n = locinput - ST.oldloc;
6216           }
6217 64         if (locinput > ST.maxpos)
6218           sayNO;
6219 18         if (n) {
6220           /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6221           * at b; check that everything between oldloc and
6222           * locinput matches */
6223 29807566         char *li = ST.oldloc;
6224 29807566         ST.count += n;
6225 29804414         if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6226           sayNO;
6227 14937594         assert(n == REG_INFTY || locinput == li);
6228           }
6229 33412070         CURLY_SETPAREN(ST.paren, ST.count);
6230 3607656         if (cur_eval && cur_eval->u.eval.close_paren &&
6231 3152         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6232           goto fake_end;
6233           }
6234 1610         PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6235           }
6236           assert(0); /* NOTREACHED */
6237            
6238            
6239           case CURLY_B_min_fail:
6240           /* failed to find B in a non-greedy match where c1,c2 invalid */
6241            
6242 3147         REGCP_UNWIND(ST.cp);
6243 2098         if (ST.paren) {
6244 1452         UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6245           }
6246           /* failed -- move forward one */
6247           {
6248 1452         char *li = locinput;
6249 1042         if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6250           sayNO;
6251           }
6252 1042         locinput = li;
6253           }
6254           {
6255 554         ST.count++;
6256 512         if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6257 1280         ST.count > 0)) /* count overflow ? */
6258           {
6259           curly_try_B_min:
6260 512         CURLY_SETPAREN(ST.paren, ST.count);
6261 1542         if (cur_eval && cur_eval->u.eval.close_paren &&
6262 1542         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6263           goto fake_end;
6264           }
6265 3479         PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6266           }
6267           }
6268           sayNO;
6269           assert(0); /* NOTREACHED */
6270            
6271            
6272           curly_try_B_max:
6273           /* a successful greedy match: now try to match B */
6274 2578         if (cur_eval && cur_eval->u.eval.close_paren &&
6275 1168         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6276           goto fake_end;
6277           }
6278           {
6279 1166         bool could_match = locinput < reginfo->strend;
6280            
6281           /* If it could work, try it. */
6282 1166         if (ST.c1 != CHRTEST_VOID && could_match) {
6283 1166         if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6284           {
6285 116440         could_match = memEQ(locinput,
6286           ST.c1_utf8,
6287           UTF8SKIP(locinput))
6288 116440         || memEQ(locinput,
6289           ST.c2_utf8,
6290           UTF8SKIP(locinput));
6291           }
6292           else {
6293 14800         could_match = UCHARAT(locinput) == ST.c1
6294 101154         || UCHARAT(locinput) == ST.c2;
6295           }
6296           }
6297 260736         if (ST.c1 == CHRTEST_VOID || could_match) {
6298 254788         CURLY_SETPAREN(ST.paren, ST.count);
6299 63646         PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6300           assert(0); /* NOTREACHED */
6301           }
6302           }
6303           /* FALL THROUGH */
6304            
6305           case CURLY_B_max_fail:
6306           /* failed to find B in a greedy match */
6307            
6308 63646         REGCP_UNWIND(ST.cp);
6309 144461         if (ST.paren) {
6310 105790         UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6311           }
6312           /* back up. */
6313 48992         if (--ST.count < ST.min)
6314           sayNO;
6315 48992         locinput = HOPc(locinput, -1);
6316           goto curly_try_B_max;
6317            
6318           #undef ST
6319            
6320           case END: /* last op of main pattern */
6321           fake_end:
6322 49034         if (cur_eval) {
6323           /* we've just finished A in /(??{A})B/; now continue with B */
6324            
6325 191142         st->u.eval.prev_rex = rex_sv; /* inner */
6326            
6327           /* Save *all* the positions. */
6328 107280         st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6329 123819         rex_sv = cur_eval->u.eval.prev_rex;
6330 82546         is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6331 45852         SET_reg_curpm(rex_sv);
6332 6080         rex = ReANY(rex_sv);
6333 6080         rexi = RXi_GET(rex);
6334 69222         cur_curlyx = cur_eval->u.eval.prev_curlyx;
6335            
6336 46148         REGCP_SET(st->u.eval.lastcp);
6337            
6338           /* Restore parens of the outer rex without popping the
6339           * savestack */
6340 34134         S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6341           &maxopenparen);
6342            
6343 31636         st->u.eval.prev_eval = cur_eval;
6344 15334         cur_eval = cur_eval->u.eval.prev_eval;
6345 15334         DEBUG_EXECUTE_r(
6346           PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6347           REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6348 83862         if ( nochange_depth )
6349 48340         nochange_depth--;
6350            
6351 6336         PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6352           locinput); /* match B */
6353           }
6354            
6355 93612         if (locinput < reginfo->till) {
6356 62618         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6357           "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6358           PL_colors[4],
6359           (long)(locinput - startpos),
6360           (long)(reginfo->till - startpos),
6361           PL_colors[5]));
6362          
6363           sayNO_SILENT; /* Cannot match: too short. */
6364           }
6365           sayYES; /* Success! */
6366            
6367           case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6368 28634         DEBUG_EXECUTE_r(
6369           PerlIO_printf(Perl_debug_log,
6370           "%*s %ssubpattern success...%s\n",
6371           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6372           sayYES; /* Success! */
6373            
6374           #undef ST
6375           #define ST st->u.ifmatch
6376            
6377           {
6378           char *newstart;
6379            
6380           case SUSPEND: /* (?>A) */
6381 43894685         ST.wanted = 1;
6382           newstart = locinput;
6383 1604094         goto do_ifmatch;
6384            
6385           case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?
6386 1069396         ST.wanted = 0;
6387 894900         goto ifmatch_trivial_fail_test;
6388            
6389           case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6390 701202         ST.wanted = 1;
6391           ifmatch_trivial_fail_test:
6392 701202         if (scan->flags) {
6393 187674251         char * const s = HOPBACKc(locinput, scan->flags);
6394 144147756         if (!s) {
6395           /* trivial fail */
6396 350         if (logical) {
6397           logical = 0;
6398 350         sw = 1 - cBOOL(ST.wanted);
6399           }
6400 862         else if (ST.wanted)
6401           sayNO;
6402 528         next = scan + ARG(scan);
6403 176         if (next == scan)
6404           next = NULL;
6405           break;
6406           }
6407           newstart = s;
6408           }
6409           else
6410           newstart = locinput;
6411            
6412           do_ifmatch:
6413 160         ST.me = scan;
6414 160         ST.logical = logical;
6415           logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6416          
6417           /* execute body of (?...A) */
6418 160         PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6419           assert(0); /* NOTREACHED */
6420           }
6421            
6422           case IFMATCH_A_fail: /* body of (?...A) failed */
6423 47524475         ST.wanted = !ST.wanted;
6424           /* FALL THROUGH */
6425            
6426           case IFMATCH_A: /* body of (?...A) succeeded */
6427 52829331         if (ST.logical) {
6428 26968780         sw = cBOOL(ST.wanted);
6429           }
6430 112680115         else if (!ST.wanted)
6431           sayNO;
6432            
6433 59850784         if (OP(ST.me) != SUSPEND) {
6434           /* restore old position except for (?>...) */
6435 1780822         locinput = st->locinput;
6436           }
6437 1780136         scan = ST.me + ARG(ST.me);
6438 13635316         if (scan == ST.me)
6439           scan = NULL;
6440 11855446         continue; /* execute B */
6441            
6442           #undef ST
6443            
6444           case LONGJMP: /* alternative with many branches compiles to
6445           * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6446 2542         next = scan + ARG(scan);
6447 2276         if (next == scan)
6448           next = NULL;
6449           break;
6450            
6451           case COMMIT: /* (*COMMIT) */
6452 1933         reginfo->cutpoint = reginfo->strend;
6453           /* FALLTHROUGH */
6454            
6455           case PRUNE: /* (*PRUNE) */
6456 2276         if (!scan->flags)
6457 2276         sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6458 47030         PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6459           assert(0); /* NOTREACHED */
6460            
6461           case COMMIT_next_fail:
6462           no_final = 1;
6463           /* FALLTHROUGH */
6464            
6465           case OPFAIL: /* (*FAIL) */
6466           sayNO;
6467           assert(0); /* NOTREACHED */
6468            
6469           #define ST st->u.mark
6470           case MARKPOINT: /* (*MARK:foo) */
6471 99707         ST.prev_mark = mark_state;
6472 64374         ST.mark_name = sv_commit = sv_yes_mark
6473 20912         = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6474           mark_state = st;
6475 106510         ST.mark_loc = locinput;
6476 106510         PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6477           assert(0); /* NOTREACHED */
6478            
6479           case MARKPOINT_next:
6480 10558         mark_state = ST.prev_mark;
6481 9542         sayYES;
6482           assert(0); /* NOTREACHED */
6483            
6484           case MARKPOINT_next_fail:
6485 9264         if (popmark && sv_eq(ST.mark_name,popmark))
6486           {
6487 7360         if (ST.mark_loc > startpoint)
6488 278         reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6489           popmark = NULL; /* we found our mark */
6490 320         sv_commit = ST.mark_name;
6491            
6492 240         DEBUG_EXECUTE_r({
6493           PerlIO_printf(Perl_debug_log,
6494           "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6495           REPORT_CODE_OFF+depth*2, "",
6496           PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6497           });
6498           }
6499 160         mark_state = ST.prev_mark;
6500           sv_yes_mark = mark_state ?
6501 7520         mark_state->u.mark.mark_name : NULL;
6502 103472         sayNO;
6503           assert(0); /* NOTREACHED */
6504            
6505           case SKIP: /* (*SKIP) */
6506 213654         if (scan->flags) {
6507           /* (*SKIP) : if we fail we cut here*/
6508 142436         ST.mark_name = NULL;
6509 139348         ST.mark_loc = locinput;
6510 39332         PUSH_STATE_GOTO(SKIP_next,next, locinput);
6511           } else {
6512           /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6513           otherwise do nothing. Meaning we need to scan
6514           */
6515           regmatch_state *cur = mark_state;
6516 39332         SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6517          
6518 588         while (cur) {
6519 392         if ( sv_eq( cur->u.mark.mark_name,
6520           find ) )
6521           {
6522 316         ST.mark_name = find;
6523 244         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6524           }
6525 244         cur = cur->u.mark.prev_mark;
6526           }
6527           }
6528           /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6529           break;
6530            
6531           case SKIP_next_fail:
6532 720         if (ST.mark_name) {
6533           /* (*CUT:NAME) - Set up to search for the name as we
6534           collapse the stack*/
6535 480         popmark = ST.mark_name;
6536           } else {
6537           /* (*CUT) - No name, we cut here.*/
6538 456         if (ST.mark_loc > startpoint)
6539 432         reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6540           /* but we set sv_commit to latest mark_name if there
6541           is one so they can test to see how things lead to this
6542           cut */
6543 432         if (mark_state)
6544 876         sv_commit=mark_state->u.mark.mark_name;
6545           }
6546           no_final = 1;
6547           sayNO;
6548           assert(0); /* NOTREACHED */
6549           #undef ST
6550            
6551           case LNBREAK: /* \R */
6552 584         if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6553 556         locinput += n;
6554           } else
6555           sayNO;
6556 460         break;
6557            
6558           default:
6559 460         PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6560 648         PTR2UV(scan), OP(scan));
6561 432         Perl_croak(aTHX_ "regexp memory corruption");
6562            
6563           /* this is a point to jump to in order to increment
6564           * locinput by one character */
6565           increment_locinput:
6566 432         assert(!NEXTCHR_IS_EOS);
6567 384         if (utf8_target) {
6568 384         locinput += PL_utf8skip[nextchr];
6569           /* locinput is allowed to go 1 char off the end, but not 2+ */
6570 0         if (locinput > reginfo->strend)
6571           sayNO;
6572           }
6573           else
6574 118         locinput++;
6575           break;
6576          
6577           } /* end switch */
6578            
6579           /* switch break jumps here */
6580           scan = next; /* prepare to execute the next op and ... */
6581 116         continue; /* ... jump back to the top, reusing st */
6582           assert(0); /* NOTREACHED */
6583            
6584           push_yes_state:
6585           /* push a state that backtracks on success */
6586 66         st->u.yes.prev_yes_state = yes_state;
6587           yes_state = st;
6588           /* FALL THROUGH */
6589           push_state:
6590           /* push a new regex state, then continue at scan */
6591           {
6592           regmatch_state *newst;
6593            
6594 310         DEBUG_STACK_r({
6595           regmatch_state *cur = st;
6596           regmatch_state *curyes = yes_state;
6597           int curd = depth;
6598           regmatch_slab *slab = PL_regmatch_slab;
6599           for (;curd > -1;cur--,curd--) {
6600           if (cur < SLAB_FIRST(slab)) {
6601           slab = slab->prev;
6602           cur = SLAB_LAST(slab);
6603           }
6604           PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6605           REPORT_CODE_OFF + 2 + depth * 2,"",
6606           curd, PL_reg_name[cur->resume_state],
6607           (curyes == cur) ? "yes" : ""
6608           );
6609           if (curyes == cur)
6610           curyes = cur->u.yes.prev_yes_state;
6611           }
6612           } else
6613           DEBUG_STATE_pp("push")
6614           );
6615 208         depth++;
6616 192         st->locinput = locinput;
6617 192         newst = st+1;
6618 332         if (newst > SLAB_LAST(PL_regmatch_slab))
6619 840         newst = S_push_slab(aTHX);
6620 392         PL_regmatch_state = newst;
6621            
6622           locinput = pushinput;
6623           st = newst;
6624 272         continue;
6625           assert(0); /* NOTREACHED */
6626           }
6627           }
6628            
6629           /*
6630           * We get here only if there's trouble -- normally "case END" is
6631           * the terminating point.
6632           */
6633 272         Perl_croak(aTHX_ "corrupted regexp pointers");
6634           /*NOTREACHED*/
6635           sayNO;
6636            
6637           yes:
6638 198         if (yes_state) {
6639           /* we have successfully completed a subexpression, but we must now
6640           * pop to the state marked by yes_state and continue from there */
6641 494         assert(st != yes_state);
6642           #ifdef DEBUGGING
6643 260         while (st != yes_state) {
6644 260         st--;
6645 0         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6646 191762654         PL_regmatch_slab = PL_regmatch_slab->prev;
6647 191486484         st = SLAB_LAST(PL_regmatch_slab);
6648           }
6649 191762654         DEBUG_STATE_r({
6650           if (no_final) {
6651           DEBUG_STATE_pp("pop (no final)");
6652           } else {
6653           DEBUG_STATE_pp("pop (yes)");
6654           }
6655           });
6656 191762654         depth--;
6657           }
6658           #else
6659           while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6660           || yes_state > SLAB_LAST(PL_regmatch_slab))
6661           {
6662           /* not in this slab, pop slab */
6663           depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6664           PL_regmatch_slab = PL_regmatch_slab->prev;
6665           st = SLAB_LAST(PL_regmatch_slab);
6666           }
6667           depth -= (st - yes_state);
6668           #endif
6669           st = yes_state;
6670 0         yes_state = st->u.yes.prev_yes_state;
6671 0         PL_regmatch_state = st;
6672          
6673 0         if (no_final)
6674 0         locinput= st->locinput;
6675 961082         state_num = st->resume_state + no_final;
6676 961082         goto reenter_switch;
6677           }
6678            
6679 961124         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6680           PL_colors[4], PL_colors[5]));
6681            
6682 961124         if (reginfo->info_aux_eval) {
6683           /* each successfully executed (?{...}) block does the equivalent of
6684           * local $^R = do {...}
6685           * When popping the save stack, all these locals would be undone;
6686           * bypass this by setting the outermost saved $^R to the latest
6687           * value */
6688 961118         if (oreplsv != GvSV(PL_replgv))
6689 961118         sv_setsv(oreplsv, GvSV(PL_replgv));
6690           }
6691           result = 1;
6692           goto final_exit;
6693            
6694           no:
6695 961082         DEBUG_EXECUTE_r(
6696           PerlIO_printf(Perl_debug_log,
6697           "%*s %sfailed...%s\n",
6698           REPORT_CODE_OFF+depth*2, "",
6699           PL_colors[4], PL_colors[5])
6700           );
6701            
6702           no_silent:
6703 961082         if (no_final) {
6704 961082         if (yes_state) {
6705           goto yes;
6706           } else {
6707           goto final_exit;
6708           }
6709           }
6710 961082         if (depth) {
6711           /* there's a previous state to backtrack to */
6712 961082         st--;
6713 961082         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6714 669000         PL_regmatch_slab = PL_regmatch_slab->prev;
6715 669000         st = SLAB_LAST(PL_regmatch_slab);
6716           }
6717 54         PL_regmatch_state = st;
6718 961082         locinput= st->locinput;
6719            
6720 530768         DEBUG_STATE_pp("pop");
6721 430314         depth--;
6722 430314         if (yes_state == st)
6723 405188         yes_state = st->u.yes.prev_yes_state;
6724            
6725 935956         state_num = st->resume_state + 1; /* failure = success + 1 */
6726 0         goto reenter_switch;
6727           }
6728           result = 0;
6729            
6730           final_exit:
6731 42         if (rex->intflags & PREGf_VERBARG_SEEN) {
6732 0         SV *sv_err = get_sv("REGERROR", 1);
6733 0         SV *sv_mrk = get_sv("REGMARK", 1);
6734 0         if (result) {
6735           sv_commit = &PL_sv_no;
6736 0         if (!sv_yes_mark)
6737           sv_yes_mark = &PL_sv_yes;
6738           } else {
6739 0         if (!sv_commit)
6740           sv_commit = &PL_sv_yes;
6741           sv_yes_mark = &PL_sv_no;
6742           }
6743 935956         sv_setsv(sv_err, sv_commit);
6744 124854668         sv_setsv(sv_mrk, sv_yes_mark);
6745           }
6746            
6747            
6748 124854710         if (last_pushed_cv) {
6749           dSP;
6750 124854740         POP_MULTICALL;
6751           PERL_UNUSED_VAR(SP);
6752           }
6753            
6754 124854710         assert(!result || locinput - reginfo->strbeg >= 0);
6755 1398610         return result ? locinput - reginfo->strbeg : -1;
6756           }
6757            
6758           /*
6759           - regrepeat - repeatedly match something simple, report how many
6760           *
6761           * What 'simple' means is a node which can be the operand of a quantifier like
6762           * '+', or {1,3}
6763           *
6764           * startposp - pointer a pointer to the start position. This is updated
6765           * to point to the byte following the highest successful
6766           * match.
6767           * p - the regnode to be repeatedly matched against.
6768           * reginfo - struct holding match state, such as strend
6769           * max - maximum number of things to match.
6770           * depth - (for debugging) backtracking depth.
6771           */
6772           STATIC I32
6773 1398568         S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6774           regmatch_info *const reginfo, I32 max, int depth)
6775           {
6776           dVAR;
6777           char *scan; /* Pointer to current position in target string */
6778           I32 c;
6779 1398568         char *loceol = reginfo->strend; /* local version */
6780           I32 hardcount = 0; /* How many matches so far */
6781 4         bool utf8_target = reginfo->is_utf8_target;
6782           int to_complement = 0; /* Invert the result? */
6783           UV utf8_flags;
6784           _char_class_number classnum;
6785           #ifndef DEBUGGING
6786           PERL_UNUSED_ARG(depth);
6787           #endif
6788            
6789 124854664         PERL_ARGS_ASSERT_REGREPEAT;
6790            
6791 123606588         scan = *startposp;
6792 27134164         if (max == REG_INFTY)
6793           max = I32_MAX;
6794 1278132         else if (! utf8_target && loceol - scan > max)
6795 1277930         loceol = scan + max;
6796            
6797           /* Here, for the case of a non-UTF-8 target we have adjusted down
6798           * to the maximum of how far we should go in it (leaving it set to the real
6799           * end, if the maximum permissible would take us beyond that). This allows
6800           * us to make the loop exit condition that we haven't gone past to
6801           * also mean that we haven't exceeded the max permissible count, saving a
6802           * test each time through the loop. But it assumes that the OP matches a
6803           * single byte, which is true for most of the OPs below when applied to a
6804           * non-UTF-8 target. Those relatively few OPs that don't have this
6805           * characteristic will have to compensate.
6806           *
6807           * There is no adjustment for UTF-8 targets, as the number of bytes per
6808           * character varies. OPs will have to test both that the count is less
6809           * than the max permissible (using to keep track), and that we
6810           * are still within the bounds of the string (using . A few OPs
6811           * match a single byte no matter what the encoding. They can omit the max
6812           * test if, for the UTF-8 case, they do the adjustment that was skipped
6813           * above.
6814           *
6815           * Thus, the code above sets things up for the common case; and exceptional
6816           * cases need extra work; the common case is to make sure doesn't
6817           * go past , and for UTF-8 to also use to make sure the
6818           * count doesn't exceed the maximum permissible */
6819            
6820 27133826         switch (OP(p)) {
6821           case REG_ANY:
6822 141758         if (utf8_target) {
6823 141758         while (scan < loceol && hardcount < max && *scan != '\n') {
6824 139442         scan += UTF8SKIP(scan);
6825 128830         hardcount++;
6826           }
6827           } else {
6828 28610         while (scan < loceol && *scan != '\n')
6829 28378         scan++;
6830           }
6831           break;
6832           case SANY:
6833 1790         if (utf8_target) {
6834 26820         while (scan < loceol && hardcount < max) {
6835 26820         scan += UTF8SKIP(scan);
6836 124854664         hardcount++;
6837           }
6838           }
6839           else
6840           scan = loceol;
6841           break;
6842           case CANY: /* Move forward bytes, unless goes off end */
6843 28367416         if (utf8_target && loceol - scan > max) {
6844            
6845           /* hadn't been adjusted in the UTF-8 case */
6846 28182320         scan += max;
6847           }
6848           else {
6849           scan = loceol;
6850           }
6851           break;
6852           case EXACT:
6853 14898562         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6854            
6855 14873336         c = (U8)*STRING(p);
6856            
6857           /* Can use a simple loop if the pattern char to match on is invariant
6858           * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6859           * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6860           * true iff it doesn't matter if the argument is in UTF-8 or not */
6861 988418         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
6862 371176         if (utf8_target && loceol - scan > max) {
6863           /* We didn't adjust because is UTF-8, but ok to do so,
6864           * since here, to match at all, 1 char == 1 byte */
6865 52562         loceol = scan + max;
6866           }
6867 961082         while (scan < loceol && UCHARAT(scan) == c) {
6868 935956         scan++;
6869           }
6870           }
6871 935956         else if (reginfo->is_utf8_pat) {
6872 100         if (utf8_target) {
6873           STRLEN scan_char_len;
6874            
6875           /* When both target and pattern are UTF-8, we have to do
6876           * string EQ */
6877 100         while (hardcount < max
6878 935956         && scan < loceol
6879 935956         && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6880 28342290         && memEQ(scan, STRING(p), scan_char_len))
6881           {
6882 936         scan += scan_char_len;
6883 358         hardcount++;
6884           }
6885           }
6886 178         else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6887            
6888           /* Target isn't utf8; convert the character in the UTF-8
6889           * pattern to non-UTF8, and do a simple loop */
6890 124829538         c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
6891 7571424         while (scan < loceol && UCHARAT(scan) == c) {
6892 7571424         scan++;
6893           }
6894           } /* else pattern char is above Latin1, can't possibly match the
6895           non-UTF-8 target */
6896           }
6897           else {
6898            
6899           /* Here, the string must be utf8; pattern isn't, and is
6900           * different in utf8 than not, so can't compare them directly.
6901           * Outside the loop, find the two utf8 bytes that represent c, and
6902           * then look for those in sequence in the utf8 string */
6903 6890156         U8 high = UTF8_TWO_BYTE_HI(c);
6904 1136136         U8 low = UTF8_TWO_BYTE_LO(c);
6905            
6906 4899172         while (hardcount < max
6907 3081768         && scan + 1 < loceol
6908 3081768         && UCHARAT(scan) == high
6909 3568440         && UCHARAT(scan + 1) == low)
6910           {
6911 1470242         scan += 2;
6912 7571424         hardcount++;
6913           }
6914           }
6915           break;
6916            
6917           case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
6918 127394         assert(! reginfo->is_utf8_pat);
6919           /* FALL THROUGH */
6920           case EXACTFA:
6921           utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6922           goto do_exactf;
6923            
6924           case EXACTFL:
6925 127394         RXp_MATCH_TAINTED_on(prog);
6926           utf8_flags = FOLDEQ_UTF8_LOCALE;
6927 43614         goto do_exactf;
6928            
6929           case EXACTF: /* This node only generated for non-utf8 patterns */
6930 0         assert(! reginfo->is_utf8_pat);
6931           utf8_flags = 0;
6932           goto do_exactf;
6933            
6934           case EXACTFU_SS:
6935           case EXACTFU:
6936 43614         utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6937            
6938           do_exactf: {
6939           int c1, c2;
6940           U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6941            
6942 222158         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6943            
6944 138378         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6945           reginfo))
6946           {
6947 138378         if (c1 == CHRTEST_VOID) {
6948           /* Use full Unicode fold matching */
6949 219634         char *tmpeol = reginfo->strend;
6950 109416         STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6951 83780         while (hardcount < max
6952 116865         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6953           STRING(p), NULL, pat_len,
6954           reginfo->is_utf8_pat, utf8_flags))
6955           {
6956 111774         scan = tmpeol;
6957 111774         tmpeol = reginfo->strend;
6958 111774         hardcount++;
6959           }
6960           }
6961 111774         else if (utf8_target) {
6962 111774         if (c1 == c2) {
6963 111774         while (scan < loceol
6964 111334         && hardcount < max
6965 222668         && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6966           {
6967 111774         scan += UTF8SKIP(scan);
6968 107096         hardcount++;
6969           }
6970           }
6971           else {
6972 107096         while (scan < loceol
6973 111774         && hardcount < max
6974 111774         && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6975 111774         || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6976           {
6977 0         scan += UTF8SKIP(scan);
6978 111774         hardcount++;
6979           }
6980           }
6981           }
6982 4622         else if (c1 == c2) {
6983 223548         while (scan < loceol && UCHARAT(scan) == c1) {
6984 111774         scan++;
6985           }
6986           }
6987           else {
6988 111774         while (scan < loceol &&
6989 111774         (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6990           {
6991 30         scan++;
6992           }
6993           }
6994           }
6995           break;
6996           }
6997           case ANYOF:
6998           case ANYOF_WARN_SUPER:
6999 30         if (utf8_target) {
7000 30         while (hardcount < max
7001 30         && scan < loceol
7002 30         && reginclass(prog, p, (U8*)scan, utf8_target))
7003           {
7004 30         scan += UTF8SKIP(scan);
7005 111744         hardcount++;
7006           }
7007           } else {
7008 111774         while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
7009 111774         scan++;
7010           }
7011           break;
7012            
7013           /* The argument (FLAGS) to all the POSIX node types is the class number */
7014            
7015           case NPOSIXL:
7016           to_complement = 1;
7017           /* FALLTHROUGH */
7018            
7019           case POSIXL:
7020 111774         RXp_MATCH_TAINTED_on(prog);
7021 111774         if (! utf8_target) {
7022 111774         while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7023           *scan)))
7024           {
7025 151263341         scan++;
7026           }
7027           } else {
7028 151263341         while (hardcount < max && scan < loceol
7029 151263341         && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7030           (U8 *) scan)))
7031           {
7032 151263341         scan += UTF8SKIP(scan);
7033 111774         hardcount++;
7034           }
7035           }
7036           break;
7037            
7038           case POSIXD:
7039 30         if (utf8_target) {
7040           goto utf8_posix;
7041           }
7042           /* FALLTHROUGH */
7043            
7044           case POSIXA:
7045 30         if (utf8_target && loceol - scan > max) {
7046            
7047           /* We didn't adjust at the beginning of this routine
7048           * because is UTF-8, but it is actually ok to do so, since here, to
7049           * match, 1 char == 1 byte. */
7050 30         loceol = scan + max;
7051           }
7052 30         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7053 30         scan++;
7054           }
7055           break;
7056            
7057           case NPOSIXD:
7058 30         if (utf8_target) {
7059           to_complement = 1;
7060           goto utf8_posix;
7061           }
7062           /* FALL THROUGH */
7063            
7064           case NPOSIXA:
7065 30         if (! utf8_target) {
7066 111774         while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7067 111774         scan++;
7068           }
7069           }
7070           else {
7071            
7072           /* The complement of something that matches only ASCII matches all
7073           * UTF-8 variant code points, plus everything in ASCII that isn't
7074           * in the class. */
7075 167661         while (hardcount < max && scan < loceol
7076 111774         && (! UTF8_IS_INVARIANT(*scan)
7077 111774         || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7078           {
7079 111774         scan += UTF8SKIP(scan);
7080 151263341         hardcount++;
7081           }
7082           }
7083           break;
7084            
7085           case NPOSIXU:
7086           to_complement = 1;
7087           /* FALLTHROUGH */
7088            
7089           case POSIXU:
7090 151263341         if (! utf8_target) {
7091 151263341         while (scan < loceol && to_complement
7092 151263341         ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7093           {
7094 417738         scan++;
7095           }
7096           }
7097           else {
7098           utf8_posix:
7099 1141235         classnum = (_char_class_number) FLAGS(p);
7100 514628         if (classnum < _FIRST_NON_SWASH_CC) {
7101            
7102           /* Here, a swash is needed for above-Latin1 code points.
7103           * Process as many Latin1 code points using the built-in rules.
7104           * Go to another loop to finish processing upon encountering
7105           * the first Latin1 code point. We could do that in this loop
7106           * as well, but the other way saves having to test if the swash
7107           * has been loaded every time through the loop: extra space to
7108           * save a test. */
7109 514628         while (hardcount < max && scan < loceol) {
7110 151263341         if (UTF8_IS_INVARIANT(*scan)) {
7111 502808         if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7112           classnum))))
7113           {
7114           break;
7115           }
7116 1005616         scan++;
7117           }
7118 627696         else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7119 627696         if (! (to_complement
7120 627696         ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7121           *(scan + 1)),
7122           classnum))))
7123           {
7124           break;
7125           }
7126 627696         scan += 2;
7127           }
7128           else {
7129           goto found_above_latin1;
7130           }
7131            
7132 627696         hardcount++;
7133           }
7134           }
7135           else {
7136           /* For these character classes, the knowledge of how to handle
7137           * every code point is compiled in to Perl via a macro. This
7138           * code is written for making the loops as tight as possible.
7139           * It could be refactored to save space instead */
7140 627696         switch (classnum) {
7141           case _CC_ENUM_SPACE: /* XXX would require separate code
7142           if we revert the change of \v
7143           matching this */
7144           /* FALL THROUGH */
7145           case _CC_ENUM_PSXSPC:
7146 316852         while (hardcount < max
7147 316852         && scan < loceol
7148 310844         && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7149           {
7150 627696         scan += UTF8SKIP(scan);
7151 495010         hardcount++;
7152           }
7153           break;
7154           case _CC_ENUM_BLANK:
7155 1005616         while (hardcount < max
7156 502808         && scan < loceol
7157 106572         && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7158           {
7159 213134         scan += UTF8SKIP(scan);
7160 158168         hardcount++;
7161           }
7162           break;
7163           case _CC_ENUM_XDIGIT:
7164 158168         while (hardcount < max
7165 158168         && scan < loceol
7166 158140         && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7167           {
7168 158140         scan += UTF8SKIP(scan);
7169 52120         hardcount++;
7170           }
7171           break;
7172           case _CC_ENUM_VERTSPACE:
7173 52120         while (hardcount < max
7174 106020         && scan < loceol
7175 158140         && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7176           {
7177 158140         scan += UTF8SKIP(scan);
7178 106548         hardcount++;
7179           }
7180           break;
7181           case _CC_ENUM_CNTRL:
7182 213120         while (hardcount < max
7183 1576113953         && scan < loceol
7184 0         && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7185           {
7186 0         scan += UTF8SKIP(scan);
7187 0         hardcount++;
7188           }
7189           break;
7190           default:
7191 0         Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7192           }
7193           }
7194           }
7195           break;
7196            
7197           found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7198            
7199           /* Load the swash if not already present */
7200 0         if (! PL_utf8_swash_ptrs[classnum]) {
7201 0         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7202 0         PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7203           "utf8", swash_property_names[classnum],
7204           &PL_sv_undef, 1, 0, NULL, &flags);
7205           }
7206            
7207 0         while (hardcount < max && scan < loceol
7208 0         && to_complement ^ cBOOL(_generic_utf8(
7209           classnum,
7210           scan,
7211           swash_fetch(PL_utf8_swash_ptrs[classnum],
7212           (U8 *) scan,
7213           TRUE))))
7214           {
7215 0         scan += UTF8SKIP(scan);
7216 0         hardcount++;
7217           }
7218           break;
7219            
7220           case LNBREAK:
7221 0         if (utf8_target) {
7222 0         while (hardcount < max && scan < loceol &&
7223 0         (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7224 0         scan += c;
7225 0         hardcount++;
7226           }
7227           } else {
7228           /* LNBREAK can match one or two latin chars, which is ok, but we
7229           * have to use hardcount in this situation, and throw away the
7230           * adjustment to done before the switch statement */
7231 0         loceol = reginfo->strend;
7232 0         while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7233 0         scan+=c;
7234 0         hardcount++;
7235           }
7236           }
7237           break;
7238            
7239           case BOUND:
7240           case BOUNDA:
7241           case BOUNDL:
7242           case BOUNDU:
7243           case EOS:
7244           case GPOS:
7245           case KEEPS:
7246           case NBOUND:
7247           case NBOUNDA:
7248           case NBOUNDL:
7249           case NBOUNDU:
7250           case OPFAIL:
7251           case SBOL:
7252           case SEOL:
7253           /* These are all 0 width, so match right here or not at all. */
7254           break;
7255            
7256           default:
7257 0         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7258           assert(0); /* NOTREACHED */
7259            
7260           }
7261            
7262 0         if (hardcount)
7263           c = hardcount;
7264           else
7265 0         c = scan - *startposp;
7266 0         *startposp = scan;
7267            
7268 0         DEBUG_r({
7269           GET_RE_DEBUG_FLAGS_DECL;
7270           DEBUG_EXECUTE_r({
7271           SV * const prop = sv_newmortal();
7272           regprop(prog, prop, p);
7273           PerlIO_printf(Perl_debug_log,
7274           "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7275           REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7276           });
7277           });
7278            
7279 0         return(c);
7280           }
7281            
7282            
7283           #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7284           /*
7285           - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7286           create a copy so that changes the caller makes won't change the shared one.
7287           If is non-null, will return NULL in it, for back-compat.
7288           */
7289           SV *
7290           Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7291           {
7292           PERL_ARGS_ASSERT_REGCLASS_SWASH;
7293            
7294           if (altsvp) {
7295           *altsvp = NULL;
7296           }
7297            
7298           return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7299           }
7300           #endif
7301            
7302           STATIC SV *
7303 0         S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7304           {
7305           /* Returns the swash for the input 'node' in the regex 'prog'.
7306           * If is 'true', will attempt to create the swash if not already
7307           * done.
7308           * If is non-null, will return the printable contents of the
7309           * swash. This can be used to get debugging information even before the
7310           * swash exists, by calling this function with 'doinit' set to false, in
7311           * which case the components that will be used to eventually create the
7312           * swash are returned (in a printable form).
7313           * Tied intimately to how regcomp.c sets up the data structure */
7314            
7315           dVAR;
7316           SV *sw = NULL;
7317           SV *si = NULL; /* Input swash initialization string */
7318           SV* invlist = NULL;
7319            
7320 0         RXi_GET_DECL(prog,progi);
7321 0         const struct reg_data * const data = prog ? progi->data : NULL;
7322            
7323 0         PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7324            
7325 0         assert(ANYOF_NONBITMAP(node));
7326            
7327 0         if (data && data->count) {
7328 0         const U32 n = ARG(node);
7329            
7330 0         if (data->what[n] == 's') {
7331 0         SV * const rv = MUTABLE_SV(data->data[n]);
7332 0         AV * const av = MUTABLE_AV(SvRV(rv));
7333 0         SV **const ary = AvARRAY(av);
7334 0         U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7335          
7336 0         si = *ary; /* ary[0] = the string to initialize the swash with */
7337            
7338           /* Elements 2 and 3 are either both present or both absent. [2] is
7339           * any inversion list generated at compile time; [3] indicates if
7340           * that inversion list has any user-defined properties in it. */
7341 0         if (av_len(av) >= 2) {
7342 0         invlist = ary[2];
7343 0         if (SvUV(ary[3])) {
7344 0         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7345           }
7346           }
7347           else {
7348           invlist = NULL;
7349           }
7350            
7351           /* Element [1] is reserved for the set-up swash. If already there,
7352           * return it; if not, create it and store it there */
7353 0         if (ary[1] && SvROK(ary[1])) {
7354 0         sw = ary[1];
7355           }
7356 0         else if (si && doinit) {
7357            
7358 0         sw = _core_swash_init("utf8", /* the utf8 package */
7359           "", /* nameless */
7360           si,
7361           1, /* binary */
7362           0, /* not from tr/// */
7363           invlist,
7364           &swash_init_flags);
7365 0         (void)av_store(av, 1, sw);
7366           }
7367           }
7368           }
7369          
7370           /* If requested, return a printable version of what this swash matches */
7371 0         if (listsvp) {
7372 0         SV* matches_string = newSVpvn("", 0);
7373            
7374           /* The swash should be used, if possible, to get the data, as it
7375           * contains the resolved data. But this function can be called at
7376           * compile-time, before everything gets resolved, in which case we
7377           * return the currently best available information, which is the string
7378           * that will eventually be used to do that resolving, 'si' */
7379 0         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7380 0         && (si && si != &PL_sv_undef))
7381           {
7382 0         sv_catsv(matches_string, si);
7383           }
7384            
7385           /* Add the inversion list to whatever we have. This may have come from
7386           * the swash, or from an input parameter */
7387 0         if (invlist) {
7388 0         sv_catsv(matches_string, _invlist_contents(invlist));
7389           }
7390 0         *listsvp = matches_string;
7391           }
7392            
7393 0         return sw;
7394           }
7395            
7396           /*
7397           - reginclass - determine if a character falls into a character class
7398          
7399           n is the ANYOF regnode
7400           p is the target string
7401           utf8_target tells whether p is in UTF-8.
7402            
7403           Returns true if matched; false otherwise.
7404            
7405           Note that this can be a synthetic start class, a combination of various
7406           nodes, so things you think might be mutually exclusive, such as locale,
7407           aren't. It can match both locale and non-locale
7408            
7409           */
7410            
7411           STATIC bool
7412 0         S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7413           {
7414           dVAR;
7415 0         const char flags = ANYOF_FLAGS(n);
7416           bool match = FALSE;
7417 0         UV c = *p;
7418            
7419 0         PERL_ARGS_ASSERT_REGINCLASS;
7420            
7421           /* If c is not already the code point, get it. Note that
7422           * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7423 0         if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7424 0         STRLEN c_len = 0;
7425 0         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7426           (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7427           | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7428           /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7429           * UTF8_ALLOW_FFFF */
7430 0         if (c_len == (STRLEN)-1)
7431 0         Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7432           }
7433            
7434           /* If this character is potentially in the bitmap, check it */
7435 0         if (c < 256) {
7436 0         if (ANYOF_BITMAP_TEST(n, c))
7437           match = TRUE;
7438 0         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7439 0         && ! utf8_target
7440 0         && ! isASCII(c))
7441           {
7442           match = TRUE;
7443           }
7444 0         else if (flags & ANYOF_LOCALE) {
7445 0         RXp_MATCH_TAINTED_on(prog);
7446            
7447 0         if ((flags & ANYOF_LOC_FOLD)
7448 0         && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7449           {
7450           match = TRUE;
7451           }
7452 0         else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7453            
7454           /* The data structure is arranged so bits 0, 2, 4, ... are set
7455           * if the class includes the Posix character class given by
7456           * bit/2; and 1, 3, 5, ... are set if the class includes the
7457           * complemented Posix class given by int(bit/2). So we loop
7458           * through the bits, each time changing whether we complement
7459           * the result or not. Suppose for the sake of illustration
7460           * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7461           * is set, it means there is a match for this ANYOF node if the
7462           * character is in the class given by the expression (0 / 2 = 0
7463           * = \w). If it is in that class, isFOO_lc() will return 1,
7464           * and since 'to_complement' is 0, the result will stay TRUE,
7465           * and we exit the loop. Suppose instead that bit 0 is 0, but
7466           * bit 1 is 1. That means there is a match if the character
7467           * matches \W. We won't bother to call isFOO_lc() on bit 0,
7468           * but will on bit 1. On the second iteration 'to_complement'
7469           * will be 1, so the exclusive or will reverse things, so we
7470           * are testing for \W. On the third iteration, 'to_complement'
7471           * will be 0, and we would be testing for \s; the fourth
7472           * iteration would test for \S, etc.
7473           *
7474           * Note that this code assumes that all the classes are closed
7475           * under folding. For example, if a character matches \w, then
7476           * its fold does too; and vice versa. This should be true for
7477           * any well-behaved locale for all the currently defined Posix
7478           * classes, except for :lower: and :upper:, which are handled
7479           * by the pseudo-class :cased: which matches if either of the
7480           * other two does. To get rid of this assumption, an outer
7481           * loop could be used below to iterate over both the source
7482           * character, and its fold (if different) */
7483            
7484           int count = 0;
7485           int to_complement = 0;
7486 0         while (count < ANYOF_MAX) {
7487 0         if (ANYOF_CLASS_TEST(n, count)
7488 0         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7489           {
7490           match = TRUE;
7491           break;
7492           }
7493 0         count++;
7494 0         to_complement ^= 1;
7495           }
7496           }
7497           }
7498           }
7499            
7500           /* If the bitmap didn't (or couldn't) match, and something outside the
7501           * bitmap could match, try that. Locale nodes specify completely the
7502           * behavior of code points in the bit map (otherwise, a utf8 target would
7503           * cause them to be treated as Unicode and not locale), except in
7504           * the very unlikely event when this node is a synthetic start class, which
7505           * could be a combination of locale and non-locale nodes. So allow locale
7506           * to match for the synthetic start class, which will give a false
7507           * positive that will be resolved when the match is done again as not part
7508           * of the synthetic start class */
7509 0         if (!match) {
7510 0         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7511           match = TRUE; /* Everything above 255 matches */
7512           }
7513 0         else if (ANYOF_NONBITMAP(n)
7514 0         && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7515 0         || (utf8_target
7516 0         && (c >=256
7517 0         || (! (flags & ANYOF_LOCALE))
7518 0         || OP(n) == ANYOF_SYNTHETIC))))
7519           {
7520 0         SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7521 0         if (sw) {
7522           U8 * utf8_p;
7523 0         if (utf8_target) {
7524           utf8_p = (U8 *) p;
7525           } else { /* Convert to utf8 */
7526 0         STRLEN len = 1;
7527 0         utf8_p = bytes_to_utf8(p, &len);
7528           }
7529            
7530 0         if (swash_fetch(sw, utf8_p, TRUE)) {
7531           match = TRUE;
7532           }
7533            
7534           /* If we allocated a string above, free it */
7535 0         if (! utf8_target) Safefree(utf8_p);
7536           }
7537           }
7538            
7539 0         if (UNICODE_IS_SUPER(c)
7540 0         && OP(n) == ANYOF_WARN_SUPER
7541 0         && ckWARN_d(WARN_NON_UNICODE))
7542           {
7543 0         Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7544           "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7545           }
7546           }
7547            
7548           /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7549 0         return cBOOL(flags & ANYOF_INVERT) ^ match;
7550           }
7551            
7552           STATIC U8 *
7553 0         S_reghop3(U8 *s, SSize_t off, const U8* lim)
7554           {
7555           /* return the position 'off' UTF-8 characters away from 's', forward if
7556           * 'off' >= 0, backwards if negative. But don't go outside of position
7557           * 'lim', which better be < s if off < 0 */
7558            
7559           dVAR;
7560            
7561 0         PERL_ARGS_ASSERT_REGHOP3;
7562            
7563 0         if (off >= 0) {
7564 0         while (off-- && s < lim) {
7565           /* XXX could check well-formedness here */
7566 0         s += UTF8SKIP(s);
7567           }
7568           }
7569           else {
7570 0         while (off++ && s > lim) {
7571 0         s--;
7572 0         if (UTF8_IS_CONTINUED(*s)) {
7573 0         while (s > lim && UTF8_IS_CONTINUATION(*s))
7574 0         s--;
7575           }
7576           /* XXX could check well-formedness here */
7577           }
7578           }
7579 0         return s;
7580           }
7581            
7582           #ifdef XXX_dmq
7583           /* there are a bunch of places where we use two reghop3's that should
7584           be replaced with this routine. but since thats not done yet
7585           we ifdef it out - dmq
7586           */
7587           STATIC U8 *
7588           S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
7589           {
7590           dVAR;
7591            
7592           PERL_ARGS_ASSERT_REGHOP4;
7593            
7594           if (off >= 0) {
7595           while (off-- && s < rlim) {
7596           /* XXX could check well-formedness here */
7597           s += UTF8SKIP(s);
7598           }
7599           }
7600           else {
7601           while (off++ && s > llim) {
7602           s--;
7603           if (UTF8_IS_CONTINUED(*s)) {
7604           while (s > llim && UTF8_IS_CONTINUATION(*s))
7605           s--;
7606           }
7607           /* XXX could check well-formedness here */
7608           }
7609           }
7610           return s;
7611           }
7612           #endif
7613            
7614           STATIC U8 *
7615 0         S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
7616           {
7617           dVAR;
7618            
7619 0         PERL_ARGS_ASSERT_REGHOPMAYBE3;
7620            
7621 0         if (off >= 0) {
7622 0         while (off-- && s < lim) {
7623           /* XXX could check well-formedness here */
7624 0         s += UTF8SKIP(s);
7625           }
7626 0         if (off >= 0)
7627           return NULL;
7628           }
7629           else {
7630 0         while (off++ && s > lim) {
7631 0         s--;
7632 0         if (UTF8_IS_CONTINUED(*s)) {
7633 0         while (s > lim && UTF8_IS_CONTINUATION(*s))
7634 0         s--;
7635           }
7636           /* XXX could check well-formedness here */
7637           }
7638 0         if (off <= 0)
7639           return NULL;
7640           }
7641 0         return s;
7642           }
7643            
7644            
7645           /* when executing a regex that may have (?{}), extra stuff needs setting
7646           up that will be visible to the called code, even before the current
7647           match has finished. In particular:
7648            
7649           * $_ is localised to the SV currently being matched;
7650           * pos($_) is created if necessary, ready to be updated on each call-out
7651           to code;
7652           * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7653           isn't set until the current pattern is successfully finished), so that
7654           $1 etc of the match-so-far can be seen;
7655           * save the old values of subbeg etc of the current regex, and set then
7656           to the current string (again, this is normally only done at the end
7657           of execution)
7658           */
7659            
7660           static void
7661 36         S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7662           {
7663           MAGIC *mg;
7664 72         regexp *const rex = ReANY(reginfo->prog);
7665 36         regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7666            
7667 36         eval_state->rex = rex;
7668            
7669 36         if (reginfo->sv) {
7670           /* Make $_ available to executed code. */
7671 36         if (reginfo->sv != DEFSV) {
7672 36         SAVE_DEFSV;
7673 36         DEFSV_set(reginfo->sv);
7674           }
7675            
7676 36         if (!(mg = mg_find_mglob(reginfo->sv))) {
7677           /* prepare for quick setting of pos */
7678 12         mg = sv_magicext_mglob(reginfo->sv);
7679 12         mg->mg_len = -1;
7680           }
7681 36         eval_state->pos_magic = mg;
7682 36         eval_state->pos = mg->mg_len;
7683 36         eval_state->pos_flags = mg->mg_flags;
7684           }
7685           else
7686 0         eval_state->pos_magic = NULL;
7687            
7688 36         if (!PL_reg_curpm) {
7689           /* PL_reg_curpm is a fake PMOP that we can attach the current
7690           * regex to and point PL_curpm at, so that $1 et al are visible
7691           * within a /(?{})/. It's just allocated once per interpreter the
7692           * first time its needed */
7693 12         Newxz(PL_reg_curpm, 1, PMOP);
7694           #ifdef USE_ITHREADS
7695           {
7696           SV* const repointer = &PL_sv_undef;
7697           /* this regexp is also owned by the new PL_reg_curpm, which
7698           will try to free it. */
7699           av_push(PL_regex_padav, repointer);
7700           PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7701           PL_regex_pad = AvARRAY(PL_regex_padav);
7702           }
7703           #endif
7704           }
7705 72         SET_reg_curpm(reginfo->prog);
7706 36         eval_state->curpm = PL_curpm;
7707 36         PL_curpm = PL_reg_curpm;
7708 36         if (RXp_MATCH_COPIED(rex)) {
7709           /* Here is a serious problem: we cannot rewrite subbeg,
7710           since it may be needed if this match fails. Thus
7711           $` inside (?{}) could fail... */
7712 0         eval_state->subbeg = rex->subbeg;
7713 0         eval_state->sublen = rex->sublen;
7714 0         eval_state->suboffset = rex->suboffset;
7715 0         eval_state->subcoffset = rex->subcoffset;
7716           #ifdef PERL_ANY_COW
7717 0         eval_state->saved_copy = rex->saved_copy;
7718           #endif
7719 0         RXp_MATCH_COPIED_off(rex);
7720           }
7721           else
7722 36         eval_state->subbeg = NULL;
7723 36         rex->subbeg = (char *)reginfo->strbeg;
7724 36         rex->suboffset = 0;
7725 36         rex->subcoffset = 0;
7726 36         rex->sublen = reginfo->strend - reginfo->strbeg;
7727 36         }
7728            
7729            
7730           /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7731            
7732           static void
7733 44         S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7734           {
7735           dVAR;
7736           regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7737 44         regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
7738           regmatch_slab *s;
7739            
7740 44         Safefree(aux->poscache);
7741            
7742 44         if (eval_state) {
7743            
7744           /* undo the effects of S_setup_eval_state() */
7745            
7746 36         if (eval_state->subbeg) {
7747 0         regexp * const rex = eval_state->rex;
7748 0         rex->subbeg = eval_state->subbeg;
7749 0         rex->sublen = eval_state->sublen;
7750 0         rex->suboffset = eval_state->suboffset;
7751 0         rex->subcoffset = eval_state->subcoffset;
7752           #ifdef PERL_ANY_COW
7753 0         rex->saved_copy = eval_state->saved_copy;
7754           #endif
7755 0         RXp_MATCH_COPIED_on(rex);
7756           }
7757 36         if (eval_state->pos_magic)
7758           {
7759 36         eval_state->pos_magic->mg_len = eval_state->pos;
7760 72         eval_state->pos_magic->mg_flags =
7761 36         (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
7762 36         | (eval_state->pos_flags & MGf_BYTES);
7763           }
7764            
7765 36         PL_curpm = eval_state->curpm;
7766           }
7767            
7768 44         PL_regmatch_state = aux->old_regmatch_state;
7769 44         PL_regmatch_slab = aux->old_regmatch_slab;
7770            
7771           /* free all slabs above current one - this must be the last action
7772           * of this function, as aux and eval_state are allocated within
7773           * slabs and may be freed here */
7774            
7775 44         s = PL_regmatch_slab->next;
7776 44         if (s) {
7777 0         PL_regmatch_slab->next = NULL;
7778 0         while (s) {
7779           regmatch_slab * const osl = s;
7780 0         s = s->next;
7781 0         Safefree(osl);
7782           }
7783           }
7784 44         }
7785            
7786            
7787           STATIC void
7788 0         S_to_utf8_substr(pTHX_ regexp *prog)
7789           {
7790           /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7791           * on the converted value */
7792            
7793           int i = 1;
7794            
7795 0         PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7796            
7797           do {
7798 0         if (prog->substrs->data[i].substr
7799 0         && !prog->substrs->data[i].utf8_substr) {
7800 0         SV* const sv = newSVsv(prog->substrs->data[i].substr);
7801 0         prog->substrs->data[i].utf8_substr = sv;
7802 0         sv_utf8_upgrade(sv);
7803 0         if (SvVALID(prog->substrs->data[i].substr)) {
7804 0         if (SvTAIL(prog->substrs->data[i].substr)) {
7805           /* Trim the trailing \n that fbm_compile added last
7806           time. */
7807 0         SvCUR_set(sv, SvCUR(sv) - 1);
7808           /* Whilst this makes the SV technically "invalid" (as its
7809           buffer is no longer followed by "\0") when fbm_compile()
7810           adds the "\n" back, a "\0" is restored. */
7811 0         fbm_compile(sv, FBMcf_TAIL);
7812           } else
7813 0         fbm_compile(sv, 0);
7814           }
7815 0         if (prog->substrs->data[i].substr == prog->check_substr)
7816 0         prog->check_utf8 = sv;
7817           }
7818 0         } while (i--);
7819 0         }
7820            
7821           STATIC bool
7822 28         S_to_byte_substr(pTHX_ regexp *prog)
7823           {
7824           /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7825           * on the converted value; returns FALSE if can't be converted. */
7826            
7827           dVAR;
7828           int i = 1;
7829            
7830 28         PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7831            
7832           do {
7833 56         if (prog->substrs->data[i].utf8_substr
7834 28         && !prog->substrs->data[i].substr) {
7835 28         SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7836 28         if (! sv_utf8_downgrade(sv, TRUE)) {
7837           return FALSE;
7838           }
7839 0         if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7840 0         if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7841           /* Trim the trailing \n that fbm_compile added last
7842           time. */
7843 0         SvCUR_set(sv, SvCUR(sv) - 1);
7844 0         fbm_compile(sv, FBMcf_TAIL);
7845           } else
7846 0         fbm_compile(sv, 0);
7847           }
7848 0         prog->substrs->data[i].substr = sv;
7849 0         if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7850 0         prog->check_substr = sv;
7851           }
7852 28         } while (i--);
7853            
7854           return TRUE;
7855 1216         }
7856            
7857           /*
7858           * Local variables:
7859           * c-indentation-style: bsd
7860           * c-basic-offset: 4
7861           * indent-tabs-mode: nil
7862           * End:
7863           *
7864           * ex: set ts=8 sts=4 sw=4 et:
7865           */