File Coverage

ext/re/re_exec.c
Criterion Covered Total %
statement 2080 2333 89.2
branch n/a
condition n/a
subroutine n/a
total 2080 2333 89.2


line stmt bran cond sub time code
1           /* regexec.c
2           */
3            
4           /*
5           * One Ring to rule them all, One Ring to find them
6           &
7           * [p.v of _The Lord of the Rings_, opening poem]
8           * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9           * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10           */
11            
12           /* This file contains functions for executing a regular expression. See
13           * also regcomp.c which funnily enough, contains functions for compiling
14           * a regular expression.
15           *
16           * This file is also copied at build time to ext/re/re_exec.c, where
17           * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18           * This causes the main functions to be compiled under new names and with
19           * debugging support added, which makes "use re 'debug'" work.
20           */
21            
22           /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23           * confused with the original package (see point 3 below). Thanks, Henry!
24           */
25            
26           /* Additional note: this code is very heavily munged from Henry's version
27           * in places. In some spots I've traded clarity for efficiency, so don't
28           * blame Henry for some of the lack of readability.
29           */
30            
31           /* The names of the functions have been changed from regcomp and
32           * regexec to pregcomp and pregexec in order to avoid conflicts
33           * with the POSIX routines of the same names.
34           */
35            
36           #ifdef PERL_EXT_RE_BUILD
37           #include "re_top.h"
38           #endif
39            
40           /* At least one required character in the target string is expressible only in
41           * UTF-8. */
42           static const char* const non_utf8_target_but_utf8_required
43           = "Can't match, because target string needs to be in UTF-8\n";
44            
45           #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46           DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47           goto target; \
48           } STMT_END
49            
50           /*
51           * pregcomp and pregexec -- regsub and regerror are not used in perl
52           *
53           * Copyright (c) 1986 by University of Toronto.
54           * Written by Henry Spencer. Not derived from licensed software.
55           *
56           * Permission is granted to anyone to use this software for any
57           * purpose on any computer system, and to redistribute it freely,
58           * subject to the following restrictions:
59           *
60           * 1. The author is not responsible for the consequences of use of
61           * this software, no matter how awful, even if they arise
62           * from defects in it.
63           *
64           * 2. The origin of this software must not be misrepresented, either
65           * by explicit claim or by omission.
66           *
67           * 3. Altered versions must be plainly marked as such, and must not
68           * be misrepresented as being the original software.
69           *
70           **** Alterations to Henry's code are...
71           ****
72           **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73           **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74           **** by Larry Wall and others
75           ****
76           **** You may distribute under the terms of either the GNU General Public
77           **** License or the Artistic License, as specified in the README file.
78           *
79           * Beware that some of this code is subtly aware of the way operator
80           * precedence is structured in regular expressions. Serious changes in
81           * regular-expression syntax might require a total rethink.
82           */
83           #include "EXTERN.h"
84           #define PERL_IN_REGEXEC_C
85           #include "perl.h"
86            
87           #ifdef PERL_IN_XSUB_RE
88           # include "re_comp.h"
89           #else
90           # include "regcomp.h"
91           #endif
92            
93           #include "inline_invlist.c"
94           #include "unicode_constants.h"
95            
96           #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97            
98           #ifndef STATIC
99           #define STATIC static
100           #endif
101            
102           /* Valid for non-utf8 strings: avoids the reginclass
103           * call if there are no complications: i.e., if everything matchable is
104           * straight forward in the bitmap */
105           #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
106           : ANYOF_BITMAP_TEST(p,*(c)))
107            
108           /*
109           * Forwards.
110           */
111            
112           #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
113           #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
114            
115           #define HOPc(pos,off) \
116           (char *)(reginfo->is_utf8_target \
117           ? reghop3((U8*)pos, off, \
118           (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
119           : (U8*)(pos + off))
120           #define HOPBACKc(pos, off) \
121           (char*)(reginfo->is_utf8_target \
122           ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
123           : (pos - off >= reginfo->strbeg) \
124           ? (U8*)pos - off \
125           : NULL)
126            
127           #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
128           #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
129            
130            
131           #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
132           #define NEXTCHR_IS_EOS (nextchr < 0)
133            
134           #define SET_nextchr \
135           nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
136            
137           #define SET_locinput(p) \
138           locinput = (p); \
139           SET_nextchr
140            
141            
142           #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
143           if (!swash_ptr) { \
144           U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
145           swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
146           1, 0, NULL, &flags); \
147           assert(swash_ptr); \
148           } \
149           } STMT_END
150            
151           /* If in debug mode, we test that a known character properly matches */
152           #ifdef DEBUGGING
153           # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
154           property_name, \
155           utf8_char_in_property) \
156           LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
157           assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
158           #else
159           # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
160           property_name, \
161           utf8_char_in_property) \
162           LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
163           #endif
164            
165           #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
166           PL_utf8_swash_ptrs[_CC_WORDCHAR], \
167           swash_property_names[_CC_WORDCHAR], \
168           LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
169            
170           #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
171           STMT_START { \
172           LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
173           "_X_regular_begin", \
174           LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
175           LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
176           "_X_extend", \
177           COMBINING_GRAVE_ACCENT_UTF8); \
178           } STMT_END
179            
180           #define PLACEHOLDER /* Something for the preprocessor to grab onto */
181           /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
182            
183           /* for use after a quantifier and before an EXACT-like node -- japhy */
184           /* it would be nice to rework regcomp.sym to generate this stuff. sigh
185           *
186           * NOTE that *nothing* that affects backtracking should be in here, specifically
187           * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
188           * node that is in between two EXACT like nodes when ascertaining what the required
189           * "follow" character is. This should probably be moved to regex compile time
190           * although it may be done at run time beause of the REF possibility - more
191           * investigation required. -- demerphq
192           */
193           #define JUMPABLE(rn) ( \
194           OP(rn) == OPEN || \
195           (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
196           OP(rn) == EVAL || \
197           OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
198           OP(rn) == PLUS || OP(rn) == MINMOD || \
199           OP(rn) == KEEPS || \
200           (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
201           )
202           #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
203            
204           #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
205            
206           #if 0
207           /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
208           we don't need this definition. */
209           #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
210           #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
211           #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
212            
213           #else
214           /* ... so we use this as its faster. */
215           #define IS_TEXT(rn) ( OP(rn)==EXACT )
216           #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
217           #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
218           #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
219            
220           #endif
221            
222           /*
223           Search for mandatory following text node; for lookahead, the text must
224           follow but for lookbehind (rn->flags != 0) we skip to the next step.
225           */
226           #define FIND_NEXT_IMPT(rn) STMT_START { \
227           while (JUMPABLE(rn)) { \
228           const OPCODE type = OP(rn); \
229           if (type == SUSPEND || PL_regkind[type] == CURLY) \
230           rn = NEXTOPER(NEXTOPER(rn)); \
231           else if (type == PLUS) \
232           rn = NEXTOPER(rn); \
233           else if (type == IFMATCH) \
234           rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
235           else rn += NEXT_OFF(rn); \
236           } \
237           } STMT_END
238            
239           /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
240           * These are for the pre-composed Hangul syllables, which are all in a
241           * contiguous block and arranged there in such a way so as to facilitate
242           * alorithmic determination of their characteristics. As such, they don't need
243           * a swash, but can be determined by simple arithmetic. Almost all are
244           * GCB=LVT, but every 28th one is a GCB=LV */
245           #define SBASE 0xAC00 /* Start of block */
246           #define SCount 11172 /* Length of block */
247           #define TCount 28
248            
249           #define SLAB_FIRST(s) (&(s)->states[0])
250           #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
251            
252           static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
253           static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
254           static regmatch_state * S_push_slab(pTHX);
255            
256           #define REGCP_PAREN_ELEMS 3
257           #define REGCP_OTHER_ELEMS 3
258           #define REGCP_FRAME_ELEMS 1
259           /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
260           * are needed for the regexp context stack bookkeeping. */
261            
262           STATIC CHECKPOINT
263 67399426         S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
264           {
265           dVAR;
266 67399426         const int retval = PL_savestack_ix;
267 67399426         const int paren_elems_to_push =
268 67399426         (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
269 67399426         const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
270 67399426         const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
271           I32 p;
272 67399426         GET_RE_DEBUG_FLAGS_DECL;
273            
274 48         PERL_ARGS_ASSERT_REGCPPUSH;
275            
276 67399426         if (paren_elems_to_push < 0)
277 0         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
278           paren_elems_to_push);
279            
280 67399426         if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
281 233812454         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
282           " out of range (%lu-%ld)",
283           total_elems,
284           (unsigned long)maxopenparen,
285           (long)parenfloor);
286            
287 166413124         SSGROW(total_elems + REGCP_FRAME_ELEMS);
288          
289 166413124         DEBUG_BUFFERS_r(
290           if ((int)maxopenparen > (int)parenfloor)
291           PerlIO_printf(Perl_debug_log,
292           "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
293           PTR2UV(rex),
294           PTR2UV(rex->offs)
295           );
296           );
297 166413124         for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
298           /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
299 67399378         SSPUSHIV(rex->offs[p].end);
300 67399378         SSPUSHIV(rex->offs[p].start);
301 67399378         SSPUSHINT(rex->offs[p].start_tmp);
302 67399378         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
303           " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
304           (UV)p,
305           (IV)rex->offs[p].start,
306           (IV)rex->offs[p].start_tmp,
307           (IV)rex->offs[p].end
308           ));
309           }
310           /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
311 67399426         SSPUSHINT(maxopenparen);
312 59469062         SSPUSHINT(rex->lastparen);
313 59469062         SSPUSHINT(rex->lastcloseparen);
314 59469062         SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
315            
316 59469062         return retval;
317           }
318            
319           /* These are needed since we do not localize EVAL nodes: */
320           #define REGCP_SET(cp) \
321           DEBUG_STATE_r( \
322           PerlIO_printf(Perl_debug_log, \
323           " Setting an EVAL scope, savestack=%"IVdf"\n", \
324           (IV)PL_savestack_ix)); \
325           cp = PL_savestack_ix
326            
327           #define REGCP_UNWIND(cp) \
328           DEBUG_STATE_r( \
329           if (cp != PL_savestack_ix) \
330           PerlIO_printf(Perl_debug_log, \
331           " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
332           (IV)(cp), (IV)PL_savestack_ix)); \
333           regcpblow(cp)
334            
335           #define UNWIND_PAREN(lp, lcp) \
336           for (n = rex->lastparen; n > lp; n--) \
337           rex->offs[n].end = -1; \
338           rex->lastparen = n; \
339           rex->lastcloseparen = lcp;
340            
341            
342           STATIC void
343 59469062         S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
344           {
345           dVAR;
346           UV i;
347           U32 paren;
348 59469062         GET_RE_DEBUG_FLAGS_DECL;
349            
350 59469062         PERL_ARGS_ASSERT_REGCPPOP;
351            
352           /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
353 59469062         i = SSPOPUV;
354 228338814         assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
355 168869800         i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
356 168869800         rex->lastcloseparen = SSPOPINT;
357 168869800         rex->lastparen = SSPOPINT;
358 168869800         *maxopenparen_p = SSPOPINT;
359            
360 17893784         i -= REGCP_OTHER_ELEMS;
361           /* Now restore the parentheses context. */
362 168869800         DEBUG_BUFFERS_r(
363           if (i || rex->lastparen + 1 <= rex->nparens)
364           PerlIO_printf(Perl_debug_log,
365           "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
366           PTR2UV(rex),
367           PTR2UV(rex->offs)
368           );
369           );
370 471769998         paren = *maxopenparen_p;
371 412300984         for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
372           SSize_t tmps;
373 261324920         rex->offs[paren].start_tmp = SSPOPINT;
374 412300936         rex->offs[paren].start = SSPOPIV;
375 59469014         tmps = SSPOPIV;
376 1877336         if (paren <= rex->lastparen)
377 1877336         rex->offs[paren].end = tmps;
378 1877336         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
379           " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
380           (UV)paren,
381           (IV)rex->offs[paren].start,
382           (IV)rex->offs[paren].start_tmp,
383           (IV)rex->offs[paren].end,
384           (paren > rex->lastparen ? "(skipped)" : ""));
385           );
386 943578         paren--;
387           }
388           #if 1
389           /* It would seem that the similar code in regtry()
390           * already takes care of this, and in fact it is in
391           * a better location to since this code can #if 0-ed out
392           * but the code in regtry() is needed or otherwise tests
393           * requiring null fields (pat.t#187 and split.t#{13,14}
394           * (as of patchlevel 7877) will fail. Then again,
395           * this code seems to be necessary or otherwise
396           * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397           * --jhi updated by dapm */
398 5574         for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399 5526         if (i > *maxopenparen_p)
400 120         rex->offs[i].start = -1;
401 120         rex->offs[i].end = -1;
402 0         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
403           " \\%"UVuf": %s ..-1 undeffing\n",
404           (UV)i,
405           (i > *maxopenparen_p) ? "-1" : " "
406           ));
407           }
408           #endif
409 168         }
410            
411           /* restore the parens and associated vars at savestack position ix,
412           * but without popping the stack */
413            
414           STATIC void
415           S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
416           {
417 48         I32 tmpix = PL_savestack_ix;
418 48         PL_savestack_ix = ix;
419 168         regcppop(rex, maxopenparen_p);
420 260         PL_savestack_ix = tmpix;
421           }
422            
423           #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
424            
425           STATIC bool
426 120         S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
427           {
428           /* Returns a boolean as to whether or not 'character' is a member of the
429           * Posix character class given by 'classnum' that should be equivalent to a
430           * value in the typedef '_char_class_number'.
431           *
432           * Ideally this could be replaced by a just an array of function pointers
433           * to the C library functions that implement the macros this calls.
434           * However, to compile, the precise function signatures are required, and
435           * these may vary from platform to to platform. To avoid having to figure
436           * out what those all are on each platform, I (khw) am using this method,
437           * which adds an extra layer of function call overhead (unless the C
438           * optimizer strips it away). But we don't particularly care about
439           * performance with locales anyway. */
440            
441 120         switch ((_char_class_number) classnum) {
442 632         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
443 120         case _CC_ENUM_ALPHA: return isALPHA_LC(character);
444 120         case _CC_ENUM_ASCII: return isASCII_LC(character);
445 2744         case _CC_ENUM_BLANK: return isBLANK_LC(character);
446 120         case _CC_ENUM_CASED: return isLOWER_LC(character)
447 798         || isUPPER_LC(character);
448 60         case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
449 2763         case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
450 224         case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
451 224         case _CC_ENUM_LOWER: return isLOWER_LC(character);
452 84         case _CC_ENUM_PRINT: return isPRINT_LC(character);
453 140         case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
454 56         case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
455 84         case _CC_ENUM_SPACE: return isSPACE_LC(character);
456 56         case _CC_ENUM_UPPER: return isUPPER_LC(character);
457 0         case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
458 0         case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
459           default: /* VERTSPACE should never occur in locales */
460 56         Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
461           }
462            
463           assert(0); /* NOTREACHED */
464           return FALSE;
465           }
466            
467           STATIC bool
468 28         S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
469           {
470           /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
471           * 'character' is a member of the Posix character class given by 'classnum'
472           * that should be equivalent to a value in the typedef
473           * '_char_class_number'.
474           *
475           * This just calls isFOO_lc on the code point for the character if it is in
476           * the range 0-255. Outside that range, all characters avoid Unicode
477           * rules, ignoring any locale. So use the Unicode function if this class
478           * requires a swash, and use the Unicode macro otherwise. */
479            
480 28         PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
481            
482 0         if (UTF8_IS_INVARIANT(*character)) {
483 0         return isFOO_lc(classnum, *character);
484           }
485 112         else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
486 0         return isFOO_lc(classnum,
487           TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
488           }
489            
490 0         if (classnum < _FIRST_NON_SWASH_CC) {
491            
492           /* Initialize the swash unless done already */
493 0         if (! PL_utf8_swash_ptrs[classnum]) {
494 137167339         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
495 137167339         PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
496           swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
497           }
498            
499 137167339         return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
500           character,
501           TRUE /* is UTF */ ));
502           }
503            
504 137167339         switch ((_char_class_number) classnum) {
505           case _CC_ENUM_SPACE:
506 137167339         case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
507            
508 137166981         case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
509 137166981         case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
510 137166981         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
511           default: return 0; /* Things like CNTRL are always
512           below 256 */
513           }
514            
515           assert(0); /* NOTREACHED */
516           return FALSE;
517           }
518            
519           /*
520           * pregexec and friends
521           */
522            
523           #ifndef PERL_IN_XSUB_RE
524           /*
525           - pregexec - match a regexp against a string
526           */
527           I32
528           Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
529           char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
530           /* stringarg: the point in the string at which to begin matching */
531           /* strend: pointer to null at end of string */
532           /* strbeg: real beginning of string */
533           /* minend: end of match must be >= minend bytes after stringarg. */
534           /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
535           * itself is accessed via the pointers above */
536           /* nosave: For optimizations. */
537           {
538           PERL_ARGS_ASSERT_PREGEXEC;
539            
540           return
541           regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
542           nosave ? 0 : REXEC_COPY_STR);
543           }
544           #endif
545            
546           /*
547           * Need to implement the following flags for reg_anch:
548           *
549           * USE_INTUIT_NOML - Useful to call re_intuit_start() first
550           * USE_INTUIT_ML
551           * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
552           * INTUIT_AUTORITATIVE_ML
553           * INTUIT_ONCE_NOML - Intuit can match in one location only.
554           * INTUIT_ONCE_ML
555           *
556           * Another flag for this function: SECOND_TIME (so that float substrs
557           * with giant delta may be not rechecked).
558           */
559            
560           /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
561           Otherwise, only SvCUR(sv) is used to get strbeg. */
562            
563           /* XXXX Some places assume that there is a fixed substring.
564           An update may be needed if optimizer marks as "INTUITable"
565           RExen without fixed substrings. Similarly, it is assumed that
566           lengths of all the strings are no more than minlen, thus they
567           cannot come from lookahead.
568           (Or minlen should take into account lookahead.)
569           NOTE: Some of this comment is not correct. minlen does now take account
570           of lookahead/behind. Further research is required. -- demerphq
571            
572           */
573            
574           /* A failure to find a constant substring means that there is no need to make
575           an expensive call to REx engine, thus we celebrate a failure. Similarly,
576           finding a substring too deep into the string means that fewer calls to
577           regtry() should be needed.
578            
579           REx compiler's optimizer found 4 possible hints:
580           a) Anchored substring;
581           b) Fixed substring;
582           c) Whether we are anchored (beginning-of-line or \G);
583           d) First node (of those at offset 0) which may distinguish positions;
584           We use a)b)d) and multiline-part of c), and try to find a position in the
585           string which does not contradict any of them.
586           */
587            
588           /* Most of decisions we do here should have been done at compile time.
589           The nodes of the REx which we used for the search should have been
590           deleted from the finite automaton. */
591            
592           /* args:
593           * rx: the regex to match against
594           * sv: the SV being matched: only used for utf8 flag; the string
595           * itself is accessed via the pointers below. Note that on
596           * something like an overloaded SV, SvPOK(sv) may be false
597           * and the string pointers may point to something unrelated to
598           * the SV itself.
599           * strbeg: real beginning of string
600           * strpos: the point in the string at which to begin matching
601           * strend: pointer to the byte following the last char of the string
602           * flags currently unused; set to 0
603           * data: currently unused; set to NULL
604           */
605            
606           char *
607 137167073         Perl_re_intuit_start(pTHX_
608           REGEXP * const rx,
609           SV *sv,
610           const char * const strbeg,
611           char *strpos,
612           char *strend,
613           const U32 flags,
614           re_scream_pos_data *data)
615           {
616           dVAR;
617 137167073         struct regexp *const prog = ReANY(rx);
618           SSize_t start_shift = 0;
619           /* Should be nonnegative! */
620           SSize_t end_shift = 0;
621           char *s;
622           SV *check;
623           char *t;
624 137167073         const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
625           I32 ml_anch;
626           char *other_last = NULL; /* other substr checked before this */
627           char *check_at = NULL; /* check substr found at this pos */
628           char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
629 137167073         const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
630 137167073         RXi_GET_DECL(prog,progi);
631           regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
632           regmatch_info *const reginfo = ®info_buf;
633           #ifdef DEBUGGING
634           const char * const i_strpos = strpos;
635           #endif
636 1143398         GET_RE_DEBUG_FLAGS_DECL;
637            
638 495102         PERL_ARGS_ASSERT_RE_INTUIT_START;
639           PERL_UNUSED_ARG(flags);
640           PERL_UNUSED_ARG(data);
641            
642           /* CHR_DIST() would be more correct here but it makes things slow. */
643 1143398         if (prog->minlen > strend - strpos) {
644 136023675         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
645           "String too short... [re_intuit_start]\n"));
646           goto fail;
647           }
648            
649 106664         reginfo->is_utf8_target = cBOOL(utf8_target);
650 136023739         reginfo->info_aux = NULL;
651 137167045         reginfo->strbeg = strbeg;
652 72855894         reginfo->strend = strend;
653 106369435         reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
654 70019930         reginfo->intuit = 1;
655           /* not actually used within intuit, but zero for safety anyway */
656 70019930         reginfo->poscache_maxiter = 0;
657            
658 71770960         if (utf8_target) {
659 95011864         if (!prog->check_utf8 && prog->check_substr)
660 63616418         to_utf8_substr(prog);
661 63616278         check = prog->check_utf8;
662           } else {
663 39943195         if (!prog->check_substr && prog->check_utf8) {
664 39943131         if (! to_byte_substr(prog)) {
665 39942795         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
666           }
667           }
668 39942831         check = prog->check_substr;
669           }
670 416596         if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */
671 416532         && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */
672           {
673 74482         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
674 70404         || ( (prog->extflags & RXf_ANCH_BOL)
675 70404         && !multiline ) ); /* Check after \n? */
676            
677 63438         if (!ml_anch) {
678 10906         if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
679 39526235         && (strpos != strbeg)) {
680 7547081         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
681           goto fail;
682           }
683 6422740         if (prog->check_offset_min == prog->check_offset_max
684 31827961         && !(prog->extflags & RXf_CANY_SEEN)
685 31827961         && ! multiline) /* /m can cause \n's to match that aren't
686           accounted for in the string max length.
687           See [perl #115242] */
688           {
689           /* Substring at constant offset from beg-of-str... */
690           SSize_t slen;
691            
692 31827961         s = HOP3c(strpos, prog->check_offset_min, strend);
693          
694 35137806         if (SvTAIL(check)) {
695 23673511         slen = SvCUR(check); /* >= 1 */
696            
697 23673511         if ( strend - s > slen || strend - s < slen - 1
698 23673511         || (strend - s == slen && strend[-1] != '\n')) {
699 65396085         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
700           goto fail_finish;
701           }
702           /* Now should match s[0..slen-2] */
703 65706360         slen--;
704 98104332         if (slen && (*SvPVX_const(check) != *s
705 0         || (slen > 1
706 0         && memNE(SvPVX_const(check), s, slen)))) {
707           report_neq:
708 98104332         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
709           goto fail_finish;
710           }
711           }
712 62         else if (*SvPVX_const(check) != *s
713 62         || ((slen = SvCUR(check)) > 1
714 98104270         && memNE(SvPVX_const(check), s, slen)))
715           goto report_neq;
716           check_at = s;
717           goto success_at_start;
718           }
719           }
720           /* Match is anchored, but substr is not anchored wrt beg-of-str. */
721           s = strpos;
722 98104270         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
723 98104332         end_shift = prog->check_end_shift;
724          
725 98104332         if (!ml_anch) {
726 84090637         const SSize_t end = prog->check_offset_max + CHR_SVLEN(check)
727 46088044         - (SvTAIL(check) != 0);
728 5113531         const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
729            
730 5113531         if (end_shift < eshift)
731           end_shift = eshift;
732           }
733           }
734           else { /* Can match at random position */
735           ml_anch = 0;
736           s = strpos;
737 1764544         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
738 1764544         end_shift = prog->check_end_shift;
739          
740           /* end shift should be non negative here */
741           }
742            
743           #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
744           if (end_shift < 0)
745           Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
746           (IV)end_shift, RX_PRECOMP(prog));
747           #endif
748            
749           restart:
750           /* Find a possible match in the region s..strend by looking for
751           the "check" substring in the region corrected by start/end_shift. */
752          
753           {
754           SSize_t srch_start_shift = start_shift;
755           SSize_t srch_end_shift = end_shift;
756           U8* start_point;
757           U8* end_point;
758 1764544         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
759 43140         srch_end_shift -= ((strbeg - s) - srch_start_shift);
760 15764         srch_start_shift = strbeg - s;
761           }
762 873639         DEBUG_OPTIMISE_MORE_r({
763           PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
764           (IV)prog->check_offset_min,
765           (IV)srch_start_shift,
766           (IV)srch_end_shift,
767           (IV)prog->check_end_shift);
768           });
769          
770 1764544         if (prog->extflags & RXf_CANY_SEEN) {
771 1764480         start_point= (U8*)(s + srch_start_shift);
772 1764480         end_point= (U8*)(strend - srch_end_shift);
773           } else {
774 1764544         start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
775 1764544         end_point= HOP3(strend, -srch_end_shift, strbeg);
776           }
777 1764544         DEBUG_OPTIMISE_MORE_r({
778           PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
779           (int)(end_point - start_point),
780           (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
781           start_point);
782           });
783            
784 1764544         s = fbm_instr( start_point, end_point,
785           check, multiline ? FBMrf_MULTILINE : 0);
786           }
787           /* Update the count-of-usability, remove useless subpatterns,
788           unshift s. */
789            
790 1764544         DEBUG_EXECUTE_r({
791           RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
792           SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
793           PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
794           (s ? "Found" : "Did not find"),
795           (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
796           ? "anchored" : "floating"),
797           quoted,
798           RE_SV_TAIL(check),
799           (s ? " at offset " : "...\n") );
800           });
801            
802 160862         if (!s)
803           goto fail_finish;
804           /* Finish the diagnostic message */
805 142276         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
806            
807           /* XXX dmq: first branch is for positive lookbehind...
808           Our check string is offset from the beginning of the pattern.
809           So we need to do any stclass tests offset forward from that
810           point. I think. :-(
811           */
812          
813          
814          
815           check_at=s;
816          
817            
818           /* Got a candidate. Check MBOL anchoring, and the *other* substr.
819           Start with the other substr.
820           XXXX no SCREAM optimization yet - and a very coarse implementation
821           XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
822           *always* match. Probably should be marked during compile...
823           Probably it is right to do no SCREAM here...
824           */
825            
826 49209943         if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
827 1603750         : (prog->float_substr && prog->anchored_substr))
828           {
829           /* Take into account the "other" substring. */
830           /* XXXX May be hopelessly wrong for UTF... */
831 1603684         if (!other_last)
832           other_last = strpos;
833 1603684         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
834           do_other_anchored:
835           {
836 3353297         char * const last = HOP3c(s, -start_shift, strbeg);
837           char *last1, *last2;
838           char * const saved_s = s;
839           SV* must;
840            
841 3353297         t = s - prog->check_offset_max;
842 3353297         if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
843 868338         && (!utf8_target
844 3353295         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
845 3353295         && t > strpos)))
846           NOOP;
847           else
848           t = strpos;
849 3353297         t = HOP3c(t, prog->anchored_offset, strend);
850 3353297         if (t < other_last) /* These positions already checked */
851           t = other_last;
852 3353297         last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
853 3353297         if (last < last1)
854           last1 = last;
855           /* XXXX It is not documented what units *_offsets are in.
856           We assume bytes, but this is clearly wrong.
857           Meaning this code needs to be carefully reviewed for errors.
858           dmq.
859           */
860          
861           /* On end-of-str: see comment below. */
862 630018         must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
863 620552         if (must == &PL_sv_undef) {
864           s = (char*)NULL;
865 2723279         DEBUG_r(must = prog->anchored_utf8); /* for debug */
866           }
867           else
868 29032180         s = fbm_instr(
869           (unsigned char*)t,
870           HOP3(HOP3(last1, prog->anchored_offset, strend)
871           + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
872           must,
873           multiline ? FBMrf_MULTILINE : 0
874           );
875 29032180         DEBUG_EXECUTE_r({
876           RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
877           SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
878           PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
879           (s ? "Found" : "Contradicts"),
880           quoted, RE_SV_TAIL(must));
881           });
882          
883          
884 5812949         if (!s) {
885 83286         if (last1 >= last2) {
886 71884         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
887           ", giving up...\n"));
888           goto fail_finish;
889           }
890 9041640         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
891           ", trying floating at offset %ld...\n",
892           (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
893 11988574         other_last = HOP3c(last1, prog->anchored_offset+1, strend);
894 11986670         s = HOP3c(last, 1, strend);
895           goto restart;
896           }
897           else {
898 446794         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
899           (long)(s - i_strpos)));
900 365348         t = HOP3c(s, -prog->anchored_offset, strbeg);
901 36         other_last = HOP3c(s, 1, strend);
902           s = saved_s;
903 36         if (t == strpos)
904           goto try_at_start;
905           goto try_at_offset;
906           }
907           }
908           }
909           else { /* Take into account the floating substring. */
910           char *last, *last1;
911           char * const saved_s = s;
912           SV* must;
913            
914 365312         t = HOP3c(s, -start_shift, strbeg);
915           last1 = last =
916 365312         HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
917 81446         if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
918 81446         last = HOP3c(t, prog->float_max_offset, strend);
919 11539878         s = HOP3c(t, prog->float_min_offset, strend);
920 9354398         if (s < other_last)
921           s = other_last;
922           /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
923 24371921         must = utf8_target ? prog->float_utf8 : prog->float_substr;
924           /* fbm_instr() takes into account exact value of end-of-str
925           if the check is SvTAIL(ed). Since false positives are OK,
926           and end-of-str is not later than strend we are OK. */
927 441177         if (must == &PL_sv_undef) {
928           s = (char*)NULL;
929 29159732         DEBUG_r(must = prog->float_utf8); /* for debug message */
930           }
931           else
932 19063114         s = fbm_instr((unsigned char*)s,
933           (unsigned char*)last + SvCUR(must)
934           - (SvTAIL(must)!=0),
935           must, multiline ? FBMrf_MULTILINE : 0);
936 333948         DEBUG_EXECUTE_r({
937           RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
938           SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
939           PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
940           (s ? "Found" : "Contradicts"),
941           quoted, RE_SV_TAIL(must));
942           });
943 333948         if (!s) {
944 930         if (last1 == last) {
945 9197609         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
946           ", giving up...\n"));
947           goto fail_finish;
948           }
949 9197609         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
950           ", trying anchored starting at offset %ld...\n",
951           (long)(saved_s + 1 - i_strpos)));
952           other_last = last;
953 1903176         s = HOP3c(t, 1, strend);
954           goto restart;
955           }
956           else {
957 194602         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
958           (long)(s - i_strpos)));
959           other_last = s; /* Fix this later. --Hugo */
960           s = saved_s;
961 194602         if (t == strpos)
962           goto try_at_start;
963           goto try_at_offset;
964           }
965           }
966           }
967            
968          
969 194664         t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
970          
971 194664         DEBUG_OPTIMISE_MORE_r(
972           PerlIO_printf(Perl_debug_log,
973           "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
974           (IV)prog->check_offset_min,
975           (IV)prog->check_offset_max,
976           (IV)(s-strpos),
977           (IV)(t-strpos),
978           (IV)(t-s),
979           (IV)(strend-strpos)
980           )
981           );
982            
983 194664         if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
984 48         && (!utf8_target
985 14874731         || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
986 38514130         && t > strpos)))
987           {
988           /* Fixed substring is found far enough so that the match
989           cannot start at strpos. */
990           try_at_offset:
991 4936068         if (ml_anch && t[-1] != '\n') {
992           /* Eventually fbm_*() should handle this, but often
993           anchored_offset is not 0, so this check will not be wasted. */
994           /* XXXX In the code below we prefer to look for "^" even in
995           presence of anchored substrings. And we search even
996           beyond the found float position. These pessimizations
997           are historical artefacts only. */
998           find_anchor:
999 5101775         while (t < strend - prog->minlen) {
1000 131458         if (*t == '\n') {
1001 68510         if (t < check_at - prog->check_offset_min) {
1002 62948         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1003           /* Since we moved from the found position,
1004           we definitely contradict the found anchored
1005           substr. Due to the above check we do not
1006           contradict "check" substr.
1007           Thus we can arrive here only if check substr
1008           is float. Redo checking for "other"=="fixed".
1009           */
1010 4936062         strpos = t + 1;
1011 253464         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1012           PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1013           goto do_other_anchored;
1014           }
1015           /* We don't contradict the found floating substring. */
1016           /* XXXX Why not check for STCLASS? */
1017 4682598         s = t + 1;
1018 4644528         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1019           PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1020           goto set_useful;
1021           }
1022           /* Position contradicts check-string */
1023           /* XXXX probably better to look for check-string
1024           than for "\n", so one should lower the limit for t? */
1025 4936062         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1026           PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1027 4936062         other_last = strpos = s = t + 1;
1028 4936062         goto restart;
1029           }
1030 295992         t++;
1031           }
1032 194332         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1033           PL_colors[0], PL_colors[1]));
1034           goto fail_finish;
1035           }
1036           else {
1037 178632         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1038           PL_colors[0], PL_colors[1]));
1039           }
1040           s = t;
1041           set_useful:
1042 178632         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1043           }
1044           else {
1045           /* The found string does not prohibit matching at strpos,
1046           - no optimization of calling REx engine can be performed,
1047           unless it was an MBOL and we are not after MBOL,
1048           or a future STCLASS check will fail this. */
1049           try_at_start:
1050           /* Even in this situation we may use MBOL flag if strpos is offset
1051           wrt the start of the string. */
1052 12898         if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1053           /* May be due to an implicit anchor of m{.*foo} */
1054 119994         && !(prog->intflags & PREGf_IMPLICIT))
1055           {
1056           t = strpos;
1057           goto find_anchor;
1058           }
1059 120052         DEBUG_EXECUTE_r( if (ml_anch)
1060           PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1061           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1062           );
1063           success_at_start:
1064 36136         if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1065 4424         && (utf8_target ? (
1066 4210         prog->check_utf8 /* Could be deleted already */
1067 165786         && --BmUSEFUL(prog->check_utf8) < 0
1068 54422         && (prog->check_utf8 == prog->float_utf8)
1069           ) : (
1070 54480         prog->check_substr /* Could be deleted already */
1071 111422         && --BmUSEFUL(prog->check_substr) < 0
1072 111462         && (prog->check_substr == prog->float_substr)
1073           )))
1074           {
1075           /* If flags & SOMETHING - do not do it many times on the same match */
1076 111462         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1077           /* XXX Does the destruction order has to change with utf8_target? */
1078 38218138         SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1079 98747393         SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1080 118156009         prog->check_substr = prog->check_utf8 = NULL; /* disable */
1081 26704655         prog->float_substr = prog->float_utf8 = NULL; /* clear */
1082           check = NULL; /* abort */
1083           s = strpos;
1084           /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1085           see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1086 26704655         if (prog->intflags & PREGf_IMPLICIT)
1087 26704655         prog->extflags &= ~RXf_ANCH_MBOL;
1088           /* XXXX This is a remnant of the old implementation. It
1089           looks wasteful, since now INTUIT can use many
1090           other heuristics. */
1091 26704655         prog->extflags &= ~RXf_USE_INTUIT;
1092           /* XXXX What other flags might need to be cleared in this branch? */
1093           }
1094           else
1095           s = strpos;
1096           }
1097            
1098           /* Last resort... */
1099           /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1100           /* trie stclasses are too expensive to use here, we are better off to
1101           leave it to regmatch itself */
1102 26704719         if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1103           /* minlen == 0 is possible if regstclass is \b or \B,
1104           and the fixed substr is ''$.
1105           Since minlen is already taken into account, s+1 is before strend;
1106           accidentally, minlen >= 1 guaranties no false positives at s + 1
1107           even for \b or \B. But (minlen? 1 : 0) below assumes that
1108           regstclass does not come from lookahead... */
1109           /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1110           This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1111 26704655         const U8* const str = (U8*)STRING(progi->regstclass);
1112           /* XXX this value could be pre-computed */
1113 15725172         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1114 1134185         ? (reginfo->is_utf8_pat
1115 87529661         ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1116 76         : STR_LEN(progi->regstclass))
1117           : 1);
1118           char * endpos;
1119 15198         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1120 5528         endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1121 146108         else if (prog->float_substr || prog->float_utf8)
1122 13040         endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1123           else
1124           endpos= strend;
1125          
1126 446420         if (checked_upto < s)
1127           checked_upto = s;
1128 72536         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1129           (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1130            
1131           t = s;
1132 72536         s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1133           reginfo);
1134 403254         if (s) {
1135           checked_upto = s;
1136           } else {
1137           #ifdef DEBUGGING
1138           const char *what = NULL;
1139           #endif
1140 403254         if (endpos == strend) {
1141 403254         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1142           "Could not match STCLASS...\n") );
1143           goto fail;
1144           }
1145 403254         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1146           "This position contradicts STCLASS...\n") );
1147 0         if ((prog->extflags & RXf_ANCH) && !ml_anch)
1148           goto fail;
1149 403254         checked_upto = HOPBACKc(endpos, start_shift);
1150 403254         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1151           (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1152           /* Contradict one of substrings */
1153 403254         if (prog->anchored_substr || prog->anchored_utf8) {
1154 119372         if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1155 2223869         DEBUG_EXECUTE_r( what = "anchored" );
1156           hop_and_restart:
1157 223040         s = HOP3c(t, 1, strend);
1158 223040         if (s + start_shift + end_shift > strend) {
1159           /* XXXX Should be taken into account earlier? */
1160 223040         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1161           "Could not match STCLASS...\n") );
1162           goto fail;
1163           }
1164 223040         if (!check)
1165           goto giveup;
1166 223040         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1167           "Looking for %s substr starting at offset %ld...\n",
1168           what, (long)(s + start_shift - i_strpos)) );
1169           goto restart;
1170           }
1171           /* Have both, check_string is floating */
1172 223040         if (t + start_shift >= check_at) /* Contradicts floating=check */
1173           goto retry_floating_check;
1174           /* Recheck anchored substring, but not floating... */
1175           s = check_at;
1176 223040         if (!check)
1177           goto giveup;
1178 223040         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1179           "Looking for anchored substr starting at offset %ld...\n",
1180           (long)(other_last - i_strpos)) );
1181           goto do_other_anchored;
1182           }
1183           /* Another way we could have checked stclass at the
1184           current position only: */
1185 111520         if (ml_anch) {
1186 365392         s = t = t + 1;
1187 311256         if (!check)
1188           goto giveup;
1189 311256         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1190           "Looking for /%s^%s/m starting at offset %ld...\n",
1191           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1192           goto try_at_offset;
1193           }
1194 218992         if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1195           goto fail;
1196           /* Check is floating substring. */
1197           retry_floating_check:
1198 142352         t = check_at - start_shift;
1199 32         DEBUG_EXECUTE_r( what = "floating" );
1200           goto hop_and_restart;
1201           }
1202 36         if (t != s) {
1203 32         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1204           "By STCLASS: moving %ld --> %ld\n",
1205           (long)(t - i_strpos), (long)(s - i_strpos))
1206           );
1207           }
1208           else {
1209 44         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1210           "Does not contradict STCLASS...\n");
1211           );
1212           }
1213           }
1214           giveup:
1215 625788         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1216           PL_colors[4], (check ? "Guessed" : "Giving up"),
1217           PL_colors[5], (long)(s - i_strpos)) );
1218           return s;
1219            
1220           fail_finish: /* Substring not found */
1221 91406         if (prog->check_substr || prog->check_utf8) /* could be removed already */
1222 573         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1223           fail:
1224 116         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1225           PL_colors[4], PL_colors[5]));
1226           return NULL;
1227           }
1228            
1229           #define DECL_TRIE_TYPE(scan) \
1230           const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1231           trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
1232           trie_type = ((scan->flags == EXACT) \
1233           ? (utf8_target ? trie_utf8 : trie_plain) \
1234           : (scan->flags == EXACTFA) \
1235           ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1236           : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1237            
1238           #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1239           STMT_START { \
1240           STRLEN skiplen; \
1241           U8 flags = FOLD_FLAGS_FULL; \
1242           switch (trie_type) { \
1243           case trie_utf8_exactfa_fold: \
1244           flags |= FOLD_FLAGS_NOMIX_ASCII; \
1245           /* FALL THROUGH */ \
1246           case trie_utf8_fold: \
1247           if ( foldlen>0 ) { \
1248           uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1249           foldlen -= len; \
1250           uscan += len; \
1251           len=0; \
1252           } else { \
1253           uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL); \
1254           len = UTF8SKIP(uc); \
1255           skiplen = UNISKIP( uvc ); \
1256           foldlen -= skiplen; \
1257           uscan = foldbuf + skiplen; \
1258           } \
1259           break; \
1260           case trie_latin_utf8_exactfa_fold: \
1261           flags |= FOLD_FLAGS_NOMIX_ASCII; \
1262           /* FALL THROUGH */ \
1263           case trie_latin_utf8_fold: \
1264           if ( foldlen>0 ) { \
1265           uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1266           foldlen -= len; \
1267           uscan += len; \
1268           len=0; \
1269           } else { \
1270           len = 1; \
1271           uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1272           skiplen = UNISKIP( uvc ); \
1273           foldlen -= skiplen; \
1274           uscan = foldbuf + skiplen; \
1275           } \
1276           break; \
1277           case trie_utf8: \
1278           uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1279           break; \
1280           case trie_plain: \
1281           uvc = (UV)*uc; \
1282           len = 1; \
1283           } \
1284           if (uvc < 256) { \
1285           charid = trie->charmap[ uvc ]; \
1286           } \
1287           else { \
1288           charid = 0; \
1289           if (widecharmap) { \
1290           SV** const svpp = hv_fetch(widecharmap, \
1291           (char*)&uvc, sizeof(UV), 0); \
1292           if (svpp) \
1293           charid = (U16)SvIV(*svpp); \
1294           } \
1295           } \
1296           } STMT_END
1297            
1298           #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1299           STMT_START { \
1300           while (s <= e) { \
1301           if ( (CoNd) \
1302           && (ln == 1 || folder(s, pat_string, ln)) \
1303           && (reginfo->intuit || regtry(reginfo, &s)) )\
1304           goto got_it; \
1305           s++; \
1306           } \
1307           } STMT_END
1308            
1309           #define REXEC_FBC_UTF8_SCAN(CoDe) \
1310           STMT_START { \
1311           while (s < strend) { \
1312           CoDe \
1313           s += UTF8SKIP(s); \
1314           } \
1315           } STMT_END
1316            
1317           #define REXEC_FBC_SCAN(CoDe) \
1318           STMT_START { \
1319           while (s < strend) { \
1320           CoDe \
1321           s++; \
1322           } \
1323           } STMT_END
1324            
1325           #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1326           REXEC_FBC_UTF8_SCAN( \
1327           if (CoNd) { \
1328           if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1329           goto got_it; \
1330           else \
1331           tmp = doevery; \
1332           } \
1333           else \
1334           tmp = 1; \
1335           )
1336            
1337           #define REXEC_FBC_CLASS_SCAN(CoNd) \
1338           REXEC_FBC_SCAN( \
1339           if (CoNd) { \
1340           if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1341           goto got_it; \
1342           else \
1343           tmp = doevery; \
1344           } \
1345           else \
1346           tmp = 1; \
1347           )
1348            
1349           #define REXEC_FBC_TRYIT \
1350           if ((reginfo->intuit || regtry(reginfo, &s))) \
1351           goto got_it
1352            
1353           #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1354           if (utf8_target) { \
1355           REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1356           } \
1357           else { \
1358           REXEC_FBC_CLASS_SCAN(CoNd); \
1359           }
1360          
1361           #define DUMP_EXEC_POS(li,s,doutf8) \
1362           dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1363           startpos, doutf8)
1364            
1365            
1366           #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1367           tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1368           tmp = TEST_NON_UTF8(tmp); \
1369           REXEC_FBC_UTF8_SCAN( \
1370           if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1371           tmp = !tmp; \
1372           IF_SUCCESS; \
1373           } \
1374           else { \
1375           IF_FAIL; \
1376           } \
1377           ); \
1378            
1379           #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1380           if (s == reginfo->strbeg) { \
1381           tmp = '\n'; \
1382           } \
1383           else { \
1384           U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1385           tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1386           } \
1387           tmp = TeSt1_UtF8; \
1388           LOAD_UTF8_CHARCLASS_ALNUM(); \
1389           REXEC_FBC_UTF8_SCAN( \
1390           if (tmp == ! (TeSt2_UtF8)) { \
1391           tmp = !tmp; \
1392           IF_SUCCESS; \
1393           } \
1394           else { \
1395           IF_FAIL; \
1396           } \
1397           ); \
1398            
1399           /* The only difference between the BOUND and NBOUND cases is that
1400           * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1401           * NBOUND. This is accomplished by passing it in either the if or else clause,
1402           * with the other one being empty */
1403           #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1404           FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1405            
1406           #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1407           FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1408            
1409           #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1410           FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1411            
1412           #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1413           FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1414            
1415            
1416           /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1417           * be passed in completely with the variable name being tested, which isn't
1418           * such a clean interface, but this is easier to read than it was before. We
1419           * are looking for the boundary (or non-boundary between a word and non-word
1420           * character. The utf8 and non-utf8 cases have the same logic, but the details
1421           * must be different. Find the "wordness" of the character just prior to this
1422           * one, and compare it with the wordness of this one. If they differ, we have
1423           * a boundary. At the beginning of the string, pretend that the previous
1424           * character was a new-line */
1425           #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1426           if (utf8_target) { \
1427           UTF8_CODE \
1428           } \
1429           else { /* Not utf8 */ \
1430           tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1431           tmp = TEST_NON_UTF8(tmp); \
1432           REXEC_FBC_SCAN( \
1433           if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1434           tmp = !tmp; \
1435           IF_SUCCESS; \
1436           } \
1437           else { \
1438           IF_FAIL; \
1439           } \
1440           ); \
1441           } \
1442           if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
1443           goto got_it;
1444            
1445           /* We know what class REx starts with. Try to find this position... */
1446           /* if reginfo->intuit, its a dryrun */
1447           /* annoyingly all the vars in this routine have different names from their counterparts
1448           in regmatch. /grrr */
1449            
1450           STATIC char *
1451 68         S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1452           const char *strend, regmatch_info *reginfo)
1453           {
1454           dVAR;
1455 44         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1456           char *pat_string; /* The pattern's exactish string */
1457           char *pat_end; /* ptr to end char of pat_string */
1458           re_fold_t folder; /* Function for computing non-utf8 folds */
1459           const U8 *fold_array; /* array for folding ords < 256 */
1460           STRLEN ln;
1461           STRLEN lnc;
1462           U8 c1;
1463           U8 c2;
1464           char *e;
1465           I32 tmp = 1; /* Scratch variable? */
1466 49         const bool utf8_target = reginfo->is_utf8_target;
1467           UV utf8_fold_flags = 0;
1468 880         const bool is_utf8_pat = reginfo->is_utf8_pat;
1469           bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1470           with a result inverts that result, as 0^1 =
1471           1 and 1^1 = 0 */
1472           _char_class_number classnum;
1473            
1474 3264183         RXi_GET_DECL(prog,progi);
1475            
1476 9215480         PERL_ARGS_ASSERT_FIND_BYCLASS;
1477            
1478           /* We know what class it must start with. */
1479 145182         switch (OP(c)) {
1480           case ANYOF:
1481           case ANYOF_SYNTHETIC:
1482           case ANYOF_WARN_SUPER:
1483 3512         if (utf8_target) {
1484 83952175         REXEC_FBC_UTF8_CLASS_SCAN(
1485           reginclass(prog, c, (U8*)s, utf8_target));
1486           }
1487           else {
1488 192180         REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1489           }
1490           break;
1491           case CANY:
1492 92212         REXEC_FBC_SCAN(
1493           if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1494           goto got_it;
1495           else
1496           tmp = doevery;
1497           );
1498           break;
1499            
1500           case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1501 439544         assert(! is_utf8_pat);
1502           /* FALL THROUGH */
1503           case EXACTFA:
1504 439544         if (is_utf8_pat || utf8_target) {
1505           utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1506           goto do_exactf_utf8;
1507           }
1508           fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1509           folder = foldEQ_latin1; /* /a, except the sharp s one which */
1510           goto do_exactf_non_utf8; /* isn't dealt with by these */
1511            
1512           case EXACTF: /* This node only generated for non-utf8 patterns */
1513 375864         assert(! is_utf8_pat);
1514 354020         if (utf8_target) {
1515           utf8_fold_flags = 0;
1516           goto do_exactf_utf8;
1517           }
1518           fold_array = PL_fold;
1519           folder = foldEQ;
1520           goto do_exactf_non_utf8;
1521            
1522           case EXACTFL:
1523 149322         if (is_utf8_pat || utf8_target) {
1524           utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1525           goto do_exactf_utf8;
1526           }
1527           fold_array = PL_fold_locale;
1528           folder = foldEQ_locale;
1529           goto do_exactf_non_utf8;
1530            
1531           case EXACTFU_SS:
1532 128470         if (is_utf8_pat) {
1533           utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1534           }
1535           goto do_exactf_utf8;
1536            
1537           case EXACTFU:
1538 141610         if (is_utf8_pat || utf8_target) {
1539 20850         utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1540 14284         goto do_exactf_utf8;
1541           }
1542            
1543           /* Any 'ss' in the pattern should have been replaced by regcomp,
1544           * so we don't have to worry here about this single special case
1545           * in the Latin1 range */
1546           fold_array = PL_fold_latin1;
1547           folder = foldEQ_latin1;
1548            
1549           /* FALL THROUGH */
1550            
1551           do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1552           are no glitches with fold-length differences
1553           between the target string and pattern */
1554            
1555           /* The idea in the non-utf8 EXACTF* cases is to first find the
1556           * first character of the EXACTF* node and then, if necessary,
1557           * case-insensitively compare the full text of the node. c1 is the
1558           * first character. c2 is its fold. This logic will not work for
1559           * Unicode semantics and the german sharp ss, which hence should
1560           * not be compiled into a node that gets here. */
1561 136874         pat_string = STRING(c);
1562 200554         ln = STR_LEN(c); /* length to match in octets/bytes */
1563            
1564           /* We know that we have to match at least 'ln' bytes (which is the
1565           * same as characters, since not utf8). If we have to match 3
1566           * characters, and there are only 2 availabe, we know without
1567           * trying that it will fail; so don't start a match past the
1568           * required minimum number from the far end */
1569 169585         e = HOP3c(strend, -((SSize_t)ln), s);
1570            
1571 10745         if (reginfo->intuit && e < s) {
1572 4554         e = s; /* Due to minlen logic of intuit() */
1573           }
1574            
1575 6470         c1 = *pat_string;
1576 145308         c2 = fold_array[c1];
1577 0         if (c1 == c2) { /* If char and fold are the same */
1578 204698         REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1579           }
1580           else {
1581 822         REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1582           }
1583           break;
1584            
1585           do_exactf_utf8:
1586           {
1587           unsigned expansion;
1588            
1589           /* If one of the operands is in utf8, we can't use the simpler folding
1590           * above, due to the fact that many different characters can have the
1591           * same fold, or portion of a fold, or different- length fold */
1592 102760         pat_string = STRING(c);
1593 822         ln = STR_LEN(c); /* length to match in octets/bytes */
1594 6322109         pat_end = pat_string + ln;
1595           lnc = is_utf8_pat /* length to match in characters */
1596           ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1597 125358         : ln;
1598            
1599           /* We have 'lnc' characters to match in the pattern, but because of
1600           * multi-character folding, each character in the target can match
1601           * up to 3 characters (Unicode guarantees it will never exceed
1602           * this) if it is utf8-encoded; and up to 2 if not (based on the
1603           * fact that the Latin 1 folds are already determined, and the
1604           * only multi-char fold in that range is the sharp-s folding to
1605           * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1606           * string character. Adjust lnc accordingly, rounding up, so that
1607           * if we need to match at least 4+1/3 chars, that really is 5. */
1608 125358         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1609 125358         lnc = (lnc + expansion - 1) / expansion;
1610            
1611           /* As in the non-UTF8 case, if we have to match 3 characters, and
1612           * only 2 are left, it's guaranteed to fail, so don't start a
1613           * match that would require us to go beyond the end of the string
1614           */
1615 125358         e = HOP3c(strend, -((SSize_t)lnc), s);
1616            
1617 125358         if (reginfo->intuit && e < s) {
1618 125358         e = s; /* Due to minlen logic of intuit() */
1619           }
1620            
1621           /* XXX Note that we could recalculate e to stop the loop earlier,
1622           * as the worst case expansion above will rarely be met, and as we
1623           * go along we would usually find that e moves further to the left.
1624           * This would happen only after we reached the point in the loop
1625           * where if there were no expansion we should fail. Unclear if
1626           * worth the expense */
1627            
1628 125358         while (s <= e) {
1629 125358         char *my_strend= (char *)strend;
1630 125358         if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1631           pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1632 125358         && (reginfo->intuit || regtry(reginfo, &s)) )
1633           {
1634           goto got_it;
1635           }
1636 125358         s += (utf8_target) ? UTF8SKIP(s) : 1;
1637           }
1638           break;
1639           }
1640           case BOUNDL:
1641 125358         RXp_MATCH_TAINTED_on(prog);
1642 125358         FBC_BOUND(isWORDCHAR_LC,
1643           isWORDCHAR_LC_uvchr(tmp),
1644           isWORDCHAR_LC_utf8((U8*)s));
1645           break;
1646           case NBOUNDL:
1647 125358         RXp_MATCH_TAINTED_on(prog);
1648 123128         FBC_NBOUND(isWORDCHAR_LC,
1649           isWORDCHAR_LC_uvchr(tmp),
1650           isWORDCHAR_LC_utf8((U8*)s));
1651           break;
1652           case BOUND:
1653 116676         FBC_BOUND(isWORDCHAR,
1654           isWORDCHAR_uni(tmp),
1655           cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1656           break;
1657           case BOUNDA:
1658 116349         FBC_BOUND_NOLOAD(isWORDCHAR_A,
1659           isWORDCHAR_A(tmp),
1660           isWORDCHAR_A((U8*)s));
1661           break;
1662           case NBOUND:
1663 9336         FBC_NBOUND(isWORDCHAR,
1664           isWORDCHAR_uni(tmp),
1665           cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1666           break;
1667           case NBOUNDA:
1668 126450         FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1669           isWORDCHAR_A(tmp),
1670           isWORDCHAR_A((U8*)s));
1671           break;
1672           case BOUNDU:
1673 126350         FBC_BOUND(isWORDCHAR_L1,
1674           isWORDCHAR_uni(tmp),
1675           cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1676           break;
1677           case NBOUNDU:
1678 126350         FBC_NBOUND(isWORDCHAR_L1,
1679           isWORDCHAR_uni(tmp),
1680           cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1681           break;
1682           case LNBREAK:
1683 126350         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1684           is_LNBREAK_latin1_safe(s, strend)
1685           );
1686           break;
1687            
1688           /* The argument to all the POSIX node types is the class number to pass to
1689           * _generic_isCC() to build a mask for searching in PL_charclass[] */
1690            
1691           case NPOSIXL:
1692           to_complement = 1;
1693           /* FALLTHROUGH */
1694            
1695           case POSIXL:
1696 126350         RXp_MATCH_TAINTED_on(prog);
1697 126350         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1698           to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1699           break;
1700            
1701           case NPOSIXD:
1702           to_complement = 1;
1703           /* FALLTHROUGH */
1704            
1705           case POSIXD:
1706 402979         if (utf8_target) {
1707           goto posix_utf8;
1708           }
1709           goto posixa;
1710            
1711           case NPOSIXA:
1712 339804         if (utf8_target) {
1713           /* The complement of something that matches only ASCII matches all
1714           * UTF-8 variant code points, plus everything in ASCII that isn't
1715           * in the class */
1716 339804         REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1717           || ! _generic_isCC_A(*s, FLAGS(c)));
1718           break;
1719           }
1720            
1721           to_complement = 1;
1722           /* FALLTHROUGH */
1723            
1724           case POSIXA:
1725           posixa:
1726           /* Don't need to worry about utf8, as it can match only a single
1727           * byte invariant character. */
1728 171840         REXEC_FBC_CLASS_SCAN(
1729           to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1730           break;
1731            
1732           case NPOSIXU:
1733           to_complement = 1;
1734           /* FALLTHROUGH */
1735            
1736           case POSIXU:
1737 147204         if (! utf8_target) {
1738 2364         REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1739           FLAGS(c))));
1740           }
1741           else {
1742            
1743           posix_utf8:
1744 1548         classnum = (_char_class_number) FLAGS(c);
1745 4893310         if (classnum < _FIRST_NON_SWASH_CC) {
1746 4746922         while (s < strend) {
1747            
1748           /* We avoid loading in the swash as long as possible, but
1749           * should we have to, we jump to a separate loop. This
1750           * extra 'if' statement is what keeps this code from being
1751           * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1752 147204         if (UTF8_IS_ABOVE_LATIN1(*s)) {
1753           goto found_above_latin1;
1754           }
1755 171840         if ((UTF8_IS_INVARIANT(*s)
1756 258628         && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1757           classnum)))
1758 45630         || (UTF8_IS_DOWNGRADEABLE_START(*s)
1759 45630         && to_complement ^ cBOOL(
1760           _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1761           *(s + 1)),
1762           classnum))))
1763           {
1764 45630         if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1765           goto got_it;
1766           else {
1767           tmp = doevery;
1768           }
1769           }
1770           else {
1771           tmp = 1;
1772           }
1773 258628         s += UTF8SKIP(s);
1774           }
1775           }
1776 258628         else switch (classnum) { /* These classes are implemented as
1777           macros */
1778           case _CC_ENUM_SPACE: /* XXX would require separate code if we
1779           revert the change of \v matching this */
1780           /* FALL THROUGH */
1781            
1782           case _CC_ENUM_PSXSPC:
1783 253708         REXEC_FBC_UTF8_CLASS_SCAN(
1784           to_complement ^ cBOOL(isSPACE_utf8(s)));
1785           break;
1786            
1787           case _CC_ENUM_BLANK:
1788 131774         REXEC_FBC_UTF8_CLASS_SCAN(
1789           to_complement ^ cBOOL(isBLANK_utf8(s)));
1790           break;
1791            
1792           case _CC_ENUM_XDIGIT:
1793 289274         REXEC_FBC_UTF8_CLASS_SCAN(
1794           to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1795           break;
1796            
1797           case _CC_ENUM_VERTSPACE:
1798 289274         REXEC_FBC_UTF8_CLASS_SCAN(
1799           to_complement ^ cBOOL(isVERTWS_utf8(s)));
1800           break;
1801            
1802           case _CC_ENUM_CNTRL:
1803 334902         REXEC_FBC_UTF8_CLASS_SCAN(
1804           to_complement ^ cBOOL(isCNTRL_utf8(s)));
1805           break;
1806            
1807           default:
1808 361480         Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1809           assert(0); /* NOTREACHED */
1810           }
1811           }
1812           break;
1813            
1814           found_above_latin1: /* Here we have to load a swash to get the result
1815           for the current code point */
1816 180740         if (! PL_utf8_swash_ptrs[classnum]) {
1817 180736         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1818 180654         PL_utf8_swash_ptrs[classnum] =
1819 172262         _core_swash_init("utf8", swash_property_names[classnum],
1820           &PL_sv_undef, 1, 0, NULL, &flags);
1821           }
1822            
1823           /* This is a copy of the loop above for swash classes, though using the
1824           * FBC macro instead of being expanded out. Since we've loaded the
1825           * swash, we don't have to check for that each time through the loop */
1826 76568         REXEC_FBC_UTF8_CLASS_SCAN(
1827           to_complement ^ cBOOL(_generic_utf8(
1828           classnum,
1829           s,
1830           swash_fetch(PL_utf8_swash_ptrs[classnum],
1831           (U8 *) s, TRUE))));
1832           break;
1833            
1834           case AHOCORASICKC:
1835           case AHOCORASICK:
1836           {
1837 76572         DECL_TRIE_TYPE(c);
1838           /* what trie are we using right now */
1839 258632         reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1840 258632         reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1841 91172         HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1842            
1843 129728         const char *last_start = strend - trie->minlen;
1844           #ifdef DEBUGGING
1845 126354         const char *real_start = s;
1846           #endif
1847 44746         STRLEN maxlen = trie->maxlen;
1848           SV *sv_points;
1849           U8 **points; /* map of where we were in the input string
1850           when reading a given char. For ASCII this
1851           is unnecessary overhead as the relationship
1852           is always 1:1, but for Unicode, especially
1853           case folded Unicode this is not true. */
1854           U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1855           U8 *bitmap=NULL;
1856            
1857            
1858 44746         GET_RE_DEBUG_FLAGS_DECL;
1859            
1860           /* We can't just allocate points here. We need to wrap it in
1861           * an SV so it gets freed properly if there is a croak while
1862           * running the match */
1863 126354         ENTER;
1864 45178         SAVETMPS;
1865 45178         sv_points=newSV(maxlen * sizeof(U8 *));
1866 44086         SvCUR_set(sv_points,
1867           maxlen * sizeof(U8 *));
1868 44086         SvPOK_on(sv_points);
1869 44086         sv_2mortal(sv_points);
1870 1096         points=(U8**)SvPV_nolen(sv_points );
1871 81280         if ( trie_type != trie_utf8_fold
1872 81280         && (trie->bitmap || OP(c)==AHOCORASICKC) )
1873           {
1874 81278         if (trie->bitmap)
1875 2         bitmap=(U8*)trie->bitmap;
1876           else
1877 19919356         bitmap=(U8*)ANYOF_BITMAP(c);
1878           }
1879           /* this is the Aho-Corasick algorithm modified a touch
1880           to include special handling for long "unknown char" sequences.
1881           The basic idea being that we use AC as long as we are dealing
1882           with a possible matching char, when we encounter an unknown char
1883           (and we have not encountered an accepting state) we scan forward
1884           until we find a legal starting char.
1885           AC matching is basically that of trie matching, except that when
1886           we encounter a failing transition, we fall back to the current
1887           states "fail state", and try the current char again, a process
1888           we repeat until we reach the root state, state 1, or a legal
1889           transition. If we fail on the root state then we can either
1890           terminate if we have reached an accepting state previously, or
1891           restart the entire process from the beginning if we have not.
1892            
1893           */
1894 98669747         while (s <= last_start) {
1895 98669747         const U32 uniflags = UTF8_ALLOW_DEFAULT;
1896 96472125         U8 *uc = (U8*)s;
1897           U16 charid = 0;
1898           U32 base = 1;
1899           U32 state = 1;
1900 96104099         UV uvc = 0;
1901 90639355         STRLEN len = 0;
1902 90639355         STRLEN foldlen = 0;
1903           U8 *uscan = (U8*)NULL;
1904           U8 *leftmost = NULL;
1905           #ifdef DEBUGGING
1906           U32 accepted_word= 0;
1907           #endif
1908           U32 pointpos = 0;
1909            
1910 90639371         while ( state && uc <= (U8*)strend ) {
1911           int failed=0;
1912 74096421         U32 word = aho->states[ state ].wordnum;
1913            
1914 35475987         if( state==1 ) {
1915 56         if ( bitmap ) {
1916 54         DEBUG_TRIE_EXECUTE_r(
1917           if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1918           dump_exec_pos( (char *)uc, c, strend, real_start,
1919           (char *)uc, utf8_target );
1920           PerlIO_printf( Perl_debug_log,
1921           " Scanning for legal start char...\n");
1922           }
1923           );
1924 145899439         if (utf8_target) {
1925 60628124         while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1926 96104095         uc += UTF8SKIP(uc);
1927           }
1928           } else {
1929 96104137         while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1930 96104133         uc++;
1931           }
1932           }
1933 96104099         s= (char *)uc;
1934           }
1935 368032         if (uc >(U8*)last_start) break;
1936           }
1937            
1938 368040         if ( word ) {
1939 301308         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1940 22254         if (!leftmost || lpos < leftmost) {
1941           DEBUG_r(accepted_word=word);
1942           leftmost= lpos;
1943           }
1944 22254         if (base==0) break;
1945            
1946           }
1947 66734         points[pointpos++ % maxlen]= uc;
1948 368040         if (foldlen || uc < (U8*)strend) {
1949 368038         REXEC_TRIE_READ_CHAR(trie_type, trie,
1950           widecharmap, uc,
1951           uscan, len, uvc, charid, foldlen,
1952           foldbuf, uniflags);
1953 368038         DEBUG_TRIE_EXECUTE_r({
1954           dump_exec_pos( (char *)uc, c, strend,
1955           real_start, s, utf8_target);
1956           PerlIO_printf(Perl_debug_log,
1957           " Charid:%3u CP:%4"UVxf" ",
1958           charid, uvc);
1959           });
1960           }
1961           else {
1962 368028         len = 0;
1963           charid = 0;
1964           }
1965            
1966            
1967           do {
1968           #ifdef DEBUGGING
1969 368042         word = aho->states[ state ].wordnum;
1970           #endif
1971 96472137         base = aho->states[ state ].trans.base;
1972            
1973 96472137         DEBUG_TRIE_EXECUTE_r({
1974           if (failed)
1975           dump_exec_pos( (char *)uc, c, strend, real_start,
1976           s, utf8_target );
1977           PerlIO_printf( Perl_debug_log,
1978           "%sState: %4"UVxf", word=%"UVxf,
1979           failed ? " Fail transition to " : "",
1980           (UV)state, (UV)word);
1981           });
1982 16         if ( base ) {
1983           U32 tmp;
1984           I32 offset;
1985 24         if (charid &&
1986 20         ( ((offset = base + charid
1987 3296443         - 1 - trie->uniquecharcount)) >= 0)
1988 2197632         && ((U32)offset < trie->lasttrans)
1989 2197632         && trie->trans[offset].check == state
1990 2197632         && (tmp=trie->trans[offset].next))
1991           {
1992 2197632         DEBUG_TRIE_EXECUTE_r(
1993           PerlIO_printf( Perl_debug_log," - legal\n"));
1994           state = tmp;
1995 98669753         break;
1996           }
1997           else {
1998 263904222         DEBUG_TRIE_EXECUTE_r(
1999           PerlIO_printf( Perl_debug_log," - fail\n"));
2000           failed = 1;
2001 263904222         state = aho->fail[state];
2002           }
2003           }
2004           else {
2005           /* we must be accepting here */
2006 263904220         DEBUG_TRIE_EXECUTE_r(
2007           PerlIO_printf( Perl_debug_log," - accepting\n"));
2008           failed = 1;
2009           break;
2010           }
2011 263904222         } while(state);
2012 14         uc += len;
2013 263904232         if (failed) {
2014 3606940         if (leftmost)
2015           break;
2016 3606938         if (!state) state = 1;
2017           }
2018           }
2019 7170110         if ( aho->states[ state ].wordnum ) {
2020 2026462         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2021 6357987         if (!leftmost || lpos < leftmost) {
2022 3606936         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2023           leftmost = lpos;
2024           }
2025           }
2026 3155006         if (leftmost) {
2027 3155004         s = (char*)leftmost;
2028 3155004         DEBUG_TRIE_EXECUTE_r({
2029           PerlIO_printf(
2030           Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2031           (UV)accepted_word, (IV)(s - real_start)
2032           );
2033           });
2034 451936         if (reginfo->intuit || regtry(reginfo, &s)) {
2035 62         FREETMPS;
2036 30         LEAVE;
2037 451876         goto got_it;
2038           }
2039 263904190         s = HOPc(s,1);
2040 263904190         DEBUG_TRIE_EXECUTE_r({
2041           PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2042           });
2043           } else {
2044 252038470         DEBUG_TRIE_EXECUTE_r(
2045           PerlIO_printf( Perl_debug_log,"No match.\n"));
2046           break;
2047           }
2048           }
2049 252038470         FREETMPS;
2050 252038470         LEAVE;
2051           }
2052 136791179         break;
2053           default:
2054 136791177         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2055           break;
2056           }
2057           return 0;
2058           got_it:
2059 136791179         return s;
2060           }
2061            
2062           /* set RX_SAVED_COPY, RX_SUBBEG etc.
2063           * flags have same meanings as with regexec_flags() */
2064            
2065           static void
2066 37852370         S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2067           char *strbeg,
2068           char *strend,
2069           SV *sv,
2070           U32 flags,
2071           bool utf8_target)
2072           {
2073 2754166         struct regexp *const prog = ReANY(rx);
2074            
2075 1836252         if (flags & REXEC_COPY_STR) {
2076           #ifdef PERL_ANY_COW
2077 1836252         if (SvCANCOW(sv)) {
2078 2754166         if (DEBUG_C_TEST) {
2079 1836188         PerlIO_printf(Perl_debug_log,
2080           "Copy on write: regexp capture, type %d\n",
2081 3672376         (int) SvTYPE(sv));
2082           }
2083           /* Create a new COW SV to share the match string and store
2084           * in saved_copy, unless the current COW SV in saved_copy
2085           * is valid and suitable for our purpose */
2086 55148         if (( prog->saved_copy
2087 1891276         && SvIsCOW(prog->saved_copy)
2088 1836192         && SvPOKp(prog->saved_copy)
2089           && SvIsCOW(sv)
2090 1670720         && SvPOKp(sv)
2091 151263413         && SvPVX(sv) == SvPVX(prog->saved_copy)))
2092           {
2093           /* just reuse saved_copy SV */
2094 151263413         if (RXp_MATCH_COPIED(prog)) {
2095 151263341         Safefree(prog->subbeg);
2096 0         RXp_MATCH_COPIED_off(prog);
2097           }
2098           }
2099           else {
2100           /* create new COW SV to share string */
2101 151263461         RX_MATCH_COPY_FREE(rx);
2102 151263401         prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2103           }
2104 151263405         prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2105 151263405         assert (SvPOKp(prog->saved_copy));
2106 151263405         prog->sublen = strend - strbeg;
2107 151263405         prog->suboffset = 0;
2108 151263405         prog->subcoffset = 0;
2109           } else
2110           #endif
2111           {
2112           SSize_t min = 0;
2113 151263341         SSize_t max = strend - strbeg;
2114           SSize_t sublen;
2115            
2116           if ( (flags & REXEC_COPY_SKIP_POST)
2117           && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2118           && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2119           ) { /* don't copy $' part of string */
2120           U32 n = 0;
2121           max = -1;
2122           /* calculate the right-most part of the string covered
2123           * by a capture. Due to look-ahead, this may be to
2124           * the right of $&, so we have to scan all captures */
2125           while (n <= prog->lastparen) {
2126           if (prog->offs[n].end > max)
2127           max = prog->offs[n].end;
2128           n++;
2129           }
2130           if (max == -1)
2131           max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2132           ? prog->offs[0].start
2133           : 0;
2134           assert(max >= 0 && max <= strend - strbeg);
2135           }
2136            
2137           if ( (flags & REXEC_COPY_SKIP_PRE)
2138           && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2139           && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2140           ) { /* don't copy $` part of string */
2141           U32 n = 0;
2142           min = max;
2143           /* calculate the left-most part of the string covered
2144           * by a capture. Due to look-behind, this may be to
2145           * the left of $&, so we have to scan all captures */
2146           while (min && n <= prog->lastparen) {
2147           if ( prog->offs[n].start != -1
2148           && prog->offs[n].start < min)
2149           {
2150           min = prog->offs[n].start;
2151           }
2152           n++;
2153           }
2154           if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2155           && min > prog->offs[0].end
2156           )
2157           min = prog->offs[0].end;
2158            
2159           }
2160            
2161 151263341         assert(min >= 0 && min <= max && min <= strend - strbeg);
2162           sublen = max - min;
2163            
2164 151263341         if (RX_MATCH_COPIED(rx)) {
2165 151263341         if (sublen > prog->sublen)
2166 151263341         prog->subbeg =
2167 14         (char*)saferealloc(prog->subbeg, sublen+1);
2168           }
2169           else
2170 14         prog->subbeg = (char*)safemalloc(sublen+1);
2171 14         Copy(strbeg + min, prog->subbeg, sublen, char);
2172 151263341         prog->subbeg[sublen] = '\0';
2173 151263341         prog->suboffset = min;
2174 19956         prog->sublen = sublen;
2175 19956         RX_MATCH_COPIED_on(rx);
2176           }
2177 20020         prog->subcoffset = prog->suboffset;
2178 20020         if (prog->suboffset && utf8_target) {
2179           /* Convert byte offset to chars.
2180           * XXX ideally should only compute this if @-/@+
2181           * has been seen, a la PL_sawampersand ??? */
2182            
2183           /* If there's a direct correspondence between the
2184           * string which we're matching and the original SV,
2185           * then we can use the utf8 len cache associated with
2186           * the SV. In particular, it means that under //g,
2187           * sv_pos_b2u() will use the previously cached
2188           * position to speed up working out the new length of
2189           * subcoffset, rather than counting from the start of
2190           * the string each time. This stops
2191           * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2192           * from going quadratic */
2193 151263341         if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2194 151263341         prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2195           SV_GMAGIC|SV_CONST_RETURN);
2196           else
2197 453901797         prog->subcoffset = utf8_length((U8*)strbeg,
2198           (U8*)(strbeg+prog->suboffset));
2199           }
2200           }
2201           else {
2202 302638456         RX_MATCH_COPY_FREE(rx);
2203 151263341         prog->subbeg = strbeg;
2204 151375115         prog->suboffset = 0;
2205 111774         prog->subcoffset = 0;
2206 223548         prog->sublen = strend - strbeg;
2207           }
2208 111838         }
2209            
2210            
2211            
2212            
2213           /*
2214           - regexec_flags - match a regexp against a string
2215           */
2216           I32
2217 302638550         Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2218           char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2219           /* stringarg: the point in the string at which to begin matching */
2220           /* strend: pointer to null at end of string */
2221           /* strbeg: real beginning of string */
2222           /* minend: end of match must be >= minend bytes after stringarg. */
2223           /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2224           * itself is accessed via the pointers above */
2225           /* data: May be used for some additional optimizations.
2226           Currently unused. */
2227           /* flags: For optimizations. See REXEC_* in regexp.h */
2228            
2229           {
2230           dVAR;
2231 94         struct regexp *const prog = ReANY(rx);
2232           char *s;
2233           regnode *c;
2234           char *startpos;
2235           SSize_t minlen; /* must match at least this many chars */
2236           SSize_t dontbother = 0; /* how many characters not to try at end */
2237 151263435         const bool utf8_target = cBOOL(DO_UTF8(sv));
2238           I32 multiline;
2239 151263435         RXi_GET_DECL(prog,progi);
2240           regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2241           regmatch_info *const reginfo = ®info_buf;
2242           regexp_paren_pair *swap = NULL;
2243           I32 oldsave;
2244 151263435         GET_RE_DEBUG_FLAGS_DECL;
2245            
2246 151263435         PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2247           PERL_UNUSED_ARG(data);
2248            
2249           /* Be paranoid... */
2250 151263435         if (prog == NULL || stringarg == NULL) {
2251 111774         Perl_croak(aTHX_ "NULL regexp parameter");
2252           return 0;
2253           }
2254            
2255 151151661         DEBUG_EXECUTE_r(
2256           debug_start_match(rx, utf8_target, stringarg, strend,
2257           "Matching");
2258           );
2259            
2260           startpos = stringarg;
2261            
2262 151263435         if (prog->extflags & RXf_GPOS_SEEN) {
2263           MAGIC *mg;
2264            
2265           /* set reginfo->ganch, the position where \G can match */
2266            
2267 39700526         reginfo->ganch =
2268 59537470         (flags & REXEC_IGNOREPOS)
2269           ? stringarg /* use start pos rather than pos() */
2270 151263341         : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2271           /* Defined pos(): */
2272 92429392         ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2273 24225610         : strbeg; /* pos() not defined; use start of string */
2274            
2275 23306660         DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2276           "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg));
2277            
2278           /* in the presence of \G, we may need to start looking earlier in
2279           * the string than the suggested start point of stringarg:
2280           * if gofs->prog is set, then that's a known, fixed minimum
2281           * offset, such as
2282           * /..\G/: gofs = 2
2283           * /ab|c\G/: gofs = 1
2284           * or if the minimum offset isn't known, then we have to go back
2285           * to the start of the string, e.g. /w+\G/
2286           */
2287            
2288 919224         if (prog->extflags & RXf_ANCH_GPOS) {
2289 511034         startpos = reginfo->ganch - prog->gofs;
2290 919224         if (startpos <
2291 919224         ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2292           {
2293 486108         DEBUG_r(PerlIO_printf(Perl_debug_log,
2294           "fail: ganch-gofs before earliest possible start\n"));
2295 3882         return 0;
2296           }
2297           }
2298 4296         else if (prog->gofs) {
2299 596         if (startpos - prog->gofs < strbeg)
2300           startpos = strbeg;
2301           else
2302 596         startpos -= prog->gofs;
2303           }
2304 596         else if (prog->extflags & RXf_GPOS_FLOAT)
2305           startpos = strbeg;
2306           }
2307            
2308 690         minlen = prog->minlen;
2309 94         if ((startpos + minlen) > strend || startpos < strbeg) {
2310 482226         DEBUG_r(PerlIO_printf(Perl_debug_log,
2311           "Regex match can't succeed, so not even tried\n"));
2312 837760         return 0;
2313           }
2314            
2315           /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2316           * which will call destuctors to reset PL_regmatch_state, free higher
2317           * PL_regmatch_slabs, and clean up regmatch_info_aux and
2318           * regmatch_info_aux_eval */
2319            
2320 375688         oldsave = PL_savestack_ix;
2321            
2322 375660         s = startpos;
2323            
2324 375660         if ((prog->extflags & RXf_USE_INTUIT)
2325 375658         && !(flags & REXEC_CHECKED))
2326           {
2327 92         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2328           flags, NULL);
2329 433208         if (!s)
2330           return 0;
2331            
2332 206         if (prog->extflags & RXf_CHECK_ALL) {
2333           /* we can match based purely on the result of INTUIT.
2334           * Set up captures etc just for $& and $-[0]
2335           * (an intuit-only match wont have $1,$2,..) */
2336 16010172         assert(!prog->nparens);
2337            
2338           /* s/// doesn't like it if $& is earlier than where we asked it to
2339           * start searching (which can happen on something like /.\G/) */
2340 15856296         if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2341 790218         && (s < stringarg))
2342           {
2343           /* this should only be possible under \G */
2344 58833949         assert(prog->extflags & RXf_GPOS_SEEN);
2345 2031764         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2346           "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2347           goto phooey;
2348           }
2349            
2350           /* match via INTUIT shouldn't have any captures.
2351           * Let @-, @+, $^N know */
2352 56802207         prog->lastparen = prog->lastcloseparen = 0;
2353 106286         RX_MATCH_UTF8_set(rx, utf8_target);
2354 382         prog->offs[0].start = s - strbeg;
2355 44         prog->offs[0].end = utf8_target
2356 360         ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2357 2984         : s - strbeg + prog->minlenret;
2358 105904         if ( !(flags & REXEC_NOT_FIRST) )
2359 22         S_reg_set_capture_string(aTHX_ rx,
2360           strbeg, strend,
2361           sv, flags, utf8_target);
2362            
2363           return 1;
2364           }
2365           }
2366            
2367 105926         multiline = prog->extflags & RXf_PMf_MULTILINE;
2368          
2369 804182         if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2370 56695943         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2371           "String too short [regexec_flags]...\n"));
2372           goto phooey;
2373           }
2374          
2375           /* Check validity of program. */
2376 48208691         if (UCHARAT(progi->program) != REG_MAGIC) {
2377 48095871         Perl_croak(aTHX_ "corrupted regexp program");
2378           }
2379            
2380 4550440         RX_MATCH_TAINTED_off(rx);
2381            
2382 8746328         reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2383 8600116         reginfo->intuit = 0;
2384 226380         reginfo->is_utf8_target = cBOOL(utf8_target);
2385 7842         reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2386 226380         reginfo->warned = FALSE;
2387 8373780         reginfo->strbeg = strbeg;
2388 44         reginfo->sv = sv;
2389 8373780         reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2390 8600116         reginfo->strend = strend;
2391           /* see how far we have to get to not match where we matched before */
2392 146256         reginfo->till = stringarg + minend;
2393            
2394 18832         if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2395           /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2396           S_cleanup_regmatch_info_aux has executed (registered by
2397           SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2398           magic belonging to this SV.
2399           Not newSVsv, either, as it does not COW.
2400           */
2401 0         reginfo->sv = newSV(0);
2402 18788         sv_setsv(reginfo->sv, sv);
2403 127424         SAVEFREESV(reginfo->sv);
2404           }
2405            
2406           /* reserve next 2 or 3 slots in PL_regmatch_state:
2407           * slot N+0: may currently be in use: skip it
2408           * slot N+1: use for regmatch_info_aux struct
2409           * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2410           * slot N+3: ready for use by regmatch()
2411           */
2412            
2413           {
2414           regmatch_state *old_regmatch_state;
2415           regmatch_slab *old_regmatch_slab;
2416 44         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2417            
2418           /* on first ever match, allocate first slab */
2419 127468         if (!PL_regmatch_slab) {
2420 146212         Newx(PL_regmatch_slab, 1, regmatch_slab);
2421 146212         PL_regmatch_slab->prev = NULL;
2422 8746284         PL_regmatch_slab->next = NULL;
2423 8746284         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2424           }
2425            
2426 8746328         old_regmatch_state = PL_regmatch_state;
2427 7700524         old_regmatch_slab = PL_regmatch_slab;
2428            
2429 1045972         for (i=0; i <= max; i++) {
2430 8746408         if (i == 1)
2431 39691835         reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2432 17384374         else if (i ==2)
2433 16753958         reginfo->info_aux_eval =
2434 16625990         reginfo->info_aux->info_aux_eval =
2435 16625954         &(PL_regmatch_state->u.info_aux_eval);
2436            
2437 128128         if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2438 128004         PL_regmatch_state = S_push_slab(aTHX);
2439           }
2440            
2441           /* note initial PL_regmatch_state position; at end of match we'll
2442           * pop back to there and free any higher slabs */
2443            
2444 128048         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2445 128048         reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2446 16753966         reginfo->info_aux->poscache = NULL;
2447            
2448 317622         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2449            
2450 317622         if ((prog->extflags & RXf_EVAL_SEEN))
2451 142560         S_setup_eval_state(aTHX_ reginfo);
2452           else
2453 137748         reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2454           }
2455            
2456           /* If there is a "must appear" string, look for it. */
2457            
2458 137784         if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2459           /* We have to be careful. If the previous successful match
2460           was from this regex we don't want a subsequent partially
2461           successful match to clobber the old results.
2462           So when we detect this possibility we add a swap buffer
2463           to the re, and switch the buffer each match. If we fail,
2464           we switch it back; otherwise we leave it swapped.
2465           */
2466 4820         swap = prog->offs;
2467           /* do we need a save destructor here for eval dies? */
2468 25836332         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2469 16716196         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2470           "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2471           PTR2UV(prog),
2472           PTR2UV(swap),
2473           PTR2UV(prog->offs)
2474           ));
2475           }
2476            
2477           /* Simplest case: anchored match need be tried only once. */
2478           /* [unless only anchor is BOL and multiline is set] */
2479 9326306         if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2480 47949659         if (s == startpos && regtry(reginfo, &s))
2481           goto got_it;
2482 21768593         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2483 21766999         || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2484           {
2485           char *end;
2486            
2487 21766999         if (minlen)
2488 21146737         dontbother = minlen - 1;
2489 21768593         end = HOP3c(strend, -dontbother, strbeg) - 1;
2490           /* for multiline we only have to try after newlines */
2491 26181066         if (prog->check_substr || prog->check_utf8) {
2492           /* because of the goto we can not easily reuse the macros for bifurcating the
2493           unicode/non-unicode match modes here like we do elsewhere - demerphq */
2494 2506358         if (utf8_target) {
2495 31358         if (s == startpos)
2496           goto after_try_utf8;
2497           while (1) {
2498 0         if (regtry(reginfo, &s)) {
2499           goto got_it;
2500           }
2501           after_try_utf8:
2502 31358         if (s > end) {
2503           goto phooey;
2504           }
2505 2475000         if (prog->extflags & RXf_USE_INTUIT) {
2506 0         s = re_intuit_start(rx, sv, strbeg,
2507           s + UTF8SKIP(s), strend, flags, NULL);
2508 2475000         if (!s) {
2509           goto phooey;
2510           }
2511           }
2512           else {
2513 2506358         s += UTF8SKIP(s);
2514           }
2515           }
2516           } /* end search for check string in unicode */
2517           else {
2518 2506358         if (s == startpos) {
2519           goto after_try_latin;
2520           }
2521           while (1) {
2522 2084514         if (regtry(reginfo, &s)) {
2523           goto got_it;
2524           }
2525           after_try_latin:
2526 2084514         if (s > end) {
2527           goto phooey;
2528           }
2529 2084514         if (prog->extflags & RXf_USE_INTUIT) {
2530 2084514         s = re_intuit_start(rx, sv, strbeg,
2531           s + 1, strend, flags, NULL);
2532 0         if (!s) {
2533           goto phooey;
2534           }
2535           }
2536           else {
2537 0         s++;
2538           }
2539           }
2540           } /* end search for check string in latin*/
2541           } /* end search for check string */
2542           else { /* search for newline */
2543 0         if (s > startpos) {
2544           /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2545 421844         s--;
2546           }
2547           /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2548 421844         while (s <= end) { /* note it could be possible to match at the end of the string */
2549 2506358         if (*s++ == '\n') { /* don't need PL_utf8skip here */
2550 2506358         if (regtry(reginfo, &s))
2551           goto got_it;
2552           }
2553           }
2554           } /* end search for newline */
2555           } /* end anchored/multiline check string search */
2556           goto phooey;
2557 26181110         } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2558           {
2559           /* For anchored \G, the only position it can match from is
2560           * (ganch-gofs); we already set startpos to this above; if intuit
2561           * moved us on from there, we can't possibly succeed */
2562 22860482         assert(startpos == reginfo->ganch - prog->gofs);
2563 26181066         if (s == startpos && regtry(reginfo, &s))
2564           goto got_it;
2565           goto phooey;
2566           }
2567            
2568           /* Messy cases: unanchored match. */
2569 26181110         if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2570           /* we have /x+whatever/ */
2571           /* it must be a one character string (XXXX Except is_utf8_pat?) */
2572           char ch;
2573           #ifdef DEBUGGING
2574           int did_match = 0;
2575           #endif
2576 6475186         if (utf8_target) {
2577 264368         if (! prog->anchored_utf8) {
2578 242168         to_utf8_substr(prog);
2579           }
2580 10217631         ch = SvPVX_const(prog->anchored_utf8)[0];
2581 98256438         REXEC_FBC_SCAN(
2582           if (*s == ch) {
2583           DEBUG_EXECUTE_r( did_match = 1 );
2584           if (regtry(reginfo, &s)) goto got_it;
2585           s += UTF8SKIP(s);
2586           while (s < strend && *s == ch)
2587           s += UTF8SKIP(s);
2588           }
2589           );
2590            
2591           }
2592           else {
2593 81066906         if (! prog->anchored_substr) {
2594 110461603         if (! to_byte_substr(prog)) {
2595 12784408         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2596           }
2597           }
2598 110461599         ch = SvPVX_const(prog->anchored_substr)[0];
2599 110461599         REXEC_FBC_SCAN(
2600           if (*s == ch) {
2601           DEBUG_EXECUTE_r( did_match = 1 );
2602           if (regtry(reginfo, &s)) goto got_it;
2603           s++;
2604           while (s < strend && *s == ch)
2605           s++;
2606           }
2607           );
2608           }
2609 110461599         DEBUG_EXECUTE_r(if (!did_match)
2610           PerlIO_printf(Perl_debug_log,
2611           "Did not find anchored character...\n")
2612           );
2613           }
2614 11166         else if (prog->anchored_substr != NULL
2615 165476303         || prog->anchored_utf8 != NULL
2616 110461603         || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2617 96999029         && prog->float_max_offset < strend - s)) {
2618           SV *must;
2619           SSize_t back_max;
2620           SSize_t back_min;
2621           char *last;
2622           char *last1; /* Last position checked before */
2623           #ifdef DEBUGGING
2624           int did_match = 0;
2625           #endif
2626 40776694         if (prog->anchored_substr || prog->anchored_utf8) {
2627 40776694         if (utf8_target) {
2628 7496494         if (! prog->anchored_utf8) {
2629 136168564         to_utf8_substr(prog);
2630           }
2631 229782217         must = prog->anchored_utf8;
2632           }
2633           else {
2634 229782257         if (! prog->anchored_substr) {
2635 229782217         if (! to_byte_substr(prog)) {
2636 229782217         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2637           }
2638           }
2639 229782257         must = prog->anchored_substr;
2640           }
2641 229782257         back_max = back_min = prog->anchored_offset;
2642           } else {
2643 229782217         if (utf8_target) {
2644 229782217         if (! prog->float_utf8) {
2645 111403381         to_utf8_substr(prog);
2646           }
2647 382741242         must = prog->float_utf8;
2648           }
2649           else {
2650 271337861         if (! prog->float_substr) {
2651 271337861         if (! to_byte_substr(prog)) {
2652 271337861         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2653           }
2654           }
2655 229782217         must = prog->float_substr;
2656           }
2657 229782217         back_max = prog->float_max_offset;
2658 229782191         back_min = prog->float_min_offset;
2659           }
2660          
2661 110461643         if (back_min<0) {
2662           last = strend;
2663           } else {
2664 110461643         last = HOP3c(strend, /* Cannot start after this */
2665           -(SSize_t)(CHR_SVLEN(must)
2666           - (SvTAIL(must) != 0) + back_min), strbeg);
2667           }
2668 119320628         if (s > reginfo->strbeg)
2669 280         last1 = HOPc(s, -1);
2670           else
2671 174767529         last1 = s - 1; /* bogus */
2672            
2673           /* XXXX check_substr already used to find "s", can optimize if
2674           check_substr==must. */
2675           dontbother = 0;
2676 1889308         strend = HOPc(strend, -dontbother);
2677 1889388         while ( (s <= last) &&
2678 1889308         (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2679           (unsigned char*)strend, must,
2680           multiline ? FBMrf_MULTILINE : 0)) ) {
2681 514668         DEBUG_EXECUTE_r( did_match = 1 );
2682 514668         if (HOPc(s, -back_max) > last1) {
2683 514668         last1 = HOPc(s, -back_min);
2684 514668         s = HOPc(s, -back_max);
2685           }
2686           else {
2687 1889268         char * const t = (last1 >= reginfo->strbeg)
2688 1889268         ? HOPc(last1, 1) : last1 + 1;
2689            
2690 9148         last1 = HOPc(s, -back_min);
2691 9148         s = t;
2692           }
2693 9188         if (utf8_target) {
2694 9148         while (s <= last1) {
2695 14022         if (regtry(reginfo, &s))
2696           goto got_it;
2697 13672         if (s >= last1) {
2698 8848         s++; /* to break out of outer loop */
2699 8748         break;
2700           }
2701 108870617         s += UTF8SKIP(s);
2702           }
2703           }
2704           else {
2705 108870657         while (s <= last1) {
2706 108870657         if (regtry(reginfo, &s))
2707           goto got_it;
2708 108870617         s++;
2709           }
2710           }
2711           }
2712 108870617         DEBUG_EXECUTE_r(if (!did_match) {
2713           RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2714           SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2715           PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2716           ((must == prog->anchored_substr || must == prog->anchored_utf8)
2717           ? "anchored" : "floating"),
2718           quoted, RE_SV_TAIL(must));
2719           });
2720           goto phooey;
2721           }
2722 107917905         else if ( (c = progi->regstclass) ) {
2723 107847143         if (minlen) {
2724 70766         const OPCODE op = OP(progi->regstclass);
2725           /* don't bother with what can't match */
2726 49494         if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2727 49490         strend = HOPc(strend, -(minlen - 1));
2728           }
2729 21276         DEBUG_EXECUTE_r({
2730           SV * const prop = sv_newmortal();
2731           regprop(prog, prop, c);
2732           {
2733           RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2734           s,strend-s,60);
2735           PerlIO_printf(Perl_debug_log,
2736           "Matching stclass %.*s against %s (%d bytes)\n",
2737           (int)SvCUR(prop), SvPVX_const(prop),
2738           quoted, (int)(strend - s));
2739           }
2740           });
2741 2700901         if (find_byclass(prog, c, s, strend, reginfo))
2742           goto got_it;
2743 168923         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2744           }
2745           else {
2746           dontbother = 0;
2747 940040         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2748           /* Trim the end. */
2749           char *last= NULL;
2750           SV* float_real;
2751           STRLEN len;
2752           const char *little;
2753            
2754 992639         if (utf8_target) {
2755 935396         if (! prog->float_utf8) {
2756 935396         to_utf8_substr(prog);
2757           }
2758 42454         float_real = prog->float_utf8;
2759           }
2760           else {
2761 0         if (! prog->float_substr) {
2762 0         if (! to_byte_substr(prog)) {
2763 0         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2764           }
2765           }
2766 42454         float_real = prog->float_substr;
2767           }
2768            
2769 18652         little = SvPV_const(float_real, len);
2770 18652         if (SvTAIL(float_real)) {
2771           /* This means that float_real contains an artificial \n on
2772           * the end due to the presence of something like this:
2773           * /foo$/ where we can match both "foo" and "foo\n" at the
2774           * end of the string. So we have to compare the end of the
2775           * string first against the float_real without the \n and
2776           * then against the full float_real with the string. We
2777           * have to watch out for cases where the string might be
2778           * smaller than the float_real or the float_real without
2779           * the \n. */
2780 8416         char *checkpos= strend - len;
2781 8416         DEBUG_OPTIMISE_r(
2782           PerlIO_printf(Perl_debug_log,
2783           "%sChecking for float_real.%s\n",
2784           PL_colors[4], PL_colors[5]));
2785 0         if (checkpos + 1 < strbeg) {
2786           /* can't match, even if we remove the trailing \n
2787           * string is too short to match */
2788 8416         DEBUG_EXECUTE_r(
2789           PerlIO_printf(Perl_debug_log,
2790           "%sString shorter than required trailing substring, cannot match.%s\n",
2791           PL_colors[4], PL_colors[5]));
2792           goto phooey;
2793 8416         } else if (memEQ(checkpos + 1, little, len - 1)) {
2794           /* can match, the end of the string matches without the
2795           * "\n" */
2796 8416         last = checkpos + 1;
2797 0         } else if (checkpos < strbeg) {
2798           /* cant match, string is too short when the "\n" is
2799           * included */
2800 8416         DEBUG_EXECUTE_r(
2801           PerlIO_printf(Perl_debug_log,
2802           "%sString does not contain required trailing substring, cannot match.%s\n",
2803           PL_colors[4], PL_colors[5]));
2804           goto phooey;
2805 8416         } else if (!multiline) {
2806           /* non multiline match, so compare with the "\n" at the
2807           * end of the string */
2808 0         if (memEQ(checkpos, little, len)) {
2809           last= checkpos;
2810           } else {
2811 0         DEBUG_EXECUTE_r(
2812           PerlIO_printf(Perl_debug_log,
2813           "%sString does not contain required trailing substring, cannot match.%s\n",
2814           PL_colors[4], PL_colors[5]));
2815           goto phooey;
2816           }
2817           } else {
2818           /* multiline match, so we have to search for a place
2819           * where the full string is located */
2820           goto find_last;
2821           }
2822           } else {
2823           find_last:
2824 0         if (len)
2825 0         last = rninstr(s, strend, little, little + len);
2826           else
2827           last = strend; /* matching "$" */
2828           }
2829 0         if (!last) {
2830           /* at one point this block contained a comment which was
2831           * probably incorrect, which said that this was a "should not
2832           * happen" case. Even if it was true when it was written I am
2833           * pretty sure it is not anymore, so I have removed the comment
2834           * and replaced it with this one. Yves */
2835 892942         DEBUG_EXECUTE_r(
2836           PerlIO_printf(Perl_debug_log,
2837           "String does not contain required substring, cannot match.\n"
2838           ));
2839           goto phooey;
2840           }
2841 192182         dontbother = strend - last + prog->float_min_offset;
2842           }
2843 93166         if (minlen && (dontbother < minlen))
2844 142572         dontbother = minlen - 1;
2845 142572         strend -= dontbother; /* this one's always in bytes! */
2846           /* We don't know much -- general case. */
2847 13336         if (utf8_target) {
2848           for (;;) {
2849 62454         if (regtry(reginfo, &s))
2850           goto got_it;
2851 830488         if (s >= strend)
2852           break;
2853 159122         s += UTF8SKIP(s);
2854 0         };
2855           }
2856           else {
2857           do {
2858 7908         if (regtry(reginfo, &s))
2859           goto got_it;
2860 5516         } while (s++ < strend);
2861           }
2862           }
2863            
2864           /* Failure. */
2865           goto phooey;
2866            
2867           got_it:
2868           /* s/// doesn't like it if $& is earlier than where we asked it to
2869           * start searching (which can happen on something like /.\G/) */
2870 42         if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2871 665850         && (prog->offs[0].start < stringarg - strbeg))
2872           {
2873           /* this should only be possible under \G */
2874 0         assert(prog->extflags & RXf_GPOS_SEEN);
2875 0         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2876           "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2877           goto phooey;
2878           }
2879            
2880 108870659         DEBUG_BUFFERS_r(
2881           if (swap)
2882           PerlIO_printf(Perl_debug_log,
2883           "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2884           PTR2UV(prog),
2885           PTR2UV(swap)
2886           );
2887           );
2888 86308         Safefree(swap);
2889            
2890           /* clean up; this will trigger destructors that will free all slabs
2891           * above the current one, and cleanup the regmatch_info_aux
2892           * and regmatch_info_aux_eval sructs */
2893            
2894 108784393         LEAVE_SCOPE(oldsave);
2895            
2896 290470         if (RXp_PAREN_NAMES(prog))
2897 240938         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2898            
2899 241022         RX_MATCH_UTF8_set(rx, utf8_target);
2900            
2901           /* make sure $`, $&, $', and $digit will work later */
2902 290470         if ( !(flags & REXEC_NOT_FIRST) )
2903 435684         S_reg_set_capture_string(aTHX_ rx,
2904           strbeg, reginfo->strend,
2905           sv, flags, utf8_target);
2906            
2907           return 1;
2908            
2909           phooey:
2910 240726         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2911           PL_colors[4], PL_colors[5]));
2912            
2913           /* clean up; this will trigger destructors that will free all slabs
2914           * above the current one, and cleanup the regmatch_info_aux
2915           * and regmatch_info_aux_eval sructs */
2916            
2917 410792         LEAVE_SCOPE(oldsave);
2918            
2919 49706         if (swap) {
2920           /* we failed :-( roll it back */
2921 108493923         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2922           "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2923           PTR2UV(prog),
2924           PTR2UV(prog->offs),
2925           PTR2UV(swap)
2926           ));
2927 10084         Safefree(prog->offs);
2928 0         prog->offs = swap;
2929           }
2930           return 0;
2931           }
2932            
2933            
2934           /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
2935           * Do inc before dec, in case old and new rex are the same */
2936           #define SET_reg_curpm(Re2) \
2937           if (reginfo->info_aux_eval) { \
2938           (void)ReREFCNT_inc(Re2); \
2939           ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2940           PM_SETRE((PL_reg_curpm), (Re2)); \
2941           }
2942            
2943            
2944           /*
2945           - regtry - try match at specific point
2946           */
2947           STATIC I32 /* 0 failure, 1 success */
2948 108483881         S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2949           {
2950           dVAR;
2951           CHECKPOINT lastcp;
2952 108677270         REGEXP *const rx = reginfo->prog;
2953 229782259         regexp *const prog = ReANY(rx);
2954           SSize_t result;
2955 229782259         RXi_GET_DECL(prog,progi);
2956 229782259         GET_RE_DEBUG_FLAGS_DECL;
2957            
2958 229782259         PERL_ARGS_ASSERT_REGTRY;
2959            
2960 229782259         reginfo->cutpoint=NULL;
2961            
2962 229782259         prog->offs[0].start = *startposp - reginfo->strbeg;
2963 229782259         prog->lastparen = 0;
2964 229782259         prog->lastcloseparen = 0;
2965            
2966           /* XXXX What this code is doing here?!!! There should be no need
2967           to do this again and again, prog->lastparen should take care of
2968           this! --ilya*/
2969            
2970           /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2971           * Actually, the code in regcppop() (which Ilya may be meaning by
2972           * prog->lastparen), is not needed at all by the test suite
2973           * (op/regexp, op/pat, op/split), but that code is needed otherwise
2974           * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2975           * Meanwhile, this code *is* needed for the
2976           * above-mentioned test suite tests to succeed. The common theme
2977           * on those tests seems to be returning null fields from matches.
2978           * --jhi updated by dapm */
2979           #if 1
2980 229782259         if (prog->nparens) {
2981 229782221         regexp_paren_pair *pp = prog->offs;
2982           I32 i;
2983 229782225         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2984 229782221         ++pp;
2985 1749205844         pp->start = -1;
2986 1634564290         pp->end = -1;
2987           }
2988           }
2989           #endif
2990 1634564328         REGCP_SET(lastcp);
2991 1909591423         result = regmatch(reginfo, *startposp, progi->program + 1);
2992 2184219214         if (result != -1) {
2993 2184219214         prog->offs[0].end = result;
2994 151487096         return 1;
2995           }
2996 5549990         if (reginfo->cutpoint)
2997 5267184         *startposp= reginfo->cutpoint;
2998 1310327         REGCP_UNWIND(lastcp);
2999           return 0;
3000           }
3001            
3002            
3003           #define sayYES goto yes
3004           #define sayNO goto no
3005           #define sayNO_SILENT goto no_silent
3006            
3007           /* we dont use STMT_START/END here because it leads to
3008           "unreachable code" warnings, which are bogus, but distracting. */
3009           #define CACHEsayNO \
3010           if (ST.cache_mask) \
3011           reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3012           sayNO
3013            
3014           /* this is used to determine how far from the left messages like
3015           'failed...' are printed. It should be set such that messages
3016           are inline with the regop output that created them.
3017           */
3018           #define REPORT_CODE_OFF 32
3019            
3020            
3021           #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3022           #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
3023           #define CHRTEST_NOT_A_CP_1 -999
3024           #define CHRTEST_NOT_A_CP_2 -998
3025            
3026           /* grab a new slab and return the first slot in it */
3027            
3028           STATIC regmatch_state *
3029 12804828         S_push_slab(pTHX)
3030           {
3031           #if PERL_VERSION < 9 && !defined(PERL_CORE)
3032           dMY_CXT;
3033           #endif
3034 304         regmatch_slab *s = PL_regmatch_slab->next;
3035 304         if (!s) {
3036 304         Newx(s, 1, regmatch_slab);
3037 0         s->prev = PL_regmatch_slab;
3038 0         s->next = NULL;
3039 8866578         PL_regmatch_slab->next = s;
3040           }
3041 34844664         PL_regmatch_slab = s;
3042 11191980         return SLAB_FIRST(s);
3043           }
3044            
3045            
3046           /* push a new state then goto it */
3047            
3048           #define PUSH_STATE_GOTO(state, node, input) \
3049           pushinput = input; \
3050           scan = node; \
3051           st->resume_state = state; \
3052           goto push_state;
3053            
3054           /* push a new state with success backtracking, then goto it */
3055            
3056           #define PUSH_YES_STATE_GOTO(state, node, input) \
3057           pushinput = input; \
3058           scan = node; \
3059           st->resume_state = state; \
3060           goto push_yes_state;
3061            
3062            
3063            
3064            
3065           /*
3066            
3067           regmatch() - main matching routine
3068            
3069           This is basically one big switch statement in a loop. We execute an op,
3070           set 'next' to point the next op, and continue. If we come to a point which
3071           we may need to backtrack to on failure such as (A|B|C), we push a
3072           backtrack state onto the backtrack stack. On failure, we pop the top
3073           state, and re-enter the loop at the state indicated. If there are no more
3074           states to pop, we return failure.
3075            
3076           Sometimes we also need to backtrack on success; for example /A+/, where
3077           after successfully matching one A, we need to go back and try to
3078           match another one; similarly for lookahead assertions: if the assertion
3079           completes successfully, we backtrack to the state just before the assertion
3080           and then carry on. In these cases, the pushed state is marked as
3081           'backtrack on success too'. This marking is in fact done by a chain of
3082           pointers, each pointing to the previous 'yes' state. On success, we pop to
3083           the nearest yes state, discarding any intermediate failure-only states.
3084           Sometimes a yes state is pushed just to force some cleanup code to be
3085           called at the end of a successful match or submatch; e.g. (??{$re}) uses
3086           it to free the inner regex.
3087            
3088           Note that failure backtracking rewinds the cursor position, while
3089           success backtracking leaves it alone.
3090            
3091           A pattern is complete when the END op is executed, while a subpattern
3092           such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3093           ops trigger the "pop to last yes state if any, otherwise return true"
3094           behaviour.
3095            
3096           A common convention in this function is to use A and B to refer to the two
3097           subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3098           the subpattern to be matched possibly multiple times, while B is the entire
3099           rest of the pattern. Variable and state names reflect this convention.
3100            
3101           The states in the main switch are the union of ops and failure/success of
3102           substates associated with with that op. For example, IFMATCH is the op
3103           that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3104           'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3105           successfully matched A and IFMATCH_A_fail is a state saying that we have
3106           just failed to match A. Resume states always come in pairs. The backtrack
3107           state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3108           at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3109           on success or failure.
3110            
3111           The struct that holds a backtracking state is actually a big union, with
3112           one variant for each major type of op. The variable st points to the
3113           top-most backtrack struct. To make the code clearer, within each
3114           block of code we #define ST to alias the relevant union.
3115            
3116           Here's a concrete example of a (vastly oversimplified) IFMATCH
3117           implementation:
3118            
3119           switch (state) {
3120           ....
3121            
3122           #define ST st->u.ifmatch
3123            
3124           case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3125           ST.foo = ...; // some state we wish to save
3126           ...
3127           // push a yes backtrack state with a resume value of
3128           // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3129           // first node of A:
3130           PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3131           // NOTREACHED
3132            
3133           case IFMATCH_A: // we have successfully executed A; now continue with B
3134           next = B;
3135           bar = ST.foo; // do something with the preserved value
3136           break;
3137            
3138           case IFMATCH_A_fail: // A failed, so the assertion failed
3139           ...; // do some housekeeping, then ...
3140           sayNO; // propagate the failure
3141            
3142           #undef ST
3143            
3144           ...
3145           }
3146            
3147           For any old-timers reading this who are familiar with the old recursive
3148           approach, the code above is equivalent to:
3149            
3150           case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3151           {
3152           int foo = ...
3153           ...
3154           if (regmatch(A)) {
3155           next = B;
3156           bar = foo;
3157           break;
3158           }
3159           ...; // do some housekeeping, then ...
3160           sayNO; // propagate the failure
3161           }
3162            
3163           The topmost backtrack state, pointed to by st, is usually free. If you
3164           want to claim it, populate any ST.foo fields in it with values you wish to
3165           save, then do one of
3166            
3167           PUSH_STATE_GOTO(resume_state, node, newinput);
3168           PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3169            
3170           which sets that backtrack state's resume value to 'resume_state', pushes a
3171           new free entry to the top of the backtrack stack, then goes to 'node'.
3172           On backtracking, the free slot is popped, and the saved state becomes the
3173           new free state. An ST.foo field in this new top state can be temporarily
3174           accessed to retrieve values, but once the main loop is re-entered, it
3175           becomes available for reuse.
3176            
3177           Note that the depth of the backtrack stack constantly increases during the
3178           left-to-right execution of the pattern, rather than going up and down with
3179           the pattern nesting. For example the stack is at its maximum at Z at the
3180           end of the pattern, rather than at X in the following:
3181            
3182           /(((X)+)+)+....(Y)+....Z/
3183            
3184           The only exceptions to this are lookahead/behind assertions and the cut,
3185           (?>A), which pop all the backtrack states associated with A before
3186           continuing.
3187          
3188           Backtrack state structs are allocated in slabs of about 4K in size.
3189           PL_regmatch_state and st always point to the currently active state,
3190           and PL_regmatch_slab points to the slab currently containing
3191           PL_regmatch_state. The first time regmatch() is called, the first slab is
3192           allocated, and is never freed until interpreter destruction. When the slab
3193           is full, a new one is allocated and chained to the end. At exit from
3194           regmatch(), slabs allocated since entry are freed.
3195            
3196           */
3197          
3198            
3199           #define DEBUG_STATE_pp(pp) \
3200           DEBUG_STATE_r({ \
3201           DUMP_EXEC_POS(locinput, scan, utf8_target); \
3202           PerlIO_printf(Perl_debug_log, \
3203           " %*s"pp" %s%s%s%s%s\n", \
3204           depth*2, "", \
3205           PL_reg_name[st->resume_state], \
3206           ((st==yes_state||st==mark_state) ? "[" : ""), \
3207           ((st==yes_state) ? "Y" : ""), \
3208           ((st==mark_state) ? "M" : ""), \
3209           ((st==yes_state||st==mark_state) ? "]" : "") \
3210           ); \
3211           });
3212            
3213            
3214           #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3215            
3216           #ifdef DEBUGGING
3217            
3218           STATIC void
3219 6828698         S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3220           const char *start, const char *end, const char *blurb)
3221           {
3222 1122616         const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3223            
3224 248         PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3225            
3226 244         if (!PL_colorset)
3227 152         reginitcolors();
3228           {
3229 24455914         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3230           RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3231          
3232 10937458         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3233           start, end - start, 60);
3234          
3235 45403611         PerlIO_printf(Perl_debug_log,
3236           "%s%s REx%s %s against %s\n",
3237           PL_colors[4], blurb, PL_colors[5], s0, s1);
3238          
3239 45403611         if (utf8_target||utf8_pat)
3240 45403573         PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3241           utf8_pat ? "pattern" : "",
3242 45403545         utf8_pat && utf8_target ? " and " : "",
3243           utf8_target ? "string" : ""
3244           );
3245           }
3246 45403611         }
3247            
3248           STATIC void
3249 45403697         S_dump_exec_pos(pTHX_ const char *locinput,
3250           const regnode *scan,
3251           const char *loc_regeol,
3252           const char *loc_bostr,
3253           const char *loc_reg_starttry,
3254           const bool utf8_target)
3255           {
3256 42833481         const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3257 38333452         const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3258 40024         int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3259           /* The part of the string before starttry has one color
3260           (pref0_len chars), between starttry and current
3261           position another one (pref_len - pref0_len chars),
3262           after the current position the third one.
3263           We assume that pref0_len <= pref_len, otherwise we
3264           decrease pref0_len. */
3265 7070561         int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3266 7070561         ? (5 + taill) - l : locinput - loc_bostr;
3267           int pref0_len;
3268            
3269 7070561         PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3270            
3271 7070561         while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3272 7070381         pref_len++;
3273 7070561         pref0_len = pref_len - (locinput - loc_reg_starttry);
3274 7070561         if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3275 46821545         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3276 36216692         ? (5 + taill) - pref_len : loc_regeol - locinput);
3277 36216870         while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3278 36216690         l--;
3279 36216870         if (pref0_len < 0)
3280           pref0_len = 0;
3281 4016683         if (pref0_len > pref_len)
3282           pref0_len = pref_len;
3283           {
3284 3951475         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3285            
3286 11228         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3287           (locinput - pref_len),pref0_len, 60, 4, 5);
3288          
3289 11228         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3290           (locinput - pref_len + pref0_len),
3291           pref_len - pref0_len, 60, 2, 3);
3292          
3293 3940427         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3294           locinput, loc_regeol - locinput, 10, 0, 1);
3295            
3296 3940427         const STRLEN tlen=len0+len1+len2;
3297 4016863         PerlIO_printf(Perl_debug_log,
3298           "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3299           (IV)(locinput - loc_bostr),
3300           len0, s0,
3301           len1, s1,
3302           (docolor ? "" : "> <"),
3303           len2, s2,
3304 4003609         (int)(tlen > 19 ? 0 : 19 - tlen),
3305           "");
3306           }
3307 4016683         }
3308            
3309           #endif
3310            
3311           /* reg_check_named_buff_matched()
3312           * Checks to see if a named buffer has matched. The data array of
3313           * buffer numbers corresponding to the buffer is expected to reside
3314           * in the regexp->data->data array in the slot stored in the ARG() of
3315           * node involved. Note that this routine doesn't actually care about the
3316           * name, that information is not preserved from compilation to execution.
3317           * Returns the index of the leftmost defined buffer with the given name
3318           * or 0 if non of the buffers matched.
3319           */
3320           STATIC I32
3321 68422759         S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3322           {
3323           I32 n;
3324 32206069         RXi_GET_DECL(rex,rexi);
3325 32206069         SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3326 32206069         I32 *nums=(I32*)SvPVX(sv_dat);
3327            
3328 13056         PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3329            
3330 47273609         for ( n=0; n
3331 30142261         if ((I32)rex->lastparen >= nums[n] &&
3332 30142261         rex->offs[nums[n]].end != -1)
3333           {
3334 30142137         return nums[n];
3335           }
3336           }
3337           return 0;
3338           }
3339            
3340            
3341           static bool
3342 30129601         S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3343           U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3344           {
3345           /* This function determines if there are one or two characters that match
3346           * the first character of the passed-in EXACTish node , and if
3347           * so, returns them in the passed-in pointers.
3348           *
3349           * If it determines that no possible character in the target string can
3350           * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3351           * the first character in requires UTF-8 to represent, and the
3352           * target string isn't in UTF-8.)
3353           *
3354           * If there are more than two characters that could match the beginning of
3355           * , or if more context is required to determine a match or not,
3356           * it sets both * and * to CHRTEST_VOID.
3357           *
3358           * The motiviation behind this function is to allow the caller to set up
3359           * tight loops for matching. If is of type EXACT, there is
3360           * only one possible character that can match its first character, and so
3361           * the situation is quite simple. But things get much more complicated if
3362           * folding is involved. It may be that the first character of an EXACTFish
3363           * node doesn't participate in any possible fold, e.g., punctuation, so it
3364           * can be matched only by itself. The vast majority of characters that are
3365           * in folds match just two things, their lower and upper-case equivalents.
3366           * But not all are like that; some have multiple possible matches, or match
3367           * sequences of more than one character. This function sorts all that out.
3368           *
3369           * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3370           * loop of trying to match A*, we know we can't exit where the thing
3371           * following it isn't a B. And something can't be a B unless it is the
3372           * beginning of B. By putting a quick test for that beginning in a tight
3373           * loop, we can rule out things that can't possibly be B without having to
3374           * break out of the loop, thus avoiding work. Similarly, if A is a single
3375           * character, we can make a tight loop matching A*, using the outputs of
3376           * this function.
3377           *
3378           * If the target string to match isn't in UTF-8, and there aren't
3379           * complications which require CHRTEST_VOID, * and * are set to
3380           * the one or two possible octets (which are characters in this situation)
3381           * that can match. In all cases, if there is only one character that can
3382           * match, * and * will be identical.
3383           *
3384           * If the target string is in UTF-8, the buffers pointed to by
3385           * and will contain the one or two UTF-8 sequences of bytes that
3386           * can match the beginning of . They should be declared with at
3387           * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3388           * undefined what these contain.) If one or both of the buffers are
3389           * invariant under UTF-8, *, and * will also be set to the
3390           * corresponding invariant. If variant, the corresponding * and/or
3391           * * will be set to a negative number(s) that shouldn't match any code
3392           * point (unless inappropriately coerced to unsigned). * will equal
3393           * * if and only if and are the same. */
3394            
3395 29148265         const bool utf8_target = reginfo->is_utf8_target;
3396            
3397           UV c1 = CHRTEST_NOT_A_CP_1;
3398           UV c2 = CHRTEST_NOT_A_CP_2;
3399           bool use_chrtest_void = FALSE;
3400 32206069         const bool is_utf8_pat = reginfo->is_utf8_pat;
3401            
3402           /* Used when we have both utf8 input and utf8 output, to avoid converting
3403           * to/from code points */
3404           bool utf8_has_been_setup = FALSE;
3405            
3406           dVAR;
3407            
3408 7070381         U8 *pat = (U8*)STRING(text_node);
3409            
3410 3951295         if (OP(text_node) == EXACT) {
3411            
3412           /* In an exact node, only one thing can be matched, that first
3413           * character. If both the pat and the target are UTF-8, we can just
3414           * copy the input to the output, avoiding finding the code point of
3415           * that character */
3416 9967643         if (!is_utf8_pat) {
3417 4041419         c2 = c1 = *pat;
3418           }
3419 4041419         else if (utf8_target) {
3420 3951295         Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3421 3951295         Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3422           utf8_has_been_setup = TRUE;
3423           }
3424           else {
3425 34150         c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3426           }
3427           }
3428           else /* an EXACTFish node */
3429 21244         if ((is_utf8_pat
3430 21457         && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3431           pat + STR_LEN(text_node)))
3432 34150         || (!is_utf8_pat
3433 34150         && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3434           pat + STR_LEN(text_node))))
3435           {
3436           /* Multi-character folds require more context to sort out. Also
3437           * PL_utf8_foldclosures used below doesn't handle them, so have to be
3438           * handled outside this routine */
3439           use_chrtest_void = TRUE;
3440           }
3441           else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3442 34150         c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3443 34150         if (c1 > 256) {
3444           /* Load the folds hash, if not already done */
3445           SV** listp;
3446 154956         if (! PL_utf8_foldclosures) {
3447 120806         if (! PL_utf8_tofold) {
3448           U8 dummy[UTF8_MAXBYTES_CASE+1];
3449            
3450           /* Force loading this by folding an above-Latin1 char */
3451 34150         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3452 3985445         assert(PL_utf8_tofold); /* Verify that worked */
3453           }
3454 3985445         PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3455           }
3456            
3457           /* The fold closures data structure is a hash with the keys being
3458           * the UTF-8 of every character that is folded to, like 'k', and
3459           * the values each an array of all code points that fold to its
3460           * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3461           * not included */
3462 264046         if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3463           (char *) pat,
3464           UTF8SKIP(pat),
3465           FALSE))))
3466           {
3467           /* Not found in the hash, therefore there are no folds
3468           * containing it, so there is only a single character that
3469           * could match */
3470           c2 = c1;
3471           }
3472           else { /* Does participate in folds */
3473 264046         AV* list = (AV*) *listp;
3474 264046         if (av_len(list) != 1) {
3475            
3476           /* If there aren't exactly two folds to this, it is outside
3477           * the scope of this function */
3478           use_chrtest_void = TRUE;
3479           }
3480           else { /* There are two. Get them */
3481 3985445         SV** c_p = av_fetch(list, 0, FALSE);
3482 3985445         if (c_p == NULL) {
3483 7970890         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3484           }
3485 3985445         c1 = SvUV(*c_p);
3486            
3487 3985445         c_p = av_fetch(list, 1, FALSE);
3488 3985445         if (c_p == NULL) {
3489 22112         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3490           }
3491 11064         c2 = SvUV(*c_p);
3492            
3493           /* Folds that cross the 255/256 boundary are forbidden if
3494           * EXACTFL, or EXACTFA and one is ASCIII. Since the
3495           * pattern character is above 256, and its only other match
3496           * is below 256, the only legal match will be to itself.
3497           * We have thrown away the original, so have to compute
3498           * which is the one above 255 */
3499 9104         if ((c1 < 256) != (c2 < 256)) {
3500 9104         if (OP(text_node) == EXACTFL
3501 1960         || ((OP(text_node) == EXACTFA
3502 1960         || OP(text_node) == EXACTFA_NO_TRIE)
3503 11064         && (isASCII(c1) || isASCII(c2))))
3504           {
3505 35192         if (c1 < 256) {
3506           c1 = c2;
3507           }
3508           else {
3509           c2 = c1;
3510           }
3511           }
3512           }
3513           }
3514           }
3515           }
3516           else /* Here, c1 is < 255 */
3517 24120         if (utf8_target
3518 13072         && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3519 13072         && OP(text_node) != EXACTFL
3520 13072         && ((OP(text_node) != EXACTFA
3521 3974397         && OP(text_node) != EXACTFA_NO_TRIE)
3522 98906         || ! isASCII(c1)))
3523           {
3524           /* Here, there could be something above Latin1 in the target which
3525           * folds to this character in the pattern. All such cases except
3526           * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3527           * involved in their folds, so are outside the scope of this
3528           * function */
3529 14142         if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3530           c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3531           }
3532           else {
3533           use_chrtest_void = TRUE;
3534           }
3535           }
3536           else { /* Here nothing above Latin1 can fold to the pattern character */
3537 3889633         switch (OP(text_node)) {
3538            
3539           case EXACTFL: /* /l rules */
3540 5977449         c2 = PL_fold_locale[c1];
3541 182238         break;
3542            
3543           case EXACTF: /* This node only generated for non-utf8
3544           patterns */
3545 3803207         assert(! is_utf8_pat);
3546 3985445         if (! utf8_target) { /* /d rules */
3547 89066         c2 = PL_fold[c1];
3548 3896379         break;
3549           }
3550           /* FALLTHROUGH */
3551           /* /u rules for all these. This happens to work for
3552           * EXACTFA as nothing in Latin1 folds to ASCII */
3553           case EXACTFA_NO_TRIE: /* This node only generated for
3554           non-utf8 patterns */
3555 254384000         assert(! is_utf8_pat);
3556           /* FALL THROUGH */
3557           case EXACTFA:
3558           case EXACTFU_SS:
3559           case EXACTFU:
3560 254384000         c2 = PL_fold_latin1[c1];
3561 254384000         break;
3562            
3563           default:
3564 839620         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3565           assert(0); /* NOTREACHED */
3566           }
3567           }
3568           }
3569            
3570           /* Here have figured things out. Set up the returns */
3571 839620         if (use_chrtest_void) {
3572 1251868         *c2p = *c1p = CHRTEST_VOID;
3573           }
3574 855546         else if (utf8_target) {
3575 802728         if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3576 796324         uvchr_to_utf8(c1_utf8, c1);
3577 791964         uvchr_to_utf8(c2_utf8, c2);
3578           }
3579            
3580           /* Invariants are stored in both the utf8 and byte outputs; Use
3581           * negative numbers otherwise for the byte ones. Make sure that the
3582           * byte ones are the same iff the utf8 ones are the same */
3583 558280         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3584 4360         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3585 1404         ? *c2_utf8
3586 559684         : (c1 == c2)
3587           ? CHRTEST_NOT_A_CP_1
3588 474421         : CHRTEST_NOT_A_CP_2;
3589           }
3590 148864         else if (c1 > 255) {
3591 135068         if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3592           can represent */
3593           return FALSE;
3594           }
3595            
3596 135048         *c1p = *c2p = c2; /* c2 is the only representable value */
3597           }
3598           else { /* c1 is representable; see about c2 */
3599 132538         *c1p = c1;
3600 127090         *c2p = (c2 < 256) ? c2 : c1;
3601           }
3602            
3603           return TRUE;
3604           }
3605            
3606           /* returns -1 on failure, $+[0] on success */
3607           STATIC SSize_t
3608 2552         S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3609           {
3610           #if PERL_VERSION < 9 && !defined(PERL_CORE)
3611           dMY_CXT;
3612           #endif
3613           dVAR;
3614 1776         const bool utf8_target = reginfo->is_utf8_target;
3615 128866         const U32 uniflags = UTF8_ALLOW_DEFAULT;
3616 253544422         REGEXP *rex_sv = reginfo->prog;
3617 249016736         regexp *rex = ReANY(rex_sv);
3618 170452682         RXi_GET_DECL(rex,rexi);
3619           /* the current state. This is a cached copy of PL_regmatch_state */
3620           regmatch_state *st;
3621           /* cache heavy used fields of st in registers */
3622           regnode *scan;
3623           regnode *next;
3624           U32 n = 0; /* general value; init to avoid compiler warning */
3625           SSize_t ln = 0; /* len or last; init to avoid compiler warning */
3626           char *locinput = startpos;
3627           char *pushinput; /* where to continue after a PUSH */
3628           I32 nextchr; /* is always set to UCHARAT(locinput) */
3629            
3630           bool result = 0; /* return value of S_regmatch */
3631           int depth = 0; /* depth of backtrack stack */
3632           U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3633 120297742         const U32 max_nochange_depth =
3634           (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3635 813210         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3636           regmatch_state *yes_state = NULL; /* state to pop to on success of
3637           subpattern */
3638           /* mark_state piggy backs on the yes_state logic so that when we unwind
3639           the stack on success we can update the mark_state as we go */
3640           regmatch_state *mark_state = NULL; /* last mark state we have seen */
3641           regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3642           struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3643           U32 state_num;
3644           bool no_final = 0; /* prevent failure from backtracking? */
3645           bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3646           char *startpoint = locinput;
3647           SV *popmark = NULL; /* are we looking for a mark? */
3648           SV *sv_commit = NULL; /* last mark name seen in failure */
3649           SV *sv_yes_mark = NULL; /* last mark name we have seen
3650           during a successful match */
3651           U32 lastopen = 0; /* last open we saw */
3652 406626         bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3653 848506         SV* const oreplsv = GvSV(PL_replgv);
3654           /* these three flags are set by various ops to signal information to
3655           * the very next op. They have a useful lifetime of exactly one loop
3656           * iteration, and are not preserved or restored by state pushes/pops
3657           */
3658           bool sw = 0; /* the condition value in (?(cond)a|b) */
3659           bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3660           int logical = 0; /* the following EVAL is:
3661           0: (?{...})
3662           1: (?(?{...})X|Y)
3663           2: (??{...})
3664           or the following IFMATCH/UNLESSM is:
3665           false: plain (?=foo)
3666           true: used as a condition: (?(?=foo))
3667           */
3668           PAD* last_pad = NULL;
3669           dMULTICALL;
3670           I32 gimme = G_SCALAR;
3671           CV *caller_cv = NULL; /* who called us */
3672           CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3673           CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3674 872782         U32 maxopenparen = 0; /* max '(' index seen so far */
3675           int to_complement; /* Invert the result? */
3676           _char_class_number classnum;
3677 1411512         bool is_utf8_pat = reginfo->is_utf8_pat;
3678            
3679           #ifdef DEBUGGING
3680 1411512         GET_RE_DEBUG_FLAGS_DECL;
3681           #endif
3682            
3683           /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3684           multicall_oldcatch = 0;
3685           multicall_cv = NULL;
3686           cx = NULL;
3687           PERL_UNUSED_VAR(multicall_cop);
3688           PERL_UNUSED_VAR(newsp);
3689            
3690            
3691 1411512         PERL_ARGS_ASSERT_REGMATCH;
3692            
3693 920324         DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3694           PerlIO_printf(Perl_debug_log,"regmatch start\n");
3695           }));
3696            
3697 920324         st = PL_regmatch_state;
3698            
3699           /* Note that nextchr is a byte even in UTF */
3700           SET_nextchr;
3701           scan = prog;
3702 462520         while (scan != NULL) {
3703            
3704 462478         DEBUG_EXECUTE_r( {
3705           SV * const prop = sv_newmortal();
3706           regnode *rnext=regnext(scan);
3707           DUMP_EXEC_POS( locinput, scan, utf8_target );
3708           regprop(rex, prop, scan);
3709          
3710           PerlIO_printf(Perl_debug_log,
3711           "%3"IVdf":%*s%s(%"IVdf")\n",
3712           (IV)(scan - rexi->program), depth*2, "",
3713           SvPVX_const(prop),
3714           (PL_regkind[OP(scan)] == END || !rnext) ?
3715           0 : (IV)(rnext - rexi->program));
3716           });
3717            
3718 735670         next = scan + NEXT_OFF(scan);
3719 491336         if (next == scan)
3720           next = NULL;
3721 366988         state_num = OP(scan);
3722            
3723           reenter_switch:
3724           to_complement = 0;
3725            
3726 201142         SET_nextchr;
3727 172098         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3728            
3729 161836         switch (state_num) {
3730           case BOL: /* /^../ */
3731 161688         if (locinput == reginfo->strbeg)
3732           break;
3733           sayNO;
3734            
3735           case MBOL: /* /^../m */
3736 498         if (locinput == reginfo->strbeg ||
3737 12146608         (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3738           {
3739           break;
3740           }
3741           sayNO;
3742            
3743           case SBOL: /* /^../s */
3744 5450         if (locinput == reginfo->strbeg)
3745           break;
3746           sayNO;
3747            
3748           case GPOS: /* \G */
3749 5222         if (locinput == reginfo->ganch)
3750           break;
3751           sayNO;
3752            
3753           case KEEPS: /* \K */
3754           /* update the startpoint */
3755 4218         st->u.keeper.val = rex->offs[0].start;
3756 4002         rex->offs[0].start = locinput - reginfo->strbeg;
3757 4002         PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3758           assert(0); /*NOTREACHED*/
3759           case KEEPS_next_fail:
3760           /* rollback the start point change */
3761 4002         rex->offs[0].start = st->u.keeper.val;
3762 4218         sayNO_SILENT;
3763           assert(0); /*NOTREACHED*/
3764            
3765           case EOL: /* /..$/ */
3766           goto seol;
3767            
3768           case MEOL: /* /..$/m */
3769 3990         if (!NEXTCHR_IS_EOS && nextchr != '\n')
3770           sayNO;
3771           break;
3772            
3773           case SEOL: /* /..$/s */
3774           seol:
3775 3992         if (!NEXTCHR_IS_EOS && nextchr != '\n')
3776           sayNO;
3777 3776         if (reginfo->strend - locinput > 1)
3778           sayNO;
3779           break;
3780            
3781           case EOS: /* \z */
3782 3774         if (!NEXTCHR_IS_EOS)
3783           sayNO;
3784           break;
3785            
3786           case SANY: /* /./s */
3787 228         if (NEXTCHR_IS_EOS)
3788           sayNO;
3789           goto increment_locinput;
3790            
3791           case CANY: /* \C */
3792 228         if (NEXTCHR_IS_EOS)
3793           sayNO;
3794 17744483         locinput++;
3795 11204186         break;
3796            
3797           case REG_ANY: /* /./ */
3798 12142390         if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3799           sayNO;
3800           goto increment_locinput;
3801            
3802            
3803           #undef ST
3804           #define ST st->u.trie
3805           case TRIEC: /* (ab|cd) with known charclass */
3806           /* In this case the charclass data is available inline so
3807           we can fail fast without a lot of extra overhead.
3808           */
3809 1170         if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3810 1168         DEBUG_EXECUTE_r(
3811           PerlIO_printf(Perl_debug_log,
3812           "%*s %sfailed to match trie start class...%s\n",
3813           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3814           );
3815           sayNO_SILENT;
3816           assert(0); /* NOTREACHED */
3817           }
3818           /* FALL THROUGH */
3819           case TRIE: /* (ab|cd) */
3820           /* the basic plan of execution of the trie is:
3821           * At the beginning, run though all the states, and
3822           * find the longest-matching word. Also remember the position
3823           * of the shortest matching word. For example, this pattern:
3824           * 1 2 3 4 5
3825           * ab|a|x|abcd|abc
3826           * when matched against the string "abcde", will generate
3827           * accept states for all words except 3, with the longest
3828           * matching word being 4, and the shortest being 2 (with
3829           * the position being after char 1 of the string).
3830           *
3831           * Then for each matching word, in word order (i.e. 1,2,4,5),
3832           * we run the remainder of the pattern; on each try setting
3833           * the current position to the character following the word,
3834           * returning to try the next word on failure.
3835           *
3836           * We avoid having to build a list of words at runtime by
3837           * using a compile-time structure, wordinfo[].prev, which
3838           * gives, for each word, the previous accepting word (if any).
3839           * In the case above it would contain the mappings 1->2, 2->0,
3840           * 3->0, 4->5, 5->1. We can use this table to generate, from
3841           * the longest word (4 above), a list of all words, by
3842           * following the list of prev pointers; this gives us the
3843           * unordered list 4,5,1,2. Then given the current word we have
3844           * just tried, we can go through the list and find the
3845           * next-biggest word to try (so if we just failed on word 2,
3846           * the next in the list is 4).
3847           *
3848           * Since at runtime we don't record the matching position in
3849           * the string for each word, we have to work that out for
3850           * each word we're about to process. The wordinfo table holds
3851           * the character length of each word; given that we recorded
3852           * at the start: the position of the shortest word and its
3853           * length in chars, we just need to move the pointer the
3854           * difference between the two char lengths. Depending on
3855           * Unicode status and folding, that's cheap or expensive.
3856           *
3857           * This algorithm is optimised for the case where are only a
3858           * small number of accept states, i.e. 0,1, or maybe 2.
3859           * With lots of accepts states, and having to try all of them,
3860           * it becomes quadratic on number of accept states to find all
3861           * the next words.
3862           */
3863            
3864           {
3865           /* what type of TRIE am I? (utf8 makes this contextual) */
3866 1174         DECL_TRIE_TYPE(scan);
3867            
3868           /* what trie are we using right now */
3869 110         reg_trie_data * const trie
3870 110         = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3871 110         HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3872 10947424         U32 state = trie->startstate;
3873            
3874 10947424         if ( trie->bitmap
3875 10947420         && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3876           {
3877 1193702         if (trie->states[ state ].wordnum) {
3878 1193702         DEBUG_EXECUTE_r(
3879           PerlIO_printf(Perl_debug_log,
3880           "%*s %smatched empty string...%s\n",
3881           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3882           );
3883 1193702         if (!trie->jump)
3884           break;
3885           } else {
3886 0         DEBUG_EXECUTE_r(
3887           PerlIO_printf(Perl_debug_log,
3888           "%*s %sfailed to match trie start class...%s\n",
3889           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3890           );
3891           sayNO_SILENT;
3892           }
3893           }
3894            
3895           {
3896           U8 *uc = ( U8* )locinput;
3897            
3898 12146612         STRLEN len = 0;
3899 58775536         STRLEN foldlen = 0;
3900           U8 *uscan = (U8*)NULL;
3901           U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3902           U32 charcount = 0; /* how many input chars we have matched */
3903           U32 accepted = 0; /* have we seen any accepting states? */
3904            
3905 57344290         ST.jump = trie->jump;
3906 826002         ST.me = scan;
3907 522784         ST.firstpos = NULL;
3908 56518292         ST.longfold = FALSE; /* char longer if folded => it's harder */
3909 27052025         ST.nextword = 0;
3910            
3911           /* fully traverse the TRIE; note the position of the
3912           shortest accept state and the wordnum of the longest
3913           accept state */
3914            
3915 112         while ( state && uc <= (U8*)(reginfo->strend) ) {
3916 194         U32 base = trie->states[ state ].trans.base;
3917 104         UV uvc = 0;
3918           U16 charid = 0;
3919           U16 wordnum;
3920 94         wordnum = trie->states[ state ].wordnum;
3921            
3922 24         if (wordnum) { /* it's an accept state */
3923 8         if (!accepted) {
3924           accepted = 1;
3925           /* record first match position */
3926 88         if (ST.longfold) {
3927 1250552         ST.firstpos = (U8*)locinput;
3928 7708673         ST.firstchars = 0;
3929           }
3930           else {
3931 182216         ST.firstpos = uc;
3932 182216         ST.firstchars = charcount;
3933           }
3934           }
3935 7839793         if (!ST.nextword || wordnum < ST.nextword)
3936 7664131         ST.nextword = wordnum;
3937 5121993         ST.topword = wordnum;
3938           }
3939            
3940 5141532         DEBUG_TRIE_EXECUTE_r({
3941           DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3942           PerlIO_printf( Perl_debug_log,
3943           "%*s %sState: %4"UVxf" Accepted: %c ",
3944           2+depth * 2, "", PL_colors[4],
3945           (UV)state, (accepted ? 'Y' : 'N'));
3946           });
3947            
3948           /* read a char and goto next state */
3949 293500         if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3950           I32 offset;
3951 290924         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3952           uscan, len, uvc, charid, foldlen,
3953           foldbuf, uniflags);
3954 159530         charcount++;
3955 54556         if (foldlen>0)
3956 131394         ST.longfold = TRUE;
3957 24908         if (charid &&
3958 18676         ( ((offset =
3959 11412         base + charid - 1 - trie->uniquecharcount)) >= 0)
3960            
3961 118960         && ((U32)offset < trie->lasttrans)
3962 118960         && trie->trans[offset].check == state)
3963           {
3964 79528         state = trie->trans[offset].next;
3965           }
3966           else {
3967           state = 0;
3968           }
3969 46         uc += len;
3970            
3971           }
3972           else {
3973           state = 0;
3974           }
3975 50         DEBUG_TRIE_EXECUTE_r(
3976           PerlIO_printf( Perl_debug_log,
3977           "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3978           charid, uvc, (UV)state, PL_colors[5] );
3979           );
3980           }
3981 79522         if (!accepted)
3982           sayNO;
3983            
3984           /* calculate total number of accept states */
3985           {
3986 79522         U16 w = ST.topword;
3987           accepted = 0;
3988 39444         while (w) {
3989 39260         w = trie->wordinfo[w].prev;
3990 39260         accepted++;
3991           }
3992 120         ST.accepted = accepted;
3993           }
3994            
3995 120         DEBUG_EXECUTE_r(
3996           PerlIO_printf( Perl_debug_log,
3997           "%*s %sgot %"IVdf" possible matches%s\n",
3998           REPORT_CODE_OFF + depth * 2, "",
3999           PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4000           );
4001           goto trie_first_try; /* jump into the fail handler */
4002           }}
4003           assert(0); /* NOTREACHED */
4004            
4005           case TRIE_next_fail: /* we failed - try next alternative */
4006           {
4007           U8 *uc;
4008 4         if ( ST.jump) {
4009 4         REGCP_UNWIND(ST.cp);
4010 104         UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4011           }
4012 71         if (!--ST.accepted) {
4013 4         DEBUG_EXECUTE_r({
4014           PerlIO_printf( Perl_debug_log,
4015           "%*s %sTRIE failed...%s\n",
4016           REPORT_CODE_OFF+depth*2, "",
4017           PL_colors[4],
4018           PL_colors[5] );
4019           });
4020           sayNO_SILENT;
4021           }
4022           {
4023           /* Find next-highest word to process. Note that this code
4024           * is O(N^2) per trie run (O(N) per branch), so keep tight */
4025           U16 min = 0;
4026           U16 word;
4027 118698         U16 const nextword = ST.nextword;
4028 4379828         reg_trie_wordinfo * const wordinfo
4029 4067818         = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4030 3731350         for (word=ST.topword; word; word=wordinfo[word].prev) {
4031 3731350         if (word > nextword && (!min || word < min))
4032           min = word;
4033           }
4034 16         ST.nextword = min;
4035           }
4036            
4037           trie_first_try:
4038 12         if (do_cutgroup) {
4039           do_cutgroup = 0;
4040           no_final = 0;
4041           }
4042            
4043 6         if ( ST.jump) {
4044 336468         ST.lastparen = rex->lastparen;
4045 56         ST.lastcloseparen = rex->lastcloseparen;
4046 2         REGCP_SET(ST.cp);
4047           }
4048            
4049           /* find start char of end of current word */
4050           {
4051           U32 chars; /* how many chars to skip */
4052 336470         reg_trie_data * const trie
4053 336470         = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4054            
4055 335027         assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4056           >= ST.firstchars);
4057 1788         chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4058 1110         - ST.firstchars;
4059 1110         uc = ST.firstpos;
4060            
4061 678         if (ST.longfold) {
4062           /* the hard option - fold each char in turn and find
4063           * its folded length (which may be different */
4064           U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4065           STRLEN foldlen;
4066           STRLEN len;
4067           UV uvc;
4068           U8 *uscan;
4069            
4070 132         while (chars) {
4071 224         if (utf8_target) {
4072 88         uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4073           uniflags);
4074 26         uc += len;
4075           }
4076           else {
4077 542         uvc = *uc;
4078 104         uc++;
4079           }
4080 158         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4081           uscan = foldbuf;
4082 56         while (foldlen) {
4083 2         if (!--chars)
4084           break;
4085 552         uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4086           uniflags);
4087 496         uscan += len;
4088 114         foldlen -= len;
4089           }
4090           }
4091           }
4092           else {
4093 442         if (utf8_target)
4094 382         while (chars--)
4095 446         uc += UTF8SKIP(uc);
4096           else
4097 228         uc += chars;
4098           }
4099           }
4100            
4101 232         scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4102 224         ? ST.jump[ST.nextword]
4103 227         : NEXT_OFF(ST.me));
4104            
4105 226         DEBUG_EXECUTE_r({
4106           PerlIO_printf( Perl_debug_log,
4107           "%*s %sTRIE matched word #%d, continuing%s\n",
4108           REPORT_CODE_OFF+depth*2, "",
4109           PL_colors[4],
4110           ST.nextword,
4111           PL_colors[5]
4112           );
4113           });
4114            
4115 341         if (ST.accepted > 1 || has_cutgroup) {
4116 112         PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4117           assert(0); /* NOTREACHED */
4118           }
4119           /* only one choice left - just continue */
4120 8         DEBUG_EXECUTE_r({
4121           AV *const trie_words
4122           = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4123           SV ** const tmp = av_fetch( trie_words,
4124           ST.nextword-1, 0 );
4125           SV *sv= tmp ? sv_newmortal() : NULL;
4126            
4127           PerlIO_printf( Perl_debug_log,
4128           "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4129           REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4130           ST.nextword,
4131           tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4132           PL_colors[0], PL_colors[1],
4133           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4134           )
4135           : "not compiled under -Dr",
4136           PL_colors[5] );
4137           });
4138            
4139           locinput = (char*)uc;
4140 344         continue; /* execute rest of RE */
4141           assert(0); /* NOTREACHED */
4142           }
4143           #undef ST
4144            
4145           case EXACT: { /* /abc/ */
4146 206         char *s = STRING(scan);
4147 48         ln = STR_LEN(scan);
4148 338178         if (utf8_target != is_utf8_pat) {
4149           /* The target and the pattern have differing utf8ness. */
4150           char *l = locinput;
4151 328218         const char * const e = s + ln;
4152            
4153 2776         if (utf8_target) {
4154           /* The target is utf8, the pattern is not utf8.
4155           * Above-Latin1 code points can't match the pattern;
4156           * invariants match exactly, and the other Latin1 ones need
4157           * to be downgraded to a single byte in order to do the
4158           * comparison. (If we could be confident that the target
4159           * is not malformed, this could be refactored to have fewer
4160           * tests by just assuming that if the first bytes match, it
4161           * is an invariant, but there are tests in the test suite
4162           * dealing with (??{...}) which violate this) */
4163 336468         while (s < e) {
4164 852         if (l >= reginfo->strend
4165 426         || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4166           {
4167           sayNO;
4168           }
4169 1746         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4170 3091         if (*l != *s) {
4171           sayNO;
4172           }
4173 8648         l++;
4174           }
4175           else {
4176 8648         if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4177           {
4178           sayNO;
4179           }
4180 852         l += 2;
4181           }
4182 426         s++;
4183           }
4184           }
4185           else {
4186           /* The target is not utf8, the pattern is utf8. */
4187 1778         while (s < e) {
4188 61845         if (l >= reginfo->strend
4189 122465         || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4190           {
4191           sayNO;
4192           }
4193 122465         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4194 131113         if (*s != *l) {
4195           sayNO;
4196           }
4197 131113         s++;
4198           }
4199           else {
4200 131113         if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4201           {
4202           sayNO;
4203           }
4204 128813         s += 2;
4205           }
4206 127283         l++;
4207           }
4208           }
4209           locinput = l;
4210           }
4211           else {
4212           /* The target and the pattern have the same utf8ness. */
4213           /* Inline the first character, for speed. */
4214 127325         if (reginfo->strend - locinput < ln
4215 5930         || UCHARAT(s) != nextchr
4216 5430         || (ln > 1 && memNE(s, locinput, ln)))
4217           {
4218           sayNO;
4219           }
4220 5430         locinput += ln;
4221           }
4222           break;
4223           }
4224            
4225           case EXACTFL: { /* /abc/il */
4226           re_fold_t folder;
4227           const U8 * fold_array;
4228           const char * s;
4229           U32 fold_utf8_flags;
4230            
4231 4132         RX_MATCH_TAINTED_on(reginfo->prog);
4232           folder = foldEQ_locale;
4233           fold_array = PL_fold_locale;
4234           fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4235 4132         goto do_exactf;
4236            
4237           case EXACTFU_SS: /* /\x{df}/iu */
4238           case EXACTFU: /* /abc/iu */
4239           folder = foldEQ_latin1;
4240           fold_array = PL_fold_latin1;
4241 180062         fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4242 157351         goto do_exactf;
4243            
4244           case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
4245           patterns */
4246 334         assert(! is_utf8_pat);
4247           /* FALL THROUGH */
4248           case EXACTFA: /* /abc/iaa */
4249           folder = foldEQ_latin1;
4250           fold_array = PL_fold_latin1;
4251           fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4252           goto do_exactf;
4253            
4254           case EXACTF: /* /abc/i This node only generated for
4255           non-utf8 patterns */
4256 334         assert(! is_utf8_pat);
4257           folder = foldEQ;
4258           fold_array = PL_fold;
4259           fold_utf8_flags = 0;
4260            
4261           do_exactf:
4262 22869         s = STRING(scan);
4263 22869         ln = STR_LEN(scan);
4264            
4265 21966         if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4266           /* Either target or the pattern are utf8, or has the issue where
4267           * the fold lengths may differ. */
4268           const char * const l = locinput;
4269 3750         char *e = reginfo->strend;
4270            
4271 102         if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
4272           l, &e, 0, utf8_target, fold_utf8_flags))
4273           {
4274           sayNO;
4275           }
4276 19935         locinput = e;
4277 19935         break;
4278           }
4279            
4280           /* Neither the target nor the pattern are utf8 */
4281 2997858         if (UCHARAT(s) != nextchr
4282 2290614         && !NEXTCHR_IS_EOS
4283 0         && UCHARAT(s) != fold_array[nextchr])
4284           {
4285           sayNO;
4286           }
4287 2290614         if (reginfo->strend - locinput < ln)
4288           sayNO;
4289 0         if (ln > 1 && ! folder(s, locinput, ln))
4290           sayNO;
4291 2997858         locinput += ln;
4292 2997242         break;
4293           }
4294            
4295           /* XXX Could improve efficiency by separating these all out using a
4296           * macro or in-line function. At that point regcomp.c would no longer
4297           * have to set the FLAGS fields of these */
4298           case BOUNDL: /* /\b/l */
4299           case NBOUNDL: /* /\B/l */
4300 2997242         RX_MATCH_TAINTED_on(reginfo->prog);
4301           /* FALL THROUGH */
4302           case BOUND: /* /\b/ */
4303           case BOUNDU: /* /\b/u */
4304           case BOUNDA: /* /\b/a */
4305           case NBOUND: /* /\B/ */
4306           case NBOUNDU: /* /\B/u */
4307           case NBOUNDA: /* /\B/a */
4308           /* was last char in word? */
4309 616         if (utf8_target
4310 616         && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4311 1235174         && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4312           {
4313 1906         if (locinput == reginfo->strbeg)
4314           ln = '\n';
4315           else {
4316 0         const U8 * const r =
4317 1235174         reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4318            
4319 1235174         ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4320           }
4321 1235174         if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4322 1235174         ln = isWORDCHAR_uni(ln);
4323 1235174         if (NEXTCHR_IS_EOS)
4324           n = 0;
4325           else {
4326 1235174         LOAD_UTF8_CHARCLASS_ALNUM();
4327 35934         n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4328           utf8_target);
4329           }
4330           }
4331           else {
4332 1235174         ln = isWORDCHAR_LC_uvchr(ln);
4333 1235174         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4334           }
4335           }
4336           else {
4337            
4338           /* Here the string isn't utf8, or is utf8 and only ascii
4339           * characters are to match \w. In the latter case looking at
4340           * the byte just prior to the current one may be just the final
4341           * byte of a multi-byte character. This is ok. There are two
4342           * cases:
4343           * 1) it is a single byte character, and then the test is doing
4344           * just what it's supposed to.
4345           * 2) it is a multi-byte character, in which case the final
4346           * byte is never mistakable for ASCII, and so the test
4347           * will say it is not a word character, which is the
4348           * correct answer. */
4349 100768         ln = (locinput != reginfo->strbeg) ?
4350 50384         UCHARAT(locinput - 1) : '\n';
4351 1184790         switch (FLAGS(scan)) {
4352           case REGEX_UNICODE_CHARSET:
4353 1026212         ln = isWORDCHAR_L1(ln);
4354 158578         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4355 158578         break;
4356           case REGEX_LOCALE_CHARSET:
4357 1235174         ln = isWORDCHAR_LC(ln);
4358 47410         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4359 47410         break;
4360           case REGEX_DEPENDS_CHARSET:
4361 47410         ln = isWORDCHAR(ln);
4362 44505         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4363 120933         break;
4364           case REGEX_ASCII_RESTRICTED_CHARSET:
4365           case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4366 1235174         ln = isWORDCHAR_A(ln);
4367 1235174         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4368 1235174         break;
4369           default:
4370 207512         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4371           break;
4372           }
4373           }
4374           /* Note requires that all BOUNDs be lower than all NBOUNDs in
4375           * regcomp.sym */
4376 1027662         if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4377           sayNO;
4378           break;
4379            
4380           case ANYOF: /* /[abc]/ */
4381           case ANYOF_WARN_SUPER:
4382 1027662         if (NEXTCHR_IS_EOS)
4383           sayNO;
4384 1235174         if (utf8_target) {
4385 1234540         if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4386           sayNO;
4387 1235174         locinput += UTF8SKIP(locinput);
4388           }
4389           else {
4390 1235174         if (!REGINCLASS(rex, scan, (U8*)locinput))
4391           sayNO;
4392 1235174         locinput++;
4393           }
4394           break;
4395            
4396           /* The argument (FLAGS) to all the POSIX node types is the class number
4397           * */
4398            
4399           case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4400           to_complement = 1;
4401           /* FALLTHROUGH */
4402            
4403           case POSIXL: /* \w or [:punct:] etc. under /l */
4404 1235174         if (NEXTCHR_IS_EOS)
4405           sayNO;
4406            
4407           /* The locale hasn't influenced the outcome before this, so defer
4408           * tainting until now */
4409 1235174         RX_MATCH_TAINTED_on(reginfo->prog);
4410            
4411           /* Use isFOO_lc() for characters within Latin1. (Note that
4412           * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4413           * wouldn't be invariant) */
4414 32         if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4415 32         if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4416           sayNO;
4417           }
4418           }
4419 1235174         else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4420 1235174         if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4421           (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4422           *(locinput + 1))))))
4423           {
4424           sayNO;
4425           }
4426           }
4427           else { /* Here, must be an above Latin-1 code point */
4428           goto utf8_posix_not_eos;
4429           }
4430            
4431           /* Here, must be utf8 */
4432 1235174         locinput += UTF8SKIP(locinput);
4433 1235148         break;
4434            
4435           case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4436           to_complement = 1;
4437           /* FALLTHROUGH */
4438            
4439           case POSIXD: /* \w or [:punct:] etc. under /d */
4440 1235148         if (utf8_target) {
4441           goto utf8_posix;
4442           }
4443           goto posixa;
4444            
4445           case NPOSIXA: /* \W or [:^punct:] etc. under /a */
4446            
4447 2         if (NEXTCHR_IS_EOS) {
4448           sayNO;
4449           }
4450            
4451           /* All UTF-8 variants match */
4452 1235146         if (! UTF8_IS_INVARIANT(nextchr)) {
4453           goto increment_locinput;
4454           }
4455            
4456           to_complement = 1;
4457           /* FALLTHROUGH */
4458            
4459           case POSIXA: /* \w or [:punct:] etc. under /a */
4460            
4461           posixa:
4462           /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4463           * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4464           * character is a single byte */
4465            
4466 1235146         if (NEXTCHR_IS_EOS
4467 1235148         || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4468           FLAGS(scan)))))
4469           {
4470           sayNO;
4471           }
4472            
4473           /* Here we are either not in utf8, or we matched a utf8-invariant,
4474           * so the next char is the next byte */
4475 1235148         locinput++;
4476 1016512         break;
4477            
4478           case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4479           to_complement = 1;
4480           /* FALLTHROUGH */
4481            
4482           case POSIXU: /* \w or [:punct:] etc. under /u */
4483           utf8_posix:
4484 218636         if (NEXTCHR_IS_EOS) {
4485           sayNO;
4486           }
4487           utf8_posix_not_eos:
4488            
4489           /* Use _generic_isCC() for characters within Latin1. (Note that
4490           * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4491           * wouldn't be invariant) */
4492 506         if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4493 218130         if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4494           FLAGS(scan)))))
4495           {
4496           sayNO;
4497           }
4498 218130         locinput++;
4499           }
4500 218130         else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4501 208310         if (! (to_complement
4502 218130         ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4503           *(locinput + 1)),
4504           FLAGS(scan)))))
4505           {
4506           sayNO;
4507           }
4508 9820         locinput += 2;
4509           }
4510           else { /* Handle above Latin-1 code points */
4511 860         classnum = (_char_class_number) FLAGS(scan);
4512 860         if (classnum < _FIRST_NON_SWASH_CC) {
4513