line |
stmt |
bran |
cond |
sub |
time |
code |
1
|
|
|
|
|
|
/* regexec.c |
2
|
|
|
|
|
|
*/ |
3
|
|
|
|
|
|
|
4
|
|
|
|
|
|
/* |
5
|
|
|
|
|
|
* One Ring to rule them all, One Ring to find them |
6
|
|
|
|
|
|
& |
7
|
|
|
|
|
|
* [p.v of _The Lord of the Rings_, opening poem] |
8
|
|
|
|
|
|
* [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] |
9
|
|
|
|
|
|
* [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] |
10
|
|
|
|
|
|
*/ |
11
|
|
|
|
|
|
|
12
|
|
|
|
|
|
/* This file contains functions for executing a regular expression. See |
13
|
|
|
|
|
|
* also regcomp.c which funnily enough, contains functions for compiling |
14
|
|
|
|
|
|
* a regular expression. |
15
|
|
|
|
|
|
* |
16
|
|
|
|
|
|
* This file is also copied at build time to ext/re/re_exec.c, where |
17
|
|
|
|
|
|
* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. |
18
|
|
|
|
|
|
* This causes the main functions to be compiled under new names and with |
19
|
|
|
|
|
|
* debugging support added, which makes "use re 'debug'" work. |
20
|
|
|
|
|
|
*/ |
21
|
|
|
|
|
|
|
22
|
|
|
|
|
|
/* NOTE: this is derived from Henry Spencer's regexp code, and should not |
23
|
|
|
|
|
|
* confused with the original package (see point 3 below). Thanks, Henry! |
24
|
|
|
|
|
|
*/ |
25
|
|
|
|
|
|
|
26
|
|
|
|
|
|
/* Additional note: this code is very heavily munged from Henry's version |
27
|
|
|
|
|
|
* in places. In some spots I've traded clarity for efficiency, so don't |
28
|
|
|
|
|
|
* blame Henry for some of the lack of readability. |
29
|
|
|
|
|
|
*/ |
30
|
|
|
|
|
|
|
31
|
|
|
|
|
|
/* The names of the functions have been changed from regcomp and |
32
|
|
|
|
|
|
* regexec to pregcomp and pregexec in order to avoid conflicts |
33
|
|
|
|
|
|
* with the POSIX routines of the same names. |
34
|
|
|
|
|
|
*/ |
35
|
|
|
|
|
|
|
36
|
|
|
|
|
|
#ifdef PERL_EXT_RE_BUILD |
37
|
|
|
|
|
|
#include "re_top.h" |
38
|
|
|
|
|
|
#endif |
39
|
|
|
|
|
|
|
40
|
|
|
|
|
|
/* At least one required character in the target string is expressible only in |
41
|
|
|
|
|
|
* UTF-8. */ |
42
|
|
|
|
|
|
static const char* const non_utf8_target_but_utf8_required |
43
|
|
|
|
|
|
= "Can't match, because target string needs to be in UTF-8\n"; |
44
|
|
|
|
|
|
|
45
|
|
|
|
|
|
#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ |
46
|
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ |
47
|
|
|
|
|
|
goto target; \ |
48
|
|
|
|
|
|
} STMT_END |
49
|
|
|
|
|
|
|
50
|
|
|
|
|
|
/* |
51
|
|
|
|
|
|
* pregcomp and pregexec -- regsub and regerror are not used in perl |
52
|
|
|
|
|
|
* |
53
|
|
|
|
|
|
* Copyright (c) 1986 by University of Toronto. |
54
|
|
|
|
|
|
* Written by Henry Spencer. Not derived from licensed software. |
55
|
|
|
|
|
|
* |
56
|
|
|
|
|
|
* Permission is granted to anyone to use this software for any |
57
|
|
|
|
|
|
* purpose on any computer system, and to redistribute it freely, |
58
|
|
|
|
|
|
* subject to the following restrictions: |
59
|
|
|
|
|
|
* |
60
|
|
|
|
|
|
* 1. The author is not responsible for the consequences of use of |
61
|
|
|
|
|
|
* this software, no matter how awful, even if they arise |
62
|
|
|
|
|
|
* from defects in it. |
63
|
|
|
|
|
|
* |
64
|
|
|
|
|
|
* 2. The origin of this software must not be misrepresented, either |
65
|
|
|
|
|
|
* by explicit claim or by omission. |
66
|
|
|
|
|
|
* |
67
|
|
|
|
|
|
* 3. Altered versions must be plainly marked as such, and must not |
68
|
|
|
|
|
|
* be misrepresented as being the original software. |
69
|
|
|
|
|
|
* |
70
|
|
|
|
|
|
**** Alterations to Henry's code are... |
71
|
|
|
|
|
|
**** |
72
|
|
|
|
|
|
**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, |
73
|
|
|
|
|
|
**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 |
74
|
|
|
|
|
|
**** by Larry Wall and others |
75
|
|
|
|
|
|
**** |
76
|
|
|
|
|
|
**** You may distribute under the terms of either the GNU General Public |
77
|
|
|
|
|
|
**** License or the Artistic License, as specified in the README file. |
78
|
|
|
|
|
|
* |
79
|
|
|
|
|
|
* Beware that some of this code is subtly aware of the way operator |
80
|
|
|
|
|
|
* precedence is structured in regular expressions. Serious changes in |
81
|
|
|
|
|
|
* regular-expression syntax might require a total rethink. |
82
|
|
|
|
|
|
*/ |
83
|
|
|
|
|
|
#include "EXTERN.h" |
84
|
|
|
|
|
|
#define PERL_IN_REGEXEC_C |
85
|
|
|
|
|
|
#include "perl.h" |
86
|
|
|
|
|
|
|
87
|
|
|
|
|
|
#ifdef PERL_IN_XSUB_RE |
88
|
|
|
|
|
|
# include "re_comp.h" |
89
|
|
|
|
|
|
#else |
90
|
|
|
|
|
|
# include "regcomp.h" |
91
|
|
|
|
|
|
#endif |
92
|
|
|
|
|
|
|
93
|
|
|
|
|
|
#include "inline_invlist.c" |
94
|
|
|
|
|
|
#include "unicode_constants.h" |
95
|
|
|
|
|
|
|
96
|
|
|
|
|
|
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) |
97
|
|
|
|
|
|
|
98
|
|
|
|
|
|
#ifndef STATIC |
99
|
|
|
|
|
|
#define STATIC static |
100
|
|
|
|
|
|
#endif |
101
|
|
|
|
|
|
|
102
|
|
|
|
|
|
/* Valid for non-utf8 strings: avoids the reginclass |
103
|
|
|
|
|
|
* call if there are no complications: i.e., if everything matchable is |
104
|
|
|
|
|
|
* straight forward in the bitmap */ |
105
|
|
|
|
|
|
#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \ |
106
|
|
|
|
|
|
: ANYOF_BITMAP_TEST(p,*(c))) |
107
|
|
|
|
|
|
|
108
|
|
|
|
|
|
/* |
109
|
|
|
|
|
|
* Forwards. |
110
|
|
|
|
|
|
*/ |
111
|
|
|
|
|
|
|
112
|
|
|
|
|
|
#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) |
113
|
|
|
|
|
|
#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b) |
114
|
|
|
|
|
|
|
115
|
|
|
|
|
|
#define HOPc(pos,off) \ |
116
|
|
|
|
|
|
(char *)(reginfo->is_utf8_target \ |
117
|
|
|
|
|
|
? reghop3((U8*)pos, off, \ |
118
|
|
|
|
|
|
(U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ |
119
|
|
|
|
|
|
: (U8*)(pos + off)) |
120
|
|
|
|
|
|
#define HOPBACKc(pos, off) \ |
121
|
|
|
|
|
|
(char*)(reginfo->is_utf8_target \ |
122
|
|
|
|
|
|
? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ |
123
|
|
|
|
|
|
: (pos - off >= reginfo->strbeg) \ |
124
|
|
|
|
|
|
? (U8*)pos - off \ |
125
|
|
|
|
|
|
: NULL) |
126
|
|
|
|
|
|
|
127
|
|
|
|
|
|
#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) |
128
|
|
|
|
|
|
#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) |
129
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
131
|
|
|
|
|
|
#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ |
132
|
|
|
|
|
|
#define NEXTCHR_IS_EOS (nextchr < 0) |
133
|
|
|
|
|
|
|
134
|
|
|
|
|
|
#define SET_nextchr \ |
135
|
|
|
|
|
|
nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) |
136
|
|
|
|
|
|
|
137
|
|
|
|
|
|
#define SET_locinput(p) \ |
138
|
|
|
|
|
|
locinput = (p); \ |
139
|
|
|
|
|
|
SET_nextchr |
140
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
142
|
|
|
|
|
|
#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \ |
143
|
|
|
|
|
|
if (!swash_ptr) { \ |
144
|
|
|
|
|
|
U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ |
145
|
|
|
|
|
|
swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ |
146
|
|
|
|
|
|
1, 0, NULL, &flags); \ |
147
|
|
|
|
|
|
assert(swash_ptr); \ |
148
|
|
|
|
|
|
} \ |
149
|
|
|
|
|
|
} STMT_END |
150
|
|
|
|
|
|
|
151
|
|
|
|
|
|
/* If in debug mode, we test that a known character properly matches */ |
152
|
|
|
|
|
|
#ifdef DEBUGGING |
153
|
|
|
|
|
|
# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ |
154
|
|
|
|
|
|
property_name, \ |
155
|
|
|
|
|
|
utf8_char_in_property) \ |
156
|
|
|
|
|
|
LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \ |
157
|
|
|
|
|
|
assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); |
158
|
|
|
|
|
|
#else |
159
|
|
|
|
|
|
# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ |
160
|
|
|
|
|
|
property_name, \ |
161
|
|
|
|
|
|
utf8_char_in_property) \ |
162
|
|
|
|
|
|
LOAD_UTF8_CHARCLASS(swash_ptr, property_name) |
163
|
|
|
|
|
|
#endif |
164
|
|
|
|
|
|
|
165
|
|
|
|
|
|
#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ |
166
|
|
|
|
|
|
PL_utf8_swash_ptrs[_CC_WORDCHAR], \ |
167
|
|
|
|
|
|
swash_property_names[_CC_WORDCHAR], \ |
168
|
|
|
|
|
|
LATIN_CAPITAL_LETTER_SHARP_S_UTF8); |
169
|
|
|
|
|
|
|
170
|
|
|
|
|
|
#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ |
171
|
|
|
|
|
|
STMT_START { \ |
172
|
|
|
|
|
|
LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ |
173
|
|
|
|
|
|
"_X_regular_begin", \ |
174
|
|
|
|
|
|
LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ |
175
|
|
|
|
|
|
LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ |
176
|
|
|
|
|
|
"_X_extend", \ |
177
|
|
|
|
|
|
COMBINING_GRAVE_ACCENT_UTF8); \ |
178
|
|
|
|
|
|
} STMT_END |
179
|
|
|
|
|
|
|
180
|
|
|
|
|
|
#define PLACEHOLDER /* Something for the preprocessor to grab onto */ |
181
|
|
|
|
|
|
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ |
182
|
|
|
|
|
|
|
183
|
|
|
|
|
|
/* for use after a quantifier and before an EXACT-like node -- japhy */ |
184
|
|
|
|
|
|
/* it would be nice to rework regcomp.sym to generate this stuff. sigh |
185
|
|
|
|
|
|
* |
186
|
|
|
|
|
|
* NOTE that *nothing* that affects backtracking should be in here, specifically |
187
|
|
|
|
|
|
* VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a |
188
|
|
|
|
|
|
* node that is in between two EXACT like nodes when ascertaining what the required |
189
|
|
|
|
|
|
* "follow" character is. This should probably be moved to regex compile time |
190
|
|
|
|
|
|
* although it may be done at run time beause of the REF possibility - more |
191
|
|
|
|
|
|
* investigation required. -- demerphq |
192
|
|
|
|
|
|
*/ |
193
|
|
|
|
|
|
#define JUMPABLE(rn) ( \ |
194
|
|
|
|
|
|
OP(rn) == OPEN || \ |
195
|
|
|
|
|
|
(OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ |
196
|
|
|
|
|
|
OP(rn) == EVAL || \ |
197
|
|
|
|
|
|
OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ |
198
|
|
|
|
|
|
OP(rn) == PLUS || OP(rn) == MINMOD || \ |
199
|
|
|
|
|
|
OP(rn) == KEEPS || \ |
200
|
|
|
|
|
|
(PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ |
201
|
|
|
|
|
|
) |
202
|
|
|
|
|
|
#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) |
203
|
|
|
|
|
|
|
204
|
|
|
|
|
|
#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) |
205
|
|
|
|
|
|
|
206
|
|
|
|
|
|
#if 0 |
207
|
|
|
|
|
|
/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so |
208
|
|
|
|
|
|
we don't need this definition. */ |
209
|
|
|
|
|
|
#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) |
210
|
|
|
|
|
|
#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) |
211
|
|
|
|
|
|
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) |
212
|
|
|
|
|
|
|
213
|
|
|
|
|
|
#else |
214
|
|
|
|
|
|
/* ... so we use this as its faster. */ |
215
|
|
|
|
|
|
#define IS_TEXT(rn) ( OP(rn)==EXACT ) |
216
|
|
|
|
|
|
#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) |
217
|
|
|
|
|
|
#define IS_TEXTF(rn) ( OP(rn)==EXACTF ) |
218
|
|
|
|
|
|
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) |
219
|
|
|
|
|
|
|
220
|
|
|
|
|
|
#endif |
221
|
|
|
|
|
|
|
222
|
|
|
|
|
|
/* |
223
|
|
|
|
|
|
Search for mandatory following text node; for lookahead, the text must |
224
|
|
|
|
|
|
follow but for lookbehind (rn->flags != 0) we skip to the next step. |
225
|
|
|
|
|
|
*/ |
226
|
|
|
|
|
|
#define FIND_NEXT_IMPT(rn) STMT_START { \ |
227
|
|
|
|
|
|
while (JUMPABLE(rn)) { \ |
228
|
|
|
|
|
|
const OPCODE type = OP(rn); \ |
229
|
|
|
|
|
|
if (type == SUSPEND || PL_regkind[type] == CURLY) \ |
230
|
|
|
|
|
|
rn = NEXTOPER(NEXTOPER(rn)); \ |
231
|
|
|
|
|
|
else if (type == PLUS) \ |
232
|
|
|
|
|
|
rn = NEXTOPER(rn); \ |
233
|
|
|
|
|
|
else if (type == IFMATCH) \ |
234
|
|
|
|
|
|
rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ |
235
|
|
|
|
|
|
else rn += NEXT_OFF(rn); \ |
236
|
|
|
|
|
|
} \ |
237
|
|
|
|
|
|
} STMT_END |
238
|
|
|
|
|
|
|
239
|
|
|
|
|
|
/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode. |
240
|
|
|
|
|
|
* These are for the pre-composed Hangul syllables, which are all in a |
241
|
|
|
|
|
|
* contiguous block and arranged there in such a way so as to facilitate |
242
|
|
|
|
|
|
* alorithmic determination of their characteristics. As such, they don't need |
243
|
|
|
|
|
|
* a swash, but can be determined by simple arithmetic. Almost all are |
244
|
|
|
|
|
|
* GCB=LVT, but every 28th one is a GCB=LV */ |
245
|
|
|
|
|
|
#define SBASE 0xAC00 /* Start of block */ |
246
|
|
|
|
|
|
#define SCount 11172 /* Length of block */ |
247
|
|
|
|
|
|
#define TCount 28 |
248
|
|
|
|
|
|
|
249
|
|
|
|
|
|
#define SLAB_FIRST(s) (&(s)->states[0]) |
250
|
|
|
|
|
|
#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) |
251
|
|
|
|
|
|
|
252
|
|
|
|
|
|
static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); |
253
|
|
|
|
|
|
static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); |
254
|
|
|
|
|
|
static regmatch_state * S_push_slab(pTHX); |
255
|
|
|
|
|
|
|
256
|
|
|
|
|
|
#define REGCP_PAREN_ELEMS 3 |
257
|
|
|
|
|
|
#define REGCP_OTHER_ELEMS 3 |
258
|
|
|
|
|
|
#define REGCP_FRAME_ELEMS 1 |
259
|
|
|
|
|
|
/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and |
260
|
|
|
|
|
|
* are needed for the regexp context stack bookkeeping. */ |
261
|
|
|
|
|
|
|
262
|
|
|
|
|
|
STATIC CHECKPOINT |
263
|
67399426
|
|
|
|
|
S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) |
264
|
|
|
|
|
|
{ |
265
|
|
|
|
|
|
dVAR; |
266
|
67399426
|
|
|
|
|
const int retval = PL_savestack_ix; |
267
|
67399426
|
|
|
|
|
const int paren_elems_to_push = |
268
|
67399426
|
|
|
|
|
(maxopenparen - parenfloor) * REGCP_PAREN_ELEMS; |
269
|
67399426
|
|
|
|
|
const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; |
270
|
67399426
|
|
|
|
|
const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; |
271
|
|
|
|
|
|
I32 p; |
272
|
67399426
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
273
|
|
|
|
|
|
|
274
|
48
|
|
|
|
|
PERL_ARGS_ASSERT_REGCPPUSH; |
275
|
|
|
|
|
|
|
276
|
67399426
|
|
|
|
|
if (paren_elems_to_push < 0) |
277
|
0
|
|
|
|
|
Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", |
278
|
|
|
|
|
|
paren_elems_to_push); |
279
|
|
|
|
|
|
|
280
|
67399426
|
|
|
|
|
if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) |
281
|
233812454
|
|
|
|
|
Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf |
282
|
|
|
|
|
|
" out of range (%lu-%ld)", |
283
|
|
|
|
|
|
total_elems, |
284
|
|
|
|
|
|
(unsigned long)maxopenparen, |
285
|
|
|
|
|
|
(long)parenfloor); |
286
|
|
|
|
|
|
|
287
|
166413124
|
|
|
|
|
SSGROW(total_elems + REGCP_FRAME_ELEMS); |
288
|
|
|
|
|
|
|
289
|
166413124
|
|
|
|
|
DEBUG_BUFFERS_r( |
290
|
|
|
|
|
|
if ((int)maxopenparen > (int)parenfloor) |
291
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
292
|
|
|
|
|
|
"rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n", |
293
|
|
|
|
|
|
PTR2UV(rex), |
294
|
|
|
|
|
|
PTR2UV(rex->offs) |
295
|
|
|
|
|
|
); |
296
|
|
|
|
|
|
); |
297
|
166413124
|
|
|
|
|
for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { |
298
|
|
|
|
|
|
/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ |
299
|
67399378
|
|
|
|
|
SSPUSHIV(rex->offs[p].end); |
300
|
67399378
|
|
|
|
|
SSPUSHIV(rex->offs[p].start); |
301
|
67399378
|
|
|
|
|
SSPUSHINT(rex->offs[p].start_tmp); |
302
|
67399378
|
|
|
|
|
DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, |
303
|
|
|
|
|
|
" \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", |
304
|
|
|
|
|
|
(UV)p, |
305
|
|
|
|
|
|
(IV)rex->offs[p].start, |
306
|
|
|
|
|
|
(IV)rex->offs[p].start_tmp, |
307
|
|
|
|
|
|
(IV)rex->offs[p].end |
308
|
|
|
|
|
|
)); |
309
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ |
311
|
67399426
|
|
|
|
|
SSPUSHINT(maxopenparen); |
312
|
59469062
|
|
|
|
|
SSPUSHINT(rex->lastparen); |
313
|
59469062
|
|
|
|
|
SSPUSHINT(rex->lastcloseparen); |
314
|
59469062
|
|
|
|
|
SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ |
315
|
|
|
|
|
|
|
316
|
59469062
|
|
|
|
|
return retval; |
317
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
319
|
|
|
|
|
|
/* These are needed since we do not localize EVAL nodes: */ |
320
|
|
|
|
|
|
#define REGCP_SET(cp) \ |
321
|
|
|
|
|
|
DEBUG_STATE_r( \ |
322
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, \ |
323
|
|
|
|
|
|
" Setting an EVAL scope, savestack=%"IVdf"\n", \ |
324
|
|
|
|
|
|
(IV)PL_savestack_ix)); \ |
325
|
|
|
|
|
|
cp = PL_savestack_ix |
326
|
|
|
|
|
|
|
327
|
|
|
|
|
|
#define REGCP_UNWIND(cp) \ |
328
|
|
|
|
|
|
DEBUG_STATE_r( \ |
329
|
|
|
|
|
|
if (cp != PL_savestack_ix) \ |
330
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, \ |
331
|
|
|
|
|
|
" Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \ |
332
|
|
|
|
|
|
(IV)(cp), (IV)PL_savestack_ix)); \ |
333
|
|
|
|
|
|
regcpblow(cp) |
334
|
|
|
|
|
|
|
335
|
|
|
|
|
|
#define UNWIND_PAREN(lp, lcp) \ |
336
|
|
|
|
|
|
for (n = rex->lastparen; n > lp; n--) \ |
337
|
|
|
|
|
|
rex->offs[n].end = -1; \ |
338
|
|
|
|
|
|
rex->lastparen = n; \ |
339
|
|
|
|
|
|
rex->lastcloseparen = lcp; |
340
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
342
|
|
|
|
|
|
STATIC void |
343
|
59469062
|
|
|
|
|
S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) |
344
|
|
|
|
|
|
{ |
345
|
|
|
|
|
|
dVAR; |
346
|
|
|
|
|
|
UV i; |
347
|
|
|
|
|
|
U32 paren; |
348
|
59469062
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
349
|
|
|
|
|
|
|
350
|
59469062
|
|
|
|
|
PERL_ARGS_ASSERT_REGCPPOP; |
351
|
|
|
|
|
|
|
352
|
|
|
|
|
|
/* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ |
353
|
59469062
|
|
|
|
|
i = SSPOPUV; |
354
|
228338814
|
|
|
|
|
assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ |
355
|
168869800
|
|
|
|
|
i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ |
356
|
168869800
|
|
|
|
|
rex->lastcloseparen = SSPOPINT; |
357
|
168869800
|
|
|
|
|
rex->lastparen = SSPOPINT; |
358
|
168869800
|
|
|
|
|
*maxopenparen_p = SSPOPINT; |
359
|
|
|
|
|
|
|
360
|
17893784
|
|
|
|
|
i -= REGCP_OTHER_ELEMS; |
361
|
|
|
|
|
|
/* Now restore the parentheses context. */ |
362
|
168869800
|
|
|
|
|
DEBUG_BUFFERS_r( |
363
|
|
|
|
|
|
if (i || rex->lastparen + 1 <= rex->nparens) |
364
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
365
|
|
|
|
|
|
"rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n", |
366
|
|
|
|
|
|
PTR2UV(rex), |
367
|
|
|
|
|
|
PTR2UV(rex->offs) |
368
|
|
|
|
|
|
); |
369
|
|
|
|
|
|
); |
370
|
471769998
|
|
|
|
|
paren = *maxopenparen_p; |
371
|
412300984
|
|
|
|
|
for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { |
372
|
|
|
|
|
|
SSize_t tmps; |
373
|
261324920
|
|
|
|
|
rex->offs[paren].start_tmp = SSPOPINT; |
374
|
412300936
|
|
|
|
|
rex->offs[paren].start = SSPOPIV; |
375
|
59469014
|
|
|
|
|
tmps = SSPOPIV; |
376
|
1877336
|
|
|
|
|
if (paren <= rex->lastparen) |
377
|
1877336
|
|
|
|
|
rex->offs[paren].end = tmps; |
378
|
1877336
|
|
|
|
|
DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, |
379
|
|
|
|
|
|
" \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n", |
380
|
|
|
|
|
|
(UV)paren, |
381
|
|
|
|
|
|
(IV)rex->offs[paren].start, |
382
|
|
|
|
|
|
(IV)rex->offs[paren].start_tmp, |
383
|
|
|
|
|
|
(IV)rex->offs[paren].end, |
384
|
|
|
|
|
|
(paren > rex->lastparen ? "(skipped)" : "")); |
385
|
|
|
|
|
|
); |
386
|
943578
|
|
|
|
|
paren--; |
387
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
#if 1 |
389
|
|
|
|
|
|
/* It would seem that the similar code in regtry() |
390
|
|
|
|
|
|
* already takes care of this, and in fact it is in |
391
|
|
|
|
|
|
* a better location to since this code can #if 0-ed out |
392
|
|
|
|
|
|
* but the code in regtry() is needed or otherwise tests |
393
|
|
|
|
|
|
* requiring null fields (pat.t#187 and split.t#{13,14} |
394
|
|
|
|
|
|
* (as of patchlevel 7877) will fail. Then again, |
395
|
|
|
|
|
|
* this code seems to be necessary or otherwise |
396
|
|
|
|
|
|
* this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ |
397
|
|
|
|
|
|
* --jhi updated by dapm */ |
398
|
5574
|
|
|
|
|
for (i = rex->lastparen + 1; i <= rex->nparens; i++) { |
399
|
5526
|
|
|
|
|
if (i > *maxopenparen_p) |
400
|
120
|
|
|
|
|
rex->offs[i].start = -1; |
401
|
120
|
|
|
|
|
rex->offs[i].end = -1; |
402
|
0
|
|
|
|
|
DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, |
403
|
|
|
|
|
|
" \\%"UVuf": %s ..-1 undeffing\n", |
404
|
|
|
|
|
|
(UV)i, |
405
|
|
|
|
|
|
(i > *maxopenparen_p) ? "-1" : " " |
406
|
|
|
|
|
|
)); |
407
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
#endif |
409
|
168
|
|
|
|
|
} |
410
|
|
|
|
|
|
|
411
|
|
|
|
|
|
/* restore the parens and associated vars at savestack position ix, |
412
|
|
|
|
|
|
* but without popping the stack */ |
413
|
|
|
|
|
|
|
414
|
|
|
|
|
|
STATIC void |
415
|
|
|
|
|
|
S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p) |
416
|
|
|
|
|
|
{ |
417
|
48
|
|
|
|
|
I32 tmpix = PL_savestack_ix; |
418
|
48
|
|
|
|
|
PL_savestack_ix = ix; |
419
|
168
|
|
|
|
|
regcppop(rex, maxopenparen_p); |
420
|
260
|
|
|
|
|
PL_savestack_ix = tmpix; |
421
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
423
|
|
|
|
|
|
#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ |
424
|
|
|
|
|
|
|
425
|
|
|
|
|
|
STATIC bool |
426
|
120
|
|
|
|
|
S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) |
427
|
|
|
|
|
|
{ |
428
|
|
|
|
|
|
/* Returns a boolean as to whether or not 'character' is a member of the |
429
|
|
|
|
|
|
* Posix character class given by 'classnum' that should be equivalent to a |
430
|
|
|
|
|
|
* value in the typedef '_char_class_number'. |
431
|
|
|
|
|
|
* |
432
|
|
|
|
|
|
* Ideally this could be replaced by a just an array of function pointers |
433
|
|
|
|
|
|
* to the C library functions that implement the macros this calls. |
434
|
|
|
|
|
|
* However, to compile, the precise function signatures are required, and |
435
|
|
|
|
|
|
* these may vary from platform to to platform. To avoid having to figure |
436
|
|
|
|
|
|
* out what those all are on each platform, I (khw) am using this method, |
437
|
|
|
|
|
|
* which adds an extra layer of function call overhead (unless the C |
438
|
|
|
|
|
|
* optimizer strips it away). But we don't particularly care about |
439
|
|
|
|
|
|
* performance with locales anyway. */ |
440
|
|
|
|
|
|
|
441
|
120
|
|
|
|
|
switch ((_char_class_number) classnum) { |
442
|
632
|
|
|
|
|
case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character); |
443
|
120
|
|
|
|
|
case _CC_ENUM_ALPHA: return isALPHA_LC(character); |
444
|
120
|
|
|
|
|
case _CC_ENUM_ASCII: return isASCII_LC(character); |
445
|
2744
|
|
|
|
|
case _CC_ENUM_BLANK: return isBLANK_LC(character); |
446
|
120
|
|
|
|
|
case _CC_ENUM_CASED: return isLOWER_LC(character) |
447
|
798
|
|
|
|
|
|| isUPPER_LC(character); |
448
|
60
|
|
|
|
|
case _CC_ENUM_CNTRL: return isCNTRL_LC(character); |
449
|
2763
|
|
|
|
|
case _CC_ENUM_DIGIT: return isDIGIT_LC(character); |
450
|
224
|
|
|
|
|
case _CC_ENUM_GRAPH: return isGRAPH_LC(character); |
451
|
224
|
|
|
|
|
case _CC_ENUM_LOWER: return isLOWER_LC(character); |
452
|
84
|
|
|
|
|
case _CC_ENUM_PRINT: return isPRINT_LC(character); |
453
|
140
|
|
|
|
|
case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character); |
454
|
56
|
|
|
|
|
case _CC_ENUM_PUNCT: return isPUNCT_LC(character); |
455
|
84
|
|
|
|
|
case _CC_ENUM_SPACE: return isSPACE_LC(character); |
456
|
56
|
|
|
|
|
case _CC_ENUM_UPPER: return isUPPER_LC(character); |
457
|
0
|
|
|
|
|
case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character); |
458
|
0
|
|
|
|
|
case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character); |
459
|
|
|
|
|
|
default: /* VERTSPACE should never occur in locales */ |
460
|
56
|
|
|
|
|
Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum); |
461
|
|
|
|
|
|
} |
462
|
|
|
|
|
|
|
463
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
464
|
|
|
|
|
|
return FALSE; |
465
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
467
|
|
|
|
|
|
STATIC bool |
468
|
28
|
|
|
|
|
S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) |
469
|
|
|
|
|
|
{ |
470
|
|
|
|
|
|
/* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded |
471
|
|
|
|
|
|
* 'character' is a member of the Posix character class given by 'classnum' |
472
|
|
|
|
|
|
* that should be equivalent to a value in the typedef |
473
|
|
|
|
|
|
* '_char_class_number'. |
474
|
|
|
|
|
|
* |
475
|
|
|
|
|
|
* This just calls isFOO_lc on the code point for the character if it is in |
476
|
|
|
|
|
|
* the range 0-255. Outside that range, all characters avoid Unicode |
477
|
|
|
|
|
|
* rules, ignoring any locale. So use the Unicode function if this class |
478
|
|
|
|
|
|
* requires a swash, and use the Unicode macro otherwise. */ |
479
|
|
|
|
|
|
|
480
|
28
|
|
|
|
|
PERL_ARGS_ASSERT_ISFOO_UTF8_LC; |
481
|
|
|
|
|
|
|
482
|
0
|
|
|
|
|
if (UTF8_IS_INVARIANT(*character)) { |
483
|
0
|
|
|
|
|
return isFOO_lc(classnum, *character); |
484
|
|
|
|
|
|
} |
485
|
112
|
|
|
|
|
else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { |
486
|
0
|
|
|
|
|
return isFOO_lc(classnum, |
487
|
|
|
|
|
|
TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); |
488
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
490
|
0
|
|
|
|
|
if (classnum < _FIRST_NON_SWASH_CC) { |
491
|
|
|
|
|
|
|
492
|
|
|
|
|
|
/* Initialize the swash unless done already */ |
493
|
0
|
|
|
|
|
if (! PL_utf8_swash_ptrs[classnum]) { |
494
|
137167339
|
|
|
|
|
U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; |
495
|
137167339
|
|
|
|
|
PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", |
496
|
|
|
|
|
|
swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags); |
497
|
|
|
|
|
|
} |
498
|
|
|
|
|
|
|
499
|
137167339
|
|
|
|
|
return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) |
500
|
|
|
|
|
|
character, |
501
|
|
|
|
|
|
TRUE /* is UTF */ )); |
502
|
|
|
|
|
|
} |
503
|
|
|
|
|
|
|
504
|
137167339
|
|
|
|
|
switch ((_char_class_number) classnum) { |
505
|
|
|
|
|
|
case _CC_ENUM_SPACE: |
506
|
137167339
|
|
|
|
|
case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character); |
507
|
|
|
|
|
|
|
508
|
137166981
|
|
|
|
|
case _CC_ENUM_BLANK: return is_HORIZWS_high(character); |
509
|
137166981
|
|
|
|
|
case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character); |
510
|
137166981
|
|
|
|
|
case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character); |
511
|
|
|
|
|
|
default: return 0; /* Things like CNTRL are always |
512
|
|
|
|
|
|
below 256 */ |
513
|
|
|
|
|
|
} |
514
|
|
|
|
|
|
|
515
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
516
|
|
|
|
|
|
return FALSE; |
517
|
|
|
|
|
|
} |
518
|
|
|
|
|
|
|
519
|
|
|
|
|
|
/* |
520
|
|
|
|
|
|
* pregexec and friends |
521
|
|
|
|
|
|
*/ |
522
|
|
|
|
|
|
|
523
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
524
|
|
|
|
|
|
/* |
525
|
|
|
|
|
|
- pregexec - match a regexp against a string |
526
|
|
|
|
|
|
*/ |
527
|
|
|
|
|
|
I32 |
528
|
|
|
|
|
|
Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, |
529
|
|
|
|
|
|
char *strbeg, SSize_t minend, SV *screamer, U32 nosave) |
530
|
|
|
|
|
|
/* stringarg: the point in the string at which to begin matching */ |
531
|
|
|
|
|
|
/* strend: pointer to null at end of string */ |
532
|
|
|
|
|
|
/* strbeg: real beginning of string */ |
533
|
|
|
|
|
|
/* minend: end of match must be >= minend bytes after stringarg. */ |
534
|
|
|
|
|
|
/* screamer: SV being matched: only used for utf8 flag, pos() etc; string |
535
|
|
|
|
|
|
* itself is accessed via the pointers above */ |
536
|
|
|
|
|
|
/* nosave: For optimizations. */ |
537
|
|
|
|
|
|
{ |
538
|
|
|
|
|
|
PERL_ARGS_ASSERT_PREGEXEC; |
539
|
|
|
|
|
|
|
540
|
|
|
|
|
|
return |
541
|
|
|
|
|
|
regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, |
542
|
|
|
|
|
|
nosave ? 0 : REXEC_COPY_STR); |
543
|
|
|
|
|
|
} |
544
|
|
|
|
|
|
#endif |
545
|
|
|
|
|
|
|
546
|
|
|
|
|
|
/* |
547
|
|
|
|
|
|
* Need to implement the following flags for reg_anch: |
548
|
|
|
|
|
|
* |
549
|
|
|
|
|
|
* USE_INTUIT_NOML - Useful to call re_intuit_start() first |
550
|
|
|
|
|
|
* USE_INTUIT_ML |
551
|
|
|
|
|
|
* INTUIT_AUTORITATIVE_NOML - Can trust a positive answer |
552
|
|
|
|
|
|
* INTUIT_AUTORITATIVE_ML |
553
|
|
|
|
|
|
* INTUIT_ONCE_NOML - Intuit can match in one location only. |
554
|
|
|
|
|
|
* INTUIT_ONCE_ML |
555
|
|
|
|
|
|
* |
556
|
|
|
|
|
|
* Another flag for this function: SECOND_TIME (so that float substrs |
557
|
|
|
|
|
|
* with giant delta may be not rechecked). |
558
|
|
|
|
|
|
*/ |
559
|
|
|
|
|
|
|
560
|
|
|
|
|
|
/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. |
561
|
|
|
|
|
|
Otherwise, only SvCUR(sv) is used to get strbeg. */ |
562
|
|
|
|
|
|
|
563
|
|
|
|
|
|
/* XXXX Some places assume that there is a fixed substring. |
564
|
|
|
|
|
|
An update may be needed if optimizer marks as "INTUITable" |
565
|
|
|
|
|
|
RExen without fixed substrings. Similarly, it is assumed that |
566
|
|
|
|
|
|
lengths of all the strings are no more than minlen, thus they |
567
|
|
|
|
|
|
cannot come from lookahead. |
568
|
|
|
|
|
|
(Or minlen should take into account lookahead.) |
569
|
|
|
|
|
|
NOTE: Some of this comment is not correct. minlen does now take account |
570
|
|
|
|
|
|
of lookahead/behind. Further research is required. -- demerphq |
571
|
|
|
|
|
|
|
572
|
|
|
|
|
|
*/ |
573
|
|
|
|
|
|
|
574
|
|
|
|
|
|
/* A failure to find a constant substring means that there is no need to make |
575
|
|
|
|
|
|
an expensive call to REx engine, thus we celebrate a failure. Similarly, |
576
|
|
|
|
|
|
finding a substring too deep into the string means that fewer calls to |
577
|
|
|
|
|
|
regtry() should be needed. |
578
|
|
|
|
|
|
|
579
|
|
|
|
|
|
REx compiler's optimizer found 4 possible hints: |
580
|
|
|
|
|
|
a) Anchored substring; |
581
|
|
|
|
|
|
b) Fixed substring; |
582
|
|
|
|
|
|
c) Whether we are anchored (beginning-of-line or \G); |
583
|
|
|
|
|
|
d) First node (of those at offset 0) which may distinguish positions; |
584
|
|
|
|
|
|
We use a)b)d) and multiline-part of c), and try to find a position in the |
585
|
|
|
|
|
|
string which does not contradict any of them. |
586
|
|
|
|
|
|
*/ |
587
|
|
|
|
|
|
|
588
|
|
|
|
|
|
/* Most of decisions we do here should have been done at compile time. |
589
|
|
|
|
|
|
The nodes of the REx which we used for the search should have been |
590
|
|
|
|
|
|
deleted from the finite automaton. */ |
591
|
|
|
|
|
|
|
592
|
|
|
|
|
|
/* args: |
593
|
|
|
|
|
|
* rx: the regex to match against |
594
|
|
|
|
|
|
* sv: the SV being matched: only used for utf8 flag; the string |
595
|
|
|
|
|
|
* itself is accessed via the pointers below. Note that on |
596
|
|
|
|
|
|
* something like an overloaded SV, SvPOK(sv) may be false |
597
|
|
|
|
|
|
* and the string pointers may point to something unrelated to |
598
|
|
|
|
|
|
* the SV itself. |
599
|
|
|
|
|
|
* strbeg: real beginning of string |
600
|
|
|
|
|
|
* strpos: the point in the string at which to begin matching |
601
|
|
|
|
|
|
* strend: pointer to the byte following the last char of the string |
602
|
|
|
|
|
|
* flags currently unused; set to 0 |
603
|
|
|
|
|
|
* data: currently unused; set to NULL |
604
|
|
|
|
|
|
*/ |
605
|
|
|
|
|
|
|
606
|
|
|
|
|
|
char * |
607
|
137167073
|
|
|
|
|
Perl_re_intuit_start(pTHX_ |
608
|
|
|
|
|
|
REGEXP * const rx, |
609
|
|
|
|
|
|
SV *sv, |
610
|
|
|
|
|
|
const char * const strbeg, |
611
|
|
|
|
|
|
char *strpos, |
612
|
|
|
|
|
|
char *strend, |
613
|
|
|
|
|
|
const U32 flags, |
614
|
|
|
|
|
|
re_scream_pos_data *data) |
615
|
|
|
|
|
|
{ |
616
|
|
|
|
|
|
dVAR; |
617
|
137167073
|
|
|
|
|
struct regexp *const prog = ReANY(rx); |
618
|
|
|
|
|
|
SSize_t start_shift = 0; |
619
|
|
|
|
|
|
/* Should be nonnegative! */ |
620
|
|
|
|
|
|
SSize_t end_shift = 0; |
621
|
|
|
|
|
|
char *s; |
622
|
|
|
|
|
|
SV *check; |
623
|
|
|
|
|
|
char *t; |
624
|
137167073
|
|
|
|
|
const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ |
625
|
|
|
|
|
|
I32 ml_anch; |
626
|
|
|
|
|
|
char *other_last = NULL; /* other substr checked before this */ |
627
|
|
|
|
|
|
char *check_at = NULL; /* check substr found at this pos */ |
628
|
|
|
|
|
|
char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ |
629
|
137167073
|
|
|
|
|
const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; |
630
|
137167073
|
|
|
|
|
RXi_GET_DECL(prog,progi); |
631
|
|
|
|
|
|
regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ |
632
|
|
|
|
|
|
regmatch_info *const reginfo = ®info_buf; |
633
|
|
|
|
|
|
#ifdef DEBUGGING |
634
|
|
|
|
|
|
const char * const i_strpos = strpos; |
635
|
|
|
|
|
|
#endif |
636
|
1143398
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
637
|
|
|
|
|
|
|
638
|
495102
|
|
|
|
|
PERL_ARGS_ASSERT_RE_INTUIT_START; |
639
|
|
|
|
|
|
PERL_UNUSED_ARG(flags); |
640
|
|
|
|
|
|
PERL_UNUSED_ARG(data); |
641
|
|
|
|
|
|
|
642
|
|
|
|
|
|
/* CHR_DIST() would be more correct here but it makes things slow. */ |
643
|
1143398
|
|
|
|
|
if (prog->minlen > strend - strpos) { |
644
|
136023675
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
645
|
|
|
|
|
|
"String too short... [re_intuit_start]\n")); |
646
|
|
|
|
|
|
goto fail; |
647
|
|
|
|
|
|
} |
648
|
|
|
|
|
|
|
649
|
106664
|
|
|
|
|
reginfo->is_utf8_target = cBOOL(utf8_target); |
650
|
136023739
|
|
|
|
|
reginfo->info_aux = NULL; |
651
|
137167045
|
|
|
|
|
reginfo->strbeg = strbeg; |
652
|
72855894
|
|
|
|
|
reginfo->strend = strend; |
653
|
106369435
|
|
|
|
|
reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); |
654
|
70019930
|
|
|
|
|
reginfo->intuit = 1; |
655
|
|
|
|
|
|
/* not actually used within intuit, but zero for safety anyway */ |
656
|
70019930
|
|
|
|
|
reginfo->poscache_maxiter = 0; |
657
|
|
|
|
|
|
|
658
|
71770960
|
|
|
|
|
if (utf8_target) { |
659
|
95011864
|
|
|
|
|
if (!prog->check_utf8 && prog->check_substr) |
660
|
63616418
|
|
|
|
|
to_utf8_substr(prog); |
661
|
63616278
|
|
|
|
|
check = prog->check_utf8; |
662
|
|
|
|
|
|
} else { |
663
|
39943195
|
|
|
|
|
if (!prog->check_substr && prog->check_utf8) { |
664
|
39943131
|
|
|
|
|
if (! to_byte_substr(prog)) { |
665
|
39942795
|
|
|
|
|
NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); |
666
|
|
|
|
|
|
} |
667
|
|
|
|
|
|
} |
668
|
39942831
|
|
|
|
|
check = prog->check_substr; |
669
|
|
|
|
|
|
} |
670
|
416596
|
|
|
|
|
if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */ |
671
|
416532
|
|
|
|
|
&& !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */ |
672
|
|
|
|
|
|
{ |
673
|
74482
|
|
|
|
|
ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) |
674
|
70404
|
|
|
|
|
|| ( (prog->extflags & RXf_ANCH_BOL) |
675
|
70404
|
|
|
|
|
&& !multiline ) ); /* Check after \n? */ |
676
|
|
|
|
|
|
|
677
|
63438
|
|
|
|
|
if (!ml_anch) { |
678
|
10906
|
|
|
|
|
if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ |
679
|
39526235
|
|
|
|
|
&& (strpos != strbeg)) { |
680
|
7547081
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); |
681
|
|
|
|
|
|
goto fail; |
682
|
|
|
|
|
|
} |
683
|
6422740
|
|
|
|
|
if (prog->check_offset_min == prog->check_offset_max |
684
|
31827961
|
|
|
|
|
&& !(prog->extflags & RXf_CANY_SEEN) |
685
|
31827961
|
|
|
|
|
&& ! multiline) /* /m can cause \n's to match that aren't |
686
|
|
|
|
|
|
accounted for in the string max length. |
687
|
|
|
|
|
|
See [perl #115242] */ |
688
|
|
|
|
|
|
{ |
689
|
|
|
|
|
|
/* Substring at constant offset from beg-of-str... */ |
690
|
|
|
|
|
|
SSize_t slen; |
691
|
|
|
|
|
|
|
692
|
31827961
|
|
|
|
|
s = HOP3c(strpos, prog->check_offset_min, strend); |
693
|
|
|
|
|
|
|
694
|
35137806
|
|
|
|
|
if (SvTAIL(check)) { |
695
|
23673511
|
|
|
|
|
slen = SvCUR(check); /* >= 1 */ |
696
|
|
|
|
|
|
|
697
|
23673511
|
|
|
|
|
if ( strend - s > slen || strend - s < slen - 1 |
698
|
23673511
|
|
|
|
|
|| (strend - s == slen && strend[-1] != '\n')) { |
699
|
65396085
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); |
700
|
|
|
|
|
|
goto fail_finish; |
701
|
|
|
|
|
|
} |
702
|
|
|
|
|
|
/* Now should match s[0..slen-2] */ |
703
|
65706360
|
|
|
|
|
slen--; |
704
|
98104332
|
|
|
|
|
if (slen && (*SvPVX_const(check) != *s |
705
|
0
|
|
|
|
|
|| (slen > 1 |
706
|
0
|
|
|
|
|
&& memNE(SvPVX_const(check), s, slen)))) { |
707
|
|
|
|
|
|
report_neq: |
708
|
98104332
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); |
709
|
|
|
|
|
|
goto fail_finish; |
710
|
|
|
|
|
|
} |
711
|
|
|
|
|
|
} |
712
|
62
|
|
|
|
|
else if (*SvPVX_const(check) != *s |
713
|
62
|
|
|
|
|
|| ((slen = SvCUR(check)) > 1 |
714
|
98104270
|
|
|
|
|
&& memNE(SvPVX_const(check), s, slen))) |
715
|
|
|
|
|
|
goto report_neq; |
716
|
|
|
|
|
|
check_at = s; |
717
|
|
|
|
|
|
goto success_at_start; |
718
|
|
|
|
|
|
} |
719
|
|
|
|
|
|
} |
720
|
|
|
|
|
|
/* Match is anchored, but substr is not anchored wrt beg-of-str. */ |
721
|
|
|
|
|
|
s = strpos; |
722
|
98104270
|
|
|
|
|
start_shift = prog->check_offset_min; /* okay to underestimate on CC */ |
723
|
98104332
|
|
|
|
|
end_shift = prog->check_end_shift; |
724
|
|
|
|
|
|
|
725
|
98104332
|
|
|
|
|
if (!ml_anch) { |
726
|
84090637
|
|
|
|
|
const SSize_t end = prog->check_offset_max + CHR_SVLEN(check) |
727
|
46088044
|
|
|
|
|
- (SvTAIL(check) != 0); |
728
|
5113531
|
|
|
|
|
const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end; |
729
|
|
|
|
|
|
|
730
|
5113531
|
|
|
|
|
if (end_shift < eshift) |
731
|
|
|
|
|
|
end_shift = eshift; |
732
|
|
|
|
|
|
} |
733
|
|
|
|
|
|
} |
734
|
|
|
|
|
|
else { /* Can match at random position */ |
735
|
|
|
|
|
|
ml_anch = 0; |
736
|
|
|
|
|
|
s = strpos; |
737
|
1764544
|
|
|
|
|
start_shift = prog->check_offset_min; /* okay to underestimate on CC */ |
738
|
1764544
|
|
|
|
|
end_shift = prog->check_end_shift; |
739
|
|
|
|
|
|
|
740
|
|
|
|
|
|
/* end shift should be non negative here */ |
741
|
|
|
|
|
|
} |
742
|
|
|
|
|
|
|
743
|
|
|
|
|
|
#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ |
744
|
|
|
|
|
|
if (end_shift < 0) |
745
|
|
|
|
|
|
Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", |
746
|
|
|
|
|
|
(IV)end_shift, RX_PRECOMP(prog)); |
747
|
|
|
|
|
|
#endif |
748
|
|
|
|
|
|
|
749
|
|
|
|
|
|
restart: |
750
|
|
|
|
|
|
/* Find a possible match in the region s..strend by looking for |
751
|
|
|
|
|
|
the "check" substring in the region corrected by start/end_shift. */ |
752
|
|
|
|
|
|
|
753
|
|
|
|
|
|
{ |
754
|
|
|
|
|
|
SSize_t srch_start_shift = start_shift; |
755
|
|
|
|
|
|
SSize_t srch_end_shift = end_shift; |
756
|
|
|
|
|
|
U8* start_point; |
757
|
|
|
|
|
|
U8* end_point; |
758
|
1764544
|
|
|
|
|
if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { |
759
|
43140
|
|
|
|
|
srch_end_shift -= ((strbeg - s) - srch_start_shift); |
760
|
15764
|
|
|
|
|
srch_start_shift = strbeg - s; |
761
|
|
|
|
|
|
} |
762
|
873639
|
|
|
|
|
DEBUG_OPTIMISE_MORE_r({ |
763
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", |
764
|
|
|
|
|
|
(IV)prog->check_offset_min, |
765
|
|
|
|
|
|
(IV)srch_start_shift, |
766
|
|
|
|
|
|
(IV)srch_end_shift, |
767
|
|
|
|
|
|
(IV)prog->check_end_shift); |
768
|
|
|
|
|
|
}); |
769
|
|
|
|
|
|
|
770
|
1764544
|
|
|
|
|
if (prog->extflags & RXf_CANY_SEEN) { |
771
|
1764480
|
|
|
|
|
start_point= (U8*)(s + srch_start_shift); |
772
|
1764480
|
|
|
|
|
end_point= (U8*)(strend - srch_end_shift); |
773
|
|
|
|
|
|
} else { |
774
|
1764544
|
|
|
|
|
start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); |
775
|
1764544
|
|
|
|
|
end_point= HOP3(strend, -srch_end_shift, strbeg); |
776
|
|
|
|
|
|
} |
777
|
1764544
|
|
|
|
|
DEBUG_OPTIMISE_MORE_r({ |
778
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", |
779
|
|
|
|
|
|
(int)(end_point - start_point), |
780
|
|
|
|
|
|
(int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), |
781
|
|
|
|
|
|
start_point); |
782
|
|
|
|
|
|
}); |
783
|
|
|
|
|
|
|
784
|
1764544
|
|
|
|
|
s = fbm_instr( start_point, end_point, |
785
|
|
|
|
|
|
check, multiline ? FBMrf_MULTILINE : 0); |
786
|
|
|
|
|
|
} |
787
|
|
|
|
|
|
/* Update the count-of-usability, remove useless subpatterns, |
788
|
|
|
|
|
|
unshift s. */ |
789
|
|
|
|
|
|
|
790
|
1764544
|
|
|
|
|
DEBUG_EXECUTE_r({ |
791
|
|
|
|
|
|
RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), |
792
|
|
|
|
|
|
SvPVX_const(check), RE_SV_DUMPLEN(check), 30); |
793
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", |
794
|
|
|
|
|
|
(s ? "Found" : "Did not find"), |
795
|
|
|
|
|
|
(check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) |
796
|
|
|
|
|
|
? "anchored" : "floating"), |
797
|
|
|
|
|
|
quoted, |
798
|
|
|
|
|
|
RE_SV_TAIL(check), |
799
|
|
|
|
|
|
(s ? " at offset " : "...\n") ); |
800
|
|
|
|
|
|
}); |
801
|
|
|
|
|
|
|
802
|
160862
|
|
|
|
|
if (!s) |
803
|
|
|
|
|
|
goto fail_finish; |
804
|
|
|
|
|
|
/* Finish the diagnostic message */ |
805
|
142276
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); |
806
|
|
|
|
|
|
|
807
|
|
|
|
|
|
/* XXX dmq: first branch is for positive lookbehind... |
808
|
|
|
|
|
|
Our check string is offset from the beginning of the pattern. |
809
|
|
|
|
|
|
So we need to do any stclass tests offset forward from that |
810
|
|
|
|
|
|
point. I think. :-( |
811
|
|
|
|
|
|
*/ |
812
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
815
|
|
|
|
|
|
check_at=s; |
816
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
818
|
|
|
|
|
|
/* Got a candidate. Check MBOL anchoring, and the *other* substr. |
819
|
|
|
|
|
|
Start with the other substr. |
820
|
|
|
|
|
|
XXXX no SCREAM optimization yet - and a very coarse implementation |
821
|
|
|
|
|
|
XXXX /ttx+/ results in anchored="ttx", floating="x". floating will |
822
|
|
|
|
|
|
*always* match. Probably should be marked during compile... |
823
|
|
|
|
|
|
Probably it is right to do no SCREAM here... |
824
|
|
|
|
|
|
*/ |
825
|
|
|
|
|
|
|
826
|
49209943
|
|
|
|
|
if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8) |
827
|
1603750
|
|
|
|
|
: (prog->float_substr && prog->anchored_substr)) |
828
|
|
|
|
|
|
{ |
829
|
|
|
|
|
|
/* Take into account the "other" substring. */ |
830
|
|
|
|
|
|
/* XXXX May be hopelessly wrong for UTF... */ |
831
|
1603684
|
|
|
|
|
if (!other_last) |
832
|
|
|
|
|
|
other_last = strpos; |
833
|
1603684
|
|
|
|
|
if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) { |
834
|
|
|
|
|
|
do_other_anchored: |
835
|
|
|
|
|
|
{ |
836
|
3353297
|
|
|
|
|
char * const last = HOP3c(s, -start_shift, strbeg); |
837
|
|
|
|
|
|
char *last1, *last2; |
838
|
|
|
|
|
|
char * const saved_s = s; |
839
|
|
|
|
|
|
SV* must; |
840
|
|
|
|
|
|
|
841
|
3353297
|
|
|
|
|
t = s - prog->check_offset_max; |
842
|
3353297
|
|
|
|
|
if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ |
843
|
868338
|
|
|
|
|
&& (!utf8_target |
844
|
3353295
|
|
|
|
|
|| ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) |
845
|
3353295
|
|
|
|
|
&& t > strpos))) |
846
|
|
|
|
|
|
NOOP; |
847
|
|
|
|
|
|
else |
848
|
|
|
|
|
|
t = strpos; |
849
|
3353297
|
|
|
|
|
t = HOP3c(t, prog->anchored_offset, strend); |
850
|
3353297
|
|
|
|
|
if (t < other_last) /* These positions already checked */ |
851
|
|
|
|
|
|
t = other_last; |
852
|
3353297
|
|
|
|
|
last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); |
853
|
3353297
|
|
|
|
|
if (last < last1) |
854
|
|
|
|
|
|
last1 = last; |
855
|
|
|
|
|
|
/* XXXX It is not documented what units *_offsets are in. |
856
|
|
|
|
|
|
We assume bytes, but this is clearly wrong. |
857
|
|
|
|
|
|
Meaning this code needs to be carefully reviewed for errors. |
858
|
|
|
|
|
|
dmq. |
859
|
|
|
|
|
|
*/ |
860
|
|
|
|
|
|
|
861
|
|
|
|
|
|
/* On end-of-str: see comment below. */ |
862
|
630018
|
|
|
|
|
must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; |
863
|
620552
|
|
|
|
|
if (must == &PL_sv_undef) { |
864
|
|
|
|
|
|
s = (char*)NULL; |
865
|
2723279
|
|
|
|
|
DEBUG_r(must = prog->anchored_utf8); /* for debug */ |
866
|
|
|
|
|
|
} |
867
|
|
|
|
|
|
else |
868
|
29032180
|
|
|
|
|
s = fbm_instr( |
869
|
|
|
|
|
|
(unsigned char*)t, |
870
|
|
|
|
|
|
HOP3(HOP3(last1, prog->anchored_offset, strend) |
871
|
|
|
|
|
|
+ SvCUR(must), -(SvTAIL(must)!=0), strbeg), |
872
|
|
|
|
|
|
must, |
873
|
|
|
|
|
|
multiline ? FBMrf_MULTILINE : 0 |
874
|
|
|
|
|
|
); |
875
|
29032180
|
|
|
|
|
DEBUG_EXECUTE_r({ |
876
|
|
|
|
|
|
RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), |
877
|
|
|
|
|
|
SvPVX_const(must), RE_SV_DUMPLEN(must), 30); |
878
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", |
879
|
|
|
|
|
|
(s ? "Found" : "Contradicts"), |
880
|
|
|
|
|
|
quoted, RE_SV_TAIL(must)); |
881
|
|
|
|
|
|
}); |
882
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
884
|
5812949
|
|
|
|
|
if (!s) { |
885
|
83286
|
|
|
|
|
if (last1 >= last2) { |
886
|
71884
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
887
|
|
|
|
|
|
", giving up...\n")); |
888
|
|
|
|
|
|
goto fail_finish; |
889
|
|
|
|
|
|
} |
890
|
9041640
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
891
|
|
|
|
|
|
", trying floating at offset %ld...\n", |
892
|
|
|
|
|
|
(long)(HOP3c(saved_s, 1, strend) - i_strpos))); |
893
|
11988574
|
|
|
|
|
other_last = HOP3c(last1, prog->anchored_offset+1, strend); |
894
|
11986670
|
|
|
|
|
s = HOP3c(last, 1, strend); |
895
|
|
|
|
|
|
goto restart; |
896
|
|
|
|
|
|
} |
897
|
|
|
|
|
|
else { |
898
|
446794
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", |
899
|
|
|
|
|
|
(long)(s - i_strpos))); |
900
|
365348
|
|
|
|
|
t = HOP3c(s, -prog->anchored_offset, strbeg); |
901
|
36
|
|
|
|
|
other_last = HOP3c(s, 1, strend); |
902
|
|
|
|
|
|
s = saved_s; |
903
|
36
|
|
|
|
|
if (t == strpos) |
904
|
|
|
|
|
|
goto try_at_start; |
905
|
|
|
|
|
|
goto try_at_offset; |
906
|
|
|
|
|
|
} |
907
|
|
|
|
|
|
} |
908
|
|
|
|
|
|
} |
909
|
|
|
|
|
|
else { /* Take into account the floating substring. */ |
910
|
|
|
|
|
|
char *last, *last1; |
911
|
|
|
|
|
|
char * const saved_s = s; |
912
|
|
|
|
|
|
SV* must; |
913
|
|
|
|
|
|
|
914
|
365312
|
|
|
|
|
t = HOP3c(s, -start_shift, strbeg); |
915
|
|
|
|
|
|
last1 = last = |
916
|
365312
|
|
|
|
|
HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); |
917
|
81446
|
|
|
|
|
if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) |
918
|
81446
|
|
|
|
|
last = HOP3c(t, prog->float_max_offset, strend); |
919
|
11539878
|
|
|
|
|
s = HOP3c(t, prog->float_min_offset, strend); |
920
|
9354398
|
|
|
|
|
if (s < other_last) |
921
|
|
|
|
|
|
s = other_last; |
922
|
|
|
|
|
|
/* XXXX It is not documented what units *_offsets are in. Assume bytes. */ |
923
|
24371921
|
|
|
|
|
must = utf8_target ? prog->float_utf8 : prog->float_substr; |
924
|
|
|
|
|
|
/* fbm_instr() takes into account exact value of end-of-str |
925
|
|
|
|
|
|
if the check is SvTAIL(ed). Since false positives are OK, |
926
|
|
|
|
|
|
and end-of-str is not later than strend we are OK. */ |
927
|
441177
|
|
|
|
|
if (must == &PL_sv_undef) { |
928
|
|
|
|
|
|
s = (char*)NULL; |
929
|
29159732
|
|
|
|
|
DEBUG_r(must = prog->float_utf8); /* for debug message */ |
930
|
|
|
|
|
|
} |
931
|
|
|
|
|
|
else |
932
|
19063114
|
|
|
|
|
s = fbm_instr((unsigned char*)s, |
933
|
|
|
|
|
|
(unsigned char*)last + SvCUR(must) |
934
|
|
|
|
|
|
- (SvTAIL(must)!=0), |
935
|
|
|
|
|
|
must, multiline ? FBMrf_MULTILINE : 0); |
936
|
333948
|
|
|
|
|
DEBUG_EXECUTE_r({ |
937
|
|
|
|
|
|
RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), |
938
|
|
|
|
|
|
SvPVX_const(must), RE_SV_DUMPLEN(must), 30); |
939
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", |
940
|
|
|
|
|
|
(s ? "Found" : "Contradicts"), |
941
|
|
|
|
|
|
quoted, RE_SV_TAIL(must)); |
942
|
|
|
|
|
|
}); |
943
|
333948
|
|
|
|
|
if (!s) { |
944
|
930
|
|
|
|
|
if (last1 == last) { |
945
|
9197609
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
946
|
|
|
|
|
|
", giving up...\n")); |
947
|
|
|
|
|
|
goto fail_finish; |
948
|
|
|
|
|
|
} |
949
|
9197609
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
950
|
|
|
|
|
|
", trying anchored starting at offset %ld...\n", |
951
|
|
|
|
|
|
(long)(saved_s + 1 - i_strpos))); |
952
|
|
|
|
|
|
other_last = last; |
953
|
1903176
|
|
|
|
|
s = HOP3c(t, 1, strend); |
954
|
|
|
|
|
|
goto restart; |
955
|
|
|
|
|
|
} |
956
|
|
|
|
|
|
else { |
957
|
194602
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", |
958
|
|
|
|
|
|
(long)(s - i_strpos))); |
959
|
|
|
|
|
|
other_last = s; /* Fix this later. --Hugo */ |
960
|
|
|
|
|
|
s = saved_s; |
961
|
194602
|
|
|
|
|
if (t == strpos) |
962
|
|
|
|
|
|
goto try_at_start; |
963
|
|
|
|
|
|
goto try_at_offset; |
964
|
|
|
|
|
|
} |
965
|
|
|
|
|
|
} |
966
|
|
|
|
|
|
} |
967
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
969
|
194664
|
|
|
|
|
t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); |
970
|
|
|
|
|
|
|
971
|
194664
|
|
|
|
|
DEBUG_OPTIMISE_MORE_r( |
972
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
973
|
|
|
|
|
|
"Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", |
974
|
|
|
|
|
|
(IV)prog->check_offset_min, |
975
|
|
|
|
|
|
(IV)prog->check_offset_max, |
976
|
|
|
|
|
|
(IV)(s-strpos), |
977
|
|
|
|
|
|
(IV)(t-strpos), |
978
|
|
|
|
|
|
(IV)(t-s), |
979
|
|
|
|
|
|
(IV)(strend-strpos) |
980
|
|
|
|
|
|
) |
981
|
|
|
|
|
|
); |
982
|
|
|
|
|
|
|
983
|
194664
|
|
|
|
|
if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ |
984
|
48
|
|
|
|
|
&& (!utf8_target |
985
|
14874731
|
|
|
|
|
|| ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) |
986
|
38514130
|
|
|
|
|
&& t > strpos))) |
987
|
|
|
|
|
|
{ |
988
|
|
|
|
|
|
/* Fixed substring is found far enough so that the match |
989
|
|
|
|
|
|
cannot start at strpos. */ |
990
|
|
|
|
|
|
try_at_offset: |
991
|
4936068
|
|
|
|
|
if (ml_anch && t[-1] != '\n') { |
992
|
|
|
|
|
|
/* Eventually fbm_*() should handle this, but often |
993
|
|
|
|
|
|
anchored_offset is not 0, so this check will not be wasted. */ |
994
|
|
|
|
|
|
/* XXXX In the code below we prefer to look for "^" even in |
995
|
|
|
|
|
|
presence of anchored substrings. And we search even |
996
|
|
|
|
|
|
beyond the found float position. These pessimizations |
997
|
|
|
|
|
|
are historical artefacts only. */ |
998
|
|
|
|
|
|
find_anchor: |
999
|
5101775
|
|
|
|
|
while (t < strend - prog->minlen) { |
1000
|
131458
|
|
|
|
|
if (*t == '\n') { |
1001
|
68510
|
|
|
|
|
if (t < check_at - prog->check_offset_min) { |
1002
|
62948
|
|
|
|
|
if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { |
1003
|
|
|
|
|
|
/* Since we moved from the found position, |
1004
|
|
|
|
|
|
we definitely contradict the found anchored |
1005
|
|
|
|
|
|
substr. Due to the above check we do not |
1006
|
|
|
|
|
|
contradict "check" substr. |
1007
|
|
|
|
|
|
Thus we can arrive here only if check substr |
1008
|
|
|
|
|
|
is float. Redo checking for "other"=="fixed". |
1009
|
|
|
|
|
|
*/ |
1010
|
4936062
|
|
|
|
|
strpos = t + 1; |
1011
|
253464
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", |
1012
|
|
|
|
|
|
PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); |
1013
|
|
|
|
|
|
goto do_other_anchored; |
1014
|
|
|
|
|
|
} |
1015
|
|
|
|
|
|
/* We don't contradict the found floating substring. */ |
1016
|
|
|
|
|
|
/* XXXX Why not check for STCLASS? */ |
1017
|
4682598
|
|
|
|
|
s = t + 1; |
1018
|
4644528
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", |
1019
|
|
|
|
|
|
PL_colors[0], PL_colors[1], (long)(s - i_strpos))); |
1020
|
|
|
|
|
|
goto set_useful; |
1021
|
|
|
|
|
|
} |
1022
|
|
|
|
|
|
/* Position contradicts check-string */ |
1023
|
|
|
|
|
|
/* XXXX probably better to look for check-string |
1024
|
|
|
|
|
|
than for "\n", so one should lower the limit for t? */ |
1025
|
4936062
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", |
1026
|
|
|
|
|
|
PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos))); |
1027
|
4936062
|
|
|
|
|
other_last = strpos = s = t + 1; |
1028
|
4936062
|
|
|
|
|
goto restart; |
1029
|
|
|
|
|
|
} |
1030
|
295992
|
|
|
|
|
t++; |
1031
|
|
|
|
|
|
} |
1032
|
194332
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", |
1033
|
|
|
|
|
|
PL_colors[0], PL_colors[1])); |
1034
|
|
|
|
|
|
goto fail_finish; |
1035
|
|
|
|
|
|
} |
1036
|
|
|
|
|
|
else { |
1037
|
178632
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", |
1038
|
|
|
|
|
|
PL_colors[0], PL_colors[1])); |
1039
|
|
|
|
|
|
} |
1040
|
|
|
|
|
|
s = t; |
1041
|
|
|
|
|
|
set_useful: |
1042
|
178632
|
|
|
|
|
++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ |
1043
|
|
|
|
|
|
} |
1044
|
|
|
|
|
|
else { |
1045
|
|
|
|
|
|
/* The found string does not prohibit matching at strpos, |
1046
|
|
|
|
|
|
- no optimization of calling REx engine can be performed, |
1047
|
|
|
|
|
|
unless it was an MBOL and we are not after MBOL, |
1048
|
|
|
|
|
|
or a future STCLASS check will fail this. */ |
1049
|
|
|
|
|
|
try_at_start: |
1050
|
|
|
|
|
|
/* Even in this situation we may use MBOL flag if strpos is offset |
1051
|
|
|
|
|
|
wrt the start of the string. */ |
1052
|
12898
|
|
|
|
|
if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n' |
1053
|
|
|
|
|
|
/* May be due to an implicit anchor of m{.*foo} */ |
1054
|
119994
|
|
|
|
|
&& !(prog->intflags & PREGf_IMPLICIT)) |
1055
|
|
|
|
|
|
{ |
1056
|
|
|
|
|
|
t = strpos; |
1057
|
|
|
|
|
|
goto find_anchor; |
1058
|
|
|
|
|
|
} |
1059
|
120052
|
|
|
|
|
DEBUG_EXECUTE_r( if (ml_anch) |
1060
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", |
1061
|
|
|
|
|
|
(long)(strpos - i_strpos), PL_colors[0], PL_colors[1]); |
1062
|
|
|
|
|
|
); |
1063
|
|
|
|
|
|
success_at_start: |
1064
|
36136
|
|
|
|
|
if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ |
1065
|
4424
|
|
|
|
|
&& (utf8_target ? ( |
1066
|
4210
|
|
|
|
|
prog->check_utf8 /* Could be deleted already */ |
1067
|
165786
|
|
|
|
|
&& --BmUSEFUL(prog->check_utf8) < 0 |
1068
|
54422
|
|
|
|
|
&& (prog->check_utf8 == prog->float_utf8) |
1069
|
|
|
|
|
|
) : ( |
1070
|
54480
|
|
|
|
|
prog->check_substr /* Could be deleted already */ |
1071
|
111422
|
|
|
|
|
&& --BmUSEFUL(prog->check_substr) < 0 |
1072
|
111462
|
|
|
|
|
&& (prog->check_substr == prog->float_substr) |
1073
|
|
|
|
|
|
))) |
1074
|
|
|
|
|
|
{ |
1075
|
|
|
|
|
|
/* If flags & SOMETHING - do not do it many times on the same match */ |
1076
|
111462
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); |
1077
|
|
|
|
|
|
/* XXX Does the destruction order has to change with utf8_target? */ |
1078
|
38218138
|
|
|
|
|
SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); |
1079
|
98747393
|
|
|
|
|
SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); |
1080
|
118156009
|
|
|
|
|
prog->check_substr = prog->check_utf8 = NULL; /* disable */ |
1081
|
26704655
|
|
|
|
|
prog->float_substr = prog->float_utf8 = NULL; /* clear */ |
1082
|
|
|
|
|
|
check = NULL; /* abort */ |
1083
|
|
|
|
|
|
s = strpos; |
1084
|
|
|
|
|
|
/* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag |
1085
|
|
|
|
|
|
see http://bugs.activestate.com/show_bug.cgi?id=87173 */ |
1086
|
26704655
|
|
|
|
|
if (prog->intflags & PREGf_IMPLICIT) |
1087
|
26704655
|
|
|
|
|
prog->extflags &= ~RXf_ANCH_MBOL; |
1088
|
|
|
|
|
|
/* XXXX This is a remnant of the old implementation. It |
1089
|
|
|
|
|
|
looks wasteful, since now INTUIT can use many |
1090
|
|
|
|
|
|
other heuristics. */ |
1091
|
26704655
|
|
|
|
|
prog->extflags &= ~RXf_USE_INTUIT; |
1092
|
|
|
|
|
|
/* XXXX What other flags might need to be cleared in this branch? */ |
1093
|
|
|
|
|
|
} |
1094
|
|
|
|
|
|
else |
1095
|
|
|
|
|
|
s = strpos; |
1096
|
|
|
|
|
|
} |
1097
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
/* Last resort... */ |
1099
|
|
|
|
|
|
/* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ |
1100
|
|
|
|
|
|
/* trie stclasses are too expensive to use here, we are better off to |
1101
|
|
|
|
|
|
leave it to regmatch itself */ |
1102
|
26704719
|
|
|
|
|
if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { |
1103
|
|
|
|
|
|
/* minlen == 0 is possible if regstclass is \b or \B, |
1104
|
|
|
|
|
|
and the fixed substr is ''$. |
1105
|
|
|
|
|
|
Since minlen is already taken into account, s+1 is before strend; |
1106
|
|
|
|
|
|
accidentally, minlen >= 1 guaranties no false positives at s + 1 |
1107
|
|
|
|
|
|
even for \b or \B. But (minlen? 1 : 0) below assumes that |
1108
|
|
|
|
|
|
regstclass does not come from lookahead... */ |
1109
|
|
|
|
|
|
/* If regstclass takes bytelength more than 1: If charlength==1, OK. |
1110
|
|
|
|
|
|
This leaves EXACTF-ish only, which are dealt with in find_byclass(). */ |
1111
|
26704655
|
|
|
|
|
const U8* const str = (U8*)STRING(progi->regstclass); |
1112
|
|
|
|
|
|
/* XXX this value could be pre-computed */ |
1113
|
15725172
|
|
|
|
|
const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT |
1114
|
1134185
|
|
|
|
|
? (reginfo->is_utf8_pat |
1115
|
87529661
|
|
|
|
|
? utf8_distance(str + STR_LEN(progi->regstclass), str) |
1116
|
76
|
|
|
|
|
: STR_LEN(progi->regstclass)) |
1117
|
|
|
|
|
|
: 1); |
1118
|
|
|
|
|
|
char * endpos; |
1119
|
15198
|
|
|
|
|
if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) |
1120
|
5528
|
|
|
|
|
endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend); |
1121
|
146108
|
|
|
|
|
else if (prog->float_substr || prog->float_utf8) |
1122
|
13040
|
|
|
|
|
endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend); |
1123
|
|
|
|
|
|
else |
1124
|
|
|
|
|
|
endpos= strend; |
1125
|
|
|
|
|
|
|
1126
|
446420
|
|
|
|
|
if (checked_upto < s) |
1127
|
|
|
|
|
|
checked_upto = s; |
1128
|
72536
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", |
1129
|
|
|
|
|
|
(IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); |
1130
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
t = s; |
1132
|
72536
|
|
|
|
|
s = find_byclass(prog, progi->regstclass, checked_upto, endpos, |
1133
|
|
|
|
|
|
reginfo); |
1134
|
403254
|
|
|
|
|
if (s) { |
1135
|
|
|
|
|
|
checked_upto = s; |
1136
|
|
|
|
|
|
} else { |
1137
|
|
|
|
|
|
#ifdef DEBUGGING |
1138
|
|
|
|
|
|
const char *what = NULL; |
1139
|
|
|
|
|
|
#endif |
1140
|
403254
|
|
|
|
|
if (endpos == strend) { |
1141
|
403254
|
|
|
|
|
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, |
1142
|
|
|
|
|
|
"Could not match STCLASS...\n") ); |
1143
|
|
|
|
|
|
goto fail; |
1144
|
|
|
|
|
|
} |
1145
|
403254
|
|
|
|
|
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, |
1146
|
|
|
|
|
|
"This position contradicts STCLASS...\n") ); |
1147
|
0
|
|
|
|
|
if ((prog->extflags & RXf_ANCH) && !ml_anch) |
1148
|
|
|
|
|
|
goto fail; |
1149
|
403254
|
|
|
|
|
checked_upto = HOPBACKc(endpos, start_shift); |
1150
|
403254
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", |
1151
|
|
|
|
|
|
(IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); |
1152
|
|
|
|
|
|
/* Contradict one of substrings */ |
1153
|
403254
|
|
|
|
|
if (prog->anchored_substr || prog->anchored_utf8) { |
1154
|
119372
|
|
|
|
|
if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { |
1155
|
2223869
|
|
|
|
|
DEBUG_EXECUTE_r( what = "anchored" ); |
1156
|
|
|
|
|
|
hop_and_restart: |
1157
|
223040
|
|
|
|
|
s = HOP3c(t, 1, strend); |
1158
|
223040
|
|
|
|
|
if (s + start_shift + end_shift > strend) { |
1159
|
|
|
|
|
|
/* XXXX Should be taken into account earlier? */ |
1160
|
223040
|
|
|
|
|
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, |
1161
|
|
|
|
|
|
"Could not match STCLASS...\n") ); |
1162
|
|
|
|
|
|
goto fail; |
1163
|
|
|
|
|
|
} |
1164
|
223040
|
|
|
|
|
if (!check) |
1165
|
|
|
|
|
|
goto giveup; |
1166
|
223040
|
|
|
|
|
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, |
1167
|
|
|
|
|
|
"Looking for %s substr starting at offset %ld...\n", |
1168
|
|
|
|
|
|
what, (long)(s + start_shift - i_strpos)) ); |
1169
|
|
|
|
|
|
goto restart; |
1170
|
|
|
|
|
|
} |
1171
|
|
|
|
|
|
/* Have both, check_string is floating */ |
1172
|
223040
|
|
|
|
|
if (t + start_shift >= check_at) /* Contradicts floating=check */ |
1173
|
|
|
|
|
|
goto retry_floating_check; |
1174
|
|
|
|
|
|
/* Recheck anchored substring, but not floating... */ |
1175
|
|
|
|
|
|
s = check_at; |
1176
|
223040
|
|
|
|
|
if (!check) |
1177
|
|
|
|
|
|
goto giveup; |
1178
|
223040
|
|
|
|
|
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, |
1179
|
|
|
|
|
|
"Looking for anchored substr starting at offset %ld...\n", |
1180
|
|
|
|
|
|
(long)(other_last - i_strpos)) ); |
1181
|
|
|
|
|
|
goto do_other_anchored; |
1182
|
|
|
|
|
|
} |
1183
|
|
|
|
|
|
/* Another way we could have checked stclass at the |
1184
|
|
|
|
|
|
current position only: */ |
1185
|
111520
|
|
|
|
|
if (ml_anch) { |
1186
|
365392
|
|
|
|
|
s = t = t + 1; |
1187
|
311256
|
|
|
|
|
if (!check) |
1188
|
|
|
|
|
|
goto giveup; |
1189
|
311256
|
|
|
|
|
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, |
1190
|
|
|
|
|
|
"Looking for /%s^%s/m starting at offset %ld...\n", |
1191
|
|
|
|
|
|
PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); |
1192
|
|
|
|
|
|
goto try_at_offset; |
1193
|
|
|
|
|
|
} |
1194
|
218992
|
|
|
|
|
if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ |
1195
|
|
|
|
|
|
goto fail; |
1196
|
|
|
|
|
|
/* Check is floating substring. */ |
1197
|
|
|
|
|
|
retry_floating_check: |
1198
|
142352
|
|
|
|
|
t = check_at - start_shift; |
1199
|
32
|
|
|
|
|
DEBUG_EXECUTE_r( what = "floating" ); |
1200
|
|
|
|
|
|
goto hop_and_restart; |
1201
|
|
|
|
|
|
} |
1202
|
36
|
|
|
|
|
if (t != s) { |
1203
|
32
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
1204
|
|
|
|
|
|
"By STCLASS: moving %ld --> %ld\n", |
1205
|
|
|
|
|
|
(long)(t - i_strpos), (long)(s - i_strpos)) |
1206
|
|
|
|
|
|
); |
1207
|
|
|
|
|
|
} |
1208
|
|
|
|
|
|
else { |
1209
|
44
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
1210
|
|
|
|
|
|
"Does not contradict STCLASS...\n"); |
1211
|
|
|
|
|
|
); |
1212
|
|
|
|
|
|
} |
1213
|
|
|
|
|
|
} |
1214
|
|
|
|
|
|
giveup: |
1215
|
625788
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", |
1216
|
|
|
|
|
|
PL_colors[4], (check ? "Guessed" : "Giving up"), |
1217
|
|
|
|
|
|
PL_colors[5], (long)(s - i_strpos)) ); |
1218
|
|
|
|
|
|
return s; |
1219
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
fail_finish: /* Substring not found */ |
1221
|
91406
|
|
|
|
|
if (prog->check_substr || prog->check_utf8) /* could be removed already */ |
1222
|
573
|
|
|
|
|
BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ |
1223
|
|
|
|
|
|
fail: |
1224
|
116
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", |
1225
|
|
|
|
|
|
PL_colors[4], PL_colors[5])); |
1226
|
|
|
|
|
|
return NULL; |
1227
|
|
|
|
|
|
} |
1228
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
#define DECL_TRIE_TYPE(scan) \ |
1230
|
|
|
|
|
|
const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ |
1231
|
|
|
|
|
|
trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ |
1232
|
|
|
|
|
|
trie_type = ((scan->flags == EXACT) \ |
1233
|
|
|
|
|
|
? (utf8_target ? trie_utf8 : trie_plain) \ |
1234
|
|
|
|
|
|
: (scan->flags == EXACTFA) \ |
1235
|
|
|
|
|
|
? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ |
1236
|
|
|
|
|
|
: (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) |
1237
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ |
1239
|
|
|
|
|
|
STMT_START { \ |
1240
|
|
|
|
|
|
STRLEN skiplen; \ |
1241
|
|
|
|
|
|
U8 flags = FOLD_FLAGS_FULL; \ |
1242
|
|
|
|
|
|
switch (trie_type) { \ |
1243
|
|
|
|
|
|
case trie_utf8_exactfa_fold: \ |
1244
|
|
|
|
|
|
flags |= FOLD_FLAGS_NOMIX_ASCII; \ |
1245
|
|
|
|
|
|
/* FALL THROUGH */ \ |
1246
|
|
|
|
|
|
case trie_utf8_fold: \ |
1247
|
|
|
|
|
|
if ( foldlen>0 ) { \ |
1248
|
|
|
|
|
|
uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ |
1249
|
|
|
|
|
|
foldlen -= len; \ |
1250
|
|
|
|
|
|
uscan += len; \ |
1251
|
|
|
|
|
|
len=0; \ |
1252
|
|
|
|
|
|
} else { \ |
1253
|
|
|
|
|
|
uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL); \ |
1254
|
|
|
|
|
|
len = UTF8SKIP(uc); \ |
1255
|
|
|
|
|
|
skiplen = UNISKIP( uvc ); \ |
1256
|
|
|
|
|
|
foldlen -= skiplen; \ |
1257
|
|
|
|
|
|
uscan = foldbuf + skiplen; \ |
1258
|
|
|
|
|
|
} \ |
1259
|
|
|
|
|
|
break; \ |
1260
|
|
|
|
|
|
case trie_latin_utf8_exactfa_fold: \ |
1261
|
|
|
|
|
|
flags |= FOLD_FLAGS_NOMIX_ASCII; \ |
1262
|
|
|
|
|
|
/* FALL THROUGH */ \ |
1263
|
|
|
|
|
|
case trie_latin_utf8_fold: \ |
1264
|
|
|
|
|
|
if ( foldlen>0 ) { \ |
1265
|
|
|
|
|
|
uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ |
1266
|
|
|
|
|
|
foldlen -= len; \ |
1267
|
|
|
|
|
|
uscan += len; \ |
1268
|
|
|
|
|
|
len=0; \ |
1269
|
|
|
|
|
|
} else { \ |
1270
|
|
|
|
|
|
len = 1; \ |
1271
|
|
|
|
|
|
uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ |
1272
|
|
|
|
|
|
skiplen = UNISKIP( uvc ); \ |
1273
|
|
|
|
|
|
foldlen -= skiplen; \ |
1274
|
|
|
|
|
|
uscan = foldbuf + skiplen; \ |
1275
|
|
|
|
|
|
} \ |
1276
|
|
|
|
|
|
break; \ |
1277
|
|
|
|
|
|
case trie_utf8: \ |
1278
|
|
|
|
|
|
uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ |
1279
|
|
|
|
|
|
break; \ |
1280
|
|
|
|
|
|
case trie_plain: \ |
1281
|
|
|
|
|
|
uvc = (UV)*uc; \ |
1282
|
|
|
|
|
|
len = 1; \ |
1283
|
|
|
|
|
|
} \ |
1284
|
|
|
|
|
|
if (uvc < 256) { \ |
1285
|
|
|
|
|
|
charid = trie->charmap[ uvc ]; \ |
1286
|
|
|
|
|
|
} \ |
1287
|
|
|
|
|
|
else { \ |
1288
|
|
|
|
|
|
charid = 0; \ |
1289
|
|
|
|
|
|
if (widecharmap) { \ |
1290
|
|
|
|
|
|
SV** const svpp = hv_fetch(widecharmap, \ |
1291
|
|
|
|
|
|
(char*)&uvc, sizeof(UV), 0); \ |
1292
|
|
|
|
|
|
if (svpp) \ |
1293
|
|
|
|
|
|
charid = (U16)SvIV(*svpp); \ |
1294
|
|
|
|
|
|
} \ |
1295
|
|
|
|
|
|
} \ |
1296
|
|
|
|
|
|
} STMT_END |
1297
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
#define REXEC_FBC_EXACTISH_SCAN(CoNd) \ |
1299
|
|
|
|
|
|
STMT_START { \ |
1300
|
|
|
|
|
|
while (s <= e) { \ |
1301
|
|
|
|
|
|
if ( (CoNd) \ |
1302
|
|
|
|
|
|
&& (ln == 1 || folder(s, pat_string, ln)) \ |
1303
|
|
|
|
|
|
&& (reginfo->intuit || regtry(reginfo, &s)) )\ |
1304
|
|
|
|
|
|
goto got_it; \ |
1305
|
|
|
|
|
|
s++; \ |
1306
|
|
|
|
|
|
} \ |
1307
|
|
|
|
|
|
} STMT_END |
1308
|
|
|
|
|
|
|
1309
|
|
|
|
|
|
#define REXEC_FBC_UTF8_SCAN(CoDe) \ |
1310
|
|
|
|
|
|
STMT_START { \ |
1311
|
|
|
|
|
|
while (s < strend) { \ |
1312
|
|
|
|
|
|
CoDe \ |
1313
|
|
|
|
|
|
s += UTF8SKIP(s); \ |
1314
|
|
|
|
|
|
} \ |
1315
|
|
|
|
|
|
} STMT_END |
1316
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
#define REXEC_FBC_SCAN(CoDe) \ |
1318
|
|
|
|
|
|
STMT_START { \ |
1319
|
|
|
|
|
|
while (s < strend) { \ |
1320
|
|
|
|
|
|
CoDe \ |
1321
|
|
|
|
|
|
s++; \ |
1322
|
|
|
|
|
|
} \ |
1323
|
|
|
|
|
|
} STMT_END |
1324
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \ |
1326
|
|
|
|
|
|
REXEC_FBC_UTF8_SCAN( \ |
1327
|
|
|
|
|
|
if (CoNd) { \ |
1328
|
|
|
|
|
|
if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ |
1329
|
|
|
|
|
|
goto got_it; \ |
1330
|
|
|
|
|
|
else \ |
1331
|
|
|
|
|
|
tmp = doevery; \ |
1332
|
|
|
|
|
|
} \ |
1333
|
|
|
|
|
|
else \ |
1334
|
|
|
|
|
|
tmp = 1; \ |
1335
|
|
|
|
|
|
) |
1336
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
#define REXEC_FBC_CLASS_SCAN(CoNd) \ |
1338
|
|
|
|
|
|
REXEC_FBC_SCAN( \ |
1339
|
|
|
|
|
|
if (CoNd) { \ |
1340
|
|
|
|
|
|
if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ |
1341
|
|
|
|
|
|
goto got_it; \ |
1342
|
|
|
|
|
|
else \ |
1343
|
|
|
|
|
|
tmp = doevery; \ |
1344
|
|
|
|
|
|
} \ |
1345
|
|
|
|
|
|
else \ |
1346
|
|
|
|
|
|
tmp = 1; \ |
1347
|
|
|
|
|
|
) |
1348
|
|
|
|
|
|
|
1349
|
|
|
|
|
|
#define REXEC_FBC_TRYIT \ |
1350
|
|
|
|
|
|
if ((reginfo->intuit || regtry(reginfo, &s))) \ |
1351
|
|
|
|
|
|
goto got_it |
1352
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ |
1354
|
|
|
|
|
|
if (utf8_target) { \ |
1355
|
|
|
|
|
|
REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ |
1356
|
|
|
|
|
|
} \ |
1357
|
|
|
|
|
|
else { \ |
1358
|
|
|
|
|
|
REXEC_FBC_CLASS_SCAN(CoNd); \ |
1359
|
|
|
|
|
|
} |
1360
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
#define DUMP_EXEC_POS(li,s,doutf8) \ |
1362
|
|
|
|
|
|
dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ |
1363
|
|
|
|
|
|
startpos, doutf8) |
1364
|
|
|
|
|
|
|
1365
|
|
|
|
|
|
|
1366
|
|
|
|
|
|
#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ |
1367
|
|
|
|
|
|
tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ |
1368
|
|
|
|
|
|
tmp = TEST_NON_UTF8(tmp); \ |
1369
|
|
|
|
|
|
REXEC_FBC_UTF8_SCAN( \ |
1370
|
|
|
|
|
|
if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ |
1371
|
|
|
|
|
|
tmp = !tmp; \ |
1372
|
|
|
|
|
|
IF_SUCCESS; \ |
1373
|
|
|
|
|
|
} \ |
1374
|
|
|
|
|
|
else { \ |
1375
|
|
|
|
|
|
IF_FAIL; \ |
1376
|
|
|
|
|
|
} \ |
1377
|
|
|
|
|
|
); \ |
1378
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ |
1380
|
|
|
|
|
|
if (s == reginfo->strbeg) { \ |
1381
|
|
|
|
|
|
tmp = '\n'; \ |
1382
|
|
|
|
|
|
} \ |
1383
|
|
|
|
|
|
else { \ |
1384
|
|
|
|
|
|
U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ |
1385
|
|
|
|
|
|
tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ |
1386
|
|
|
|
|
|
} \ |
1387
|
|
|
|
|
|
tmp = TeSt1_UtF8; \ |
1388
|
|
|
|
|
|
LOAD_UTF8_CHARCLASS_ALNUM(); \ |
1389
|
|
|
|
|
|
REXEC_FBC_UTF8_SCAN( \ |
1390
|
|
|
|
|
|
if (tmp == ! (TeSt2_UtF8)) { \ |
1391
|
|
|
|
|
|
tmp = !tmp; \ |
1392
|
|
|
|
|
|
IF_SUCCESS; \ |
1393
|
|
|
|
|
|
} \ |
1394
|
|
|
|
|
|
else { \ |
1395
|
|
|
|
|
|
IF_FAIL; \ |
1396
|
|
|
|
|
|
} \ |
1397
|
|
|
|
|
|
); \ |
1398
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
/* The only difference between the BOUND and NBOUND cases is that |
1400
|
|
|
|
|
|
* REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in |
1401
|
|
|
|
|
|
* NBOUND. This is accomplished by passing it in either the if or else clause, |
1402
|
|
|
|
|
|
* with the other one being empty */ |
1403
|
|
|
|
|
|
#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ |
1404
|
|
|
|
|
|
FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) |
1405
|
|
|
|
|
|
|
1406
|
|
|
|
|
|
#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ |
1407
|
|
|
|
|
|
FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) |
1408
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ |
1410
|
|
|
|
|
|
FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) |
1411
|
|
|
|
|
|
|
1412
|
|
|
|
|
|
#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ |
1413
|
|
|
|
|
|
FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) |
1414
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
1416
|
|
|
|
|
|
/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to |
1417
|
|
|
|
|
|
* be passed in completely with the variable name being tested, which isn't |
1418
|
|
|
|
|
|
* such a clean interface, but this is easier to read than it was before. We |
1419
|
|
|
|
|
|
* are looking for the boundary (or non-boundary between a word and non-word |
1420
|
|
|
|
|
|
* character. The utf8 and non-utf8 cases have the same logic, but the details |
1421
|
|
|
|
|
|
* must be different. Find the "wordness" of the character just prior to this |
1422
|
|
|
|
|
|
* one, and compare it with the wordness of this one. If they differ, we have |
1423
|
|
|
|
|
|
* a boundary. At the beginning of the string, pretend that the previous |
1424
|
|
|
|
|
|
* character was a new-line */ |
1425
|
|
|
|
|
|
#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ |
1426
|
|
|
|
|
|
if (utf8_target) { \ |
1427
|
|
|
|
|
|
UTF8_CODE \ |
1428
|
|
|
|
|
|
} \ |
1429
|
|
|
|
|
|
else { /* Not utf8 */ \ |
1430
|
|
|
|
|
|
tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ |
1431
|
|
|
|
|
|
tmp = TEST_NON_UTF8(tmp); \ |
1432
|
|
|
|
|
|
REXEC_FBC_SCAN( \ |
1433
|
|
|
|
|
|
if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ |
1434
|
|
|
|
|
|
tmp = !tmp; \ |
1435
|
|
|
|
|
|
IF_SUCCESS; \ |
1436
|
|
|
|
|
|
} \ |
1437
|
|
|
|
|
|
else { \ |
1438
|
|
|
|
|
|
IF_FAIL; \ |
1439
|
|
|
|
|
|
} \ |
1440
|
|
|
|
|
|
); \ |
1441
|
|
|
|
|
|
} \ |
1442
|
|
|
|
|
|
if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ |
1443
|
|
|
|
|
|
goto got_it; |
1444
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
/* We know what class REx starts with. Try to find this position... */ |
1446
|
|
|
|
|
|
/* if reginfo->intuit, its a dryrun */ |
1447
|
|
|
|
|
|
/* annoyingly all the vars in this routine have different names from their counterparts |
1448
|
|
|
|
|
|
in regmatch. /grrr */ |
1449
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
STATIC char * |
1451
|
68
|
|
|
|
|
S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, |
1452
|
|
|
|
|
|
const char *strend, regmatch_info *reginfo) |
1453
|
|
|
|
|
|
{ |
1454
|
|
|
|
|
|
dVAR; |
1455
|
44
|
|
|
|
|
const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; |
1456
|
|
|
|
|
|
char *pat_string; /* The pattern's exactish string */ |
1457
|
|
|
|
|
|
char *pat_end; /* ptr to end char of pat_string */ |
1458
|
|
|
|
|
|
re_fold_t folder; /* Function for computing non-utf8 folds */ |
1459
|
|
|
|
|
|
const U8 *fold_array; /* array for folding ords < 256 */ |
1460
|
|
|
|
|
|
STRLEN ln; |
1461
|
|
|
|
|
|
STRLEN lnc; |
1462
|
|
|
|
|
|
U8 c1; |
1463
|
|
|
|
|
|
U8 c2; |
1464
|
|
|
|
|
|
char *e; |
1465
|
|
|
|
|
|
I32 tmp = 1; /* Scratch variable? */ |
1466
|
49
|
|
|
|
|
const bool utf8_target = reginfo->is_utf8_target; |
1467
|
|
|
|
|
|
UV utf8_fold_flags = 0; |
1468
|
880
|
|
|
|
|
const bool is_utf8_pat = reginfo->is_utf8_pat; |
1469
|
|
|
|
|
|
bool to_complement = FALSE; /* Invert the result? Taking the xor of this |
1470
|
|
|
|
|
|
with a result inverts that result, as 0^1 = |
1471
|
|
|
|
|
|
1 and 1^1 = 0 */ |
1472
|
|
|
|
|
|
_char_class_number classnum; |
1473
|
|
|
|
|
|
|
1474
|
3264183
|
|
|
|
|
RXi_GET_DECL(prog,progi); |
1475
|
|
|
|
|
|
|
1476
|
9215480
|
|
|
|
|
PERL_ARGS_ASSERT_FIND_BYCLASS; |
1477
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
/* We know what class it must start with. */ |
1479
|
145182
|
|
|
|
|
switch (OP(c)) { |
1480
|
|
|
|
|
|
case ANYOF: |
1481
|
|
|
|
|
|
case ANYOF_SYNTHETIC: |
1482
|
|
|
|
|
|
case ANYOF_WARN_SUPER: |
1483
|
3512
|
|
|
|
|
if (utf8_target) { |
1484
|
83952175
|
|
|
|
|
REXEC_FBC_UTF8_CLASS_SCAN( |
1485
|
|
|
|
|
|
reginclass(prog, c, (U8*)s, utf8_target)); |
1486
|
|
|
|
|
|
} |
1487
|
|
|
|
|
|
else { |
1488
|
192180
|
|
|
|
|
REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); |
1489
|
|
|
|
|
|
} |
1490
|
|
|
|
|
|
break; |
1491
|
|
|
|
|
|
case CANY: |
1492
|
92212
|
|
|
|
|
REXEC_FBC_SCAN( |
1493
|
|
|
|
|
|
if (tmp && (reginfo->intuit || regtry(reginfo, &s))) |
1494
|
|
|
|
|
|
goto got_it; |
1495
|
|
|
|
|
|
else |
1496
|
|
|
|
|
|
tmp = doevery; |
1497
|
|
|
|
|
|
); |
1498
|
|
|
|
|
|
break; |
1499
|
|
|
|
|
|
|
1500
|
|
|
|
|
|
case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ |
1501
|
439544
|
|
|
|
|
assert(! is_utf8_pat); |
1502
|
|
|
|
|
|
/* FALL THROUGH */ |
1503
|
|
|
|
|
|
case EXACTFA: |
1504
|
439544
|
|
|
|
|
if (is_utf8_pat || utf8_target) { |
1505
|
|
|
|
|
|
utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; |
1506
|
|
|
|
|
|
goto do_exactf_utf8; |
1507
|
|
|
|
|
|
} |
1508
|
|
|
|
|
|
fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */ |
1509
|
|
|
|
|
|
folder = foldEQ_latin1; /* /a, except the sharp s one which */ |
1510
|
|
|
|
|
|
goto do_exactf_non_utf8; /* isn't dealt with by these */ |
1511
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
case EXACTF: /* This node only generated for non-utf8 patterns */ |
1513
|
375864
|
|
|
|
|
assert(! is_utf8_pat); |
1514
|
354020
|
|
|
|
|
if (utf8_target) { |
1515
|
|
|
|
|
|
utf8_fold_flags = 0; |
1516
|
|
|
|
|
|
goto do_exactf_utf8; |
1517
|
|
|
|
|
|
} |
1518
|
|
|
|
|
|
fold_array = PL_fold; |
1519
|
|
|
|
|
|
folder = foldEQ; |
1520
|
|
|
|
|
|
goto do_exactf_non_utf8; |
1521
|
|
|
|
|
|
|
1522
|
|
|
|
|
|
case EXACTFL: |
1523
|
149322
|
|
|
|
|
if (is_utf8_pat || utf8_target) { |
1524
|
|
|
|
|
|
utf8_fold_flags = FOLDEQ_UTF8_LOCALE; |
1525
|
|
|
|
|
|
goto do_exactf_utf8; |
1526
|
|
|
|
|
|
} |
1527
|
|
|
|
|
|
fold_array = PL_fold_locale; |
1528
|
|
|
|
|
|
folder = foldEQ_locale; |
1529
|
|
|
|
|
|
goto do_exactf_non_utf8; |
1530
|
|
|
|
|
|
|
1531
|
|
|
|
|
|
case EXACTFU_SS: |
1532
|
128470
|
|
|
|
|
if (is_utf8_pat) { |
1533
|
|
|
|
|
|
utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; |
1534
|
|
|
|
|
|
} |
1535
|
|
|
|
|
|
goto do_exactf_utf8; |
1536
|
|
|
|
|
|
|
1537
|
|
|
|
|
|
case EXACTFU: |
1538
|
141610
|
|
|
|
|
if (is_utf8_pat || utf8_target) { |
1539
|
20850
|
|
|
|
|
utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; |
1540
|
14284
|
|
|
|
|
goto do_exactf_utf8; |
1541
|
|
|
|
|
|
} |
1542
|
|
|
|
|
|
|
1543
|
|
|
|
|
|
/* Any 'ss' in the pattern should have been replaced by regcomp, |
1544
|
|
|
|
|
|
* so we don't have to worry here about this single special case |
1545
|
|
|
|
|
|
* in the Latin1 range */ |
1546
|
|
|
|
|
|
fold_array = PL_fold_latin1; |
1547
|
|
|
|
|
|
folder = foldEQ_latin1; |
1548
|
|
|
|
|
|
|
1549
|
|
|
|
|
|
/* FALL THROUGH */ |
1550
|
|
|
|
|
|
|
1551
|
|
|
|
|
|
do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there |
1552
|
|
|
|
|
|
are no glitches with fold-length differences |
1553
|
|
|
|
|
|
between the target string and pattern */ |
1554
|
|
|
|
|
|
|
1555
|
|
|
|
|
|
/* The idea in the non-utf8 EXACTF* cases is to first find the |
1556
|
|
|
|
|
|
* first character of the EXACTF* node and then, if necessary, |
1557
|
|
|
|
|
|
* case-insensitively compare the full text of the node. c1 is the |
1558
|
|
|
|
|
|
* first character. c2 is its fold. This logic will not work for |
1559
|
|
|
|
|
|
* Unicode semantics and the german sharp ss, which hence should |
1560
|
|
|
|
|
|
* not be compiled into a node that gets here. */ |
1561
|
136874
|
|
|
|
|
pat_string = STRING(c); |
1562
|
200554
|
|
|
|
|
ln = STR_LEN(c); /* length to match in octets/bytes */ |
1563
|
|
|
|
|
|
|
1564
|
|
|
|
|
|
/* We know that we have to match at least 'ln' bytes (which is the |
1565
|
|
|
|
|
|
* same as characters, since not utf8). If we have to match 3 |
1566
|
|
|
|
|
|
* characters, and there are only 2 availabe, we know without |
1567
|
|
|
|
|
|
* trying that it will fail; so don't start a match past the |
1568
|
|
|
|
|
|
* required minimum number from the far end */ |
1569
|
169585
|
|
|
|
|
e = HOP3c(strend, -((SSize_t)ln), s); |
1570
|
|
|
|
|
|
|
1571
|
10745
|
|
|
|
|
if (reginfo->intuit && e < s) { |
1572
|
4554
|
|
|
|
|
e = s; /* Due to minlen logic of intuit() */ |
1573
|
|
|
|
|
|
} |
1574
|
|
|
|
|
|
|
1575
|
6470
|
|
|
|
|
c1 = *pat_string; |
1576
|
145308
|
|
|
|
|
c2 = fold_array[c1]; |
1577
|
0
|
|
|
|
|
if (c1 == c2) { /* If char and fold are the same */ |
1578
|
204698
|
|
|
|
|
REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1); |
1579
|
|
|
|
|
|
} |
1580
|
|
|
|
|
|
else { |
1581
|
822
|
|
|
|
|
REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2); |
1582
|
|
|
|
|
|
} |
1583
|
|
|
|
|
|
break; |
1584
|
|
|
|
|
|
|
1585
|
|
|
|
|
|
do_exactf_utf8: |
1586
|
|
|
|
|
|
{ |
1587
|
|
|
|
|
|
unsigned expansion; |
1588
|
|
|
|
|
|
|
1589
|
|
|
|
|
|
/* If one of the operands is in utf8, we can't use the simpler folding |
1590
|
|
|
|
|
|
* above, due to the fact that many different characters can have the |
1591
|
|
|
|
|
|
* same fold, or portion of a fold, or different- length fold */ |
1592
|
102760
|
|
|
|
|
pat_string = STRING(c); |
1593
|
822
|
|
|
|
|
ln = STR_LEN(c); /* length to match in octets/bytes */ |
1594
|
6322109
|
|
|
|
|
pat_end = pat_string + ln; |
1595
|
|
|
|
|
|
lnc = is_utf8_pat /* length to match in characters */ |
1596
|
|
|
|
|
|
? utf8_length((U8 *) pat_string, (U8 *) pat_end) |
1597
|
125358
|
|
|
|
|
: ln; |
1598
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
/* We have 'lnc' characters to match in the pattern, but because of |
1600
|
|
|
|
|
|
* multi-character folding, each character in the target can match |
1601
|
|
|
|
|
|
* up to 3 characters (Unicode guarantees it will never exceed |
1602
|
|
|
|
|
|
* this) if it is utf8-encoded; and up to 2 if not (based on the |
1603
|
|
|
|
|
|
* fact that the Latin 1 folds are already determined, and the |
1604
|
|
|
|
|
|
* only multi-char fold in that range is the sharp-s folding to |
1605
|
|
|
|
|
|
* 'ss'. Thus, a pattern character can match as little as 1/3 of a |
1606
|
|
|
|
|
|
* string character. Adjust lnc accordingly, rounding up, so that |
1607
|
|
|
|
|
|
* if we need to match at least 4+1/3 chars, that really is 5. */ |
1608
|
125358
|
|
|
|
|
expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; |
1609
|
125358
|
|
|
|
|
lnc = (lnc + expansion - 1) / expansion; |
1610
|
|
|
|
|
|
|
1611
|
|
|
|
|
|
/* As in the non-UTF8 case, if we have to match 3 characters, and |
1612
|
|
|
|
|
|
* only 2 are left, it's guaranteed to fail, so don't start a |
1613
|
|
|
|
|
|
* match that would require us to go beyond the end of the string |
1614
|
|
|
|
|
|
*/ |
1615
|
125358
|
|
|
|
|
e = HOP3c(strend, -((SSize_t)lnc), s); |
1616
|
|
|
|
|
|
|
1617
|
125358
|
|
|
|
|
if (reginfo->intuit && e < s) { |
1618
|
125358
|
|
|
|
|
e = s; /* Due to minlen logic of intuit() */ |
1619
|
|
|
|
|
|
} |
1620
|
|
|
|
|
|
|
1621
|
|
|
|
|
|
/* XXX Note that we could recalculate e to stop the loop earlier, |
1622
|
|
|
|
|
|
* as the worst case expansion above will rarely be met, and as we |
1623
|
|
|
|
|
|
* go along we would usually find that e moves further to the left. |
1624
|
|
|
|
|
|
* This would happen only after we reached the point in the loop |
1625
|
|
|
|
|
|
* where if there were no expansion we should fail. Unclear if |
1626
|
|
|
|
|
|
* worth the expense */ |
1627
|
|
|
|
|
|
|
1628
|
125358
|
|
|
|
|
while (s <= e) { |
1629
|
125358
|
|
|
|
|
char *my_strend= (char *)strend; |
1630
|
125358
|
|
|
|
|
if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, |
1631
|
|
|
|
|
|
pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags) |
1632
|
125358
|
|
|
|
|
&& (reginfo->intuit || regtry(reginfo, &s)) ) |
1633
|
|
|
|
|
|
{ |
1634
|
|
|
|
|
|
goto got_it; |
1635
|
|
|
|
|
|
} |
1636
|
125358
|
|
|
|
|
s += (utf8_target) ? UTF8SKIP(s) : 1; |
1637
|
|
|
|
|
|
} |
1638
|
|
|
|
|
|
break; |
1639
|
|
|
|
|
|
} |
1640
|
|
|
|
|
|
case BOUNDL: |
1641
|
125358
|
|
|
|
|
RXp_MATCH_TAINTED_on(prog); |
1642
|
125358
|
|
|
|
|
FBC_BOUND(isWORDCHAR_LC, |
1643
|
|
|
|
|
|
isWORDCHAR_LC_uvchr(tmp), |
1644
|
|
|
|
|
|
isWORDCHAR_LC_utf8((U8*)s)); |
1645
|
|
|
|
|
|
break; |
1646
|
|
|
|
|
|
case NBOUNDL: |
1647
|
125358
|
|
|
|
|
RXp_MATCH_TAINTED_on(prog); |
1648
|
123128
|
|
|
|
|
FBC_NBOUND(isWORDCHAR_LC, |
1649
|
|
|
|
|
|
isWORDCHAR_LC_uvchr(tmp), |
1650
|
|
|
|
|
|
isWORDCHAR_LC_utf8((U8*)s)); |
1651
|
|
|
|
|
|
break; |
1652
|
|
|
|
|
|
case BOUND: |
1653
|
116676
|
|
|
|
|
FBC_BOUND(isWORDCHAR, |
1654
|
|
|
|
|
|
isWORDCHAR_uni(tmp), |
1655
|
|
|
|
|
|
cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target))); |
1656
|
|
|
|
|
|
break; |
1657
|
|
|
|
|
|
case BOUNDA: |
1658
|
116349
|
|
|
|
|
FBC_BOUND_NOLOAD(isWORDCHAR_A, |
1659
|
|
|
|
|
|
isWORDCHAR_A(tmp), |
1660
|
|
|
|
|
|
isWORDCHAR_A((U8*)s)); |
1661
|
|
|
|
|
|
break; |
1662
|
|
|
|
|
|
case NBOUND: |
1663
|
9336
|
|
|
|
|
FBC_NBOUND(isWORDCHAR, |
1664
|
|
|
|
|
|
isWORDCHAR_uni(tmp), |
1665
|
|
|
|
|
|
cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target))); |
1666
|
|
|
|
|
|
break; |
1667
|
|
|
|
|
|
case NBOUNDA: |
1668
|
126450
|
|
|
|
|
FBC_NBOUND_NOLOAD(isWORDCHAR_A, |
1669
|
|
|
|
|
|
isWORDCHAR_A(tmp), |
1670
|
|
|
|
|
|
isWORDCHAR_A((U8*)s)); |
1671
|
|
|
|
|
|
break; |
1672
|
|
|
|
|
|
case BOUNDU: |
1673
|
126350
|
|
|
|
|
FBC_BOUND(isWORDCHAR_L1, |
1674
|
|
|
|
|
|
isWORDCHAR_uni(tmp), |
1675
|
|
|
|
|
|
cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target))); |
1676
|
|
|
|
|
|
break; |
1677
|
|
|
|
|
|
case NBOUNDU: |
1678
|
126350
|
|
|
|
|
FBC_NBOUND(isWORDCHAR_L1, |
1679
|
|
|
|
|
|
isWORDCHAR_uni(tmp), |
1680
|
|
|
|
|
|
cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target))); |
1681
|
|
|
|
|
|
break; |
1682
|
|
|
|
|
|
case LNBREAK: |
1683
|
126350
|
|
|
|
|
REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend), |
1684
|
|
|
|
|
|
is_LNBREAK_latin1_safe(s, strend) |
1685
|
|
|
|
|
|
); |
1686
|
|
|
|
|
|
break; |
1687
|
|
|
|
|
|
|
1688
|
|
|
|
|
|
/* The argument to all the POSIX node types is the class number to pass to |
1689
|
|
|
|
|
|
* _generic_isCC() to build a mask for searching in PL_charclass[] */ |
1690
|
|
|
|
|
|
|
1691
|
|
|
|
|
|
case NPOSIXL: |
1692
|
|
|
|
|
|
to_complement = 1; |
1693
|
|
|
|
|
|
/* FALLTHROUGH */ |
1694
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
case POSIXL: |
1696
|
126350
|
|
|
|
|
RXp_MATCH_TAINTED_on(prog); |
1697
|
126350
|
|
|
|
|
REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), |
1698
|
|
|
|
|
|
to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); |
1699
|
|
|
|
|
|
break; |
1700
|
|
|
|
|
|
|
1701
|
|
|
|
|
|
case NPOSIXD: |
1702
|
|
|
|
|
|
to_complement = 1; |
1703
|
|
|
|
|
|
/* FALLTHROUGH */ |
1704
|
|
|
|
|
|
|
1705
|
|
|
|
|
|
case POSIXD: |
1706
|
402979
|
|
|
|
|
if (utf8_target) { |
1707
|
|
|
|
|
|
goto posix_utf8; |
1708
|
|
|
|
|
|
} |
1709
|
|
|
|
|
|
goto posixa; |
1710
|
|
|
|
|
|
|
1711
|
|
|
|
|
|
case NPOSIXA: |
1712
|
339804
|
|
|
|
|
if (utf8_target) { |
1713
|
|
|
|
|
|
/* The complement of something that matches only ASCII matches all |
1714
|
|
|
|
|
|
* UTF-8 variant code points, plus everything in ASCII that isn't |
1715
|
|
|
|
|
|
* in the class */ |
1716
|
339804
|
|
|
|
|
REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s) |
1717
|
|
|
|
|
|
|| ! _generic_isCC_A(*s, FLAGS(c))); |
1718
|
|
|
|
|
|
break; |
1719
|
|
|
|
|
|
} |
1720
|
|
|
|
|
|
|
1721
|
|
|
|
|
|
to_complement = 1; |
1722
|
|
|
|
|
|
/* FALLTHROUGH */ |
1723
|
|
|
|
|
|
|
1724
|
|
|
|
|
|
case POSIXA: |
1725
|
|
|
|
|
|
posixa: |
1726
|
|
|
|
|
|
/* Don't need to worry about utf8, as it can match only a single |
1727
|
|
|
|
|
|
* byte invariant character. */ |
1728
|
171840
|
|
|
|
|
REXEC_FBC_CLASS_SCAN( |
1729
|
|
|
|
|
|
to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); |
1730
|
|
|
|
|
|
break; |
1731
|
|
|
|
|
|
|
1732
|
|
|
|
|
|
case NPOSIXU: |
1733
|
|
|
|
|
|
to_complement = 1; |
1734
|
|
|
|
|
|
/* FALLTHROUGH */ |
1735
|
|
|
|
|
|
|
1736
|
|
|
|
|
|
case POSIXU: |
1737
|
147204
|
|
|
|
|
if (! utf8_target) { |
1738
|
2364
|
|
|
|
|
REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s, |
1739
|
|
|
|
|
|
FLAGS(c)))); |
1740
|
|
|
|
|
|
} |
1741
|
|
|
|
|
|
else { |
1742
|
|
|
|
|
|
|
1743
|
|
|
|
|
|
posix_utf8: |
1744
|
1548
|
|
|
|
|
classnum = (_char_class_number) FLAGS(c); |
1745
|
4893310
|
|
|
|
|
if (classnum < _FIRST_NON_SWASH_CC) { |
1746
|
4746922
|
|
|
|
|
while (s < strend) { |
1747
|
|
|
|
|
|
|
1748
|
|
|
|
|
|
/* We avoid loading in the swash as long as possible, but |
1749
|
|
|
|
|
|
* should we have to, we jump to a separate loop. This |
1750
|
|
|
|
|
|
* extra 'if' statement is what keeps this code from being |
1751
|
|
|
|
|
|
* just a call to REXEC_FBC_UTF8_CLASS_SCAN() */ |
1752
|
147204
|
|
|
|
|
if (UTF8_IS_ABOVE_LATIN1(*s)) { |
1753
|
|
|
|
|
|
goto found_above_latin1; |
1754
|
|
|
|
|
|
} |
1755
|
171840
|
|
|
|
|
if ((UTF8_IS_INVARIANT(*s) |
1756
|
258628
|
|
|
|
|
&& to_complement ^ cBOOL(_generic_isCC((U8) *s, |
1757
|
|
|
|
|
|
classnum))) |
1758
|
45630
|
|
|
|
|
|| (UTF8_IS_DOWNGRADEABLE_START(*s) |
1759
|
45630
|
|
|
|
|
&& to_complement ^ cBOOL( |
1760
|
|
|
|
|
|
_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, |
1761
|
|
|
|
|
|
*(s + 1)), |
1762
|
|
|
|
|
|
classnum)))) |
1763
|
|
|
|
|
|
{ |
1764
|
45630
|
|
|
|
|
if (tmp && (reginfo->intuit || regtry(reginfo, &s))) |
1765
|
|
|
|
|
|
goto got_it; |
1766
|
|
|
|
|
|
else { |
1767
|
|
|
|
|
|
tmp = doevery; |
1768
|
|
|
|
|
|
} |
1769
|
|
|
|
|
|
} |
1770
|
|
|
|
|
|
else { |
1771
|
|
|
|
|
|
tmp = 1; |
1772
|
|
|
|
|
|
} |
1773
|
258628
|
|
|
|
|
s += UTF8SKIP(s); |
1774
|
|
|
|
|
|
} |
1775
|
|
|
|
|
|
} |
1776
|
258628
|
|
|
|
|
else switch (classnum) { /* These classes are implemented as |
1777
|
|
|
|
|
|
macros */ |
1778
|
|
|
|
|
|
case _CC_ENUM_SPACE: /* XXX would require separate code if we |
1779
|
|
|
|
|
|
revert the change of \v matching this */ |
1780
|
|
|
|
|
|
/* FALL THROUGH */ |
1781
|
|
|
|
|
|
|
1782
|
|
|
|
|
|
case _CC_ENUM_PSXSPC: |
1783
|
253708
|
|
|
|
|
REXEC_FBC_UTF8_CLASS_SCAN( |
1784
|
|
|
|
|
|
to_complement ^ cBOOL(isSPACE_utf8(s))); |
1785
|
|
|
|
|
|
break; |
1786
|
|
|
|
|
|
|
1787
|
|
|
|
|
|
case _CC_ENUM_BLANK: |
1788
|
131774
|
|
|
|
|
REXEC_FBC_UTF8_CLASS_SCAN( |
1789
|
|
|
|
|
|
to_complement ^ cBOOL(isBLANK_utf8(s))); |
1790
|
|
|
|
|
|
break; |
1791
|
|
|
|
|
|
|
1792
|
|
|
|
|
|
case _CC_ENUM_XDIGIT: |
1793
|
289274
|
|
|
|
|
REXEC_FBC_UTF8_CLASS_SCAN( |
1794
|
|
|
|
|
|
to_complement ^ cBOOL(isXDIGIT_utf8(s))); |
1795
|
|
|
|
|
|
break; |
1796
|
|
|
|
|
|
|
1797
|
|
|
|
|
|
case _CC_ENUM_VERTSPACE: |
1798
|
289274
|
|
|
|
|
REXEC_FBC_UTF8_CLASS_SCAN( |
1799
|
|
|
|
|
|
to_complement ^ cBOOL(isVERTWS_utf8(s))); |
1800
|
|
|
|
|
|
break; |
1801
|
|
|
|
|
|
|
1802
|
|
|
|
|
|
case _CC_ENUM_CNTRL: |
1803
|
334902
|
|
|
|
|
REXEC_FBC_UTF8_CLASS_SCAN( |
1804
|
|
|
|
|
|
to_complement ^ cBOOL(isCNTRL_utf8(s))); |
1805
|
|
|
|
|
|
break; |
1806
|
|
|
|
|
|
|
1807
|
|
|
|
|
|
default: |
1808
|
361480
|
|
|
|
|
Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum); |
1809
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
1810
|
|
|
|
|
|
} |
1811
|
|
|
|
|
|
} |
1812
|
|
|
|
|
|
break; |
1813
|
|
|
|
|
|
|
1814
|
|
|
|
|
|
found_above_latin1: /* Here we have to load a swash to get the result |
1815
|
|
|
|
|
|
for the current code point */ |
1816
|
180740
|
|
|
|
|
if (! PL_utf8_swash_ptrs[classnum]) { |
1817
|
180736
|
|
|
|
|
U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; |
1818
|
180654
|
|
|
|
|
PL_utf8_swash_ptrs[classnum] = |
1819
|
172262
|
|
|
|
|
_core_swash_init("utf8", swash_property_names[classnum], |
1820
|
|
|
|
|
|
&PL_sv_undef, 1, 0, NULL, &flags); |
1821
|
|
|
|
|
|
} |
1822
|
|
|
|
|
|
|
1823
|
|
|
|
|
|
/* This is a copy of the loop above for swash classes, though using the |
1824
|
|
|
|
|
|
* FBC macro instead of being expanded out. Since we've loaded the |
1825
|
|
|
|
|
|
* swash, we don't have to check for that each time through the loop */ |
1826
|
76568
|
|
|
|
|
REXEC_FBC_UTF8_CLASS_SCAN( |
1827
|
|
|
|
|
|
to_complement ^ cBOOL(_generic_utf8( |
1828
|
|
|
|
|
|
classnum, |
1829
|
|
|
|
|
|
s, |
1830
|
|
|
|
|
|
swash_fetch(PL_utf8_swash_ptrs[classnum], |
1831
|
|
|
|
|
|
(U8 *) s, TRUE)))); |
1832
|
|
|
|
|
|
break; |
1833
|
|
|
|
|
|
|
1834
|
|
|
|
|
|
case AHOCORASICKC: |
1835
|
|
|
|
|
|
case AHOCORASICK: |
1836
|
|
|
|
|
|
{ |
1837
|
76572
|
|
|
|
|
DECL_TRIE_TYPE(c); |
1838
|
|
|
|
|
|
/* what trie are we using right now */ |
1839
|
258632
|
|
|
|
|
reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ]; |
1840
|
258632
|
|
|
|
|
reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ]; |
1841
|
91172
|
|
|
|
|
HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); |
1842
|
|
|
|
|
|
|
1843
|
129728
|
|
|
|
|
const char *last_start = strend - trie->minlen; |
1844
|
|
|
|
|
|
#ifdef DEBUGGING |
1845
|
126354
|
|
|
|
|
const char *real_start = s; |
1846
|
|
|
|
|
|
#endif |
1847
|
44746
|
|
|
|
|
STRLEN maxlen = trie->maxlen; |
1848
|
|
|
|
|
|
SV *sv_points; |
1849
|
|
|
|
|
|
U8 **points; /* map of where we were in the input string |
1850
|
|
|
|
|
|
when reading a given char. For ASCII this |
1851
|
|
|
|
|
|
is unnecessary overhead as the relationship |
1852
|
|
|
|
|
|
is always 1:1, but for Unicode, especially |
1853
|
|
|
|
|
|
case folded Unicode this is not true. */ |
1854
|
|
|
|
|
|
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; |
1855
|
|
|
|
|
|
U8 *bitmap=NULL; |
1856
|
|
|
|
|
|
|
1857
|
|
|
|
|
|
|
1858
|
44746
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
1859
|
|
|
|
|
|
|
1860
|
|
|
|
|
|
/* We can't just allocate points here. We need to wrap it in |
1861
|
|
|
|
|
|
* an SV so it gets freed properly if there is a croak while |
1862
|
|
|
|
|
|
* running the match */ |
1863
|
126354
|
|
|
|
|
ENTER; |
1864
|
45178
|
|
|
|
|
SAVETMPS; |
1865
|
45178
|
|
|
|
|
sv_points=newSV(maxlen * sizeof(U8 *)); |
1866
|
44086
|
|
|
|
|
SvCUR_set(sv_points, |
1867
|
|
|
|
|
|
maxlen * sizeof(U8 *)); |
1868
|
44086
|
|
|
|
|
SvPOK_on(sv_points); |
1869
|
44086
|
|
|
|
|
sv_2mortal(sv_points); |
1870
|
1096
|
|
|
|
|
points=(U8**)SvPV_nolen(sv_points ); |
1871
|
81280
|
|
|
|
|
if ( trie_type != trie_utf8_fold |
1872
|
81280
|
|
|
|
|
&& (trie->bitmap || OP(c)==AHOCORASICKC) ) |
1873
|
|
|
|
|
|
{ |
1874
|
81278
|
|
|
|
|
if (trie->bitmap) |
1875
|
2
|
|
|
|
|
bitmap=(U8*)trie->bitmap; |
1876
|
|
|
|
|
|
else |
1877
|
19919356
|
|
|
|
|
bitmap=(U8*)ANYOF_BITMAP(c); |
1878
|
|
|
|
|
|
} |
1879
|
|
|
|
|
|
/* this is the Aho-Corasick algorithm modified a touch |
1880
|
|
|
|
|
|
to include special handling for long "unknown char" sequences. |
1881
|
|
|
|
|
|
The basic idea being that we use AC as long as we are dealing |
1882
|
|
|
|
|
|
with a possible matching char, when we encounter an unknown char |
1883
|
|
|
|
|
|
(and we have not encountered an accepting state) we scan forward |
1884
|
|
|
|
|
|
until we find a legal starting char. |
1885
|
|
|
|
|
|
AC matching is basically that of trie matching, except that when |
1886
|
|
|
|
|
|
we encounter a failing transition, we fall back to the current |
1887
|
|
|
|
|
|
states "fail state", and try the current char again, a process |
1888
|
|
|
|
|
|
we repeat until we reach the root state, state 1, or a legal |
1889
|
|
|
|
|
|
transition. If we fail on the root state then we can either |
1890
|
|
|
|
|
|
terminate if we have reached an accepting state previously, or |
1891
|
|
|
|
|
|
restart the entire process from the beginning if we have not. |
1892
|
|
|
|
|
|
|
1893
|
|
|
|
|
|
*/ |
1894
|
98669747
|
|
|
|
|
while (s <= last_start) { |
1895
|
98669747
|
|
|
|
|
const U32 uniflags = UTF8_ALLOW_DEFAULT; |
1896
|
96472125
|
|
|
|
|
U8 *uc = (U8*)s; |
1897
|
|
|
|
|
|
U16 charid = 0; |
1898
|
|
|
|
|
|
U32 base = 1; |
1899
|
|
|
|
|
|
U32 state = 1; |
1900
|
96104099
|
|
|
|
|
UV uvc = 0; |
1901
|
90639355
|
|
|
|
|
STRLEN len = 0; |
1902
|
90639355
|
|
|
|
|
STRLEN foldlen = 0; |
1903
|
|
|
|
|
|
U8 *uscan = (U8*)NULL; |
1904
|
|
|
|
|
|
U8 *leftmost = NULL; |
1905
|
|
|
|
|
|
#ifdef DEBUGGING |
1906
|
|
|
|
|
|
U32 accepted_word= 0; |
1907
|
|
|
|
|
|
#endif |
1908
|
|
|
|
|
|
U32 pointpos = 0; |
1909
|
|
|
|
|
|
|
1910
|
90639371
|
|
|
|
|
while ( state && uc <= (U8*)strend ) { |
1911
|
|
|
|
|
|
int failed=0; |
1912
|
74096421
|
|
|
|
|
U32 word = aho->states[ state ].wordnum; |
1913
|
|
|
|
|
|
|
1914
|
35475987
|
|
|
|
|
if( state==1 ) { |
1915
|
56
|
|
|
|
|
if ( bitmap ) { |
1916
|
54
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r( |
1917
|
|
|
|
|
|
if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { |
1918
|
|
|
|
|
|
dump_exec_pos( (char *)uc, c, strend, real_start, |
1919
|
|
|
|
|
|
(char *)uc, utf8_target ); |
1920
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
1921
|
|
|
|
|
|
" Scanning for legal start char...\n"); |
1922
|
|
|
|
|
|
} |
1923
|
|
|
|
|
|
); |
1924
|
145899439
|
|
|
|
|
if (utf8_target) { |
1925
|
60628124
|
|
|
|
|
while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { |
1926
|
96104095
|
|
|
|
|
uc += UTF8SKIP(uc); |
1927
|
|
|
|
|
|
} |
1928
|
|
|
|
|
|
} else { |
1929
|
96104137
|
|
|
|
|
while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { |
1930
|
96104133
|
|
|
|
|
uc++; |
1931
|
|
|
|
|
|
} |
1932
|
|
|
|
|
|
} |
1933
|
96104099
|
|
|
|
|
s= (char *)uc; |
1934
|
|
|
|
|
|
} |
1935
|
368032
|
|
|
|
|
if (uc >(U8*)last_start) break; |
1936
|
|
|
|
|
|
} |
1937
|
|
|
|
|
|
|
1938
|
368040
|
|
|
|
|
if ( word ) { |
1939
|
301308
|
|
|
|
|
U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ]; |
1940
|
22254
|
|
|
|
|
if (!leftmost || lpos < leftmost) { |
1941
|
|
|
|
|
|
DEBUG_r(accepted_word=word); |
1942
|
|
|
|
|
|
leftmost= lpos; |
1943
|
|
|
|
|
|
} |
1944
|
22254
|
|
|
|
|
if (base==0) break; |
1945
|
|
|
|
|
|
|
1946
|
|
|
|
|
|
} |
1947
|
66734
|
|
|
|
|
points[pointpos++ % maxlen]= uc; |
1948
|
368040
|
|
|
|
|
if (foldlen || uc < (U8*)strend) { |
1949
|
368038
|
|
|
|
|
REXEC_TRIE_READ_CHAR(trie_type, trie, |
1950
|
|
|
|
|
|
widecharmap, uc, |
1951
|
|
|
|
|
|
uscan, len, uvc, charid, foldlen, |
1952
|
|
|
|
|
|
foldbuf, uniflags); |
1953
|
368038
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r({ |
1954
|
|
|
|
|
|
dump_exec_pos( (char *)uc, c, strend, |
1955
|
|
|
|
|
|
real_start, s, utf8_target); |
1956
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
1957
|
|
|
|
|
|
" Charid:%3u CP:%4"UVxf" ", |
1958
|
|
|
|
|
|
charid, uvc); |
1959
|
|
|
|
|
|
}); |
1960
|
|
|
|
|
|
} |
1961
|
|
|
|
|
|
else { |
1962
|
368028
|
|
|
|
|
len = 0; |
1963
|
|
|
|
|
|
charid = 0; |
1964
|
|
|
|
|
|
} |
1965
|
|
|
|
|
|
|
1966
|
|
|
|
|
|
|
1967
|
|
|
|
|
|
do { |
1968
|
|
|
|
|
|
#ifdef DEBUGGING |
1969
|
368042
|
|
|
|
|
word = aho->states[ state ].wordnum; |
1970
|
|
|
|
|
|
#endif |
1971
|
96472137
|
|
|
|
|
base = aho->states[ state ].trans.base; |
1972
|
|
|
|
|
|
|
1973
|
96472137
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r({ |
1974
|
|
|
|
|
|
if (failed) |
1975
|
|
|
|
|
|
dump_exec_pos( (char *)uc, c, strend, real_start, |
1976
|
|
|
|
|
|
s, utf8_target ); |
1977
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
1978
|
|
|
|
|
|
"%sState: %4"UVxf", word=%"UVxf, |
1979
|
|
|
|
|
|
failed ? " Fail transition to " : "", |
1980
|
|
|
|
|
|
(UV)state, (UV)word); |
1981
|
|
|
|
|
|
}); |
1982
|
16
|
|
|
|
|
if ( base ) { |
1983
|
|
|
|
|
|
U32 tmp; |
1984
|
|
|
|
|
|
I32 offset; |
1985
|
24
|
|
|
|
|
if (charid && |
1986
|
20
|
|
|
|
|
( ((offset = base + charid |
1987
|
3296443
|
|
|
|
|
- 1 - trie->uniquecharcount)) >= 0) |
1988
|
2197632
|
|
|
|
|
&& ((U32)offset < trie->lasttrans) |
1989
|
2197632
|
|
|
|
|
&& trie->trans[offset].check == state |
1990
|
2197632
|
|
|
|
|
&& (tmp=trie->trans[offset].next)) |
1991
|
|
|
|
|
|
{ |
1992
|
2197632
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r( |
1993
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log," - legal\n")); |
1994
|
|
|
|
|
|
state = tmp; |
1995
|
98669753
|
|
|
|
|
break; |
1996
|
|
|
|
|
|
} |
1997
|
|
|
|
|
|
else { |
1998
|
263904222
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r( |
1999
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log," - fail\n")); |
2000
|
|
|
|
|
|
failed = 1; |
2001
|
263904222
|
|
|
|
|
state = aho->fail[state]; |
2002
|
|
|
|
|
|
} |
2003
|
|
|
|
|
|
} |
2004
|
|
|
|
|
|
else { |
2005
|
|
|
|
|
|
/* we must be accepting here */ |
2006
|
263904220
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r( |
2007
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log," - accepting\n")); |
2008
|
|
|
|
|
|
failed = 1; |
2009
|
|
|
|
|
|
break; |
2010
|
|
|
|
|
|
} |
2011
|
263904222
|
|
|
|
|
} while(state); |
2012
|
14
|
|
|
|
|
uc += len; |
2013
|
263904232
|
|
|
|
|
if (failed) { |
2014
|
3606940
|
|
|
|
|
if (leftmost) |
2015
|
|
|
|
|
|
break; |
2016
|
3606938
|
|
|
|
|
if (!state) state = 1; |
2017
|
|
|
|
|
|
} |
2018
|
|
|
|
|
|
} |
2019
|
7170110
|
|
|
|
|
if ( aho->states[ state ].wordnum ) { |
2020
|
2026462
|
|
|
|
|
U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ]; |
2021
|
6357987
|
|
|
|
|
if (!leftmost || lpos < leftmost) { |
2022
|
3606936
|
|
|
|
|
DEBUG_r(accepted_word=aho->states[ state ].wordnum); |
2023
|
|
|
|
|
|
leftmost = lpos; |
2024
|
|
|
|
|
|
} |
2025
|
|
|
|
|
|
} |
2026
|
3155006
|
|
|
|
|
if (leftmost) { |
2027
|
3155004
|
|
|
|
|
s = (char*)leftmost; |
2028
|
3155004
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r({ |
2029
|
|
|
|
|
|
PerlIO_printf( |
2030
|
|
|
|
|
|
Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n", |
2031
|
|
|
|
|
|
(UV)accepted_word, (IV)(s - real_start) |
2032
|
|
|
|
|
|
); |
2033
|
|
|
|
|
|
}); |
2034
|
451936
|
|
|
|
|
if (reginfo->intuit || regtry(reginfo, &s)) { |
2035
|
62
|
|
|
|
|
FREETMPS; |
2036
|
30
|
|
|
|
|
LEAVE; |
2037
|
451876
|
|
|
|
|
goto got_it; |
2038
|
|
|
|
|
|
} |
2039
|
263904190
|
|
|
|
|
s = HOPc(s,1); |
2040
|
263904190
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r({ |
2041
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n"); |
2042
|
|
|
|
|
|
}); |
2043
|
|
|
|
|
|
} else { |
2044
|
252038470
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r( |
2045
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log,"No match.\n")); |
2046
|
|
|
|
|
|
break; |
2047
|
|
|
|
|
|
} |
2048
|
|
|
|
|
|
} |
2049
|
252038470
|
|
|
|
|
FREETMPS; |
2050
|
252038470
|
|
|
|
|
LEAVE; |
2051
|
|
|
|
|
|
} |
2052
|
136791179
|
|
|
|
|
break; |
2053
|
|
|
|
|
|
default: |
2054
|
136791177
|
|
|
|
|
Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); |
2055
|
|
|
|
|
|
break; |
2056
|
|
|
|
|
|
} |
2057
|
|
|
|
|
|
return 0; |
2058
|
|
|
|
|
|
got_it: |
2059
|
136791179
|
|
|
|
|
return s; |
2060
|
|
|
|
|
|
} |
2061
|
|
|
|
|
|
|
2062
|
|
|
|
|
|
/* set RX_SAVED_COPY, RX_SUBBEG etc. |
2063
|
|
|
|
|
|
* flags have same meanings as with regexec_flags() */ |
2064
|
|
|
|
|
|
|
2065
|
|
|
|
|
|
static void |
2066
|
37852370
|
|
|
|
|
S_reg_set_capture_string(pTHX_ REGEXP * const rx, |
2067
|
|
|
|
|
|
char *strbeg, |
2068
|
|
|
|
|
|
char *strend, |
2069
|
|
|
|
|
|
SV *sv, |
2070
|
|
|
|
|
|
U32 flags, |
2071
|
|
|
|
|
|
bool utf8_target) |
2072
|
|
|
|
|
|
{ |
2073
|
2754166
|
|
|
|
|
struct regexp *const prog = ReANY(rx); |
2074
|
|
|
|
|
|
|
2075
|
1836252
|
|
|
|
|
if (flags & REXEC_COPY_STR) { |
2076
|
|
|
|
|
|
#ifdef PERL_ANY_COW |
2077
|
1836252
|
|
|
|
|
if (SvCANCOW(sv)) { |
2078
|
2754166
|
|
|
|
|
if (DEBUG_C_TEST) { |
2079
|
1836188
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2080
|
|
|
|
|
|
"Copy on write: regexp capture, type %d\n", |
2081
|
3672376
|
|
|
|
|
(int) SvTYPE(sv)); |
2082
|
|
|
|
|
|
} |
2083
|
|
|
|
|
|
/* Create a new COW SV to share the match string and store |
2084
|
|
|
|
|
|
* in saved_copy, unless the current COW SV in saved_copy |
2085
|
|
|
|
|
|
* is valid and suitable for our purpose */ |
2086
|
55148
|
|
|
|
|
if (( prog->saved_copy |
2087
|
1891276
|
|
|
|
|
&& SvIsCOW(prog->saved_copy) |
2088
|
1836192
|
|
|
|
|
&& SvPOKp(prog->saved_copy) |
2089
|
|
|
|
|
|
&& SvIsCOW(sv) |
2090
|
1670720
|
|
|
|
|
&& SvPOKp(sv) |
2091
|
151263413
|
|
|
|
|
&& SvPVX(sv) == SvPVX(prog->saved_copy))) |
2092
|
|
|
|
|
|
{ |
2093
|
|
|
|
|
|
/* just reuse saved_copy SV */ |
2094
|
151263413
|
|
|
|
|
if (RXp_MATCH_COPIED(prog)) { |
2095
|
151263341
|
|
|
|
|
Safefree(prog->subbeg); |
2096
|
0
|
|
|
|
|
RXp_MATCH_COPIED_off(prog); |
2097
|
|
|
|
|
|
} |
2098
|
|
|
|
|
|
} |
2099
|
|
|
|
|
|
else { |
2100
|
|
|
|
|
|
/* create new COW SV to share string */ |
2101
|
151263461
|
|
|
|
|
RX_MATCH_COPY_FREE(rx); |
2102
|
151263401
|
|
|
|
|
prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); |
2103
|
|
|
|
|
|
} |
2104
|
151263405
|
|
|
|
|
prog->subbeg = (char *)SvPVX_const(prog->saved_copy); |
2105
|
151263405
|
|
|
|
|
assert (SvPOKp(prog->saved_copy)); |
2106
|
151263405
|
|
|
|
|
prog->sublen = strend - strbeg; |
2107
|
151263405
|
|
|
|
|
prog->suboffset = 0; |
2108
|
151263405
|
|
|
|
|
prog->subcoffset = 0; |
2109
|
|
|
|
|
|
} else |
2110
|
|
|
|
|
|
#endif |
2111
|
|
|
|
|
|
{ |
2112
|
|
|
|
|
|
SSize_t min = 0; |
2113
|
151263341
|
|
|
|
|
SSize_t max = strend - strbeg; |
2114
|
|
|
|
|
|
SSize_t sublen; |
2115
|
|
|
|
|
|
|
2116
|
|
|
|
|
|
if ( (flags & REXEC_COPY_SKIP_POST) |
2117
|
|
|
|
|
|
&& !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ |
2118
|
|
|
|
|
|
&& !(PL_sawampersand & SAWAMPERSAND_RIGHT) |
2119
|
|
|
|
|
|
) { /* don't copy $' part of string */ |
2120
|
|
|
|
|
|
U32 n = 0; |
2121
|
|
|
|
|
|
max = -1; |
2122
|
|
|
|
|
|
/* calculate the right-most part of the string covered |
2123
|
|
|
|
|
|
* by a capture. Due to look-ahead, this may be to |
2124
|
|
|
|
|
|
* the right of $&, so we have to scan all captures */ |
2125
|
|
|
|
|
|
while (n <= prog->lastparen) { |
2126
|
|
|
|
|
|
if (prog->offs[n].end > max) |
2127
|
|
|
|
|
|
max = prog->offs[n].end; |
2128
|
|
|
|
|
|
n++; |
2129
|
|
|
|
|
|
} |
2130
|
|
|
|
|
|
if (max == -1) |
2131
|
|
|
|
|
|
max = (PL_sawampersand & SAWAMPERSAND_LEFT) |
2132
|
|
|
|
|
|
? prog->offs[0].start |
2133
|
|
|
|
|
|
: 0; |
2134
|
|
|
|
|
|
assert(max >= 0 && max <= strend - strbeg); |
2135
|
|
|
|
|
|
} |
2136
|
|
|
|
|
|
|
2137
|
|
|
|
|
|
if ( (flags & REXEC_COPY_SKIP_PRE) |
2138
|
|
|
|
|
|
&& !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ |
2139
|
|
|
|
|
|
&& !(PL_sawampersand & SAWAMPERSAND_LEFT) |
2140
|
|
|
|
|
|
) { /* don't copy $` part of string */ |
2141
|
|
|
|
|
|
U32 n = 0; |
2142
|
|
|
|
|
|
min = max; |
2143
|
|
|
|
|
|
/* calculate the left-most part of the string covered |
2144
|
|
|
|
|
|
* by a capture. Due to look-behind, this may be to |
2145
|
|
|
|
|
|
* the left of $&, so we have to scan all captures */ |
2146
|
|
|
|
|
|
while (min && n <= prog->lastparen) { |
2147
|
|
|
|
|
|
if ( prog->offs[n].start != -1 |
2148
|
|
|
|
|
|
&& prog->offs[n].start < min) |
2149
|
|
|
|
|
|
{ |
2150
|
|
|
|
|
|
min = prog->offs[n].start; |
2151
|
|
|
|
|
|
} |
2152
|
|
|
|
|
|
n++; |
2153
|
|
|
|
|
|
} |
2154
|
|
|
|
|
|
if ((PL_sawampersand & SAWAMPERSAND_RIGHT) |
2155
|
|
|
|
|
|
&& min > prog->offs[0].end |
2156
|
|
|
|
|
|
) |
2157
|
|
|
|
|
|
min = prog->offs[0].end; |
2158
|
|
|
|
|
|
|
2159
|
|
|
|
|
|
} |
2160
|
|
|
|
|
|
|
2161
|
151263341
|
|
|
|
|
assert(min >= 0 && min <= max && min <= strend - strbeg); |
2162
|
|
|
|
|
|
sublen = max - min; |
2163
|
|
|
|
|
|
|
2164
|
151263341
|
|
|
|
|
if (RX_MATCH_COPIED(rx)) { |
2165
|
151263341
|
|
|
|
|
if (sublen > prog->sublen) |
2166
|
151263341
|
|
|
|
|
prog->subbeg = |
2167
|
14
|
|
|
|
|
(char*)saferealloc(prog->subbeg, sublen+1); |
2168
|
|
|
|
|
|
} |
2169
|
|
|
|
|
|
else |
2170
|
14
|
|
|
|
|
prog->subbeg = (char*)safemalloc(sublen+1); |
2171
|
14
|
|
|
|
|
Copy(strbeg + min, prog->subbeg, sublen, char); |
2172
|
151263341
|
|
|
|
|
prog->subbeg[sublen] = '\0'; |
2173
|
151263341
|
|
|
|
|
prog->suboffset = min; |
2174
|
19956
|
|
|
|
|
prog->sublen = sublen; |
2175
|
19956
|
|
|
|
|
RX_MATCH_COPIED_on(rx); |
2176
|
|
|
|
|
|
} |
2177
|
20020
|
|
|
|
|
prog->subcoffset = prog->suboffset; |
2178
|
20020
|
|
|
|
|
if (prog->suboffset && utf8_target) { |
2179
|
|
|
|
|
|
/* Convert byte offset to chars. |
2180
|
|
|
|
|
|
* XXX ideally should only compute this if @-/@+ |
2181
|
|
|
|
|
|
* has been seen, a la PL_sawampersand ??? */ |
2182
|
|
|
|
|
|
|
2183
|
|
|
|
|
|
/* If there's a direct correspondence between the |
2184
|
|
|
|
|
|
* string which we're matching and the original SV, |
2185
|
|
|
|
|
|
* then we can use the utf8 len cache associated with |
2186
|
|
|
|
|
|
* the SV. In particular, it means that under //g, |
2187
|
|
|
|
|
|
* sv_pos_b2u() will use the previously cached |
2188
|
|
|
|
|
|
* position to speed up working out the new length of |
2189
|
|
|
|
|
|
* subcoffset, rather than counting from the start of |
2190
|
|
|
|
|
|
* the string each time. This stops |
2191
|
|
|
|
|
|
* $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; |
2192
|
|
|
|
|
|
* from going quadratic */ |
2193
|
151263341
|
|
|
|
|
if (SvPOKp(sv) && SvPVX(sv) == strbeg) |
2194
|
151263341
|
|
|
|
|
prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, |
2195
|
|
|
|
|
|
SV_GMAGIC|SV_CONST_RETURN); |
2196
|
|
|
|
|
|
else |
2197
|
453901797
|
|
|
|
|
prog->subcoffset = utf8_length((U8*)strbeg, |
2198
|
|
|
|
|
|
(U8*)(strbeg+prog->suboffset)); |
2199
|
|
|
|
|
|
} |
2200
|
|
|
|
|
|
} |
2201
|
|
|
|
|
|
else { |
2202
|
302638456
|
|
|
|
|
RX_MATCH_COPY_FREE(rx); |
2203
|
151263341
|
|
|
|
|
prog->subbeg = strbeg; |
2204
|
151375115
|
|
|
|
|
prog->suboffset = 0; |
2205
|
111774
|
|
|
|
|
prog->subcoffset = 0; |
2206
|
223548
|
|
|
|
|
prog->sublen = strend - strbeg; |
2207
|
|
|
|
|
|
} |
2208
|
111838
|
|
|
|
|
} |
2209
|
|
|
|
|
|
|
2210
|
|
|
|
|
|
|
2211
|
|
|
|
|
|
|
2212
|
|
|
|
|
|
|
2213
|
|
|
|
|
|
/* |
2214
|
|
|
|
|
|
- regexec_flags - match a regexp against a string |
2215
|
|
|
|
|
|
*/ |
2216
|
|
|
|
|
|
I32 |
2217
|
302638550
|
|
|
|
|
Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, |
2218
|
|
|
|
|
|
char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) |
2219
|
|
|
|
|
|
/* stringarg: the point in the string at which to begin matching */ |
2220
|
|
|
|
|
|
/* strend: pointer to null at end of string */ |
2221
|
|
|
|
|
|
/* strbeg: real beginning of string */ |
2222
|
|
|
|
|
|
/* minend: end of match must be >= minend bytes after stringarg. */ |
2223
|
|
|
|
|
|
/* sv: SV being matched: only used for utf8 flag, pos() etc; string |
2224
|
|
|
|
|
|
* itself is accessed via the pointers above */ |
2225
|
|
|
|
|
|
/* data: May be used for some additional optimizations. |
2226
|
|
|
|
|
|
Currently unused. */ |
2227
|
|
|
|
|
|
/* flags: For optimizations. See REXEC_* in regexp.h */ |
2228
|
|
|
|
|
|
|
2229
|
|
|
|
|
|
{ |
2230
|
|
|
|
|
|
dVAR; |
2231
|
94
|
|
|
|
|
struct regexp *const prog = ReANY(rx); |
2232
|
|
|
|
|
|
char *s; |
2233
|
|
|
|
|
|
regnode *c; |
2234
|
|
|
|
|
|
char *startpos; |
2235
|
|
|
|
|
|
SSize_t minlen; /* must match at least this many chars */ |
2236
|
|
|
|
|
|
SSize_t dontbother = 0; /* how many characters not to try at end */ |
2237
|
151263435
|
|
|
|
|
const bool utf8_target = cBOOL(DO_UTF8(sv)); |
2238
|
|
|
|
|
|
I32 multiline; |
2239
|
151263435
|
|
|
|
|
RXi_GET_DECL(prog,progi); |
2240
|
|
|
|
|
|
regmatch_info reginfo_buf; /* create some info to pass to regtry etc */ |
2241
|
|
|
|
|
|
regmatch_info *const reginfo = ®info_buf; |
2242
|
|
|
|
|
|
regexp_paren_pair *swap = NULL; |
2243
|
|
|
|
|
|
I32 oldsave; |
2244
|
151263435
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
2245
|
|
|
|
|
|
|
2246
|
151263435
|
|
|
|
|
PERL_ARGS_ASSERT_REGEXEC_FLAGS; |
2247
|
|
|
|
|
|
PERL_UNUSED_ARG(data); |
2248
|
|
|
|
|
|
|
2249
|
|
|
|
|
|
/* Be paranoid... */ |
2250
|
151263435
|
|
|
|
|
if (prog == NULL || stringarg == NULL) { |
2251
|
111774
|
|
|
|
|
Perl_croak(aTHX_ "NULL regexp parameter"); |
2252
|
|
|
|
|
|
return 0; |
2253
|
|
|
|
|
|
} |
2254
|
|
|
|
|
|
|
2255
|
151151661
|
|
|
|
|
DEBUG_EXECUTE_r( |
2256
|
|
|
|
|
|
debug_start_match(rx, utf8_target, stringarg, strend, |
2257
|
|
|
|
|
|
"Matching"); |
2258
|
|
|
|
|
|
); |
2259
|
|
|
|
|
|
|
2260
|
|
|
|
|
|
startpos = stringarg; |
2261
|
|
|
|
|
|
|
2262
|
151263435
|
|
|
|
|
if (prog->extflags & RXf_GPOS_SEEN) { |
2263
|
|
|
|
|
|
MAGIC *mg; |
2264
|
|
|
|
|
|
|
2265
|
|
|
|
|
|
/* set reginfo->ganch, the position where \G can match */ |
2266
|
|
|
|
|
|
|
2267
|
39700526
|
|
|
|
|
reginfo->ganch = |
2268
|
59537470
|
|
|
|
|
(flags & REXEC_IGNOREPOS) |
2269
|
|
|
|
|
|
? stringarg /* use start pos rather than pos() */ |
2270
|
151263341
|
|
|
|
|
: (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) |
2271
|
|
|
|
|
|
/* Defined pos(): */ |
2272
|
92429392
|
|
|
|
|
? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) |
2273
|
24225610
|
|
|
|
|
: strbeg; /* pos() not defined; use start of string */ |
2274
|
|
|
|
|
|
|
2275
|
23306660
|
|
|
|
|
DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, |
2276
|
|
|
|
|
|
"GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg)); |
2277
|
|
|
|
|
|
|
2278
|
|
|
|
|
|
/* in the presence of \G, we may need to start looking earlier in |
2279
|
|
|
|
|
|
* the string than the suggested start point of stringarg: |
2280
|
|
|
|
|
|
* if gofs->prog is set, then that's a known, fixed minimum |
2281
|
|
|
|
|
|
* offset, such as |
2282
|
|
|
|
|
|
* /..\G/: gofs = 2 |
2283
|
|
|
|
|
|
* /ab|c\G/: gofs = 1 |
2284
|
|
|
|
|
|
* or if the minimum offset isn't known, then we have to go back |
2285
|
|
|
|
|
|
* to the start of the string, e.g. /w+\G/ |
2286
|
|
|
|
|
|
*/ |
2287
|
|
|
|
|
|
|
2288
|
919224
|
|
|
|
|
if (prog->extflags & RXf_ANCH_GPOS) { |
2289
|
511034
|
|
|
|
|
startpos = reginfo->ganch - prog->gofs; |
2290
|
919224
|
|
|
|
|
if (startpos < |
2291
|
919224
|
|
|
|
|
((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) |
2292
|
|
|
|
|
|
{ |
2293
|
486108
|
|
|
|
|
DEBUG_r(PerlIO_printf(Perl_debug_log, |
2294
|
|
|
|
|
|
"fail: ganch-gofs before earliest possible start\n")); |
2295
|
3882
|
|
|
|
|
return 0; |
2296
|
|
|
|
|
|
} |
2297
|
|
|
|
|
|
} |
2298
|
4296
|
|
|
|
|
else if (prog->gofs) { |
2299
|
596
|
|
|
|
|
if (startpos - prog->gofs < strbeg) |
2300
|
|
|
|
|
|
startpos = strbeg; |
2301
|
|
|
|
|
|
else |
2302
|
596
|
|
|
|
|
startpos -= prog->gofs; |
2303
|
|
|
|
|
|
} |
2304
|
596
|
|
|
|
|
else if (prog->extflags & RXf_GPOS_FLOAT) |
2305
|
|
|
|
|
|
startpos = strbeg; |
2306
|
|
|
|
|
|
} |
2307
|
|
|
|
|
|
|
2308
|
690
|
|
|
|
|
minlen = prog->minlen; |
2309
|
94
|
|
|
|
|
if ((startpos + minlen) > strend || startpos < strbeg) { |
2310
|
482226
|
|
|
|
|
DEBUG_r(PerlIO_printf(Perl_debug_log, |
2311
|
|
|
|
|
|
"Regex match can't succeed, so not even tried\n")); |
2312
|
837760
|
|
|
|
|
return 0; |
2313
|
|
|
|
|
|
} |
2314
|
|
|
|
|
|
|
2315
|
|
|
|
|
|
/* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), |
2316
|
|
|
|
|
|
* which will call destuctors to reset PL_regmatch_state, free higher |
2317
|
|
|
|
|
|
* PL_regmatch_slabs, and clean up regmatch_info_aux and |
2318
|
|
|
|
|
|
* regmatch_info_aux_eval */ |
2319
|
|
|
|
|
|
|
2320
|
375688
|
|
|
|
|
oldsave = PL_savestack_ix; |
2321
|
|
|
|
|
|
|
2322
|
375660
|
|
|
|
|
s = startpos; |
2323
|
|
|
|
|
|
|
2324
|
375660
|
|
|
|
|
if ((prog->extflags & RXf_USE_INTUIT) |
2325
|
375658
|
|
|
|
|
&& !(flags & REXEC_CHECKED)) |
2326
|
|
|
|
|
|
{ |
2327
|
92
|
|
|
|
|
s = re_intuit_start(rx, sv, strbeg, startpos, strend, |
2328
|
|
|
|
|
|
flags, NULL); |
2329
|
433208
|
|
|
|
|
if (!s) |
2330
|
|
|
|
|
|
return 0; |
2331
|
|
|
|
|
|
|
2332
|
206
|
|
|
|
|
if (prog->extflags & RXf_CHECK_ALL) { |
2333
|
|
|
|
|
|
/* we can match based purely on the result of INTUIT. |
2334
|
|
|
|
|
|
* Set up captures etc just for $& and $-[0] |
2335
|
|
|
|
|
|
* (an intuit-only match wont have $1,$2,..) */ |
2336
|
16010172
|
|
|
|
|
assert(!prog->nparens); |
2337
|
|
|
|
|
|
|
2338
|
|
|
|
|
|
/* s/// doesn't like it if $& is earlier than where we asked it to |
2339
|
|
|
|
|
|
* start searching (which can happen on something like /.\G/) */ |
2340
|
15856296
|
|
|
|
|
if ( (flags & REXEC_FAIL_ON_UNDERFLOW) |
2341
|
790218
|
|
|
|
|
&& (s < stringarg)) |
2342
|
|
|
|
|
|
{ |
2343
|
|
|
|
|
|
/* this should only be possible under \G */ |
2344
|
58833949
|
|
|
|
|
assert(prog->extflags & RXf_GPOS_SEEN); |
2345
|
2031764
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
2346
|
|
|
|
|
|
"matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); |
2347
|
|
|
|
|
|
goto phooey; |
2348
|
|
|
|
|
|
} |
2349
|
|
|
|
|
|
|
2350
|
|
|
|
|
|
/* match via INTUIT shouldn't have any captures. |
2351
|
|
|
|
|
|
* Let @-, @+, $^N know */ |
2352
|
56802207
|
|
|
|
|
prog->lastparen = prog->lastcloseparen = 0; |
2353
|
106286
|
|
|
|
|
RX_MATCH_UTF8_set(rx, utf8_target); |
2354
|
382
|
|
|
|
|
prog->offs[0].start = s - strbeg; |
2355
|
44
|
|
|
|
|
prog->offs[0].end = utf8_target |
2356
|
360
|
|
|
|
|
? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg |
2357
|
2984
|
|
|
|
|
: s - strbeg + prog->minlenret; |
2358
|
105904
|
|
|
|
|
if ( !(flags & REXEC_NOT_FIRST) ) |
2359
|
22
|
|
|
|
|
S_reg_set_capture_string(aTHX_ rx, |
2360
|
|
|
|
|
|
strbeg, strend, |
2361
|
|
|
|
|
|
sv, flags, utf8_target); |
2362
|
|
|
|
|
|
|
2363
|
|
|
|
|
|
return 1; |
2364
|
|
|
|
|
|
} |
2365
|
|
|
|
|
|
} |
2366
|
|
|
|
|
|
|
2367
|
105926
|
|
|
|
|
multiline = prog->extflags & RXf_PMf_MULTILINE; |
2368
|
|
|
|
|
|
|
2369
|
804182
|
|
|
|
|
if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { |
2370
|
56695943
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
2371
|
|
|
|
|
|
"String too short [regexec_flags]...\n")); |
2372
|
|
|
|
|
|
goto phooey; |
2373
|
|
|
|
|
|
} |
2374
|
|
|
|
|
|
|
2375
|
|
|
|
|
|
/* Check validity of program. */ |
2376
|
48208691
|
|
|
|
|
if (UCHARAT(progi->program) != REG_MAGIC) { |
2377
|
48095871
|
|
|
|
|
Perl_croak(aTHX_ "corrupted regexp program"); |
2378
|
|
|
|
|
|
} |
2379
|
|
|
|
|
|
|
2380
|
4550440
|
|
|
|
|
RX_MATCH_TAINTED_off(rx); |
2381
|
|
|
|
|
|
|
2382
|
8746328
|
|
|
|
|
reginfo->prog = rx; /* Yes, sorry that this is confusing. */ |
2383
|
8600116
|
|
|
|
|
reginfo->intuit = 0; |
2384
|
226380
|
|
|
|
|
reginfo->is_utf8_target = cBOOL(utf8_target); |
2385
|
7842
|
|
|
|
|
reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); |
2386
|
226380
|
|
|
|
|
reginfo->warned = FALSE; |
2387
|
8373780
|
|
|
|
|
reginfo->strbeg = strbeg; |
2388
|
44
|
|
|
|
|
reginfo->sv = sv; |
2389
|
8373780
|
|
|
|
|
reginfo->poscache_maxiter = 0; /* not yet started a countdown */ |
2390
|
8600116
|
|
|
|
|
reginfo->strend = strend; |
2391
|
|
|
|
|
|
/* see how far we have to get to not match where we matched before */ |
2392
|
146256
|
|
|
|
|
reginfo->till = stringarg + minend; |
2393
|
|
|
|
|
|
|
2394
|
18832
|
|
|
|
|
if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) { |
2395
|
|
|
|
|
|
/* SAVEFREESV, not sv_mortalcopy, as this SV must last until after |
2396
|
|
|
|
|
|
S_cleanup_regmatch_info_aux has executed (registered by |
2397
|
|
|
|
|
|
SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies |
2398
|
|
|
|
|
|
magic belonging to this SV. |
2399
|
|
|
|
|
|
Not newSVsv, either, as it does not COW. |
2400
|
|
|
|
|
|
*/ |
2401
|
0
|
|
|
|
|
reginfo->sv = newSV(0); |
2402
|
18788
|
|
|
|
|
sv_setsv(reginfo->sv, sv); |
2403
|
127424
|
|
|
|
|
SAVEFREESV(reginfo->sv); |
2404
|
|
|
|
|
|
} |
2405
|
|
|
|
|
|
|
2406
|
|
|
|
|
|
/* reserve next 2 or 3 slots in PL_regmatch_state: |
2407
|
|
|
|
|
|
* slot N+0: may currently be in use: skip it |
2408
|
|
|
|
|
|
* slot N+1: use for regmatch_info_aux struct |
2409
|
|
|
|
|
|
* slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s |
2410
|
|
|
|
|
|
* slot N+3: ready for use by regmatch() |
2411
|
|
|
|
|
|
*/ |
2412
|
|
|
|
|
|
|
2413
|
|
|
|
|
|
{ |
2414
|
|
|
|
|
|
regmatch_state *old_regmatch_state; |
2415
|
|
|
|
|
|
regmatch_slab *old_regmatch_slab; |
2416
|
44
|
|
|
|
|
int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; |
2417
|
|
|
|
|
|
|
2418
|
|
|
|
|
|
/* on first ever match, allocate first slab */ |
2419
|
127468
|
|
|
|
|
if (!PL_regmatch_slab) { |
2420
|
146212
|
|
|
|
|
Newx(PL_regmatch_slab, 1, regmatch_slab); |
2421
|
146212
|
|
|
|
|
PL_regmatch_slab->prev = NULL; |
2422
|
8746284
|
|
|
|
|
PL_regmatch_slab->next = NULL; |
2423
|
8746284
|
|
|
|
|
PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); |
2424
|
|
|
|
|
|
} |
2425
|
|
|
|
|
|
|
2426
|
8746328
|
|
|
|
|
old_regmatch_state = PL_regmatch_state; |
2427
|
7700524
|
|
|
|
|
old_regmatch_slab = PL_regmatch_slab; |
2428
|
|
|
|
|
|
|
2429
|
1045972
|
|
|
|
|
for (i=0; i <= max; i++) { |
2430
|
8746408
|
|
|
|
|
if (i == 1) |
2431
|
39691835
|
|
|
|
|
reginfo->info_aux = &(PL_regmatch_state->u.info_aux); |
2432
|
17384374
|
|
|
|
|
else if (i ==2) |
2433
|
16753958
|
|
|
|
|
reginfo->info_aux_eval = |
2434
|
16625990
|
|
|
|
|
reginfo->info_aux->info_aux_eval = |
2435
|
16625954
|
|
|
|
|
&(PL_regmatch_state->u.info_aux_eval); |
2436
|
|
|
|
|
|
|
2437
|
128128
|
|
|
|
|
if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) |
2438
|
128004
|
|
|
|
|
PL_regmatch_state = S_push_slab(aTHX); |
2439
|
|
|
|
|
|
} |
2440
|
|
|
|
|
|
|
2441
|
|
|
|
|
|
/* note initial PL_regmatch_state position; at end of match we'll |
2442
|
|
|
|
|
|
* pop back to there and free any higher slabs */ |
2443
|
|
|
|
|
|
|
2444
|
128048
|
|
|
|
|
reginfo->info_aux->old_regmatch_state = old_regmatch_state; |
2445
|
128048
|
|
|
|
|
reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; |
2446
|
16753966
|
|
|
|
|
reginfo->info_aux->poscache = NULL; |
2447
|
|
|
|
|
|
|
2448
|
317622
|
|
|
|
|
SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); |
2449
|
|
|
|
|
|
|
2450
|
317622
|
|
|
|
|
if ((prog->extflags & RXf_EVAL_SEEN)) |
2451
|
142560
|
|
|
|
|
S_setup_eval_state(aTHX_ reginfo); |
2452
|
|
|
|
|
|
else |
2453
|
137748
|
|
|
|
|
reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; |
2454
|
|
|
|
|
|
} |
2455
|
|
|
|
|
|
|
2456
|
|
|
|
|
|
/* If there is a "must appear" string, look for it. */ |
2457
|
|
|
|
|
|
|
2458
|
137784
|
|
|
|
|
if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { |
2459
|
|
|
|
|
|
/* We have to be careful. If the previous successful match |
2460
|
|
|
|
|
|
was from this regex we don't want a subsequent partially |
2461
|
|
|
|
|
|
successful match to clobber the old results. |
2462
|
|
|
|
|
|
So when we detect this possibility we add a swap buffer |
2463
|
|
|
|
|
|
to the re, and switch the buffer each match. If we fail, |
2464
|
|
|
|
|
|
we switch it back; otherwise we leave it swapped. |
2465
|
|
|
|
|
|
*/ |
2466
|
4820
|
|
|
|
|
swap = prog->offs; |
2467
|
|
|
|
|
|
/* do we need a save destructor here for eval dies? */ |
2468
|
25836332
|
|
|
|
|
Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); |
2469
|
16716196
|
|
|
|
|
DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, |
2470
|
|
|
|
|
|
"rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n", |
2471
|
|
|
|
|
|
PTR2UV(prog), |
2472
|
|
|
|
|
|
PTR2UV(swap), |
2473
|
|
|
|
|
|
PTR2UV(prog->offs) |
2474
|
|
|
|
|
|
)); |
2475
|
|
|
|
|
|
} |
2476
|
|
|
|
|
|
|
2477
|
|
|
|
|
|
/* Simplest case: anchored match need be tried only once. */ |
2478
|
|
|
|
|
|
/* [unless only anchor is BOL and multiline is set] */ |
2479
|
9326306
|
|
|
|
|
if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { |
2480
|
47949659
|
|
|
|
|
if (s == startpos && regtry(reginfo, &s)) |
2481
|
|
|
|
|
|
goto got_it; |
2482
|
21768593
|
|
|
|
|
else if (multiline || (prog->intflags & PREGf_IMPLICIT) |
2483
|
21766999
|
|
|
|
|
|| (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ |
2484
|
|
|
|
|
|
{ |
2485
|
|
|
|
|
|
char *end; |
2486
|
|
|
|
|
|
|
2487
|
21766999
|
|
|
|
|
if (minlen) |
2488
|
21146737
|
|
|
|
|
dontbother = minlen - 1; |
2489
|
21768593
|
|
|
|
|
end = HOP3c(strend, -dontbother, strbeg) - 1; |
2490
|
|
|
|
|
|
/* for multiline we only have to try after newlines */ |
2491
|
26181066
|
|
|
|
|
if (prog->check_substr || prog->check_utf8) { |
2492
|
|
|
|
|
|
/* because of the goto we can not easily reuse the macros for bifurcating the |
2493
|
|
|
|
|
|
unicode/non-unicode match modes here like we do elsewhere - demerphq */ |
2494
|
2506358
|
|
|
|
|
if (utf8_target) { |
2495
|
31358
|
|
|
|
|
if (s == startpos) |
2496
|
|
|
|
|
|
goto after_try_utf8; |
2497
|
|
|
|
|
|
while (1) { |
2498
|
0
|
|
|
|
|
if (regtry(reginfo, &s)) { |
2499
|
|
|
|
|
|
goto got_it; |
2500
|
|
|
|
|
|
} |
2501
|
|
|
|
|
|
after_try_utf8: |
2502
|
31358
|
|
|
|
|
if (s > end) { |
2503
|
|
|
|
|
|
goto phooey; |
2504
|
|
|
|
|
|
} |
2505
|
2475000
|
|
|
|
|
if (prog->extflags & RXf_USE_INTUIT) { |
2506
|
0
|
|
|
|
|
s = re_intuit_start(rx, sv, strbeg, |
2507
|
|
|
|
|
|
s + UTF8SKIP(s), strend, flags, NULL); |
2508
|
2475000
|
|
|
|
|
if (!s) { |
2509
|
|
|
|
|
|
goto phooey; |
2510
|
|
|
|
|
|
} |
2511
|
|
|
|
|
|
} |
2512
|
|
|
|
|
|
else { |
2513
|
2506358
|
|
|
|
|
s += UTF8SKIP(s); |
2514
|
|
|
|
|
|
} |
2515
|
|
|
|
|
|
} |
2516
|
|
|
|
|
|
} /* end search for check string in unicode */ |
2517
|
|
|
|
|
|
else { |
2518
|
2506358
|
|
|
|
|
if (s == startpos) { |
2519
|
|
|
|
|
|
goto after_try_latin; |
2520
|
|
|
|
|
|
} |
2521
|
|
|
|
|
|
while (1) { |
2522
|
2084514
|
|
|
|
|
if (regtry(reginfo, &s)) { |
2523
|
|
|
|
|
|
goto got_it; |
2524
|
|
|
|
|
|
} |
2525
|
|
|
|
|
|
after_try_latin: |
2526
|
2084514
|
|
|
|
|
if (s > end) { |
2527
|
|
|
|
|
|
goto phooey; |
2528
|
|
|
|
|
|
} |
2529
|
2084514
|
|
|
|
|
if (prog->extflags & RXf_USE_INTUIT) { |
2530
|
2084514
|
|
|
|
|
s = re_intuit_start(rx, sv, strbeg, |
2531
|
|
|
|
|
|
s + 1, strend, flags, NULL); |
2532
|
0
|
|
|
|
|
if (!s) { |
2533
|
|
|
|
|
|
goto phooey; |
2534
|
|
|
|
|
|
} |
2535
|
|
|
|
|
|
} |
2536
|
|
|
|
|
|
else { |
2537
|
0
|
|
|
|
|
s++; |
2538
|
|
|
|
|
|
} |
2539
|
|
|
|
|
|
} |
2540
|
|
|
|
|
|
} /* end search for check string in latin*/ |
2541
|
|
|
|
|
|
} /* end search for check string */ |
2542
|
|
|
|
|
|
else { /* search for newline */ |
2543
|
0
|
|
|
|
|
if (s > startpos) { |
2544
|
|
|
|
|
|
/*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ |
2545
|
421844
|
|
|
|
|
s--; |
2546
|
|
|
|
|
|
} |
2547
|
|
|
|
|
|
/* We can use a more efficient search as newlines are the same in unicode as they are in latin */ |
2548
|
421844
|
|
|
|
|
while (s <= end) { /* note it could be possible to match at the end of the string */ |
2549
|
2506358
|
|
|
|
|
if (*s++ == '\n') { /* don't need PL_utf8skip here */ |
2550
|
2506358
|
|
|
|
|
if (regtry(reginfo, &s)) |
2551
|
|
|
|
|
|
goto got_it; |
2552
|
|
|
|
|
|
} |
2553
|
|
|
|
|
|
} |
2554
|
|
|
|
|
|
} /* end search for newline */ |
2555
|
|
|
|
|
|
} /* end anchored/multiline check string search */ |
2556
|
|
|
|
|
|
goto phooey; |
2557
|
26181110
|
|
|
|
|
} else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) |
2558
|
|
|
|
|
|
{ |
2559
|
|
|
|
|
|
/* For anchored \G, the only position it can match from is |
2560
|
|
|
|
|
|
* (ganch-gofs); we already set startpos to this above; if intuit |
2561
|
|
|
|
|
|
* moved us on from there, we can't possibly succeed */ |
2562
|
22860482
|
|
|
|
|
assert(startpos == reginfo->ganch - prog->gofs); |
2563
|
26181066
|
|
|
|
|
if (s == startpos && regtry(reginfo, &s)) |
2564
|
|
|
|
|
|
goto got_it; |
2565
|
|
|
|
|
|
goto phooey; |
2566
|
|
|
|
|
|
} |
2567
|
|
|
|
|
|
|
2568
|
|
|
|
|
|
/* Messy cases: unanchored match. */ |
2569
|
26181110
|
|
|
|
|
if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { |
2570
|
|
|
|
|
|
/* we have /x+whatever/ */ |
2571
|
|
|
|
|
|
/* it must be a one character string (XXXX Except is_utf8_pat?) */ |
2572
|
|
|
|
|
|
char ch; |
2573
|
|
|
|
|
|
#ifdef DEBUGGING |
2574
|
|
|
|
|
|
int did_match = 0; |
2575
|
|
|
|
|
|
#endif |
2576
|
6475186
|
|
|
|
|
if (utf8_target) { |
2577
|
264368
|
|
|
|
|
if (! prog->anchored_utf8) { |
2578
|
242168
|
|
|
|
|
to_utf8_substr(prog); |
2579
|
|
|
|
|
|
} |
2580
|
10217631
|
|
|
|
|
ch = SvPVX_const(prog->anchored_utf8)[0]; |
2581
|
98256438
|
|
|
|
|
REXEC_FBC_SCAN( |
2582
|
|
|
|
|
|
if (*s == ch) { |
2583
|
|
|
|
|
|
DEBUG_EXECUTE_r( did_match = 1 ); |
2584
|
|
|
|
|
|
if (regtry(reginfo, &s)) goto got_it; |
2585
|
|
|
|
|
|
s += UTF8SKIP(s); |
2586
|
|
|
|
|
|
while (s < strend && *s == ch) |
2587
|
|
|
|
|
|
s += UTF8SKIP(s); |
2588
|
|
|
|
|
|
} |
2589
|
|
|
|
|
|
); |
2590
|
|
|
|
|
|
|
2591
|
|
|
|
|
|
} |
2592
|
|
|
|
|
|
else { |
2593
|
81066906
|
|
|
|
|
if (! prog->anchored_substr) { |
2594
|
110461603
|
|
|
|
|
if (! to_byte_substr(prog)) { |
2595
|
12784408
|
|
|
|
|
NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); |
2596
|
|
|
|
|
|
} |
2597
|
|
|
|
|
|
} |
2598
|
110461599
|
|
|
|
|
ch = SvPVX_const(prog->anchored_substr)[0]; |
2599
|
110461599
|
|
|
|
|
REXEC_FBC_SCAN( |
2600
|
|
|
|
|
|
if (*s == ch) { |
2601
|
|
|
|
|
|
DEBUG_EXECUTE_r( did_match = 1 ); |
2602
|
|
|
|
|
|
if (regtry(reginfo, &s)) goto got_it; |
2603
|
|
|
|
|
|
s++; |
2604
|
|
|
|
|
|
while (s < strend && *s == ch) |
2605
|
|
|
|
|
|
s++; |
2606
|
|
|
|
|
|
} |
2607
|
|
|
|
|
|
); |
2608
|
|
|
|
|
|
} |
2609
|
110461599
|
|
|
|
|
DEBUG_EXECUTE_r(if (!did_match) |
2610
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2611
|
|
|
|
|
|
"Did not find anchored character...\n") |
2612
|
|
|
|
|
|
); |
2613
|
|
|
|
|
|
} |
2614
|
11166
|
|
|
|
|
else if (prog->anchored_substr != NULL |
2615
|
165476303
|
|
|
|
|
|| prog->anchored_utf8 != NULL |
2616
|
110461603
|
|
|
|
|
|| ((prog->float_substr != NULL || prog->float_utf8 != NULL) |
2617
|
96999029
|
|
|
|
|
&& prog->float_max_offset < strend - s)) { |
2618
|
|
|
|
|
|
SV *must; |
2619
|
|
|
|
|
|
SSize_t back_max; |
2620
|
|
|
|
|
|
SSize_t back_min; |
2621
|
|
|
|
|
|
char *last; |
2622
|
|
|
|
|
|
char *last1; /* Last position checked before */ |
2623
|
|
|
|
|
|
#ifdef DEBUGGING |
2624
|
|
|
|
|
|
int did_match = 0; |
2625
|
|
|
|
|
|
#endif |
2626
|
40776694
|
|
|
|
|
if (prog->anchored_substr || prog->anchored_utf8) { |
2627
|
40776694
|
|
|
|
|
if (utf8_target) { |
2628
|
7496494
|
|
|
|
|
if (! prog->anchored_utf8) { |
2629
|
136168564
|
|
|
|
|
to_utf8_substr(prog); |
2630
|
|
|
|
|
|
} |
2631
|
229782217
|
|
|
|
|
must = prog->anchored_utf8; |
2632
|
|
|
|
|
|
} |
2633
|
|
|
|
|
|
else { |
2634
|
229782257
|
|
|
|
|
if (! prog->anchored_substr) { |
2635
|
229782217
|
|
|
|
|
if (! to_byte_substr(prog)) { |
2636
|
229782217
|
|
|
|
|
NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); |
2637
|
|
|
|
|
|
} |
2638
|
|
|
|
|
|
} |
2639
|
229782257
|
|
|
|
|
must = prog->anchored_substr; |
2640
|
|
|
|
|
|
} |
2641
|
229782257
|
|
|
|
|
back_max = back_min = prog->anchored_offset; |
2642
|
|
|
|
|
|
} else { |
2643
|
229782217
|
|
|
|
|
if (utf8_target) { |
2644
|
229782217
|
|
|
|
|
if (! prog->float_utf8) { |
2645
|
111403381
|
|
|
|
|
to_utf8_substr(prog); |
2646
|
|
|
|
|
|
} |
2647
|
382741242
|
|
|
|
|
must = prog->float_utf8; |
2648
|
|
|
|
|
|
} |
2649
|
|
|
|
|
|
else { |
2650
|
271337861
|
|
|
|
|
if (! prog->float_substr) { |
2651
|
271337861
|
|
|
|
|
if (! to_byte_substr(prog)) { |
2652
|
271337861
|
|
|
|
|
NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); |
2653
|
|
|
|
|
|
} |
2654
|
|
|
|
|
|
} |
2655
|
229782217
|
|
|
|
|
must = prog->float_substr; |
2656
|
|
|
|
|
|
} |
2657
|
229782217
|
|
|
|
|
back_max = prog->float_max_offset; |
2658
|
229782191
|
|
|
|
|
back_min = prog->float_min_offset; |
2659
|
|
|
|
|
|
} |
2660
|
|
|
|
|
|
|
2661
|
110461643
|
|
|
|
|
if (back_min<0) { |
2662
|
|
|
|
|
|
last = strend; |
2663
|
|
|
|
|
|
} else { |
2664
|
110461643
|
|
|
|
|
last = HOP3c(strend, /* Cannot start after this */ |
2665
|
|
|
|
|
|
-(SSize_t)(CHR_SVLEN(must) |
2666
|
|
|
|
|
|
- (SvTAIL(must) != 0) + back_min), strbeg); |
2667
|
|
|
|
|
|
} |
2668
|
119320628
|
|
|
|
|
if (s > reginfo->strbeg) |
2669
|
280
|
|
|
|
|
last1 = HOPc(s, -1); |
2670
|
|
|
|
|
|
else |
2671
|
174767529
|
|
|
|
|
last1 = s - 1; /* bogus */ |
2672
|
|
|
|
|
|
|
2673
|
|
|
|
|
|
/* XXXX check_substr already used to find "s", can optimize if |
2674
|
|
|
|
|
|
check_substr==must. */ |
2675
|
|
|
|
|
|
dontbother = 0; |
2676
|
1889308
|
|
|
|
|
strend = HOPc(strend, -dontbother); |
2677
|
1889388
|
|
|
|
|
while ( (s <= last) && |
2678
|
1889308
|
|
|
|
|
(s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), |
2679
|
|
|
|
|
|
(unsigned char*)strend, must, |
2680
|
|
|
|
|
|
multiline ? FBMrf_MULTILINE : 0)) ) { |
2681
|
514668
|
|
|
|
|
DEBUG_EXECUTE_r( did_match = 1 ); |
2682
|
514668
|
|
|
|
|
if (HOPc(s, -back_max) > last1) { |
2683
|
514668
|
|
|
|
|
last1 = HOPc(s, -back_min); |
2684
|
514668
|
|
|
|
|
s = HOPc(s, -back_max); |
2685
|
|
|
|
|
|
} |
2686
|
|
|
|
|
|
else { |
2687
|
1889268
|
|
|
|
|
char * const t = (last1 >= reginfo->strbeg) |
2688
|
1889268
|
|
|
|
|
? HOPc(last1, 1) : last1 + 1; |
2689
|
|
|
|
|
|
|
2690
|
9148
|
|
|
|
|
last1 = HOPc(s, -back_min); |
2691
|
9148
|
|
|
|
|
s = t; |
2692
|
|
|
|
|
|
} |
2693
|
9188
|
|
|
|
|
if (utf8_target) { |
2694
|
9148
|
|
|
|
|
while (s <= last1) { |
2695
|
14022
|
|
|
|
|
if (regtry(reginfo, &s)) |
2696
|
|
|
|
|
|
goto got_it; |
2697
|
13672
|
|
|
|
|
if (s >= last1) { |
2698
|
8848
|
|
|
|
|
s++; /* to break out of outer loop */ |
2699
|
8748
|
|
|
|
|
break; |
2700
|
|
|
|
|
|
} |
2701
|
108870617
|
|
|
|
|
s += UTF8SKIP(s); |
2702
|
|
|
|
|
|
} |
2703
|
|
|
|
|
|
} |
2704
|
|
|
|
|
|
else { |
2705
|
108870657
|
|
|
|
|
while (s <= last1) { |
2706
|
108870657
|
|
|
|
|
if (regtry(reginfo, &s)) |
2707
|
|
|
|
|
|
goto got_it; |
2708
|
108870617
|
|
|
|
|
s++; |
2709
|
|
|
|
|
|
} |
2710
|
|
|
|
|
|
} |
2711
|
|
|
|
|
|
} |
2712
|
108870617
|
|
|
|
|
DEBUG_EXECUTE_r(if (!did_match) { |
2713
|
|
|
|
|
|
RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), |
2714
|
|
|
|
|
|
SvPVX_const(must), RE_SV_DUMPLEN(must), 30); |
2715
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n", |
2716
|
|
|
|
|
|
((must == prog->anchored_substr || must == prog->anchored_utf8) |
2717
|
|
|
|
|
|
? "anchored" : "floating"), |
2718
|
|
|
|
|
|
quoted, RE_SV_TAIL(must)); |
2719
|
|
|
|
|
|
}); |
2720
|
|
|
|
|
|
goto phooey; |
2721
|
|
|
|
|
|
} |
2722
|
107917905
|
|
|
|
|
else if ( (c = progi->regstclass) ) { |
2723
|
107847143
|
|
|
|
|
if (minlen) { |
2724
|
70766
|
|
|
|
|
const OPCODE op = OP(progi->regstclass); |
2725
|
|
|
|
|
|
/* don't bother with what can't match */ |
2726
|
49494
|
|
|
|
|
if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE) |
2727
|
49490
|
|
|
|
|
strend = HOPc(strend, -(minlen - 1)); |
2728
|
|
|
|
|
|
} |
2729
|
21276
|
|
|
|
|
DEBUG_EXECUTE_r({ |
2730
|
|
|
|
|
|
SV * const prop = sv_newmortal(); |
2731
|
|
|
|
|
|
regprop(prog, prop, c); |
2732
|
|
|
|
|
|
{ |
2733
|
|
|
|
|
|
RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), |
2734
|
|
|
|
|
|
s,strend-s,60); |
2735
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2736
|
|
|
|
|
|
"Matching stclass %.*s against %s (%d bytes)\n", |
2737
|
|
|
|
|
|
(int)SvCUR(prop), SvPVX_const(prop), |
2738
|
|
|
|
|
|
quoted, (int)(strend - s)); |
2739
|
|
|
|
|
|
} |
2740
|
|
|
|
|
|
}); |
2741
|
2700901
|
|
|
|
|
if (find_byclass(prog, c, s, strend, reginfo)) |
2742
|
|
|
|
|
|
goto got_it; |
2743
|
168923
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n")); |
2744
|
|
|
|
|
|
} |
2745
|
|
|
|
|
|
else { |
2746
|
|
|
|
|
|
dontbother = 0; |
2747
|
940040
|
|
|
|
|
if (prog->float_substr != NULL || prog->float_utf8 != NULL) { |
2748
|
|
|
|
|
|
/* Trim the end. */ |
2749
|
|
|
|
|
|
char *last= NULL; |
2750
|
|
|
|
|
|
SV* float_real; |
2751
|
|
|
|
|
|
STRLEN len; |
2752
|
|
|
|
|
|
const char *little; |
2753
|
|
|
|
|
|
|
2754
|
992639
|
|
|
|
|
if (utf8_target) { |
2755
|
935396
|
|
|
|
|
if (! prog->float_utf8) { |
2756
|
935396
|
|
|
|
|
to_utf8_substr(prog); |
2757
|
|
|
|
|
|
} |
2758
|
42454
|
|
|
|
|
float_real = prog->float_utf8; |
2759
|
|
|
|
|
|
} |
2760
|
|
|
|
|
|
else { |
2761
|
0
|
|
|
|
|
if (! prog->float_substr) { |
2762
|
0
|
|
|
|
|
if (! to_byte_substr(prog)) { |
2763
|
0
|
|
|
|
|
NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); |
2764
|
|
|
|
|
|
} |
2765
|
|
|
|
|
|
} |
2766
|
42454
|
|
|
|
|
float_real = prog->float_substr; |
2767
|
|
|
|
|
|
} |
2768
|
|
|
|
|
|
|
2769
|
18652
|
|
|
|
|
little = SvPV_const(float_real, len); |
2770
|
18652
|
|
|
|
|
if (SvTAIL(float_real)) { |
2771
|
|
|
|
|
|
/* This means that float_real contains an artificial \n on |
2772
|
|
|
|
|
|
* the end due to the presence of something like this: |
2773
|
|
|
|
|
|
* /foo$/ where we can match both "foo" and "foo\n" at the |
2774
|
|
|
|
|
|
* end of the string. So we have to compare the end of the |
2775
|
|
|
|
|
|
* string first against the float_real without the \n and |
2776
|
|
|
|
|
|
* then against the full float_real with the string. We |
2777
|
|
|
|
|
|
* have to watch out for cases where the string might be |
2778
|
|
|
|
|
|
* smaller than the float_real or the float_real without |
2779
|
|
|
|
|
|
* the \n. */ |
2780
|
8416
|
|
|
|
|
char *checkpos= strend - len; |
2781
|
8416
|
|
|
|
|
DEBUG_OPTIMISE_r( |
2782
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2783
|
|
|
|
|
|
"%sChecking for float_real.%s\n", |
2784
|
|
|
|
|
|
PL_colors[4], PL_colors[5])); |
2785
|
0
|
|
|
|
|
if (checkpos + 1 < strbeg) { |
2786
|
|
|
|
|
|
/* can't match, even if we remove the trailing \n |
2787
|
|
|
|
|
|
* string is too short to match */ |
2788
|
8416
|
|
|
|
|
DEBUG_EXECUTE_r( |
2789
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2790
|
|
|
|
|
|
"%sString shorter than required trailing substring, cannot match.%s\n", |
2791
|
|
|
|
|
|
PL_colors[4], PL_colors[5])); |
2792
|
|
|
|
|
|
goto phooey; |
2793
|
8416
|
|
|
|
|
} else if (memEQ(checkpos + 1, little, len - 1)) { |
2794
|
|
|
|
|
|
/* can match, the end of the string matches without the |
2795
|
|
|
|
|
|
* "\n" */ |
2796
|
8416
|
|
|
|
|
last = checkpos + 1; |
2797
|
0
|
|
|
|
|
} else if (checkpos < strbeg) { |
2798
|
|
|
|
|
|
/* cant match, string is too short when the "\n" is |
2799
|
|
|
|
|
|
* included */ |
2800
|
8416
|
|
|
|
|
DEBUG_EXECUTE_r( |
2801
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2802
|
|
|
|
|
|
"%sString does not contain required trailing substring, cannot match.%s\n", |
2803
|
|
|
|
|
|
PL_colors[4], PL_colors[5])); |
2804
|
|
|
|
|
|
goto phooey; |
2805
|
8416
|
|
|
|
|
} else if (!multiline) { |
2806
|
|
|
|
|
|
/* non multiline match, so compare with the "\n" at the |
2807
|
|
|
|
|
|
* end of the string */ |
2808
|
0
|
|
|
|
|
if (memEQ(checkpos, little, len)) { |
2809
|
|
|
|
|
|
last= checkpos; |
2810
|
|
|
|
|
|
} else { |
2811
|
0
|
|
|
|
|
DEBUG_EXECUTE_r( |
2812
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2813
|
|
|
|
|
|
"%sString does not contain required trailing substring, cannot match.%s\n", |
2814
|
|
|
|
|
|
PL_colors[4], PL_colors[5])); |
2815
|
|
|
|
|
|
goto phooey; |
2816
|
|
|
|
|
|
} |
2817
|
|
|
|
|
|
} else { |
2818
|
|
|
|
|
|
/* multiline match, so we have to search for a place |
2819
|
|
|
|
|
|
* where the full string is located */ |
2820
|
|
|
|
|
|
goto find_last; |
2821
|
|
|
|
|
|
} |
2822
|
|
|
|
|
|
} else { |
2823
|
|
|
|
|
|
find_last: |
2824
|
0
|
|
|
|
|
if (len) |
2825
|
0
|
|
|
|
|
last = rninstr(s, strend, little, little + len); |
2826
|
|
|
|
|
|
else |
2827
|
|
|
|
|
|
last = strend; /* matching "$" */ |
2828
|
|
|
|
|
|
} |
2829
|
0
|
|
|
|
|
if (!last) { |
2830
|
|
|
|
|
|
/* at one point this block contained a comment which was |
2831
|
|
|
|
|
|
* probably incorrect, which said that this was a "should not |
2832
|
|
|
|
|
|
* happen" case. Even if it was true when it was written I am |
2833
|
|
|
|
|
|
* pretty sure it is not anymore, so I have removed the comment |
2834
|
|
|
|
|
|
* and replaced it with this one. Yves */ |
2835
|
892942
|
|
|
|
|
DEBUG_EXECUTE_r( |
2836
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2837
|
|
|
|
|
|
"String does not contain required substring, cannot match.\n" |
2838
|
|
|
|
|
|
)); |
2839
|
|
|
|
|
|
goto phooey; |
2840
|
|
|
|
|
|
} |
2841
|
192182
|
|
|
|
|
dontbother = strend - last + prog->float_min_offset; |
2842
|
|
|
|
|
|
} |
2843
|
93166
|
|
|
|
|
if (minlen && (dontbother < minlen)) |
2844
|
142572
|
|
|
|
|
dontbother = minlen - 1; |
2845
|
142572
|
|
|
|
|
strend -= dontbother; /* this one's always in bytes! */ |
2846
|
|
|
|
|
|
/* We don't know much -- general case. */ |
2847
|
13336
|
|
|
|
|
if (utf8_target) { |
2848
|
|
|
|
|
|
for (;;) { |
2849
|
62454
|
|
|
|
|
if (regtry(reginfo, &s)) |
2850
|
|
|
|
|
|
goto got_it; |
2851
|
830488
|
|
|
|
|
if (s >= strend) |
2852
|
|
|
|
|
|
break; |
2853
|
159122
|
|
|
|
|
s += UTF8SKIP(s); |
2854
|
0
|
|
|
|
|
}; |
2855
|
|
|
|
|
|
} |
2856
|
|
|
|
|
|
else { |
2857
|
|
|
|
|
|
do { |
2858
|
7908
|
|
|
|
|
if (regtry(reginfo, &s)) |
2859
|
|
|
|
|
|
goto got_it; |
2860
|
5516
|
|
|
|
|
} while (s++ < strend); |
2861
|
|
|
|
|
|
} |
2862
|
|
|
|
|
|
} |
2863
|
|
|
|
|
|
|
2864
|
|
|
|
|
|
/* Failure. */ |
2865
|
|
|
|
|
|
goto phooey; |
2866
|
|
|
|
|
|
|
2867
|
|
|
|
|
|
got_it: |
2868
|
|
|
|
|
|
/* s/// doesn't like it if $& is earlier than where we asked it to |
2869
|
|
|
|
|
|
* start searching (which can happen on something like /.\G/) */ |
2870
|
42
|
|
|
|
|
if ( (flags & REXEC_FAIL_ON_UNDERFLOW) |
2871
|
665850
|
|
|
|
|
&& (prog->offs[0].start < stringarg - strbeg)) |
2872
|
|
|
|
|
|
{ |
2873
|
|
|
|
|
|
/* this should only be possible under \G */ |
2874
|
0
|
|
|
|
|
assert(prog->extflags & RXf_GPOS_SEEN); |
2875
|
0
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
2876
|
|
|
|
|
|
"matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); |
2877
|
|
|
|
|
|
goto phooey; |
2878
|
|
|
|
|
|
} |
2879
|
|
|
|
|
|
|
2880
|
108870659
|
|
|
|
|
DEBUG_BUFFERS_r( |
2881
|
|
|
|
|
|
if (swap) |
2882
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2883
|
|
|
|
|
|
"rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n", |
2884
|
|
|
|
|
|
PTR2UV(prog), |
2885
|
|
|
|
|
|
PTR2UV(swap) |
2886
|
|
|
|
|
|
); |
2887
|
|
|
|
|
|
); |
2888
|
86308
|
|
|
|
|
Safefree(swap); |
2889
|
|
|
|
|
|
|
2890
|
|
|
|
|
|
/* clean up; this will trigger destructors that will free all slabs |
2891
|
|
|
|
|
|
* above the current one, and cleanup the regmatch_info_aux |
2892
|
|
|
|
|
|
* and regmatch_info_aux_eval sructs */ |
2893
|
|
|
|
|
|
|
2894
|
108784393
|
|
|
|
|
LEAVE_SCOPE(oldsave); |
2895
|
|
|
|
|
|
|
2896
|
290470
|
|
|
|
|
if (RXp_PAREN_NAMES(prog)) |
2897
|
240938
|
|
|
|
|
(void)hv_iterinit(RXp_PAREN_NAMES(prog)); |
2898
|
|
|
|
|
|
|
2899
|
241022
|
|
|
|
|
RX_MATCH_UTF8_set(rx, utf8_target); |
2900
|
|
|
|
|
|
|
2901
|
|
|
|
|
|
/* make sure $`, $&, $', and $digit will work later */ |
2902
|
290470
|
|
|
|
|
if ( !(flags & REXEC_NOT_FIRST) ) |
2903
|
435684
|
|
|
|
|
S_reg_set_capture_string(aTHX_ rx, |
2904
|
|
|
|
|
|
strbeg, reginfo->strend, |
2905
|
|
|
|
|
|
sv, flags, utf8_target); |
2906
|
|
|
|
|
|
|
2907
|
|
|
|
|
|
return 1; |
2908
|
|
|
|
|
|
|
2909
|
|
|
|
|
|
phooey: |
2910
|
240726
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", |
2911
|
|
|
|
|
|
PL_colors[4], PL_colors[5])); |
2912
|
|
|
|
|
|
|
2913
|
|
|
|
|
|
/* clean up; this will trigger destructors that will free all slabs |
2914
|
|
|
|
|
|
* above the current one, and cleanup the regmatch_info_aux |
2915
|
|
|
|
|
|
* and regmatch_info_aux_eval sructs */ |
2916
|
|
|
|
|
|
|
2917
|
410792
|
|
|
|
|
LEAVE_SCOPE(oldsave); |
2918
|
|
|
|
|
|
|
2919
|
49706
|
|
|
|
|
if (swap) { |
2920
|
|
|
|
|
|
/* we failed :-( roll it back */ |
2921
|
108493923
|
|
|
|
|
DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, |
2922
|
|
|
|
|
|
"rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n", |
2923
|
|
|
|
|
|
PTR2UV(prog), |
2924
|
|
|
|
|
|
PTR2UV(prog->offs), |
2925
|
|
|
|
|
|
PTR2UV(swap) |
2926
|
|
|
|
|
|
)); |
2927
|
10084
|
|
|
|
|
Safefree(prog->offs); |
2928
|
0
|
|
|
|
|
prog->offs = swap; |
2929
|
|
|
|
|
|
} |
2930
|
|
|
|
|
|
return 0; |
2931
|
|
|
|
|
|
} |
2932
|
|
|
|
|
|
|
2933
|
|
|
|
|
|
|
2934
|
|
|
|
|
|
/* Set which rex is pointed to by PL_reg_curpm, handling ref counting. |
2935
|
|
|
|
|
|
* Do inc before dec, in case old and new rex are the same */ |
2936
|
|
|
|
|
|
#define SET_reg_curpm(Re2) \ |
2937
|
|
|
|
|
|
if (reginfo->info_aux_eval) { \ |
2938
|
|
|
|
|
|
(void)ReREFCNT_inc(Re2); \ |
2939
|
|
|
|
|
|
ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ |
2940
|
|
|
|
|
|
PM_SETRE((PL_reg_curpm), (Re2)); \ |
2941
|
|
|
|
|
|
} |
2942
|
|
|
|
|
|
|
2943
|
|
|
|
|
|
|
2944
|
|
|
|
|
|
/* |
2945
|
|
|
|
|
|
- regtry - try match at specific point |
2946
|
|
|
|
|
|
*/ |
2947
|
|
|
|
|
|
STATIC I32 /* 0 failure, 1 success */ |
2948
|
108483881
|
|
|
|
|
S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) |
2949
|
|
|
|
|
|
{ |
2950
|
|
|
|
|
|
dVAR; |
2951
|
|
|
|
|
|
CHECKPOINT lastcp; |
2952
|
108677270
|
|
|
|
|
REGEXP *const rx = reginfo->prog; |
2953
|
229782259
|
|
|
|
|
regexp *const prog = ReANY(rx); |
2954
|
|
|
|
|
|
SSize_t result; |
2955
|
229782259
|
|
|
|
|
RXi_GET_DECL(prog,progi); |
2956
|
229782259
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
2957
|
|
|
|
|
|
|
2958
|
229782259
|
|
|
|
|
PERL_ARGS_ASSERT_REGTRY; |
2959
|
|
|
|
|
|
|
2960
|
229782259
|
|
|
|
|
reginfo->cutpoint=NULL; |
2961
|
|
|
|
|
|
|
2962
|
229782259
|
|
|
|
|
prog->offs[0].start = *startposp - reginfo->strbeg; |
2963
|
229782259
|
|
|
|
|
prog->lastparen = 0; |
2964
|
229782259
|
|
|
|
|
prog->lastcloseparen = 0; |
2965
|
|
|
|
|
|
|
2966
|
|
|
|
|
|
/* XXXX What this code is doing here?!!! There should be no need |
2967
|
|
|
|
|
|
to do this again and again, prog->lastparen should take care of |
2968
|
|
|
|
|
|
this! --ilya*/ |
2969
|
|
|
|
|
|
|
2970
|
|
|
|
|
|
/* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. |
2971
|
|
|
|
|
|
* Actually, the code in regcppop() (which Ilya may be meaning by |
2972
|
|
|
|
|
|
* prog->lastparen), is not needed at all by the test suite |
2973
|
|
|
|
|
|
* (op/regexp, op/pat, op/split), but that code is needed otherwise |
2974
|
|
|
|
|
|
* this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ |
2975
|
|
|
|
|
|
* Meanwhile, this code *is* needed for the |
2976
|
|
|
|
|
|
* above-mentioned test suite tests to succeed. The common theme |
2977
|
|
|
|
|
|
* on those tests seems to be returning null fields from matches. |
2978
|
|
|
|
|
|
* --jhi updated by dapm */ |
2979
|
|
|
|
|
|
#if 1 |
2980
|
229782259
|
|
|
|
|
if (prog->nparens) { |
2981
|
229782221
|
|
|
|
|
regexp_paren_pair *pp = prog->offs; |
2982
|
|
|
|
|
|
I32 i; |
2983
|
229782225
|
|
|
|
|
for (i = prog->nparens; i > (I32)prog->lastparen; i--) { |
2984
|
229782221
|
|
|
|
|
++pp; |
2985
|
1749205844
|
|
|
|
|
pp->start = -1; |
2986
|
1634564290
|
|
|
|
|
pp->end = -1; |
2987
|
|
|
|
|
|
} |
2988
|
|
|
|
|
|
} |
2989
|
|
|
|
|
|
#endif |
2990
|
1634564328
|
|
|
|
|
REGCP_SET(lastcp); |
2991
|
1909591423
|
|
|
|
|
result = regmatch(reginfo, *startposp, progi->program + 1); |
2992
|
2184219214
|
|
|
|
|
if (result != -1) { |
2993
|
2184219214
|
|
|
|
|
prog->offs[0].end = result; |
2994
|
151487096
|
|
|
|
|
return 1; |
2995
|
|
|
|
|
|
} |
2996
|
5549990
|
|
|
|
|
if (reginfo->cutpoint) |
2997
|
5267184
|
|
|
|
|
*startposp= reginfo->cutpoint; |
2998
|
1310327
|
|
|
|
|
REGCP_UNWIND(lastcp); |
2999
|
|
|
|
|
|
return 0; |
3000
|
|
|
|
|
|
} |
3001
|
|
|
|
|
|
|
3002
|
|
|
|
|
|
|
3003
|
|
|
|
|
|
#define sayYES goto yes |
3004
|
|
|
|
|
|
#define sayNO goto no |
3005
|
|
|
|
|
|
#define sayNO_SILENT goto no_silent |
3006
|
|
|
|
|
|
|
3007
|
|
|
|
|
|
/* we dont use STMT_START/END here because it leads to |
3008
|
|
|
|
|
|
"unreachable code" warnings, which are bogus, but distracting. */ |
3009
|
|
|
|
|
|
#define CACHEsayNO \ |
3010
|
|
|
|
|
|
if (ST.cache_mask) \ |
3011
|
|
|
|
|
|
reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ |
3012
|
|
|
|
|
|
sayNO |
3013
|
|
|
|
|
|
|
3014
|
|
|
|
|
|
/* this is used to determine how far from the left messages like |
3015
|
|
|
|
|
|
'failed...' are printed. It should be set such that messages |
3016
|
|
|
|
|
|
are inline with the regop output that created them. |
3017
|
|
|
|
|
|
*/ |
3018
|
|
|
|
|
|
#define REPORT_CODE_OFF 32 |
3019
|
|
|
|
|
|
|
3020
|
|
|
|
|
|
|
3021
|
|
|
|
|
|
#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ |
3022
|
|
|
|
|
|
#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ |
3023
|
|
|
|
|
|
#define CHRTEST_NOT_A_CP_1 -999 |
3024
|
|
|
|
|
|
#define CHRTEST_NOT_A_CP_2 -998 |
3025
|
|
|
|
|
|
|
3026
|
|
|
|
|
|
/* grab a new slab and return the first slot in it */ |
3027
|
|
|
|
|
|
|
3028
|
|
|
|
|
|
STATIC regmatch_state * |
3029
|
12804828
|
|
|
|
|
S_push_slab(pTHX) |
3030
|
|
|
|
|
|
{ |
3031
|
|
|
|
|
|
#if PERL_VERSION < 9 && !defined(PERL_CORE) |
3032
|
|
|
|
|
|
dMY_CXT; |
3033
|
|
|
|
|
|
#endif |
3034
|
304
|
|
|
|
|
regmatch_slab *s = PL_regmatch_slab->next; |
3035
|
304
|
|
|
|
|
if (!s) { |
3036
|
304
|
|
|
|
|
Newx(s, 1, regmatch_slab); |
3037
|
0
|
|
|
|
|
s->prev = PL_regmatch_slab; |
3038
|
0
|
|
|
|
|
s->next = NULL; |
3039
|
8866578
|
|
|
|
|
PL_regmatch_slab->next = s; |
3040
|
|
|
|
|
|
} |
3041
|
34844664
|
|
|
|
|
PL_regmatch_slab = s; |
3042
|
11191980
|
|
|
|
|
return SLAB_FIRST(s); |
3043
|
|
|
|
|
|
} |
3044
|
|
|
|
|
|
|
3045
|
|
|
|
|
|
|
3046
|
|
|
|
|
|
/* push a new state then goto it */ |
3047
|
|
|
|
|
|
|
3048
|
|
|
|
|
|
#define PUSH_STATE_GOTO(state, node, input) \ |
3049
|
|
|
|
|
|
pushinput = input; \ |
3050
|
|
|
|
|
|
scan = node; \ |
3051
|
|
|
|
|
|
st->resume_state = state; \ |
3052
|
|
|
|
|
|
goto push_state; |
3053
|
|
|
|
|
|
|
3054
|
|
|
|
|
|
/* push a new state with success backtracking, then goto it */ |
3055
|
|
|
|
|
|
|
3056
|
|
|
|
|
|
#define PUSH_YES_STATE_GOTO(state, node, input) \ |
3057
|
|
|
|
|
|
pushinput = input; \ |
3058
|
|
|
|
|
|
scan = node; \ |
3059
|
|
|
|
|
|
st->resume_state = state; \ |
3060
|
|
|
|
|
|
goto push_yes_state; |
3061
|
|
|
|
|
|
|
3062
|
|
|
|
|
|
|
3063
|
|
|
|
|
|
|
3064
|
|
|
|
|
|
|
3065
|
|
|
|
|
|
/* |
3066
|
|
|
|
|
|
|
3067
|
|
|
|
|
|
regmatch() - main matching routine |
3068
|
|
|
|
|
|
|
3069
|
|
|
|
|
|
This is basically one big switch statement in a loop. We execute an op, |
3070
|
|
|
|
|
|
set 'next' to point the next op, and continue. If we come to a point which |
3071
|
|
|
|
|
|
we may need to backtrack to on failure such as (A|B|C), we push a |
3072
|
|
|
|
|
|
backtrack state onto the backtrack stack. On failure, we pop the top |
3073
|
|
|
|
|
|
state, and re-enter the loop at the state indicated. If there are no more |
3074
|
|
|
|
|
|
states to pop, we return failure. |
3075
|
|
|
|
|
|
|
3076
|
|
|
|
|
|
Sometimes we also need to backtrack on success; for example /A+/, where |
3077
|
|
|
|
|
|
after successfully matching one A, we need to go back and try to |
3078
|
|
|
|
|
|
match another one; similarly for lookahead assertions: if the assertion |
3079
|
|
|
|
|
|
completes successfully, we backtrack to the state just before the assertion |
3080
|
|
|
|
|
|
and then carry on. In these cases, the pushed state is marked as |
3081
|
|
|
|
|
|
'backtrack on success too'. This marking is in fact done by a chain of |
3082
|
|
|
|
|
|
pointers, each pointing to the previous 'yes' state. On success, we pop to |
3083
|
|
|
|
|
|
the nearest yes state, discarding any intermediate failure-only states. |
3084
|
|
|
|
|
|
Sometimes a yes state is pushed just to force some cleanup code to be |
3085
|
|
|
|
|
|
called at the end of a successful match or submatch; e.g. (??{$re}) uses |
3086
|
|
|
|
|
|
it to free the inner regex. |
3087
|
|
|
|
|
|
|
3088
|
|
|
|
|
|
Note that failure backtracking rewinds the cursor position, while |
3089
|
|
|
|
|
|
success backtracking leaves it alone. |
3090
|
|
|
|
|
|
|
3091
|
|
|
|
|
|
A pattern is complete when the END op is executed, while a subpattern |
3092
|
|
|
|
|
|
such as (?=foo) is complete when the SUCCESS op is executed. Both of these |
3093
|
|
|
|
|
|
ops trigger the "pop to last yes state if any, otherwise return true" |
3094
|
|
|
|
|
|
behaviour. |
3095
|
|
|
|
|
|
|
3096
|
|
|
|
|
|
A common convention in this function is to use A and B to refer to the two |
3097
|
|
|
|
|
|
subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is |
3098
|
|
|
|
|
|
the subpattern to be matched possibly multiple times, while B is the entire |
3099
|
|
|
|
|
|
rest of the pattern. Variable and state names reflect this convention. |
3100
|
|
|
|
|
|
|
3101
|
|
|
|
|
|
The states in the main switch are the union of ops and failure/success of |
3102
|
|
|
|
|
|
substates associated with with that op. For example, IFMATCH is the op |
3103
|
|
|
|
|
|
that does lookahead assertions /(?=A)B/ and so the IFMATCH state means |
3104
|
|
|
|
|
|
'execute IFMATCH'; while IFMATCH_A is a state saying that we have just |
3105
|
|
|
|
|
|
successfully matched A and IFMATCH_A_fail is a state saying that we have |
3106
|
|
|
|
|
|
just failed to match A. Resume states always come in pairs. The backtrack |
3107
|
|
|
|
|
|
state we push is marked as 'IFMATCH_A', but when that is popped, we resume |
3108
|
|
|
|
|
|
at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking |
3109
|
|
|
|
|
|
on success or failure. |
3110
|
|
|
|
|
|
|
3111
|
|
|
|
|
|
The struct that holds a backtracking state is actually a big union, with |
3112
|
|
|
|
|
|
one variant for each major type of op. The variable st points to the |
3113
|
|
|
|
|
|
top-most backtrack struct. To make the code clearer, within each |
3114
|
|
|
|
|
|
block of code we #define ST to alias the relevant union. |
3115
|
|
|
|
|
|
|
3116
|
|
|
|
|
|
Here's a concrete example of a (vastly oversimplified) IFMATCH |
3117
|
|
|
|
|
|
implementation: |
3118
|
|
|
|
|
|
|
3119
|
|
|
|
|
|
switch (state) { |
3120
|
|
|
|
|
|
.... |
3121
|
|
|
|
|
|
|
3122
|
|
|
|
|
|
#define ST st->u.ifmatch |
3123
|
|
|
|
|
|
|
3124
|
|
|
|
|
|
case IFMATCH: // we are executing the IFMATCH op, (?=A)B |
3125
|
|
|
|
|
|
ST.foo = ...; // some state we wish to save |
3126
|
|
|
|
|
|
... |
3127
|
|
|
|
|
|
// push a yes backtrack state with a resume value of |
3128
|
|
|
|
|
|
// IFMATCH_A/IFMATCH_A_fail, then continue execution at the |
3129
|
|
|
|
|
|
// first node of A: |
3130
|
|
|
|
|
|
PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput); |
3131
|
|
|
|
|
|
// NOTREACHED |
3132
|
|
|
|
|
|
|
3133
|
|
|
|
|
|
case IFMATCH_A: // we have successfully executed A; now continue with B |
3134
|
|
|
|
|
|
next = B; |
3135
|
|
|
|
|
|
bar = ST.foo; // do something with the preserved value |
3136
|
|
|
|
|
|
break; |
3137
|
|
|
|
|
|
|
3138
|
|
|
|
|
|
case IFMATCH_A_fail: // A failed, so the assertion failed |
3139
|
|
|
|
|
|
...; // do some housekeeping, then ... |
3140
|
|
|
|
|
|
sayNO; // propagate the failure |
3141
|
|
|
|
|
|
|
3142
|
|
|
|
|
|
#undef ST |
3143
|
|
|
|
|
|
|
3144
|
|
|
|
|
|
... |
3145
|
|
|
|
|
|
} |
3146
|
|
|
|
|
|
|
3147
|
|
|
|
|
|
For any old-timers reading this who are familiar with the old recursive |
3148
|
|
|
|
|
|
approach, the code above is equivalent to: |
3149
|
|
|
|
|
|
|
3150
|
|
|
|
|
|
case IFMATCH: // we are executing the IFMATCH op, (?=A)B |
3151
|
|
|
|
|
|
{ |
3152
|
|
|
|
|
|
int foo = ... |
3153
|
|
|
|
|
|
... |
3154
|
|
|
|
|
|
if (regmatch(A)) { |
3155
|
|
|
|
|
|
next = B; |
3156
|
|
|
|
|
|
bar = foo; |
3157
|
|
|
|
|
|
break; |
3158
|
|
|
|
|
|
} |
3159
|
|
|
|
|
|
...; // do some housekeeping, then ... |
3160
|
|
|
|
|
|
sayNO; // propagate the failure |
3161
|
|
|
|
|
|
} |
3162
|
|
|
|
|
|
|
3163
|
|
|
|
|
|
The topmost backtrack state, pointed to by st, is usually free. If you |
3164
|
|
|
|
|
|
want to claim it, populate any ST.foo fields in it with values you wish to |
3165
|
|
|
|
|
|
save, then do one of |
3166
|
|
|
|
|
|
|
3167
|
|
|
|
|
|
PUSH_STATE_GOTO(resume_state, node, newinput); |
3168
|
|
|
|
|
|
PUSH_YES_STATE_GOTO(resume_state, node, newinput); |
3169
|
|
|
|
|
|
|
3170
|
|
|
|
|
|
which sets that backtrack state's resume value to 'resume_state', pushes a |
3171
|
|
|
|
|
|
new free entry to the top of the backtrack stack, then goes to 'node'. |
3172
|
|
|
|
|
|
On backtracking, the free slot is popped, and the saved state becomes the |
3173
|
|
|
|
|
|
new free state. An ST.foo field in this new top state can be temporarily |
3174
|
|
|
|
|
|
accessed to retrieve values, but once the main loop is re-entered, it |
3175
|
|
|
|
|
|
becomes available for reuse. |
3176
|
|
|
|
|
|
|
3177
|
|
|
|
|
|
Note that the depth of the backtrack stack constantly increases during the |
3178
|
|
|
|
|
|
left-to-right execution of the pattern, rather than going up and down with |
3179
|
|
|
|
|
|
the pattern nesting. For example the stack is at its maximum at Z at the |
3180
|
|
|
|
|
|
end of the pattern, rather than at X in the following: |
3181
|
|
|
|
|
|
|
3182
|
|
|
|
|
|
/(((X)+)+)+....(Y)+....Z/ |
3183
|
|
|
|
|
|
|
3184
|
|
|
|
|
|
The only exceptions to this are lookahead/behind assertions and the cut, |
3185
|
|
|
|
|
|
(?>A), which pop all the backtrack states associated with A before |
3186
|
|
|
|
|
|
continuing. |
3187
|
|
|
|
|
|
|
3188
|
|
|
|
|
|
Backtrack state structs are allocated in slabs of about 4K in size. |
3189
|
|
|
|
|
|
PL_regmatch_state and st always point to the currently active state, |
3190
|
|
|
|
|
|
and PL_regmatch_slab points to the slab currently containing |
3191
|
|
|
|
|
|
PL_regmatch_state. The first time regmatch() is called, the first slab is |
3192
|
|
|
|
|
|
allocated, and is never freed until interpreter destruction. When the slab |
3193
|
|
|
|
|
|
is full, a new one is allocated and chained to the end. At exit from |
3194
|
|
|
|
|
|
regmatch(), slabs allocated since entry are freed. |
3195
|
|
|
|
|
|
|
3196
|
|
|
|
|
|
*/ |
3197
|
|
|
|
|
|
|
3198
|
|
|
|
|
|
|
3199
|
|
|
|
|
|
#define DEBUG_STATE_pp(pp) \ |
3200
|
|
|
|
|
|
DEBUG_STATE_r({ \ |
3201
|
|
|
|
|
|
DUMP_EXEC_POS(locinput, scan, utf8_target); \ |
3202
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, \ |
3203
|
|
|
|
|
|
" %*s"pp" %s%s%s%s%s\n", \ |
3204
|
|
|
|
|
|
depth*2, "", \ |
3205
|
|
|
|
|
|
PL_reg_name[st->resume_state], \ |
3206
|
|
|
|
|
|
((st==yes_state||st==mark_state) ? "[" : ""), \ |
3207
|
|
|
|
|
|
((st==yes_state) ? "Y" : ""), \ |
3208
|
|
|
|
|
|
((st==mark_state) ? "M" : ""), \ |
3209
|
|
|
|
|
|
((st==yes_state||st==mark_state) ? "]" : "") \ |
3210
|
|
|
|
|
|
); \ |
3211
|
|
|
|
|
|
}); |
3212
|
|
|
|
|
|
|
3213
|
|
|
|
|
|
|
3214
|
|
|
|
|
|
#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1) |
3215
|
|
|
|
|
|
|
3216
|
|
|
|
|
|
#ifdef DEBUGGING |
3217
|
|
|
|
|
|
|
3218
|
|
|
|
|
|
STATIC void |
3219
|
6828698
|
|
|
|
|
S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, |
3220
|
|
|
|
|
|
const char *start, const char *end, const char *blurb) |
3221
|
|
|
|
|
|
{ |
3222
|
1122616
|
|
|
|
|
const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; |
3223
|
|
|
|
|
|
|
3224
|
248
|
|
|
|
|
PERL_ARGS_ASSERT_DEBUG_START_MATCH; |
3225
|
|
|
|
|
|
|
3226
|
244
|
|
|
|
|
if (!PL_colorset) |
3227
|
152
|
|
|
|
|
reginitcolors(); |
3228
|
|
|
|
|
|
{ |
3229
|
24455914
|
|
|
|
|
RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), |
3230
|
|
|
|
|
|
RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); |
3231
|
|
|
|
|
|
|
3232
|
10937458
|
|
|
|
|
RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), |
3233
|
|
|
|
|
|
start, end - start, 60); |
3234
|
|
|
|
|
|
|
3235
|
45403611
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
3236
|
|
|
|
|
|
"%s%s REx%s %s against %s\n", |
3237
|
|
|
|
|
|
PL_colors[4], blurb, PL_colors[5], s0, s1); |
3238
|
|
|
|
|
|
|
3239
|
45403611
|
|
|
|
|
if (utf8_target||utf8_pat) |
3240
|
45403573
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", |
3241
|
|
|
|
|
|
utf8_pat ? "pattern" : "", |
3242
|
45403545
|
|
|
|
|
utf8_pat && utf8_target ? " and " : "", |
3243
|
|
|
|
|
|
utf8_target ? "string" : "" |
3244
|
|
|
|
|
|
); |
3245
|
|
|
|
|
|
} |
3246
|
45403611
|
|
|
|
|
} |
3247
|
|
|
|
|
|
|
3248
|
|
|
|
|
|
STATIC void |
3249
|
45403697
|
|
|
|
|
S_dump_exec_pos(pTHX_ const char *locinput, |
3250
|
|
|
|
|
|
const regnode *scan, |
3251
|
|
|
|
|
|
const char *loc_regeol, |
3252
|
|
|
|
|
|
const char *loc_bostr, |
3253
|
|
|
|
|
|
const char *loc_reg_starttry, |
3254
|
|
|
|
|
|
const bool utf8_target) |
3255
|
|
|
|
|
|
{ |
3256
|
42833481
|
|
|
|
|
const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; |
3257
|
38333452
|
|
|
|
|
const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ |
3258
|
40024
|
|
|
|
|
int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); |
3259
|
|
|
|
|
|
/* The part of the string before starttry has one color |
3260
|
|
|
|
|
|
(pref0_len chars), between starttry and current |
3261
|
|
|
|
|
|
position another one (pref_len - pref0_len chars), |
3262
|
|
|
|
|
|
after the current position the third one. |
3263
|
|
|
|
|
|
We assume that pref0_len <= pref_len, otherwise we |
3264
|
|
|
|
|
|
decrease pref0_len. */ |
3265
|
7070561
|
|
|
|
|
int pref_len = (locinput - loc_bostr) > (5 + taill) - l |
3266
|
7070561
|
|
|
|
|
? (5 + taill) - l : locinput - loc_bostr; |
3267
|
|
|
|
|
|
int pref0_len; |
3268
|
|
|
|
|
|
|
3269
|
7070561
|
|
|
|
|
PERL_ARGS_ASSERT_DUMP_EXEC_POS; |
3270
|
|
|
|
|
|
|
3271
|
7070561
|
|
|
|
|
while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) |
3272
|
7070381
|
|
|
|
|
pref_len++; |
3273
|
7070561
|
|
|
|
|
pref0_len = pref_len - (locinput - loc_reg_starttry); |
3274
|
7070561
|
|
|
|
|
if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) |
3275
|
46821545
|
|
|
|
|
l = ( loc_regeol - locinput > (5 + taill) - pref_len |
3276
|
36216692
|
|
|
|
|
? (5 + taill) - pref_len : loc_regeol - locinput); |
3277
|
36216870
|
|
|
|
|
while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) |
3278
|
36216690
|
|
|
|
|
l--; |
3279
|
36216870
|
|
|
|
|
if (pref0_len < 0) |
3280
|
|
|
|
|
|
pref0_len = 0; |
3281
|
4016683
|
|
|
|
|
if (pref0_len > pref_len) |
3282
|
|
|
|
|
|
pref0_len = pref_len; |
3283
|
|
|
|
|
|
{ |
3284
|
3951475
|
|
|
|
|
const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0; |
3285
|
|
|
|
|
|
|
3286
|
11228
|
|
|
|
|
RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), |
3287
|
|
|
|
|
|
(locinput - pref_len),pref0_len, 60, 4, 5); |
3288
|
|
|
|
|
|
|
3289
|
11228
|
|
|
|
|
RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), |
3290
|
|
|
|
|
|
(locinput - pref_len + pref0_len), |
3291
|
|
|
|
|
|
pref_len - pref0_len, 60, 2, 3); |
3292
|
|
|
|
|
|
|
3293
|
3940427
|
|
|
|
|
RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), |
3294
|
|
|
|
|
|
locinput, loc_regeol - locinput, 10, 0, 1); |
3295
|
|
|
|
|
|
|
3296
|
3940427
|
|
|
|
|
const STRLEN tlen=len0+len1+len2; |
3297
|
4016863
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
3298
|
|
|
|
|
|
"%4"IVdf" <%.*s%.*s%s%.*s>%*s|", |
3299
|
|
|
|
|
|
(IV)(locinput - loc_bostr), |
3300
|
|
|
|
|
|
len0, s0, |
3301
|
|
|
|
|
|
len1, s1, |
3302
|
|
|
|
|
|
(docolor ? "" : "> <"), |
3303
|
|
|
|
|
|
len2, s2, |
3304
|
4003609
|
|
|
|
|
(int)(tlen > 19 ? 0 : 19 - tlen), |
3305
|
|
|
|
|
|
""); |
3306
|
|
|
|
|
|
} |
3307
|
4016683
|
|
|
|
|
} |
3308
|
|
|
|
|
|
|
3309
|
|
|
|
|
|
#endif |
3310
|
|
|
|
|
|
|
3311
|
|
|
|
|
|
/* reg_check_named_buff_matched() |
3312
|
|
|
|
|
|
* Checks to see if a named buffer has matched. The data array of |
3313
|
|
|
|
|
|
* buffer numbers corresponding to the buffer is expected to reside |
3314
|
|
|
|
|
|
* in the regexp->data->data array in the slot stored in the ARG() of |
3315
|
|
|
|
|
|
* node involved. Note that this routine doesn't actually care about the |
3316
|
|
|
|
|
|
* name, that information is not preserved from compilation to execution. |
3317
|
|
|
|
|
|
* Returns the index of the leftmost defined buffer with the given name |
3318
|
|
|
|
|
|
* or 0 if non of the buffers matched. |
3319
|
|
|
|
|
|
*/ |
3320
|
|
|
|
|
|
STATIC I32 |
3321
|
68422759
|
|
|
|
|
S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) |
3322
|
|
|
|
|
|
{ |
3323
|
|
|
|
|
|
I32 n; |
3324
|
32206069
|
|
|
|
|
RXi_GET_DECL(rex,rexi); |
3325
|
32206069
|
|
|
|
|
SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); |
3326
|
32206069
|
|
|
|
|
I32 *nums=(I32*)SvPVX(sv_dat); |
3327
|
|
|
|
|
|
|
3328
|
13056
|
|
|
|
|
PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; |
3329
|
|
|
|
|
|
|
3330
|
47273609
|
|
|
|
|
for ( n=0; n
|
3331
|
30142261
|
|
|
|
|
if ((I32)rex->lastparen >= nums[n] && |
3332
|
30142261
|
|
|
|
|
rex->offs[nums[n]].end != -1) |
3333
|
|
|
|
|
|
{ |
3334
|
30142137
|
|
|
|
|
return nums[n]; |
3335
|
|
|
|
|
|
} |
3336
|
|
|
|
|
|
} |
3337
|
|
|
|
|
|
return 0; |
3338
|
|
|
|
|
|
} |
3339
|
|
|
|
|
|
|
3340
|
|
|
|
|
|
|
3341
|
|
|
|
|
|
static bool |
3342
|
30129601
|
|
|
|
|
S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, |
3343
|
|
|
|
|
|
U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo) |
3344
|
|
|
|
|
|
{ |
3345
|
|
|
|
|
|
/* This function determines if there are one or two characters that match |
3346
|
|
|
|
|
|
* the first character of the passed-in EXACTish node , and if |
3347
|
|
|
|
|
|
* so, returns them in the passed-in pointers. |
3348
|
|
|
|
|
|
* |
3349
|
|
|
|
|
|
* If it determines that no possible character in the target string can |
3350
|
|
|
|
|
|
* match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if |
3351
|
|
|
|
|
|
* the first character in requires UTF-8 to represent, and the |
3352
|
|
|
|
|
|
* target string isn't in UTF-8.) |
3353
|
|
|
|
|
|
* |
3354
|
|
|
|
|
|
* If there are more than two characters that could match the beginning of |
3355
|
|
|
|
|
|
* , or if more context is required to determine a match or not, |
3356
|
|
|
|
|
|
* it sets both * and * to CHRTEST_VOID. |
3357
|
|
|
|
|
|
* |
3358
|
|
|
|
|
|
* The motiviation behind this function is to allow the caller to set up |
3359
|
|
|
|
|
|
* tight loops for matching. If is of type EXACT, there is |
3360
|
|
|
|
|
|
* only one possible character that can match its first character, and so |
3361
|
|
|
|
|
|
* the situation is quite simple. But things get much more complicated if |
3362
|
|
|
|
|
|
* folding is involved. It may be that the first character of an EXACTFish |
3363
|
|
|
|
|
|
* node doesn't participate in any possible fold, e.g., punctuation, so it |
3364
|
|
|
|
|
|
* can be matched only by itself. The vast majority of characters that are |
3365
|
|
|
|
|
|
* in folds match just two things, their lower and upper-case equivalents. |
3366
|
|
|
|
|
|
* But not all are like that; some have multiple possible matches, or match |
3367
|
|
|
|
|
|
* sequences of more than one character. This function sorts all that out. |
3368
|
|
|
|
|
|
* |
3369
|
|
|
|
|
|
* Consider the patterns A*B or A*?B where A and B are arbitrary. In a |
3370
|
|
|
|
|
|
* loop of trying to match A*, we know we can't exit where the thing |
3371
|
|
|
|
|
|
* following it isn't a B. And something can't be a B unless it is the |
3372
|
|
|
|
|
|
* beginning of B. By putting a quick test for that beginning in a tight |
3373
|
|
|
|
|
|
* loop, we can rule out things that can't possibly be B without having to |
3374
|
|
|
|
|
|
* break out of the loop, thus avoiding work. Similarly, if A is a single |
3375
|
|
|
|
|
|
* character, we can make a tight loop matching A*, using the outputs of |
3376
|
|
|
|
|
|
* this function. |
3377
|
|
|
|
|
|
* |
3378
|
|
|
|
|
|
* If the target string to match isn't in UTF-8, and there aren't |
3379
|
|
|
|
|
|
* complications which require CHRTEST_VOID, * and * are set to |
3380
|
|
|
|
|
|
* the one or two possible octets (which are characters in this situation) |
3381
|
|
|
|
|
|
* that can match. In all cases, if there is only one character that can |
3382
|
|
|
|
|
|
* match, * and * will be identical. |
3383
|
|
|
|
|
|
* |
3384
|
|
|
|
|
|
* If the target string is in UTF-8, the buffers pointed to by |
3385
|
|
|
|
|
|
* and will contain the one or two UTF-8 sequences of bytes that |
3386
|
|
|
|
|
|
* can match the beginning of . They should be declared with at |
3387
|
|
|
|
|
|
* least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is |
3388
|
|
|
|
|
|
* undefined what these contain.) If one or both of the buffers are |
3389
|
|
|
|
|
|
* invariant under UTF-8, *, and * will also be set to the |
3390
|
|
|
|
|
|
* corresponding invariant. If variant, the corresponding * and/or |
3391
|
|
|
|
|
|
* * will be set to a negative number(s) that shouldn't match any code |
3392
|
|
|
|
|
|
* point (unless inappropriately coerced to unsigned). * will equal |
3393
|
|
|
|
|
|
* * if and only if and are the same. */ |
3394
|
|
|
|
|
|
|
3395
|
29148265
|
|
|
|
|
const bool utf8_target = reginfo->is_utf8_target; |
3396
|
|
|
|
|
|
|
3397
|
|
|
|
|
|
UV c1 = CHRTEST_NOT_A_CP_1; |
3398
|
|
|
|
|
|
UV c2 = CHRTEST_NOT_A_CP_2; |
3399
|
|
|
|
|
|
bool use_chrtest_void = FALSE; |
3400
|
32206069
|
|
|
|
|
const bool is_utf8_pat = reginfo->is_utf8_pat; |
3401
|
|
|
|
|
|
|
3402
|
|
|
|
|
|
/* Used when we have both utf8 input and utf8 output, to avoid converting |
3403
|
|
|
|
|
|
* to/from code points */ |
3404
|
|
|
|
|
|
bool utf8_has_been_setup = FALSE; |
3405
|
|
|
|
|
|
|
3406
|
|
|
|
|
|
dVAR; |
3407
|
|
|
|
|
|
|
3408
|
7070381
|
|
|
|
|
U8 *pat = (U8*)STRING(text_node); |
3409
|
|
|
|
|
|
|
3410
|
3951295
|
|
|
|
|
if (OP(text_node) == EXACT) { |
3411
|
|
|
|
|
|
|
3412
|
|
|
|
|
|
/* In an exact node, only one thing can be matched, that first |
3413
|
|
|
|
|
|
* character. If both the pat and the target are UTF-8, we can just |
3414
|
|
|
|
|
|
* copy the input to the output, avoiding finding the code point of |
3415
|
|
|
|
|
|
* that character */ |
3416
|
9967643
|
|
|
|
|
if (!is_utf8_pat) { |
3417
|
4041419
|
|
|
|
|
c2 = c1 = *pat; |
3418
|
|
|
|
|
|
} |
3419
|
4041419
|
|
|
|
|
else if (utf8_target) { |
3420
|
3951295
|
|
|
|
|
Copy(pat, c1_utf8, UTF8SKIP(pat), U8); |
3421
|
3951295
|
|
|
|
|
Copy(pat, c2_utf8, UTF8SKIP(pat), U8); |
3422
|
|
|
|
|
|
utf8_has_been_setup = TRUE; |
3423
|
|
|
|
|
|
} |
3424
|
|
|
|
|
|
else { |
3425
|
34150
|
|
|
|
|
c2 = c1 = valid_utf8_to_uvchr(pat, NULL); |
3426
|
|
|
|
|
|
} |
3427
|
|
|
|
|
|
} |
3428
|
|
|
|
|
|
else /* an EXACTFish node */ |
3429
|
21244
|
|
|
|
|
if ((is_utf8_pat |
3430
|
21457
|
|
|
|
|
&& is_MULTI_CHAR_FOLD_utf8_safe(pat, |
3431
|
|
|
|
|
|
pat + STR_LEN(text_node))) |
3432
|
34150
|
|
|
|
|
|| (!is_utf8_pat |
3433
|
34150
|
|
|
|
|
&& is_MULTI_CHAR_FOLD_latin1_safe(pat, |
3434
|
|
|
|
|
|
pat + STR_LEN(text_node)))) |
3435
|
|
|
|
|
|
{ |
3436
|
|
|
|
|
|
/* Multi-character folds require more context to sort out. Also |
3437
|
|
|
|
|
|
* PL_utf8_foldclosures used below doesn't handle them, so have to be |
3438
|
|
|
|
|
|
* handled outside this routine */ |
3439
|
|
|
|
|
|
use_chrtest_void = TRUE; |
3440
|
|
|
|
|
|
} |
3441
|
|
|
|
|
|
else { /* an EXACTFish node which doesn't begin with a multi-char fold */ |
3442
|
34150
|
|
|
|
|
c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; |
3443
|
34150
|
|
|
|
|
if (c1 > 256) { |
3444
|
|
|
|
|
|
/* Load the folds hash, if not already done */ |
3445
|
|
|
|
|
|
SV** listp; |
3446
|
154956
|
|
|
|
|
if (! PL_utf8_foldclosures) { |
3447
|
120806
|
|
|
|
|
if (! PL_utf8_tofold) { |
3448
|
|
|
|
|
|
U8 dummy[UTF8_MAXBYTES_CASE+1]; |
3449
|
|
|
|
|
|
|
3450
|
|
|
|
|
|
/* Force loading this by folding an above-Latin1 char */ |
3451
|
34150
|
|
|
|
|
to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); |
3452
|
3985445
|
|
|
|
|
assert(PL_utf8_tofold); /* Verify that worked */ |
3453
|
|
|
|
|
|
} |
3454
|
3985445
|
|
|
|
|
PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); |
3455
|
|
|
|
|
|
} |
3456
|
|
|
|
|
|
|
3457
|
|
|
|
|
|
/* The fold closures data structure is a hash with the keys being |
3458
|
|
|
|
|
|
* the UTF-8 of every character that is folded to, like 'k', and |
3459
|
|
|
|
|
|
* the values each an array of all code points that fold to its |
3460
|
|
|
|
|
|
* key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are |
3461
|
|
|
|
|
|
* not included */ |
3462
|
264046
|
|
|
|
|
if ((! (listp = hv_fetch(PL_utf8_foldclosures, |
3463
|
|
|
|
|
|
(char *) pat, |
3464
|
|
|
|
|
|
UTF8SKIP(pat), |
3465
|
|
|
|
|
|
FALSE)))) |
3466
|
|
|
|
|
|
{ |
3467
|
|
|
|
|
|
/* Not found in the hash, therefore there are no folds |
3468
|
|
|
|
|
|
* containing it, so there is only a single character that |
3469
|
|
|
|
|
|
* could match */ |
3470
|
|
|
|
|
|
c2 = c1; |
3471
|
|
|
|
|
|
} |
3472
|
|
|
|
|
|
else { /* Does participate in folds */ |
3473
|
264046
|
|
|
|
|
AV* list = (AV*) *listp; |
3474
|
264046
|
|
|
|
|
if (av_len(list) != 1) { |
3475
|
|
|
|
|
|
|
3476
|
|
|
|
|
|
/* If there aren't exactly two folds to this, it is outside |
3477
|
|
|
|
|
|
* the scope of this function */ |
3478
|
|
|
|
|
|
use_chrtest_void = TRUE; |
3479
|
|
|
|
|
|
} |
3480
|
|
|
|
|
|
else { /* There are two. Get them */ |
3481
|
3985445
|
|
|
|
|
SV** c_p = av_fetch(list, 0, FALSE); |
3482
|
3985445
|
|
|
|
|
if (c_p == NULL) { |
3483
|
7970890
|
|
|
|
|
Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); |
3484
|
|
|
|
|
|
} |
3485
|
3985445
|
|
|
|
|
c1 = SvUV(*c_p); |
3486
|
|
|
|
|
|
|
3487
|
3985445
|
|
|
|
|
c_p = av_fetch(list, 1, FALSE); |
3488
|
3985445
|
|
|
|
|
if (c_p == NULL) { |
3489
|
22112
|
|
|
|
|
Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); |
3490
|
|
|
|
|
|
} |
3491
|
11064
|
|
|
|
|
c2 = SvUV(*c_p); |
3492
|
|
|
|
|
|
|
3493
|
|
|
|
|
|
/* Folds that cross the 255/256 boundary are forbidden if |
3494
|
|
|
|
|
|
* EXACTFL, or EXACTFA and one is ASCIII. Since the |
3495
|
|
|
|
|
|
* pattern character is above 256, and its only other match |
3496
|
|
|
|
|
|
* is below 256, the only legal match will be to itself. |
3497
|
|
|
|
|
|
* We have thrown away the original, so have to compute |
3498
|
|
|
|
|
|
* which is the one above 255 */ |
3499
|
9104
|
|
|
|
|
if ((c1 < 256) != (c2 < 256)) { |
3500
|
9104
|
|
|
|
|
if (OP(text_node) == EXACTFL |
3501
|
1960
|
|
|
|
|
|| ((OP(text_node) == EXACTFA |
3502
|
1960
|
|
|
|
|
|| OP(text_node) == EXACTFA_NO_TRIE) |
3503
|
11064
|
|
|
|
|
&& (isASCII(c1) || isASCII(c2)))) |
3504
|
|
|
|
|
|
{ |
3505
|
35192
|
|
|
|
|
if (c1 < 256) { |
3506
|
|
|
|
|
|
c1 = c2; |
3507
|
|
|
|
|
|
} |
3508
|
|
|
|
|
|
else { |
3509
|
|
|
|
|
|
c2 = c1; |
3510
|
|
|
|
|
|
} |
3511
|
|
|
|
|
|
} |
3512
|
|
|
|
|
|
} |
3513
|
|
|
|
|
|
} |
3514
|
|
|
|
|
|
} |
3515
|
|
|
|
|
|
} |
3516
|
|
|
|
|
|
else /* Here, c1 is < 255 */ |
3517
|
24120
|
|
|
|
|
if (utf8_target |
3518
|
13072
|
|
|
|
|
&& HAS_NONLATIN1_FOLD_CLOSURE(c1) |
3519
|
13072
|
|
|
|
|
&& OP(text_node) != EXACTFL |
3520
|
13072
|
|
|
|
|
&& ((OP(text_node) != EXACTFA |
3521
|
3974397
|
|
|
|
|
&& OP(text_node) != EXACTFA_NO_TRIE) |
3522
|
98906
|
|
|
|
|
|| ! isASCII(c1))) |
3523
|
|
|
|
|
|
{ |
3524
|
|
|
|
|
|
/* Here, there could be something above Latin1 in the target which |
3525
|
|
|
|
|
|
* folds to this character in the pattern. All such cases except |
3526
|
|
|
|
|
|
* LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters |
3527
|
|
|
|
|
|
* involved in their folds, so are outside the scope of this |
3528
|
|
|
|
|
|
* function */ |
3529
|
14142
|
|
|
|
|
if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { |
3530
|
|
|
|
|
|
c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; |
3531
|
|
|
|
|
|
} |
3532
|
|
|
|
|
|
else { |
3533
|
|
|
|
|
|
use_chrtest_void = TRUE; |
3534
|
|
|
|
|
|
} |
3535
|
|
|
|
|
|
} |
3536
|
|
|
|
|
|
else { /* Here nothing above Latin1 can fold to the pattern character */ |
3537
|
3889633
|
|
|
|
|
switch (OP(text_node)) { |
3538
|
|
|
|
|
|
|
3539
|
|
|
|
|
|
case EXACTFL: /* /l rules */ |
3540
|
5977449
|
|
|
|
|
c2 = PL_fold_locale[c1]; |
3541
|
182238
|
|
|
|
|
break; |
3542
|
|
|
|
|
|
|
3543
|
|
|
|
|
|
case EXACTF: /* This node only generated for non-utf8 |
3544
|
|
|
|
|
|
patterns */ |
3545
|
3803207
|
|
|
|
|
assert(! is_utf8_pat); |
3546
|
3985445
|
|
|
|
|
if (! utf8_target) { /* /d rules */ |
3547
|
89066
|
|
|
|
|
c2 = PL_fold[c1]; |
3548
|
3896379
|
|
|
|
|
break; |
3549
|
|
|
|
|
|
} |
3550
|
|
|
|
|
|
/* FALLTHROUGH */ |
3551
|
|
|
|
|
|
/* /u rules for all these. This happens to work for |
3552
|
|
|
|
|
|
* EXACTFA as nothing in Latin1 folds to ASCII */ |
3553
|
|
|
|
|
|
case EXACTFA_NO_TRIE: /* This node only generated for |
3554
|
|
|
|
|
|
non-utf8 patterns */ |
3555
|
254384000
|
|
|
|
|
assert(! is_utf8_pat); |
3556
|
|
|
|
|
|
/* FALL THROUGH */ |
3557
|
|
|
|
|
|
case EXACTFA: |
3558
|
|
|
|
|
|
case EXACTFU_SS: |
3559
|
|
|
|
|
|
case EXACTFU: |
3560
|
254384000
|
|
|
|
|
c2 = PL_fold_latin1[c1]; |
3561
|
254384000
|
|
|
|
|
break; |
3562
|
|
|
|
|
|
|
3563
|
|
|
|
|
|
default: |
3564
|
839620
|
|
|
|
|
Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); |
3565
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
3566
|
|
|
|
|
|
} |
3567
|
|
|
|
|
|
} |
3568
|
|
|
|
|
|
} |
3569
|
|
|
|
|
|
|
3570
|
|
|
|
|
|
/* Here have figured things out. Set up the returns */ |
3571
|
839620
|
|
|
|
|
if (use_chrtest_void) { |
3572
|
1251868
|
|
|
|
|
*c2p = *c1p = CHRTEST_VOID; |
3573
|
|
|
|
|
|
} |
3574
|
855546
|
|
|
|
|
else if (utf8_target) { |
3575
|
802728
|
|
|
|
|
if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */ |
3576
|
796324
|
|
|
|
|
uvchr_to_utf8(c1_utf8, c1); |
3577
|
791964
|
|
|
|
|
uvchr_to_utf8(c2_utf8, c2); |
3578
|
|
|
|
|
|
} |
3579
|
|
|
|
|
|
|
3580
|
|
|
|
|
|
/* Invariants are stored in both the utf8 and byte outputs; Use |
3581
|
|
|
|
|
|
* negative numbers otherwise for the byte ones. Make sure that the |
3582
|
|
|
|
|
|
* byte ones are the same iff the utf8 ones are the same */ |
3583
|
558280
|
|
|
|
|
*c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1; |
3584
|
4360
|
|
|
|
|
*c2p = (UTF8_IS_INVARIANT(*c2_utf8)) |
3585
|
1404
|
|
|
|
|
? *c2_utf8 |
3586
|
559684
|
|
|
|
|
: (c1 == c2) |
3587
|
|
|
|
|
|
? CHRTEST_NOT_A_CP_1 |
3588
|
474421
|
|
|
|
|
: CHRTEST_NOT_A_CP_2; |
3589
|
|
|
|
|
|
} |
3590
|
148864
|
|
|
|
|
else if (c1 > 255) { |
3591
|
135068
|
|
|
|
|
if (c2 > 255) { /* both possibilities are above what a non-utf8 string |
3592
|
|
|
|
|
|
can represent */ |
3593
|
|
|
|
|
|
return FALSE; |
3594
|
|
|
|
|
|
} |
3595
|
|
|
|
|
|
|
3596
|
135048
|
|
|
|
|
*c1p = *c2p = c2; /* c2 is the only representable value */ |
3597
|
|
|
|
|
|
} |
3598
|
|
|
|
|
|
else { /* c1 is representable; see about c2 */ |
3599
|
132538
|
|
|
|
|
*c1p = c1; |
3600
|
127090
|
|
|
|
|
*c2p = (c2 < 256) ? c2 : c1; |
3601
|
|
|
|
|
|
} |
3602
|
|
|
|
|
|
|
3603
|
|
|
|
|
|
return TRUE; |
3604
|
|
|
|
|
|
} |
3605
|
|
|
|
|
|
|
3606
|
|
|
|
|
|
/* returns -1 on failure, $+[0] on success */ |
3607
|
|
|
|
|
|
STATIC SSize_t |
3608
|
2552
|
|
|
|
|
S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) |
3609
|
|
|
|
|
|
{ |
3610
|
|
|
|
|
|
#if PERL_VERSION < 9 && !defined(PERL_CORE) |
3611
|
|
|
|
|
|
dMY_CXT; |
3612
|
|
|
|
|
|
#endif |
3613
|
|
|
|
|
|
dVAR; |
3614
|
1776
|
|
|
|
|
const bool utf8_target = reginfo->is_utf8_target; |
3615
|
128866
|
|
|
|
|
const U32 uniflags = UTF8_ALLOW_DEFAULT; |
3616
|
253544422
|
|
|
|
|
REGEXP *rex_sv = reginfo->prog; |
3617
|
249016736
|
|
|
|
|
regexp *rex = ReANY(rex_sv); |
3618
|
170452682
|
|
|
|
|
RXi_GET_DECL(rex,rexi); |
3619
|
|
|
|
|
|
/* the current state. This is a cached copy of PL_regmatch_state */ |
3620
|
|
|
|
|
|
regmatch_state *st; |
3621
|
|
|
|
|
|
/* cache heavy used fields of st in registers */ |
3622
|
|
|
|
|
|
regnode *scan; |
3623
|
|
|
|
|
|
regnode *next; |
3624
|
|
|
|
|
|
U32 n = 0; /* general value; init to avoid compiler warning */ |
3625
|
|
|
|
|
|
SSize_t ln = 0; /* len or last; init to avoid compiler warning */ |
3626
|
|
|
|
|
|
char *locinput = startpos; |
3627
|
|
|
|
|
|
char *pushinput; /* where to continue after a PUSH */ |
3628
|
|
|
|
|
|
I32 nextchr; /* is always set to UCHARAT(locinput) */ |
3629
|
|
|
|
|
|
|
3630
|
|
|
|
|
|
bool result = 0; /* return value of S_regmatch */ |
3631
|
|
|
|
|
|
int depth = 0; /* depth of backtrack stack */ |
3632
|
|
|
|
|
|
U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ |
3633
|
120297742
|
|
|
|
|
const U32 max_nochange_depth = |
3634
|
|
|
|
|
|
(3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? |
3635
|
813210
|
|
|
|
|
3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; |
3636
|
|
|
|
|
|
regmatch_state *yes_state = NULL; /* state to pop to on success of |
3637
|
|
|
|
|
|
subpattern */ |
3638
|
|
|
|
|
|
/* mark_state piggy backs on the yes_state logic so that when we unwind |
3639
|
|
|
|
|
|
the stack on success we can update the mark_state as we go */ |
3640
|
|
|
|
|
|
regmatch_state *mark_state = NULL; /* last mark state we have seen */ |
3641
|
|
|
|
|
|
regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ |
3642
|
|
|
|
|
|
struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ |
3643
|
|
|
|
|
|
U32 state_num; |
3644
|
|
|
|
|
|
bool no_final = 0; /* prevent failure from backtracking? */ |
3645
|
|
|
|
|
|
bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ |
3646
|
|
|
|
|
|
char *startpoint = locinput; |
3647
|
|
|
|
|
|
SV *popmark = NULL; /* are we looking for a mark? */ |
3648
|
|
|
|
|
|
SV *sv_commit = NULL; /* last mark name seen in failure */ |
3649
|
|
|
|
|
|
SV *sv_yes_mark = NULL; /* last mark name we have seen |
3650
|
|
|
|
|
|
during a successful match */ |
3651
|
|
|
|
|
|
U32 lastopen = 0; /* last open we saw */ |
3652
|
406626
|
|
|
|
|
bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; |
3653
|
848506
|
|
|
|
|
SV* const oreplsv = GvSV(PL_replgv); |
3654
|
|
|
|
|
|
/* these three flags are set by various ops to signal information to |
3655
|
|
|
|
|
|
* the very next op. They have a useful lifetime of exactly one loop |
3656
|
|
|
|
|
|
* iteration, and are not preserved or restored by state pushes/pops |
3657
|
|
|
|
|
|
*/ |
3658
|
|
|
|
|
|
bool sw = 0; /* the condition value in (?(cond)a|b) */ |
3659
|
|
|
|
|
|
bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ |
3660
|
|
|
|
|
|
int logical = 0; /* the following EVAL is: |
3661
|
|
|
|
|
|
0: (?{...}) |
3662
|
|
|
|
|
|
1: (?(?{...})X|Y) |
3663
|
|
|
|
|
|
2: (??{...}) |
3664
|
|
|
|
|
|
or the following IFMATCH/UNLESSM is: |
3665
|
|
|
|
|
|
false: plain (?=foo) |
3666
|
|
|
|
|
|
true: used as a condition: (?(?=foo)) |
3667
|
|
|
|
|
|
*/ |
3668
|
|
|
|
|
|
PAD* last_pad = NULL; |
3669
|
|
|
|
|
|
dMULTICALL; |
3670
|
|
|
|
|
|
I32 gimme = G_SCALAR; |
3671
|
|
|
|
|
|
CV *caller_cv = NULL; /* who called us */ |
3672
|
|
|
|
|
|
CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ |
3673
|
|
|
|
|
|
CHECKPOINT runops_cp; /* savestack position before executing EVAL */ |
3674
|
872782
|
|
|
|
|
U32 maxopenparen = 0; /* max '(' index seen so far */ |
3675
|
|
|
|
|
|
int to_complement; /* Invert the result? */ |
3676
|
|
|
|
|
|
_char_class_number classnum; |
3677
|
1411512
|
|
|
|
|
bool is_utf8_pat = reginfo->is_utf8_pat; |
3678
|
|
|
|
|
|
|
3679
|
|
|
|
|
|
#ifdef DEBUGGING |
3680
|
1411512
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
3681
|
|
|
|
|
|
#endif |
3682
|
|
|
|
|
|
|
3683
|
|
|
|
|
|
/* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ |
3684
|
|
|
|
|
|
multicall_oldcatch = 0; |
3685
|
|
|
|
|
|
multicall_cv = NULL; |
3686
|
|
|
|
|
|
cx = NULL; |
3687
|
|
|
|
|
|
PERL_UNUSED_VAR(multicall_cop); |
3688
|
|
|
|
|
|
PERL_UNUSED_VAR(newsp); |
3689
|
|
|
|
|
|
|
3690
|
|
|
|
|
|
|
3691
|
1411512
|
|
|
|
|
PERL_ARGS_ASSERT_REGMATCH; |
3692
|
|
|
|
|
|
|
3693
|
920324
|
|
|
|
|
DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ |
3694
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,"regmatch start\n"); |
3695
|
|
|
|
|
|
})); |
3696
|
|
|
|
|
|
|
3697
|
920324
|
|
|
|
|
st = PL_regmatch_state; |
3698
|
|
|
|
|
|
|
3699
|
|
|
|
|
|
/* Note that nextchr is a byte even in UTF */ |
3700
|
|
|
|
|
|
SET_nextchr; |
3701
|
|
|
|
|
|
scan = prog; |
3702
|
462520
|
|
|
|
|
while (scan != NULL) { |
3703
|
|
|
|
|
|
|
3704
|
462478
|
|
|
|
|
DEBUG_EXECUTE_r( { |
3705
|
|
|
|
|
|
SV * const prop = sv_newmortal(); |
3706
|
|
|
|
|
|
regnode *rnext=regnext(scan); |
3707
|
|
|
|
|
|
DUMP_EXEC_POS( locinput, scan, utf8_target ); |
3708
|
|
|
|
|
|
regprop(rex, prop, scan); |
3709
|
|
|
|
|
|
|
3710
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
3711
|
|
|
|
|
|
"%3"IVdf":%*s%s(%"IVdf")\n", |
3712
|
|
|
|
|
|
(IV)(scan - rexi->program), depth*2, "", |
3713
|
|
|
|
|
|
SvPVX_const(prop), |
3714
|
|
|
|
|
|
(PL_regkind[OP(scan)] == END || !rnext) ? |
3715
|
|
|
|
|
|
0 : (IV)(rnext - rexi->program)); |
3716
|
|
|
|
|
|
}); |
3717
|
|
|
|
|
|
|
3718
|
735670
|
|
|
|
|
next = scan + NEXT_OFF(scan); |
3719
|
491336
|
|
|
|
|
if (next == scan) |
3720
|
|
|
|
|
|
next = NULL; |
3721
|
366988
|
|
|
|
|
state_num = OP(scan); |
3722
|
|
|
|
|
|
|
3723
|
|
|
|
|
|
reenter_switch: |
3724
|
|
|
|
|
|
to_complement = 0; |
3725
|
|
|
|
|
|
|
3726
|
201142
|
|
|
|
|
SET_nextchr; |
3727
|
172098
|
|
|
|
|
assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); |
3728
|
|
|
|
|
|
|
3729
|
161836
|
|
|
|
|
switch (state_num) { |
3730
|
|
|
|
|
|
case BOL: /* /^../ */ |
3731
|
161688
|
|
|
|
|
if (locinput == reginfo->strbeg) |
3732
|
|
|
|
|
|
break; |
3733
|
|
|
|
|
|
sayNO; |
3734
|
|
|
|
|
|
|
3735
|
|
|
|
|
|
case MBOL: /* /^../m */ |
3736
|
498
|
|
|
|
|
if (locinput == reginfo->strbeg || |
3737
|
12146608
|
|
|
|
|
(!NEXTCHR_IS_EOS && locinput[-1] == '\n')) |
3738
|
|
|
|
|
|
{ |
3739
|
|
|
|
|
|
break; |
3740
|
|
|
|
|
|
} |
3741
|
|
|
|
|
|
sayNO; |
3742
|
|
|
|
|
|
|
3743
|
|
|
|
|
|
case SBOL: /* /^../s */ |
3744
|
5450
|
|
|
|
|
if (locinput == reginfo->strbeg) |
3745
|
|
|
|
|
|
break; |
3746
|
|
|
|
|
|
sayNO; |
3747
|
|
|
|
|
|
|
3748
|
|
|
|
|
|
case GPOS: /* \G */ |
3749
|
5222
|
|
|
|
|
if (locinput == reginfo->ganch) |
3750
|
|
|
|
|
|
break; |
3751
|
|
|
|
|
|
sayNO; |
3752
|
|
|
|
|
|
|
3753
|
|
|
|
|
|
case KEEPS: /* \K */ |
3754
|
|
|
|
|
|
/* update the startpoint */ |
3755
|
4218
|
|
|
|
|
st->u.keeper.val = rex->offs[0].start; |
3756
|
4002
|
|
|
|
|
rex->offs[0].start = locinput - reginfo->strbeg; |
3757
|
4002
|
|
|
|
|
PUSH_STATE_GOTO(KEEPS_next, next, locinput); |
3758
|
|
|
|
|
|
assert(0); /*NOTREACHED*/ |
3759
|
|
|
|
|
|
case KEEPS_next_fail: |
3760
|
|
|
|
|
|
/* rollback the start point change */ |
3761
|
4002
|
|
|
|
|
rex->offs[0].start = st->u.keeper.val; |
3762
|
4218
|
|
|
|
|
sayNO_SILENT; |
3763
|
|
|
|
|
|
assert(0); /*NOTREACHED*/ |
3764
|
|
|
|
|
|
|
3765
|
|
|
|
|
|
case EOL: /* /..$/ */ |
3766
|
|
|
|
|
|
goto seol; |
3767
|
|
|
|
|
|
|
3768
|
|
|
|
|
|
case MEOL: /* /..$/m */ |
3769
|
3990
|
|
|
|
|
if (!NEXTCHR_IS_EOS && nextchr != '\n') |
3770
|
|
|
|
|
|
sayNO; |
3771
|
|
|
|
|
|
break; |
3772
|
|
|
|
|
|
|
3773
|
|
|
|
|
|
case SEOL: /* /..$/s */ |
3774
|
|
|
|
|
|
seol: |
3775
|
3992
|
|
|
|
|
if (!NEXTCHR_IS_EOS && nextchr != '\n') |
3776
|
|
|
|
|
|
sayNO; |
3777
|
3776
|
|
|
|
|
if (reginfo->strend - locinput > 1) |
3778
|
|
|
|
|
|
sayNO; |
3779
|
|
|
|
|
|
break; |
3780
|
|
|
|
|
|
|
3781
|
|
|
|
|
|
case EOS: /* \z */ |
3782
|
3774
|
|
|
|
|
if (!NEXTCHR_IS_EOS) |
3783
|
|
|
|
|
|
sayNO; |
3784
|
|
|
|
|
|
break; |
3785
|
|
|
|
|
|
|
3786
|
|
|
|
|
|
case SANY: /* /./s */ |
3787
|
228
|
|
|
|
|
if (NEXTCHR_IS_EOS) |
3788
|
|
|
|
|
|
sayNO; |
3789
|
|
|
|
|
|
goto increment_locinput; |
3790
|
|
|
|
|
|
|
3791
|
|
|
|
|
|
case CANY: /* \C */ |
3792
|
228
|
|
|
|
|
if (NEXTCHR_IS_EOS) |
3793
|
|
|
|
|
|
sayNO; |
3794
|
17744483
|
|
|
|
|
locinput++; |
3795
|
11204186
|
|
|
|
|
break; |
3796
|
|
|
|
|
|
|
3797
|
|
|
|
|
|
case REG_ANY: /* /./ */ |
3798
|
12142390
|
|
|
|
|
if ((NEXTCHR_IS_EOS) || nextchr == '\n') |
3799
|
|
|
|
|
|
sayNO; |
3800
|
|
|
|
|
|
goto increment_locinput; |
3801
|
|
|
|
|
|
|
3802
|
|
|
|
|
|
|
3803
|
|
|
|
|
|
#undef ST |
3804
|
|
|
|
|
|
#define ST st->u.trie |
3805
|
|
|
|
|
|
case TRIEC: /* (ab|cd) with known charclass */ |
3806
|
|
|
|
|
|
/* In this case the charclass data is available inline so |
3807
|
|
|
|
|
|
we can fail fast without a lot of extra overhead. |
3808
|
|
|
|
|
|
*/ |
3809
|
1170
|
|
|
|
|
if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) { |
3810
|
1168
|
|
|
|
|
DEBUG_EXECUTE_r( |
3811
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
3812
|
|
|
|
|
|
"%*s %sfailed to match trie start class...%s\n", |
3813
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) |
3814
|
|
|
|
|
|
); |
3815
|
|
|
|
|
|
sayNO_SILENT; |
3816
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
3817
|
|
|
|
|
|
} |
3818
|
|
|
|
|
|
/* FALL THROUGH */ |
3819
|
|
|
|
|
|
case TRIE: /* (ab|cd) */ |
3820
|
|
|
|
|
|
/* the basic plan of execution of the trie is: |
3821
|
|
|
|
|
|
* At the beginning, run though all the states, and |
3822
|
|
|
|
|
|
* find the longest-matching word. Also remember the position |
3823
|
|
|
|
|
|
* of the shortest matching word. For example, this pattern: |
3824
|
|
|
|
|
|
* 1 2 3 4 5 |
3825
|
|
|
|
|
|
* ab|a|x|abcd|abc |
3826
|
|
|
|
|
|
* when matched against the string "abcde", will generate |
3827
|
|
|
|
|
|
* accept states for all words except 3, with the longest |
3828
|
|
|
|
|
|
* matching word being 4, and the shortest being 2 (with |
3829
|
|
|
|
|
|
* the position being after char 1 of the string). |
3830
|
|
|
|
|
|
* |
3831
|
|
|
|
|
|
* Then for each matching word, in word order (i.e. 1,2,4,5), |
3832
|
|
|
|
|
|
* we run the remainder of the pattern; on each try setting |
3833
|
|
|
|
|
|
* the current position to the character following the word, |
3834
|
|
|
|
|
|
* returning to try the next word on failure. |
3835
|
|
|
|
|
|
* |
3836
|
|
|
|
|
|
* We avoid having to build a list of words at runtime by |
3837
|
|
|
|
|
|
* using a compile-time structure, wordinfo[].prev, which |
3838
|
|
|
|
|
|
* gives, for each word, the previous accepting word (if any). |
3839
|
|
|
|
|
|
* In the case above it would contain the mappings 1->2, 2->0, |
3840
|
|
|
|
|
|
* 3->0, 4->5, 5->1. We can use this table to generate, from |
3841
|
|
|
|
|
|
* the longest word (4 above), a list of all words, by |
3842
|
|
|
|
|
|
* following the list of prev pointers; this gives us the |
3843
|
|
|
|
|
|
* unordered list 4,5,1,2. Then given the current word we have |
3844
|
|
|
|
|
|
* just tried, we can go through the list and find the |
3845
|
|
|
|
|
|
* next-biggest word to try (so if we just failed on word 2, |
3846
|
|
|
|
|
|
* the next in the list is 4). |
3847
|
|
|
|
|
|
* |
3848
|
|
|
|
|
|
* Since at runtime we don't record the matching position in |
3849
|
|
|
|
|
|
* the string for each word, we have to work that out for |
3850
|
|
|
|
|
|
* each word we're about to process. The wordinfo table holds |
3851
|
|
|
|
|
|
* the character length of each word; given that we recorded |
3852
|
|
|
|
|
|
* at the start: the position of the shortest word and its |
3853
|
|
|
|
|
|
* length in chars, we just need to move the pointer the |
3854
|
|
|
|
|
|
* difference between the two char lengths. Depending on |
3855
|
|
|
|
|
|
* Unicode status and folding, that's cheap or expensive. |
3856
|
|
|
|
|
|
* |
3857
|
|
|
|
|
|
* This algorithm is optimised for the case where are only a |
3858
|
|
|
|
|
|
* small number of accept states, i.e. 0,1, or maybe 2. |
3859
|
|
|
|
|
|
* With lots of accepts states, and having to try all of them, |
3860
|
|
|
|
|
|
* it becomes quadratic on number of accept states to find all |
3861
|
|
|
|
|
|
* the next words. |
3862
|
|
|
|
|
|
*/ |
3863
|
|
|
|
|
|
|
3864
|
|
|
|
|
|
{ |
3865
|
|
|
|
|
|
/* what type of TRIE am I? (utf8 makes this contextual) */ |
3866
|
1174
|
|
|
|
|
DECL_TRIE_TYPE(scan); |
3867
|
|
|
|
|
|
|
3868
|
|
|
|
|
|
/* what trie are we using right now */ |
3869
|
110
|
|
|
|
|
reg_trie_data * const trie |
3870
|
110
|
|
|
|
|
= (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; |
3871
|
110
|
|
|
|
|
HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); |
3872
|
10947424
|
|
|
|
|
U32 state = trie->startstate; |
3873
|
|
|
|
|
|
|
3874
|
10947424
|
|
|
|
|
if ( trie->bitmap |
3875
|
10947420
|
|
|
|
|
&& (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr))) |
3876
|
|
|
|
|
|
{ |
3877
|
1193702
|
|
|
|
|
if (trie->states[ state ].wordnum) { |
3878
|
1193702
|
|
|
|
|
DEBUG_EXECUTE_r( |
3879
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
3880
|
|
|
|
|
|
"%*s %smatched empty string...%s\n", |
3881
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) |
3882
|
|
|
|
|
|
); |
3883
|
1193702
|
|
|
|
|
if (!trie->jump) |
3884
|
|
|
|
|
|
break; |
3885
|
|
|
|
|
|
} else { |
3886
|
0
|
|
|
|
|
DEBUG_EXECUTE_r( |
3887
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
3888
|
|
|
|
|
|
"%*s %sfailed to match trie start class...%s\n", |
3889
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) |
3890
|
|
|
|
|
|
); |
3891
|
|
|
|
|
|
sayNO_SILENT; |
3892
|
|
|
|
|
|
} |
3893
|
|
|
|
|
|
} |
3894
|
|
|
|
|
|
|
3895
|
|
|
|
|
|
{ |
3896
|
|
|
|
|
|
U8 *uc = ( U8* )locinput; |
3897
|
|
|
|
|
|
|
3898
|
12146612
|
|
|
|
|
STRLEN len = 0; |
3899
|
58775536
|
|
|
|
|
STRLEN foldlen = 0; |
3900
|
|
|
|
|
|
U8 *uscan = (U8*)NULL; |
3901
|
|
|
|
|
|
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; |
3902
|
|
|
|
|
|
U32 charcount = 0; /* how many input chars we have matched */ |
3903
|
|
|
|
|
|
U32 accepted = 0; /* have we seen any accepting states? */ |
3904
|
|
|
|
|
|
|
3905
|
57344290
|
|
|
|
|
ST.jump = trie->jump; |
3906
|
826002
|
|
|
|
|
ST.me = scan; |
3907
|
522784
|
|
|
|
|
ST.firstpos = NULL; |
3908
|
56518292
|
|
|
|
|
ST.longfold = FALSE; /* char longer if folded => it's harder */ |
3909
|
27052025
|
|
|
|
|
ST.nextword = 0; |
3910
|
|
|
|
|
|
|
3911
|
|
|
|
|
|
/* fully traverse the TRIE; note the position of the |
3912
|
|
|
|
|
|
shortest accept state and the wordnum of the longest |
3913
|
|
|
|
|
|
accept state */ |
3914
|
|
|
|
|
|
|
3915
|
112
|
|
|
|
|
while ( state && uc <= (U8*)(reginfo->strend) ) { |
3916
|
194
|
|
|
|
|
U32 base = trie->states[ state ].trans.base; |
3917
|
104
|
|
|
|
|
UV uvc = 0; |
3918
|
|
|
|
|
|
U16 charid = 0; |
3919
|
|
|
|
|
|
U16 wordnum; |
3920
|
94
|
|
|
|
|
wordnum = trie->states[ state ].wordnum; |
3921
|
|
|
|
|
|
|
3922
|
24
|
|
|
|
|
if (wordnum) { /* it's an accept state */ |
3923
|
8
|
|
|
|
|
if (!accepted) { |
3924
|
|
|
|
|
|
accepted = 1; |
3925
|
|
|
|
|
|
/* record first match position */ |
3926
|
88
|
|
|
|
|
if (ST.longfold) { |
3927
|
1250552
|
|
|
|
|
ST.firstpos = (U8*)locinput; |
3928
|
7708673
|
|
|
|
|
ST.firstchars = 0; |
3929
|
|
|
|
|
|
} |
3930
|
|
|
|
|
|
else { |
3931
|
182216
|
|
|
|
|
ST.firstpos = uc; |
3932
|
182216
|
|
|
|
|
ST.firstchars = charcount; |
3933
|
|
|
|
|
|
} |
3934
|
|
|
|
|
|
} |
3935
|
7839793
|
|
|
|
|
if (!ST.nextword || wordnum < ST.nextword) |
3936
|
7664131
|
|
|
|
|
ST.nextword = wordnum; |
3937
|
5121993
|
|
|
|
|
ST.topword = wordnum; |
3938
|
|
|
|
|
|
} |
3939
|
|
|
|
|
|
|
3940
|
5141532
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r({ |
3941
|
|
|
|
|
|
DUMP_EXEC_POS( (char *)uc, scan, utf8_target ); |
3942
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
3943
|
|
|
|
|
|
"%*s %sState: %4"UVxf" Accepted: %c ", |
3944
|
|
|
|
|
|
2+depth * 2, "", PL_colors[4], |
3945
|
|
|
|
|
|
(UV)state, (accepted ? 'Y' : 'N')); |
3946
|
|
|
|
|
|
}); |
3947
|
|
|
|
|
|
|
3948
|
|
|
|
|
|
/* read a char and goto next state */ |
3949
|
293500
|
|
|
|
|
if ( base && (foldlen || uc < (U8*)(reginfo->strend))) { |
3950
|
|
|
|
|
|
I32 offset; |
3951
|
290924
|
|
|
|
|
REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, |
3952
|
|
|
|
|
|
uscan, len, uvc, charid, foldlen, |
3953
|
|
|
|
|
|
foldbuf, uniflags); |
3954
|
159530
|
|
|
|
|
charcount++; |
3955
|
54556
|
|
|
|
|
if (foldlen>0) |
3956
|
131394
|
|
|
|
|
ST.longfold = TRUE; |
3957
|
24908
|
|
|
|
|
if (charid && |
3958
|
18676
|
|
|
|
|
( ((offset = |
3959
|
11412
|
|
|
|
|
base + charid - 1 - trie->uniquecharcount)) >= 0) |
3960
|
|
|
|
|
|
|
3961
|
118960
|
|
|
|
|
&& ((U32)offset < trie->lasttrans) |
3962
|
118960
|
|
|
|
|
&& trie->trans[offset].check == state) |
3963
|
|
|
|
|
|
{ |
3964
|
79528
|
|
|
|
|
state = trie->trans[offset].next; |
3965
|
|
|
|
|
|
} |
3966
|
|
|
|
|
|
else { |
3967
|
|
|
|
|
|
state = 0; |
3968
|
|
|
|
|
|
} |
3969
|
46
|
|
|
|
|
uc += len; |
3970
|
|
|
|
|
|
|
3971
|
|
|
|
|
|
} |
3972
|
|
|
|
|
|
else { |
3973
|
|
|
|
|
|
state = 0; |
3974
|
|
|
|
|
|
} |
3975
|
50
|
|
|
|
|
DEBUG_TRIE_EXECUTE_r( |
3976
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
3977
|
|
|
|
|
|
"Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n", |
3978
|
|
|
|
|
|
charid, uvc, (UV)state, PL_colors[5] ); |
3979
|
|
|
|
|
|
); |
3980
|
|
|
|
|
|
} |
3981
|
79522
|
|
|
|
|
if (!accepted) |
3982
|
|
|
|
|
|
sayNO; |
3983
|
|
|
|
|
|
|
3984
|
|
|
|
|
|
/* calculate total number of accept states */ |
3985
|
|
|
|
|
|
{ |
3986
|
79522
|
|
|
|
|
U16 w = ST.topword; |
3987
|
|
|
|
|
|
accepted = 0; |
3988
|
39444
|
|
|
|
|
while (w) { |
3989
|
39260
|
|
|
|
|
w = trie->wordinfo[w].prev; |
3990
|
39260
|
|
|
|
|
accepted++; |
3991
|
|
|
|
|
|
} |
3992
|
120
|
|
|
|
|
ST.accepted = accepted; |
3993
|
|
|
|
|
|
} |
3994
|
|
|
|
|
|
|
3995
|
120
|
|
|
|
|
DEBUG_EXECUTE_r( |
3996
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
3997
|
|
|
|
|
|
"%*s %sgot %"IVdf" possible matches%s\n", |
3998
|
|
|
|
|
|
REPORT_CODE_OFF + depth * 2, "", |
3999
|
|
|
|
|
|
PL_colors[4], (IV)ST.accepted, PL_colors[5] ); |
4000
|
|
|
|
|
|
); |
4001
|
|
|
|
|
|
goto trie_first_try; /* jump into the fail handler */ |
4002
|
|
|
|
|
|
}} |
4003
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
4004
|
|
|
|
|
|
|
4005
|
|
|
|
|
|
case TRIE_next_fail: /* we failed - try next alternative */ |
4006
|
|
|
|
|
|
{ |
4007
|
|
|
|
|
|
U8 *uc; |
4008
|
4
|
|
|
|
|
if ( ST.jump) { |
4009
|
4
|
|
|
|
|
REGCP_UNWIND(ST.cp); |
4010
|
104
|
|
|
|
|
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); |
4011
|
|
|
|
|
|
} |
4012
|
71
|
|
|
|
|
if (!--ST.accepted) { |
4013
|
4
|
|
|
|
|
DEBUG_EXECUTE_r({ |
4014
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
4015
|
|
|
|
|
|
"%*s %sTRIE failed...%s\n", |
4016
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", |
4017
|
|
|
|
|
|
PL_colors[4], |
4018
|
|
|
|
|
|
PL_colors[5] ); |
4019
|
|
|
|
|
|
}); |
4020
|
|
|
|
|
|
sayNO_SILENT; |
4021
|
|
|
|
|
|
} |
4022
|
|
|
|
|
|
{ |
4023
|
|
|
|
|
|
/* Find next-highest word to process. Note that this code |
4024
|
|
|
|
|
|
* is O(N^2) per trie run (O(N) per branch), so keep tight */ |
4025
|
|
|
|
|
|
U16 min = 0; |
4026
|
|
|
|
|
|
U16 word; |
4027
|
118698
|
|
|
|
|
U16 const nextword = ST.nextword; |
4028
|
4379828
|
|
|
|
|
reg_trie_wordinfo * const wordinfo |
4029
|
4067818
|
|
|
|
|
= ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo; |
4030
|
3731350
|
|
|
|
|
for (word=ST.topword; word; word=wordinfo[word].prev) { |
4031
|
3731350
|
|
|
|
|
if (word > nextword && (!min || word < min)) |
4032
|
|
|
|
|
|
min = word; |
4033
|
|
|
|
|
|
} |
4034
|
16
|
|
|
|
|
ST.nextword = min; |
4035
|
|
|
|
|
|
} |
4036
|
|
|
|
|
|
|
4037
|
|
|
|
|
|
trie_first_try: |
4038
|
12
|
|
|
|
|
if (do_cutgroup) { |
4039
|
|
|
|
|
|
do_cutgroup = 0; |
4040
|
|
|
|
|
|
no_final = 0; |
4041
|
|
|
|
|
|
} |
4042
|
|
|
|
|
|
|
4043
|
6
|
|
|
|
|
if ( ST.jump) { |
4044
|
336468
|
|
|
|
|
ST.lastparen = rex->lastparen; |
4045
|
56
|
|
|
|
|
ST.lastcloseparen = rex->lastcloseparen; |
4046
|
2
|
|
|
|
|
REGCP_SET(ST.cp); |
4047
|
|
|
|
|
|
} |
4048
|
|
|
|
|
|
|
4049
|
|
|
|
|
|
/* find start char of end of current word */ |
4050
|
|
|
|
|
|
{ |
4051
|
|
|
|
|
|
U32 chars; /* how many chars to skip */ |
4052
|
336470
|
|
|
|
|
reg_trie_data * const trie |
4053
|
336470
|
|
|
|
|
= (reg_trie_data*)rexi->data->data[ARG(ST.me)]; |
4054
|
|
|
|
|
|
|
4055
|
335027
|
|
|
|
|
assert((trie->wordinfo[ST.nextword].len - trie->prefixlen) |
4056
|
|
|
|
|
|
>= ST.firstchars); |
4057
|
1788
|
|
|
|
|
chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen) |
4058
|
1110
|
|
|
|
|
- ST.firstchars; |
4059
|
1110
|
|
|
|
|
uc = ST.firstpos; |
4060
|
|
|
|
|
|
|
4061
|
678
|
|
|
|
|
if (ST.longfold) { |
4062
|
|
|
|
|
|
/* the hard option - fold each char in turn and find |
4063
|
|
|
|
|
|
* its folded length (which may be different */ |
4064
|
|
|
|
|
|
U8 foldbuf[UTF8_MAXBYTES_CASE + 1]; |
4065
|
|
|
|
|
|
STRLEN foldlen; |
4066
|
|
|
|
|
|
STRLEN len; |
4067
|
|
|
|
|
|
UV uvc; |
4068
|
|
|
|
|
|
U8 *uscan; |
4069
|
|
|
|
|
|
|
4070
|
132
|
|
|
|
|
while (chars) { |
4071
|
224
|
|
|
|
|
if (utf8_target) { |
4072
|
88
|
|
|
|
|
uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, |
4073
|
|
|
|
|
|
uniflags); |
4074
|
26
|
|
|
|
|
uc += len; |
4075
|
|
|
|
|
|
} |
4076
|
|
|
|
|
|
else { |
4077
|
542
|
|
|
|
|
uvc = *uc; |
4078
|
104
|
|
|
|
|
uc++; |
4079
|
|
|
|
|
|
} |
4080
|
158
|
|
|
|
|
uvc = to_uni_fold(uvc, foldbuf, &foldlen); |
4081
|
|
|
|
|
|
uscan = foldbuf; |
4082
|
56
|
|
|
|
|
while (foldlen) { |
4083
|
2
|
|
|
|
|
if (!--chars) |
4084
|
|
|
|
|
|
break; |
4085
|
552
|
|
|
|
|
uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, |
4086
|
|
|
|
|
|
uniflags); |
4087
|
496
|
|
|
|
|
uscan += len; |
4088
|
114
|
|
|
|
|
foldlen -= len; |
4089
|
|
|
|
|
|
} |
4090
|
|
|
|
|
|
} |
4091
|
|
|
|
|
|
} |
4092
|
|
|
|
|
|
else { |
4093
|
442
|
|
|
|
|
if (utf8_target) |
4094
|
382
|
|
|
|
|
while (chars--) |
4095
|
446
|
|
|
|
|
uc += UTF8SKIP(uc); |
4096
|
|
|
|
|
|
else |
4097
|
228
|
|
|
|
|
uc += chars; |
4098
|
|
|
|
|
|
} |
4099
|
|
|
|
|
|
} |
4100
|
|
|
|
|
|
|
4101
|
232
|
|
|
|
|
scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) |
4102
|
224
|
|
|
|
|
? ST.jump[ST.nextword] |
4103
|
227
|
|
|
|
|
: NEXT_OFF(ST.me)); |
4104
|
|
|
|
|
|
|
4105
|
226
|
|
|
|
|
DEBUG_EXECUTE_r({ |
4106
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
4107
|
|
|
|
|
|
"%*s %sTRIE matched word #%d, continuing%s\n", |
4108
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", |
4109
|
|
|
|
|
|
PL_colors[4], |
4110
|
|
|
|
|
|
ST.nextword, |
4111
|
|
|
|
|
|
PL_colors[5] |
4112
|
|
|
|
|
|
); |
4113
|
|
|
|
|
|
}); |
4114
|
|
|
|
|
|
|
4115
|
341
|
|
|
|
|
if (ST.accepted > 1 || has_cutgroup) { |
4116
|
112
|
|
|
|
|
PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc); |
4117
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
4118
|
|
|
|
|
|
} |
4119
|
|
|
|
|
|
/* only one choice left - just continue */ |
4120
|
8
|
|
|
|
|
DEBUG_EXECUTE_r({ |
4121
|
|
|
|
|
|
AV *const trie_words |
4122
|
|
|
|
|
|
= MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); |
4123
|
|
|
|
|
|
SV ** const tmp = av_fetch( trie_words, |
4124
|
|
|
|
|
|
ST.nextword-1, 0 ); |
4125
|
|
|
|
|
|
SV *sv= tmp ? sv_newmortal() : NULL; |
4126
|
|
|
|
|
|
|
4127
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
4128
|
|
|
|
|
|
"%*s %sonly one match left, short-circuiting: #%d <%s>%s\n", |
4129
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", PL_colors[4], |
4130
|
|
|
|
|
|
ST.nextword, |
4131
|
|
|
|
|
|
tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, |
4132
|
|
|
|
|
|
PL_colors[0], PL_colors[1], |
4133
|
|
|
|
|
|
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII |
4134
|
|
|
|
|
|
) |
4135
|
|
|
|
|
|
: "not compiled under -Dr", |
4136
|
|
|
|
|
|
PL_colors[5] ); |
4137
|
|
|
|
|
|
}); |
4138
|
|
|
|
|
|
|
4139
|
|
|
|
|
|
locinput = (char*)uc; |
4140
|
344
|
|
|
|
|
continue; /* execute rest of RE */ |
4141
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
4142
|
|
|
|
|
|
} |
4143
|
|
|
|
|
|
#undef ST |
4144
|
|
|
|
|
|
|
4145
|
|
|
|
|
|
case EXACT: { /* /abc/ */ |
4146
|
206
|
|
|
|
|
char *s = STRING(scan); |
4147
|
48
|
|
|
|
|
ln = STR_LEN(scan); |
4148
|
338178
|
|
|
|
|
if (utf8_target != is_utf8_pat) { |
4149
|
|
|
|
|
|
/* The target and the pattern have differing utf8ness. */ |
4150
|
|
|
|
|
|
char *l = locinput; |
4151
|
328218
|
|
|
|
|
const char * const e = s + ln; |
4152
|
|
|
|
|
|
|
4153
|
2776
|
|
|
|
|
if (utf8_target) { |
4154
|
|
|
|
|
|
/* The target is utf8, the pattern is not utf8. |
4155
|
|
|
|
|
|
* Above-Latin1 code points can't match the pattern; |
4156
|
|
|
|
|
|
* invariants match exactly, and the other Latin1 ones need |
4157
|
|
|
|
|
|
* to be downgraded to a single byte in order to do the |
4158
|
|
|
|
|
|
* comparison. (If we could be confident that the target |
4159
|
|
|
|
|
|
* is not malformed, this could be refactored to have fewer |
4160
|
|
|
|
|
|
* tests by just assuming that if the first bytes match, it |
4161
|
|
|
|
|
|
* is an invariant, but there are tests in the test suite |
4162
|
|
|
|
|
|
* dealing with (??{...}) which violate this) */ |
4163
|
336468
|
|
|
|
|
while (s < e) { |
4164
|
852
|
|
|
|
|
if (l >= reginfo->strend |
4165
|
426
|
|
|
|
|
|| UTF8_IS_ABOVE_LATIN1(* (U8*) l)) |
4166
|
|
|
|
|
|
{ |
4167
|
|
|
|
|
|
sayNO; |
4168
|
|
|
|
|
|
} |
4169
|
1746
|
|
|
|
|
if (UTF8_IS_INVARIANT(*(U8*)l)) { |
4170
|
3091
|
|
|
|
|
if (*l != *s) { |
4171
|
|
|
|
|
|
sayNO; |
4172
|
|
|
|
|
|
} |
4173
|
8648
|
|
|
|
|
l++; |
4174
|
|
|
|
|
|
} |
4175
|
|
|
|
|
|
else { |
4176
|
8648
|
|
|
|
|
if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) |
4177
|
|
|
|
|
|
{ |
4178
|
|
|
|
|
|
sayNO; |
4179
|
|
|
|
|
|
} |
4180
|
852
|
|
|
|
|
l += 2; |
4181
|
|
|
|
|
|
} |
4182
|
426
|
|
|
|
|
s++; |
4183
|
|
|
|
|
|
} |
4184
|
|
|
|
|
|
} |
4185
|
|
|
|
|
|
else { |
4186
|
|
|
|
|
|
/* The target is not utf8, the pattern is utf8. */ |
4187
|
1778
|
|
|
|
|
while (s < e) { |
4188
|
61845
|
|
|
|
|
if (l >= reginfo->strend |
4189
|
122465
|
|
|
|
|
|| UTF8_IS_ABOVE_LATIN1(* (U8*) s)) |
4190
|
|
|
|
|
|
{ |
4191
|
|
|
|
|
|
sayNO; |
4192
|
|
|
|
|
|
} |
4193
|
122465
|
|
|
|
|
if (UTF8_IS_INVARIANT(*(U8*)s)) { |
4194
|
131113
|
|
|
|
|
if (*s != *l) { |
4195
|
|
|
|
|
|
sayNO; |
4196
|
|
|
|
|
|
} |
4197
|
131113
|
|
|
|
|
s++; |
4198
|
|
|
|
|
|
} |
4199
|
|
|
|
|
|
else { |
4200
|
131113
|
|
|
|
|
if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) |
4201
|
|
|
|
|
|
{ |
4202
|
|
|
|
|
|
sayNO; |
4203
|
|
|
|
|
|
} |
4204
|
128813
|
|
|
|
|
s += 2; |
4205
|
|
|
|
|
|
} |
4206
|
127283
|
|
|
|
|
l++; |
4207
|
|
|
|
|
|
} |
4208
|
|
|
|
|
|
} |
4209
|
|
|
|
|
|
locinput = l; |
4210
|
|
|
|
|
|
} |
4211
|
|
|
|
|
|
else { |
4212
|
|
|
|
|
|
/* The target and the pattern have the same utf8ness. */ |
4213
|
|
|
|
|
|
/* Inline the first character, for speed. */ |
4214
|
127325
|
|
|
|
|
if (reginfo->strend - locinput < ln |
4215
|
5930
|
|
|
|
|
|| UCHARAT(s) != nextchr |
4216
|
5430
|
|
|
|
|
|| (ln > 1 && memNE(s, locinput, ln))) |
4217
|
|
|
|
|
|
{ |
4218
|
|
|
|
|
|
sayNO; |
4219
|
|
|
|
|
|
} |
4220
|
5430
|
|
|
|
|
locinput += ln; |
4221
|
|
|
|
|
|
} |
4222
|
|
|
|
|
|
break; |
4223
|
|
|
|
|
|
} |
4224
|
|
|
|
|
|
|
4225
|
|
|
|
|
|
case EXACTFL: { /* /abc/il */ |
4226
|
|
|
|
|
|
re_fold_t folder; |
4227
|
|
|
|
|
|
const U8 * fold_array; |
4228
|
|
|
|
|
|
const char * s; |
4229
|
|
|
|
|
|
U32 fold_utf8_flags; |
4230
|
|
|
|
|
|
|
4231
|
4132
|
|
|
|
|
RX_MATCH_TAINTED_on(reginfo->prog); |
4232
|
|
|
|
|
|
folder = foldEQ_locale; |
4233
|
|
|
|
|
|
fold_array = PL_fold_locale; |
4234
|
|
|
|
|
|
fold_utf8_flags = FOLDEQ_UTF8_LOCALE; |
4235
|
4132
|
|
|
|
|
goto do_exactf; |
4236
|
|
|
|
|
|
|
4237
|
|
|
|
|
|
case EXACTFU_SS: /* /\x{df}/iu */ |
4238
|
|
|
|
|
|
case EXACTFU: /* /abc/iu */ |
4239
|
|
|
|
|
|
folder = foldEQ_latin1; |
4240
|
|
|
|
|
|
fold_array = PL_fold_latin1; |
4241
|
180062
|
|
|
|
|
fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; |
4242
|
157351
|
|
|
|
|
goto do_exactf; |
4243
|
|
|
|
|
|
|
4244
|
|
|
|
|
|
case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 |
4245
|
|
|
|
|
|
patterns */ |
4246
|
334
|
|
|
|
|
assert(! is_utf8_pat); |
4247
|
|
|
|
|
|
/* FALL THROUGH */ |
4248
|
|
|
|
|
|
case EXACTFA: /* /abc/iaa */ |
4249
|
|
|
|
|
|
folder = foldEQ_latin1; |
4250
|
|
|
|
|
|
fold_array = PL_fold_latin1; |
4251
|
|
|
|
|
|
fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; |
4252
|
|
|
|
|
|
goto do_exactf; |
4253
|
|
|
|
|
|
|
4254
|
|
|
|
|
|
case EXACTF: /* /abc/i This node only generated for |
4255
|
|
|
|
|
|
non-utf8 patterns */ |
4256
|
334
|
|
|
|
|
assert(! is_utf8_pat); |
4257
|
|
|
|
|
|
folder = foldEQ; |
4258
|
|
|
|
|
|
fold_array = PL_fold; |
4259
|
|
|
|
|
|
fold_utf8_flags = 0; |
4260
|
|
|
|
|
|
|
4261
|
|
|
|
|
|
do_exactf: |
4262
|
22869
|
|
|
|
|
s = STRING(scan); |
4263
|
22869
|
|
|
|
|
ln = STR_LEN(scan); |
4264
|
|
|
|
|
|
|
4265
|
21966
|
|
|
|
|
if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) { |
4266
|
|
|
|
|
|
/* Either target or the pattern are utf8, or has the issue where |
4267
|
|
|
|
|
|
* the fold lengths may differ. */ |
4268
|
|
|
|
|
|
const char * const l = locinput; |
4269
|
3750
|
|
|
|
|
char *e = reginfo->strend; |
4270
|
|
|
|
|
|
|
4271
|
102
|
|
|
|
|
if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat, |
4272
|
|
|
|
|
|
l, &e, 0, utf8_target, fold_utf8_flags)) |
4273
|
|
|
|
|
|
{ |
4274
|
|
|
|
|
|
sayNO; |
4275
|
|
|
|
|
|
} |
4276
|
19935
|
|
|
|
|
locinput = e; |
4277
|
19935
|
|
|
|
|
break; |
4278
|
|
|
|
|
|
} |
4279
|
|
|
|
|
|
|
4280
|
|
|
|
|
|
/* Neither the target nor the pattern are utf8 */ |
4281
|
2997858
|
|
|
|
|
if (UCHARAT(s) != nextchr |
4282
|
2290614
|
|
|
|
|
&& !NEXTCHR_IS_EOS |
4283
|
0
|
|
|
|
|
&& UCHARAT(s) != fold_array[nextchr]) |
4284
|
|
|
|
|
|
{ |
4285
|
|
|
|
|
|
sayNO; |
4286
|
|
|
|
|
|
} |
4287
|
2290614
|
|
|
|
|
if (reginfo->strend - locinput < ln) |
4288
|
|
|
|
|
|
sayNO; |
4289
|
0
|
|
|
|
|
if (ln > 1 && ! folder(s, locinput, ln)) |
4290
|
|
|
|
|
|
sayNO; |
4291
|
2997858
|
|
|
|
|
locinput += ln; |
4292
|
2997242
|
|
|
|
|
break; |
4293
|
|
|
|
|
|
} |
4294
|
|
|
|
|
|
|
4295
|
|
|
|
|
|
/* XXX Could improve efficiency by separating these all out using a |
4296
|
|
|
|
|
|
* macro or in-line function. At that point regcomp.c would no longer |
4297
|
|
|
|
|
|
* have to set the FLAGS fields of these */ |
4298
|
|
|
|
|
|
case BOUNDL: /* /\b/l */ |
4299
|
|
|
|
|
|
case NBOUNDL: /* /\B/l */ |
4300
|
2997242
|
|
|
|
|
RX_MATCH_TAINTED_on(reginfo->prog); |
4301
|
|
|
|
|
|
/* FALL THROUGH */ |
4302
|
|
|
|
|
|
case BOUND: /* /\b/ */ |
4303
|
|
|
|
|
|
case BOUNDU: /* /\b/u */ |
4304
|
|
|
|
|
|
case BOUNDA: /* /\b/a */ |
4305
|
|
|
|
|
|
case NBOUND: /* /\B/ */ |
4306
|
|
|
|
|
|
case NBOUNDU: /* /\B/u */ |
4307
|
|
|
|
|
|
case NBOUNDA: /* /\B/a */ |
4308
|
|
|
|
|
|
/* was last char in word? */ |
4309
|
616
|
|
|
|
|
if (utf8_target |
4310
|
616
|
|
|
|
|
&& FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET |
4311
|
1235174
|
|
|
|
|
&& FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET) |
4312
|
|
|
|
|
|
{ |
4313
|
1906
|
|
|
|
|
if (locinput == reginfo->strbeg) |
4314
|
|
|
|
|
|
ln = '\n'; |
4315
|
|
|
|
|
|
else { |
4316
|
0
|
|
|
|
|
const U8 * const r = |
4317
|
1235174
|
|
|
|
|
reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); |
4318
|
|
|
|
|
|
|
4319
|
1235174
|
|
|
|
|
ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); |
4320
|
|
|
|
|
|
} |
4321
|
1235174
|
|
|
|
|
if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { |
4322
|
1235174
|
|
|
|
|
ln = isWORDCHAR_uni(ln); |
4323
|
1235174
|
|
|
|
|
if (NEXTCHR_IS_EOS) |
4324
|
|
|
|
|
|
n = 0; |
4325
|
|
|
|
|
|
else { |
4326
|
1235174
|
|
|
|
|
LOAD_UTF8_CHARCLASS_ALNUM(); |
4327
|
35934
|
|
|
|
|
n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput, |
4328
|
|
|
|
|
|
utf8_target); |
4329
|
|
|
|
|
|
} |
4330
|
|
|
|
|
|
} |
4331
|
|
|
|
|
|
else { |
4332
|
1235174
|
|
|
|
|
ln = isWORDCHAR_LC_uvchr(ln); |
4333
|
1235174
|
|
|
|
|
n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); |
4334
|
|
|
|
|
|
} |
4335
|
|
|
|
|
|
} |
4336
|
|
|
|
|
|
else { |
4337
|
|
|
|
|
|
|
4338
|
|
|
|
|
|
/* Here the string isn't utf8, or is utf8 and only ascii |
4339
|
|
|
|
|
|
* characters are to match \w. In the latter case looking at |
4340
|
|
|
|
|
|
* the byte just prior to the current one may be just the final |
4341
|
|
|
|
|
|
* byte of a multi-byte character. This is ok. There are two |
4342
|
|
|
|
|
|
* cases: |
4343
|
|
|
|
|
|
* 1) it is a single byte character, and then the test is doing |
4344
|
|
|
|
|
|
* just what it's supposed to. |
4345
|
|
|
|
|
|
* 2) it is a multi-byte character, in which case the final |
4346
|
|
|
|
|
|
* byte is never mistakable for ASCII, and so the test |
4347
|
|
|
|
|
|
* will say it is not a word character, which is the |
4348
|
|
|
|
|
|
* correct answer. */ |
4349
|
100768
|
|
|
|
|
ln = (locinput != reginfo->strbeg) ? |
4350
|
50384
|
|
|
|
|
UCHARAT(locinput - 1) : '\n'; |
4351
|
1184790
|
|
|
|
|
switch (FLAGS(scan)) { |
4352
|
|
|
|
|
|
case REGEX_UNICODE_CHARSET: |
4353
|
1026212
|
|
|
|
|
ln = isWORDCHAR_L1(ln); |
4354
|
158578
|
|
|
|
|
n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr); |
4355
|
158578
|
|
|
|
|
break; |
4356
|
|
|
|
|
|
case REGEX_LOCALE_CHARSET: |
4357
|
1235174
|
|
|
|
|
ln = isWORDCHAR_LC(ln); |
4358
|
47410
|
|
|
|
|
n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr); |
4359
|
47410
|
|
|
|
|
break; |
4360
|
|
|
|
|
|
case REGEX_DEPENDS_CHARSET: |
4361
|
47410
|
|
|
|
|
ln = isWORDCHAR(ln); |
4362
|
44505
|
|
|
|
|
n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr); |
4363
|
120933
|
|
|
|
|
break; |
4364
|
|
|
|
|
|
case REGEX_ASCII_RESTRICTED_CHARSET: |
4365
|
|
|
|
|
|
case REGEX_ASCII_MORE_RESTRICTED_CHARSET: |
4366
|
1235174
|
|
|
|
|
ln = isWORDCHAR_A(ln); |
4367
|
1235174
|
|
|
|
|
n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr); |
4368
|
1235174
|
|
|
|
|
break; |
4369
|
|
|
|
|
|
default: |
4370
|
207512
|
|
|
|
|
Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); |
4371
|
|
|
|
|
|
break; |
4372
|
|
|
|
|
|
} |
4373
|
|
|
|
|
|
} |
4374
|
|
|
|
|
|
/* Note requires that all BOUNDs be lower than all NBOUNDs in |
4375
|
|
|
|
|
|
* regcomp.sym */ |
4376
|
1027662
|
|
|
|
|
if (((!ln) == (!n)) == (OP(scan) < NBOUND)) |
4377
|
|
|
|
|
|
sayNO; |
4378
|
|
|
|
|
|
break; |
4379
|
|
|
|
|
|
|
4380
|
|
|
|
|
|
case ANYOF: /* /[abc]/ */ |
4381
|
|
|
|
|
|
case ANYOF_WARN_SUPER: |
4382
|
1027662
|
|
|
|
|
if (NEXTCHR_IS_EOS) |
4383
|
|
|
|
|
|
sayNO; |
4384
|
1235174
|
|
|
|
|
if (utf8_target) { |
4385
|
1234540
|
|
|
|
|
if (!reginclass(rex, scan, (U8*)locinput, utf8_target)) |
4386
|
|
|
|
|
|
sayNO; |
4387
|
1235174
|
|
|
|
|
locinput += UTF8SKIP(locinput); |
4388
|
|
|
|
|
|
} |
4389
|
|
|
|
|
|
else { |
4390
|
1235174
|
|
|
|
|
if (!REGINCLASS(rex, scan, (U8*)locinput)) |
4391
|
|
|
|
|
|
sayNO; |
4392
|
1235174
|
|
|
|
|
locinput++; |
4393
|
|
|
|
|
|
} |
4394
|
|
|
|
|
|
break; |
4395
|
|
|
|
|
|
|
4396
|
|
|
|
|
|
/* The argument (FLAGS) to all the POSIX node types is the class number |
4397
|
|
|
|
|
|
* */ |
4398
|
|
|
|
|
|
|
4399
|
|
|
|
|
|
case NPOSIXL: /* \W or [:^punct:] etc. under /l */ |
4400
|
|
|
|
|
|
to_complement = 1; |
4401
|
|
|
|
|
|
/* FALLTHROUGH */ |
4402
|
|
|
|
|
|
|
4403
|
|
|
|
|
|
case POSIXL: /* \w or [:punct:] etc. under /l */ |
4404
|
1235174
|
|
|
|
|
if (NEXTCHR_IS_EOS) |
4405
|
|
|
|
|
|
sayNO; |
4406
|
|
|
|
|
|
|
4407
|
|
|
|
|
|
/* The locale hasn't influenced the outcome before this, so defer |
4408
|
|
|
|
|
|
* tainting until now */ |
4409
|
1235174
|
|
|
|
|
RX_MATCH_TAINTED_on(reginfo->prog); |
4410
|
|
|
|
|
|
|
4411
|
|
|
|
|
|
/* Use isFOO_lc() for characters within Latin1. (Note that |
4412
|
|
|
|
|
|
* UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else |
4413
|
|
|
|
|
|
* wouldn't be invariant) */ |
4414
|
32
|
|
|
|
|
if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { |
4415
|
32
|
|
|
|
|
if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) { |
4416
|
|
|
|
|
|
sayNO; |
4417
|
|
|
|
|
|
} |
4418
|
|
|
|
|
|
} |
4419
|
1235174
|
|
|
|
|
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { |
4420
|
1235174
|
|
|
|
|
if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), |
4421
|
|
|
|
|
|
(U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, |
4422
|
|
|
|
|
|
*(locinput + 1)))))) |
4423
|
|
|
|
|
|
{ |
4424
|
|
|
|
|
|
sayNO; |
4425
|
|
|
|
|
|
} |
4426
|
|
|
|
|
|
} |
4427
|
|
|
|
|
|
else { /* Here, must be an above Latin-1 code point */ |
4428
|
|
|
|
|
|
goto utf8_posix_not_eos; |
4429
|
|
|
|
|
|
} |
4430
|
|
|
|
|
|
|
4431
|
|
|
|
|
|
/* Here, must be utf8 */ |
4432
|
1235174
|
|
|
|
|
locinput += UTF8SKIP(locinput); |
4433
|
1235148
|
|
|
|
|
break; |
4434
|
|
|
|
|
|
|
4435
|
|
|
|
|
|
case NPOSIXD: /* \W or [:^punct:] etc. under /d */ |
4436
|
|
|
|
|
|
to_complement = 1; |
4437
|
|
|
|
|
|
/* FALLTHROUGH */ |
4438
|
|
|
|
|
|
|
4439
|
|
|
|
|
|
case POSIXD: /* \w or [:punct:] etc. under /d */ |
4440
|
1235148
|
|
|
|
|
if (utf8_target) { |
4441
|
|
|
|
|
|
goto utf8_posix; |
4442
|
|
|
|
|
|
} |
4443
|
|
|
|
|
|
goto posixa; |
4444
|
|
|
|
|
|
|
4445
|
|
|
|
|
|
case NPOSIXA: /* \W or [:^punct:] etc. under /a */ |
4446
|
|
|
|
|
|
|
4447
|
2
|
|
|
|
|
if (NEXTCHR_IS_EOS) { |
4448
|
|
|
|
|
|
sayNO; |
4449
|
|
|
|
|
|
} |
4450
|
|
|
|
|
|
|
4451
|
|
|
|
|
|
/* All UTF-8 variants match */ |
4452
|
1235146
|
|
|
|
|
if (! UTF8_IS_INVARIANT(nextchr)) { |
4453
|
|
|
|
|
|
goto increment_locinput; |
4454
|
|
|
|
|
|
} |
4455
|
|
|
|
|
|
|
4456
|
|
|
|
|
|
to_complement = 1; |
4457
|
|
|
|
|
|
/* FALLTHROUGH */ |
4458
|
|
|
|
|
|
|
4459
|
|
|
|
|
|
case POSIXA: /* \w or [:punct:] etc. under /a */ |
4460
|
|
|
|
|
|
|
4461
|
|
|
|
|
|
posixa: |
4462
|
|
|
|
|
|
/* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in |
4463
|
|
|
|
|
|
* UTF-8, and also from NPOSIXA even in UTF-8 when the current |
4464
|
|
|
|
|
|
* character is a single byte */ |
4465
|
|
|
|
|
|
|
4466
|
1235146
|
|
|
|
|
if (NEXTCHR_IS_EOS |
4467
|
1235148
|
|
|
|
|
|| ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, |
4468
|
|
|
|
|
|
FLAGS(scan))))) |
4469
|
|
|
|
|
|
{ |
4470
|
|
|
|
|
|
sayNO; |
4471
|
|
|
|
|
|
} |
4472
|
|
|
|
|
|
|
4473
|
|
|
|
|
|
/* Here we are either not in utf8, or we matched a utf8-invariant, |
4474
|
|
|
|
|
|
* so the next char is the next byte */ |
4475
|
1235148
|
|
|
|
|
locinput++; |
4476
|
1016512
|
|
|
|
|
break; |
4477
|
|
|
|
|
|
|
4478
|
|
|
|
|
|
case NPOSIXU: /* \W or [:^punct:] etc. under /u */ |
4479
|
|
|
|
|
|
to_complement = 1; |
4480
|
|
|
|
|
|
/* FALLTHROUGH */ |
4481
|
|
|
|
|
|
|
4482
|
|
|
|
|
|
case POSIXU: /* \w or [:punct:] etc. under /u */ |
4483
|
|
|
|
|
|
utf8_posix: |
4484
|
218636
|
|
|
|
|
if (NEXTCHR_IS_EOS) { |
4485
|
|
|
|
|
|
sayNO; |
4486
|
|
|
|
|
|
} |
4487
|
|
|
|
|
|
utf8_posix_not_eos: |
4488
|
|
|
|
|
|
|
4489
|
|
|
|
|
|
/* Use _generic_isCC() for characters within Latin1. (Note that |
4490
|
|
|
|
|
|
* UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else |
4491
|
|
|
|
|
|
* wouldn't be invariant) */ |
4492
|
506
|
|
|
|
|
if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { |
4493
|
218130
|
|
|
|
|
if (! (to_complement ^ cBOOL(_generic_isCC(nextchr, |
4494
|
|
|
|
|
|
FLAGS(scan))))) |
4495
|
|
|
|
|
|
{ |
4496
|
|
|
|
|
|
sayNO; |
4497
|
|
|
|
|
|
} |
4498
|
218130
|
|
|
|
|
locinput++; |
4499
|
|
|
|
|
|
} |
4500
|
218130
|
|
|
|
|
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { |
4501
|
208310
|
|
|
|
|
if (! (to_complement |
4502
|
218130
|
|
|
|
|
^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, |
4503
|
|
|
|
|
|
*(locinput + 1)), |
4504
|
|
|
|
|
|
FLAGS(scan))))) |
4505
|
|
|
|
|
|
{ |
4506
|
|
|
|
|
|
sayNO; |
4507
|
|
|
|
|
|
} |
4508
|
9820
|
|
|
|
|
locinput += 2; |
4509
|
|
|
|
|
|
} |
4510
|
|
|
|
|
|
else { /* Handle above Latin-1 code points */ |
4511
|
860
|
|
|
|
|
classnum = (_char_class_number) FLAGS(scan); |
4512
|
860
|
|
|
|
|
if (classnum < _FIRST_NON_SWASH_CC) { |
4513
|
|
|
|
|
|
|
4514
|
|
|
|
|
|
/* Here, uses a swash to find such code points. Load if if |
4515
|
|
|
|
|
|
* not done already */ |
4516
|
0
|
|
|
|
|
if (! PL_utf8_swash_ptrs[classnum]) { |
4517
|
218130
|
|
|
|
|
U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; |
4518
|
|
|
|
|
|
PL_utf8_swash_ptrs[classnum] |
4519
|
9820
|
|
|
|
|
= _core_swash_init("utf8", |
4520
|
|
|
|
|
|
swash_property_names[classnum], |
4521
|
|
|
|
|
|
&PL_sv_undef, 1, 0, NULL, &flags); |
4522
|
|
|
|
|
|
} |
4523
|
9820
|
|
|
|
|
if (! (to_complement |
4524
|
1235148
|
|
|
|
|
^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], |
4525
|
|
|
|
|
|
(U8 *) locinput, TRUE)))) |
4526
|
|
|
|
|
|
{ |
4527
|
|
|
|
|
|
sayNO; |
4528
|
|
|
|
|
|
} |
4529
|
|
|
|
|
|
} |
4530
|
|
|
|
|
|
else { /* Here, uses macros to find above Latin-1 code points */ |
4531
|
1235148
|
|
|
|
|
switch (classnum) { |
4532
|
|
|
|
|
|
case _CC_ENUM_SPACE: /* XXX would require separate |
4533
|
|
|
|
|
|
code if we revert the change |
4534
|
|
|
|
|
|
of \v matching this */ |
4535
|
|
|
|
|
|
case _CC_ENUM_PSXSPC: |
4536
|
1235148
|
|
|
|
|
if (! (to_complement |
4537
|
1235148
|
|
|
|
|
^ cBOOL(is_XPERLSPACE_high(locinput)))) |
4538
|
|
|
|
|
|
{ |
4539
|
|
|
|
|
|
sayNO; |
4540
|
|
|
|
|
|
} |
4541
|
|
|
|
|
|
break; |
4542
|
|
|
|
|
|
case _CC_ENUM_BLANK: |
4543
|
218130
|
|
|
|
|
if (! (to_complement |
4544
|
208310
|
|
|
|
|
^ cBOOL(is_HORIZWS_high(locinput)))) |
4545
|
|
|
|
|
|
{ |
4546
|
|
|
|
|
|
sayNO; |
4547
|
|
|
|
|
|
} |
4548
|
|
|
|
|
|
break; |
4549
|
|
|
|
|
|
case _CC_ENUM_XDIGIT: |
4550
|
9820
|
|
|
|
|
if (! (to_complement |
4551
|
0
|
|
|
|
|
^ cBOOL(is_XDIGIT_high(locinput)))) |
4552
|
|
|
|
|
|
{ |
4553
|
|
|
|
|
|
sayNO; |
4554
|
|
|
|
|
|
} |
4555
|
|
|
|
|
|
break; |
4556
|
|
|
|
|
|
case _CC_ENUM_VERTSPACE: |
4557
|
0
|
|
|
|
|
if (! (to_complement |
4558
|
9820
|
|
|
|
|
^ cBOOL(is_VERTWS_high(locinput)))) |
4559
|
|
|
|
|
|
{ |
4560
|
|
|
|
|
|
sayNO; |
4561
|
|
|
|
|
|
} |
4562
|
|
|
|
|
|
break; |
4563
|
|
|
|
|
|
default: /* The rest, e.g. [:cntrl:], can't match |
4564
|
|
|
|
|
|
above Latin1 */ |
4565
|
14730
|
|
|
|
|
if (! to_complement) { |
4566
|
|
|
|
|
|
sayNO; |
4567
|
|
|
|
|
|
} |
4568
|
|
|
|
|
|
break; |
4569
|
|
|
|
|
|
} |
4570
|
|
|
|
|
|
} |
4571
|
9820
|
|
|
|
|
locinput += UTF8SKIP(locinput); |
4572
|
|
|
|
|
|
} |
4573
|
|
|
|
|
|
break; |
4574
|
|
|
|
|
|
|
4575
|
|
|
|
|
|
case CLUMP: /* Match \X: logical Unicode character. This is defined as |
4576
|
|
|
|
|
|
a Unicode extended Grapheme Cluster */ |
4577
|
|
|
|
|
|
/* From http://www.unicode.org/reports/tr29 (5.2 version). An |
4578
|
|
|
|
|
|
extended Grapheme Cluster is: |
4579
|
|
|
|
|
|
|
4580
|
|
|
|
|
|
CR LF |
4581
|
|
|
|
|
|
| Prepend* Begin Extend* |
4582
|
|
|
|
|
|
| . |
4583
|
|
|
|
|
|
|
4584
|
|
|
|
|
|
Begin is: ( Special_Begin | ! Control ) |
4585
|
|
|
|
|
|
Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) |
4586
|
|
|
|
|
|
Extend is: ( Grapheme_Extend | Spacing_Mark ) |
4587
|
|
|
|
|
|
Control is: [ GCB_Control | CR | LF ] |
4588
|
|
|
|
|
|
Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) |
4589
|
|
|
|
|
|
|
4590
|
|
|
|
|
|
If we create a 'Regular_Begin' = Begin - Special_Begin, then |
4591
|
|
|
|
|
|
we can rewrite |
4592
|
|
|
|
|
|
|
4593
|
|
|
|
|
|
Begin is ( Regular_Begin + Special Begin ) |
4594
|
|
|
|
|
|
|
4595
|
|
|
|
|
|
It turns out that 98.4% of all Unicode code points match |
4596
|
|
|
|
|
|
Regular_Begin. Doing it this way eliminates a table match in |
4597
|
|
|
|
|
|
the previous implementation for almost all Unicode code points. |
4598
|
|
|
|
|
|
|
4599
|
|
|
|
|
|
There is a subtlety with Prepend* which showed up in testing. |
4600
|
|
|
|
|
|
Note that the Begin, and only the Begin is required in: |
4601
|
|
|
|
|
|
| Prepend* Begin Extend* |
4602
|
|
|
|
|
|
Also, Begin contains '! Control'. A Prepend must be a |
4603
|
|
|
|
|
|
'! Control', which means it must also be a Begin. What it |
4604
|
|
|
|
|
|
comes down to is that if we match Prepend* and then find no |
4605
|
|
|
|
|
|
suitable Begin afterwards, that if we backtrack the last |
4606
|
|
|
|
|
|
Prepend, that one will be a suitable Begin. |
4607
|
|
|
|
|
|
*/ |
4608
|
|
|
|
|
|
|
4609
|
14730
|
|
|
|
|
if (NEXTCHR_IS_EOS) |
4610
|
|
|
|
|
|
sayNO; |
4611
|
9820
|
|
|
|
|
if (! utf8_target) { |
4612
|
|
|
|
|
|
|
4613
|
|
|
|
|
|
/* Match either CR LF or '.', as all the other possibilities |
4614
|
|
|
|
|
|
* require utf8 */ |
4615
|
0
|
|
|
|
|
locinput++; /* Match the . or CR */ |
4616
|
218130
|
|
|
|
|
if (nextchr == '\r' /* And if it was CR, and the next is LF, |
4617
|
|
|
|
|
|
match the LF */ |
4618
|
218130
|
|
|
|
|
&& locinput < reginfo->strend |
4619
|
218130
|
|
|
|
|
&& UCHARAT(locinput) == '\n') |
4620
|
|
|
|
|
|
{ |
4621
|
218130
|
|
|
|
|
locinput++; |
4622
|
|
|
|
|
|
} |
4623
|
|
|
|
|
|
} |
4624
|
|
|
|
|
|
else { |
4625
|
|
|
|
|
|
|
4626
|
|
|
|
|
|
/* Utf8: See if is ( CR LF ); already know that locinput < |
4627
|
|
|
|
|
|
* reginfo->strend, so locinput+1 is in bounds */ |
4628
|
218130
|
|
|
|
|
if ( nextchr == '\r' && locinput+1 < reginfo->strend |
4629
|
218130
|
|
|
|
|
&& UCHARAT(locinput + 1) == '\n') |
4630
|
|
|
|
|
|
{ |
4631
|
218130
|
|
|
|
|
locinput += 2; |
4632
|
|
|
|
|
|
} |
4633
|
|
|
|
|
|
else { |
4634
|
|
|
|
|
|
STRLEN len; |
4635
|
|
|
|
|
|
|
4636
|
|
|
|
|
|
/* In case have to backtrack to beginning, then match '.' */ |
4637
|
|
|
|
|
|
char *starting = locinput; |
4638
|
|
|
|
|
|
|
4639
|
|
|
|
|
|
/* In case have to backtrack the last prepend */ |
4640
|
|
|
|
|
|
char *previous_prepend = NULL; |
4641
|
|
|
|
|
|
|
4642
|
218130
|
|
|
|
|
LOAD_UTF8_CHARCLASS_GCB(); |
4643
|
|
|
|
|
|
|
4644
|
|
|
|
|
|
/* Match (prepend)* */ |
4645
|
|
|
|
|
|
while (locinput < reginfo->strend |
4646
|
|
|
|
|
|
&& (len = is_GCB_Prepend_utf8(locinput))) |
4647
|
|
|
|
|
|
{ |
4648
|
|
|
|
|
|
previous_prepend = locinput; |
4649
|
|
|
|
|
|
locinput += len; |
4650
|
|
|
|
|
|
} |
4651
|
|
|
|
|
|
|
4652
|
|
|
|
|
|
/* As noted above, if we matched a prepend character, but |
4653
|
|
|
|
|
|
* the next thing won't match, back off the last prepend we |
4654
|
|
|
|
|
|
* matched, as it is guaranteed to match the begin */ |
4655
|
|
|
|
|
|
if (previous_prepend |
4656
|
|
|
|
|
|
&& (locinput >= reginfo->strend |
4657
|
|
|
|
|
|
|| (! swash_fetch(PL_utf8_X_regular_begin, |
4658
|
|
|
|
|
|
(U8*)locinput, utf8_target) |
4659
|
|
|
|
|
|
&& ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput))) |
4660
|
|
|
|
|
|
) |
4661
|
|
|
|
|
|
{ |
4662
|
|
|
|
|
|
locinput = previous_prepend; |
4663
|
|
|
|
|
|
} |
4664
|
|
|
|
|
|
|
4665
|
|
|
|
|
|
/* Note that here we know reginfo->strend > locinput, as we |
4666
|
|
|
|
|
|
* tested that upon input to this switch case, and if we |
4667
|
|
|
|
|
|
* moved locinput forward, we tested the result just above |
4668
|
|
|
|
|
|
* and it either passed, or we backed off so that it will |
4669
|
|
|
|
|
|
* now pass */ |
4670
|
218130
|
|
|
|
|
if (swash_fetch(PL_utf8_X_regular_begin, |
4671
|
|
|
|
|
|
(U8*)locinput, utf8_target)) { |
4672
|
3215988
|
|
|
|
|
locinput += UTF8SKIP(locinput); |
4673
|
|
|
|
|
|
} |
4674
|
3215988
|
|
|
|
|
else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) { |
4675
|
|
|
|
|
|
|
4676
|
|
|
|
|
|
/* Here did not match the required 'Begin' in the |
4677
|
|
|
|
|
|
* second term. So just match the very first |
4678
|
|
|
|
|
|
* character, the '.' of the final term of the regex */ |
4679
|
3215988
|
|
|
|
|
locinput = starting + UTF8SKIP(starting); |
4680
|
3215988
|
|
|
|
|
goto exit_utf8; |
4681
|
|
|
|
|
|
} else { |
4682
|
|
|
|
|
|
|
4683
|
|
|
|
|
|
/* Here is a special begin. It can be composed of |
4684
|
|
|
|
|
|
* several individual characters. One possibility is |
4685
|
|
|
|
|
|
* RI+ */ |
4686
|
3215988
|
|
|
|
|
if ((len = is_GCB_RI_utf8(locinput))) { |
4687
|
3215988
|
|
|
|
|
locinput += len; |
4688
|
3215988
|
|
|
|
|
while (locinput < reginfo->strend |
4689
|
3215988
|
|
|
|
|
&& (len = is_GCB_RI_utf8(locinput))) |
4690
|
|
|
|
|
|
{ |
4691
|
3215988
|
|
|
|
|
locinput += len; |
4692
|
|
|
|
|
|
} |
4693
|
3434148
|
|
|
|
|
} else if ((len = is_GCB_T_utf8(locinput))) { |
4694
|
|
|
|
|
|
/* Another possibility is T+ */ |
4695
|
3215988
|
|
|
|
|
locinput += len; |
4696
|
3215988
|
|
|
|
|
while (locinput < reginfo->strend |
4697
|
3215988
|
|
|
|
|
&& (len = is_GCB_T_utf8(locinput))) |
4698
|
|
|
|
|
|
{ |
4699
|
378504
|
|
|
|
|
locinput += len; |
4700
|
|
|
|
|
|
} |
4701
|
|
|
|
|
|
} else { |
4702
|
|
|
|
|
|
|
4703
|
|
|
|
|
|
/* Here, neither RI+ nor T+; must be some other |
4704
|
|
|
|
|
|
* Hangul. That means it is one of the others: L, |
4705
|
|
|
|
|
|
* LV, LVT or V, and matches: |
4706
|
|
|
|
|
|
* L* (L | LVT T* | V * V* T* | LV V* T*) */ |
4707
|
|
|
|
|
|
|
4708
|
|
|
|
|
|
/* Match L* */ |
4709
|
378504
|
|
|
|
|
while (locinput < reginfo->strend |
4710
|
561556
|
|
|
|
|
&& (len = is_GCB_L_utf8(locinput))) |
4711
|
|
|
|
|
|
{ |
4712
|
378504
|
|
|
|
|
locinput += len; |
4713
|
|
|
|
|
|
} |
4714
|
|
|
|
|
|
|
4715
|
|
|
|
|
|
/* Here, have exhausted L*. If the next character |
4716
|
|
|
|
|
|
* is not an LV, LVT nor V, it means we had to have |
4717
|
|
|
|
|
|
* at least one L, so matches L+ in the original |
4718
|
|
|
|
|
|
* equation, we have a complete hangul syllable. |
4719
|
|
|
|
|
|
* Are done. */ |
4720
|
|
|
|
|
|
|
4721
|
378504
|
|
|
|
|
if (locinput < reginfo->strend |
4722
|
378504
|
|
|
|
|
&& is_GCB_LV_LVT_V_utf8(locinput)) |
4723
|
|
|
|
|
|
{ |
4724
|
|
|
|
|
|
/* Otherwise keep going. Must be LV, LVT or V. |
4725
|
|
|
|
|
|
* See if LVT, by first ruling out V, then LV */ |
4726
|
378504
|
|
|
|
|
if (! is_GCB_V_utf8(locinput) |
4727
|
|
|
|
|
|
/* All but every TCount one is LV */ |
4728
|
378504
|
|
|
|
|
&& (valid_utf8_to_uvchr((U8 *) locinput, |
4729
|
|
|
|
|
|
NULL) |
4730
|
378504
|
|
|
|
|
- SBASE) |
4731
|
26438
|
|
|
|
|
% TCount != 0) |
4732
|
|
|
|
|
|
{ |
4733
|
3469852
|
|
|
|
|
locinput += UTF8SKIP(locinput); |
4734
|
|
|
|
|
|
} else { |
4735
|
|
|
|
|
|
|
4736
|
|
|
|
|
|
/* Must be V or LV. Take it, then match |
4737
|
|
|
|
|
|
* V* */ |
4738
|
3469852
|
|
|
|
|
locinput += UTF8SKIP(locinput); |
4739
|
3602116
|
|
|
|
|
while (locinput < reginfo->strend |
4740
|
3469852
|
|
|
|
|
&& (len = is_GCB_V_utf8(locinput))) |
4741
|
|
|
|
|
|
{ |
4742
|
3469852
|
|
|
|
|
locinput += len; |
4743
|
|
|
|
|
|
} |
4744
|
|
|
|
|
|
} |
4745
|
|
|
|
|
|
|
4746
|
|
|
|
|
|
/* And any of LV, LVT, or V can be followed |
4747
|
|
|
|
|
|
* by T* */ |
4748
|
3469852
|
|
|
|
|
while (locinput < reginfo->strend |
4749
|
3469852
|
|
|
|
|
&& (len = is_GCB_T_utf8(locinput))) |
4750
|
|
|
|
|
|
{ |
4751
|
3469852
|
|
|
|
|
locinput += len; |
4752
|
|
|
|
|
|
} |
4753
|
|
|
|
|
|
} |
4754
|
|
|
|
|
|
} |
4755
|
|
|
|
|
|
} |
4756
|
|
|
|
|
|
|
4757
|
|
|
|
|
|
/* Match any extender */ |
4758
|
3469852
|
|
|
|
|
while (locinput < reginfo->strend |
4759
|
3469852
|
|
|
|
|
&& swash_fetch(PL_utf8_X_extend, |
4760
|
|
|
|
|
|
(U8*)locinput, utf8_target)) |
4761
|
|
|
|
|
|
{ |
4762
|
1989360
|
|
|
|
|
locinput += UTF8SKIP(locinput); |
4763
|
|
|
|
|
|
} |
4764
|
|
|
|
|
|
} |
4765
|
|
|
|
|
|
exit_utf8: |
4766
|
149852615
|
|
|
|
|
if (locinput > reginfo->strend) sayNO; |
4767
|
|
|
|
|
|
} |
4768
|
|
|
|
|
|
break; |
4769
|
|
|
|
|
|
|
4770
|
|
|
|
|
|
case NREFFL: /* /\g{name}/il */ |
4771
|
|
|
|
|
|
{ /* The capture buffer cases. The ones beginning with N for the |
4772
|
|
|
|
|
|
named buffers just convert to the equivalent numbered and |
4773
|
|
|
|
|
|
pretend they were called as the corresponding numbered buffer |
4774
|
|
|
|
|
|
op. */ |
4775
|
|
|
|
|
|
/* don't initialize these in the declaration, it makes C++ |
4776
|
|
|
|
|
|
unhappy */ |
4777
|
|
|
|
|
|
const char *s; |
4778
|
|
|
|
|
|
char type; |
4779
|
|
|
|
|
|
re_fold_t folder; |
4780
|
|
|
|
|
|
const U8 *fold_array; |
4781
|
|
|
|
|
|
UV utf8_fold_flags; |
4782
|
|
|
|
|
|
|
4783
|
149852615
|
|
|
|
|
RX_MATCH_TAINTED_on(reginfo->prog); |
4784
|
|
|
|
|
|
folder = foldEQ_locale; |
4785
|
|
|
|
|
|
fold_array = PL_fold_locale; |
4786
|
|
|
|
|
|
type = REFFL; |
4787
|
|
|
|
|
|
utf8_fold_flags = FOLDEQ_UTF8_LOCALE; |
4788
|
149852615
|
|
|
|
|
goto do_nref; |
4789
|
|
|
|
|
|
|
4790
|
|
|
|
|
|
case NREFFA: /* /\g{name}/iaa */ |
4791
|
|
|
|
|
|
folder = foldEQ_latin1; |
4792
|
|
|
|
|
|
fold_array = PL_fold_latin1; |
4793
|
|
|
|
|
|
type = REFFA; |
4794
|
|
|
|
|
|
utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; |
4795
|
|
|
|
|
|
goto do_nref; |
4796
|
|
|
|
|
|
|
4797
|
|
|
|
|
|
case NREFFU: /* /\g{name}/iu */ |
4798
|
|
|
|
|
|
folder = foldEQ_latin1; |
4799
|
|
|
|
|
|
fold_array = PL_fold_latin1; |
4800
|
|
|
|
|
|
type = REFFU; |
4801
|
|
|
|
|
|
utf8_fold_flags = 0; |
4802
|
143530275
|
|
|
|
|
goto do_nref; |
4803
|
|
|
|
|
|
|
4804
|
|
|
|
|
|
case NREFF: /* /\g{name}/i */ |
4805
|
|
|
|
|
|
folder = foldEQ; |
4806
|
|
|
|
|
|
fold_array = PL_fold; |
4807
|
|
|
|
|
|
type = REFF; |
4808
|
|
|
|
|
|
utf8_fold_flags = 0; |
4809
|
149852615
|
|
|
|
|
goto do_nref; |
4810
|
|
|
|
|
|
|
4811
|
|
|
|
|
|
case NREF: /* /\g{name}/ */ |
4812
|
|
|
|
|
|
type = REF; |
4813
|
|
|
|
|
|
folder = NULL; |
4814
|
|
|
|
|
|
fold_array = NULL; |
4815
|
|
|
|
|
|
utf8_fold_flags = 0; |
4816
|
|
|
|
|
|
do_nref: |
4817
|
|
|
|
|
|
|
4818
|
|
|
|
|
|
/* For the named back references, find the corresponding buffer |
4819
|
|
|
|
|
|
* number */ |
4820
|
82383719
|
|
|
|
|
n = reg_check_named_buff_matched(rex,scan); |
4821
|
|
|
|
|
|
|
4822
|
82383719
|
|
|
|
|
if ( ! n ) { |
4823
|
|
|
|
|
|
sayNO; |
4824
|
|
|
|
|
|
} |
4825
|
|
|
|
|
|
goto do_nref_ref_common; |
4826
|
|
|
|
|
|
|
4827
|
|
|
|
|
|
case REFFL: /* /\1/il */ |
4828
|
82383719
|
|
|
|
|
RX_MATCH_TAINTED_on(reginfo->prog); |
4829
|
|
|
|
|
|
folder = foldEQ_locale; |
4830
|
|
|
|
|
|
fold_array = PL_fold_locale; |
4831
|
|
|
|
|
|
utf8_fold_flags = FOLDEQ_UTF8_LOCALE; |
4832
|
74406611
|
|
|
|
|
goto do_ref; |
4833
|
|
|
|
|
|
|
4834
|
|
|
|
|
|
case REFFA: /* /\1/iaa */ |
4835
|
|
|
|
|
|
folder = foldEQ_latin1; |
4836
|
|
|
|
|
|
fold_array = PL_fold_latin1; |
4837
|
|
|
|
|
|
utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; |
4838
|
|
|
|
|
|
goto do_ref; |
4839
|
|
|
|
|
|
|
4840
|
|
|
|
|
|
case REFFU: /* /\1/iu */ |
4841
|
|
|
|
|
|
folder = foldEQ_latin1; |
4842
|
|
|
|
|
|
fold_array = PL_fold_latin1; |
4843
|
|
|
|
|
|
utf8_fold_flags = 0; |
4844
|
82383719
|
|
|
|
|
goto do_ref; |
4845
|
|
|
|
|
|
|
4846
|
|
|
|
|
|
case REFF: /* /\1/i */ |
4847
|
|
|
|
|
|
folder = foldEQ; |
4848
|
|
|
|
|
|
fold_array = PL_fold; |
4849
|
|
|
|
|
|
utf8_fold_flags = 0; |
4850
|
82383719
|
|
|
|
|
goto do_ref; |
4851
|
|
|
|
|
|
|
4852
|
|
|
|
|
|
case REF: /* /\1/ */ |
4853
|
|
|
|
|
|
folder = NULL; |
4854
|
|
|
|
|
|
fold_array = NULL; |
4855
|
|
|
|
|
|
utf8_fold_flags = 0; |
4856
|
|
|
|
|
|
|
4857
|
|
|
|
|
|
do_ref: |
4858
|
50
|
|
|
|
|
type = OP(scan); |
4859
|
200
|
|
|
|
|
n = ARG(scan); /* which paren pair */ |
4860
|
|
|
|
|
|
|
4861
|
|
|
|
|
|
do_nref_ref_common: |
4862
|
200
|
|
|
|
|
ln = rex->offs[n].start; |
4863
|
150
|
|
|
|
|
reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ |
4864
|
200
|
|
|
|
|
if (rex->lastparen < n || ln == -1) |
4865
|
|
|
|
|
|
sayNO; /* Do not match unless seen CLOSEn. */ |
4866
|
100
|
|
|
|
|
if (ln == rex->offs[n].end) |
4867
|
|
|
|
|
|
break; |
4868
|
|
|
|
|
|
|
4869
|
100
|
|
|
|
|
s = reginfo->strbeg + ln; |
4870
|
100
|
|
|
|
|
if (type != REF /* REF can do byte comparison */ |
4871
|
100
|
|
|
|
|
&& (utf8_target || type == REFFU)) |
4872
|
|
|
|
|
|
{ /* XXX handle REFFL better */ |
4873
|
50
|
|
|
|
|
char * limit = reginfo->strend; |
4874
|
|
|
|
|
|
|
4875
|
|
|
|
|
|
/* This call case insensitively compares the entire buffer |
4876
|
|
|
|
|
|
* at s, with the current input starting at locinput, but |
4877
|
|
|
|
|
|
* not going off the end given by reginfo->strend, and |
4878
|
|
|
|
|
|
* returns in upon success, how much of the |
4879
|
|
|
|
|
|
* current input was matched */ |
4880
|
100
|
|
|
|
|
if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, |
4881
|
|
|
|
|
|
locinput, &limit, 0, utf8_target, utf8_fold_flags)) |
4882
|
|
|
|
|
|
{ |
4883
|
|
|
|
|
|
sayNO; |
4884
|
|
|
|
|
|
} |
4885
|
100
|
|
|
|
|
locinput = limit; |
4886
|
0
|
|
|
|
|
break; |
4887
|
|
|
|
|
|
} |
4888
|
|
|
|
|
|
|
4889
|
|
|
|
|
|
/* Not utf8: Inline the first character, for speed. */ |
4890
|
2546
|
|
|
|
|
if (!NEXTCHR_IS_EOS && |
4891
|
2546
|
|
|
|
|
UCHARAT(s) != nextchr && |
4892
|
2546
|
|
|
|
|
(type == REF || |
4893
|
500
|
|
|
|
|
UCHARAT(s) != fold_array[nextchr])) |
4894
|
|
|
|
|
|
sayNO; |
4895
|
500
|
|
|
|
|
ln = rex->offs[n].end - ln; |
4896
|
600
|
|
|
|
|
if (locinput + ln > reginfo->strend) |
4897
|
|
|
|
|
|
sayNO; |
4898
|
600
|
|
|
|
|
if (ln > 1 && (type == REF |
4899
|
600
|
|
|
|
|
? memNE(s, locinput, ln) |
4900
|
176
|
|
|
|
|
: ! folder(s, locinput, ln))) |
4901
|
|
|
|
|
|
sayNO; |
4902
|
4978
|
|
|
|
|
locinput += ln; |
4903
|
4978
|
|
|
|
|
break; |
4904
|
|
|
|
|
|
} |
4905
|
|
|
|
|
|
|
4906
|
|
|
|
|
|
case NOTHING: /* null op; e.g. the 'nothing' following |
4907
|
|
|
|
|
|
* the '*' in m{(a+|b)*}' */ |
4908
|
|
|
|
|
|
break; |
4909
|
|
|
|
|
|
case TAIL: /* placeholder while compiling (A|B|C) */ |
4910
|
|
|
|
|
|
break; |
4911
|
|
|
|
|
|
|
4912
|
|
|
|
|
|
case BACK: /* ??? doesn't appear to be used ??? */ |
4913
|
|
|
|
|
|
break; |
4914
|
|
|
|
|
|
|
4915
|
|
|
|
|
|
#undef ST |
4916
|
|
|
|
|
|
#define ST st->u.eval |
4917
|
|
|
|
|
|
{ |
4918
|
|
|
|
|
|
SV *ret; |
4919
|
|
|
|
|
|
REGEXP *re_sv; |
4920
|
|
|
|
|
|
regexp *re; |
4921
|
|
|
|
|
|
regexp_internal *rei; |
4922
|
|
|
|
|
|
regnode *startpoint; |
4923
|
|
|
|
|
|
|
4924
|
|
|
|
|
|
case GOSTART: /* (?R) */ |
4925
|
|
|
|
|
|
case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ |
4926
|
2200
|
|
|
|
|
if (cur_eval && cur_eval->locinput==locinput) { |
4927
|
2778
|
|
|
|
|
if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) |
4928
|
2778
|
|
|
|
|
Perl_croak(aTHX_ "Infinite recursion in regex"); |
4929
|
1952
|
|
|
|
|
if ( ++nochange_depth > max_nochange_depth ) |
4930
|
219286
|
|
|
|
|
Perl_croak(aTHX_ |
4931
|
|
|
|
|
|
"Pattern subroutine nesting without pos change" |
4932
|
|
|
|
|
|
" exceeded limit in regex"); |
4933
|
|
|
|
|
|
} else { |
4934
|
|
|
|
|
|
nochange_depth = 0; |
4935
|
|
|
|
|
|
} |
4936
|
|
|
|
|
|
re_sv = rex_sv; |
4937
|
|
|
|
|
|
re = rex; |
4938
|
|
|
|
|
|
rei = rexi; |
4939
|
219286
|
|
|
|
|
if (OP(scan)==GOSUB) { |
4940
|
47784842
|
|
|
|
|
startpoint = scan + ARG2L(scan); |
4941
|
47784842
|
|
|
|
|
ST.close_paren = ARG(scan); |
4942
|
|
|
|
|
|
} else { |
4943
|
0
|
|
|
|
|
startpoint = rei->program+1; |
4944
|
47784842
|
|
|
|
|
ST.close_paren = 0; |
4945
|
|
|
|
|
|
} |
4946
|
|
|
|
|
|
goto eval_recurse_doit; |
4947
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
4948
|
|
|
|
|
|
|
4949
|
|
|
|
|
|
case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ |
4950
|
7819878
|
|
|
|
|
if (cur_eval && cur_eval->locinput==locinput) { |
4951
|
47784842
|
|
|
|
|
if ( ++nochange_depth > max_nochange_depth ) |
4952
|
47784842
|
|
|
|
|
Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); |
4953
|
|
|
|
|
|
} else { |
4954
|
|
|
|
|
|
nochange_depth = 0; |
4955
|
|
|
|
|
|
} |
4956
|
|
|
|
|
|
{ |
4957
|
|
|
|
|
|
/* execute the code in the {...} */ |
4958
|
|
|
|
|
|
|
4959
|
47784890
|
|
|
|
|
dSP; |
4960
|
|
|
|
|
|
IV before; |
4961
|
47784890
|
|
|
|
|
OP * const oop = PL_op; |
4962
|
47784890
|
|
|
|
|
COP * const ocurcop = PL_curcop; |
4963
|
|
|
|
|
|
OP *nop; |
4964
|
|
|
|
|
|
CV *newcv; |
4965
|
|
|
|
|
|
|
4966
|
|
|
|
|
|
/* save *all* paren positions */ |
4967
|
47784890
|
|
|
|
|
regcppush(rex, 0, maxopenparen); |
4968
|
47784890
|
|
|
|
|
REGCP_SET(runops_cp); |
4969
|
|
|
|
|
|
|
4970
|
47784890
|
|
|
|
|
if (!caller_cv) |
4971
|
47784878
|
|
|
|
|
caller_cv = find_runcv(NULL); |
4972
|
|
|
|
|
|
|
4973
|
7117694
|
|
|
|
|
n = ARG(scan); |
4974
|
|
|
|
|
|
|
4975
|
7117694
|
|
|
|
|
if (rexi->data->what[n] == 'r') { /* code from an external qr */ |
4976
|
40667292
|
|
|
|
|
newcv = (ReANY( |
4977
|
|
|
|
|
|
(REGEXP*)(rexi->data->data[n]) |
4978
|
|
|
|
|
|
))->qr_anoncv |
4979
|
|
|
|
|
|
; |
4980
|
40667244
|
|
|
|
|
nop = (OP*)rexi->data->data[n+1]; |
4981
|
|
|
|
|
|
} |
4982
|
40667196
|
|
|
|
|
else if (rexi->data->what[n] == 'l') { /* literal code */ |
4983
|
|
|
|
|
|
newcv = caller_cv; |
4984
|
65757916
|
|
|
|
|
nop = (OP*)rexi->data->data[n]; |
4985
|
65757916
|
|
|
|
|
assert(CvDEPTH(newcv)); |
4986
|
|
|
|
|
|
} |
4987
|
|
|
|
|
|
else { |
4988
|
|
|
|
|
|
/* literal with own CV */ |
4989
|
65757916
|
|
|
|
|
assert(rexi->data->what[n] == 'L'); |
4990
|
65757916
|
|
|
|
|
newcv = rex->qr_anoncv; |
4991
|
65757916
|
|
|
|
|
nop = (OP*)rexi->data->data[n]; |
4992
|
|
|
|
|
|
} |
4993
|
|
|
|
|
|
|
4994
|
|
|
|
|
|
/* normally if we're about to execute code from the same |
4995
|
|
|
|
|
|
* CV that we used previously, we just use the existing |
4996
|
|
|
|
|
|
* CX stack entry. However, its possible that in the |
4997
|
|
|
|
|
|
* meantime we may have backtracked, popped from the save |
4998
|
|
|
|
|
|
* stack, and undone the SAVECOMPPAD(s) associated with |
4999
|
|
|
|
|
|
* PUSH_MULTICALL; in which case PL_comppad no longer |
5000
|
|
|
|
|
|
* points to newcv's pad. */ |
5001
|
65757964
|
|
|
|
|
if (newcv != last_pushed_cv || PL_comppad != last_pad) |
5002
|
65757964
|
|
|
|
|
{ |
5003
|
65757964
|
|
|
|
|
U8 flags = (CXp_SUB_RE | |
5004
|
|
|
|
|
|
((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); |
5005
|
16837620
|
|
|
|
|
if (last_pushed_cv) { |
5006
|
16837656
|
|
|
|
|
CHANGE_MULTICALL_FLAGS(newcv, flags); |
5007
|
|
|
|
|
|
} |
5008
|
|
|
|
|
|
else { |
5009
|
16837788
|
|
|
|
|
PUSH_MULTICALL_FLAGS(newcv, flags); |
5010
|
|
|
|
|
|
} |
5011
|
|
|
|
|
|
last_pushed_cv = newcv; |
5012
|
|
|
|
|
|
} |
5013
|
|
|
|
|
|
else { |
5014
|
|
|
|
|
|
/* these assignments are just to silence compiler |
5015
|
|
|
|
|
|
* warnings */ |
5016
|
|
|
|
|
|
multicall_cop = NULL; |
5017
|
|
|
|
|
|
newsp = NULL; |
5018
|
|
|
|
|
|
} |
5019
|
16837620
|
|
|
|
|
last_pad = PL_comppad; |
5020
|
|
|
|
|
|
|
5021
|
|
|
|
|
|
/* the initial nextstate you would normally execute |
5022
|
|
|
|
|
|
* at the start of an eval (which would cause error |
5023
|
|
|
|
|
|
* messages to come from the eval), may be optimised |
5024
|
|
|
|
|
|
* away from the execution path in the regex code blocks; |
5025
|
|
|
|
|
|
* so manually set PL_curcop to it initially */ |
5026
|
|
|
|
|
|
{ |
5027
|
48920392
|
|
|
|
|
OP *o = cUNOPx(nop)->op_first; |
5028
|
47746874
|
|
|
|
|
assert(o->op_type == OP_NULL); |
5029
|
10637254
|
|
|
|
|
if (o->op_targ == OP_SCOPE) { |
5030
|
4089468
|
|
|
|
|
o = cUNOPo->op_first; |
5031
|
|
|
|
|
|
} |
5032
|
|
|
|
|
|
else { |
5033
|
2044710
|
|
|
|
|
assert(o->op_targ == OP_LEAVE); |
5034
|
2044710
|
|
|
|
|
o = cUNOPo->op_first; |
5035
|
0
|
|
|
|
|
assert(o->op_type == OP_ENTER); |
5036
|
2044710
|
|
|
|
|
o = o->op_sibling; |
5037
|
|
|
|
|
|
} |
5038
|
|
|
|
|
|
|
5039
|
10637254
|
|
|
|
|
if (o->op_type != OP_STUB) { |
5040
|
3148
|
|
|
|
|
assert( o->op_type == OP_NEXTSTATE |
5041
|
|
|
|
|
|
|| o->op_type == OP_DBSTATE |
5042
|
|
|
|
|
|
|| (o->op_type == OP_NULL |
5043
|
|
|
|
|
|
&& ( o->op_targ == OP_NEXTSTATE |
5044
|
|
|
|
|
|
|| o->op_targ == OP_DBSTATE |
5045
|
|
|
|
|
|
) |
5046
|
|
|
|
|
|
) |
5047
|
|
|
|
|
|
); |
5048
|
3148
|
|
|
|
|
PL_curcop = (COP*)o; |
5049
|
|
|
|
|
|
} |
5050
|
|
|
|
|
|
} |
5051
|
3148
|
|
|
|
|
nop = nop->op_next; |
5052
|
|
|
|
|
|
|
5053
|
48
|
|
|
|
|
DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, |
5054
|
|
|
|
|
|
" re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); |
5055
|
|
|
|
|
|
|
5056
|
48
|
|
|
|
|
rex->offs[0].end = locinput - reginfo->strbeg; |
5057
|
48
|
|
|
|
|
if (reginfo->info_aux_eval->pos_magic) |
5058
|
48
|
|
|
|
|
MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, |
5059
|
|
|
|
|
|
reginfo->sv, reginfo->strbeg, |
5060
|
|
|
|
|
|
locinput - reginfo->strbeg); |
5061
|
|
|
|
|
|
|
5062
|
3148
|
|
|
|
|
if (sv_yes_mark) { |
5063
|
3100
|
|
|
|
|
SV *sv_mrk = get_sv("REGMARK", 1); |
5064
|
10637206
|
|
|
|
|
sv_setsv(sv_mrk, sv_yes_mark); |
5065
|
|
|
|
|
|
} |
5066
|
|
|
|
|
|
|
5067
|
|
|
|
|
|
/* we don't use MULTICALL here as we want to call the |
5068
|
|
|
|
|
|
* first op of the block of interest, rather than the |
5069
|
|
|
|
|
|
* first op of the sub */ |
5070
|
334798
|
|
|
|
|
before = (IV)(SP-PL_stack_base); |
5071
|
669548
|
|
|
|
|
PL_op = nop; |
5072
|
334798
|
|
|
|
|
CALLRUNOPS(aTHX); /* Scalar context. */ |
5073
|
334798
|
|
|
|
|
SPAGAIN; |
5074
|
334798
|
|
|
|
|
if ((IV)(SP-PL_stack_base) == before) |
5075
|
334750
|
|
|
|
|
ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ |
5076
|
|
|
|
|
|
else { |
5077
|
334798
|
|
|
|
|
ret = POPs; |
5078
|
29764
|
|
|
|
|
PUTBACK; |
5079
|
|
|
|
|
|
} |
5080
|
|
|
|
|
|
|
5081
|
|
|
|
|
|
/* before restoring everything, evaluate the returned |
5082
|
|
|
|
|
|
* value, so that 'uninit' warnings don't use the wrong |
5083
|
|
|
|
|
|
* PL_op or pad. Also need to process any magic vars |
5084
|
|
|
|
|
|
* (e.g. $1) *before* parentheses are restored */ |
5085
|
|
|
|
|
|
|
5086
|
29764
|
|
|
|
|
PL_op = NULL; |
5087
|
|
|
|
|
|
|
5088
|
|
|
|
|
|
re_sv = NULL; |
5089
|
47441840
|
|
|
|
|
if (logical == 0) /* (?{})/ */ |
5090
|
1290690
|
|
|
|
|
sv_setsv(save_scalar(PL_replgv), ret); /* $^R */ |
5091
|
1290642
|
|
|
|
|
else if (logical == 1) { /* /(?(?{...})X|Y)/ */ |
5092
|
1290642
|
|
|
|
|
sw = cBOOL(SvTRUE(ret)); |
5093
|
|
|
|
|
|
logical = 0; |
5094
|
|
|
|
|
|
} |
5095
|
|
|
|
|
|
else { /* /(??{}) */ |
5096
|
|
|
|
|
|
/* if its overloaded, let the regex compiler handle |
5097
|
|
|
|
|
|
* it; otherwise extract regex, or stringify */ |
5098
|
1290642
|
|
|
|
|
if (!SvAMAGIC(ret)) { |
5099
|
1290642
|
|
|
|
|
SV *sv = ret; |
5100
|
46151150
|
|
|
|
|
if (SvROK(sv)) |
5101
|
43391304
|
|
|
|
|
sv = SvRV(sv); |
5102
|
43391304
|
|
|
|
|
if (SvTYPE(sv) == SVt_REGEXP) |
5103
|
|
|
|
|
|
re_sv = (REGEXP*) sv; |
5104
|
43391304
|
|
|
|
|
else if (SvSMAGICAL(sv)) { |
5105
|
43391304
|
|
|
|
|
MAGIC *mg = mg_find(sv, PERL_MAGIC_qr); |
5106
|
7117646
|
|
|
|
|
if (mg) |
5107
|
7117646
|
|
|
|
|
re_sv = (REGEXP *) mg->mg_obj; |
5108
|
|
|
|
|
|
} |
5109
|
|
|
|
|
|
|
5110
|
|
|
|
|
|
/* force any magic, undef warnings here */ |
5111
|
34139456
|
|
|
|
|
if (!re_sv) { |
5112
|
34139456
|
|
|
|
|
ret = sv_mortalcopy(ret); |
5113
|
34139456
|
|
|
|
|
(void) SvPV_force_nolen(ret); |
5114
|
|
|
|
|
|
} |
5115
|
|
|
|
|
|
} |
5116
|
|
|
|
|
|
|
5117
|
|
|
|
|
|
} |
5118
|
|
|
|
|
|
|
5119
|
|
|
|
|
|
/* *** Note that at this point we don't restore |
5120
|
|
|
|
|
|
* PL_comppad, (or pop the CxSUB) on the assumption it may |
5121
|
|
|
|
|
|
* be used again soon. This is safe as long as nothing |
5122
|
|
|
|
|
|
* in the regexp code uses the pad ! */ |
5123
|
34139504
|
|
|
|
|
PL_op = oop; |
5124
|
15541786
|
|
|
|
|
PL_curcop = ocurcop; |
5125
|
|
|
|
|
|
S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); |
5126
|
15541786
|
|
|
|
|
PL_curpm = PL_reg_curpm; |
5127
|
|
|
|
|
|
|
5128
|
15541786
|
|
|
|
|
if (logical != 2) |
5129
|
|
|
|
|
|
break; |
5130
|
|
|
|
|
|
} |
5131
|
|
|
|
|
|
|
5132
|
|
|
|
|
|
/* only /(??{})/ from now on */ |
5133
|
|
|
|
|
|
logical = 0; |
5134
|
|
|
|
|
|
{ |
5135
|
|
|
|
|
|
/* extract RE object from returned value; compiling if |
5136
|
|
|
|
|
|
* necessary */ |
5137
|
|
|
|
|
|
|
5138
|
15541738
|
|
|
|
|
if (re_sv) { |
5139
|
15541738
|
|
|
|
|
re_sv = reg_temp_copy(NULL, re_sv); |
5140
|
|
|
|
|
|
} |
5141
|
|
|
|
|
|
else { |
5142
|
|
|
|
|
|
U32 pm_flags = 0; |
5143
|
|
|
|
|
|
|
5144
|
37306592
|
|
|
|
|
if (SvUTF8(ret) && IN_BYTES) { |
5145
|
|
|
|
|
|
/* In use 'bytes': make a copy of the octet |
5146
|
|
|
|
|
|
* sequence, but without the flag on */ |
5147
|
|
|
|
|
|
STRLEN len; |
5148
|
37306592
|
|
|
|
|
const char *const p = SvPV(ret, len); |
5149
|
41239956
|
|
|
|
|
ret = newSVpvn_flags(p, len, SVs_TEMP); |
5150
|
|
|
|
|
|
} |
5151
|
2
|
|
|
|
|
if (rex->intflags & PREGf_USE_RE_EVAL) |
5152
|
|
|
|
|
|
pm_flags |= PMf_USE_RE_EVAL; |
5153
|
|
|
|
|
|
|
5154
|
|
|
|
|
|
/* if we got here, it should be an engine which |
5155
|
|
|
|
|
|
* supports compiling code blocks and stuff */ |
5156
|
2
|
|
|
|
|
assert(rex->engine && rex->engine->op_comp); |
5157
|
2
|
|
|
|
|
assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); |
5158
|
2
|
|
|
|
|
re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, |
5159
|
|
|
|
|
|
rex->engine, NULL, NULL, |
5160
|
|
|
|
|
|
/* copy /msix etc to inner pattern */ |
5161
|
41239956
|
|
|
|
|
scan->flags, |
5162
|
|
|
|
|
|
pm_flags); |
5163
|
|
|
|
|
|
|
5164
|
41239956
|
|
|
|
|
if (!(SvFLAGS(ret) |
5165
|
41239956
|
|
|
|
|
& (SVs_TEMP | SVs_PADTMP | SVf_READONLY |
5166
|
|
|
|
|
|
| SVs_GMG))) { |
5167
|
|
|
|
|
|
/* This isn't a first class regexp. Instead, it's |
5168
|
|
|
|
|
|
caching a regexp onto an existing, Perl visible |
5169
|
|
|
|
|
|
scalar. */ |
5170
|
1273496
|
|
|
|
|
sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); |
5171
|
|
|
|
|
|
} |
5172
|
|
|
|
|
|
/* safe to do now that any $1 etc has been |
5173
|
|
|
|
|
|
* interpolated into the new pattern string and |
5174
|
|
|
|
|
|
* compiled */ |
5175
|
|
|
|
|
|
S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); |
5176
|
|
|
|
|
|
} |
5177
|
1273496
|
|
|
|
|
SAVEFREESV(re_sv); |
5178
|
1273496
|
|
|
|
|
re = ReANY(re_sv); |
5179
|
|
|
|
|
|
} |
5180
|
1273496
|
|
|
|
|
RXp_MATCH_COPIED_off(re); |
5181
|
477166
|
|
|
|
|
re->subbeg = rex->subbeg; |
5182
|
0
|
|
|
|
|
re->sublen = rex->sublen; |
5183
|
0
|
|
|
|
|
re->suboffset = rex->suboffset; |
5184
|
0
|
|
|
|
|
re->subcoffset = rex->subcoffset; |
5185
|
0
|
|
|
|
|
rei = RXi_GET(re); |
5186
|
477166
|
|
|
|
|
DEBUG_EXECUTE_r( |
5187
|
|
|
|
|
|
debug_start_match(re_sv, utf8_target, locinput, |
5188
|
|
|
|
|
|
reginfo->strend, "Matching embedded"); |
5189
|
|
|
|
|
|
); |
5190
|
477166
|
|
|
|
|
startpoint = rei->program + 1; |
5191
|
796330
|
|
|
|
|
ST.close_paren = 0; /* only used for GOSUB */ |
5192
|
|
|
|
|
|
|
5193
|
|
|
|
|
|
eval_recurse_doit: /* Share code with GOSUB below this line */ |
5194
|
|
|
|
|
|
/* run the pattern returned from (??{...}) */ |
5195
|
|
|
|
|
|
|
5196
|
|
|
|
|
|
/* Save *all* the positions. */ |
5197
|
796330
|
|
|
|
|
ST.cp = regcppush(rex, 0, maxopenparen); |
5198
|
796330
|
|
|
|
|
REGCP_SET(ST.lastcp); |
5199
|
|
|
|
|
|
|
5200
|
796330
|
|
|
|
|
re->lastparen = 0; |
5201
|
0
|
|
|
|
|
re->lastcloseparen = 0; |
5202
|
|
|
|
|
|
|
5203
|
0
|
|
|
|
|
maxopenparen = 0; |
5204
|
|
|
|
|
|
|
5205
|
|
|
|
|
|
/* invalidate the S-L poscache. We're now executing a |
5206
|
|
|
|
|
|
* different set of WHILEM ops (and their associated |
5207
|
|
|
|
|
|
* indexes) against the same string, so the bits in the |
5208
|
|
|
|
|
|
* cache are meaningless. Setting maxiter to zero forces |
5209
|
|
|
|
|
|
* the cache to be invalidated and zeroed before reuse. |
5210
|
|
|
|
|
|
* XXX This is too dramatic a measure. Ideally we should |
5211
|
|
|
|
|
|
* save the old cache and restore when running the outer |
5212
|
|
|
|
|
|
* pattern again */ |
5213
|
0
|
|
|
|
|
reginfo->poscache_maxiter = 0; |
5214
|
|
|
|
|
|
|
5215
|
248917174
|
|
|
|
|
is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); |
5216
|
|
|
|
|
|
|
5217
|
248917174
|
|
|
|
|
ST.prev_rex = rex_sv; |
5218
|
248917174
|
|
|
|
|
ST.prev_curlyx = cur_curlyx; |
5219
|
|
|
|
|
|
rex_sv = re_sv; |
5220
|
248917174
|
|
|
|
|
SET_reg_curpm(rex_sv); |
5221
|
|
|
|
|
|
rex = re; |
5222
|
|
|
|
|
|
rexi = rei; |
5223
|
|
|
|
|
|
cur_curlyx = NULL; |
5224
|
248917174
|
|
|
|
|
ST.B = next; |
5225
|
248917174
|
|
|
|
|
ST.prev_eval = cur_eval; |
5226
|
|
|
|
|
|
cur_eval = st; |
5227
|
|
|
|
|
|
/* now continue from first node in postoned RE */ |
5228
|
0
|
|
|
|
|
PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput); |
5229
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5230
|
|
|
|
|
|
} |
5231
|
|
|
|
|
|
|
5232
|
|
|
|
|
|
case EVAL_AB: /* cleanup after a successful (??{A})B */ |
5233
|
|
|
|
|
|
/* note: this is called twice; first after popping B, then A */ |
5234
|
248917174
|
|
|
|
|
rex_sv = ST.prev_rex; |
5235
|
648
|
|
|
|
|
is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); |
5236
|
0
|
|
|
|
|
SET_reg_curpm(rex_sv); |
5237
|
432
|
|
|
|
|
rex = ReANY(rex_sv); |
5238
|
228
|
|
|
|
|
rexi = RXi_GET(rex); |
5239
|
0
|
|
|
|
|
regcpblow(ST.cp); |
5240
|
237460570
|
|
|
|
|
cur_eval = ST.prev_eval; |
5241
|
237460570
|
|
|
|
|
cur_curlyx = ST.prev_curlyx; |
5242
|
|
|
|
|
|
|
5243
|
|
|
|
|
|
/* Invalidate cache. See "invalidate" comment above. */ |
5244
|
245974287
|
|
|
|
|
reginfo->poscache_maxiter = 0; |
5245
|
237460570
|
|
|
|
|
if ( nochange_depth ) |
5246
|
237460570
|
|
|
|
|
nochange_depth--; |
5247
|
|
|
|
|
|
sayYES; |
5248
|
|
|
|
|
|
|
5249
|
|
|
|
|
|
|
5250
|
|
|
|
|
|
case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ |
5251
|
|
|
|
|
|
/* note: this is called twice; first after popping B, then A */ |
5252
|
146639292
|
|
|
|
|
rex_sv = ST.prev_rex; |
5253
|
8171312
|
|
|
|
|
is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); |
5254
|
15384612
|
|
|
|
|
SET_reg_curpm(rex_sv); |
5255
|
15384612
|
|
|
|
|
rex = ReANY(rex_sv); |
5256
|
15384612
|
|
|
|
|
rexi = RXi_GET(rex); |
5257
|
|
|
|
|
|
|
5258
|
15384612
|
|
|
|
|
REGCP_UNWIND(ST.lastcp); |
5259
|
15384612
|
|
|
|
|
regcppop(rex, &maxopenparen); |
5260
|
1971074
|
|
|
|
|
cur_eval = ST.prev_eval; |
5261
|
1971074
|
|
|
|
|
cur_curlyx = ST.prev_curlyx; |
5262
|
|
|
|
|
|
/* Invalidate cache. See "invalidate" comment above. */ |
5263
|
1951236
|
|
|
|
|
reginfo->poscache_maxiter = 0; |
5264
|
1971074
|
|
|
|
|
if ( nochange_depth ) |
5265
|
15384612
|
|
|
|
|
nochange_depth--; |
5266
|
|
|
|
|
|
sayNO_SILENT; |
5267
|
|
|
|
|
|
#undef ST |
5268
|
|
|
|
|
|
|
5269
|
|
|
|
|
|
case OPEN: /* ( */ |
5270
|
15384616
|
|
|
|
|
n = ARG(scan); /* which paren pair */ |
5271
|
15384616
|
|
|
|
|
rex->offs[n].start_tmp = locinput - reginfo->strbeg; |
5272
|
15384616
|
|
|
|
|
if (n > maxopenparen) |
5273
|
15384616
|
|
|
|
|
maxopenparen = n; |
5274
|
15384616
|
|
|
|
|
DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, |
5275
|
|
|
|
|
|
"rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n", |
5276
|
|
|
|
|
|
PTR2UV(rex), |
5277
|
|
|
|
|
|
PTR2UV(rex->offs), |
5278
|
|
|
|
|
|
(UV)n, |
5279
|
|
|
|
|
|
(IV)rex->offs[n].start_tmp, |
5280
|
|
|
|
|
|
(UV)maxopenparen |
5281
|
|
|
|
|
|
)); |
5282
|
|
|
|
|
|
lastopen = n; |
5283
|
15384616
|
|
|
|
|
break; |
5284
|
|
|
|
|
|
|
5285
|
|
|
|
|
|
/* XXX really need to log other places start/end are set too */ |
5286
|
|
|
|
|
|
#define CLOSE_CAPTURE \ |
5287
|
|
|
|
|
|
rex->offs[n].start = rex->offs[n].start_tmp; \ |
5288
|
|
|
|
|
|
rex->offs[n].end = locinput - reginfo->strbeg; \ |
5289
|
|
|
|
|
|
DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \ |
5290
|
|
|
|
|
|
"rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \ |
5291
|
|
|
|
|
|
PTR2UV(rex), \ |
5292
|
|
|
|
|
|
PTR2UV(rex->offs), \ |
5293
|
|
|
|
|
|
(UV)n, \ |
5294
|
|
|
|
|
|
(IV)rex->offs[n].start, \ |
5295
|
|
|
|
|
|
(IV)rex->offs[n].end \ |
5296
|
|
|
|
|
|
)) |
5297
|
|
|
|
|
|
|
5298
|
|
|
|
|
|
case CLOSE: /* ) */ |
5299
|
15384616
|
|
|
|
|
n = ARG(scan); /* which paren pair */ |
5300
|
21528948
|
|
|
|
|
CLOSE_CAPTURE; |
5301
|
6178950
|
|
|
|
|
if (n > rex->lastparen) |
5302
|
6178950
|
|
|
|
|
rex->lastparen = n; |
5303
|
475098
|
|
|
|
|
rex->lastcloseparen = n; |
5304
|
44070
|
|
|
|
|
if (cur_eval && cur_eval->u.eval.close_paren == n) { |
5305
|
|
|
|
|
|
goto fake_end; |
5306
|
|
|
|
|
|
} |
5307
|
|
|
|
|
|
break; |
5308
|
|
|
|
|
|
|
5309
|
|
|
|
|
|
case ACCEPT: /* (*ACCEPT) */ |
5310
|
130009
|
|
|
|
|
if (ARG(scan)){ |
5311
|
|
|
|
|
|
regnode *cursor; |
5312
|
63910
|
|
|
|
|
for (cursor=scan; |
5313
|
63910
|
|
|
|
|
cursor && OP(cursor)!=END; |
5314
|
431028
|
|
|
|
|
cursor=regnext(cursor)) |
5315
|
|
|
|
|
|
{ |
5316
|
475094
|
|
|
|
|
if ( OP(cursor)==CLOSE ){ |
5317
|
0
|
|
|
|
|
n = ARG(cursor); |
5318
|
6219608
|
|
|
|
|
if ( n <= lastopen ) { |
5319
|
81324
|
|
|
|
|
CLOSE_CAPTURE; |
5320
|
6178696
|
|
|
|
|
if (n > rex->lastparen) |
5321
|
6178696
|
|
|
|
|
rex->lastparen = n; |
5322
|
15349998
|
|
|
|
|
rex->lastcloseparen = n; |
5323
|
15349998
|
|
|
|
|
if ( n == ARG(scan) || (cur_eval && |
5324
|
15116219
|
|
|
|
|
cur_eval->u.eval.close_paren == n)) |
5325
|
|
|
|
|
|
break; |
5326
|
|
|
|
|
|
} |
5327
|
|
|
|
|
|
} |
5328
|
|
|
|
|
|
} |
5329
|
|
|
|
|
|
} |
5330
|
|
|
|
|
|
goto fake_end; |
5331
|
|
|
|
|
|
/*NOTREACHED*/ |
5332
|
|
|
|
|
|
|
5333
|
|
|
|
|
|
case GROUPP: /* (?(1)) */ |
5334
|
432650
|
|
|
|
|
n = ARG(scan); /* which paren pair */ |
5335
|
17713074
|
|
|
|
|
sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); |
5336
|
14940102
|
|
|
|
|
break; |
5337
|
|
|
|
|
|
|
5338
|
|
|
|
|
|
case NGROUPP: /* (?()) */ |
5339
|
|
|
|
|
|
/* reg_check_named_buff_matched returns 0 for no match */ |
5340
|
14940102
|
|
|
|
|
sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); |
5341
|
14326810
|
|
|
|
|
break; |
5342
|
|
|
|
|
|
|
5343
|
|
|
|
|
|
case INSUBP: /* (?(R)) */ |
5344
|
14326810
|
|
|
|
|
n = ARG(scan); |
5345
|
29084276
|
|
|
|
|
sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n)); |
5346
|
14326810
|
|
|
|
|
break; |
5347
|
|
|
|
|
|
|
5348
|
|
|
|
|
|
case DEFINEP: /* (?(DEFINE)) */ |
5349
|
|
|
|
|
|
sw = 0; |
5350
|
3997672
|
|
|
|
|
break; |
5351
|
|
|
|
|
|
|
5352
|
|
|
|
|
|
case IFTHEN: /* (?(cond)A|B) */ |
5353
|
3997672
|
|
|
|
|
reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ |
5354
|
17710626
|
|
|
|
|
if (sw) |
5355
|
2119552
|
|
|
|
|
next = NEXTOPER(NEXTOPER(scan)); |
5356
|
|
|
|
|
|
else { |
5357
|
3348
|
|
|
|
|
next = scan + ARG(scan); |
5358
|
2916
|
|
|
|
|
if (OP(next) == IFTHEN) /* Fake one. */ |
5359
|
2116204
|
|
|
|
|
next = NEXTOPER(NEXTOPER(next)); |
5360
|
|
|
|
|
|
} |
5361
|
|
|
|
|
|
break; |
5362
|
|
|
|
|
|
|
5363
|
|
|
|
|
|
case LOGICAL: /* modifier for EVAL and IFMATCH */ |
5364
|
16958982
|
|
|
|
|
logical = scan->flags; |
5365
|
1882854
|
|
|
|
|
break; |
5366
|
|
|
|
|
|
|
5367
|
|
|
|
|
|
/******************************************************************* |
5368
|
|
|
|
|
|
|
5369
|
|
|
|
|
|
The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ |
5370
|
|
|
|
|
|
pattern, where A and B are subpatterns. (For simple A, CURLYM or |
5371
|
|
|
|
|
|
STAR/PLUS/CURLY/CURLYN are used instead.) |
5372
|
|
|
|
|
|
|
5373
|
|
|
|
|
|
A*B is compiled as |
5374
|
|
|
|
|
|
|
5375
|
|
|
|
|
|
On entry to the subpattern, CURLYX is called. This pushes a CURLYX |
5376
|
|
|
|
|
|
state, which contains the current count, initialised to -1. It also sets |
5377
|
|
|
|
|
|
cur_curlyx to point to this state, with any previous value saved in the |
5378
|
|
|
|
|
|
state block. |
5379
|
|
|
|
|
|
|
5380
|
|
|
|
|
|
CURLYX then jumps straight to the WHILEM op, rather than executing A, |
5381
|
|
|
|
|
|
since the pattern may possibly match zero times (i.e. it's a while {} loop |
5382
|
|
|
|
|
|
rather than a do {} while loop). |
5383
|
|
|
|
|
|
|
5384
|
|
|
|
|
|
Each entry to WHILEM represents a successful match of A. The count in the |
5385
|
|
|
|
|
|
CURLYX block is incremented, another WHILEM state is pushed, and execution |
5386
|
|
|
|
|
|
passes to A or B depending on greediness and the current count. |
5387
|
|
|
|
|
|
|
5388
|
|
|
|
|
|
For example, if matching against the string a1a2a3b (where the aN are |
5389
|
|
|
|
|
|
substrings that match /A/), then the match progresses as follows: (the |
5390
|
|
|
|
|
|
pushed states are interspersed with the bits of strings matched so far): |
5391
|
|
|
|
|
|
|
5392
|
|
|
|
|
|
|
5393
|
|
|
|
|
|
|
5394
|
|
|
|
|
|
a1 |
5395
|
|
|
|
|
|
a1 a2 |
5396
|
|
|
|
|
|
a1 a2 a3 |
5397
|
|
|
|
|
|
a1 a2 a3 b |
5398
|
|
|
|
|
|
|
5399
|
|
|
|
|
|
(Contrast this with something like CURLYM, which maintains only a single |
5400
|
|
|
|
|
|
backtrack state: |
5401
|
|
|
|
|
|
|
5402
|
|
|
|
|
|
a1 |
5403
|
|
|
|
|
|
a1 a2 |
5404
|
|
|
|
|
|
a1 a2 a3 |
5405
|
|
|
|
|
|
a1 a2 a3 b |
5406
|
|
|
|
|
|
) |
5407
|
|
|
|
|
|
|
5408
|
|
|
|
|
|
Each WHILEM state block marks a point to backtrack to upon partial failure |
5409
|
|
|
|
|
|
of A or B, and also contains some minor state data related to that |
5410
|
|
|
|
|
|
iteration. The CURLYX block, pointed to by cur_curlyx, contains the |
5411
|
|
|
|
|
|
overall state, such as the count, and pointers to the A and B ops. |
5412
|
|
|
|
|
|
|
5413
|
|
|
|
|
|
This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx |
5414
|
|
|
|
|
|
must always point to the *current* CURLYX block, the rules are: |
5415
|
|
|
|
|
|
|
5416
|
|
|
|
|
|
When executing CURLYX, save the old cur_curlyx in the CURLYX state block, |
5417
|
|
|
|
|
|
and set cur_curlyx to point the new block. |
5418
|
|
|
|
|
|
|
5419
|
|
|
|
|
|
When popping the CURLYX block after a successful or unsuccessful match, |
5420
|
|
|
|
|
|
restore the previous cur_curlyx. |
5421
|
|
|
|
|
|
|
5422
|
|
|
|
|
|
When WHILEM is about to execute B, save the current cur_curlyx, and set it |
5423
|
|
|
|
|
|
to the outer one saved in the CURLYX block. |
5424
|
|
|
|
|
|
|
5425
|
|
|
|
|
|
When popping the WHILEM block after a successful or unsuccessful B match, |
5426
|
|
|
|
|
|
restore the previous cur_curlyx. |
5427
|
|
|
|
|
|
|
5428
|
|
|
|
|
|
Here's an example for the pattern (AI* BI)*BO |
5429
|
|
|
|
|
|
I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: |
5430
|
|
|
|
|
|
|
5431
|
|
|
|
|
|
cur_ |
5432
|
|
|
|
|
|
curlyx backtrack stack |
5433
|
|
|
|
|
|
------ --------------- |
5434
|
|
|
|
|
|
NULL |
5435
|
|
|
|
|
|
CO |
5436
|
|
|
|
|
|
CI ai |
5437
|
|
|
|
|
|
CO ai bi |
5438
|
|
|
|
|
|
NULL ai bi bo |
5439
|
|
|
|
|
|
|
5440
|
|
|
|
|
|
At this point the pattern succeeds, and we work back down the stack to |
5441
|
|
|
|
|
|
clean up, restoring as we go: |
5442
|
|
|
|
|
|
|
5443
|
|
|
|
|
|
CO ai bi |
5444
|
|
|
|
|
|
CI ai |
5445
|
|
|
|
|
|
CO |
5446
|
|
|
|
|
|
NULL |
5447
|
|
|
|
|
|
|
5448
|
|
|
|
|
|
*******************************************************************/ |
5449
|
|
|
|
|
|
|
5450
|
|
|
|
|
|
#define ST st->u.curlyx |
5451
|
|
|
|
|
|
|
5452
|
|
|
|
|
|
case CURLYX: /* start of /A*B/ (for complex A) */ |
5453
|
|
|
|
|
|
{ |
5454
|
|
|
|
|
|
/* No need to save/restore up to this paren */ |
5455
|
1882854
|
|
|
|
|
I32 parenfloor = scan->flags; |
5456
|
|
|
|
|
|
|
5457
|
110032
|
|
|
|
|
assert(next); /* keep Coverity happy */ |
5458
|
110032
|
|
|
|
|
if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ |
5459
|
110032
|
|
|
|
|
next += ARG(next); |
5460
|
|
|
|
|
|
|
5461
|
|
|
|
|
|
/* XXXX Probably it is better to teach regpush to support |
5462
|
|
|
|
|
|
parenfloor > maxopenparen ... */ |
5463
|
110032
|
|
|
|
|
if (parenfloor > (I32)rex->lastparen) |
5464
|
109686
|
|
|
|
|
parenfloor = rex->lastparen; /* Pessimization... */ |
5465
|
|
|
|
|
|
|
5466
|
110032
|
|
|
|
|
ST.prev_curlyx= cur_curlyx; |
5467
|
|
|
|
|
|
cur_curlyx = st; |
5468
|
1772822
|
|
|
|
|
ST.cp = PL_savestack_ix; |
5469
|
|
|
|
|
|
|
5470
|
|
|
|
|
|
/* these fields contain the state of the current curly. |
5471
|
|
|
|
|
|
* they are accessed by subsequent WHILEMs */ |
5472
|
1882854
|
|
|
|
|
ST.parenfloor = parenfloor; |
5473
|
0
|
|
|
|
|
ST.me = scan; |
5474
|
0
|
|
|
|
|
ST.B = next; |
5475
|
16958982
|
|
|
|
|
ST.minmod = minmod; |
5476
|
|
|
|
|
|
minmod = 0; |
5477
|
15807628
|
|
|
|
|
ST.count = -1; /* this will be updated by WHILEM */ |
5478
|
21728853
|
|
|
|
|
ST.lastloc = NULL; /* this will be updated by WHILEM */ |
5479
|
|
|
|
|
|
|
5480
|
15807628
|
|
|
|
|
PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput); |
5481
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5482
|
|
|
|
|
|
} |
5483
|
|
|
|
|
|
|
5484
|
|
|
|
|
|
case CURLYX_end: /* just finished matching all of A*B */ |
5485
|
144980
|
|
|
|
|
cur_curlyx = ST.prev_curlyx; |
5486
|
144980
|
|
|
|
|
sayYES; |
5487
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5488
|
|
|
|
|
|
|
5489
|
|
|
|
|
|
case CURLYX_end_fail: /* just failed to match all of A*B */ |
5490
|
15662648
|
|
|
|
|
regcpblow(ST.cp); |
5491
|
2635444
|
|
|
|
|
cur_curlyx = ST.prev_curlyx; |
5492
|
2635444
|
|
|
|
|
sayNO; |
5493
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5494
|
|
|
|
|
|
|
5495
|
|
|
|
|
|
|
5496
|
|
|
|
|
|
#undef ST |
5497
|
|
|
|
|
|
#define ST st->u.whilem |
5498
|
|
|
|
|
|
|
5499
|
|
|
|
|
|
case WHILEM: /* just matched an A in /A*B/ (for complex A) */ |
5500
|
|
|
|
|
|
{ |
5501
|
|
|
|
|
|
/* see the discussion above about CURLYX/WHILEM */ |
5502
|
|
|
|
|
|
I32 n; |
5503
|
104255967
|
|
|
|
|
int min = ARG1(cur_curlyx->u.curlyx.me); |
5504
|
104255967
|
|
|
|
|
int max = ARG2(cur_curlyx->u.curlyx.me); |
5505
|
104255967
|
|
|
|
|
regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS; |
5506
|
|
|
|
|
|
|
5507
|
104255967
|
|
|
|
|
assert(cur_curlyx); /* keep Coverity happy */ |
5508
|
104255967
|
|
|
|
|
n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ |
5509
|
46879280
|
|
|
|
|
ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; |
5510
|
46879280
|
|
|
|
|
ST.cache_offset = 0; |
5511
|
46879280
|
|
|
|
|
ST.cache_mask = 0; |
5512
|
|
|
|
|
|
|
5513
|
|
|
|
|
|
|
5514
|
46879280
|
|
|
|
|
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, |
5515
|
|
|
|
|
|
"%*s whilem: matched %ld out of %d..%d\n", |
5516
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", (long)n, min, max) |
5517
|
|
|
|
|
|
); |
5518
|
|
|
|
|
|
|
5519
|
|
|
|
|
|
/* First just match a string of min A's. */ |
5520
|
|
|
|
|
|
|
5521
|
46879280
|
|
|
|
|
if (n < min) { |
5522
|
593850
|
|
|
|
|
ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, |
5523
|
|
|
|
|
|
maxopenparen); |
5524
|
593850
|
|
|
|
|
cur_curlyx->u.curlyx.lastloc = locinput; |
5525
|
593850
|
|
|
|
|
REGCP_SET(ST.lastcp); |
5526
|
|
|
|
|
|
|
5527
|
593850
|
|
|
|
|
PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput); |
5528
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5529
|
|
|
|
|
|
} |
5530
|
|
|
|
|
|
|
5531
|
|
|
|
|
|
/* If degenerate A matches "", assume A done. */ |
5532
|
|
|
|
|
|
|
5533
|
438074
|
|
|
|
|
if (locinput == cur_curlyx->u.curlyx.lastloc) { |
5534
|
593850
|
|
|
|
|
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, |
5535
|
|
|
|
|
|
"%*s whilem: empty match detected, trying continuation...\n", |
5536
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "") |
5537
|
|
|
|
|
|
); |
5538
|
|
|
|
|
|
goto do_whilem_B_max; |
5539
|
|
|
|
|
|
} |
5540
|
|
|
|
|
|
|
5541
|
|
|
|
|
|
/* super-linear cache processing. |
5542
|
|
|
|
|
|
* |
5543
|
|
|
|
|
|
* The idea here is that for certain types of CURLYX/WHILEM - |
5544
|
|
|
|
|
|
* principally those whose upper bound is infinity (and |
5545
|
|
|
|
|
|
* excluding regexes that have things like \1 and other very |
5546
|
|
|
|
|
|
* non-regular expresssiony things), then if a pattern like |
5547
|
|
|
|
|
|
* /....A*.../ fails and we backtrack to the WHILEM, then we |
5548
|
|
|
|
|
|
* make a note that this particular WHILEM op was at string |
5549
|
|
|
|
|
|
* position 47 (say) when the rest of pattern failed. Then, if |
5550
|
|
|
|
|
|
* we ever find ourselves back at that WHILEM, and at string |
5551
|
|
|
|
|
|
* position 47 again, we can just fail immediately rather than |
5552
|
|
|
|
|
|
* running the rest of the pattern again. |
5553
|
|
|
|
|
|
* |
5554
|
|
|
|
|
|
* This is very handy when patterns start to go |
5555
|
|
|
|
|
|
* 'super-linear', like in (a+)*(a+)*(a+)*, where you end up |
5556
|
|
|
|
|
|
* with a combinatorial explosion of backtracking. |
5557
|
|
|
|
|
|
* |
5558
|
|
|
|
|
|
* The cache is implemented as a bit array, with one bit per |
5559
|
|
|
|
|
|
* string byte position per WHILEM op (up to 16) - so its |
5560
|
|
|
|
|
|
* between 0.25 and 2x the string size. |
5561
|
|
|
|
|
|
* |
5562
|
|
|
|
|
|
* To avoid allocating a poscache buffer every time, we do an |
5563
|
|
|
|
|
|
* initially countdown; only after we have executed a WHILEM |
5564
|
|
|
|
|
|
* op (string-length x #WHILEMs) times do we allocate the |
5565
|
|
|
|
|
|
* cache. |
5566
|
|
|
|
|
|
* |
5567
|
|
|
|
|
|
* The top 4 bits of scan->flags byte say how many different |
5568
|
|
|
|
|
|
* relevant CURLLYX/WHILEM op pairs there are, while the |
5569
|
|
|
|
|
|
* bottom 4-bits is the identifying index number of this |
5570
|
|
|
|
|
|
* WHILEM. |
5571
|
|
|
|
|
|
*/ |
5572
|
|
|
|
|
|
|
5573
|
593850
|
|
|
|
|
if (scan->flags) { |
5574
|
|
|
|
|
|
|
5575
|
594100
|
|
|
|
|
if (!reginfo->poscache_maxiter) { |
5576
|
|
|
|
|
|
/* start the countdown: Postpone detection until we |
5577
|
|
|
|
|
|
* know the match is not *that* much linear. */ |
5578
|
|
|
|
|
|
reginfo->poscache_maxiter |
5579
|
500
|
|
|
|
|
= (reginfo->strend - reginfo->strbeg + 1) |
5580
|
500
|
|
|
|
|
* (scan->flags>>4); |
5581
|
|
|
|
|
|
/* possible overflow for long strings and many CURLYX's */ |
5582
|
500
|
|
|
|
|
if (reginfo->poscache_maxiter < 0) |
5583
|
593850
|
|
|
|
|
reginfo->poscache_maxiter = I32_MAX; |
5584
|
593850
|
|
|
|
|
reginfo->poscache_iter = reginfo->poscache_maxiter; |
5585
|
|
|
|
|
|
} |
5586
|
|
|
|
|
|
|
5587
|
37202816
|
|
|
|
|
if (reginfo->poscache_iter-- == 0) { |
5588
|
|
|
|
|
|
/* initialise cache */ |
5589
|
37202816
|
|
|
|
|
const SSize_t size = (reginfo->poscache_maxiter + 7)/8; |
5590
|
37202816
|
|
|
|
|
regmatch_info_aux *const aux = reginfo->info_aux; |
5591
|
37202816
|
|
|
|
|
if (aux->poscache) { |
5592
|
188931913
|
|
|
|
|
if ((SSize_t)reginfo->poscache_size < size) { |
5593
|
59466306
|
|
|
|
|
Renew(aux->poscache, size, char); |
5594
|
129465607
|
|
|
|
|
reginfo->poscache_size = size; |
5595
|
|
|
|
|
|
} |
5596
|
66929416
|
|
|
|
|
Zero(aux->poscache, size, char); |
5597
|
|
|
|
|
|
} |
5598
|
|
|
|
|
|
else { |
5599
|
129465607
|
|
|
|
|
reginfo->poscache_size = size; |
5600
|
22838179
|
|
|
|
|
Newxz(aux->poscache, size, char); |
5601
|
|
|
|
|
|
} |
5602
|
106627428
|
|
|
|
|
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, |
5603
|
|
|
|
|
|
"%swhilem: Detected a super-linear match, switching on caching%s...\n", |
5604
|
|
|
|
|
|
PL_colors[4], PL_colors[5]) |
5605
|
|
|
|
|
|
); |
5606
|
|
|
|
|
|
} |
5607
|
|
|
|
|
|
|
5608
|
16383
|
|
|
|
|
if (reginfo->poscache_iter < 0) { |
5609
|
|
|
|
|
|
/* have we already failed at this position? */ |
5610
|
|
|
|
|
|
SSize_t offset, mask; |
5611
|
|
|
|
|
|
|
5612
|
212496611
|
|
|
|
|
reginfo->poscache_iter = -1; /* stop eventual underflow */ |
5613
|
213222090
|
|
|
|
|
offset = (scan->flags & 0xf) - 1 |
5614
|
188930225
|
|
|
|
|
+ (locinput - reginfo->strbeg) |
5615
|
188930225
|
|
|
|
|
* (scan->flags>>4); |
5616
|
188930225
|
|
|
|
|
mask = 1 << (offset % 8); |
5617
|
5695768
|
|
|
|
|
offset /= 8; |
5618
|
5830660
|
|
|
|
|
if (reginfo->info_aux->poscache[offset] & mask) { |
5619
|
134892
|
|
|
|
|
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, |
5620
|
|
|
|
|
|
"%*s whilem: (cache) already tried at this position...\n", |
5621
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "") |
5622
|
|
|
|
|
|
); |
5623
|
|
|
|
|
|
sayNO; /* cache records failure */ |
5624
|
|
|
|
|
|
} |
5625
|
134892
|
|
|
|
|
ST.cache_offset = offset; |
5626
|
5682418
|
|
|
|
|
ST.cache_mask = mask; |
5627
|
|
|
|
|
|
} |
5628
|
|
|
|
|
|
} |
5629
|
|
|
|
|
|
|
5630
|
|
|
|
|
|
/* Prefer B over A for minimal matching. */ |
5631
|
|
|
|
|
|
|
5632
|
5682418
|
|
|
|
|
if (cur_curlyx->u.curlyx.minmod) { |
5633
|
5682418
|
|
|
|
|
ST.save_curlyx = cur_curlyx; |
5634
|
5682418
|
|
|
|
|
cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; |
5635
|
2811375
|
|
|
|
|
ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor, |
5636
|
|
|
|
|
|
maxopenparen); |
5637
|
2811375
|
|
|
|
|
REGCP_SET(ST.lastcp); |
5638
|
2808393
|
|
|
|
|
PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, |
5639
|
|
|
|
|
|
locinput); |
5640
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5641
|
|
|
|
|
|
} |
5642
|
|
|
|
|
|
|
5643
|
|
|
|
|
|
/* Prefer A over B for maximal matching. */ |
5644
|
|
|
|
|
|
|
5645
|
2808393
|
|
|
|
|
if (n < max) { /* More greed allowed? */ |
5646
|
2868
|
|
|
|
|
ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, |
5647
|
|
|
|
|
|
maxopenparen); |
5648
|
1026
|
|
|
|
|
cur_curlyx->u.curlyx.lastloc = locinput; |
5649
|
2982
|
|
|
|
|
REGCP_SET(ST.lastcp); |
5650
|
2172
|
|
|
|
|
PUSH_STATE_GOTO(WHILEM_A_max, A, locinput); |
5651
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5652
|
|
|
|
|
|
} |
5653
|
|
|
|
|
|
goto do_whilem_B_max; |
5654
|
|
|
|
|
|
} |
5655
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5656
|
|
|
|
|
|
|
5657
|
|
|
|
|
|
case WHILEM_B_min: /* just matched B in a minimal match */ |
5658
|
|
|
|
|
|
case WHILEM_B_max: /* just matched B in a maximal match */ |
5659
|
4998
|
|
|
|
|
cur_curlyx = ST.save_curlyx; |
5660
|
3162
|
|
|
|
|
sayYES; |
5661
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5662
|
|
|
|
|
|
|
5663
|
|
|
|
|
|
case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ |
5664
|
1740
|
|
|
|
|
cur_curlyx = ST.save_curlyx; |
5665
|
810
|
|
|
|
|
cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; |
5666
|
810
|
|
|
|
|
cur_curlyx->u.curlyx.count--; |
5667
|
358
|
|
|
|
|
CACHEsayNO; |
5668
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5669
|
|
|
|
|
|
|
5670
|
|
|
|
|
|
case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ |
5671
|
|
|
|
|
|
/* FALL THROUGH */ |
5672
|
|
|
|
|
|
case WHILEM_A_pre_fail: /* just failed to match even minimal A */ |
5673
|
183234457
|
|
|
|
|
REGCP_UNWIND(ST.lastcp); |
5674
|
183234457
|
|
|
|
|
regcppop(rex, &maxopenparen); |
5675
|
183234457
|
|
|
|
|
cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; |
5676
|
160374317
|
|
|
|
|
cur_curlyx->u.curlyx.count--; |
5677
|
160374317
|
|
|
|
|
CACHEsayNO; |
5678
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5679
|
|
|
|
|
|
|
5680
|
|
|
|
|
|
case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ |
5681
|
90106554
|
|
|
|
|
REGCP_UNWIND(ST.lastcp); |
5682
|
4426240
|
|
|
|
|
regcppop(rex, &maxopenparen); /* Restore some previous $s? */ |
5683
|
4426240
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
5684
|
|
|
|
|
|
"%*s whilem: failed, trying continuation...\n", |
5685
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "") |
5686
|
|
|
|
|
|
); |
5687
|
|
|
|
|
|
do_whilem_B_max: |
5688
|
56540
|
|
|
|
|
if (cur_curlyx->u.curlyx.count >= REG_INFTY |
5689
|
160374317
|
|
|
|
|
&& ckWARN(WARN_REGEXP) |
5690
|
160374317
|
|
|
|
|
&& !reginfo->warned) |
5691
|
|
|
|
|
|
{ |
5692
|
27186
|
|
|
|
|
reginfo->warned = TRUE; |
5693
|
27186
|
|
|
|
|
Perl_warner(aTHX_ packWARN(WARN_REGEXP), |
5694
|
|
|
|
|
|
"Complex regular subexpression recursion limit (%d) " |
5695
|
|
|
|
|
|
"exceeded", |
5696
|
|
|
|
|
|
REG_INFTY - 1); |
5697
|
|
|
|
|
|
} |
5698
|
|
|
|
|
|
|
5699
|
|
|
|
|
|
/* now try B */ |
5700
|
0
|
|
|
|
|
ST.save_curlyx = cur_curlyx; |
5701
|
27186
|
|
|
|
|
cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; |
5702
|
27186
|
|
|
|
|
PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, |
5703
|
|
|
|
|
|
locinput); |
5704
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5705
|
|
|
|
|
|
|
5706
|
|
|
|
|
|
case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ |
5707
|
504
|
|
|
|
|
cur_curlyx = ST.save_curlyx; |
5708
|
26682
|
|
|
|
|
REGCP_UNWIND(ST.lastcp); |
5709
|
27186
|
|
|
|
|
regcppop(rex, &maxopenparen); |
5710
|
|
|
|
|
|
|
5711
|
2838561
|
|
|
|
|
if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) { |
5712
|
|
|
|
|
|
/* Maximum greed exceeded */ |
5713
|
4518
|
|
|
|
|
if (cur_curlyx->u.curlyx.count >= REG_INFTY |
5714
|
4518
|
|
|
|
|
&& ckWARN(WARN_REGEXP) |
5715
|
15798
|
|
|
|
|
&& !reginfo->warned) |
5716
|
|
|
|
|
|
{ |
5717
|
13896
|
|
|
|
|
reginfo->warned = TRUE; |
5718
|
13144
|
|
|
|
|
Perl_warner(aTHX_ packWARN(WARN_REGEXP), |
5719
|
|
|
|
|
|
"Complex regular subexpression recursion " |
5720
|
|
|
|
|
|
"limit (%d) exceeded", |
5721
|
|
|
|
|
|
REG_INFTY - 1); |
5722
|
|
|
|
|
|
} |
5723
|
13144
|
|
|
|
|
cur_curlyx->u.curlyx.count--; |
5724
|
3364
|
|
|
|
|
CACHEsayNO; |
5725
|
|
|
|
|
|
} |
5726
|
|
|
|
|
|
|
5727
|
2564
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
5728
|
|
|
|
|
|
"%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "") |
5729
|
|
|
|
|
|
); |
5730
|
|
|
|
|
|
/* Try grabbing another A and see if it helps. */ |
5731
|
2396
|
|
|
|
|
cur_curlyx->u.curlyx.lastloc = locinput; |
5732
|
1500
|
|
|
|
|
ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, |
5733
|
|
|
|
|
|
maxopenparen); |
5734
|
1500
|
|
|
|
|
REGCP_SET(ST.lastcp); |
5735
|
2834043
|
|
|
|
|
PUSH_STATE_GOTO(WHILEM_A_min, |
5736
|
|
|
|
|
|
/*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, |
5737
|
|
|
|
|
|
locinput); |
5738
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5739
|
|
|
|
|
|
|
5740
|
|
|
|
|
|
#undef ST |
5741
|
|
|
|
|
|
#define ST st->u.branch |
5742
|
|
|
|
|
|
|
5743
|
|
|
|
|
|
case BRANCHJ: /* /(...|A|...)/ with long next pointer */ |
5744
|
58182923
|
|
|
|
|
next = scan + ARG(scan); |
5745
|
38788433
|
|
|
|
|
if (next == scan) |
5746
|
|
|
|
|
|
next = NULL; |
5747
|
35957336
|
|
|
|
|
scan = NEXTOPER(scan); |
5748
|
|
|
|
|
|
/* FALL THROUGH */ |
5749
|
|
|
|
|
|
|
5750
|
|
|
|
|
|
case BRANCH: /* /(...|A|...)/ */ |
5751
|
2912
|
|
|
|
|
scan = NEXTOPER(scan); /* scan now points to inner node */ |
5752
|
2786
|
|
|
|
|
ST.lastparen = rex->lastparen; |
5753
|
2576
|
|
|
|
|
ST.lastcloseparen = rex->lastcloseparen; |
5754
|
1496
|
|
|
|
|
ST.next_branch = next; |
5755
|
2834043
|
|
|
|
|
REGCP_SET(ST.cp); |
5756
|
|
|
|
|
|
|
5757
|
|
|
|
|
|
/* Now go into the branch */ |
5758
|
2838561
|
|
|
|
|
if (has_cutgroup) { |
5759
|
2834203
|
|
|
|
|
PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput); |
5760
|
|
|
|
|
|
} else { |
5761
|
2274643
|
|
|
|
|
PUSH_STATE_GOTO(BRANCH_next, scan, locinput); |
5762
|
|
|
|
|
|
} |
5763
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5764
|
|
|
|
|
|
|
5765
|
|
|
|
|
|
case CUTGROUP: /* /(*THEN)/ */ |
5766
|
2274643
|
|
|
|
|
sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : |
5767
|
2274643
|
|
|
|
|
MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); |
5768
|
2818339
|
|
|
|
|
PUSH_STATE_GOTO(CUTGROUP_next, next, locinput); |
5769
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5770
|
|
|
|
|
|
|
5771
|
|
|
|
|
|
case CUTGROUP_next_fail: |
5772
|
|
|
|
|
|
do_cutgroup = 1; |
5773
|
|
|
|
|
|
no_final = 1; |
5774
|
2818339
|
|
|
|
|
if (st->u.mark.mark_name) |
5775
|
0
|
|
|
|
|
sv_commit = st->u.mark.mark_name; |
5776
|
|
|
|
|
|
sayNO; |
5777
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5778
|
|
|
|
|
|
|
5779
|
|
|
|
|
|
case BRANCH_next: |
5780
|
|
|
|
|
|
sayYES; |
5781
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5782
|
|
|
|
|
|
|
5783
|
|
|
|
|
|
case BRANCH_next_fail: /* that branch failed; try the next, if any */ |
5784
|
2818339
|
|
|
|
|
if (do_cutgroup) { |
5785
|
|
|
|
|
|
do_cutgroup = 0; |
5786
|
|
|
|
|
|
no_final = 0; |
5787
|
|
|
|
|
|
} |
5788
|
6118662
|
|
|
|
|
REGCP_UNWIND(ST.cp); |
5789
|
6118662
|
|
|
|
|
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); |
5790
|
0
|
|
|
|
|
scan = ST.next_branch; |
5791
|
|
|
|
|
|
/* no more branches? */ |
5792
|
6118662
|
|
|
|
|
if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { |
5793
|
6118662
|
|
|
|
|
DEBUG_EXECUTE_r({ |
5794
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
5795
|
|
|
|
|
|
"%*s %sBRANCH failed...%s\n", |
5796
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", |
5797
|
|
|
|
|
|
PL_colors[4], |
5798
|
|
|
|
|
|
PL_colors[5] ); |
5799
|
|
|
|
|
|
}); |
5800
|
|
|
|
|
|
sayNO_SILENT; |
5801
|
|
|
|
|
|
} |
5802
|
3509270
|
|
|
|
|
continue; /* execute next BRANCH[J] op */ |
5803
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5804
|
|
|
|
|
|
|
5805
|
|
|
|
|
|
case MINMOD: /* next op will be non-greedy, e.g. A*? */ |
5806
|
|
|
|
|
|
minmod = 1; |
5807
|
3509270
|
|
|
|
|
break; |
5808
|
|
|
|
|
|
|
5809
|
|
|
|
|
|
#undef ST |
5810
|
|
|
|
|
|
#define ST st->u.curlym |
5811
|
|
|
|
|
|
|
5812
|
|
|
|
|
|
case CURLYM: /* /A{m,n}B/ where A is fixed-length */ |
5813
|
|
|
|
|
|
|
5814
|
|
|
|
|
|
/* This is an optimisation of CURLYX that enables us to push |
5815
|
|
|
|
|
|
* only a single backtracking state, no matter how many matches |
5816
|
|
|
|
|
|
* there are in {m,n}. It relies on the pattern being constant |
5817
|
|
|
|
|
|
* length, with no parens to influence future backrefs |
5818
|
|
|
|
|
|
*/ |
5819
|
|
|
|
|
|
|
5820
|
3509270
|
|
|
|
|
ST.me = scan; |
5821
|
0
|
|
|
|
|
scan = NEXTOPER(scan) + NODE_STEP_REGNODE; |
5822
|
|
|
|
|
|
|
5823
|
6380021
|
|
|
|
|
ST.lastparen = rex->lastparen; |
5824
|
6380021
|
|
|
|
|
ST.lastcloseparen = rex->lastcloseparen; |
5825
|
|
|
|
|
|
|
5826
|
|
|
|
|
|
/* if paren positive, emulate an OPEN/CLOSE around A */ |
5827
|
0
|
|
|
|
|
if (ST.me->flags) { |
5828
|
6380021
|
|
|
|
|
U32 paren = ST.me->flags; |
5829
|
529450262
|
|
|
|
|
if (paren > maxopenparen) |
5830
|
4174
|
|
|
|
|
maxopenparen = paren; |
5831
|
529447875
|
|
|
|
|
scan += NEXT_OFF(scan); /* Skip former OPEN. */ |
5832
|
|
|
|
|
|
} |
5833
|
529447875
|
|
|
|
|
ST.A = scan; |
5834
|
410544422
|
|
|
|
|
ST.B = next; |
5835
|
20439
|
|
|
|
|
ST.alen = 0; |
5836
|
13626
|
|
|
|
|
ST.count = 0; |
5837
|
615424876
|
|
|
|
|
ST.minmod = minmod; |
5838
|
|
|
|
|
|
minmod = 0; |
5839
|
410530796
|
|
|
|
|
ST.c1 = CHRTEST_UNINIT; |
5840
|
529447875
|
|
|
|
|
REGCP_SET(ST.cp); |
5841
|
|
|
|
|
|
|
5842
|
195992257
|
|
|
|
|
if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ |
5843
|
|
|
|
|
|
goto curlym_do_B; |
5844
|
|
|
|
|
|
|
5845
|
|
|
|
|
|
curlym_do_A: /* execute the A in /A{m,n}B/ */ |
5846
|
195992257
|
|
|
|
|
PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */ |
5847
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5848
|
|
|
|
|
|
|
5849
|
|
|
|
|
|
case CURLYM_A: /* we've just matched an A */ |
5850
|
439292424
|
|
|
|
|
ST.count++; |
5851
|
|
|
|
|
|
/* after first match, determine A's length: u.curlym.alen */ |
5852
|
439292424
|
|
|
|
|
if (ST.count == 1) { |
5853
|
259827
|
|
|
|
|
if (reginfo->is_utf8_target) { |
5854
|
439292424
|
|
|
|
|
char *s = st->locinput; |
5855
|
369073858
|
|
|
|
|
while (s < locinput) { |
5856
|
112441273
|
|
|
|
|
ST.alen++; |
5857
|
632368
|
|
|
|
|
s += UTF8SKIP(s); |
5858
|
|
|
|
|
|
} |
5859
|
|
|
|
|
|
} |
5860
|
|
|
|
|
|
else { |
5861
|
632368
|
|
|
|
|
ST.alen = locinput - st->locinput; |
5862
|
|
|
|
|
|
} |
5863
|
632368
|
|
|
|
|
if (ST.alen == 0) |
5864
|
632368
|
|
|
|
|
ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me); |
5865
|
|
|
|
|
|
} |
5866
|
729524
|
|
|
|
|
DEBUG_EXECUTE_r( |
5867
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
5868
|
|
|
|
|
|
"%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n", |
5869
|
|
|
|
|
|
(int)(REPORT_CODE_OFF+(depth*2)), "", |
5870
|
|
|
|
|
|
(IV) ST.count, (IV)ST.alen) |
5871
|
|
|
|
|
|
); |
5872
|
|
|
|
|
|
|
5873
|
632368
|
|
|
|
|
if (cur_eval && cur_eval->u.eval.close_paren && |
5874
|
632368
|
|
|
|
|
cur_eval->u.eval.close_paren == (U32)ST.me->flags) |
5875
|
|
|
|
|
|
goto fake_end; |
5876
|
|
|
|
|
|
|
5877
|
|
|
|
|
|
{ |
5878
|
632368
|
|
|
|
|
I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)); |
5879
|
632368
|
|
|
|
|
if ( max == REG_INFTY || ST.count < max ) |
5880
|
|
|
|
|
|
goto curlym_do_A; /* try to match another A */ |
5881
|
|
|
|
|
|
} |
5882
|
|
|
|
|
|
goto curlym_do_B; /* try to match B */ |
5883
|
|
|
|
|
|
|
5884
|
|
|
|
|
|
case CURLYM_A_fail: /* just failed to match an A */ |
5885
|
632368
|
|
|
|
|
REGCP_UNWIND(ST.cp); |
5886
|
|
|
|
|
|
|
5887
|
632368
|
|
|
|
|
if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ |
5888
|
632368
|
|
|
|
|
|| (cur_eval && cur_eval->u.eval.close_paren && |
5889
|
261636
|
|
|
|
|
cur_eval->u.eval.close_paren == (U32)ST.me->flags)) |
5890
|
|
|
|
|
|
sayNO; |
5891
|
|
|
|
|
|
|
5892
|
|
|
|
|
|
curlym_do_B: /* execute the B in /A{m,n}B/ */ |
5893
|
632368
|
|
|
|
|
if (ST.c1 == CHRTEST_UNINIT) { |
5894
|
|
|
|
|
|
/* calculate c1 and c2 for possible match of 1st char |
5895
|
|
|
|
|
|
* following curly */ |
5896
|
111808905
|
|
|
|
|
ST.c1 = ST.c2 = CHRTEST_VOID; |
5897
|
840636
|
|
|
|
|
if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { |
5898
|
840636
|
|
|
|
|
regnode *text_node = ST.B; |
5899
|
3509278
|
|
|
|
|
if (! HAS_TEXT(text_node)) |
5900
|
3509278
|
|
|
|
|
FIND_NEXT_IMPT(text_node); |
5901
|
|
|
|
|
|
/* this used to be |
5902
|
|
|
|
|
|
|
5903
|
|
|
|
|
|
(HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT) |
5904
|
|
|
|
|
|
|
5905
|
|
|
|
|
|
But the former is redundant in light of the latter. |
5906
|
|
|
|
|
|
|
5907
|
|
|
|
|
|
if this changes back then the macro for |
5908
|
|
|
|
|
|
IS_TEXT and friends need to change. |
5909
|
|
|
|
|
|
*/ |
5910
|
11840382
|
|
|
|
|
if (PL_regkind[OP(text_node)] == EXACT) { |
5911
|
15349660
|
|
|
|
|
if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ |
5912
|
8764976
|
|
|
|
|
text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, |
5913
|
|
|
|
|
|
reginfo)) |
5914
|
|
|
|
|
|
{ |
5915
|
|
|
|
|
|
sayNO; |
5916
|
|
|
|
|
|
} |
5917
|
|
|
|
|
|
} |
5918
|
|
|
|
|
|
} |
5919
|
|
|
|
|
|
} |
5920
|
|
|
|
|
|
|
5921
|
8764976
|
|
|
|
|
DEBUG_EXECUTE_r( |
5922
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
5923
|
|
|
|
|
|
"%*s CURLYM trying tail with matches=%"IVdf"...\n", |
5924
|
|
|
|
|
|
(int)(REPORT_CODE_OFF+(depth*2)), |
5925
|
|
|
|
|
|
"", (IV)ST.count) |
5926
|
|
|
|
|
|
); |
5927
|
362910
|
|
|
|
|
if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { |
5928
|
0
|
|
|
|
|
if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) { |
5929
|
362910
|
|
|
|
|
if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) |
5930
|
70974
|
|
|
|
|
&& memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) |
5931
|
|
|
|
|
|
{ |
5932
|
|
|
|
|
|
/* simulate B failing */ |
5933
|
70974
|
|
|
|
|
DEBUG_OPTIMISE_r( |
5934
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
5935
|
|
|
|
|
|
"%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", |
5936
|
|
|
|
|
|
(int)(REPORT_CODE_OFF+(depth*2)),"", |
5937
|
|
|
|
|
|
valid_utf8_to_uvchr((U8 *) locinput, NULL), |
5938
|
|
|
|
|
|
valid_utf8_to_uvchr(ST.c1_utf8, NULL), |
5939
|
|
|
|
|
|
valid_utf8_to_uvchr(ST.c2_utf8, NULL)) |
5940
|
|
|
|
|
|
); |
5941
|
|
|
|
|
|
state_num = CURLYM_B_fail; |
5942
|
|
|
|
|
|
goto reenter_switch; |
5943
|
|
|
|
|
|
} |
5944
|
|
|
|
|
|
} |
5945
|
15827386
|
|
|
|
|
else if (nextchr != ST.c1 && nextchr != ST.c2) { |
5946
|
|
|
|
|
|
/* simulate B failing */ |
5947
|
15827386
|
|
|
|
|
DEBUG_OPTIMISE_r( |
5948
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
5949
|
|
|
|
|
|
"%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", |
5950
|
|
|
|
|
|
(int)(REPORT_CODE_OFF+(depth*2)),"", |
5951
|
|
|
|
|
|
(int) nextchr, ST.c1, ST.c2) |
5952
|
|
|
|
|
|
); |
5953
|
|
|
|
|
|
state_num = CURLYM_B_fail; |
5954
|
|
|
|
|
|
goto reenter_switch; |
5955
|
|
|
|
|
|
} |
5956
|
|
|
|
|
|
} |
5957
|
|
|
|
|
|
|
5958
|
15827386
|
|
|
|
|
if (ST.me->flags) { |
5959
|
|
|
|
|
|
/* emulate CLOSE: mark current A as captured */ |
5960
|
10422058
|
|
|
|
|
I32 paren = ST.me->flags; |
5961
|
15827386
|
|
|
|
|
if (ST.count) { |
5962
|
650
|
|
|
|
|
rex->offs[paren].start |
5963
|
15826736
|
|
|
|
|
= HOPc(locinput, -ST.alen) - reginfo->strbeg; |
5964
|
7031634
|
|
|
|
|
rex->offs[paren].end = locinput - reginfo->strbeg; |
5965
|
6516736
|
|
|
|
|
if ((U32)paren > rex->lastparen) |
5966
|
7031634
|
|
|
|
|
rex->lastparen = paren; |
5967
|
7031634
|
|
|
|
|
rex->lastcloseparen = paren; |
5968
|
|
|
|
|
|
} |
5969
|
|
|
|
|
|
else |
5970
|
7031634
|
|
|
|
|
rex->offs[paren].end = -1; |
5971
|
0
|
|
|
|
|
if (cur_eval && cur_eval->u.eval.close_paren && |
5972
|
0
|
|
|
|
|
cur_eval->u.eval.close_paren == (U32)ST.me->flags) |
5973
|
|
|
|
|
|
{ |
5974
|
54
|
|
|
|
|
if (ST.count) |
5975
|
|
|
|
|
|
goto fake_end; |
5976
|
|
|
|
|
|
else |
5977
|
|
|
|
|
|
sayNO; |
5978
|
|
|
|
|
|
} |
5979
|
|
|
|
|
|
} |
5980
|
|
|
|
|
|
|
5981
|
894
|
|
|
|
|
PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */ |
5982
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
5983
|
|
|
|
|
|
|
5984
|
|
|
|
|
|
case CURLYM_B_fail: /* just failed to match a B */ |
5985
|
22
|
|
|
|
|
REGCP_UNWIND(ST.cp); |
5986
|
1233
|
|
|
|
|
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); |
5987
|
278
|
|
|
|
|
if (ST.minmod) { |
5988
|
278
|
|
|
|
|
I32 max = ARG2(ST.me); |
5989
|
278
|
|
|
|
|
if (max != REG_INFTY && ST.count == max) |
5990
|
|
|
|
|
|
sayNO; |
5991
|
|
|
|
|
|
goto curlym_do_A; /* try to match a further A */ |
5992
|
|
|
|
|
|
} |
5993
|
|
|
|
|
|
/* backtrack one A */ |
5994
|
278
|
|
|
|
|
if (ST.count == ARG1(ST.me) /* min */) |
5995
|
|
|
|
|
|
sayNO; |
5996
|
278
|
|
|
|
|
ST.count--; |
5997
|
20
|
|
|
|
|
SET_locinput(HOPc(locinput, -ST.alen)); |
5998
|
|
|
|
|
|
goto curlym_do_B; /* try to match B */ |
5999
|
|
|
|
|
|
|
6000
|
|
|
|
|
|
#undef ST |
6001
|
|
|
|
|
|
#define ST st->u.curly |
6002
|
|
|
|
|
|
|
6003
|
|
|
|
|
|
#define CURLY_SETPAREN(paren, success) \ |
6004
|
|
|
|
|
|
if (paren) { \ |
6005
|
|
|
|
|
|
if (success) { \ |
6006
|
|
|
|
|
|
rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \ |
6007
|
|
|
|
|
|
rex->offs[paren].end = locinput - reginfo->strbeg; \ |
6008
|
|
|
|
|
|
if (paren > rex->lastparen) \ |
6009
|
|
|
|
|
|
rex->lastparen = paren; \ |
6010
|
|
|
|
|
|
rex->lastcloseparen = paren; \ |
6011
|
|
|
|
|
|
} \ |
6012
|
|
|
|
|
|
else { \ |
6013
|
|
|
|
|
|
rex->offs[paren].end = -1; \ |
6014
|
|
|
|
|
|
rex->lastparen = ST.lastparen; \ |
6015
|
|
|
|
|
|
rex->lastcloseparen = ST.lastcloseparen; \ |
6016
|
|
|
|
|
|
} \ |
6017
|
|
|
|
|
|
} |
6018
|
|
|
|
|
|
|
6019
|
|
|
|
|
|
case STAR: /* /A*B/ where A is width 1 char */ |
6020
|
20
|
|
|
|
|
ST.paren = 0; |
6021
|
258
|
|
|
|
|
ST.min = 0; |
6022
|
164
|
|
|
|
|
ST.max = REG_INFTY; |
6023
|
154
|
|
|
|
|
scan = NEXTOPER(scan); |
6024
|
164
|
|
|
|
|
goto repeat; |
6025
|
|
|
|
|
|
|
6026
|
|
|
|
|
|
case PLUS: /* /A+B/ where A is width 1 char */ |
6027
|
258
|
|
|
|
|
ST.paren = 0; |
6028
|
258
|
|
|
|
|
ST.min = 1; |
6029
|
258
|
|
|
|
|
ST.max = REG_INFTY; |
6030
|
252
|
|
|
|
|
scan = NEXTOPER(scan); |
6031
|
84
|
|
|
|
|
goto repeat; |
6032
|
|
|
|
|
|
|
6033
|
|
|
|
|
|
case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ |
6034
|
84
|
|
|
|
|
ST.paren = scan->flags; /* Which paren to set */ |
6035
|
84
|
|
|
|
|
ST.lastparen = rex->lastparen; |
6036
|
168
|
|
|
|
|
ST.lastcloseparen = rex->lastcloseparen; |
6037
|
268
|
|
|
|
|
if (ST.paren > maxopenparen) |
6038
|
184
|
|
|
|
|
maxopenparen = ST.paren; |
6039
|
168
|
|
|
|
|
ST.min = ARG1(scan); /* min to match */ |
6040
|
168
|
|
|
|
|
ST.max = ARG2(scan); /* max to match */ |
6041
|
16
|
|
|
|
|
if (cur_eval && cur_eval->u.eval.close_paren && |
6042
|
244
|
|
|
|
|
cur_eval->u.eval.close_paren == (U32)ST.paren) { |
6043
|
164
|
|
|
|
|
ST.min=1; |
6044
|
80
|
|
|
|
|
ST.max=1; |
6045
|
|
|
|
|
|
} |
6046
|
80
|
|
|
|
|
scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); |
6047
|
80
|
|
|
|
|
goto repeat; |
6048
|
|
|
|
|
|
|
6049
|
|
|
|
|
|
case CURLY: /* /A{m,n}B/ where A is width 1 char */ |
6050
|
14
|
|
|
|
|
ST.paren = 0; |
6051
|
46422
|
|
|
|
|
ST.min = ARG1(scan); /* min to match */ |
6052
|
882
|
|
|
|
|
ST.max = ARG2(scan); /* max to match */ |
6053
|
882
|
|
|
|
|
scan = NEXTOPER(scan) + NODE_STEP_REGNODE; |
6054
|
|
|
|
|
|
repeat: |
6055
|
|
|
|
|
|
/* |
6056
|
|
|
|
|
|
* Lookahead to avoid useless match attempts |
6057
|
|
|
|
|
|
* when we know what character comes next. |
6058
|
|
|
|
|
|
* |
6059
|
|
|
|
|
|
* Used to only do .*x and .*?x, but now it allows |
6060
|
|
|
|
|
|
* for )'s, ('s and (?{ ... })'s to be in the way |
6061
|
|
|
|
|
|
* of the quantifier and the EXACT-like node. -- japhy |
6062
|
|
|
|
|
|
*/ |
6063
|
|
|
|
|
|
|
6064
|
0
|
|
|
|
|
assert(ST.min <= ST.max); |
6065
|
0
|
|
|
|
|
if (! HAS_TEXT(next) && ! JUMPABLE(next)) { |
6066
|
0
|
|
|
|
|
ST.c1 = ST.c2 = CHRTEST_VOID; |
6067
|
|
|
|
|
|
} |
6068
|
|
|
|
|
|
else { |
6069
|
|
|
|
|
|
regnode *text_node = next; |
6070
|
|
|
|
|
|
|
6071
|
25264060
|
|
|
|
|
if (! HAS_TEXT(text_node)) |
6072
|
6363152
|
|
|
|
|
FIND_NEXT_IMPT(text_node); |
6073
|
|
|
|
|
|
|
6074
|
6363152
|
|
|
|
|
if (! HAS_TEXT(text_node)) |
6075
|
18900908
|
|
|
|
|
ST.c1 = ST.c2 = CHRTEST_VOID; |
6076
|
|
|
|
|
|
else { |
6077
|
583511433
|
|
|
|
|
if ( PL_regkind[OP(text_node)] != EXACT ) { |
6078
|
131520404
|
|
|
|
|
ST.c1 = ST.c2 = CHRTEST_VOID; |
6079
|
|
|
|
|
|
} |
6080
|
|
|
|
|
|
else { |
6081
|
|
|
|
|
|
|
6082
|
|
|
|
|
|
/* Currently we only get here when |
6083
|
|
|
|
|
|
|
6084
|
|
|
|
|
|
PL_rekind[OP(text_node)] == EXACT |
6085
|
|
|
|
|
|
|
6086
|
|
|
|
|
|
if this changes back then the macro for IS_TEXT and |
6087
|
|
|
|
|
|
friends need to change. */ |
6088
|
663703331
|
|
|
|
|
if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ |
6089
|
663703331
|
|
|
|
|
text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, |
6090
|
|
|
|
|
|
reginfo)) |
6091
|
|
|
|
|
|
{ |
6092
|
|
|
|
|
|
sayNO; |
6093
|
|
|
|
|
|
} |
6094
|
|
|
|
|
|
} |
6095
|
|
|
|
|
|
} |
6096
|
|
|
|
|
|
} |
6097
|
|
|
|
|
|
|
6098
|
663703331
|
|
|
|
|
ST.A = scan; |
6099
|
663703331
|
|
|
|
|
ST.B = next; |
6100
|
1889268
|
|
|
|
|
if (minmod) { |
6101
|
663703331
|
|
|
|
|
char *li = locinput; |
6102
|
|
|
|
|
|
minmod = 0; |
6103
|
1036151107
|
|
|
|
|
if (ST.min && |
6104
|
0
|
|
|
|
|
regrepeat(rex, &li, ST.A, reginfo, ST.min, depth) |
6105
|
136660069
|
|
|
|
|
< ST.min) |
6106
|
|
|
|
|
|
sayNO; |
6107
|
26376090
|
|
|
|
|
SET_locinput(li); |
6108
|
26213858
|
|
|
|
|
ST.count = ST.min; |
6109
|
177624
|
|
|
|
|
REGCP_SET(ST.cp); |
6110
|
177624
|
|
|
|
|
if (ST.c1 == CHRTEST_VOID) |
6111
|
|
|
|
|
|
goto curly_try_B_min; |
6112
|
|
|
|
|
|
|
6113
|
177624
|
|
|
|
|
ST.oldloc = locinput; |
6114
|
|
|
|
|
|
|
6115
|
|
|
|
|
|
/* set ST.maxpos to the furthest point along the |
6116
|
|
|
|
|
|
* string that could possibly match */ |
6117
|
26198466
|
|
|
|
|
if (ST.max == REG_INFTY) { |
6118
|
26198466
|
|
|
|
|
ST.maxpos = reginfo->strend - 1; |
6119
|
26198466
|
|
|
|
|
if (utf8_target) |
6120
|
26198466
|
|
|
|
|
while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) |
6121
|
376
|
|
|
|
|
ST.maxpos--; |
6122
|
|
|
|
|
|
} |
6123
|
26198466
|
|
|
|
|
else if (utf8_target) { |
6124
|
26198466
|
|
|
|
|
int m = ST.max - ST.min; |
6125
|
110461603
|
|
|
|
|
for (ST.maxpos = locinput; |
6126
|
10232
|
|
|
|
|
m >0 && ST.maxpos < reginfo->strend; m--) |
6127
|
1690
|
|
|
|
|
ST.maxpos += UTF8SKIP(ST.maxpos); |
6128
|
|
|
|
|
|
} |
6129
|
|
|
|
|
|
else { |
6130
|
642025740
|
|
|
|
|
ST.maxpos = locinput + ST.max - ST.min; |
6131
|
1526
|
|
|
|
|
if (ST.maxpos >= reginfo->strend) |
6132
|
642024214
|
|
|
|
|
ST.maxpos = reginfo->strend - 1; |
6133
|
|
|
|
|
|
} |
6134
|
|
|
|
|
|
goto curly_try_B_min_known; |
6135
|
|
|
|
|
|
|
6136
|
|
|
|
|
|
} |
6137
|
|
|
|
|
|
else { |
6138
|
|
|
|
|
|
/* avoid taking address of locinput, so it can remain |
6139
|
|
|
|
|
|
* a register var */ |
6140
|
522704776
|
|
|
|
|
char *li = locinput; |
6141
|
522704776
|
|
|
|
|
ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth); |
6142
|
1710774
|
|
|
|
|
if (ST.count < ST.min) |
6143
|
|
|
|
|
|
sayNO; |
6144
|
1710774
|
|
|
|
|
SET_locinput(li); |
6145
|
522704776
|
|
|
|
|
if ((ST.count > ST.min) |
6146
|
522704776
|
|
|
|
|
&& (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL)) |
6147
|
|
|
|
|
|
{ |
6148
|
|
|
|
|
|
/* A{m,n} must come at the end of the string, there's |
6149
|
|
|
|
|
|
* no point in backing off ... */ |
6150
|
522704776
|
|
|
|
|
ST.min = ST.count; |
6151
|
|
|
|
|
|
/* ...except that $ and \Z can match before *and* after |
6152
|
|
|
|
|
|
newline at the end. Consider "\n\n" =~ /\n+\Z\n/. |
6153
|
|
|
|
|
|
We may back off by one in this case. */ |
6154
|
522704776
|
|
|
|
|
if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS) |
6155
|
105321938
|
|
|
|
|
ST.min--; |
6156
|
|
|
|
|
|
} |
6157
|
522704776
|
|
|
|
|
REGCP_SET(ST.cp); |
6158
|
522704776
|
|
|
|
|
goto curly_try_B_max; |
6159
|
|
|
|
|
|
} |
6160
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6161
|
|
|
|
|
|
|
6162
|
|
|
|
|
|
|
6163
|
|
|
|
|
|
case CURLY_B_min_known_fail: |
6164
|
|
|
|
|
|
/* failed to find B in a non-greedy match where c1,c2 valid */ |
6165
|
|
|
|
|
|
|
6166
|
229782191
|
|
|
|
|
REGCP_UNWIND(ST.cp); |
6167
|
1786
|
|
|
|
|
if (ST.paren) { |
6168
|
1786
|
|
|
|
|
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); |
6169
|
|
|
|
|
|
} |
6170
|
|
|
|
|
|
/* Couldn't or didn't -- move forward. */ |
6171
|
1786
|
|
|
|
|
ST.oldloc = locinput; |
6172
|
544
|
|
|
|
|
if (utf8_target) |
6173
|
1242
|
|
|
|
|
locinput += UTF8SKIP(locinput); |
6174
|
|
|
|
|
|
else |
6175
|
1786
|
|
|
|
|
locinput++; |
6176
|
1786
|
|
|
|
|
ST.count++; |
6177
|
|
|
|
|
|
curly_try_B_min_known: |
6178
|
|
|
|
|
|
/* find the next place where 'B' could work, then call B */ |
6179
|
|
|
|
|
|
{ |
6180
|
|
|
|
|
|
int n; |
6181
|
229782191
|
|
|
|
|
if (utf8_target) { |
6182
|
53862
|
|
|
|
|
n = (ST.oldloc == locinput) ? 0 : 1; |
6183
|
229782191
|
|
|
|
|
if (ST.c1 == ST.c2) { |
6184
|
|
|
|
|
|
/* set n to utf8_distance(oldloc, locinput) */ |
6185
|
191762654
|
|
|
|
|
while (locinput <= ST.maxpos |
6186
|
191762654
|
|
|
|
|
&& memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) |
6187
|
|
|
|
|
|
{ |
6188
|
191762654
|
|
|
|
|
locinput += UTF8SKIP(locinput); |
6189
|
191762654
|
|
|
|
|
n++; |
6190
|
|
|
|
|
|
} |
6191
|
|
|
|
|
|
} |
6192
|
|
|
|
|
|
else { |
6193
|
|
|
|
|
|
/* set n to utf8_distance(oldloc, locinput) */ |
6194
|
191762654
|
|
|
|
|
while (locinput <= ST.maxpos |
6195
|
45772085
|
|
|
|
|
&& memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) |
6196
|
43693479
|
|
|
|
|
&& memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) |
6197
|
|
|
|
|
|
{ |
6198
|
191762654
|
|
|
|
|
locinput += UTF8SKIP(locinput); |
6199
|
59601419
|
|
|
|
|
n++; |
6200
|
|
|
|
|
|
} |
6201
|
|
|
|
|
|
} |
6202
|
|
|
|
|
|
} |
6203
|
|
|
|
|
|
else { /* Not utf8_target */ |
6204
|
235980
|
|
|
|
|
if (ST.c1 == ST.c2) { |
6205
|
193688
|
|
|
|
|
while (locinput <= ST.maxpos && |
6206
|
193688
|
|
|
|
|
UCHARAT(locinput) != ST.c1) |
6207
|
718968725
|
|
|
|
|
locinput++; |
6208
|
|
|
|
|
|
} |
6209
|
|
|
|
|
|
else { |
6210
|
659409598
|
|
|
|
|
while (locinput <= ST.maxpos |
6211
|
3434907
|
|
|
|
|
&& UCHARAT(locinput) != ST.c1 |
6212
|
15508
|
|
|
|
|
&& UCHARAT(locinput) != ST.c2) |
6213
|
14482
|
|
|
|
|
locinput++; |
6214
|
|
|
|
|
|
} |
6215
|
14482
|
|
|
|
|
n = locinput - ST.oldloc; |
6216
|
|
|
|
|
|
} |
6217
|
64
|
|
|
|
|
if (locinput > ST.maxpos) |
6218
|
|
|
|
|
|
sayNO; |
6219
|
18
|
|
|
|
|
if (n) { |
6220
|
|
|
|
|
|
/* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is |
6221
|
|
|
|
|
|
* at b; check that everything between oldloc and |
6222
|
|
|
|
|
|
* locinput matches */ |
6223
|
29807566
|
|
|
|
|
char *li = ST.oldloc; |
6224
|
29807566
|
|
|
|
|
ST.count += n; |
6225
|
29804414
|
|
|
|
|
if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n) |
6226
|
|
|
|
|
|
sayNO; |
6227
|
14937594
|
|
|
|
|
assert(n == REG_INFTY || locinput == li); |
6228
|
|
|
|
|
|
} |
6229
|
33412070
|
|
|
|
|
CURLY_SETPAREN(ST.paren, ST.count); |
6230
|
3607656
|
|
|
|
|
if (cur_eval && cur_eval->u.eval.close_paren && |
6231
|
3152
|
|
|
|
|
cur_eval->u.eval.close_paren == (U32)ST.paren) { |
6232
|
|
|
|
|
|
goto fake_end; |
6233
|
|
|
|
|
|
} |
6234
|
1610
|
|
|
|
|
PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput); |
6235
|
|
|
|
|
|
} |
6236
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6237
|
|
|
|
|
|
|
6238
|
|
|
|
|
|
|
6239
|
|
|
|
|
|
case CURLY_B_min_fail: |
6240
|
|
|
|
|
|
/* failed to find B in a non-greedy match where c1,c2 invalid */ |
6241
|
|
|
|
|
|
|
6242
|
3147
|
|
|
|
|
REGCP_UNWIND(ST.cp); |
6243
|
2098
|
|
|
|
|
if (ST.paren) { |
6244
|
1452
|
|
|
|
|
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); |
6245
|
|
|
|
|
|
} |
6246
|
|
|
|
|
|
/* failed -- move forward one */ |
6247
|
|
|
|
|
|
{ |
6248
|
1452
|
|
|
|
|
char *li = locinput; |
6249
|
1042
|
|
|
|
|
if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) { |
6250
|
|
|
|
|
|
sayNO; |
6251
|
|
|
|
|
|
} |
6252
|
1042
|
|
|
|
|
locinput = li; |
6253
|
|
|
|
|
|
} |
6254
|
|
|
|
|
|
{ |
6255
|
554
|
|
|
|
|
ST.count++; |
6256
|
512
|
|
|
|
|
if (ST.count <= ST.max || (ST.max == REG_INFTY && |
6257
|
1280
|
|
|
|
|
ST.count > 0)) /* count overflow ? */ |
6258
|
|
|
|
|
|
{ |
6259
|
|
|
|
|
|
curly_try_B_min: |
6260
|
512
|
|
|
|
|
CURLY_SETPAREN(ST.paren, ST.count); |
6261
|
1542
|
|
|
|
|
if (cur_eval && cur_eval->u.eval.close_paren && |
6262
|
1542
|
|
|
|
|
cur_eval->u.eval.close_paren == (U32)ST.paren) { |
6263
|
|
|
|
|
|
goto fake_end; |
6264
|
|
|
|
|
|
} |
6265
|
3479
|
|
|
|
|
PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput); |
6266
|
|
|
|
|
|
} |
6267
|
|
|
|
|
|
} |
6268
|
|
|
|
|
|
sayNO; |
6269
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6270
|
|
|
|
|
|
|
6271
|
|
|
|
|
|
|
6272
|
|
|
|
|
|
curly_try_B_max: |
6273
|
|
|
|
|
|
/* a successful greedy match: now try to match B */ |
6274
|
2578
|
|
|
|
|
if (cur_eval && cur_eval->u.eval.close_paren && |
6275
|
1168
|
|
|
|
|
cur_eval->u.eval.close_paren == (U32)ST.paren) { |
6276
|
|
|
|
|
|
goto fake_end; |
6277
|
|
|
|
|
|
} |
6278
|
|
|
|
|
|
{ |
6279
|
1166
|
|
|
|
|
bool could_match = locinput < reginfo->strend; |
6280
|
|
|
|
|
|
|
6281
|
|
|
|
|
|
/* If it could work, try it. */ |
6282
|
1166
|
|
|
|
|
if (ST.c1 != CHRTEST_VOID && could_match) { |
6283
|
1166
|
|
|
|
|
if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) |
6284
|
|
|
|
|
|
{ |
6285
|
116440
|
|
|
|
|
could_match = memEQ(locinput, |
6286
|
|
|
|
|
|
ST.c1_utf8, |
6287
|
|
|
|
|
|
UTF8SKIP(locinput)) |
6288
|
116440
|
|
|
|
|
|| memEQ(locinput, |
6289
|
|
|
|
|
|
ST.c2_utf8, |
6290
|
|
|
|
|
|
UTF8SKIP(locinput)); |
6291
|
|
|
|
|
|
} |
6292
|
|
|
|
|
|
else { |
6293
|
14800
|
|
|
|
|
could_match = UCHARAT(locinput) == ST.c1 |
6294
|
101154
|
|
|
|
|
|| UCHARAT(locinput) == ST.c2; |
6295
|
|
|
|
|
|
} |
6296
|
|
|
|
|
|
} |
6297
|
260736
|
|
|
|
|
if (ST.c1 == CHRTEST_VOID || could_match) { |
6298
|
254788
|
|
|
|
|
CURLY_SETPAREN(ST.paren, ST.count); |
6299
|
63646
|
|
|
|
|
PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput); |
6300
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6301
|
|
|
|
|
|
} |
6302
|
|
|
|
|
|
} |
6303
|
|
|
|
|
|
/* FALL THROUGH */ |
6304
|
|
|
|
|
|
|
6305
|
|
|
|
|
|
case CURLY_B_max_fail: |
6306
|
|
|
|
|
|
/* failed to find B in a greedy match */ |
6307
|
|
|
|
|
|
|
6308
|
63646
|
|
|
|
|
REGCP_UNWIND(ST.cp); |
6309
|
144461
|
|
|
|
|
if (ST.paren) { |
6310
|
105790
|
|
|
|
|
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); |
6311
|
|
|
|
|
|
} |
6312
|
|
|
|
|
|
/* back up. */ |
6313
|
48992
|
|
|
|
|
if (--ST.count < ST.min) |
6314
|
|
|
|
|
|
sayNO; |
6315
|
48992
|
|
|
|
|
locinput = HOPc(locinput, -1); |
6316
|
|
|
|
|
|
goto curly_try_B_max; |
6317
|
|
|
|
|
|
|
6318
|
|
|
|
|
|
#undef ST |
6319
|
|
|
|
|
|
|
6320
|
|
|
|
|
|
case END: /* last op of main pattern */ |
6321
|
|
|
|
|
|
fake_end: |
6322
|
49034
|
|
|
|
|
if (cur_eval) { |
6323
|
|
|
|
|
|
/* we've just finished A in /(??{A})B/; now continue with B */ |
6324
|
|
|
|
|
|
|
6325
|
191142
|
|
|
|
|
st->u.eval.prev_rex = rex_sv; /* inner */ |
6326
|
|
|
|
|
|
|
6327
|
|
|
|
|
|
/* Save *all* the positions. */ |
6328
|
107280
|
|
|
|
|
st->u.eval.cp = regcppush(rex, 0, maxopenparen); |
6329
|
123819
|
|
|
|
|
rex_sv = cur_eval->u.eval.prev_rex; |
6330
|
82546
|
|
|
|
|
is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); |
6331
|
45852
|
|
|
|
|
SET_reg_curpm(rex_sv); |
6332
|
6080
|
|
|
|
|
rex = ReANY(rex_sv); |
6333
|
6080
|
|
|
|
|
rexi = RXi_GET(rex); |
6334
|
69222
|
|
|
|
|
cur_curlyx = cur_eval->u.eval.prev_curlyx; |
6335
|
|
|
|
|
|
|
6336
|
46148
|
|
|
|
|
REGCP_SET(st->u.eval.lastcp); |
6337
|
|
|
|
|
|
|
6338
|
|
|
|
|
|
/* Restore parens of the outer rex without popping the |
6339
|
|
|
|
|
|
* savestack */ |
6340
|
34134
|
|
|
|
|
S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp, |
6341
|
|
|
|
|
|
&maxopenparen); |
6342
|
|
|
|
|
|
|
6343
|
31636
|
|
|
|
|
st->u.eval.prev_eval = cur_eval; |
6344
|
15334
|
|
|
|
|
cur_eval = cur_eval->u.eval.prev_eval; |
6345
|
15334
|
|
|
|
|
DEBUG_EXECUTE_r( |
6346
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n", |
6347
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval));); |
6348
|
83862
|
|
|
|
|
if ( nochange_depth ) |
6349
|
48340
|
|
|
|
|
nochange_depth--; |
6350
|
|
|
|
|
|
|
6351
|
6336
|
|
|
|
|
PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B, |
6352
|
|
|
|
|
|
locinput); /* match B */ |
6353
|
|
|
|
|
|
} |
6354
|
|
|
|
|
|
|
6355
|
93612
|
|
|
|
|
if (locinput < reginfo->till) { |
6356
|
62618
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, |
6357
|
|
|
|
|
|
"%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", |
6358
|
|
|
|
|
|
PL_colors[4], |
6359
|
|
|
|
|
|
(long)(locinput - startpos), |
6360
|
|
|
|
|
|
(long)(reginfo->till - startpos), |
6361
|
|
|
|
|
|
PL_colors[5])); |
6362
|
|
|
|
|
|
|
6363
|
|
|
|
|
|
sayNO_SILENT; /* Cannot match: too short. */ |
6364
|
|
|
|
|
|
} |
6365
|
|
|
|
|
|
sayYES; /* Success! */ |
6366
|
|
|
|
|
|
|
6367
|
|
|
|
|
|
case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */ |
6368
|
28634
|
|
|
|
|
DEBUG_EXECUTE_r( |
6369
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
6370
|
|
|
|
|
|
"%*s %ssubpattern success...%s\n", |
6371
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])); |
6372
|
|
|
|
|
|
sayYES; /* Success! */ |
6373
|
|
|
|
|
|
|
6374
|
|
|
|
|
|
#undef ST |
6375
|
|
|
|
|
|
#define ST st->u.ifmatch |
6376
|
|
|
|
|
|
|
6377
|
|
|
|
|
|
{ |
6378
|
|
|
|
|
|
char *newstart; |
6379
|
|
|
|
|
|
|
6380
|
|
|
|
|
|
case SUSPEND: /* (?>A) */ |
6381
|
43894685
|
|
|
|
|
ST.wanted = 1; |
6382
|
|
|
|
|
|
newstart = locinput; |
6383
|
1604094
|
|
|
|
|
goto do_ifmatch; |
6384
|
|
|
|
|
|
|
6385
|
|
|
|
|
|
case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?
|
6386
|
1069396
|
|
|
|
|
ST.wanted = 0; |
6387
|
894900
|
|
|
|
|
goto ifmatch_trivial_fail_test; |
6388
|
|
|
|
|
|
|
6389
|
|
|
|
|
|
case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */ |
6390
|
701202
|
|
|
|
|
ST.wanted = 1; |
6391
|
|
|
|
|
|
ifmatch_trivial_fail_test: |
6392
|
701202
|
|
|
|
|
if (scan->flags) { |
6393
|
187674251
|
|
|
|
|
char * const s = HOPBACKc(locinput, scan->flags); |
6394
|
144147756
|
|
|
|
|
if (!s) { |
6395
|
|
|
|
|
|
/* trivial fail */ |
6396
|
350
|
|
|
|
|
if (logical) { |
6397
|
|
|
|
|
|
logical = 0; |
6398
|
350
|
|
|
|
|
sw = 1 - cBOOL(ST.wanted); |
6399
|
|
|
|
|
|
} |
6400
|
862
|
|
|
|
|
else if (ST.wanted) |
6401
|
|
|
|
|
|
sayNO; |
6402
|
528
|
|
|
|
|
next = scan + ARG(scan); |
6403
|
176
|
|
|
|
|
if (next == scan) |
6404
|
|
|
|
|
|
next = NULL; |
6405
|
|
|
|
|
|
break; |
6406
|
|
|
|
|
|
} |
6407
|
|
|
|
|
|
newstart = s; |
6408
|
|
|
|
|
|
} |
6409
|
|
|
|
|
|
else |
6410
|
|
|
|
|
|
newstart = locinput; |
6411
|
|
|
|
|
|
|
6412
|
|
|
|
|
|
do_ifmatch: |
6413
|
160
|
|
|
|
|
ST.me = scan; |
6414
|
160
|
|
|
|
|
ST.logical = logical; |
6415
|
|
|
|
|
|
logical = 0; /* XXX: reset state of logical once it has been saved into ST */ |
6416
|
|
|
|
|
|
|
6417
|
|
|
|
|
|
/* execute body of (?...A) */ |
6418
|
160
|
|
|
|
|
PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart); |
6419
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6420
|
|
|
|
|
|
} |
6421
|
|
|
|
|
|
|
6422
|
|
|
|
|
|
case IFMATCH_A_fail: /* body of (?...A) failed */ |
6423
|
47524475
|
|
|
|
|
ST.wanted = !ST.wanted; |
6424
|
|
|
|
|
|
/* FALL THROUGH */ |
6425
|
|
|
|
|
|
|
6426
|
|
|
|
|
|
case IFMATCH_A: /* body of (?...A) succeeded */ |
6427
|
52829331
|
|
|
|
|
if (ST.logical) { |
6428
|
26968780
|
|
|
|
|
sw = cBOOL(ST.wanted); |
6429
|
|
|
|
|
|
} |
6430
|
112680115
|
|
|
|
|
else if (!ST.wanted) |
6431
|
|
|
|
|
|
sayNO; |
6432
|
|
|
|
|
|
|
6433
|
59850784
|
|
|
|
|
if (OP(ST.me) != SUSPEND) { |
6434
|
|
|
|
|
|
/* restore old position except for (?>...) */ |
6435
|
1780822
|
|
|
|
|
locinput = st->locinput; |
6436
|
|
|
|
|
|
} |
6437
|
1780136
|
|
|
|
|
scan = ST.me + ARG(ST.me); |
6438
|
13635316
|
|
|
|
|
if (scan == ST.me) |
6439
|
|
|
|
|
|
scan = NULL; |
6440
|
11855446
|
|
|
|
|
continue; /* execute B */ |
6441
|
|
|
|
|
|
|
6442
|
|
|
|
|
|
#undef ST |
6443
|
|
|
|
|
|
|
6444
|
|
|
|
|
|
case LONGJMP: /* alternative with many branches compiles to |
6445
|
|
|
|
|
|
* (BRANCHJ; EXACT ...; LONGJMP ) x N */ |
6446
|
2542
|
|
|
|
|
next = scan + ARG(scan); |
6447
|
2276
|
|
|
|
|
if (next == scan) |
6448
|
|
|
|
|
|
next = NULL; |
6449
|
|
|
|
|
|
break; |
6450
|
|
|
|
|
|
|
6451
|
|
|
|
|
|
case COMMIT: /* (*COMMIT) */ |
6452
|
1933
|
|
|
|
|
reginfo->cutpoint = reginfo->strend; |
6453
|
|
|
|
|
|
/* FALLTHROUGH */ |
6454
|
|
|
|
|
|
|
6455
|
|
|
|
|
|
case PRUNE: /* (*PRUNE) */ |
6456
|
2276
|
|
|
|
|
if (!scan->flags) |
6457
|
2276
|
|
|
|
|
sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); |
6458
|
47030
|
|
|
|
|
PUSH_STATE_GOTO(COMMIT_next, next, locinput); |
6459
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6460
|
|
|
|
|
|
|
6461
|
|
|
|
|
|
case COMMIT_next_fail: |
6462
|
|
|
|
|
|
no_final = 1; |
6463
|
|
|
|
|
|
/* FALLTHROUGH */ |
6464
|
|
|
|
|
|
|
6465
|
|
|
|
|
|
case OPFAIL: /* (*FAIL) */ |
6466
|
|
|
|
|
|
sayNO; |
6467
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6468
|
|
|
|
|
|
|
6469
|
|
|
|
|
|
#define ST st->u.mark |
6470
|
|
|
|
|
|
case MARKPOINT: /* (*MARK:foo) */ |
6471
|
99707
|
|
|
|
|
ST.prev_mark = mark_state; |
6472
|
64374
|
|
|
|
|
ST.mark_name = sv_commit = sv_yes_mark |
6473
|
20912
|
|
|
|
|
= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); |
6474
|
|
|
|
|
|
mark_state = st; |
6475
|
106510
|
|
|
|
|
ST.mark_loc = locinput; |
6476
|
106510
|
|
|
|
|
PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput); |
6477
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6478
|
|
|
|
|
|
|
6479
|
|
|
|
|
|
case MARKPOINT_next: |
6480
|
10558
|
|
|
|
|
mark_state = ST.prev_mark; |
6481
|
9542
|
|
|
|
|
sayYES; |
6482
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6483
|
|
|
|
|
|
|
6484
|
|
|
|
|
|
case MARKPOINT_next_fail: |
6485
|
9264
|
|
|
|
|
if (popmark && sv_eq(ST.mark_name,popmark)) |
6486
|
|
|
|
|
|
{ |
6487
|
7360
|
|
|
|
|
if (ST.mark_loc > startpoint) |
6488
|
278
|
|
|
|
|
reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); |
6489
|
|
|
|
|
|
popmark = NULL; /* we found our mark */ |
6490
|
320
|
|
|
|
|
sv_commit = ST.mark_name; |
6491
|
|
|
|
|
|
|
6492
|
240
|
|
|
|
|
DEBUG_EXECUTE_r({ |
6493
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
6494
|
|
|
|
|
|
"%*s %ssetting cutpoint to mark:%"SVf"...%s\n", |
6495
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", |
6496
|
|
|
|
|
|
PL_colors[4], SVfARG(sv_commit), PL_colors[5]); |
6497
|
|
|
|
|
|
}); |
6498
|
|
|
|
|
|
} |
6499
|
160
|
|
|
|
|
mark_state = ST.prev_mark; |
6500
|
|
|
|
|
|
sv_yes_mark = mark_state ? |
6501
|
7520
|
|
|
|
|
mark_state->u.mark.mark_name : NULL; |
6502
|
103472
|
|
|
|
|
sayNO; |
6503
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6504
|
|
|
|
|
|
|
6505
|
|
|
|
|
|
case SKIP: /* (*SKIP) */ |
6506
|
213654
|
|
|
|
|
if (scan->flags) { |
6507
|
|
|
|
|
|
/* (*SKIP) : if we fail we cut here*/ |
6508
|
142436
|
|
|
|
|
ST.mark_name = NULL; |
6509
|
139348
|
|
|
|
|
ST.mark_loc = locinput; |
6510
|
39332
|
|
|
|
|
PUSH_STATE_GOTO(SKIP_next,next, locinput); |
6511
|
|
|
|
|
|
} else { |
6512
|
|
|
|
|
|
/* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, |
6513
|
|
|
|
|
|
otherwise do nothing. Meaning we need to scan |
6514
|
|
|
|
|
|
*/ |
6515
|
|
|
|
|
|
regmatch_state *cur = mark_state; |
6516
|
39332
|
|
|
|
|
SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); |
6517
|
|
|
|
|
|
|
6518
|
588
|
|
|
|
|
while (cur) { |
6519
|
392
|
|
|
|
|
if ( sv_eq( cur->u.mark.mark_name, |
6520
|
|
|
|
|
|
find ) ) |
6521
|
|
|
|
|
|
{ |
6522
|
316
|
|
|
|
|
ST.mark_name = find; |
6523
|
244
|
|
|
|
|
PUSH_STATE_GOTO( SKIP_next, next, locinput); |
6524
|
|
|
|
|
|
} |
6525
|
244
|
|
|
|
|
cur = cur->u.mark.prev_mark; |
6526
|
|
|
|
|
|
} |
6527
|
|
|
|
|
|
} |
6528
|
|
|
|
|
|
/* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ |
6529
|
|
|
|
|
|
break; |
6530
|
|
|
|
|
|
|
6531
|
|
|
|
|
|
case SKIP_next_fail: |
6532
|
720
|
|
|
|
|
if (ST.mark_name) { |
6533
|
|
|
|
|
|
/* (*CUT:NAME) - Set up to search for the name as we |
6534
|
|
|
|
|
|
collapse the stack*/ |
6535
|
480
|
|
|
|
|
popmark = ST.mark_name; |
6536
|
|
|
|
|
|
} else { |
6537
|
|
|
|
|
|
/* (*CUT) - No name, we cut here.*/ |
6538
|
456
|
|
|
|
|
if (ST.mark_loc > startpoint) |
6539
|
432
|
|
|
|
|
reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); |
6540
|
|
|
|
|
|
/* but we set sv_commit to latest mark_name if there |
6541
|
|
|
|
|
|
is one so they can test to see how things lead to this |
6542
|
|
|
|
|
|
cut */ |
6543
|
432
|
|
|
|
|
if (mark_state) |
6544
|
876
|
|
|
|
|
sv_commit=mark_state->u.mark.mark_name; |
6545
|
|
|
|
|
|
} |
6546
|
|
|
|
|
|
no_final = 1; |
6547
|
|
|
|
|
|
sayNO; |
6548
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6549
|
|
|
|
|
|
#undef ST |
6550
|
|
|
|
|
|
|
6551
|
|
|
|
|
|
case LNBREAK: /* \R */ |
6552
|
584
|
|
|
|
|
if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) { |
6553
|
556
|
|
|
|
|
locinput += n; |
6554
|
|
|
|
|
|
} else |
6555
|
|
|
|
|
|
sayNO; |
6556
|
460
|
|
|
|
|
break; |
6557
|
|
|
|
|
|
|
6558
|
|
|
|
|
|
default: |
6559
|
460
|
|
|
|
|
PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", |
6560
|
648
|
|
|
|
|
PTR2UV(scan), OP(scan)); |
6561
|
432
|
|
|
|
|
Perl_croak(aTHX_ "regexp memory corruption"); |
6562
|
|
|
|
|
|
|
6563
|
|
|
|
|
|
/* this is a point to jump to in order to increment |
6564
|
|
|
|
|
|
* locinput by one character */ |
6565
|
|
|
|
|
|
increment_locinput: |
6566
|
432
|
|
|
|
|
assert(!NEXTCHR_IS_EOS); |
6567
|
384
|
|
|
|
|
if (utf8_target) { |
6568
|
384
|
|
|
|
|
locinput += PL_utf8skip[nextchr]; |
6569
|
|
|
|
|
|
/* locinput is allowed to go 1 char off the end, but not 2+ */ |
6570
|
0
|
|
|
|
|
if (locinput > reginfo->strend) |
6571
|
|
|
|
|
|
sayNO; |
6572
|
|
|
|
|
|
} |
6573
|
|
|
|
|
|
else |
6574
|
118
|
|
|
|
|
locinput++; |
6575
|
|
|
|
|
|
break; |
6576
|
|
|
|
|
|
|
6577
|
|
|
|
|
|
} /* end switch */ |
6578
|
|
|
|
|
|
|
6579
|
|
|
|
|
|
/* switch break jumps here */ |
6580
|
|
|
|
|
|
scan = next; /* prepare to execute the next op and ... */ |
6581
|
116
|
|
|
|
|
continue; /* ... jump back to the top, reusing st */ |
6582
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6583
|
|
|
|
|
|
|
6584
|
|
|
|
|
|
push_yes_state: |
6585
|
|
|
|
|
|
/* push a state that backtracks on success */ |
6586
|
66
|
|
|
|
|
st->u.yes.prev_yes_state = yes_state; |
6587
|
|
|
|
|
|
yes_state = st; |
6588
|
|
|
|
|
|
/* FALL THROUGH */ |
6589
|
|
|
|
|
|
push_state: |
6590
|
|
|
|
|
|
/* push a new regex state, then continue at scan */ |
6591
|
|
|
|
|
|
{ |
6592
|
|
|
|
|
|
regmatch_state *newst; |
6593
|
|
|
|
|
|
|
6594
|
310
|
|
|
|
|
DEBUG_STACK_r({ |
6595
|
|
|
|
|
|
regmatch_state *cur = st; |
6596
|
|
|
|
|
|
regmatch_state *curyes = yes_state; |
6597
|
|
|
|
|
|
int curd = depth; |
6598
|
|
|
|
|
|
regmatch_slab *slab = PL_regmatch_slab; |
6599
|
|
|
|
|
|
for (;curd > -1;cur--,curd--) { |
6600
|
|
|
|
|
|
if (cur < SLAB_FIRST(slab)) { |
6601
|
|
|
|
|
|
slab = slab->prev; |
6602
|
|
|
|
|
|
cur = SLAB_LAST(slab); |
6603
|
|
|
|
|
|
} |
6604
|
|
|
|
|
|
PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n", |
6605
|
|
|
|
|
|
REPORT_CODE_OFF + 2 + depth * 2,"", |
6606
|
|
|
|
|
|
curd, PL_reg_name[cur->resume_state], |
6607
|
|
|
|
|
|
(curyes == cur) ? "yes" : "" |
6608
|
|
|
|
|
|
); |
6609
|
|
|
|
|
|
if (curyes == cur) |
6610
|
|
|
|
|
|
curyes = cur->u.yes.prev_yes_state; |
6611
|
|
|
|
|
|
} |
6612
|
|
|
|
|
|
} else |
6613
|
|
|
|
|
|
DEBUG_STATE_pp("push") |
6614
|
|
|
|
|
|
); |
6615
|
208
|
|
|
|
|
depth++; |
6616
|
192
|
|
|
|
|
st->locinput = locinput; |
6617
|
192
|
|
|
|
|
newst = st+1; |
6618
|
332
|
|
|
|
|
if (newst > SLAB_LAST(PL_regmatch_slab)) |
6619
|
840
|
|
|
|
|
newst = S_push_slab(aTHX); |
6620
|
392
|
|
|
|
|
PL_regmatch_state = newst; |
6621
|
|
|
|
|
|
|
6622
|
|
|
|
|
|
locinput = pushinput; |
6623
|
|
|
|
|
|
st = newst; |
6624
|
272
|
|
|
|
|
continue; |
6625
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
6626
|
|
|
|
|
|
} |
6627
|
|
|
|
|
|
} |
6628
|
|
|
|
|
|
|
6629
|
|
|
|
|
|
/* |
6630
|
|
|
|
|
|
* We get here only if there's trouble -- normally "case END" is |
6631
|
|
|
|
|
|
* the terminating point. |
6632
|
|
|
|
|
|
*/ |
6633
|
272
|
|
|
|
|
Perl_croak(aTHX_ "corrupted regexp pointers"); |
6634
|
|
|
|
|
|
/*NOTREACHED*/ |
6635
|
|
|
|
|
|
sayNO; |
6636
|
|
|
|
|
|
|
6637
|
|
|
|
|
|
yes: |
6638
|
198
|
|
|
|
|
if (yes_state) { |
6639
|
|
|
|
|
|
/* we have successfully completed a subexpression, but we must now |
6640
|
|
|
|
|
|
* pop to the state marked by yes_state and continue from there */ |
6641
|
494
|
|
|
|
|
assert(st != yes_state); |
6642
|
|
|
|
|
|
#ifdef DEBUGGING |
6643
|
260
|
|
|
|
|
while (st != yes_state) { |
6644
|
260
|
|
|
|
|
st--; |
6645
|
0
|
|
|
|
|
if (st < SLAB_FIRST(PL_regmatch_slab)) { |
6646
|
191762654
|
|
|
|
|
PL_regmatch_slab = PL_regmatch_slab->prev; |
6647
|
191486484
|
|
|
|
|
st = SLAB_LAST(PL_regmatch_slab); |
6648
|
|
|
|
|
|
} |
6649
|
191762654
|
|
|
|
|
DEBUG_STATE_r({ |
6650
|
|
|
|
|
|
if (no_final) { |
6651
|
|
|
|
|
|
DEBUG_STATE_pp("pop (no final)"); |
6652
|
|
|
|
|
|
} else { |
6653
|
|
|
|
|
|
DEBUG_STATE_pp("pop (yes)"); |
6654
|
|
|
|
|
|
} |
6655
|
|
|
|
|
|
}); |
6656
|
191762654
|
|
|
|
|
depth--; |
6657
|
|
|
|
|
|
} |
6658
|
|
|
|
|
|
#else |
6659
|
|
|
|
|
|
while (yes_state < SLAB_FIRST(PL_regmatch_slab) |
6660
|
|
|
|
|
|
|| yes_state > SLAB_LAST(PL_regmatch_slab)) |
6661
|
|
|
|
|
|
{ |
6662
|
|
|
|
|
|
/* not in this slab, pop slab */ |
6663
|
|
|
|
|
|
depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); |
6664
|
|
|
|
|
|
PL_regmatch_slab = PL_regmatch_slab->prev; |
6665
|
|
|
|
|
|
st = SLAB_LAST(PL_regmatch_slab); |
6666
|
|
|
|
|
|
} |
6667
|
|
|
|
|
|
depth -= (st - yes_state); |
6668
|
|
|
|
|
|
#endif |
6669
|
|
|
|
|
|
st = yes_state; |
6670
|
0
|
|
|
|
|
yes_state = st->u.yes.prev_yes_state; |
6671
|
0
|
|
|
|
|
PL_regmatch_state = st; |
6672
|
|
|
|
|
|
|
6673
|
0
|
|
|
|
|
if (no_final) |
6674
|
0
|
|
|
|
|
locinput= st->locinput; |
6675
|
961082
|
|
|
|
|
state_num = st->resume_state + no_final; |
6676
|
961082
|
|
|
|
|
goto reenter_switch; |
6677
|
|
|
|
|
|
} |
6678
|
|
|
|
|
|
|
6679
|
961124
|
|
|
|
|
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", |
6680
|
|
|
|
|
|
PL_colors[4], PL_colors[5])); |
6681
|
|
|
|
|
|
|
6682
|
961124
|
|
|
|
|
if (reginfo->info_aux_eval) { |
6683
|
|
|
|
|
|
/* each successfully executed (?{...}) block does the equivalent of |
6684
|
|
|
|
|
|
* local $^R = do {...} |
6685
|
|
|
|
|
|
* When popping the save stack, all these locals would be undone; |
6686
|
|
|
|
|
|
* bypass this by setting the outermost saved $^R to the latest |
6687
|
|
|
|
|
|
* value */ |
6688
|
961118
|
|
|
|
|
if (oreplsv != GvSV(PL_replgv)) |
6689
|
961118
|
|
|
|
|
sv_setsv(oreplsv, GvSV(PL_replgv)); |
6690
|
|
|
|
|
|
} |
6691
|
|
|
|
|
|
result = 1; |
6692
|
|
|
|
|
|
goto final_exit; |
6693
|
|
|
|
|
|
|
6694
|
|
|
|
|
|
no: |
6695
|
961082
|
|
|
|
|
DEBUG_EXECUTE_r( |
6696
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
6697
|
|
|
|
|
|
"%*s %sfailed...%s\n", |
6698
|
|
|
|
|
|
REPORT_CODE_OFF+depth*2, "", |
6699
|
|
|
|
|
|
PL_colors[4], PL_colors[5]) |
6700
|
|
|
|
|
|
); |
6701
|
|
|
|
|
|
|
6702
|
|
|
|
|
|
no_silent: |
6703
|
961082
|
|
|
|
|
if (no_final) { |
6704
|
961082
|
|
|
|
|
if (yes_state) { |
6705
|
|
|
|
|
|
goto yes; |
6706
|
|
|
|
|
|
} else { |
6707
|
|
|
|
|
|
goto final_exit; |
6708
|
|
|
|
|
|
} |
6709
|
|
|
|
|
|
} |
6710
|
961082
|
|
|
|
|
if (depth) { |
6711
|
|
|
|
|
|
/* there's a previous state to backtrack to */ |
6712
|
961082
|
|
|
|
|
st--; |
6713
|
961082
|
|
|
|
|
if (st < SLAB_FIRST(PL_regmatch_slab)) { |
6714
|
669000
|
|
|
|
|
PL_regmatch_slab = PL_regmatch_slab->prev; |
6715
|
669000
|
|
|
|
|
st = SLAB_LAST(PL_regmatch_slab); |
6716
|
|
|
|
|
|
} |
6717
|
54
|
|
|
|
|
PL_regmatch_state = st; |
6718
|
961082
|
|
|
|
|
locinput= st->locinput; |
6719
|
|
|
|
|
|
|
6720
|
530768
|
|
|
|
|
DEBUG_STATE_pp("pop"); |
6721
|
430314
|
|
|
|
|
depth--; |
6722
|
430314
|
|
|
|
|
if (yes_state == st) |
6723
|
405188
|
|
|
|
|
yes_state = st->u.yes.prev_yes_state; |
6724
|
|
|
|
|
|
|
6725
|
935956
|
|
|
|
|
state_num = st->resume_state + 1; /* failure = success + 1 */ |
6726
|
0
|
|
|
|
|
goto reenter_switch; |
6727
|
|
|
|
|
|
} |
6728
|
|
|
|
|
|
result = 0; |
6729
|
|
|
|
|
|
|
6730
|
|
|
|
|
|
final_exit: |
6731
|
42
|
|
|
|
|
if (rex->intflags & PREGf_VERBARG_SEEN) { |
6732
|
0
|
|
|
|
|
SV *sv_err = get_sv("REGERROR", 1); |
6733
|
0
|
|
|
|
|
SV *sv_mrk = get_sv("REGMARK", 1); |
6734
|
0
|
|
|
|
|
if (result) { |
6735
|
|
|
|
|
|
sv_commit = &PL_sv_no; |
6736
|
0
|
|
|
|
|
if (!sv_yes_mark) |
6737
|
|
|
|
|
|
sv_yes_mark = &PL_sv_yes; |
6738
|
|
|
|
|
|
} else { |
6739
|
0
|
|
|
|
|
if (!sv_commit) |
6740
|
|
|
|
|
|
sv_commit = &PL_sv_yes; |
6741
|
|
|
|
|
|
sv_yes_mark = &PL_sv_no; |
6742
|
|
|
|
|
|
} |
6743
|
935956
|
|
|
|
|
sv_setsv(sv_err, sv_commit); |
6744
|
124854668
|
|
|
|
|
sv_setsv(sv_mrk, sv_yes_mark); |
6745
|
|
|
|
|
|
} |
6746
|
|
|
|
|
|
|
6747
|
|
|
|
|
|
|
6748
|
124854710
|
|
|
|
|
if (last_pushed_cv) { |
6749
|
|
|
|
|
|
dSP; |
6750
|
124854740
|
|
|
|
|
POP_MULTICALL; |
6751
|
|
|
|
|
|
PERL_UNUSED_VAR(SP); |
6752
|
|
|
|
|
|
} |
6753
|
|
|
|
|
|
|
6754
|
124854710
|
|
|
|
|
assert(!result || locinput - reginfo->strbeg >= 0); |
6755
|
1398610
|
|
|
|
|
return result ? locinput - reginfo->strbeg : -1; |
6756
|
|
|
|
|
|
} |
6757
|
|
|
|
|
|
|
6758
|
|
|
|
|
|
/* |
6759
|
|
|
|
|
|
- regrepeat - repeatedly match something simple, report how many |
6760
|
|
|
|
|
|
* |
6761
|
|
|
|
|
|
* What 'simple' means is a node which can be the operand of a quantifier like |
6762
|
|
|
|
|
|
* '+', or {1,3} |
6763
|
|
|
|
|
|
* |
6764
|
|
|
|
|
|
* startposp - pointer a pointer to the start position. This is updated |
6765
|
|
|
|
|
|
* to point to the byte following the highest successful |
6766
|
|
|
|
|
|
* match. |
6767
|
|
|
|
|
|
* p - the regnode to be repeatedly matched against. |
6768
|
|
|
|
|
|
* reginfo - struct holding match state, such as strend |
6769
|
|
|
|
|
|
* max - maximum number of things to match. |
6770
|
|
|
|
|
|
* depth - (for debugging) backtracking depth. |
6771
|
|
|
|
|
|
*/ |
6772
|
|
|
|
|
|
STATIC I32 |
6773
|
1398568
|
|
|
|
|
S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, |
6774
|
|
|
|
|
|
regmatch_info *const reginfo, I32 max, int depth) |
6775
|
|
|
|
|
|
{ |
6776
|
|
|
|
|
|
dVAR; |
6777
|
|
|
|
|
|
char *scan; /* Pointer to current position in target string */ |
6778
|
|
|
|
|
|
I32 c; |
6779
|
1398568
|
|
|
|
|
char *loceol = reginfo->strend; /* local version */ |
6780
|
|
|
|
|
|
I32 hardcount = 0; /* How many matches so far */ |
6781
|
4
|
|
|
|
|
bool utf8_target = reginfo->is_utf8_target; |
6782
|
|
|
|
|
|
int to_complement = 0; /* Invert the result? */ |
6783
|
|
|
|
|
|
UV utf8_flags; |
6784
|
|
|
|
|
|
_char_class_number classnum; |
6785
|
|
|
|
|
|
#ifndef DEBUGGING |
6786
|
|
|
|
|
|
PERL_UNUSED_ARG(depth); |
6787
|
|
|
|
|
|
#endif |
6788
|
|
|
|
|
|
|
6789
|
124854664
|
|
|
|
|
PERL_ARGS_ASSERT_REGREPEAT; |
6790
|
|
|
|
|
|
|
6791
|
123606588
|
|
|
|
|
scan = *startposp; |
6792
|
27134164
|
|
|
|
|
if (max == REG_INFTY) |
6793
|
|
|
|
|
|
max = I32_MAX; |
6794
|
1278132
|
|
|
|
|
else if (! utf8_target && loceol - scan > max) |
6795
|
1277930
|
|
|
|
|
loceol = scan + max; |
6796
|
|
|
|
|
|
|
6797
|
|
|
|
|
|
/* Here, for the case of a non-UTF-8 target we have adjusted down |
6798
|
|
|
|
|
|
* to the maximum of how far we should go in it (leaving it set to the real |
6799
|
|
|
|
|
|
* end, if the maximum permissible would take us beyond that). This allows |
6800
|
|
|
|
|
|
* us to make the loop exit condition that we haven't gone past to |
6801
|
|
|
|
|
|
* also mean that we haven't exceeded the max permissible count, saving a |
6802
|
|
|
|
|
|
* test each time through the loop. But it assumes that the OP matches a |
6803
|
|
|
|
|
|
* single byte, which is true for most of the OPs below when applied to a |
6804
|
|
|
|
|
|
* non-UTF-8 target. Those relatively few OPs that don't have this |
6805
|
|
|
|
|
|
* characteristic will have to compensate. |
6806
|
|
|
|
|
|
* |
6807
|
|
|
|
|
|
* There is no adjustment for UTF-8 targets, as the number of bytes per |
6808
|
|
|
|
|
|
* character varies. OPs will have to test both that the count is less |
6809
|
|
|
|
|
|
* than the max permissible (using to keep track), and that we |
6810
|
|
|
|
|
|
* are still within the bounds of the string (using . A few OPs |
6811
|
|
|
|
|
|
* match a single byte no matter what the encoding. They can omit the max |
6812
|
|
|
|
|
|
* test if, for the UTF-8 case, they do the adjustment that was skipped |
6813
|
|
|
|
|
|
* above. |
6814
|
|
|
|
|
|
* |
6815
|
|
|
|
|
|
* Thus, the code above sets things up for the common case; and exceptional |
6816
|
|
|
|
|
|
* cases need extra work; the common case is to make sure doesn't |
6817
|
|
|
|
|
|
* go past , and for UTF-8 to also use to make sure the |
6818
|
|
|
|
|
|
* count doesn't exceed the maximum permissible */ |
6819
|
|
|
|
|
|
|
6820
|
27133826
|
|
|
|
|
switch (OP(p)) { |
6821
|
|
|
|
|
|
case REG_ANY: |
6822
|
141758
|
|
|
|
|
if (utf8_target) { |
6823
|
141758
|
|
|
|
|
while (scan < loceol && hardcount < max && *scan != '\n') { |
6824
|
139442
|
|
|
|
|
scan += UTF8SKIP(scan); |
6825
|
128830
|
|
|
|
|
hardcount++; |
6826
|
|
|
|
|
|
} |
6827
|
|
|
|
|
|
} else { |
6828
|
28610
|
|
|
|
|
while (scan < loceol && *scan != '\n') |
6829
|
28378
|
|
|
|
|
scan++; |
6830
|
|
|
|
|
|
} |
6831
|
|
|
|
|
|
break; |
6832
|
|
|
|
|
|
case SANY: |
6833
|
1790
|
|
|
|
|
if (utf8_target) { |
6834
|
26820
|
|
|
|
|
while (scan < loceol && hardcount < max) { |
6835
|
26820
|
|
|
|
|
scan += UTF8SKIP(scan); |
6836
|
124854664
|
|
|
|
|
hardcount++; |
6837
|
|
|
|
|
|
} |
6838
|
|
|
|
|
|
} |
6839
|
|
|
|
|
|
else |
6840
|
|
|
|
|
|
scan = loceol; |
6841
|
|
|
|
|
|
break; |
6842
|
|
|
|
|
|
case CANY: /* Move forward bytes, unless goes off end */ |
6843
|
28367416
|
|
|
|
|
if (utf8_target && loceol - scan > max) { |
6844
|
|
|
|
|
|
|
6845
|
|
|
|
|
|
/* hadn't been adjusted in the UTF-8 case */ |
6846
|
28182320
|
|
|
|
|
scan += max; |
6847
|
|
|
|
|
|
} |
6848
|
|
|
|
|
|
else { |
6849
|
|
|
|
|
|
scan = loceol; |
6850
|
|
|
|
|
|
} |
6851
|
|
|
|
|
|
break; |
6852
|
|
|
|
|
|
case EXACT: |
6853
|
14898562
|
|
|
|
|
assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); |
6854
|
|
|
|
|
|
|
6855
|
14873336
|
|
|
|
|
c = (U8)*STRING(p); |
6856
|
|
|
|
|
|
|
6857
|
|
|
|
|
|
/* Can use a simple loop if the pattern char to match on is invariant |
6858
|
|
|
|
|
|
* under UTF-8, or both target and pattern aren't UTF-8. Note that we |
6859
|
|
|
|
|
|
* can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's |
6860
|
|
|
|
|
|
* true iff it doesn't matter if the argument is in UTF-8 or not */ |
6861
|
988418
|
|
|
|
|
if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) { |
6862
|
371176
|
|
|
|
|
if (utf8_target && loceol - scan > max) { |
6863
|
|
|
|
|
|
/* We didn't adjust because is UTF-8, but ok to do so, |
6864
|
|
|
|
|
|
* since here, to match at all, 1 char == 1 byte */ |
6865
|
52562
|
|
|
|
|
loceol = scan + max; |
6866
|
|
|
|
|
|
} |
6867
|
961082
|
|
|
|
|
while (scan < loceol && UCHARAT(scan) == c) { |
6868
|
935956
|
|
|
|
|
scan++; |
6869
|
|
|
|
|
|
} |
6870
|
|
|
|
|
|
} |
6871
|
935956
|
|
|
|
|
else if (reginfo->is_utf8_pat) { |
6872
|
100
|
|
|
|
|
if (utf8_target) { |
6873
|
|
|
|
|
|
STRLEN scan_char_len; |
6874
|
|
|
|
|
|
|
6875
|
|
|
|
|
|
/* When both target and pattern are UTF-8, we have to do |
6876
|
|
|
|
|
|
* string EQ */ |
6877
|
100
|
|
|
|
|
while (hardcount < max |
6878
|
935956
|
|
|
|
|
&& scan < loceol |
6879
|
935956
|
|
|
|
|
&& (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p) |
6880
|
28342290
|
|
|
|
|
&& memEQ(scan, STRING(p), scan_char_len)) |
6881
|
|
|
|
|
|
{ |
6882
|
936
|
|
|
|
|
scan += scan_char_len; |
6883
|
358
|
|
|
|
|
hardcount++; |
6884
|
|
|
|
|
|
} |
6885
|
|
|
|
|
|
} |
6886
|
178
|
|
|
|
|
else if (! UTF8_IS_ABOVE_LATIN1(c)) { |
6887
|
|
|
|
|
|
|
6888
|
|
|
|
|
|
/* Target isn't utf8; convert the character in the UTF-8 |
6889
|
|
|
|
|
|
* pattern to non-UTF8, and do a simple loop */ |
6890
|
124829538
|
|
|
|
|
c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); |
6891
|
7571424
|
|
|
|
|
while (scan < loceol && UCHARAT(scan) == c) { |
6892
|
7571424
|
|
|
|
|
scan++; |
6893
|
|
|
|
|
|
} |
6894
|
|
|
|
|
|
} /* else pattern char is above Latin1, can't possibly match the |
6895
|
|
|
|
|
|
non-UTF-8 target */ |
6896
|
|
|
|
|
|
} |
6897
|
|
|
|
|
|
else { |
6898
|
|
|
|
|
|
|
6899
|
|
|
|
|
|
/* Here, the string must be utf8; pattern isn't, and is |
6900
|
|
|
|
|
|
* different in utf8 than not, so can't compare them directly. |
6901
|
|
|
|
|
|
* Outside the loop, find the two utf8 bytes that represent c, and |
6902
|
|
|
|
|
|
* then look for those in sequence in the utf8 string */ |
6903
|
6890156
|
|
|
|
|
U8 high = UTF8_TWO_BYTE_HI(c); |
6904
|
1136136
|
|
|
|
|
U8 low = UTF8_TWO_BYTE_LO(c); |
6905
|
|
|
|
|
|
|
6906
|
4899172
|
|
|
|
|
while (hardcount < max |
6907
|
3081768
|
|
|
|
|
&& scan + 1 < loceol |
6908
|
3081768
|
|
|
|
|
&& UCHARAT(scan) == high |
6909
|
3568440
|
|
|
|
|
&& UCHARAT(scan + 1) == low) |
6910
|
|
|
|
|
|
{ |
6911
|
1470242
|
|
|
|
|
scan += 2; |
6912
|
7571424
|
|
|
|
|
hardcount++; |
6913
|
|
|
|
|
|
} |
6914
|
|
|
|
|
|
} |
6915
|
|
|
|
|
|
break; |
6916
|
|
|
|
|
|
|
6917
|
|
|
|
|
|
case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ |
6918
|
127394
|
|
|
|
|
assert(! reginfo->is_utf8_pat); |
6919
|
|
|
|
|
|
/* FALL THROUGH */ |
6920
|
|
|
|
|
|
case EXACTFA: |
6921
|
|
|
|
|
|
utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; |
6922
|
|
|
|
|
|
goto do_exactf; |
6923
|
|
|
|
|
|
|
6924
|
|
|
|
|
|
case EXACTFL: |
6925
|
127394
|
|
|
|
|
RXp_MATCH_TAINTED_on(prog); |
6926
|
|
|
|
|
|
utf8_flags = FOLDEQ_UTF8_LOCALE; |
6927
|
43614
|
|
|
|
|
goto do_exactf; |
6928
|
|
|
|
|
|
|
6929
|
|
|
|
|
|
case EXACTF: /* This node only generated for non-utf8 patterns */ |
6930
|
0
|
|
|
|
|
assert(! reginfo->is_utf8_pat); |
6931
|
|
|
|
|
|
utf8_flags = 0; |
6932
|
|
|
|
|
|
goto do_exactf; |
6933
|
|
|
|
|
|
|
6934
|
|
|
|
|
|
case EXACTFU_SS: |
6935
|
|
|
|
|
|
case EXACTFU: |
6936
|
43614
|
|
|
|
|
utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; |
6937
|
|
|
|
|
|
|
6938
|
|
|
|
|
|
do_exactf: { |
6939
|
|
|
|
|
|
int c1, c2; |
6940
|
|
|
|
|
|
U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; |
6941
|
|
|
|
|
|
|
6942
|
222158
|
|
|
|
|
assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); |
6943
|
|
|
|
|
|
|
6944
|
138378
|
|
|
|
|
if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, |
6945
|
|
|
|
|
|
reginfo)) |
6946
|
|
|
|
|
|
{ |
6947
|
138378
|
|
|
|
|
if (c1 == CHRTEST_VOID) { |
6948
|
|
|
|
|
|
/* Use full Unicode fold matching */ |
6949
|
219634
|
|
|
|
|
char *tmpeol = reginfo->strend; |
6950
|
109416
|
|
|
|
|
STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1; |
6951
|
83780
|
|
|
|
|
while (hardcount < max |
6952
|
116865
|
|
|
|
|
&& foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, |
6953
|
|
|
|
|
|
STRING(p), NULL, pat_len, |
6954
|
|
|
|
|
|
reginfo->is_utf8_pat, utf8_flags)) |
6955
|
|
|
|
|
|
{ |
6956
|
111774
|
|
|
|
|
scan = tmpeol; |
6957
|
111774
|
|
|
|
|
tmpeol = reginfo->strend; |
6958
|
111774
|
|
|
|
|
hardcount++; |
6959
|
|
|
|
|
|
} |
6960
|
|
|
|
|
|
} |
6961
|
111774
|
|
|
|
|
else if (utf8_target) { |
6962
|
111774
|
|
|
|
|
if (c1 == c2) { |
6963
|
111774
|
|
|
|
|
while (scan < loceol |
6964
|
111334
|
|
|
|
|
&& hardcount < max |
6965
|
222668
|
|
|
|
|
&& memEQ(scan, c1_utf8, UTF8SKIP(scan))) |
6966
|
|
|
|
|
|
{ |
6967
|
111774
|
|
|
|
|
scan += UTF8SKIP(scan); |
6968
|
107096
|
|
|
|
|
hardcount++; |
6969
|
|
|
|
|
|
} |
6970
|
|
|
|
|
|
} |
6971
|
|
|
|
|
|
else { |
6972
|
107096
|
|
|
|
|
while (scan < loceol |
6973
|
111774
|
|
|
|
|
&& hardcount < max |
6974
|
111774
|
|
|
|
|
&& (memEQ(scan, c1_utf8, UTF8SKIP(scan)) |
6975
|
111774
|
|
|
|
|
|| memEQ(scan, c2_utf8, UTF8SKIP(scan)))) |
6976
|
|
|
|
|
|
{ |
6977
|
0
|
|
|
|
|
scan += UTF8SKIP(scan); |
6978
|
111774
|
|
|
|
|
hardcount++; |
6979
|
|
|
|
|
|
} |
6980
|
|
|
|
|
|
} |
6981
|
|
|
|
|
|
} |
6982
|
4622
|
|
|
|
|
else if (c1 == c2) { |
6983
|
223548
|
|
|
|
|
while (scan < loceol && UCHARAT(scan) == c1) { |
6984
|
111774
|
|
|
|
|
scan++; |
6985
|
|
|
|
|
|
} |
6986
|
|
|
|
|
|
} |
6987
|
|
|
|
|
|
else { |
6988
|
111774
|
|
|
|
|
while (scan < loceol && |
6989
|
111774
|
|
|
|
|
(UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) |
6990
|
|
|
|
|
|
{ |
6991
|
30
|
|
|
|
|
scan++; |
6992
|
|
|
|
|
|
} |
6993
|
|
|
|
|
|
} |
6994
|
|
|
|
|
|
} |
6995
|
|
|
|
|
|
break; |
6996
|
|
|
|
|
|
} |
6997
|
|
|
|
|
|
case ANYOF: |
6998
|
|
|
|
|
|
case ANYOF_WARN_SUPER: |
6999
|
30
|
|
|
|
|
if (utf8_target) { |
7000
|
30
|
|
|
|
|
while (hardcount < max |
7001
|
30
|
|
|
|
|
&& scan < loceol |
7002
|
30
|
|
|
|
|
&& reginclass(prog, p, (U8*)scan, utf8_target)) |
7003
|
|
|
|
|
|
{ |
7004
|
30
|
|
|
|
|
scan += UTF8SKIP(scan); |
7005
|
111744
|
|
|
|
|
hardcount++; |
7006
|
|
|
|
|
|
} |
7007
|
|
|
|
|
|
} else { |
7008
|
111774
|
|
|
|
|
while (scan < loceol && REGINCLASS(prog, p, (U8*)scan)) |
7009
|
111774
|
|
|
|
|
scan++; |
7010
|
|
|
|
|
|
} |
7011
|
|
|
|
|
|
break; |
7012
|
|
|
|
|
|
|
7013
|
|
|
|
|
|
/* The argument (FLAGS) to all the POSIX node types is the class number */ |
7014
|
|
|
|
|
|
|
7015
|
|
|
|
|
|
case NPOSIXL: |
7016
|
|
|
|
|
|
to_complement = 1; |
7017
|
|
|
|
|
|
/* FALLTHROUGH */ |
7018
|
|
|
|
|
|
|
7019
|
|
|
|
|
|
case POSIXL: |
7020
|
111774
|
|
|
|
|
RXp_MATCH_TAINTED_on(prog); |
7021
|
111774
|
|
|
|
|
if (! utf8_target) { |
7022
|
111774
|
|
|
|
|
while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), |
7023
|
|
|
|
|
|
*scan))) |
7024
|
|
|
|
|
|
{ |
7025
|
151263341
|
|
|
|
|
scan++; |
7026
|
|
|
|
|
|
} |
7027
|
|
|
|
|
|
} else { |
7028
|
151263341
|
|
|
|
|
while (hardcount < max && scan < loceol |
7029
|
151263341
|
|
|
|
|
&& to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), |
7030
|
|
|
|
|
|
(U8 *) scan))) |
7031
|
|
|
|
|
|
{ |
7032
|
151263341
|
|
|
|
|
scan += UTF8SKIP(scan); |
7033
|
111774
|
|
|
|
|
hardcount++; |
7034
|
|
|
|
|
|
} |
7035
|
|
|
|
|
|
} |
7036
|
|
|
|
|
|
break; |
7037
|
|
|
|
|
|
|
7038
|
|
|
|
|
|
case POSIXD: |
7039
|
30
|
|
|
|
|
if (utf8_target) { |
7040
|
|
|
|
|
|
goto utf8_posix; |
7041
|
|
|
|
|
|
} |
7042
|
|
|
|
|
|
/* FALLTHROUGH */ |
7043
|
|
|
|
|
|
|
7044
|
|
|
|
|
|
case POSIXA: |
7045
|
30
|
|
|
|
|
if (utf8_target && loceol - scan > max) { |
7046
|
|
|
|
|
|
|
7047
|
|
|
|
|
|
/* We didn't adjust at the beginning of this routine |
7048
|
|
|
|
|
|
* because is UTF-8, but it is actually ok to do so, since here, to |
7049
|
|
|
|
|
|
* match, 1 char == 1 byte. */ |
7050
|
30
|
|
|
|
|
loceol = scan + max; |
7051
|
|
|
|
|
|
} |
7052
|
30
|
|
|
|
|
while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { |
7053
|
30
|
|
|
|
|
scan++; |
7054
|
|
|
|
|
|
} |
7055
|
|
|
|
|
|
break; |
7056
|
|
|
|
|
|
|
7057
|
|
|
|
|
|
case NPOSIXD: |
7058
|
30
|
|
|
|
|
if (utf8_target) { |
7059
|
|
|
|
|
|
to_complement = 1; |
7060
|
|
|
|
|
|
goto utf8_posix; |
7061
|
|
|
|
|
|
} |
7062
|
|
|
|
|
|
/* FALL THROUGH */ |
7063
|
|
|
|
|
|
|
7064
|
|
|
|
|
|
case NPOSIXA: |
7065
|
30
|
|
|
|
|
if (! utf8_target) { |
7066
|
111774
|
|
|
|
|
while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { |
7067
|
111774
|
|
|
|
|
scan++; |
7068
|
|
|
|
|
|
} |
7069
|
|
|
|
|
|
} |
7070
|
|
|
|
|
|
else { |
7071
|
|
|
|
|
|
|
7072
|
|
|
|
|
|
/* The complement of something that matches only ASCII matches all |
7073
|
|
|
|
|
|
* UTF-8 variant code points, plus everything in ASCII that isn't |
7074
|
|
|
|
|
|
* in the class. */ |
7075
|
167661
|
|
|
|
|
while (hardcount < max && scan < loceol |
7076
|
111774
|
|
|
|
|
&& (! UTF8_IS_INVARIANT(*scan) |
7077
|
111774
|
|
|
|
|
|| ! _generic_isCC_A((U8) *scan, FLAGS(p)))) |
7078
|
|
|
|
|
|
{ |
7079
|
111774
|
|
|
|
|
scan += UTF8SKIP(scan); |
7080
|
151263341
|
|
|
|
|
hardcount++; |
7081
|
|
|
|
|
|
} |
7082
|
|
|
|
|
|
} |
7083
|
|
|
|
|
|
break; |
7084
|
|
|
|
|
|
|
7085
|
|
|
|
|
|
case NPOSIXU: |
7086
|
|
|
|
|
|
to_complement = 1; |
7087
|
|
|
|
|
|
/* FALLTHROUGH */ |
7088
|
|
|
|
|
|
|
7089
|
|
|
|
|
|
case POSIXU: |
7090
|
151263341
|
|
|
|
|
if (! utf8_target) { |
7091
|
151263341
|
|
|
|
|
while (scan < loceol && to_complement |
7092
|
151263341
|
|
|
|
|
^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p)))) |
7093
|
|
|
|
|
|
{ |
7094
|
417738
|
|
|
|
|
scan++; |
7095
|
|
|
|
|
|
} |
7096
|
|
|
|
|
|
} |
7097
|
|
|
|
|
|
else { |
7098
|
|
|
|
|
|
utf8_posix: |
7099
|
1141235
|
|
|
|
|
classnum = (_char_class_number) FLAGS(p); |
7100
|
514628
|
|
|
|
|
if (classnum < _FIRST_NON_SWASH_CC) { |
7101
|
|
|
|
|
|
|
7102
|
|
|
|
|
|
/* Here, a swash is needed for above-Latin1 code points. |
7103
|
|
|
|
|
|
* Process as many Latin1 code points using the built-in rules. |
7104
|
|
|
|
|
|
* Go to another loop to finish processing upon encountering |
7105
|
|
|
|
|
|
* the first Latin1 code point. We could do that in this loop |
7106
|
|
|
|
|
|
* as well, but the other way saves having to test if the swash |
7107
|
|
|
|
|
|
* has been loaded every time through the loop: extra space to |
7108
|
|
|
|
|
|
* save a test. */ |
7109
|
514628
|
|
|
|
|
while (hardcount < max && scan < loceol) { |
7110
|
151263341
|
|
|
|
|
if (UTF8_IS_INVARIANT(*scan)) { |
7111
|
502808
|
|
|
|
|
if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan, |
7112
|
|
|
|
|
|
classnum)))) |
7113
|
|
|
|
|
|
{ |
7114
|
|
|
|
|
|
break; |
7115
|
|
|
|
|
|
} |
7116
|
1005616
|
|
|
|
|
scan++; |
7117
|
|
|
|
|
|
} |
7118
|
627696
|
|
|
|
|
else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { |
7119
|
627696
|
|
|
|
|
if (! (to_complement |
7120
|
627696
|
|
|
|
|
^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, |
7121
|
|
|
|
|
|
*(scan + 1)), |
7122
|
|
|
|
|
|
classnum)))) |
7123
|
|
|
|
|
|
{ |
7124
|
|
|
|
|
|
break; |
7125
|
|
|
|
|
|
} |
7126
|
627696
|
|
|
|
|
scan += 2; |
7127
|
|
|
|
|
|
} |
7128
|
|
|
|
|
|
else { |
7129
|
|
|
|
|
|
goto found_above_latin1; |
7130
|
|
|
|
|
|
} |
7131
|
|
|
|
|
|
|
7132
|
627696
|
|
|
|
|
hardcount++; |
7133
|
|
|
|
|
|
} |
7134
|
|
|
|
|
|
} |
7135
|
|
|
|
|
|
else { |
7136
|
|
|
|
|
|
/* For these character classes, the knowledge of how to handle |
7137
|
|
|
|
|
|
* every code point is compiled in to Perl via a macro. This |
7138
|
|
|
|
|
|
* code is written for making the loops as tight as possible. |
7139
|
|
|
|
|
|
* It could be refactored to save space instead */ |
7140
|
627696
|
|
|
|
|
switch (classnum) { |
7141
|
|
|
|
|
|
case _CC_ENUM_SPACE: /* XXX would require separate code |
7142
|
|
|
|
|
|
if we revert the change of \v |
7143
|
|
|
|
|
|
matching this */ |
7144
|
|
|
|
|
|
/* FALL THROUGH */ |
7145
|
|
|
|
|
|
case _CC_ENUM_PSXSPC: |
7146
|
316852
|
|
|
|
|
while (hardcount < max |
7147
|
316852
|
|
|
|
|
&& scan < loceol |
7148
|
310844
|
|
|
|
|
&& (to_complement ^ cBOOL(isSPACE_utf8(scan)))) |
7149
|
|
|
|
|
|
{ |
7150
|
627696
|
|
|
|
|
scan += UTF8SKIP(scan); |
7151
|
495010
|
|
|
|
|
hardcount++; |
7152
|
|
|
|
|
|
} |
7153
|
|
|
|
|
|
break; |
7154
|
|
|
|
|
|
case _CC_ENUM_BLANK: |
7155
|
1005616
|
|
|
|
|
while (hardcount < max |
7156
|
502808
|
|
|
|
|
&& scan < loceol |
7157
|
106572
|
|
|
|
|
&& (to_complement ^ cBOOL(isBLANK_utf8(scan)))) |
7158
|
|
|
|
|
|
{ |
7159
|
213134
|
|
|
|
|
scan += UTF8SKIP(scan); |
7160
|
158168
|
|
|
|
|
hardcount++; |
7161
|
|
|
|
|
|
} |
7162
|
|
|
|
|
|
break; |
7163
|
|
|
|
|
|
case _CC_ENUM_XDIGIT: |
7164
|
158168
|
|
|
|
|
while (hardcount < max |
7165
|
158168
|
|
|
|
|
&& scan < loceol |
7166
|
158140
|
|
|
|
|
&& (to_complement ^ cBOOL(isXDIGIT_utf8(scan)))) |
7167
|
|
|
|
|
|
{ |
7168
|
158140
|
|
|
|
|
scan += UTF8SKIP(scan); |
7169
|
52120
|
|
|
|
|
hardcount++; |
7170
|
|
|
|
|
|
} |
7171
|
|
|
|
|
|
break; |
7172
|
|
|
|
|
|
case _CC_ENUM_VERTSPACE: |
7173
|
52120
|
|
|
|
|
while (hardcount < max |
7174
|
106020
|
|
|
|
|
&& scan < loceol |
7175
|
158140
|
|
|
|
|
&& (to_complement ^ cBOOL(isVERTWS_utf8(scan)))) |
7176
|
|
|
|
|
|
{ |
7177
|
158140
|
|
|
|
|
scan += UTF8SKIP(scan); |
7178
|
106548
|
|
|
|
|
hardcount++; |
7179
|
|
|
|
|
|
} |
7180
|
|
|
|
|
|
break; |
7181
|
|
|
|
|
|
case _CC_ENUM_CNTRL: |
7182
|
213120
|
|
|
|
|
while (hardcount < max |
7183
|
1576113953
|
|
|
|
|
&& scan < loceol |
7184
|
0
|
|
|
|
|
&& (to_complement ^ cBOOL(isCNTRL_utf8(scan)))) |
7185
|
|
|
|
|
|
{ |
7186
|
0
|
|
|
|
|
scan += UTF8SKIP(scan); |
7187
|
0
|
|
|
|
|
hardcount++; |
7188
|
|
|
|
|
|
} |
7189
|
|
|
|
|
|
break; |
7190
|
|
|
|
|
|
default: |
7191
|
0
|
|
|
|
|
Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum); |
7192
|
|
|
|
|
|
} |
7193
|
|
|
|
|
|
} |
7194
|
|
|
|
|
|
} |
7195
|
|
|
|
|
|
break; |
7196
|
|
|
|
|
|
|
7197
|
|
|
|
|
|
found_above_latin1: /* Continuation of POSIXU and NPOSIXU */ |
7198
|
|
|
|
|
|
|
7199
|
|
|
|
|
|
/* Load the swash if not already present */ |
7200
|
0
|
|
|
|
|
if (! PL_utf8_swash_ptrs[classnum]) { |
7201
|
0
|
|
|
|
|
U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; |
7202
|
0
|
|
|
|
|
PL_utf8_swash_ptrs[classnum] = _core_swash_init( |
7203
|
|
|
|
|
|
"utf8", swash_property_names[classnum], |
7204
|
|
|
|
|
|
&PL_sv_undef, 1, 0, NULL, &flags); |
7205
|
|
|
|
|
|
} |
7206
|
|
|
|
|
|
|
7207
|
0
|
|
|
|
|
while (hardcount < max && scan < loceol |
7208
|
0
|
|
|
|
|
&& to_complement ^ cBOOL(_generic_utf8( |
7209
|
|
|
|
|
|
classnum, |
7210
|
|
|
|
|
|
scan, |
7211
|
|
|
|
|
|
swash_fetch(PL_utf8_swash_ptrs[classnum], |
7212
|
|
|
|
|
|
(U8 *) scan, |
7213
|
|
|
|
|
|
TRUE)))) |
7214
|
|
|
|
|
|
{ |
7215
|
0
|
|
|
|
|
scan += UTF8SKIP(scan); |
7216
|
0
|
|
|
|
|
hardcount++; |
7217
|
|
|
|
|
|
} |
7218
|
|
|
|
|
|
break; |
7219
|
|
|
|
|
|
|
7220
|
|
|
|
|
|
case LNBREAK: |
7221
|
0
|
|
|
|
|
if (utf8_target) { |
7222
|
0
|
|
|
|
|
while (hardcount < max && scan < loceol && |
7223
|
0
|
|
|
|
|
(c=is_LNBREAK_utf8_safe(scan, loceol))) { |
7224
|
0
|
|
|
|
|
scan += c; |
7225
|
0
|
|
|
|
|
hardcount++; |
7226
|
|
|
|
|
|
} |
7227
|
|
|
|
|
|
} else { |
7228
|
|
|
|
|
|
/* LNBREAK can match one or two latin chars, which is ok, but we |
7229
|
|
|
|
|
|
* have to use hardcount in this situation, and throw away the |
7230
|
|
|
|
|
|
* adjustment to done before the switch statement */ |
7231
|
0
|
|
|
|
|
loceol = reginfo->strend; |
7232
|
0
|
|
|
|
|
while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { |
7233
|
0
|
|
|
|
|
scan+=c; |
7234
|
0
|
|
|
|
|
hardcount++; |
7235
|
|
|
|
|
|
} |
7236
|
|
|
|
|
|
} |
7237
|
|
|
|
|
|
break; |
7238
|
|
|
|
|
|
|
7239
|
|
|
|
|
|
case BOUND: |
7240
|
|
|
|
|
|
case BOUNDA: |
7241
|
|
|
|
|
|
case BOUNDL: |
7242
|
|
|
|
|
|
case BOUNDU: |
7243
|
|
|
|
|
|
case EOS: |
7244
|
|
|
|
|
|
case GPOS: |
7245
|
|
|
|
|
|
case KEEPS: |
7246
|
|
|
|
|
|
case NBOUND: |
7247
|
|
|
|
|
|
case NBOUNDA: |
7248
|
|
|
|
|
|
case NBOUNDL: |
7249
|
|
|
|
|
|
case NBOUNDU: |
7250
|
|
|
|
|
|
case OPFAIL: |
7251
|
|
|
|
|
|
case SBOL: |
7252
|
|
|
|
|
|
case SEOL: |
7253
|
|
|
|
|
|
/* These are all 0 width, so match right here or not at all. */ |
7254
|
|
|
|
|
|
break; |
7255
|
|
|
|
|
|
|
7256
|
|
|
|
|
|
default: |
7257
|
0
|
|
|
|
|
Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); |
7258
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
7259
|
|
|
|
|
|
|
7260
|
|
|
|
|
|
} |
7261
|
|
|
|
|
|
|
7262
|
0
|
|
|
|
|
if (hardcount) |
7263
|
|
|
|
|
|
c = hardcount; |
7264
|
|
|
|
|
|
else |
7265
|
0
|
|
|
|
|
c = scan - *startposp; |
7266
|
0
|
|
|
|
|
*startposp = scan; |
7267
|
|
|
|
|
|
|
7268
|
0
|
|
|
|
|
DEBUG_r({ |
7269
|
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
7270
|
|
|
|
|
|
DEBUG_EXECUTE_r({ |
7271
|
|
|
|
|
|
SV * const prop = sv_newmortal(); |
7272
|
|
|
|
|
|
regprop(prog, prop, p); |
7273
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
7274
|
|
|
|
|
|
"%*s %s can match %"IVdf" times out of %"IVdf"...\n", |
7275
|
|
|
|
|
|
REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); |
7276
|
|
|
|
|
|
}); |
7277
|
|
|
|
|
|
}); |
7278
|
|
|
|
|
|
|
7279
|
0
|
|
|
|
|
return(c); |
7280
|
|
|
|
|
|
} |
7281
|
|
|
|
|
|
|
7282
|
|
|
|
|
|
|
7283
|
|
|
|
|
|
#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) |
7284
|
|
|
|
|
|
/* |
7285
|
|
|
|
|
|
- regclass_swash - prepare the utf8 swash. Wraps the shared core version to |
7286
|
|
|
|
|
|
create a copy so that changes the caller makes won't change the shared one. |
7287
|
|
|
|
|
|
If is non-null, will return NULL in it, for back-compat. |
7288
|
|
|
|
|
|
*/ |
7289
|
|
|
|
|
|
SV * |
7290
|
|
|
|
|
|
Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp) |
7291
|
|
|
|
|
|
{ |
7292
|
|
|
|
|
|
PERL_ARGS_ASSERT_REGCLASS_SWASH; |
7293
|
|
|
|
|
|
|
7294
|
|
|
|
|
|
if (altsvp) { |
7295
|
|
|
|
|
|
*altsvp = NULL; |
7296
|
|
|
|
|
|
} |
7297
|
|
|
|
|
|
|
7298
|
|
|
|
|
|
return newSVsv(core_regclass_swash(prog, node, doinit, listsvp)); |
7299
|
|
|
|
|
|
} |
7300
|
|
|
|
|
|
#endif |
7301
|
|
|
|
|
|
|
7302
|
|
|
|
|
|
STATIC SV * |
7303
|
0
|
|
|
|
|
S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp) |
7304
|
|
|
|
|
|
{ |
7305
|
|
|
|
|
|
/* Returns the swash for the input 'node' in the regex 'prog'. |
7306
|
|
|
|
|
|
* If is 'true', will attempt to create the swash if not already |
7307
|
|
|
|
|
|
* done. |
7308
|
|
|
|
|
|
* If is non-null, will return the printable contents of the |
7309
|
|
|
|
|
|
* swash. This can be used to get debugging information even before the |
7310
|
|
|
|
|
|
* swash exists, by calling this function with 'doinit' set to false, in |
7311
|
|
|
|
|
|
* which case the components that will be used to eventually create the |
7312
|
|
|
|
|
|
* swash are returned (in a printable form). |
7313
|
|
|
|
|
|
* Tied intimately to how regcomp.c sets up the data structure */ |
7314
|
|
|
|
|
|
|
7315
|
|
|
|
|
|
dVAR; |
7316
|
|
|
|
|
|
SV *sw = NULL; |
7317
|
|
|
|
|
|
SV *si = NULL; /* Input swash initialization string */ |
7318
|
|
|
|
|
|
SV* invlist = NULL; |
7319
|
|
|
|
|
|
|
7320
|
0
|
|
|
|
|
RXi_GET_DECL(prog,progi); |
7321
|
0
|
|
|
|
|
const struct reg_data * const data = prog ? progi->data : NULL; |
7322
|
|
|
|
|
|
|
7323
|
0
|
|
|
|
|
PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; |
7324
|
|
|
|
|
|
|
7325
|
0
|
|
|
|
|
assert(ANYOF_NONBITMAP(node)); |
7326
|
|
|
|
|
|
|
7327
|
0
|
|
|
|
|
if (data && data->count) { |
7328
|
0
|
|
|
|
|
const U32 n = ARG(node); |
7329
|
|
|
|
|
|
|
7330
|
0
|
|
|
|
|
if (data->what[n] == 's') { |
7331
|
0
|
|
|
|
|
SV * const rv = MUTABLE_SV(data->data[n]); |
7332
|
0
|
|
|
|
|
AV * const av = MUTABLE_AV(SvRV(rv)); |
7333
|
0
|
|
|
|
|
SV **const ary = AvARRAY(av); |
7334
|
0
|
|
|
|
|
U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; |
7335
|
|
|
|
|
|
|
7336
|
0
|
|
|
|
|
si = *ary; /* ary[0] = the string to initialize the swash with */ |
7337
|
|
|
|
|
|
|
7338
|
|
|
|
|
|
/* Elements 2 and 3 are either both present or both absent. [2] is |
7339
|
|
|
|
|
|
* any inversion list generated at compile time; [3] indicates if |
7340
|
|
|
|
|
|
* that inversion list has any user-defined properties in it. */ |
7341
|
0
|
|
|
|
|
if (av_len(av) >= 2) { |
7342
|
0
|
|
|
|
|
invlist = ary[2]; |
7343
|
0
|
|
|
|
|
if (SvUV(ary[3])) { |
7344
|
0
|
|
|
|
|
swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; |
7345
|
|
|
|
|
|
} |
7346
|
|
|
|
|
|
} |
7347
|
|
|
|
|
|
else { |
7348
|
|
|
|
|
|
invlist = NULL; |
7349
|
|
|
|
|
|
} |
7350
|
|
|
|
|
|
|
7351
|
|
|
|
|
|
/* Element [1] is reserved for the set-up swash. If already there, |
7352
|
|
|
|
|
|
* return it; if not, create it and store it there */ |
7353
|
0
|
|
|
|
|
if (ary[1] && SvROK(ary[1])) { |
7354
|
0
|
|
|
|
|
sw = ary[1]; |
7355
|
|
|
|
|
|
} |
7356
|
0
|
|
|
|
|
else if (si && doinit) { |
7357
|
|
|
|
|
|
|
7358
|
0
|
|
|
|
|
sw = _core_swash_init("utf8", /* the utf8 package */ |
7359
|
|
|
|
|
|
"", /* nameless */ |
7360
|
|
|
|
|
|
si, |
7361
|
|
|
|
|
|
1, /* binary */ |
7362
|
|
|
|
|
|
0, /* not from tr/// */ |
7363
|
|
|
|
|
|
invlist, |
7364
|
|
|
|
|
|
&swash_init_flags); |
7365
|
0
|
|
|
|
|
(void)av_store(av, 1, sw); |
7366
|
|
|
|
|
|
} |
7367
|
|
|
|
|
|
} |
7368
|
|
|
|
|
|
} |
7369
|
|
|
|
|
|
|
7370
|
|
|
|
|
|
/* If requested, return a printable version of what this swash matches */ |
7371
|
0
|
|
|
|
|
if (listsvp) { |
7372
|
0
|
|
|
|
|
SV* matches_string = newSVpvn("", 0); |
7373
|
|
|
|
|
|
|
7374
|
|
|
|
|
|
/* The swash should be used, if possible, to get the data, as it |
7375
|
|
|
|
|
|
* contains the resolved data. But this function can be called at |
7376
|
|
|
|
|
|
* compile-time, before everything gets resolved, in which case we |
7377
|
|
|
|
|
|
* return the currently best available information, which is the string |
7378
|
|
|
|
|
|
* that will eventually be used to do that resolving, 'si' */ |
7379
|
0
|
|
|
|
|
if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) |
7380
|
0
|
|
|
|
|
&& (si && si != &PL_sv_undef)) |
7381
|
|
|
|
|
|
{ |
7382
|
0
|
|
|
|
|
sv_catsv(matches_string, si); |
7383
|
|
|
|
|
|
} |
7384
|
|
|
|
|
|
|
7385
|
|
|
|
|
|
/* Add the inversion list to whatever we have. This may have come from |
7386
|
|
|
|
|
|
* the swash, or from an input parameter */ |
7387
|
0
|
|
|
|
|
if (invlist) { |
7388
|
0
|
|
|
|
|
sv_catsv(matches_string, _invlist_contents(invlist)); |
7389
|
|
|
|
|
|
} |
7390
|
0
|
|
|
|
|
*listsvp = matches_string; |
7391
|
|
|
|
|
|
} |
7392
|
|
|
|
|
|
|
7393
|
0
|
|
|
|
|
return sw; |
7394
|
|
|
|
|
|
} |
7395
|
|
|
|
|
|
|
7396
|
|
|
|
|
|
/* |
7397
|
|
|
|
|
|
- reginclass - determine if a character falls into a character class |
7398
|
|
|
|
|
|
|
7399
|
|
|
|
|
|
n is the ANYOF regnode |
7400
|
|
|
|
|
|
p is the target string |
7401
|
|
|
|
|
|
utf8_target tells whether p is in UTF-8. |
7402
|
|
|
|
|
|
|
7403
|
|
|
|
|
|
Returns true if matched; false otherwise. |
7404
|
|
|
|
|
|
|
7405
|
|
|
|
|
|
Note that this can be a synthetic start class, a combination of various |
7406
|
|
|
|
|
|
nodes, so things you think might be mutually exclusive, such as locale, |
7407
|
|
|
|
|
|
aren't. It can match both locale and non-locale |
7408
|
|
|
|
|
|
|
7409
|
|
|
|
|
|
*/ |
7410
|
|
|
|
|
|
|
7411
|
|
|
|
|
|
STATIC bool |
7412
|
0
|
|
|
|
|
S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target) |
7413
|
|
|
|
|
|
{ |
7414
|
|
|
|
|
|
dVAR; |
7415
|
0
|
|
|
|
|
const char flags = ANYOF_FLAGS(n); |
7416
|
|
|
|
|
|
bool match = FALSE; |
7417
|
0
|
|
|
|
|
UV c = *p; |
7418
|
|
|
|
|
|
|
7419
|
0
|
|
|
|
|
PERL_ARGS_ASSERT_REGINCLASS; |
7420
|
|
|
|
|
|
|
7421
|
|
|
|
|
|
/* If c is not already the code point, get it. Note that |
7422
|
|
|
|
|
|
* UTF8_IS_INVARIANT() works even if not in UTF-8 */ |
7423
|
0
|
|
|
|
|
if (! UTF8_IS_INVARIANT(c) && utf8_target) { |
7424
|
0
|
|
|
|
|
STRLEN c_len = 0; |
7425
|
0
|
|
|
|
|
c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, |
7426
|
|
|
|
|
|
(UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) |
7427
|
|
|
|
|
|
| UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); |
7428
|
|
|
|
|
|
/* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for |
7429
|
|
|
|
|
|
* UTF8_ALLOW_FFFF */ |
7430
|
0
|
|
|
|
|
if (c_len == (STRLEN)-1) |
7431
|
0
|
|
|
|
|
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); |
7432
|
|
|
|
|
|
} |
7433
|
|
|
|
|
|
|
7434
|
|
|
|
|
|
/* If this character is potentially in the bitmap, check it */ |
7435
|
0
|
|
|
|
|
if (c < 256) { |
7436
|
0
|
|
|
|
|
if (ANYOF_BITMAP_TEST(n, c)) |
7437
|
|
|
|
|
|
match = TRUE; |
7438
|
0
|
|
|
|
|
else if (flags & ANYOF_NON_UTF8_LATIN1_ALL |
7439
|
0
|
|
|
|
|
&& ! utf8_target |
7440
|
0
|
|
|
|
|
&& ! isASCII(c)) |
7441
|
|
|
|
|
|
{ |
7442
|
|
|
|
|
|
match = TRUE; |
7443
|
|
|
|
|
|
} |
7444
|
0
|
|
|
|
|
else if (flags & ANYOF_LOCALE) { |
7445
|
0
|
|
|
|
|
RXp_MATCH_TAINTED_on(prog); |
7446
|
|
|
|
|
|
|
7447
|
0
|
|
|
|
|
if ((flags & ANYOF_LOC_FOLD) |
7448
|
0
|
|
|
|
|
&& ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) |
7449
|
|
|
|
|
|
{ |
7450
|
|
|
|
|
|
match = TRUE; |
7451
|
|
|
|
|
|
} |
7452
|
0
|
|
|
|
|
else if (ANYOF_CLASS_TEST_ANY_SET(n)) { |
7453
|
|
|
|
|
|
|
7454
|
|
|
|
|
|
/* The data structure is arranged so bits 0, 2, 4, ... are set |
7455
|
|
|
|
|
|
* if the class includes the Posix character class given by |
7456
|
|
|
|
|
|
* bit/2; and 1, 3, 5, ... are set if the class includes the |
7457
|
|
|
|
|
|
* complemented Posix class given by int(bit/2). So we loop |
7458
|
|
|
|
|
|
* through the bits, each time changing whether we complement |
7459
|
|
|
|
|
|
* the result or not. Suppose for the sake of illustration |
7460
|
|
|
|
|
|
* that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0 |
7461
|
|
|
|
|
|
* is set, it means there is a match for this ANYOF node if the |
7462
|
|
|
|
|
|
* character is in the class given by the expression (0 / 2 = 0 |
7463
|
|
|
|
|
|
* = \w). If it is in that class, isFOO_lc() will return 1, |
7464
|
|
|
|
|
|
* and since 'to_complement' is 0, the result will stay TRUE, |
7465
|
|
|
|
|
|
* and we exit the loop. Suppose instead that bit 0 is 0, but |
7466
|
|
|
|
|
|
* bit 1 is 1. That means there is a match if the character |
7467
|
|
|
|
|
|
* matches \W. We won't bother to call isFOO_lc() on bit 0, |
7468
|
|
|
|
|
|
* but will on bit 1. On the second iteration 'to_complement' |
7469
|
|
|
|
|
|
* will be 1, so the exclusive or will reverse things, so we |
7470
|
|
|
|
|
|
* are testing for \W. On the third iteration, 'to_complement' |
7471
|
|
|
|
|
|
* will be 0, and we would be testing for \s; the fourth |
7472
|
|
|
|
|
|
* iteration would test for \S, etc. |
7473
|
|
|
|
|
|
* |
7474
|
|
|
|
|
|
* Note that this code assumes that all the classes are closed |
7475
|
|
|
|
|
|
* under folding. For example, if a character matches \w, then |
7476
|
|
|
|
|
|
* its fold does too; and vice versa. This should be true for |
7477
|
|
|
|
|
|
* any well-behaved locale for all the currently defined Posix |
7478
|
|
|
|
|
|
* classes, except for :lower: and :upper:, which are handled |
7479
|
|
|
|
|
|
* by the pseudo-class :cased: which matches if either of the |
7480
|
|
|
|
|
|
* other two does. To get rid of this assumption, an outer |
7481
|
|
|
|
|
|
* loop could be used below to iterate over both the source |
7482
|
|
|
|
|
|
* character, and its fold (if different) */ |
7483
|
|
|
|
|
|
|
7484
|
|
|
|
|
|
int count = 0; |
7485
|
|
|
|
|
|
int to_complement = 0; |
7486
|
0
|
|
|
|
|
while (count < ANYOF_MAX) { |
7487
|
0
|
|
|
|
|
if (ANYOF_CLASS_TEST(n, count) |
7488
|
0
|
|
|
|
|
&& to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) |
7489
|
|
|
|
|
|
{ |
7490
|
|
|
|
|
|
match = TRUE; |
7491
|
|
|
|
|
|
break; |
7492
|
|
|
|
|
|
} |
7493
|
0
|
|
|
|
|
count++; |
7494
|
0
|
|
|
|
|
to_complement ^= 1; |
7495
|
|
|
|
|
|
} |
7496
|
|
|
|
|
|
} |
7497
|
|
|
|
|
|
} |
7498
|
|
|
|
|
|
} |
7499
|
|
|
|
|
|
|
7500
|
|
|
|
|
|
/* If the bitmap didn't (or couldn't) match, and something outside the |
7501
|
|
|
|
|
|
* bitmap could match, try that. Locale nodes specify completely the |
7502
|
|
|
|
|
|
* behavior of code points in the bit map (otherwise, a utf8 target would |
7503
|
|
|
|
|
|
* cause them to be treated as Unicode and not locale), except in |
7504
|
|
|
|
|
|
* the very unlikely event when this node is a synthetic start class, which |
7505
|
|
|
|
|
|
* could be a combination of locale and non-locale nodes. So allow locale |
7506
|
|
|
|
|
|
* to match for the synthetic start class, which will give a false |
7507
|
|
|
|
|
|
* positive that will be resolved when the match is done again as not part |
7508
|
|
|
|
|
|
* of the synthetic start class */ |
7509
|
0
|
|
|
|
|
if (!match) { |
7510
|
0
|
|
|
|
|
if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { |
7511
|
|
|
|
|
|
match = TRUE; /* Everything above 255 matches */ |
7512
|
|
|
|
|
|
} |
7513
|
0
|
|
|
|
|
else if (ANYOF_NONBITMAP(n) |
7514
|
0
|
|
|
|
|
&& ((flags & ANYOF_NONBITMAP_NON_UTF8) |
7515
|
0
|
|
|
|
|
|| (utf8_target |
7516
|
0
|
|
|
|
|
&& (c >=256 |
7517
|
0
|
|
|
|
|
|| (! (flags & ANYOF_LOCALE)) |
7518
|
0
|
|
|
|
|
|| OP(n) == ANYOF_SYNTHETIC)))) |
7519
|
|
|
|
|
|
{ |
7520
|
0
|
|
|
|
|
SV * const sw = core_regclass_swash(prog, n, TRUE, 0); |
7521
|
0
|
|
|
|
|
if (sw) { |
7522
|
|
|
|
|
|
U8 * utf8_p; |
7523
|
0
|
|
|
|
|
if (utf8_target) { |
7524
|
|
|
|
|
|
utf8_p = (U8 *) p; |
7525
|
|
|
|
|
|
} else { /* Convert to utf8 */ |
7526
|
0
|
|
|
|
|
STRLEN len = 1; |
7527
|
0
|
|
|
|
|
utf8_p = bytes_to_utf8(p, &len); |
7528
|
|
|
|
|
|
} |
7529
|
|
|
|
|
|
|
7530
|
0
|
|
|
|
|
if (swash_fetch(sw, utf8_p, TRUE)) { |
7531
|
|
|
|
|
|
match = TRUE; |
7532
|
|
|
|
|
|
} |
7533
|
|
|
|
|
|
|
7534
|
|
|
|
|
|
/* If we allocated a string above, free it */ |
7535
|
0
|
|
|
|
|
if (! utf8_target) Safefree(utf8_p); |
7536
|
|
|
|
|
|
} |
7537
|
|
|
|
|
|
} |
7538
|
|
|
|
|
|
|
7539
|
0
|
|
|
|
|
if (UNICODE_IS_SUPER(c) |
7540
|
0
|
|
|
|
|
&& OP(n) == ANYOF_WARN_SUPER |
7541
|
0
|
|
|
|
|
&& ckWARN_d(WARN_NON_UNICODE)) |
7542
|
|
|
|
|
|
{ |
7543
|
0
|
|
|
|
|
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), |
7544
|
|
|
|
|
|
"Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c); |
7545
|
|
|
|
|
|
} |
7546
|
|
|
|
|
|
} |
7547
|
|
|
|
|
|
|
7548
|
|
|
|
|
|
/* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ |
7549
|
0
|
|
|
|
|
return cBOOL(flags & ANYOF_INVERT) ^ match; |
7550
|
|
|
|
|
|
} |
7551
|
|
|
|
|
|
|
7552
|
|
|
|
|
|
STATIC U8 * |
7553
|
0
|
|
|
|
|
S_reghop3(U8 *s, SSize_t off, const U8* lim) |
7554
|
|
|
|
|
|
{ |
7555
|
|
|
|
|
|
/* return the position 'off' UTF-8 characters away from 's', forward if |
7556
|
|
|
|
|
|
* 'off' >= 0, backwards if negative. But don't go outside of position |
7557
|
|
|
|
|
|
* 'lim', which better be < s if off < 0 */ |
7558
|
|
|
|
|
|
|
7559
|
|
|
|
|
|
dVAR; |
7560
|
|
|
|
|
|
|
7561
|
0
|
|
|
|
|
PERL_ARGS_ASSERT_REGHOP3; |
7562
|
|
|
|
|
|
|
7563
|
0
|
|
|
|
|
if (off >= 0) { |
7564
|
0
|
|
|
|
|
while (off-- && s < lim) { |
7565
|
|
|
|
|
|
/* XXX could check well-formedness here */ |
7566
|
0
|
|
|
|
|
s += UTF8SKIP(s); |
7567
|
|
|
|
|
|
} |
7568
|
|
|
|
|
|
} |
7569
|
|
|
|
|
|
else { |
7570
|
0
|
|
|
|
|
while (off++ && s > lim) { |
7571
|
0
|
|
|
|
|
s--; |
7572
|
0
|
|
|
|
|
if (UTF8_IS_CONTINUED(*s)) { |
7573
|
0
|
|
|
|
|
while (s > lim && UTF8_IS_CONTINUATION(*s)) |
7574
|
0
|
|
|
|
|
s--; |
7575
|
|
|
|
|
|
} |
7576
|
|
|
|
|
|
/* XXX could check well-formedness here */ |
7577
|
|
|
|
|
|
} |
7578
|
|
|
|
|
|
} |
7579
|
0
|
|
|
|
|
return s; |
7580
|
|
|
|
|
|
} |
7581
|
|
|
|
|
|
|
7582
|
|
|
|
|
|
#ifdef XXX_dmq |
7583
|
|
|
|
|
|
/* there are a bunch of places where we use two reghop3's that should |
7584
|
|
|
|
|
|
be replaced with this routine. but since thats not done yet |
7585
|
|
|
|
|
|
we ifdef it out - dmq |
7586
|
|
|
|
|
|
*/ |
7587
|
|
|
|
|
|
STATIC U8 * |
7588
|
|
|
|
|
|
S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) |
7589
|
|
|
|
|
|
{ |
7590
|
|
|
|
|
|
dVAR; |
7591
|
|
|
|
|
|
|
7592
|
|
|
|
|
|
PERL_ARGS_ASSERT_REGHOP4; |
7593
|
|
|
|
|
|
|
7594
|
|
|
|
|
|
if (off >= 0) { |
7595
|
|
|
|
|
|
while (off-- && s < rlim) { |
7596
|
|
|
|
|
|
/* XXX could check well-formedness here */ |
7597
|
|
|
|
|
|
s += UTF8SKIP(s); |
7598
|
|
|
|
|
|
} |
7599
|
|
|
|
|
|
} |
7600
|
|
|
|
|
|
else { |
7601
|
|
|
|
|
|
while (off++ && s > llim) { |
7602
|
|
|
|
|
|
s--; |
7603
|
|
|
|
|
|
if (UTF8_IS_CONTINUED(*s)) { |
7604
|
|
|
|
|
|
while (s > llim && UTF8_IS_CONTINUATION(*s)) |
7605
|
|
|
|
|
|
s--; |
7606
|
|
|
|
|
|
} |
7607
|
|
|
|
|
|
/* XXX could check well-formedness here */ |
7608
|
|
|
|
|
|
} |
7609
|
|
|
|
|
|
} |
7610
|
|
|
|
|
|
return s; |
7611
|
|
|
|
|
|
} |
7612
|
|
|
|
|
|
#endif |
7613
|
|
|
|
|
|
|
7614
|
|
|
|
|
|
STATIC U8 * |
7615
|
0
|
|
|
|
|
S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) |
7616
|
|
|
|
|
|
{ |
7617
|
|
|
|
|
|
dVAR; |
7618
|
|
|
|
|
|
|
7619
|
0
|
|
|
|
|
PERL_ARGS_ASSERT_REGHOPMAYBE3; |
7620
|
|
|
|
|
|
|
7621
|
0
|
|
|
|
|
if (off >= 0) { |
7622
|
0
|
|
|
|
|
while (off-- && s < lim) { |
7623
|
|
|
|
|
|
/* XXX could check well-formedness here */ |
7624
|
0
|
|
|
|
|
s += UTF8SKIP(s); |
7625
|
|
|
|
|
|
} |
7626
|
0
|
|
|
|
|
if (off >= 0) |
7627
|
|
|
|
|
|
return NULL; |
7628
|
|
|
|
|
|
} |
7629
|
|
|
|
|
|
else { |
7630
|
0
|
|
|
|
|
while (off++ && s > lim) { |
7631
|
0
|
|
|
|
|
s--; |
7632
|
0
|
|
|
|
|
if (UTF8_IS_CONTINUED(*s)) { |
7633
|
0
|
|
|
|
|
while (s > lim && UTF8_IS_CONTINUATION(*s)) |
7634
|
0
|
|
|
|
|
s--; |
7635
|
|
|
|
|
|
} |
7636
|
|
|
|
|
|
/* XXX could check well-formedness here */ |
7637
|
|
|
|
|
|
} |
7638
|
0
|
|
|
|
|
if (off <= 0) |
7639
|
|
|
|
|
|
return NULL; |
7640
|
|
|
|
|
|
} |
7641
|
0
|
|
|
|
|
return s; |
7642
|
|
|
|
|
|
} |
7643
|
|
|
|
|
|
|
7644
|
|
|
|
|
|
|
7645
|
|
|
|
|
|
/* when executing a regex that may have (?{}), extra stuff needs setting |
7646
|
|
|
|
|
|
up that will be visible to the called code, even before the current |
7647
|
|
|
|
|
|
match has finished. In particular: |
7648
|
|
|
|
|
|
|
7649
|
|
|
|
|
|
* $_ is localised to the SV currently being matched; |
7650
|
|
|
|
|
|
* pos($_) is created if necessary, ready to be updated on each call-out |
7651
|
|
|
|
|
|
to code; |
7652
|
|
|
|
|
|
* a fake PMOP is created that can be set to PL_curpm (normally PL_curpm |
7653
|
|
|
|
|
|
isn't set until the current pattern is successfully finished), so that |
7654
|
|
|
|
|
|
$1 etc of the match-so-far can be seen; |
7655
|
|
|
|
|
|
* save the old values of subbeg etc of the current regex, and set then |
7656
|
|
|
|
|
|
to the current string (again, this is normally only done at the end |
7657
|
|
|
|
|
|
of execution) |
7658
|
|
|
|
|
|
*/ |
7659
|
|
|
|
|
|
|
7660
|
|
|
|
|
|
static void |
7661
|
36
|
|
|
|
|
S_setup_eval_state(pTHX_ regmatch_info *const reginfo) |
7662
|
|
|
|
|
|
{ |
7663
|
|
|
|
|
|
MAGIC *mg; |
7664
|
72
|
|
|
|
|
regexp *const rex = ReANY(reginfo->prog); |
7665
|
36
|
|
|
|
|
regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; |
7666
|
|
|
|
|
|
|
7667
|
36
|
|
|
|
|
eval_state->rex = rex; |
7668
|
|
|
|
|
|
|
7669
|
36
|
|
|
|
|
if (reginfo->sv) { |
7670
|
|
|
|
|
|
/* Make $_ available to executed code. */ |
7671
|
36
|
|
|
|
|
if (reginfo->sv != DEFSV) { |
7672
|
36
|
|
|
|
|
SAVE_DEFSV; |
7673
|
36
|
|
|
|
|
DEFSV_set(reginfo->sv); |
7674
|
|
|
|
|
|
} |
7675
|
|
|
|
|
|
|
7676
|
36
|
|
|
|
|
if (!(mg = mg_find_mglob(reginfo->sv))) { |
7677
|
|
|
|
|
|
/* prepare for quick setting of pos */ |
7678
|
12
|
|
|
|
|
mg = sv_magicext_mglob(reginfo->sv); |
7679
|
12
|
|
|
|
|
mg->mg_len = -1; |
7680
|
|
|
|
|
|
} |
7681
|
36
|
|
|
|
|
eval_state->pos_magic = mg; |
7682
|
36
|
|
|
|
|
eval_state->pos = mg->mg_len; |
7683
|
36
|
|
|
|
|
eval_state->pos_flags = mg->mg_flags; |
7684
|
|
|
|
|
|
} |
7685
|
|
|
|
|
|
else |
7686
|
0
|
|
|
|
|
eval_state->pos_magic = NULL; |
7687
|
|
|
|
|
|
|
7688
|
36
|
|
|
|
|
if (!PL_reg_curpm) { |
7689
|
|
|
|
|
|
/* PL_reg_curpm is a fake PMOP that we can attach the current |
7690
|
|
|
|
|
|
* regex to and point PL_curpm at, so that $1 et al are visible |
7691
|
|
|
|
|
|
* within a /(?{})/. It's just allocated once per interpreter the |
7692
|
|
|
|
|
|
* first time its needed */ |
7693
|
12
|
|
|
|
|
Newxz(PL_reg_curpm, 1, PMOP); |
7694
|
|
|
|
|
|
#ifdef USE_ITHREADS |
7695
|
|
|
|
|
|
{ |
7696
|
|
|
|
|
|
SV* const repointer = &PL_sv_undef; |
7697
|
|
|
|
|
|
/* this regexp is also owned by the new PL_reg_curpm, which |
7698
|
|
|
|
|
|
will try to free it. */ |
7699
|
|
|
|
|
|
av_push(PL_regex_padav, repointer); |
7700
|
|
|
|
|
|
PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); |
7701
|
|
|
|
|
|
PL_regex_pad = AvARRAY(PL_regex_padav); |
7702
|
|
|
|
|
|
} |
7703
|
|
|
|
|
|
#endif |
7704
|
|
|
|
|
|
} |
7705
|
72
|
|
|
|
|
SET_reg_curpm(reginfo->prog); |
7706
|
36
|
|
|
|
|
eval_state->curpm = PL_curpm; |
7707
|
36
|
|
|
|
|
PL_curpm = PL_reg_curpm; |
7708
|
36
|
|
|
|
|
if (RXp_MATCH_COPIED(rex)) { |
7709
|
|
|
|
|
|
/* Here is a serious problem: we cannot rewrite subbeg, |
7710
|
|
|
|
|
|
since it may be needed if this match fails. Thus |
7711
|
|
|
|
|
|
$` inside (?{}) could fail... */ |
7712
|
0
|
|
|
|
|
eval_state->subbeg = rex->subbeg; |
7713
|
0
|
|
|
|
|
eval_state->sublen = rex->sublen; |
7714
|
0
|
|
|
|
|
eval_state->suboffset = rex->suboffset; |
7715
|
0
|
|
|
|
|
eval_state->subcoffset = rex->subcoffset; |
7716
|
|
|
|
|
|
#ifdef PERL_ANY_COW |
7717
|
0
|
|
|
|
|
eval_state->saved_copy = rex->saved_copy; |
7718
|
|
|
|
|
|
#endif |
7719
|
0
|
|
|
|
|
RXp_MATCH_COPIED_off(rex); |
7720
|
|
|
|
|
|
} |
7721
|
|
|
|
|
|
else |
7722
|
36
|
|
|
|
|
eval_state->subbeg = NULL; |
7723
|
36
|
|
|
|
|
rex->subbeg = (char *)reginfo->strbeg; |
7724
|
36
|
|
|
|
|
rex->suboffset = 0; |
7725
|
36
|
|
|
|
|
rex->subcoffset = 0; |
7726
|
36
|
|
|
|
|
rex->sublen = reginfo->strend - reginfo->strbeg; |
7727
|
36
|
|
|
|
|
} |
7728
|
|
|
|
|
|
|
7729
|
|
|
|
|
|
|
7730
|
|
|
|
|
|
/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ |
7731
|
|
|
|
|
|
|
7732
|
|
|
|
|
|
static void |
7733
|
44
|
|
|
|
|
S_cleanup_regmatch_info_aux(pTHX_ void *arg) |
7734
|
|
|
|
|
|
{ |
7735
|
|
|
|
|
|
dVAR; |
7736
|
|
|
|
|
|
regmatch_info_aux *aux = (regmatch_info_aux *) arg; |
7737
|
44
|
|
|
|
|
regmatch_info_aux_eval *eval_state = aux->info_aux_eval; |
7738
|
|
|
|
|
|
regmatch_slab *s; |
7739
|
|
|
|
|
|
|
7740
|
44
|
|
|
|
|
Safefree(aux->poscache); |
7741
|
|
|
|
|
|
|
7742
|
44
|
|
|
|
|
if (eval_state) { |
7743
|
|
|
|
|
|
|
7744
|
|
|
|
|
|
/* undo the effects of S_setup_eval_state() */ |
7745
|
|
|
|
|
|
|
7746
|
36
|
|
|
|
|
if (eval_state->subbeg) { |
7747
|
0
|
|
|
|
|
regexp * const rex = eval_state->rex; |
7748
|
0
|
|
|
|
|
rex->subbeg = eval_state->subbeg; |
7749
|
0
|
|
|
|
|
rex->sublen = eval_state->sublen; |
7750
|
0
|
|
|
|
|
rex->suboffset = eval_state->suboffset; |
7751
|
0
|
|
|
|
|
rex->subcoffset = eval_state->subcoffset; |
7752
|
|
|
|
|
|
#ifdef PERL_ANY_COW |
7753
|
0
|
|
|
|
|
rex->saved_copy = eval_state->saved_copy; |
7754
|
|
|
|
|
|
#endif |
7755
|
0
|
|
|
|
|
RXp_MATCH_COPIED_on(rex); |
7756
|
|
|
|
|
|
} |
7757
|
36
|
|
|
|
|
if (eval_state->pos_magic) |
7758
|
|
|
|
|
|
{ |
7759
|
36
|
|
|
|
|
eval_state->pos_magic->mg_len = eval_state->pos; |
7760
|
72
|
|
|
|
|
eval_state->pos_magic->mg_flags = |
7761
|
36
|
|
|
|
|
(eval_state->pos_magic->mg_flags & ~MGf_BYTES) |
7762
|
36
|
|
|
|
|
| (eval_state->pos_flags & MGf_BYTES); |
7763
|
|
|
|
|
|
} |
7764
|
|
|
|
|
|
|
7765
|
36
|
|
|
|
|
PL_curpm = eval_state->curpm; |
7766
|
|
|
|
|
|
} |
7767
|
|
|
|
|
|
|
7768
|
44
|
|
|
|
|
PL_regmatch_state = aux->old_regmatch_state; |
7769
|
44
|
|
|
|
|
PL_regmatch_slab = aux->old_regmatch_slab; |
7770
|
|
|
|
|
|
|
7771
|
|
|
|
|
|
/* free all slabs above current one - this must be the last action |
7772
|
|
|
|
|
|
* of this function, as aux and eval_state are allocated within |
7773
|
|
|
|
|
|
* slabs and may be freed here */ |
7774
|
|
|
|
|
|
|
7775
|
44
|
|
|
|
|
s = PL_regmatch_slab->next; |
7776
|
44
|
|
|
|
|
if (s) { |
7777
|
0
|
|
|
|
|
PL_regmatch_slab->next = NULL; |
7778
|
0
|
|
|
|
|
while (s) { |
7779
|
|
|
|
|
|
regmatch_slab * const osl = s; |
7780
|
0
|
|
|
|
|
s = s->next; |
7781
|
0
|
|
|
|
|
Safefree(osl); |
7782
|
|
|
|
|
|
} |
7783
|
|
|
|
|
|
} |
7784
|
44
|
|
|
|
|
} |
7785
|
|
|
|
|
|
|
7786
|
|
|
|
|
|
|
7787
|
|
|
|
|
|
STATIC void |
7788
|
0
|
|
|
|
|
S_to_utf8_substr(pTHX_ regexp *prog) |
7789
|
|
|
|
|
|
{ |
7790
|
|
|
|
|
|
/* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile |
7791
|
|
|
|
|
|
* on the converted value */ |
7792
|
|
|
|
|
|
|
7793
|
|
|
|
|
|
int i = 1; |
7794
|
|
|
|
|
|
|
7795
|
0
|
|
|
|
|
PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; |
7796
|
|
|
|
|
|
|
7797
|
|
|
|
|
|
do { |
7798
|
0
|
|
|
|
|
if (prog->substrs->data[i].substr |
7799
|
0
|
|
|
|
|
&& !prog->substrs->data[i].utf8_substr) { |
7800
|
0
|
|
|
|
|
SV* const sv = newSVsv(prog->substrs->data[i].substr); |
7801
|
0
|
|
|
|
|
prog->substrs->data[i].utf8_substr = sv; |
7802
|
0
|
|
|
|
|
sv_utf8_upgrade(sv); |
7803
|
0
|
|
|
|
|
if (SvVALID(prog->substrs->data[i].substr)) { |
7804
|
0
|
|
|
|
|
if (SvTAIL(prog->substrs->data[i].substr)) { |
7805
|
|
|
|
|
|
/* Trim the trailing \n that fbm_compile added last |
7806
|
|
|
|
|
|
time. */ |
7807
|
0
|
|
|
|
|
SvCUR_set(sv, SvCUR(sv) - 1); |
7808
|
|
|
|
|
|
/* Whilst this makes the SV technically "invalid" (as its |
7809
|
|
|
|
|
|
buffer is no longer followed by "\0") when fbm_compile() |
7810
|
|
|
|
|
|
adds the "\n" back, a "\0" is restored. */ |
7811
|
0
|
|
|
|
|
fbm_compile(sv, FBMcf_TAIL); |
7812
|
|
|
|
|
|
} else |
7813
|
0
|
|
|
|
|
fbm_compile(sv, 0); |
7814
|
|
|
|
|
|
} |
7815
|
0
|
|
|
|
|
if (prog->substrs->data[i].substr == prog->check_substr) |
7816
|
0
|
|
|
|
|
prog->check_utf8 = sv; |
7817
|
|
|
|
|
|
} |
7818
|
0
|
|
|
|
|
} while (i--); |
7819
|
0
|
|
|
|
|
} |
7820
|
|
|
|
|
|
|
7821
|
|
|
|
|
|
STATIC bool |
7822
|
28
|
|
|
|
|
S_to_byte_substr(pTHX_ regexp *prog) |
7823
|
|
|
|
|
|
{ |
7824
|
|
|
|
|
|
/* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile |
7825
|
|
|
|
|
|
* on the converted value; returns FALSE if can't be converted. */ |
7826
|
|
|
|
|
|
|
7827
|
|
|
|
|
|
dVAR; |
7828
|
|
|
|
|
|
int i = 1; |
7829
|
|
|
|
|
|
|
7830
|
28
|
|
|
|
|
PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; |
7831
|
|
|
|
|
|
|
7832
|
|
|
|
|
|
do { |
7833
|
56
|
|
|
|
|
if (prog->substrs->data[i].utf8_substr |
7834
|
28
|
|
|
|
|
&& !prog->substrs->data[i].substr) { |
7835
|
28
|
|
|
|
|
SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); |
7836
|
28
|
|
|
|
|
if (! sv_utf8_downgrade(sv, TRUE)) { |
7837
|
|
|
|
|
|
return FALSE; |
7838
|
|
|
|
|
|
} |
7839
|
0
|
|
|
|
|
if (SvVALID(prog->substrs->data[i].utf8_substr)) { |
7840
|
0
|
|
|
|
|
if (SvTAIL(prog->substrs->data[i].utf8_substr)) { |
7841
|
|
|
|
|
|
/* Trim the trailing \n that fbm_compile added last |
7842
|
|
|
|
|
|
time. */ |
7843
|
0
|
|
|
|
|
SvCUR_set(sv, SvCUR(sv) - 1); |
7844
|
0
|
|
|
|
|
fbm_compile(sv, FBMcf_TAIL); |
7845
|
|
|
|
|
|
} else |
7846
|
0
|
|
|
|
|
fbm_compile(sv, 0); |
7847
|
|
|
|
|
|
} |
7848
|
0
|
|
|
|
|
prog->substrs->data[i].substr = sv; |
7849
|
0
|
|
|
|
|
if (prog->substrs->data[i].utf8_substr == prog->check_utf8) |
7850
|
0
|
|
|
|
|
prog->check_substr = sv; |
7851
|
|
|
|
|
|
} |
7852
|
28
|
|
|
|
|
} while (i--); |
7853
|
|
|
|
|
|
|
7854
|
|
|
|
|
|
return TRUE; |
7855
|
1216
|
|
|
|
|
} |
7856
|
|
|
|
|
|
|
7857
|
|
|
|
|
|
/* |
7858
|
|
|
|
|
|
* Local variables: |
7859
|
|
|
|
|
|
* c-indentation-style: bsd |
7860
|
|
|
|
|
|
* c-basic-offset: 4 |
7861
|
|
|
|
|
|
* indent-tabs-mode: nil |
7862
|
|
|
|
|
|
* End: |
7863
|
|
|
|
|
|
* |
7864
|
|
|
|
|
|
* ex: set ts=8 sts=4 sw=4 et: |
7865
|
|
|
|
|
|
*/ |