line |
stmt |
bran |
cond |
sub |
time |
code |
1
|
|
|
|
|
|
/* regcomp.c |
2
|
|
|
|
|
|
*/ |
3
|
|
|
|
|
|
|
4
|
|
|
|
|
|
/* |
5
|
|
|
|
|
|
* 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee |
6
|
|
|
|
|
|
* |
7
|
|
|
|
|
|
* [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] |
8
|
|
|
|
|
|
*/ |
9
|
|
|
|
|
|
|
10
|
|
|
|
|
|
/* This file contains functions for compiling a regular expression. See |
11
|
|
|
|
|
|
* also regexec.c which funnily enough, contains functions for executing |
12
|
|
|
|
|
|
* a regular expression. |
13
|
|
|
|
|
|
* |
14
|
|
|
|
|
|
* This file is also copied at build time to ext/re/re_comp.c, where |
15
|
|
|
|
|
|
* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. |
16
|
|
|
|
|
|
* This causes the main functions to be compiled under new names and with |
17
|
|
|
|
|
|
* debugging support added, which makes "use re 'debug'" work. |
18
|
|
|
|
|
|
*/ |
19
|
|
|
|
|
|
|
20
|
|
|
|
|
|
/* NOTE: this is derived from Henry Spencer's regexp code, and should not |
21
|
|
|
|
|
|
* confused with the original package (see point 3 below). Thanks, Henry! |
22
|
|
|
|
|
|
*/ |
23
|
|
|
|
|
|
|
24
|
|
|
|
|
|
/* Additional note: this code is very heavily munged from Henry's version |
25
|
|
|
|
|
|
* in places. In some spots I've traded clarity for efficiency, so don't |
26
|
|
|
|
|
|
* blame Henry for some of the lack of readability. |
27
|
|
|
|
|
|
*/ |
28
|
|
|
|
|
|
|
29
|
|
|
|
|
|
/* The names of the functions have been changed from regcomp and |
30
|
|
|
|
|
|
* regexec to pregcomp and pregexec in order to avoid conflicts |
31
|
|
|
|
|
|
* with the POSIX routines of the same names. |
32
|
|
|
|
|
|
*/ |
33
|
|
|
|
|
|
|
34
|
|
|
|
|
|
#ifdef PERL_EXT_RE_BUILD |
35
|
|
|
|
|
|
#include "re_top.h" |
36
|
|
|
|
|
|
#endif |
37
|
|
|
|
|
|
|
38
|
|
|
|
|
|
/* |
39
|
|
|
|
|
|
* pregcomp and pregexec -- regsub and regerror are not used in perl |
40
|
|
|
|
|
|
* |
41
|
|
|
|
|
|
* Copyright (c) 1986 by University of Toronto. |
42
|
|
|
|
|
|
* Written by Henry Spencer. Not derived from licensed software. |
43
|
|
|
|
|
|
* |
44
|
|
|
|
|
|
* Permission is granted to anyone to use this software for any |
45
|
|
|
|
|
|
* purpose on any computer system, and to redistribute it freely, |
46
|
|
|
|
|
|
* subject to the following restrictions: |
47
|
|
|
|
|
|
* |
48
|
|
|
|
|
|
* 1. The author is not responsible for the consequences of use of |
49
|
|
|
|
|
|
* this software, no matter how awful, even if they arise |
50
|
|
|
|
|
|
* from defects in it. |
51
|
|
|
|
|
|
* |
52
|
|
|
|
|
|
* 2. The origin of this software must not be misrepresented, either |
53
|
|
|
|
|
|
* by explicit claim or by omission. |
54
|
|
|
|
|
|
* |
55
|
|
|
|
|
|
* 3. Altered versions must be plainly marked as such, and must not |
56
|
|
|
|
|
|
* be misrepresented as being the original software. |
57
|
|
|
|
|
|
* |
58
|
|
|
|
|
|
* |
59
|
|
|
|
|
|
**** Alterations to Henry's code are... |
60
|
|
|
|
|
|
**** |
61
|
|
|
|
|
|
**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, |
62
|
|
|
|
|
|
**** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 |
63
|
|
|
|
|
|
**** by Larry Wall and others |
64
|
|
|
|
|
|
**** |
65
|
|
|
|
|
|
**** You may distribute under the terms of either the GNU General Public |
66
|
|
|
|
|
|
**** License or the Artistic License, as specified in the README file. |
67
|
|
|
|
|
|
|
68
|
|
|
|
|
|
* |
69
|
|
|
|
|
|
* Beware that some of this code is subtly aware of the way operator |
70
|
|
|
|
|
|
* precedence is structured in regular expressions. Serious changes in |
71
|
|
|
|
|
|
* regular-expression syntax might require a total rethink. |
72
|
|
|
|
|
|
*/ |
73
|
|
|
|
|
|
#include "EXTERN.h" |
74
|
|
|
|
|
|
#define PERL_IN_REGCOMP_C |
75
|
|
|
|
|
|
#include "perl.h" |
76
|
|
|
|
|
|
|
77
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
78
|
|
|
|
|
|
# include "INTERN.h" |
79
|
|
|
|
|
|
#endif |
80
|
|
|
|
|
|
|
81
|
|
|
|
|
|
#define REG_COMP_C |
82
|
|
|
|
|
|
#ifdef PERL_IN_XSUB_RE |
83
|
|
|
|
|
|
# include "re_comp.h" |
84
|
|
|
|
|
|
extern const struct regexp_engine my_reg_engine; |
85
|
|
|
|
|
|
#else |
86
|
|
|
|
|
|
# include "regcomp.h" |
87
|
|
|
|
|
|
#endif |
88
|
|
|
|
|
|
|
89
|
|
|
|
|
|
#include "dquote_static.c" |
90
|
|
|
|
|
|
#include "charclass_invlists.h" |
91
|
|
|
|
|
|
#include "inline_invlist.c" |
92
|
|
|
|
|
|
#include "unicode_constants.h" |
93
|
|
|
|
|
|
|
94
|
|
|
|
|
|
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) |
95
|
|
|
|
|
|
#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) |
96
|
|
|
|
|
|
#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) |
97
|
|
|
|
|
|
|
98
|
|
|
|
|
|
#ifdef op |
99
|
|
|
|
|
|
#undef op |
100
|
|
|
|
|
|
#endif /* op */ |
101
|
|
|
|
|
|
|
102
|
|
|
|
|
|
#ifdef MSDOS |
103
|
|
|
|
|
|
# if defined(BUGGY_MSC6) |
104
|
|
|
|
|
|
/* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ |
105
|
|
|
|
|
|
# pragma optimize("a",off) |
106
|
|
|
|
|
|
/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ |
107
|
|
|
|
|
|
# pragma optimize("w",on ) |
108
|
|
|
|
|
|
# endif /* BUGGY_MSC6 */ |
109
|
|
|
|
|
|
#endif /* MSDOS */ |
110
|
|
|
|
|
|
|
111
|
|
|
|
|
|
#ifndef STATIC |
112
|
|
|
|
|
|
#define STATIC static |
113
|
|
|
|
|
|
#endif |
114
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
116
|
|
|
|
|
|
typedef struct RExC_state_t { |
117
|
|
|
|
|
|
U32 flags; /* RXf_* are we folding, multilining? */ |
118
|
|
|
|
|
|
U32 pm_flags; /* PMf_* stuff from the calling PMOP */ |
119
|
|
|
|
|
|
char *precomp; /* uncompiled string. */ |
120
|
|
|
|
|
|
REGEXP *rx_sv; /* The SV that is the regexp. */ |
121
|
|
|
|
|
|
regexp *rx; /* perl core regexp structure */ |
122
|
|
|
|
|
|
regexp_internal *rxi; /* internal data for regexp object pprivate field */ |
123
|
|
|
|
|
|
char *start; /* Start of input for compile */ |
124
|
|
|
|
|
|
char *end; /* End of input for compile */ |
125
|
|
|
|
|
|
char *parse; /* Input-scan pointer. */ |
126
|
|
|
|
|
|
SSize_t whilem_seen; /* number of WHILEM in this expr */ |
127
|
|
|
|
|
|
regnode *emit_start; /* Start of emitted-code area */ |
128
|
|
|
|
|
|
regnode *emit_bound; /* First regnode outside of the allocated space */ |
129
|
|
|
|
|
|
regnode *emit; /* Code-emit pointer; if = &emit_dummy, |
130
|
|
|
|
|
|
implies compiling, so don't emit */ |
131
|
|
|
|
|
|
regnode emit_dummy; /* placeholder for emit to point to */ |
132
|
|
|
|
|
|
I32 naughty; /* How bad is this pattern? */ |
133
|
|
|
|
|
|
I32 sawback; /* Did we see \1, ...? */ |
134
|
|
|
|
|
|
U32 seen; |
135
|
|
|
|
|
|
SSize_t size; /* Code size. */ |
136
|
|
|
|
|
|
I32 npar; /* Capture buffer count, (OPEN). */ |
137
|
|
|
|
|
|
I32 cpar; /* Capture buffer count, (CLOSE). */ |
138
|
|
|
|
|
|
I32 nestroot; /* root parens we are in - used by accept */ |
139
|
|
|
|
|
|
I32 extralen; |
140
|
|
|
|
|
|
I32 seen_zerolen; |
141
|
|
|
|
|
|
regnode **open_parens; /* pointers to open parens */ |
142
|
|
|
|
|
|
regnode **close_parens; /* pointers to close parens */ |
143
|
|
|
|
|
|
regnode *opend; /* END node in program */ |
144
|
|
|
|
|
|
I32 utf8; /* whether the pattern is utf8 or not */ |
145
|
|
|
|
|
|
I32 orig_utf8; /* whether the pattern was originally in utf8 */ |
146
|
|
|
|
|
|
/* XXX use this for future optimisation of case |
147
|
|
|
|
|
|
* where pattern must be upgraded to utf8. */ |
148
|
|
|
|
|
|
I32 uni_semantics; /* If a d charset modifier should use unicode |
149
|
|
|
|
|
|
rules, even if the pattern is not in |
150
|
|
|
|
|
|
utf8 */ |
151
|
|
|
|
|
|
HV *paren_names; /* Paren names */ |
152
|
|
|
|
|
|
|
153
|
|
|
|
|
|
regnode **recurse; /* Recurse regops */ |
154
|
|
|
|
|
|
I32 recurse_count; /* Number of recurse regops */ |
155
|
|
|
|
|
|
I32 in_lookbehind; |
156
|
|
|
|
|
|
I32 contains_locale; |
157
|
|
|
|
|
|
I32 override_recoding; |
158
|
|
|
|
|
|
I32 in_multi_char_class; |
159
|
|
|
|
|
|
struct reg_code_block *code_blocks; /* positions of literal (?{}) |
160
|
|
|
|
|
|
within pattern */ |
161
|
|
|
|
|
|
int num_code_blocks; /* size of code_blocks[] */ |
162
|
|
|
|
|
|
int code_index; /* next code_blocks[] slot */ |
163
|
|
|
|
|
|
#if ADD_TO_REGEXEC |
164
|
|
|
|
|
|
char *starttry; /* -Dr: where regtry was called. */ |
165
|
|
|
|
|
|
#define RExC_starttry (pRExC_state->starttry) |
166
|
|
|
|
|
|
#endif |
167
|
|
|
|
|
|
SV *runtime_code_qr; /* qr with the runtime code blocks */ |
168
|
|
|
|
|
|
#ifdef DEBUGGING |
169
|
|
|
|
|
|
const char *lastparse; |
170
|
|
|
|
|
|
I32 lastnum; |
171
|
|
|
|
|
|
AV *paren_name_list; /* idx -> name */ |
172
|
|
|
|
|
|
#define RExC_lastparse (pRExC_state->lastparse) |
173
|
|
|
|
|
|
#define RExC_lastnum (pRExC_state->lastnum) |
174
|
|
|
|
|
|
#define RExC_paren_name_list (pRExC_state->paren_name_list) |
175
|
|
|
|
|
|
#endif |
176
|
|
|
|
|
|
} RExC_state_t; |
177
|
|
|
|
|
|
|
178
|
|
|
|
|
|
#define RExC_flags (pRExC_state->flags) |
179
|
|
|
|
|
|
#define RExC_pm_flags (pRExC_state->pm_flags) |
180
|
|
|
|
|
|
#define RExC_precomp (pRExC_state->precomp) |
181
|
|
|
|
|
|
#define RExC_rx_sv (pRExC_state->rx_sv) |
182
|
|
|
|
|
|
#define RExC_rx (pRExC_state->rx) |
183
|
|
|
|
|
|
#define RExC_rxi (pRExC_state->rxi) |
184
|
|
|
|
|
|
#define RExC_start (pRExC_state->start) |
185
|
|
|
|
|
|
#define RExC_end (pRExC_state->end) |
186
|
|
|
|
|
|
#define RExC_parse (pRExC_state->parse) |
187
|
|
|
|
|
|
#define RExC_whilem_seen (pRExC_state->whilem_seen) |
188
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
189
|
|
|
|
|
|
#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ |
190
|
|
|
|
|
|
#endif |
191
|
|
|
|
|
|
#define RExC_emit (pRExC_state->emit) |
192
|
|
|
|
|
|
#define RExC_emit_dummy (pRExC_state->emit_dummy) |
193
|
|
|
|
|
|
#define RExC_emit_start (pRExC_state->emit_start) |
194
|
|
|
|
|
|
#define RExC_emit_bound (pRExC_state->emit_bound) |
195
|
|
|
|
|
|
#define RExC_naughty (pRExC_state->naughty) |
196
|
|
|
|
|
|
#define RExC_sawback (pRExC_state->sawback) |
197
|
|
|
|
|
|
#define RExC_seen (pRExC_state->seen) |
198
|
|
|
|
|
|
#define RExC_size (pRExC_state->size) |
199
|
|
|
|
|
|
#define RExC_npar (pRExC_state->npar) |
200
|
|
|
|
|
|
#define RExC_nestroot (pRExC_state->nestroot) |
201
|
|
|
|
|
|
#define RExC_extralen (pRExC_state->extralen) |
202
|
|
|
|
|
|
#define RExC_seen_zerolen (pRExC_state->seen_zerolen) |
203
|
|
|
|
|
|
#define RExC_utf8 (pRExC_state->utf8) |
204
|
|
|
|
|
|
#define RExC_uni_semantics (pRExC_state->uni_semantics) |
205
|
|
|
|
|
|
#define RExC_orig_utf8 (pRExC_state->orig_utf8) |
206
|
|
|
|
|
|
#define RExC_open_parens (pRExC_state->open_parens) |
207
|
|
|
|
|
|
#define RExC_close_parens (pRExC_state->close_parens) |
208
|
|
|
|
|
|
#define RExC_opend (pRExC_state->opend) |
209
|
|
|
|
|
|
#define RExC_paren_names (pRExC_state->paren_names) |
210
|
|
|
|
|
|
#define RExC_recurse (pRExC_state->recurse) |
211
|
|
|
|
|
|
#define RExC_recurse_count (pRExC_state->recurse_count) |
212
|
|
|
|
|
|
#define RExC_in_lookbehind (pRExC_state->in_lookbehind) |
213
|
|
|
|
|
|
#define RExC_contains_locale (pRExC_state->contains_locale) |
214
|
|
|
|
|
|
#define RExC_override_recoding (pRExC_state->override_recoding) |
215
|
|
|
|
|
|
#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) |
216
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
218
|
|
|
|
|
|
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') |
219
|
|
|
|
|
|
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ |
220
|
|
|
|
|
|
((*s) == '{' && regcurly(s, FALSE))) |
221
|
|
|
|
|
|
|
222
|
|
|
|
|
|
#ifdef SPSTART |
223
|
|
|
|
|
|
#undef SPSTART /* dratted cpp namespace... */ |
224
|
|
|
|
|
|
#endif |
225
|
|
|
|
|
|
/* |
226
|
|
|
|
|
|
* Flags to be passed up and down. |
227
|
|
|
|
|
|
*/ |
228
|
|
|
|
|
|
#define WORST 0 /* Worst case. */ |
229
|
|
|
|
|
|
#define HASWIDTH 0x01 /* Known to match non-null strings. */ |
230
|
|
|
|
|
|
|
231
|
|
|
|
|
|
/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single |
232
|
|
|
|
|
|
* character. (There needs to be a case: in the switch statement in regexec.c |
233
|
|
|
|
|
|
* for any node marked SIMPLE.) Note that this is not the same thing as |
234
|
|
|
|
|
|
* REGNODE_SIMPLE */ |
235
|
|
|
|
|
|
#define SIMPLE 0x02 |
236
|
|
|
|
|
|
#define SPSTART 0x04 /* Starts with * or + */ |
237
|
|
|
|
|
|
#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ |
238
|
|
|
|
|
|
#define TRYAGAIN 0x10 /* Weeded out a declaration. */ |
239
|
|
|
|
|
|
#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */ |
240
|
|
|
|
|
|
|
241
|
|
|
|
|
|
#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) |
242
|
|
|
|
|
|
|
243
|
|
|
|
|
|
/* whether trie related optimizations are enabled */ |
244
|
|
|
|
|
|
#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION |
245
|
|
|
|
|
|
#define TRIE_STUDY_OPT |
246
|
|
|
|
|
|
#define FULL_TRIE_STUDY |
247
|
|
|
|
|
|
#define TRIE_STCLASS |
248
|
|
|
|
|
|
#endif |
249
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
252
|
|
|
|
|
|
#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] |
253
|
|
|
|
|
|
#define PBITVAL(paren) (1 << ((paren) & 7)) |
254
|
|
|
|
|
|
#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) |
255
|
|
|
|
|
|
#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) |
256
|
|
|
|
|
|
#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) |
257
|
|
|
|
|
|
|
258
|
|
|
|
|
|
#define REQUIRE_UTF8 STMT_START { \ |
259
|
|
|
|
|
|
if (!UTF) { \ |
260
|
|
|
|
|
|
*flagp = RESTART_UTF8; \ |
261
|
|
|
|
|
|
return NULL; \ |
262
|
|
|
|
|
|
} \ |
263
|
|
|
|
|
|
} STMT_END |
264
|
|
|
|
|
|
|
265
|
|
|
|
|
|
/* This converts the named class defined in regcomp.h to its equivalent class |
266
|
|
|
|
|
|
* number defined in handy.h. */ |
267
|
|
|
|
|
|
#define namedclass_to_classnum(class) ((int) ((class) / 2)) |
268
|
|
|
|
|
|
#define classnum_to_namedclass(classnum) ((classnum) * 2) |
269
|
|
|
|
|
|
|
270
|
|
|
|
|
|
/* About scan_data_t. |
271
|
|
|
|
|
|
|
272
|
|
|
|
|
|
During optimisation we recurse through the regexp program performing |
273
|
|
|
|
|
|
various inplace (keyhole style) optimisations. In addition study_chunk |
274
|
|
|
|
|
|
and scan_commit populate this data structure with information about |
275
|
|
|
|
|
|
what strings MUST appear in the pattern. We look for the longest |
276
|
|
|
|
|
|
string that must appear at a fixed location, and we look for the |
277
|
|
|
|
|
|
longest string that may appear at a floating location. So for instance |
278
|
|
|
|
|
|
in the pattern: |
279
|
|
|
|
|
|
|
280
|
|
|
|
|
|
/FOO[xX]A.*B[xX]BAR/ |
281
|
|
|
|
|
|
|
282
|
|
|
|
|
|
Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating |
283
|
|
|
|
|
|
strings (because they follow a .* construct). study_chunk will identify |
284
|
|
|
|
|
|
both FOO and BAR as being the longest fixed and floating strings respectively. |
285
|
|
|
|
|
|
|
286
|
|
|
|
|
|
The strings can be composites, for instance |
287
|
|
|
|
|
|
|
288
|
|
|
|
|
|
/(f)(o)(o)/ |
289
|
|
|
|
|
|
|
290
|
|
|
|
|
|
will result in a composite fixed substring 'foo'. |
291
|
|
|
|
|
|
|
292
|
|
|
|
|
|
For each string some basic information is maintained: |
293
|
|
|
|
|
|
|
294
|
|
|
|
|
|
- offset or min_offset |
295
|
|
|
|
|
|
This is the position the string must appear at, or not before. |
296
|
|
|
|
|
|
It also implicitly (when combined with minlenp) tells us how many |
297
|
|
|
|
|
|
characters must match before the string we are searching for. |
298
|
|
|
|
|
|
Likewise when combined with minlenp and the length of the string it |
299
|
|
|
|
|
|
tells us how many characters must appear after the string we have |
300
|
|
|
|
|
|
found. |
301
|
|
|
|
|
|
|
302
|
|
|
|
|
|
- max_offset |
303
|
|
|
|
|
|
Only used for floating strings. This is the rightmost point that |
304
|
|
|
|
|
|
the string can appear at. If set to SSize_t_MAX it indicates that the |
305
|
|
|
|
|
|
string can occur infinitely far to the right. |
306
|
|
|
|
|
|
|
307
|
|
|
|
|
|
- minlenp |
308
|
|
|
|
|
|
A pointer to the minimum number of characters of the pattern that the |
309
|
|
|
|
|
|
string was found inside. This is important as in the case of positive |
310
|
|
|
|
|
|
lookahead or positive lookbehind we can have multiple patterns |
311
|
|
|
|
|
|
involved. Consider |
312
|
|
|
|
|
|
|
313
|
|
|
|
|
|
/(?=FOO).*F/ |
314
|
|
|
|
|
|
|
315
|
|
|
|
|
|
The minimum length of the pattern overall is 3, the minimum length |
316
|
|
|
|
|
|
of the lookahead part is 3, but the minimum length of the part that |
317
|
|
|
|
|
|
will actually match is 1. So 'FOO's minimum length is 3, but the |
318
|
|
|
|
|
|
minimum length for the F is 1. This is important as the minimum length |
319
|
|
|
|
|
|
is used to determine offsets in front of and behind the string being |
320
|
|
|
|
|
|
looked for. Since strings can be composites this is the length of the |
321
|
|
|
|
|
|
pattern at the time it was committed with a scan_commit. Note that |
322
|
|
|
|
|
|
the length is calculated by study_chunk, so that the minimum lengths |
323
|
|
|
|
|
|
are not known until the full pattern has been compiled, thus the |
324
|
|
|
|
|
|
pointer to the value. |
325
|
|
|
|
|
|
|
326
|
|
|
|
|
|
- lookbehind |
327
|
|
|
|
|
|
|
328
|
|
|
|
|
|
In the case of lookbehind the string being searched for can be |
329
|
|
|
|
|
|
offset past the start point of the final matching string. |
330
|
|
|
|
|
|
If this value was just blithely removed from the min_offset it would |
331
|
|
|
|
|
|
invalidate some of the calculations for how many chars must match |
332
|
|
|
|
|
|
before or after (as they are derived from min_offset and minlen and |
333
|
|
|
|
|
|
the length of the string being searched for). |
334
|
|
|
|
|
|
When the final pattern is compiled and the data is moved from the |
335
|
|
|
|
|
|
scan_data_t structure into the regexp structure the information |
336
|
|
|
|
|
|
about lookbehind is factored in, with the information that would |
337
|
|
|
|
|
|
have been lost precalculated in the end_shift field for the |
338
|
|
|
|
|
|
associated string. |
339
|
|
|
|
|
|
|
340
|
|
|
|
|
|
The fields pos_min and pos_delta are used to store the minimum offset |
341
|
|
|
|
|
|
and the delta to the maximum offset at the current point in the pattern. |
342
|
|
|
|
|
|
|
343
|
|
|
|
|
|
*/ |
344
|
|
|
|
|
|
|
345
|
|
|
|
|
|
typedef struct scan_data_t { |
346
|
|
|
|
|
|
/*I32 len_min; unused */ |
347
|
|
|
|
|
|
/*I32 len_delta; unused */ |
348
|
|
|
|
|
|
SSize_t pos_min; |
349
|
|
|
|
|
|
SSize_t pos_delta; |
350
|
|
|
|
|
|
SV *last_found; |
351
|
|
|
|
|
|
SSize_t last_end; /* min value, <0 unless valid. */ |
352
|
|
|
|
|
|
SSize_t last_start_min; |
353
|
|
|
|
|
|
SSize_t last_start_max; |
354
|
|
|
|
|
|
SV **longest; /* Either &l_fixed, or &l_float. */ |
355
|
|
|
|
|
|
SV *longest_fixed; /* longest fixed string found in pattern */ |
356
|
|
|
|
|
|
SSize_t offset_fixed; /* offset where it starts */ |
357
|
|
|
|
|
|
SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ |
358
|
|
|
|
|
|
I32 lookbehind_fixed; /* is the position of the string modfied by LB */ |
359
|
|
|
|
|
|
SV *longest_float; /* longest floating string found in pattern */ |
360
|
|
|
|
|
|
SSize_t offset_float_min; /* earliest point in string it can appear */ |
361
|
|
|
|
|
|
SSize_t offset_float_max; /* latest point in string it can appear */ |
362
|
|
|
|
|
|
SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ |
363
|
|
|
|
|
|
SSize_t lookbehind_float; /* is the pos of the string modified by LB */ |
364
|
|
|
|
|
|
I32 flags; |
365
|
|
|
|
|
|
I32 whilem_c; |
366
|
|
|
|
|
|
SSize_t *last_closep; |
367
|
|
|
|
|
|
struct regnode_charclass_class *start_class; |
368
|
|
|
|
|
|
} scan_data_t; |
369
|
|
|
|
|
|
|
370
|
|
|
|
|
|
/* The below is perhaps overboard, but this allows us to save a test at the |
371
|
|
|
|
|
|
* expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' |
372
|
|
|
|
|
|
* and 'a' differ by a single bit; the same with the upper and lower case of |
373
|
|
|
|
|
|
* all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; |
374
|
|
|
|
|
|
* on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and |
375
|
|
|
|
|
|
* then inverts it to form a mask, with just a single 0, in the bit position |
376
|
|
|
|
|
|
* where the upper- and lowercase differ. XXX There are about 40 other |
377
|
|
|
|
|
|
* instances in the Perl core where this micro-optimization could be used. |
378
|
|
|
|
|
|
* Should decide if maintenance cost is worse, before changing those |
379
|
|
|
|
|
|
* |
380
|
|
|
|
|
|
* Returns a boolean as to whether or not 'v' is either a lowercase or |
381
|
|
|
|
|
|
* uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a |
382
|
|
|
|
|
|
* compile-time constant, the generated code is better than some optimizing |
383
|
|
|
|
|
|
* compilers figure out, amounting to a mask and test. The results are |
384
|
|
|
|
|
|
* meaningless if 'c' is not one of [A-Za-z] */ |
385
|
|
|
|
|
|
#define isARG2_lower_or_UPPER_ARG1(c, v) \ |
386
|
|
|
|
|
|
(((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) |
387
|
|
|
|
|
|
|
388
|
|
|
|
|
|
/* |
389
|
|
|
|
|
|
* Forward declarations for pregcomp()'s friends. |
390
|
|
|
|
|
|
*/ |
391
|
|
|
|
|
|
|
392
|
|
|
|
|
|
static const scan_data_t zero_scan_data = |
393
|
|
|
|
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; |
394
|
|
|
|
|
|
|
395
|
|
|
|
|
|
#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) |
396
|
|
|
|
|
|
#define SF_BEFORE_SEOL 0x0001 |
397
|
|
|
|
|
|
#define SF_BEFORE_MEOL 0x0002 |
398
|
|
|
|
|
|
#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) |
399
|
|
|
|
|
|
#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) |
400
|
|
|
|
|
|
|
401
|
|
|
|
|
|
#ifdef NO_UNARY_PLUS |
402
|
|
|
|
|
|
# define SF_FIX_SHIFT_EOL (0+2) |
403
|
|
|
|
|
|
# define SF_FL_SHIFT_EOL (0+4) |
404
|
|
|
|
|
|
#else |
405
|
|
|
|
|
|
# define SF_FIX_SHIFT_EOL (+2) |
406
|
|
|
|
|
|
# define SF_FL_SHIFT_EOL (+4) |
407
|
|
|
|
|
|
#endif |
408
|
|
|
|
|
|
|
409
|
|
|
|
|
|
#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) |
410
|
|
|
|
|
|
#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) |
411
|
|
|
|
|
|
|
412
|
|
|
|
|
|
#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) |
413
|
|
|
|
|
|
#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ |
414
|
|
|
|
|
|
#define SF_IS_INF 0x0040 |
415
|
|
|
|
|
|
#define SF_HAS_PAR 0x0080 |
416
|
|
|
|
|
|
#define SF_IN_PAR 0x0100 |
417
|
|
|
|
|
|
#define SF_HAS_EVAL 0x0200 |
418
|
|
|
|
|
|
#define SCF_DO_SUBSTR 0x0400 |
419
|
|
|
|
|
|
#define SCF_DO_STCLASS_AND 0x0800 |
420
|
|
|
|
|
|
#define SCF_DO_STCLASS_OR 0x1000 |
421
|
|
|
|
|
|
#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) |
422
|
|
|
|
|
|
#define SCF_WHILEM_VISITED_POS 0x2000 |
423
|
|
|
|
|
|
|
424
|
|
|
|
|
|
#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ |
425
|
|
|
|
|
|
#define SCF_SEEN_ACCEPT 0x8000 |
426
|
|
|
|
|
|
#define SCF_TRIE_DOING_RESTUDY 0x10000 |
427
|
|
|
|
|
|
|
428
|
|
|
|
|
|
#define UTF cBOOL(RExC_utf8) |
429
|
|
|
|
|
|
|
430
|
|
|
|
|
|
/* The enums for all these are ordered so things work out correctly */ |
431
|
|
|
|
|
|
#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) |
432
|
|
|
|
|
|
#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) |
433
|
|
|
|
|
|
#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) |
434
|
|
|
|
|
|
#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) |
435
|
|
|
|
|
|
#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) |
436
|
|
|
|
|
|
#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) |
437
|
|
|
|
|
|
#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) |
438
|
|
|
|
|
|
|
439
|
|
|
|
|
|
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) |
440
|
|
|
|
|
|
|
441
|
|
|
|
|
|
#define OOB_NAMEDCLASS -1 |
442
|
|
|
|
|
|
|
443
|
|
|
|
|
|
/* There is no code point that is out-of-bounds, so this is problematic. But |
444
|
|
|
|
|
|
* its only current use is to initialize a variable that is always set before |
445
|
|
|
|
|
|
* looked at. */ |
446
|
|
|
|
|
|
#define OOB_UNICODE 0xDEADBEEF |
447
|
|
|
|
|
|
|
448
|
|
|
|
|
|
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) |
449
|
|
|
|
|
|
#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) |
450
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
452
|
|
|
|
|
|
/* length of regex to show in messages that don't mark a position within */ |
453
|
|
|
|
|
|
#define RegexLengthToShowInErrorMessages 127 |
454
|
|
|
|
|
|
|
455
|
|
|
|
|
|
/* |
456
|
|
|
|
|
|
* If MARKER[12] are adjusted, be sure to adjust the constants at the top |
457
|
|
|
|
|
|
* of t/op/regmesg.t, the tests in t/op/re_tests, and those in |
458
|
|
|
|
|
|
* op/pragma/warn/regcomp. |
459
|
|
|
|
|
|
*/ |
460
|
|
|
|
|
|
#define MARKER1 "<-- HERE" /* marker as it appears in the description */ |
461
|
|
|
|
|
|
#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ |
462
|
|
|
|
|
|
|
463
|
|
|
|
|
|
#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" |
464
|
|
|
|
|
|
|
465
|
|
|
|
|
|
/* |
466
|
|
|
|
|
|
* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given |
467
|
|
|
|
|
|
* arg. Show regex, up to a maximum length. If it's too long, chop and add |
468
|
|
|
|
|
|
* "...". |
469
|
|
|
|
|
|
*/ |
470
|
|
|
|
|
|
#define _FAIL(code) STMT_START { \ |
471
|
|
|
|
|
|
const char *ellipses = ""; \ |
472
|
|
|
|
|
|
IV len = RExC_end - RExC_precomp; \ |
473
|
|
|
|
|
|
\ |
474
|
|
|
|
|
|
if (!SIZE_ONLY) \ |
475
|
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); \ |
476
|
|
|
|
|
|
if (len > RegexLengthToShowInErrorMessages) { \ |
477
|
|
|
|
|
|
/* chop 10 shorter than the max, to ensure meaning of "..." */ \ |
478
|
|
|
|
|
|
len = RegexLengthToShowInErrorMessages - 10; \ |
479
|
|
|
|
|
|
ellipses = "..."; \ |
480
|
|
|
|
|
|
} \ |
481
|
|
|
|
|
|
code; \ |
482
|
|
|
|
|
|
} STMT_END |
483
|
|
|
|
|
|
|
484
|
|
|
|
|
|
#define FAIL(msg) _FAIL( \ |
485
|
|
|
|
|
|
Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ |
486
|
|
|
|
|
|
msg, (int)len, RExC_precomp, ellipses)) |
487
|
|
|
|
|
|
|
488
|
|
|
|
|
|
#define FAIL2(msg,arg) _FAIL( \ |
489
|
|
|
|
|
|
Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ |
490
|
|
|
|
|
|
arg, (int)len, RExC_precomp, ellipses)) |
491
|
|
|
|
|
|
|
492
|
|
|
|
|
|
/* |
493
|
|
|
|
|
|
* Simple_vFAIL -- like FAIL, but marks the current location in the scan |
494
|
|
|
|
|
|
*/ |
495
|
|
|
|
|
|
#define Simple_vFAIL(m) STMT_START { \ |
496
|
|
|
|
|
|
const IV offset = RExC_parse - RExC_precomp; \ |
497
|
|
|
|
|
|
Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ |
498
|
|
|
|
|
|
m, (int)offset, RExC_precomp, RExC_precomp + offset); \ |
499
|
|
|
|
|
|
} STMT_END |
500
|
|
|
|
|
|
|
501
|
|
|
|
|
|
/* |
502
|
|
|
|
|
|
* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() |
503
|
|
|
|
|
|
*/ |
504
|
|
|
|
|
|
#define vFAIL(m) STMT_START { \ |
505
|
|
|
|
|
|
if (!SIZE_ONLY) \ |
506
|
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); \ |
507
|
|
|
|
|
|
Simple_vFAIL(m); \ |
508
|
|
|
|
|
|
} STMT_END |
509
|
|
|
|
|
|
|
510
|
|
|
|
|
|
/* |
511
|
|
|
|
|
|
* Like Simple_vFAIL(), but accepts two arguments. |
512
|
|
|
|
|
|
*/ |
513
|
|
|
|
|
|
#define Simple_vFAIL2(m,a1) STMT_START { \ |
514
|
|
|
|
|
|
const IV offset = RExC_parse - RExC_precomp; \ |
515
|
|
|
|
|
|
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ |
516
|
|
|
|
|
|
(int)offset, RExC_precomp, RExC_precomp + offset); \ |
517
|
|
|
|
|
|
} STMT_END |
518
|
|
|
|
|
|
|
519
|
|
|
|
|
|
/* |
520
|
|
|
|
|
|
* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). |
521
|
|
|
|
|
|
*/ |
522
|
|
|
|
|
|
#define vFAIL2(m,a1) STMT_START { \ |
523
|
|
|
|
|
|
if (!SIZE_ONLY) \ |
524
|
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); \ |
525
|
|
|
|
|
|
Simple_vFAIL2(m, a1); \ |
526
|
|
|
|
|
|
} STMT_END |
527
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
529
|
|
|
|
|
|
/* |
530
|
|
|
|
|
|
* Like Simple_vFAIL(), but accepts three arguments. |
531
|
|
|
|
|
|
*/ |
532
|
|
|
|
|
|
#define Simple_vFAIL3(m, a1, a2) STMT_START { \ |
533
|
|
|
|
|
|
const IV offset = RExC_parse - RExC_precomp; \ |
534
|
|
|
|
|
|
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ |
535
|
|
|
|
|
|
(int)offset, RExC_precomp, RExC_precomp + offset); \ |
536
|
|
|
|
|
|
} STMT_END |
537
|
|
|
|
|
|
|
538
|
|
|
|
|
|
/* |
539
|
|
|
|
|
|
* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). |
540
|
|
|
|
|
|
*/ |
541
|
|
|
|
|
|
#define vFAIL3(m,a1,a2) STMT_START { \ |
542
|
|
|
|
|
|
if (!SIZE_ONLY) \ |
543
|
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); \ |
544
|
|
|
|
|
|
Simple_vFAIL3(m, a1, a2); \ |
545
|
|
|
|
|
|
} STMT_END |
546
|
|
|
|
|
|
|
547
|
|
|
|
|
|
/* |
548
|
|
|
|
|
|
* Like Simple_vFAIL(), but accepts four arguments. |
549
|
|
|
|
|
|
*/ |
550
|
|
|
|
|
|
#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ |
551
|
|
|
|
|
|
const IV offset = RExC_parse - RExC_precomp; \ |
552
|
|
|
|
|
|
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ |
553
|
|
|
|
|
|
(int)offset, RExC_precomp, RExC_precomp + offset); \ |
554
|
|
|
|
|
|
} STMT_END |
555
|
|
|
|
|
|
|
556
|
|
|
|
|
|
#define vFAIL4(m,a1,a2,a3) STMT_START { \ |
557
|
|
|
|
|
|
if (!SIZE_ONLY) \ |
558
|
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); \ |
559
|
|
|
|
|
|
Simple_vFAIL4(m, a1, a2, a3); \ |
560
|
|
|
|
|
|
} STMT_END |
561
|
|
|
|
|
|
|
562
|
|
|
|
|
|
/* m is not necessarily a "literal string", in this macro */ |
563
|
|
|
|
|
|
#define reg_warn_non_literal_string(loc, m) STMT_START { \ |
564
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
565
|
|
|
|
|
|
Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ |
566
|
|
|
|
|
|
m, (int)offset, RExC_precomp, RExC_precomp + offset); \ |
567
|
|
|
|
|
|
} STMT_END |
568
|
|
|
|
|
|
|
569
|
|
|
|
|
|
#define ckWARNreg(loc,m) STMT_START { \ |
570
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
571
|
|
|
|
|
|
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ |
572
|
|
|
|
|
|
(int)offset, RExC_precomp, RExC_precomp + offset); \ |
573
|
|
|
|
|
|
} STMT_END |
574
|
|
|
|
|
|
|
575
|
|
|
|
|
|
#define vWARN_dep(loc, m) STMT_START { \ |
576
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
577
|
|
|
|
|
|
Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ |
578
|
|
|
|
|
|
(int)offset, RExC_precomp, RExC_precomp + offset); \ |
579
|
|
|
|
|
|
} STMT_END |
580
|
|
|
|
|
|
|
581
|
|
|
|
|
|
#define ckWARNdep(loc,m) STMT_START { \ |
582
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
583
|
|
|
|
|
|
Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ |
584
|
|
|
|
|
|
m REPORT_LOCATION, \ |
585
|
|
|
|
|
|
(int)offset, RExC_precomp, RExC_precomp + offset); \ |
586
|
|
|
|
|
|
} STMT_END |
587
|
|
|
|
|
|
|
588
|
|
|
|
|
|
#define ckWARNregdep(loc,m) STMT_START { \ |
589
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
590
|
|
|
|
|
|
Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ |
591
|
|
|
|
|
|
m REPORT_LOCATION, \ |
592
|
|
|
|
|
|
(int)offset, RExC_precomp, RExC_precomp + offset); \ |
593
|
|
|
|
|
|
} STMT_END |
594
|
|
|
|
|
|
|
595
|
|
|
|
|
|
#define ckWARN2reg_d(loc,m, a1) STMT_START { \ |
596
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
597
|
|
|
|
|
|
Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ |
598
|
|
|
|
|
|
m REPORT_LOCATION, \ |
599
|
|
|
|
|
|
a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ |
600
|
|
|
|
|
|
} STMT_END |
601
|
|
|
|
|
|
|
602
|
|
|
|
|
|
#define ckWARN2reg(loc, m, a1) STMT_START { \ |
603
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
604
|
|
|
|
|
|
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ |
605
|
|
|
|
|
|
a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ |
606
|
|
|
|
|
|
} STMT_END |
607
|
|
|
|
|
|
|
608
|
|
|
|
|
|
#define vWARN3(loc, m, a1, a2) STMT_START { \ |
609
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
610
|
|
|
|
|
|
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ |
611
|
|
|
|
|
|
a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ |
612
|
|
|
|
|
|
} STMT_END |
613
|
|
|
|
|
|
|
614
|
|
|
|
|
|
#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ |
615
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
616
|
|
|
|
|
|
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ |
617
|
|
|
|
|
|
a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ |
618
|
|
|
|
|
|
} STMT_END |
619
|
|
|
|
|
|
|
620
|
|
|
|
|
|
#define vWARN4(loc, m, a1, a2, a3) STMT_START { \ |
621
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
622
|
|
|
|
|
|
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ |
623
|
|
|
|
|
|
a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ |
624
|
|
|
|
|
|
} STMT_END |
625
|
|
|
|
|
|
|
626
|
|
|
|
|
|
#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ |
627
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
628
|
|
|
|
|
|
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ |
629
|
|
|
|
|
|
a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ |
630
|
|
|
|
|
|
} STMT_END |
631
|
|
|
|
|
|
|
632
|
|
|
|
|
|
#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ |
633
|
|
|
|
|
|
const IV offset = loc - RExC_precomp; \ |
634
|
|
|
|
|
|
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ |
635
|
|
|
|
|
|
a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ |
636
|
|
|
|
|
|
} STMT_END |
637
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
639
|
|
|
|
|
|
/* Allow for side effects in s */ |
640
|
|
|
|
|
|
#define REGC(c,s) STMT_START { \ |
641
|
|
|
|
|
|
if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ |
642
|
|
|
|
|
|
} STMT_END |
643
|
|
|
|
|
|
|
644
|
|
|
|
|
|
/* Macros for recording node offsets. 20001227 mjd@plover.com |
645
|
|
|
|
|
|
* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in |
646
|
|
|
|
|
|
* element 2*n-1 of the array. Element #2n holds the byte length node #n. |
647
|
|
|
|
|
|
* Element 0 holds the number n. |
648
|
|
|
|
|
|
* Position is 1 indexed. |
649
|
|
|
|
|
|
*/ |
650
|
|
|
|
|
|
#ifndef RE_TRACK_PATTERN_OFFSETS |
651
|
|
|
|
|
|
#define Set_Node_Offset_To_R(node,byte) |
652
|
|
|
|
|
|
#define Set_Node_Offset(node,byte) |
653
|
|
|
|
|
|
#define Set_Cur_Node_Offset |
654
|
|
|
|
|
|
#define Set_Node_Length_To_R(node,len) |
655
|
|
|
|
|
|
#define Set_Node_Length(node,len) |
656
|
|
|
|
|
|
#define Set_Node_Cur_Length(node,start) |
657
|
|
|
|
|
|
#define Node_Offset(n) |
658
|
|
|
|
|
|
#define Node_Length(n) |
659
|
|
|
|
|
|
#define Set_Node_Offset_Length(node,offset,len) |
660
|
|
|
|
|
|
#define ProgLen(ri) ri->u.proglen |
661
|
|
|
|
|
|
#define SetProgLen(ri,x) ri->u.proglen = x |
662
|
|
|
|
|
|
#else |
663
|
|
|
|
|
|
#define ProgLen(ri) ri->u.offsets[0] |
664
|
|
|
|
|
|
#define SetProgLen(ri,x) ri->u.offsets[0] = x |
665
|
|
|
|
|
|
#define Set_Node_Offset_To_R(node,byte) STMT_START { \ |
666
|
|
|
|
|
|
if (! SIZE_ONLY) { \ |
667
|
|
|
|
|
|
MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ |
668
|
|
|
|
|
|
__LINE__, (int)(node), (int)(byte))); \ |
669
|
|
|
|
|
|
if((node) < 0) { \ |
670
|
|
|
|
|
|
Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ |
671
|
|
|
|
|
|
} else { \ |
672
|
|
|
|
|
|
RExC_offsets[2*(node)-1] = (byte); \ |
673
|
|
|
|
|
|
} \ |
674
|
|
|
|
|
|
} \ |
675
|
|
|
|
|
|
} STMT_END |
676
|
|
|
|
|
|
|
677
|
|
|
|
|
|
#define Set_Node_Offset(node,byte) \ |
678
|
|
|
|
|
|
Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) |
679
|
|
|
|
|
|
#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) |
680
|
|
|
|
|
|
|
681
|
|
|
|
|
|
#define Set_Node_Length_To_R(node,len) STMT_START { \ |
682
|
|
|
|
|
|
if (! SIZE_ONLY) { \ |
683
|
|
|
|
|
|
MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ |
684
|
|
|
|
|
|
__LINE__, (int)(node), (int)(len))); \ |
685
|
|
|
|
|
|
if((node) < 0) { \ |
686
|
|
|
|
|
|
Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ |
687
|
|
|
|
|
|
} else { \ |
688
|
|
|
|
|
|
RExC_offsets[2*(node)] = (len); \ |
689
|
|
|
|
|
|
} \ |
690
|
|
|
|
|
|
} \ |
691
|
|
|
|
|
|
} STMT_END |
692
|
|
|
|
|
|
|
693
|
|
|
|
|
|
#define Set_Node_Length(node,len) \ |
694
|
|
|
|
|
|
Set_Node_Length_To_R((node)-RExC_emit_start, len) |
695
|
|
|
|
|
|
#define Set_Node_Cur_Length(node, start) \ |
696
|
|
|
|
|
|
Set_Node_Length(node, RExC_parse - start) |
697
|
|
|
|
|
|
|
698
|
|
|
|
|
|
/* Get offsets and lengths */ |
699
|
|
|
|
|
|
#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) |
700
|
|
|
|
|
|
#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) |
701
|
|
|
|
|
|
|
702
|
|
|
|
|
|
#define Set_Node_Offset_Length(node,offset,len) STMT_START { \ |
703
|
|
|
|
|
|
Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ |
704
|
|
|
|
|
|
Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ |
705
|
|
|
|
|
|
} STMT_END |
706
|
|
|
|
|
|
#endif |
707
|
|
|
|
|
|
|
708
|
|
|
|
|
|
#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS |
709
|
|
|
|
|
|
#define EXPERIMENTAL_INPLACESCAN |
710
|
|
|
|
|
|
#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ |
711
|
|
|
|
|
|
|
712
|
|
|
|
|
|
#define DEBUG_STUDYDATA(str,data,depth) \ |
713
|
|
|
|
|
|
DEBUG_OPTIMISE_MORE_r(if(data){ \ |
714
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, \ |
715
|
|
|
|
|
|
"%*s" str "Pos:%"IVdf"/%"IVdf \ |
716
|
|
|
|
|
|
" Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ |
717
|
|
|
|
|
|
(int)(depth)*2, "", \ |
718
|
|
|
|
|
|
(IV)((data)->pos_min), \ |
719
|
|
|
|
|
|
(IV)((data)->pos_delta), \ |
720
|
|
|
|
|
|
(UV)((data)->flags), \ |
721
|
|
|
|
|
|
(IV)((data)->whilem_c), \ |
722
|
|
|
|
|
|
(IV)((data)->last_closep ? *((data)->last_closep) : -1), \ |
723
|
|
|
|
|
|
is_inf ? "INF " : "" \ |
724
|
|
|
|
|
|
); \ |
725
|
|
|
|
|
|
if ((data)->last_found) \ |
726
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, \ |
727
|
|
|
|
|
|
"Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ |
728
|
|
|
|
|
|
" %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ |
729
|
|
|
|
|
|
SvPVX_const((data)->last_found), \ |
730
|
|
|
|
|
|
(IV)((data)->last_end), \ |
731
|
|
|
|
|
|
(IV)((data)->last_start_min), \ |
732
|
|
|
|
|
|
(IV)((data)->last_start_max), \ |
733
|
|
|
|
|
|
((data)->longest && \ |
734
|
|
|
|
|
|
(data)->longest==&((data)->longest_fixed)) ? "*" : "", \ |
735
|
|
|
|
|
|
SvPVX_const((data)->longest_fixed), \ |
736
|
|
|
|
|
|
(IV)((data)->offset_fixed), \ |
737
|
|
|
|
|
|
((data)->longest && \ |
738
|
|
|
|
|
|
(data)->longest==&((data)->longest_float)) ? "*" : "", \ |
739
|
|
|
|
|
|
SvPVX_const((data)->longest_float), \ |
740
|
|
|
|
|
|
(IV)((data)->offset_float_min), \ |
741
|
|
|
|
|
|
(IV)((data)->offset_float_max) \ |
742
|
|
|
|
|
|
); \ |
743
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,"\n"); \ |
744
|
|
|
|
|
|
}); |
745
|
|
|
|
|
|
|
746
|
|
|
|
|
|
/* Mark that we cannot extend a found fixed substring at this point. |
747
|
|
|
|
|
|
Update the longest found anchored substring and the longest found |
748
|
|
|
|
|
|
floating substrings if needed. */ |
749
|
|
|
|
|
|
|
750
|
|
|
|
|
|
STATIC void |
751
|
24403259
|
|
|
|
|
S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, |
752
|
|
|
|
|
|
SSize_t *minlenp, int is_inf) |
753
|
|
|
|
|
|
{ |
754
|
24403259
|
|
|
|
|
const STRLEN l = CHR_SVLEN(data->last_found); |
755
|
24403259
|
|
|
|
|
const STRLEN old_l = CHR_SVLEN(*data->longest); |
756
|
24403259
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
757
|
|
|
|
|
|
|
758
|
9017128
|
|
|
|
|
PERL_ARGS_ASSERT_SCAN_COMMIT; |
759
|
|
|
|
|
|
|
760
|
9017128
|
|
|
|
|
if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { |
761
|
4708910
|
|
|
|
|
SvSetMagicSV(*data->longest, data->last_found); |
762
|
4708910
|
|
|
|
|
if (*data->longest == data->longest_fixed) { |
763
|
519868
|
|
|
|
|
data->offset_fixed = l ? data->last_start_min : data->pos_min; |
764
|
4189152
|
|
|
|
|
if (data->flags & SF_BEFORE_EOL) |
765
|
|
|
|
|
|
data->flags |
766
|
4708788
|
|
|
|
|
|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); |
767
|
|
|
|
|
|
else |
768
|
4708904
|
|
|
|
|
data->flags &= ~SF_FIX_BEFORE_EOL; |
769
|
4308292
|
|
|
|
|
data->minlen_fixed=minlenp; |
770
|
4308292
|
|
|
|
|
data->lookbehind_fixed=0; |
771
|
|
|
|
|
|
} |
772
|
|
|
|
|
|
else { /* *data->longest == data->longest_float */ |
773
|
4308182
|
|
|
|
|
data->offset_float_min = l ? data->last_start_min : data->pos_min; |
774
|
1181149
|
|
|
|
|
data->offset_float_max = (l |
775
|
|
|
|
|
|
? data->last_start_max |
776
|
4308182
|
|
|
|
|
: (data->pos_delta == SSize_t_MAX |
777
|
|
|
|
|
|
? SSize_t_MAX |
778
|
2937189
|
|
|
|
|
: data->pos_min + data->pos_delta)); |
779
|
1371253
|
|
|
|
|
if (is_inf |
780
|
4308182
|
|
|
|
|
|| (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) |
781
|
1693699
|
|
|
|
|
data->offset_float_max = SSize_t_MAX; |
782
|
2614483
|
|
|
|
|
if (data->flags & SF_BEFORE_EOL) |
783
|
|
|
|
|
|
data->flags |
784
|
4308180
|
|
|
|
|
|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); |
785
|
|
|
|
|
|
else |
786
|
4308178
|
|
|
|
|
data->flags &= ~SF_FL_BEFORE_EOL; |
787
|
24403101
|
|
|
|
|
data->minlen_float=minlenp; |
788
|
24403101
|
|
|
|
|
data->lookbehind_float=0; |
789
|
|
|
|
|
|
} |
790
|
|
|
|
|
|
} |
791
|
24403259
|
|
|
|
|
SvCUR_set(data->last_found, 0); |
792
|
|
|
|
|
|
{ |
793
|
1237694
|
|
|
|
|
SV * const sv = data->last_found; |
794
|
1237694
|
|
|
|
|
if (SvUTF8(sv) && SvMAGICAL(sv)) { |
795
|
1237550
|
|
|
|
|
MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); |
796
|
24403115
|
|
|
|
|
if (mg) |
797
|
24403115
|
|
|
|
|
mg->mg_len = 0; |
798
|
|
|
|
|
|
} |
799
|
|
|
|
|
|
} |
800
|
24403259
|
|
|
|
|
data->last_end = -1; |
801
|
11174063
|
|
|
|
|
data->flags &= ~SF_BEFORE_EOL; |
802
|
11174063
|
|
|
|
|
DEBUG_STUDYDATA("commit: ",data,0); |
803
|
11174063
|
|
|
|
|
} |
804
|
|
|
|
|
|
|
805
|
|
|
|
|
|
/* These macros set, clear and test whether the synthetic start class ('ssc', |
806
|
|
|
|
|
|
* given by the parameter) matches an empty string (EOS). This uses the |
807
|
|
|
|
|
|
* 'next_off' field in the node, to save a bit in the flags field. The ssc |
808
|
|
|
|
|
|
* stands alone, so there is never a next_off, so this field is otherwise |
809
|
|
|
|
|
|
* unused. The EOS information is used only for compilation, but theoretically |
810
|
|
|
|
|
|
* it could be passed on to the execution code. This could be used to store |
811
|
|
|
|
|
|
* more than one bit of information, but only this one is currently used. */ |
812
|
|
|
|
|
|
#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END |
813
|
|
|
|
|
|
#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END |
814
|
|
|
|
|
|
#define TEST_SSC_EOS(node) cBOOL((node)->next_off) |
815
|
|
|
|
|
|
|
816
|
|
|
|
|
|
/* Can match anything (initialization) */ |
817
|
|
|
|
|
|
STATIC void |
818
|
11174083
|
|
|
|
|
S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) |
819
|
|
|
|
|
|
{ |
820
|
11174083
|
|
|
|
|
PERL_ARGS_ASSERT_CL_ANYTHING; |
821
|
|
|
|
|
|
|
822
|
322450
|
|
|
|
|
ANYOF_BITMAP_SETALL(cl); |
823
|
322450
|
|
|
|
|
cl->flags = ANYOF_UNICODE_ALL; |
824
|
10851817
|
|
|
|
|
SET_SSC_EOS(cl); |
825
|
|
|
|
|
|
|
826
|
|
|
|
|
|
/* If any portion of the regex is to operate under locale rules, |
827
|
|
|
|
|
|
* initialization includes it. The reason this isn't done for all regexes |
828
|
|
|
|
|
|
* is that the optimizer was written under the assumption that locale was |
829
|
|
|
|
|
|
* all-or-nothing. Given the complexity and lack of documentation in the |
830
|
|
|
|
|
|
* optimizer, and that there are inadequate test cases for locale, so many |
831
|
|
|
|
|
|
* parts of it may not work properly, it is safest to avoid locale unless |
832
|
|
|
|
|
|
* necessary. */ |
833
|
11174083
|
|
|
|
|
if (RExC_contains_locale) { |
834
|
561264
|
|
|
|
|
ANYOF_CLASS_SETALL(cl); /* /l uses class */ |
835
|
9000016
|
|
|
|
|
cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD; |
836
|
|
|
|
|
|
} |
837
|
|
|
|
|
|
else { |
838
|
8472778
|
|
|
|
|
ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ |
839
|
|
|
|
|
|
} |
840
|
527606
|
|
|
|
|
} |
841
|
|
|
|
|
|
|
842
|
|
|
|
|
|
/* Can match anything (initialization) */ |
843
|
|
|
|
|
|
STATIC int |
844
|
289256
|
|
|
|
|
S_cl_is_anything(const struct regnode_charclass_class *cl) |
845
|
|
|
|
|
|
{ |
846
|
|
|
|
|
|
int value; |
847
|
|
|
|
|
|
|
848
|
323810
|
|
|
|
|
PERL_ARGS_ASSERT_CL_IS_ANYTHING; |
849
|
|
|
|
|
|
|
850
|
11157999
|
|
|
|
|
for (value = 0; value < ANYOF_MAX; value += 2) |
851
|
11157999
|
|
|
|
|
if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) |
852
|
|
|
|
|
|
return 1; |
853
|
11157999
|
|
|
|
|
if (!(cl->flags & ANYOF_UNICODE_ALL)) |
854
|
|
|
|
|
|
return 0; |
855
|
11157999
|
|
|
|
|
if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) |
856
|
|
|
|
|
|
return 0; |
857
|
11157999
|
|
|
|
|
return 1; |
858
|
|
|
|
|
|
} |
859
|
|
|
|
|
|
|
860
|
|
|
|
|
|
/* Can match anything (initialization) */ |
861
|
|
|
|
|
|
STATIC void |
862
|
2996302
|
|
|
|
|
S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) |
863
|
|
|
|
|
|
{ |
864
|
2996302
|
|
|
|
|
PERL_ARGS_ASSERT_CL_INIT; |
865
|
|
|
|
|
|
|
866
|
|
|
|
|
|
Zero(cl, 1, struct regnode_charclass_class); |
867
|
2897626
|
|
|
|
|
cl->type = ANYOF; |
868
|
2765006
|
|
|
|
|
cl_anything(pRExC_state, cl); |
869
|
2765006
|
|
|
|
|
ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); |
870
|
2765006
|
|
|
|
|
} |
871
|
|
|
|
|
|
|
872
|
|
|
|
|
|
/* These two functions currently do the exact same thing */ |
873
|
|
|
|
|
|
#define cl_init_zero cl_init |
874
|
|
|
|
|
|
|
875
|
|
|
|
|
|
/* 'AND' a given class with another one. Can create false positives. 'cl' |
876
|
|
|
|
|
|
* should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if |
877
|
|
|
|
|
|
* 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */ |
878
|
|
|
|
|
|
STATIC void |
879
|
2764828
|
|
|
|
|
S_cl_and(struct regnode_charclass_class *cl, |
880
|
|
|
|
|
|
const struct regnode_charclass_class *and_with) |
881
|
|
|
|
|
|
{ |
882
|
476246
|
|
|
|
|
PERL_ARGS_ASSERT_CL_AND; |
883
|
|
|
|
|
|
|
884
|
468742
|
|
|
|
|
assert(PL_regkind[and_with->type] == ANYOF); |
885
|
|
|
|
|
|
|
886
|
|
|
|
|
|
/* I (khw) am not sure all these restrictions are necessary XXX */ |
887
|
89415577
|
|
|
|
|
if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) |
888
|
88005574
|
|
|
|
|
&& !(ANYOF_CLASS_TEST_ANY_SET(cl)) |
889
|
2996124
|
|
|
|
|
&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) |
890
|
47614
|
|
|
|
|
&& !(and_with->flags & ANYOF_LOC_FOLD) |
891
|
47614
|
|
|
|
|
&& !(cl->flags & ANYOF_LOC_FOLD)) { |
892
|
|
|
|
|
|
int i; |
893
|
|
|
|
|
|
|
894
|
47614
|
|
|
|
|
if (and_with->flags & ANYOF_INVERT) |
895
|
47608
|
|
|
|
|
for (i = 0; i < ANYOF_BITMAP_SIZE; i++) |
896
|
0
|
|
|
|
|
cl->bitmap[i] &= ~and_with->bitmap[i]; |
897
|
|
|
|
|
|
else |
898
|
47800
|
|
|
|
|
for (i = 0; i < ANYOF_BITMAP_SIZE; i++) |
899
|
3578
|
|
|
|
|
cl->bitmap[i] &= and_with->bitmap[i]; |
900
|
|
|
|
|
|
} /* XXXX: logic is complicated otherwise, leave it along for a moment. */ |
901
|
|
|
|
|
|
|
902
|
2948516
|
|
|
|
|
if (and_with->flags & ANYOF_INVERT) { |
903
|
|
|
|
|
|
|
904
|
|
|
|
|
|
/* Here, the and'ed node is inverted. Get the AND of the flags that |
905
|
|
|
|
|
|
* aren't affected by the inversion. Those that are affected are |
906
|
|
|
|
|
|
* handled individually below */ |
907
|
2389512
|
|
|
|
|
U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; |
908
|
489192
|
|
|
|
|
cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); |
909
|
489192
|
|
|
|
|
cl->flags |= affected_flags; |
910
|
|
|
|
|
|
|
911
|
|
|
|
|
|
/* We currently don't know how to deal with things that aren't in the |
912
|
|
|
|
|
|
* bitmap, but we know that the intersection is no greater than what |
913
|
|
|
|
|
|
* is already in cl, so let there be false positives that get sorted |
914
|
|
|
|
|
|
* out after the synthetic start class succeeds, and the node is |
915
|
|
|
|
|
|
* matched for real. */ |
916
|
|
|
|
|
|
|
917
|
|
|
|
|
|
/* The inversion of these two flags indicate that the resulting |
918
|
|
|
|
|
|
* intersection doesn't have them */ |
919
|
558998
|
|
|
|
|
if (and_with->flags & ANYOF_UNICODE_ALL) { |
920
|
558996
|
|
|
|
|
cl->flags &= ~ANYOF_UNICODE_ALL; |
921
|
|
|
|
|
|
} |
922
|
558996
|
|
|
|
|
if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { |
923
|
558996
|
|
|
|
|
cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; |
924
|
|
|
|
|
|
} |
925
|
|
|
|
|
|
} |
926
|
|
|
|
|
|
else { /* and'd node is not inverted */ |
927
|
|
|
|
|
|
U8 outside_bitmap_but_not_utf8; /* Temp variable */ |
928
|
|
|
|
|
|
|
929
|
2948516
|
|
|
|
|
if (! ANYOF_NONBITMAP(and_with)) { |
930
|
|
|
|
|
|
|
931
|
|
|
|
|
|
/* Here 'and_with' doesn't match anything outside the bitmap |
932
|
|
|
|
|
|
* (except possibly ANYOF_UNICODE_ALL), which means the |
933
|
|
|
|
|
|
* intersection can't either, except for ANYOF_UNICODE_ALL, in |
934
|
|
|
|
|
|
* which case we don't know what the intersection is, but it's no |
935
|
|
|
|
|
|
* greater than what cl already has, so can just leave it alone, |
936
|
|
|
|
|
|
* with possible false positives */ |
937
|
2948516
|
|
|
|
|
if (! (and_with->flags & ANYOF_UNICODE_ALL)) { |
938
|
2948510
|
|
|
|
|
ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); |
939
|
2996118
|
|
|
|
|
cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8; |
940
|
|
|
|
|
|
} |
941
|
|
|
|
|
|
} |
942
|
2760550
|
|
|
|
|
else if (! ANYOF_NONBITMAP(cl)) { |
943
|
|
|
|
|
|
|
944
|
|
|
|
|
|
/* Here, 'and_with' does match something outside the bitmap, and cl |
945
|
|
|
|
|
|
* doesn't have a list of things to match outside the bitmap. If |
946
|
|
|
|
|
|
* cl can match all code points above 255, the intersection will |
947
|
|
|
|
|
|
* be those above-255 code points that 'and_with' matches. If cl |
948
|
|
|
|
|
|
* can't match all Unicode code points, it means that it can't |
949
|
|
|
|
|
|
* match anything outside the bitmap (since the 'if' that got us |
950
|
|
|
|
|
|
* into this block tested for that), so we leave the bitmap empty. |
951
|
|
|
|
|
|
*/ |
952
|
2760550
|
|
|
|
|
if (cl->flags & ANYOF_UNICODE_ALL) { |
953
|
14920
|
|
|
|
|
ARG_SET(cl, ARG(and_with)); |
954
|
|
|
|
|
|
|
955
|
|
|
|
|
|
/* and_with's ARG may match things that don't require UTF8. |
956
|
|
|
|
|
|
* And now cl's will too, in spite of this being an 'and'. See |
957
|
|
|
|
|
|
* the comments below about the kludge */ |
958
|
14920
|
|
|
|
|
cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8; |
959
|
|
|
|
|
|
} |
960
|
|
|
|
|
|
} |
961
|
|
|
|
|
|
else { |
962
|
|
|
|
|
|
/* Here, both 'and_with' and cl match something outside the |
963
|
|
|
|
|
|
* bitmap. Currently we do not do the intersection, so just match |
964
|
|
|
|
|
|
* whatever cl had at the beginning. */ |
965
|
|
|
|
|
|
} |
966
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
968
|
|
|
|
|
|
/* Take the intersection of the two sets of flags. However, the |
969
|
|
|
|
|
|
* ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a |
970
|
|
|
|
|
|
* kludge around the fact that this flag is not treated like the others |
971
|
|
|
|
|
|
* which are initialized in cl_anything(). The way the optimizer works |
972
|
|
|
|
|
|
* is that the synthetic start class (SSC) is initialized to match |
973
|
|
|
|
|
|
* anything, and then the first time a real node is encountered, its |
974
|
|
|
|
|
|
* values are AND'd with the SSC's with the result being the values of |
975
|
|
|
|
|
|
* the real node. However, there are paths through the optimizer where |
976
|
|
|
|
|
|
* the AND never gets called, so those initialized bits are set |
977
|
|
|
|
|
|
* inappropriately, which is not usually a big deal, as they just cause |
978
|
|
|
|
|
|
* false positives in the SSC, which will just mean a probably |
979
|
|
|
|
|
|
* imperceptible slow down in execution. However this bit has a |
980
|
|
|
|
|
|
* higher false positive consequence in that it can cause utf8.pm, |
981
|
|
|
|
|
|
* utf8_heavy.pl ... to be loaded when not necessary, which is a much |
982
|
|
|
|
|
|
* bigger slowdown and also causes significant extra memory to be used. |
983
|
|
|
|
|
|
* In order to prevent this, the code now takes a different tack. The |
984
|
|
|
|
|
|
* bit isn't set unless some part of the regular expression needs it, |
985
|
|
|
|
|
|
* but once set it won't get cleared. This means that these extra |
986
|
|
|
|
|
|
* modules won't get loaded unless there was some path through the |
987
|
|
|
|
|
|
* pattern that would have required them anyway, and so any false |
988
|
|
|
|
|
|
* positives that occur by not ANDing them out when they could be |
989
|
|
|
|
|
|
* aren't as severe as they would be if we treated this bit like all |
990
|
|
|
|
|
|
* the others */ |
991
|
6
|
|
|
|
|
outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags) |
992
|
|
|
|
|
|
& ANYOF_NONBITMAP_NON_UTF8; |
993
|
6
|
|
|
|
|
cl->flags &= and_with->flags; |
994
|
6
|
|
|
|
|
cl->flags |= outside_bitmap_but_not_utf8; |
995
|
|
|
|
|
|
} |
996
|
6
|
|
|
|
|
} |
997
|
|
|
|
|
|
|
998
|
|
|
|
|
|
/* 'OR' a given class with another one. Can create false positives. 'cl' |
999
|
|
|
|
|
|
* should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if |
1000
|
|
|
|
|
|
* 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */ |
1001
|
|
|
|
|
|
STATIC void |
1002
|
46
|
|
|
|
|
S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) |
1003
|
|
|
|
|
|
{ |
1004
|
46
|
|
|
|
|
PERL_ARGS_ASSERT_CL_OR; |
1005
|
|
|
|
|
|
|
1006
|
14966
|
|
|
|
|
if (or_with->flags & ANYOF_INVERT) { |
1007
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
/* Here, the or'd node is to be inverted. This means we take the |
1009
|
|
|
|
|
|
* complement of everything not in the bitmap, but currently we don't |
1010
|
|
|
|
|
|
* know what that is, so give up and match anything */ |
1011
|
2745630
|
|
|
|
|
if (ANYOF_NONBITMAP(or_with)) { |
1012
|
2745630
|
|
|
|
|
cl_anything(pRExC_state, cl); |
1013
|
|
|
|
|
|
} |
1014
|
|
|
|
|
|
/* We do not use |
1015
|
|
|
|
|
|
* (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) |
1016
|
|
|
|
|
|
* <= (B1 | !B2) | (CL1 | !CL2) |
1017
|
|
|
|
|
|
* which is wasteful if CL2 is small, but we ignore CL2: |
1018
|
|
|
|
|
|
* (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 |
1019
|
|
|
|
|
|
* XXXX Can we handle case-fold? Unclear: |
1020
|
|
|
|
|
|
* (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = |
1021
|
|
|
|
|
|
* (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) |
1022
|
|
|
|
|
|
*/ |
1023
|
1408133
|
|
|
|
|
else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) |
1024
|
89256193
|
|
|
|
|
&& !(or_with->flags & ANYOF_LOC_FOLD) |
1025
|
87860160
|
|
|
|
|
&& !(cl->flags & ANYOF_LOC_FOLD) ) { |
1026
|
|
|
|
|
|
int i; |
1027
|
|
|
|
|
|
|
1028
|
2745630
|
|
|
|
|
for (i = 0; i < ANYOF_BITMAP_SIZE; i++) |
1029
|
24200
|
|
|
|
|
cl->bitmap[i] |= ~or_with->bitmap[i]; |
1030
|
|
|
|
|
|
} /* XXXX: logic is complicated otherwise */ |
1031
|
|
|
|
|
|
else { |
1032
|
0
|
|
|
|
|
cl_anything(pRExC_state, cl); |
1033
|
|
|
|
|
|
} |
1034
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
/* And, we can just take the union of the flags that aren't affected |
1036
|
|
|
|
|
|
* by the inversion */ |
1037
|
2745630
|
|
|
|
|
cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; |
1038
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
/* For the remaining flags: |
1040
|
|
|
|
|
|
ANYOF_UNICODE_ALL and inverted means to not match anything above |
1041
|
|
|
|
|
|
255, which means that the union with cl should just be |
1042
|
|
|
|
|
|
what cl has in it, so can ignore this flag |
1043
|
|
|
|
|
|
ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord |
1044
|
|
|
|
|
|
is (ASCII) 127-255 to match them, but then invert that, so |
1045
|
|
|
|
|
|
the union with cl should just be what cl has in it, so can |
1046
|
|
|
|
|
|
ignore this flag |
1047
|
|
|
|
|
|
*/ |
1048
|
|
|
|
|
|
} else { /* 'or_with' is not inverted */ |
1049
|
|
|
|
|
|
/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ |
1050
|
140606
|
|
|
|
|
if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) |
1051
|
140192
|
|
|
|
|
&& (!(or_with->flags & ANYOF_LOC_FOLD) |
1052
|
414
|
|
|
|
|
|| (cl->flags & ANYOF_LOC_FOLD)) ) { |
1053
|
|
|
|
|
|
int i; |
1054
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
/* OR char bitmap and class bitmap separately */ |
1056
|
1886
|
|
|
|
|
for (i = 0; i < ANYOF_BITMAP_SIZE; i++) |
1057
|
1472
|
|
|
|
|
cl->bitmap[i] |= or_with->bitmap[i]; |
1058
|
460
|
|
|
|
|
if (or_with->flags & ANYOF_CLASS) { |
1059
|
2745630
|
|
|
|
|
ANYOF_CLASS_OR(or_with, cl); |
1060
|
|
|
|
|
|
} |
1061
|
|
|
|
|
|
} |
1062
|
|
|
|
|
|
else { /* XXXX: logic is complicated, leave it along for a moment. */ |
1063
|
2760550
|
|
|
|
|
cl_anything(pRExC_state, cl); |
1064
|
|
|
|
|
|
} |
1065
|
|
|
|
|
|
|
1066
|
630257
|
|
|
|
|
if (ANYOF_NONBITMAP(or_with)) { |
1067
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
/* Use the added node's outside-the-bit-map match if there isn't a |
1069
|
|
|
|
|
|
* conflict. If there is a conflict (both nodes match something |
1070
|
|
|
|
|
|
* outside the bitmap, but what they match outside is not the same |
1071
|
|
|
|
|
|
* pointer, and hence not easily compared until XXX we extend |
1072
|
|
|
|
|
|
* inversion lists this far), give up and allow the start class to |
1073
|
|
|
|
|
|
* match everything outside the bitmap. If that stuff is all above |
1074
|
|
|
|
|
|
* 255, can just set UNICODE_ALL, otherwise caould be anything. */ |
1075
|
630211
|
|
|
|
|
if (! ANYOF_NONBITMAP(cl)) { |
1076
|
630211
|
|
|
|
|
ARG_SET(cl, ARG(or_with)); |
1077
|
|
|
|
|
|
} |
1078
|
630211
|
|
|
|
|
else if (ARG(cl) != ARG(or_with)) { |
1079
|
|
|
|
|
|
|
1080
|
630211
|
|
|
|
|
if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) { |
1081
|
630211
|
|
|
|
|
cl_anything(pRExC_state, cl); |
1082
|
|
|
|
|
|
} |
1083
|
|
|
|
|
|
else { |
1084
|
630211
|
|
|
|
|
cl->flags |= ANYOF_UNICODE_ALL; |
1085
|
|
|
|
|
|
} |
1086
|
|
|
|
|
|
} |
1087
|
|
|
|
|
|
} |
1088
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
/* Take the union */ |
1090
|
92314
|
|
|
|
|
cl->flags |= or_with->flags; |
1091
|
|
|
|
|
|
} |
1092
|
46
|
|
|
|
|
} |
1093
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] |
1095
|
|
|
|
|
|
#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) |
1096
|
|
|
|
|
|
#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) |
1097
|
|
|
|
|
|
#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) |
1098
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
#ifdef DEBUGGING |
1101
|
|
|
|
|
|
/* |
1102
|
|
|
|
|
|
dump_trie(trie,widecharmap,revcharmap) |
1103
|
|
|
|
|
|
dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) |
1104
|
|
|
|
|
|
dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) |
1105
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
These routines dump out a trie in a somewhat readable format. |
1107
|
|
|
|
|
|
The _interim_ variants are used for debugging the interim |
1108
|
|
|
|
|
|
tables that are used to generate the final compressed |
1109
|
|
|
|
|
|
representation which is what dump_trie expects. |
1110
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
Part of the reason for their existence is to provide a form |
1112
|
|
|
|
|
|
of documentation as to how the different representations function. |
1113
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
*/ |
1115
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
/* |
1117
|
|
|
|
|
|
Dumps the final compressed table form of the trie to Perl_debug_log. |
1118
|
|
|
|
|
|
Used for debugging make_trie(). |
1119
|
|
|
|
|
|
*/ |
1120
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
STATIC void |
1122
|
6
|
|
|
|
|
S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, |
1123
|
|
|
|
|
|
AV *revcharmap, U32 depth) |
1124
|
|
|
|
|
|
{ |
1125
|
|
|
|
|
|
U32 state; |
1126
|
6
|
|
|
|
|
SV *sv=sv_newmortal(); |
1127
|
630217
|
|
|
|
|
int colwidth= widecharmap ? 6 : 4; |
1128
|
|
|
|
|
|
U16 word; |
1129
|
630217
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
1130
|
|
|
|
|
|
|
1131
|
630217
|
|
|
|
|
PERL_ARGS_ASSERT_DUMP_TRIE; |
1132
|
|
|
|
|
|
|
1133
|
630217
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", |
1134
|
630217
|
|
|
|
|
(int)depth * 2 + 2,"", |
1135
|
|
|
|
|
|
"Match","Base","Ofs" ); |
1136
|
|
|
|
|
|
|
1137
|
630311
|
|
|
|
|
for( state = 0 ; state < trie->uniquecharcount ; state++ ) { |
1138
|
630305
|
|
|
|
|
SV ** const tmp = av_fetch( revcharmap, state, 0); |
1139
|
538037
|
|
|
|
|
if ( tmp ) { |
1140
|
630681
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*s", |
1141
|
|
|
|
|
|
colwidth, |
1142
|
630681
|
|
|
|
|
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, |
1143
|
|
|
|
|
|
PL_colors[0], PL_colors[1], |
1144
|
|
|
|
|
|
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | |
1145
|
|
|
|
|
|
PERL_PV_ESCAPE_FIRSTCHAR |
1146
|
|
|
|
|
|
) |
1147
|
|
|
|
|
|
); |
1148
|
|
|
|
|
|
} |
1149
|
|
|
|
|
|
} |
1150
|
630217
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", |
1151
|
6
|
|
|
|
|
(int)depth * 2 + 2,""); |
1152
|
|
|
|
|
|
|
1153
|
630311
|
|
|
|
|
for( state = 0 ; state < trie->uniquecharcount ; state++ ) |
1154
|
259262
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); |
1155
|
2557967
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "\n"); |
1156
|
|
|
|
|
|
|
1157
|
1927846
|
|
|
|
|
for( state = 1 ; state < trie->statecount ; state++ ) { |
1158
|
1927840
|
|
|
|
|
const U32 base = trie->states[ state ].trans.base; |
1159
|
|
|
|
|
|
|
1160
|
1927840
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); |
1161
|
|
|
|
|
|
|
1162
|
1927840
|
|
|
|
|
if ( trie->states[ state ].wordnum ) { |
1163
|
1927800
|
|
|
|
|
PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); |
1164
|
|
|
|
|
|
} else { |
1165
|
5728
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%6s", "" ); |
1166
|
|
|
|
|
|
} |
1167
|
|
|
|
|
|
|
1168
|
5778
|
|
|
|
|
PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); |
1169
|
|
|
|
|
|
|
1170
|
142
|
|
|
|
|
if ( base ) { |
1171
|
|
|
|
|
|
U32 ofs = 0; |
1172
|
|
|
|
|
|
|
1173
|
842
|
|
|
|
|
while( ( base + ofs < trie->uniquecharcount ) || |
1174
|
358
|
|
|
|
|
( base + ofs - trie->uniquecharcount < trie->lasttrans |
1175
|
5942
|
|
|
|
|
&& trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) |
1176
|
6074
|
|
|
|
|
ofs++; |
1177
|
|
|
|
|
|
|
1178
|
1922160
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); |
1179
|
|
|
|
|
|
|
1180
|
1722742
|
|
|
|
|
for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { |
1181
|
1723784
|
|
|
|
|
if ( ( base + ofs >= trie->uniquecharcount ) && |
1182
|
978878
|
|
|
|
|
( base + ofs - trie->uniquecharcount < trie->lasttrans ) && |
1183
|
7175658
|
|
|
|
|
trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) |
1184
|
|
|
|
|
|
{ |
1185
|
6198054
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*"UVXf, |
1186
|
|
|
|
|
|
colwidth, |
1187
|
6198054
|
|
|
|
|
(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); |
1188
|
|
|
|
|
|
} else { |
1189
|
6198948
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); |
1190
|
|
|
|
|
|
} |
1191
|
|
|
|
|
|
} |
1192
|
|
|
|
|
|
|
1193
|
287120
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "]"); |
1194
|
|
|
|
|
|
|
1195
|
|
|
|
|
|
} |
1196
|
287164
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "\n" ); |
1197
|
|
|
|
|
|
} |
1198
|
118829
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, ""); |
1199
|
79262
|
|
|
|
|
for (word=1; word <= trie->wordcount; word++) { |
1200
|
207954
|
|
|
|
|
PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", |
1201
|
75074
|
|
|
|
|
(int)word, (int)(trie->wordinfo[word].prev), |
1202
|
132880
|
|
|
|
|
(int)(trie->wordinfo[word].len)); |
1203
|
|
|
|
|
|
} |
1204
|
132836
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "\n" ); |
1205
|
5910800
|
|
|
|
|
} |
1206
|
|
|
|
|
|
/* |
1207
|
|
|
|
|
|
Dumps a fully constructed but uncompressed trie in list form. |
1208
|
|
|
|
|
|
List tries normally only are used for construction when the number of |
1209
|
|
|
|
|
|
possible chars (trie->uniquecharcount) is very high. |
1210
|
|
|
|
|
|
Used for debugging make_trie(). |
1211
|
|
|
|
|
|
*/ |
1212
|
|
|
|
|
|
STATIC void |
1213
|
5910794
|
|
|
|
|
S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, |
1214
|
|
|
|
|
|
HV *widecharmap, AV *revcharmap, U32 next_alloc, |
1215
|
|
|
|
|
|
U32 depth) |
1216
|
|
|
|
|
|
{ |
1217
|
|
|
|
|
|
U32 state; |
1218
|
5682834
|
|
|
|
|
SV *sv=sv_newmortal(); |
1219
|
227960
|
|
|
|
|
int colwidth= widecharmap ? 6 : 4; |
1220
|
14588
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
1221
|
|
|
|
|
|
|
1222
|
213372
|
|
|
|
|
PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; |
1223
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
/* print out the table precompression. */ |
1225
|
213372
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", |
1226
|
6197868
|
|
|
|
|
(int)depth * 2 + 2,"", (int)depth * 2 + 2,"", |
1227
|
|
|
|
|
|
"------:-----+-----------------\n" ); |
1228
|
|
|
|
|
|
|
1229
|
6050074
|
|
|
|
|
for( state=1 ; state < next_alloc ; state ++ ) { |
1230
|
|
|
|
|
|
U16 charid; |
1231
|
|
|
|
|
|
|
1232
|
367052
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", |
1233
|
367052
|
|
|
|
|
(int)depth * 2 + 2,"", (UV)state ); |
1234
|
258210
|
|
|
|
|
if ( ! trie->states[ state ].wordnum ) { |
1235
|
258210
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%5s| ",""); |
1236
|
|
|
|
|
|
} else { |
1237
|
6050074
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "W%4x| ", |
1238
|
3437251
|
|
|
|
|
trie->states[ state ].wordnum |
1239
|
|
|
|
|
|
); |
1240
|
|
|
|
|
|
} |
1241
|
3437251
|
|
|
|
|
for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { |
1242
|
6050074
|
|
|
|
|
SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); |
1243
|
1642398
|
|
|
|
|
if ( tmp ) { |
1244
|
1642398
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", |
1245
|
|
|
|
|
|
colwidth, |
1246
|
1642398
|
|
|
|
|
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, |
1247
|
|
|
|
|
|
PL_colors[0], PL_colors[1], |
1248
|
|
|
|
|
|
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | |
1249
|
|
|
|
|
|
PERL_PV_ESCAPE_FIRSTCHAR |
1250
|
|
|
|
|
|
) , |
1251
|
1642314
|
|
|
|
|
TRIE_LIST_ITEM(state,charid).forid, |
1252
|
1220
|
|
|
|
|
(UV)TRIE_LIST_ITEM(state,charid).newstate |
1253
|
|
|
|
|
|
); |
1254
|
147794
|
|
|
|
|
if (!(charid % 10)) |
1255
|
35058
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "\n%*s| ", |
1256
|
147794
|
|
|
|
|
(int)((depth * 2) + 14), ""); |
1257
|
|
|
|
|
|
} |
1258
|
|
|
|
|
|
} |
1259
|
147794
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "\n"); |
1260
|
|
|
|
|
|
} |
1261
|
0
|
|
|
|
|
} |
1262
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
/* |
1264
|
|
|
|
|
|
Dumps a fully constructed but uncompressed trie in table form. |
1265
|
|
|
|
|
|
This is the normal DFA style state transition table, with a few |
1266
|
|
|
|
|
|
twists to facilitate compression later. |
1267
|
|
|
|
|
|
Used for debugging make_trie(). |
1268
|
|
|
|
|
|
*/ |
1269
|
|
|
|
|
|
STATIC void |
1270
|
147794
|
|
|
|
|
S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, |
1271
|
|
|
|
|
|
HV *widecharmap, AV *revcharmap, U32 next_alloc, |
1272
|
|
|
|
|
|
U32 depth) |
1273
|
|
|
|
|
|
{ |
1274
|
|
|
|
|
|
U32 state; |
1275
|
|
|
|
|
|
U16 charid; |
1276
|
141522
|
|
|
|
|
SV *sv=sv_newmortal(); |
1277
|
141522
|
|
|
|
|
int colwidth= widecharmap ? 6 : 4; |
1278
|
1922114
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
1279
|
|
|
|
|
|
|
1280
|
630211
|
|
|
|
|
PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; |
1281
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
/* |
1283
|
|
|
|
|
|
print out the table precompression so that we can do a visual check |
1284
|
|
|
|
|
|
that they are identical. |
1285
|
|
|
|
|
|
*/ |
1286
|
|
|
|
|
|
|
1287
|
630211
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); |
1288
|
|
|
|
|
|
|
1289
|
1291903
|
|
|
|
|
for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { |
1290
|
259772
|
|
|
|
|
SV ** const tmp = av_fetch( revcharmap, charid, 0); |
1291
|
1032131
|
|
|
|
|
if ( tmp ) { |
1292
|
175469
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*s", |
1293
|
|
|
|
|
|
colwidth, |
1294
|
630211
|
|
|
|
|
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, |
1295
|
|
|
|
|
|
PL_colors[0], PL_colors[1], |
1296
|
|
|
|
|
|
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | |
1297
|
|
|
|
|
|
PERL_PV_ESCAPE_FIRSTCHAR |
1298
|
|
|
|
|
|
) |
1299
|
|
|
|
|
|
); |
1300
|
|
|
|
|
|
} |
1301
|
|
|
|
|
|
} |
1302
|
|
|
|
|
|
|
1303
|
630211
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); |
1304
|
|
|
|
|
|
|
1305
|
630211
|
|
|
|
|
for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { |
1306
|
4
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); |
1307
|
|
|
|
|
|
} |
1308
|
|
|
|
|
|
|
1309
|
4
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "\n" ); |
1310
|
|
|
|
|
|
|
1311
|
4
|
|
|
|
|
for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { |
1312
|
|
|
|
|
|
|
1313
|
79008
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", |
1314
|
79004
|
|
|
|
|
(int)depth * 2 + 2,"", |
1315
|
79004
|
|
|
|
|
(UV)TRIE_NODENUM( state ) ); |
1316
|
|
|
|
|
|
|
1317
|
79004
|
|
|
|
|
for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { |
1318
|
79004
|
|
|
|
|
UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); |
1319
|
0
|
|
|
|
|
if (v) |
1320
|
0
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); |
1321
|
|
|
|
|
|
else |
1322
|
0
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); |
1323
|
|
|
|
|
|
} |
1324
|
0
|
|
|
|
|
if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { |
1325
|
79004
|
|
|
|
|
PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); |
1326
|
|
|
|
|
|
} else { |
1327
|
197606
|
|
|
|
|
PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, |
1328
|
79068
|
|
|
|
|
trie->states[ TRIE_NODENUM( state ) ].wordnum ); |
1329
|
|
|
|
|
|
} |
1330
|
|
|
|
|
|
} |
1331
|
79068
|
|
|
|
|
} |
1332
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
#endif |
1334
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
1336
|
|
|
|
|
|
/* make_trie(startbranch,first,last,tail,word_count,flags,depth) |
1337
|
|
|
|
|
|
startbranch: the first branch in the whole branch sequence |
1338
|
|
|
|
|
|
first : start branch of sequence of branch-exact nodes. |
1339
|
|
|
|
|
|
May be the same as startbranch |
1340
|
|
|
|
|
|
last : Thing following the last branch. |
1341
|
|
|
|
|
|
May be the same as tail. |
1342
|
|
|
|
|
|
tail : item following the branch sequence |
1343
|
|
|
|
|
|
count : words in the sequence |
1344
|
|
|
|
|
|
flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ |
1345
|
|
|
|
|
|
depth : indent depth |
1346
|
|
|
|
|
|
|
1347
|
|
|
|
|
|
Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. |
1348
|
|
|
|
|
|
|
1349
|
|
|
|
|
|
A trie is an N'ary tree where the branches are determined by digital |
1350
|
|
|
|
|
|
decomposition of the key. IE, at the root node you look up the 1st character and |
1351
|
|
|
|
|
|
follow that branch repeat until you find the end of the branches. Nodes can be |
1352
|
|
|
|
|
|
marked as "accepting" meaning they represent a complete word. Eg: |
1353
|
|
|
|
|
|
|
1354
|
|
|
|
|
|
/he|she|his|hers/ |
1355
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
would convert into the following structure. Numbers represent states, letters |
1357
|
|
|
|
|
|
following numbers represent valid transitions on the letter from that state, if |
1358
|
|
|
|
|
|
the number is in square brackets it represents an accepting state, otherwise it |
1359
|
|
|
|
|
|
will be in parenthesis. |
1360
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
+-h->+-e->[3]-+-r->(8)-+-s->[9] |
1362
|
|
|
|
|
|
| | |
1363
|
|
|
|
|
|
| (2) |
1364
|
|
|
|
|
|
| | |
1365
|
|
|
|
|
|
(1) +-i->(6)-+-s->[7] |
1366
|
|
|
|
|
|
| |
1367
|
|
|
|
|
|
+-s->(3)-+-h->(4)-+-e->[5] |
1368
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) |
1370
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
This shows that when matching against the string 'hers' we will begin at state 1 |
1372
|
|
|
|
|
|
read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, |
1373
|
|
|
|
|
|
then read 'r' and go to state 8 followed by 's' which takes us to state 9 which |
1374
|
|
|
|
|
|
is also accepting. Thus we know that we can match both 'he' and 'hers' with a |
1375
|
|
|
|
|
|
single traverse. We store a mapping from accepting to state to which word was |
1376
|
|
|
|
|
|
matched, and then when we have multiple possibilities we try to complete the |
1377
|
|
|
|
|
|
rest of the regex in the order in which they occured in the alternation. |
1378
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
The only prior NFA like behaviour that would be changed by the TRIE support is |
1380
|
|
|
|
|
|
the silent ignoring of duplicate alternations which are of the form: |
1381
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
/ (DUPE|DUPE) X? (?{ ... }) Y /x |
1383
|
|
|
|
|
|
|
1384
|
|
|
|
|
|
Thus EVAL blocks following a trie may be called a different number of times with |
1385
|
|
|
|
|
|
and without the optimisation. With the optimisations dupes will be silently |
1386
|
|
|
|
|
|
ignored. This inconsistent behaviour of EVAL type nodes is well established as |
1387
|
|
|
|
|
|
the following demonstrates: |
1388
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
'words'=~/(word|word|word)(?{ print $1 })[xyz]/ |
1390
|
|
|
|
|
|
|
1391
|
|
|
|
|
|
which prints out 'word' three times, but |
1392
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
'words'=~/(word|word|word)(?{ print $1 })S/ |
1394
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
which doesnt print it out at all. This is due to other optimisations kicking in. |
1396
|
|
|
|
|
|
|
1397
|
|
|
|
|
|
Example of what happens on a structural level: |
1398
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
The regexp /(ac|ad|ab)+/ will produce the following debug output: |
1400
|
|
|
|
|
|
|
1401
|
|
|
|
|
|
1: CURLYM[1] {1,32767}(18) |
1402
|
|
|
|
|
|
5: BRANCH(8) |
1403
|
|
|
|
|
|
6: EXACT (16) |
1404
|
|
|
|
|
|
8: BRANCH(11) |
1405
|
|
|
|
|
|
9: EXACT (16) |
1406
|
|
|
|
|
|
11: BRANCH(14) |
1407
|
|
|
|
|
|
12: EXACT (16) |
1408
|
|
|
|
|
|
16: SUCCEED(0) |
1409
|
|
|
|
|
|
17: NOTHING(18) |
1410
|
|
|
|
|
|
18: END(0) |
1411
|
|
|
|
|
|
|
1412
|
|
|
|
|
|
This would be optimizable with startbranch=5, first=5, last=16, tail=16 |
1413
|
|
|
|
|
|
and should turn into: |
1414
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
1: CURLYM[1] {1,32767}(18) |
1416
|
|
|
|
|
|
5: TRIE(16) |
1417
|
|
|
|
|
|
[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] |
1418
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
1420
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
16: SUCCEED(0) |
1422
|
|
|
|
|
|
17: NOTHING(18) |
1423
|
|
|
|
|
|
18: END(0) |
1424
|
|
|
|
|
|
|
1425
|
|
|
|
|
|
Cases where tail != last would be like /(?foo|bar)baz/: |
1426
|
|
|
|
|
|
|
1427
|
|
|
|
|
|
1: BRANCH(4) |
1428
|
|
|
|
|
|
2: EXACT (8) |
1429
|
|
|
|
|
|
4: BRANCH(7) |
1430
|
|
|
|
|
|
5: EXACT (8) |
1431
|
|
|
|
|
|
7: TAIL(8) |
1432
|
|
|
|
|
|
8: EXACT (10) |
1433
|
|
|
|
|
|
10: END(0) |
1434
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
which would be optimizable with startbranch=1, first=1, last=7, tail=8 |
1436
|
|
|
|
|
|
and would end up looking like: |
1437
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
1: TRIE(8) |
1439
|
|
|
|
|
|
[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] |
1440
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
7: TAIL(8) |
1443
|
|
|
|
|
|
8: EXACT (10) |
1444
|
|
|
|
|
|
10: END(0) |
1445
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
d = uvchr_to_utf8_flags(d, uv, 0); |
1447
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
is the recommended Unicode-aware way of saying |
1449
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
*(d++) = uv; |
1451
|
|
|
|
|
|
*/ |
1452
|
|
|
|
|
|
|
1453
|
|
|
|
|
|
#define TRIE_STORE_REVCHAR(val) \ |
1454
|
|
|
|
|
|
STMT_START { \ |
1455
|
|
|
|
|
|
if (UTF) { \ |
1456
|
|
|
|
|
|
SV *zlopp = newSV(7); /* XXX: optimize me */ \ |
1457
|
|
|
|
|
|
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ |
1458
|
|
|
|
|
|
unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ |
1459
|
|
|
|
|
|
SvCUR_set(zlopp, kapow - flrbbbbb); \ |
1460
|
|
|
|
|
|
SvPOK_on(zlopp); \ |
1461
|
|
|
|
|
|
SvUTF8_on(zlopp); \ |
1462
|
|
|
|
|
|
av_push(revcharmap, zlopp); \ |
1463
|
|
|
|
|
|
} else { \ |
1464
|
|
|
|
|
|
char ooooff = (char)val; \ |
1465
|
|
|
|
|
|
av_push(revcharmap, newSVpvn(&ooooff, 1)); \ |
1466
|
|
|
|
|
|
} \ |
1467
|
|
|
|
|
|
} STMT_END |
1468
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
/* This gets the next character from the input, folding it if not already |
1470
|
|
|
|
|
|
* folded. */ |
1471
|
|
|
|
|
|
#define TRIE_READ_CHAR STMT_START { \ |
1472
|
|
|
|
|
|
wordlen++; \ |
1473
|
|
|
|
|
|
if ( UTF ) { \ |
1474
|
|
|
|
|
|
/* if it is UTF then it is either already folded, or does not need \ |
1475
|
|
|
|
|
|
* folding */ \ |
1476
|
|
|
|
|
|
uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ |
1477
|
|
|
|
|
|
} \ |
1478
|
|
|
|
|
|
else if (folder == PL_fold_latin1) { \ |
1479
|
|
|
|
|
|
/* This folder implies Unicode rules, which in the range expressible \ |
1480
|
|
|
|
|
|
* by not UTF is the lower case, with the two exceptions, one of \ |
1481
|
|
|
|
|
|
* which should have been taken care of before calling this */ \ |
1482
|
|
|
|
|
|
assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ |
1483
|
|
|
|
|
|
uvc = toLOWER_L1(*uc); \ |
1484
|
|
|
|
|
|
if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ |
1485
|
|
|
|
|
|
len = 1; \ |
1486
|
|
|
|
|
|
} else { \ |
1487
|
|
|
|
|
|
/* raw data, will be folded later if needed */ \ |
1488
|
|
|
|
|
|
uvc = (U32)*uc; \ |
1489
|
|
|
|
|
|
len = 1; \ |
1490
|
|
|
|
|
|
} \ |
1491
|
|
|
|
|
|
} STMT_END |
1492
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
1494
|
|
|
|
|
|
|
1495
|
|
|
|
|
|
#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ |
1496
|
|
|
|
|
|
if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ |
1497
|
|
|
|
|
|
U32 ging = TRIE_LIST_LEN( state ) *= 2; \ |
1498
|
|
|
|
|
|
Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ |
1499
|
|
|
|
|
|
} \ |
1500
|
|
|
|
|
|
TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ |
1501
|
|
|
|
|
|
TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ |
1502
|
|
|
|
|
|
TRIE_LIST_CUR( state )++; \ |
1503
|
|
|
|
|
|
} STMT_END |
1504
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
#define TRIE_LIST_NEW(state) STMT_START { \ |
1506
|
|
|
|
|
|
Newxz( trie->states[ state ].trans.list, \ |
1507
|
|
|
|
|
|
4, reg_trie_trans_le ); \ |
1508
|
|
|
|
|
|
TRIE_LIST_CUR( state ) = 1; \ |
1509
|
|
|
|
|
|
TRIE_LIST_LEN( state ) = 4; \ |
1510
|
|
|
|
|
|
} STMT_END |
1511
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
#define TRIE_HANDLE_WORD(state) STMT_START { \ |
1513
|
|
|
|
|
|
U16 dupe= trie->states[ state ].wordnum; \ |
1514
|
|
|
|
|
|
regnode * const noper_next = regnext( noper ); \ |
1515
|
|
|
|
|
|
\ |
1516
|
|
|
|
|
|
DEBUG_r({ \ |
1517
|
|
|
|
|
|
/* store the word for dumping */ \ |
1518
|
|
|
|
|
|
SV* tmp; \ |
1519
|
|
|
|
|
|
if (OP(noper) != NOTHING) \ |
1520
|
|
|
|
|
|
tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ |
1521
|
|
|
|
|
|
else \ |
1522
|
|
|
|
|
|
tmp = newSVpvn_utf8( "", 0, UTF ); \ |
1523
|
|
|
|
|
|
av_push( trie_words, tmp ); \ |
1524
|
|
|
|
|
|
}); \ |
1525
|
|
|
|
|
|
\ |
1526
|
|
|
|
|
|
curword++; \ |
1527
|
|
|
|
|
|
trie->wordinfo[curword].prev = 0; \ |
1528
|
|
|
|
|
|
trie->wordinfo[curword].len = wordlen; \ |
1529
|
|
|
|
|
|
trie->wordinfo[curword].accept = state; \ |
1530
|
|
|
|
|
|
\ |
1531
|
|
|
|
|
|
if ( noper_next < tail ) { \ |
1532
|
|
|
|
|
|
if (!trie->jump) \ |
1533
|
|
|
|
|
|
trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ |
1534
|
|
|
|
|
|
trie->jump[curword] = (U16)(noper_next - convert); \ |
1535
|
|
|
|
|
|
if (!jumper) \ |
1536
|
|
|
|
|
|
jumper = noper_next; \ |
1537
|
|
|
|
|
|
if (!nextbranch) \ |
1538
|
|
|
|
|
|
nextbranch= regnext(cur); \ |
1539
|
|
|
|
|
|
} \ |
1540
|
|
|
|
|
|
\ |
1541
|
|
|
|
|
|
if ( dupe ) { \ |
1542
|
|
|
|
|
|
/* It's a dupe. Pre-insert into the wordinfo[].prev */\ |
1543
|
|
|
|
|
|
/* chain, so that when the bits of chain are later */\ |
1544
|
|
|
|
|
|
/* linked together, the dups appear in the chain */\ |
1545
|
|
|
|
|
|
trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ |
1546
|
|
|
|
|
|
trie->wordinfo[dupe].prev = curword; \ |
1547
|
|
|
|
|
|
} else { \ |
1548
|
|
|
|
|
|
/* we haven't inserted this word yet. */ \ |
1549
|
|
|
|
|
|
trie->states[ state ].wordnum = curword; \ |
1550
|
|
|
|
|
|
} \ |
1551
|
|
|
|
|
|
} STMT_END |
1552
|
|
|
|
|
|
|
1553
|
|
|
|
|
|
|
1554
|
|
|
|
|
|
#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ |
1555
|
|
|
|
|
|
( ( base + charid >= ucharcount \ |
1556
|
|
|
|
|
|
&& base + charid < ubound \ |
1557
|
|
|
|
|
|
&& state == trie->trans[ base - ucharcount + charid ].check \ |
1558
|
|
|
|
|
|
&& trie->trans[ base - ucharcount + charid ].next ) \ |
1559
|
|
|
|
|
|
? trie->trans[ base - ucharcount + charid ].next \ |
1560
|
|
|
|
|
|
: ( state==1 ? special : 0 ) \ |
1561
|
|
|
|
|
|
) |
1562
|
|
|
|
|
|
|
1563
|
|
|
|
|
|
#define MADE_TRIE 1 |
1564
|
|
|
|
|
|
#define MADE_JUMP_TRIE 2 |
1565
|
|
|
|
|
|
#define MADE_EXACT_TRIE 4 |
1566
|
|
|
|
|
|
|
1567
|
|
|
|
|
|
STATIC I32 |
1568
|
96
|
|
|
|
|
S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) |
1569
|
|
|
|
|
|
{ |
1570
|
|
|
|
|
|
dVAR; |
1571
|
|
|
|
|
|
/* first pass, loop through and scan words */ |
1572
|
|
|
|
|
|
reg_trie_data *trie; |
1573
|
|
|
|
|
|
HV *widecharmap = NULL; |
1574
|
78988
|
|
|
|
|
AV *revcharmap = newAV(); |
1575
|
|
|
|
|
|
regnode *cur; |
1576
|
78988
|
|
|
|
|
const U32 uniflags = UTF8_ALLOW_DEFAULT; |
1577
|
78988
|
|
|
|
|
STRLEN len = 0; |
1578
|
79076
|
|
|
|
|
UV uvc = 0; |
1579
|
|
|
|
|
|
U16 curword = 0; |
1580
|
|
|
|
|
|
U32 next_alloc = 0; |
1581
|
|
|
|
|
|
regnode *jumper = NULL; |
1582
|
|
|
|
|
|
regnode *nextbranch = NULL; |
1583
|
|
|
|
|
|
regnode *convert = NULL; |
1584
|
|
|
|
|
|
U32 *prev_states; /* temp array mapping each state to previous one */ |
1585
|
|
|
|
|
|
/* we just use folder as a flag in utf8 */ |
1586
|
|
|
|
|
|
const U8 * folder = NULL; |
1587
|
|
|
|
|
|
|
1588
|
|
|
|
|
|
#ifdef DEBUGGING |
1589
|
79076
|
|
|
|
|
const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); |
1590
|
|
|
|
|
|
AV *trie_words = NULL; |
1591
|
|
|
|
|
|
/* along with revcharmap, this only used during construction but both are |
1592
|
|
|
|
|
|
* useful during debugging so we store them in the struct when debugging. |
1593
|
|
|
|
|
|
*/ |
1594
|
|
|
|
|
|
#else |
1595
|
|
|
|
|
|
const U32 data_slot = add_data( pRExC_state, 2, "tu" ); |
1596
|
|
|
|
|
|
STRLEN trie_charcount=0; |
1597
|
|
|
|
|
|
#endif |
1598
|
|
|
|
|
|
SV *re_trie_maxbuff; |
1599
|
79076
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
1600
|
|
|
|
|
|
|
1601
|
72
|
|
|
|
|
PERL_ARGS_ASSERT_MAKE_TRIE; |
1602
|
|
|
|
|
|
#ifndef DEBUGGING |
1603
|
|
|
|
|
|
PERL_UNUSED_ARG(depth); |
1604
|
|
|
|
|
|
#endif |
1605
|
|
|
|
|
|
|
1606
|
780243576
|
|
|
|
|
switch (flags) { |
1607
|
|
|
|
|
|
case EXACT: break; |
1608
|
|
|
|
|
|
case EXACTFA: |
1609
|
|
|
|
|
|
case EXACTFU_SS: |
1610
|
780164502
|
|
|
|
|
case EXACTFU: folder = PL_fold_latin1; break; |
1611
|
0
|
|
|
|
|
case EXACTF: folder = PL_fold; break; |
1612
|
0
|
|
|
|
|
case EXACTFL: folder = PL_fold_locale; break; |
1613
|
79068
|
|
|
|
|
default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); |
1614
|
|
|
|
|
|
} |
1615
|
|
|
|
|
|
|
1616
|
79076
|
|
|
|
|
trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); |
1617
|
79076
|
|
|
|
|
trie->refcount = 1; |
1618
|
79076
|
|
|
|
|
trie->startstate = 1; |
1619
|
79076
|
|
|
|
|
trie->wordcount = word_count; |
1620
|
8
|
|
|
|
|
RExC_rxi->data->data[ data_slot ] = (void*)trie; |
1621
|
79012
|
|
|
|
|
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); |
1622
|
12
|
|
|
|
|
if (flags == EXACT) |
1623
|
10
|
|
|
|
|
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); |
1624
|
12
|
|
|
|
|
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( |
1625
|
|
|
|
|
|
trie->wordcount+1, sizeof(reg_trie_wordinfo)); |
1626
|
|
|
|
|
|
|
1627
|
12
|
|
|
|
|
DEBUG_r({ |
1628
|
|
|
|
|
|
trie_words = newAV(); |
1629
|
|
|
|
|
|
}); |
1630
|
|
|
|
|
|
|
1631
|
12
|
|
|
|
|
re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); |
1632
|
79084
|
|
|
|
|
if (!SvIOK(re_trie_maxbuff)) { |
1633
|
79072
|
|
|
|
|
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); |
1634
|
|
|
|
|
|
} |
1635
|
76
|
|
|
|
|
DEBUG_TRIE_COMPILE_r({ |
1636
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
1637
|
|
|
|
|
|
"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", |
1638
|
|
|
|
|
|
(int)depth * 2 + 2, "", |
1639
|
|
|
|
|
|
REG_NODE_NUM(startbranch),REG_NODE_NUM(first), |
1640
|
|
|
|
|
|
REG_NODE_NUM(last), REG_NODE_NUM(tail), |
1641
|
|
|
|
|
|
(int)depth); |
1642
|
|
|
|
|
|
}); |
1643
|
|
|
|
|
|
|
1644
|
|
|
|
|
|
/* Find the node we are going to overwrite */ |
1645
|
79076
|
|
|
|
|
if ( first == startbranch && OP( last ) != BRANCH ) { |
1646
|
|
|
|
|
|
/* whole branch chain */ |
1647
|
|
|
|
|
|
convert = first; |
1648
|
|
|
|
|
|
} else { |
1649
|
|
|
|
|
|
/* branch sub-chain */ |
1650
|
79000
|
|
|
|
|
convert = NEXTOPER( first ); |
1651
|
|
|
|
|
|
} |
1652
|
|
|
|
|
|
|
1653
|
|
|
|
|
|
/* -- First loop and Setup -- |
1654
|
|
|
|
|
|
|
1655
|
|
|
|
|
|
We first traverse the branches and scan each word to determine if it |
1656
|
|
|
|
|
|
contains widechars, and how many unique chars there are, this is |
1657
|
|
|
|
|
|
important as we have to build a table with at least as many columns as we |
1658
|
|
|
|
|
|
have unique chars. |
1659
|
|
|
|
|
|
|
1660
|
|
|
|
|
|
We use an array of integers to represent the character codes 0..255 |
1661
|
|
|
|
|
|
(trie->charmap) and we use a an HV* to store Unicode characters. We use the |
1662
|
|
|
|
|
|
native representation of the character value as the key and IV's for the |
1663
|
|
|
|
|
|
coded index. |
1664
|
|
|
|
|
|
|
1665
|
|
|
|
|
|
*TODO* If we keep track of how many times each character is used we can |
1666
|
|
|
|
|
|
remap the columns so that the table compression later on is more |
1667
|
|
|
|
|
|
efficient in terms of memory by ensuring the most common value is in the |
1668
|
|
|
|
|
|
middle and the least common are on the outside. IMO this would be better |
1669
|
|
|
|
|
|
than a most to least common mapping as theres a decent chance the most |
1670
|
|
|
|
|
|
common letter will share a node with the least common, meaning the node |
1671
|
|
|
|
|
|
will not be compressible. With a middle is most common approach the worst |
1672
|
|
|
|
|
|
case is when we have the least common nodes twice. |
1673
|
|
|
|
|
|
|
1674
|
|
|
|
|
|
*/ |
1675
|
|
|
|
|
|
|
1676
|
79064
|
|
|
|
|
for ( cur = first ; cur < last ; cur = regnext( cur ) ) { |
1677
|
79056
|
|
|
|
|
regnode *noper = NEXTOPER( cur ); |
1678
|
124
|
|
|
|
|
const U8 *uc = (U8*)STRING( noper ); |
1679
|
56
|
|
|
|
|
const U8 *e = uc + STR_LEN( noper ); |
1680
|
|
|
|
|
|
STRLEN foldlen = 0; |
1681
|
|
|
|
|
|
U32 wordlen = 0; /* required init */ |
1682
|
|
|
|
|
|
STRLEN minbytes = 0; |
1683
|
|
|
|
|
|
STRLEN maxbytes = 0; |
1684
|
56
|
|
|
|
|
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ |
1685
|
|
|
|
|
|
|
1686
|
56
|
|
|
|
|
if (OP(noper) == NOTHING) { |
1687
|
2
|
|
|
|
|
regnode *noper_next= regnext(noper); |
1688
|
70
|
|
|
|
|
if (noper_next != tail && OP(noper_next) == flags) { |
1689
|
|
|
|
|
|
noper = noper_next; |
1690
|
68
|
|
|
|
|
uc= (U8*)STRING(noper); |
1691
|
79064
|
|
|
|
|
e= uc + STR_LEN(noper); |
1692
|
79060
|
|
|
|
|
trie->minlen= STR_LEN(noper); |
1693
|
|
|
|
|
|
} else { |
1694
|
30
|
|
|
|
|
trie->minlen= 0; |
1695
|
30
|
|
|
|
|
continue; |
1696
|
|
|
|
|
|
} |
1697
|
|
|
|
|
|
} |
1698
|
|
|
|
|
|
|
1699
|
82
|
|
|
|
|
if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ |
1700
|
54
|
|
|
|
|
TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte |
1701
|
|
|
|
|
|
regardless of encoding */ |
1702
|
90
|
|
|
|
|
if (OP( noper ) == EXACTFU_SS) { |
1703
|
|
|
|
|
|
/* false positives are ok, so just set this */ |
1704
|
36
|
|
|
|
|
TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); |
1705
|
|
|
|
|
|
} |
1706
|
|
|
|
|
|
} |
1707
|
220
|
|
|
|
|
for ( ; uc < e ; uc += len ) { |
1708
|
220
|
|
|
|
|
TRIE_CHARCOUNT(trie)++; |
1709
|
79190
|
|
|
|
|
TRIE_READ_CHAR; |
1710
|
|
|
|
|
|
|
1711
|
|
|
|
|
|
/* Acummulate to the current values, the range in the number of |
1712
|
|
|
|
|
|
* bytes that this character could match. The max is presumed to |
1713
|
|
|
|
|
|
* be the same as the folded input (which TRIE_READ_CHAR returns), |
1714
|
|
|
|
|
|
* except that when this is not in UTF-8, it could be matched |
1715
|
|
|
|
|
|
* against a string which is UTF-8, and the variant characters |
1716
|
|
|
|
|
|
* could be 2 bytes instead of the 1 here. Likewise, for the |
1717
|
|
|
|
|
|
* minimum number of bytes when not folded. When folding, the min |
1718
|
|
|
|
|
|
* is assumed to be 1 byte could fold to match the single character |
1719
|
|
|
|
|
|
* here, or in the case of a multi-char fold, 1 byte can fold to |
1720
|
|
|
|
|
|
* the whole sequence. 'foldlen' is used to denote whether we are |
1721
|
|
|
|
|
|
* in such a sequence, skipping the min setting if so. XXX TODO |
1722
|
|
|
|
|
|
* Use the exact list of what folds to each character, from |
1723
|
|
|
|
|
|
* PL_utf8_foldclosures */ |
1724
|
79188
|
|
|
|
|
if (UTF) { |
1725
|
79004
|
|
|
|
|
maxbytes += UTF8SKIP(uc); |
1726
|
79004
|
|
|
|
|
if (! folder) { |
1727
|
|
|
|
|
|
/* A non-UTF-8 string could be 1 byte to match our 2 */ |
1728
|
4
|
|
|
|
|
minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc)) |
1729
|
|
|
|
|
|
? 1 |
1730
|
68
|
|
|
|
|
: UTF8SKIP(uc); |
1731
|
|
|
|
|
|
} |
1732
|
|
|
|
|
|
else { |
1733
|
79072
|
|
|
|
|
if (foldlen) { |
1734
|
4
|
|
|
|
|
foldlen -= UTF8SKIP(uc); |
1735
|
|
|
|
|
|
} |
1736
|
|
|
|
|
|
else { |
1737
|
630207
|
|
|
|
|
foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e); |
1738
|
630207
|
|
|
|
|
minbytes++; |
1739
|
|
|
|
|
|
} |
1740
|
|
|
|
|
|
} |
1741
|
|
|
|
|
|
} |
1742
|
|
|
|
|
|
else { |
1743
|
630391
|
|
|
|
|
maxbytes += (UNI_IS_INVARIANT(*uc)) |
1744
|
|
|
|
|
|
? 1 |
1745
|
|
|
|
|
|
: 2; |
1746
|
630391
|
|
|
|
|
if (! folder) { |
1747
|
630283
|
|
|
|
|
minbytes++; |
1748
|
|
|
|
|
|
} |
1749
|
|
|
|
|
|
else { |
1750
|
2479061
|
|
|
|
|
if (foldlen) { |
1751
|
1848746
|
|
|
|
|
foldlen--; |
1752
|
|
|
|
|
|
} |
1753
|
|
|
|
|
|
else { |
1754
|
1848854
|
|
|
|
|
foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e); |
1755
|
1848854
|
|
|
|
|
minbytes++; |
1756
|
|
|
|
|
|
} |
1757
|
|
|
|
|
|
} |
1758
|
|
|
|
|
|
} |
1759
|
1848930
|
|
|
|
|
if ( uvc < 256 ) { |
1760
|
5872
|
|
|
|
|
if ( folder ) { |
1761
|
5796
|
|
|
|
|
U8 folded= folder[ (U8) uvc ]; |
1762
|
160
|
|
|
|
|
if ( !trie->charmap[ folded ] ) { |
1763
|
88
|
|
|
|
|
trie->charmap[ folded ]=( ++trie->uniquecharcount ); |
1764
|
1848782
|
|
|
|
|
TRIE_STORE_REVCHAR( folded ); |
1765
|
|
|
|
|
|
} |
1766
|
|
|
|
|
|
} |
1767
|
10937261
|
|
|
|
|
if ( !trie->charmap[ uvc ] ) { |
1768
|
6118870
|
|
|
|
|
trie->charmap[ uvc ]=( ++trie->uniquecharcount ); |
1769
|
6118870
|
|
|
|
|
TRIE_STORE_REVCHAR( uvc ); |
1770
|
|
|
|
|
|
} |
1771
|
6050170
|
|
|
|
|
if ( set_bit ) { |
1772
|
|
|
|
|
|
/* store the codepoint in the bitmap, and its folded |
1773
|
|
|
|
|
|
* equivalent. */ |
1774
|
68840
|
|
|
|
|
TRIE_BITMAP_SET(trie, uvc); |
1775
|
|
|
|
|
|
|
1776
|
|
|
|
|
|
/* store the folded codepoint */ |
1777
|
68840
|
|
|
|
|
if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); |
1778
|
|
|
|
|
|
|
1779
|
6118826
|
|
|
|
|
if ( !UTF ) { |
1780
|
|
|
|
|
|
/* store first byte of utf8 representation of |
1781
|
|
|
|
|
|
variant codepoints */ |
1782
|
6118826
|
|
|
|
|
if (! NATIVE_IS_INVARIANT(uvc)) { |
1783
|
6118800
|
|
|
|
|
TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); |
1784
|
|
|
|
|
|
} |
1785
|
|
|
|
|
|
} |
1786
|
|
|
|
|
|
set_bit = 0; /* We've done our bit :-) */ |
1787
|
|
|
|
|
|
} |
1788
|
|
|
|
|
|
} else { |
1789
|
|
|
|
|
|
SV** svpp; |
1790
|
5825601
|
|
|
|
|
if ( !widecharmap ) |
1791
|
5825601
|
|
|
|
|
widecharmap = newHV(); |
1792
|
|
|
|
|
|
|
1793
|
5825601
|
|
|
|
|
svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); |
1794
|
|
|
|
|
|
|
1795
|
5825601
|
|
|
|
|
if ( !svpp ) |
1796
|
5825601
|
|
|
|
|
Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); |
1797
|
|
|
|
|
|
|
1798
|
6118800
|
|
|
|
|
if ( !SvTRUE( *svpp ) ) { |
1799
|
0
|
|
|
|
|
sv_setiv( *svpp, ++trie->uniquecharcount ); |
1800
|
1848746
|
|
|
|
|
TRIE_STORE_REVCHAR(uvc); |
1801
|
|
|
|
|
|
} |
1802
|
|
|
|
|
|
} |
1803
|
|
|
|
|
|
} |
1804
|
1848800
|
|
|
|
|
if( cur == first ) { |
1805
|
630215
|
|
|
|
|
trie->minlen = minbytes; |
1806
|
630215
|
|
|
|
|
trie->maxlen = maxbytes; |
1807
|
7086061
|
|
|
|
|
} else if (minbytes < trie->minlen) { |
1808
|
6455810
|
|
|
|
|
trie->minlen = minbytes; |
1809
|
6455852
|
|
|
|
|
} else if (maxbytes > trie->maxlen) { |
1810
|
6455810
|
|
|
|
|
trie->maxlen = maxbytes; |
1811
|
|
|
|
|
|
} |
1812
|
|
|
|
|
|
} /* end first pass */ |
1813
|
6455816
|
|
|
|
|
DEBUG_TRIE_COMPILE_r( |
1814
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", |
1815
|
|
|
|
|
|
(int)depth * 2 + 2,"", |
1816
|
|
|
|
|
|
( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, |
1817
|
|
|
|
|
|
(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, |
1818
|
|
|
|
|
|
(int)trie->minlen, (int)trie->maxlen ) |
1819
|
|
|
|
|
|
); |
1820
|
|
|
|
|
|
|
1821
|
|
|
|
|
|
/* |
1822
|
|
|
|
|
|
We now know what we are dealing with in terms of unique chars and |
1823
|
|
|
|
|
|
string sizes so we can calculate how much memory a naive |
1824
|
|
|
|
|
|
representation using a flat table will take. If it's over a reasonable |
1825
|
|
|
|
|
|
limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory |
1826
|
|
|
|
|
|
conservative but potentially much slower representation using an array |
1827
|
|
|
|
|
|
of lists. |
1828
|
|
|
|
|
|
|
1829
|
|
|
|
|
|
At the end we convert both representations into the same compressed |
1830
|
|
|
|
|
|
form that will be used in regexec.c for matching with. The latter |
1831
|
|
|
|
|
|
is a form that cannot be used to construct with but has memory |
1832
|
|
|
|
|
|
properties similar to the list form and access properties similar |
1833
|
|
|
|
|
|
to the table form making it both suitable for fast searches and |
1834
|
|
|
|
|
|
small enough that its feasable to store for the duration of a program. |
1835
|
|
|
|
|
|
|
1836
|
|
|
|
|
|
See the comment in the code where the compressed table is produced |
1837
|
|
|
|
|
|
inplace from the flat tabe representation for an explanation of how |
1838
|
|
|
|
|
|
the compression works. |
1839
|
|
|
|
|
|
|
1840
|
|
|
|
|
|
*/ |
1841
|
|
|
|
|
|
|
1842
|
|
|
|
|
|
|
1843
|
30925548
|
|
|
|
|
Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); |
1844
|
28538549
|
|
|
|
|
prev_states[1] = 0; |
1845
|
|
|
|
|
|
|
1846
|
7486107
|
|
|
|
|
if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { |
1847
|
|
|
|
|
|
/* |
1848
|
|
|
|
|
|
Second Pass -- Array Of Lists Representation |
1849
|
|
|
|
|
|
|
1850
|
|
|
|
|
|
Each state will be represented by a list of charid:state records |
1851
|
|
|
|
|
|
(reg_trie_trans_le) the first such element holds the CUR and LEN |
1852
|
|
|
|
|
|
points of the allocated array. (See defines above). |
1853
|
|
|
|
|
|
|
1854
|
|
|
|
|
|
We build the initial structure using the lists, and then convert |
1855
|
|
|
|
|
|
it into the compressed table form which allows faster lookups |
1856
|
|
|
|
|
|
(but cant be modified once converted). |
1857
|
|
|
|
|
|
*/ |
1858
|
|
|
|
|
|
|
1859
|
|
|
|
|
|
STRLEN transcount = 1; |
1860
|
|
|
|
|
|
|
1861
|
5825601
|
|
|
|
|
DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, |
1862
|
|
|
|
|
|
"%*sCompiling trie using list compiler\n", |
1863
|
|
|
|
|
|
(int)depth * 2 + 2, "")); |
1864
|
|
|
|
|
|
|
1865
|
3434215
|
|
|
|
|
trie->states = (reg_trie_state *) |
1866
|
2964532
|
|
|
|
|
PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, |
1867
|
|
|
|
|
|
sizeof(reg_trie_state) ); |
1868
|
4068809
|
|
|
|
|
TRIE_LIST_NEW(1); |
1869
|
|
|
|
|
|
next_alloc = 2; |
1870
|
|
|
|
|
|
|
1871
|
4068809
|
|
|
|
|
for ( cur = first ; cur < last ; cur = regnext( cur ) ) { |
1872
|
|
|
|
|
|
|
1873
|
4068809
|
|
|
|
|
regnode *noper = NEXTOPER( cur ); |
1874
|
4068809
|
|
|
|
|
U8 *uc = (U8*)STRING( noper ); |
1875
|
1756792
|
|
|
|
|
const U8 *e = uc + STR_LEN( noper ); |
1876
|
|
|
|
|
|
U32 state = 1; /* required init */ |
1877
|
|
|
|
|
|
U16 charid = 0; /* sanity init */ |
1878
|
|
|
|
|
|
U32 wordlen = 0; /* required init */ |
1879
|
|
|
|
|
|
|
1880
|
3417290
|
|
|
|
|
if (OP(noper) == NOTHING) { |
1881
|
682779
|
|
|
|
|
regnode *noper_next= regnext(noper); |
1882
|
3417290
|
|
|
|
|
if (noper_next != tail && OP(noper_next) == flags) { |
1883
|
|
|
|
|
|
noper = noper_next; |
1884
|
3417290
|
|
|
|
|
uc= (U8*)STRING(noper); |
1885
|
3417290
|
|
|
|
|
e= uc + STR_LEN(noper); |
1886
|
|
|
|
|
|
} |
1887
|
|
|
|
|
|
} |
1888
|
|
|
|
|
|
|
1889
|
630207
|
|
|
|
|
if (OP(noper) != NOTHING) { |
1890
|
630207
|
|
|
|
|
for ( ; uc < e ; uc += len ) { |
1891
|
|
|
|
|
|
|
1892
|
630207
|
|
|
|
|
TRIE_READ_CHAR; |
1893
|
|
|
|
|
|
|
1894
|
630211
|
|
|
|
|
if ( uvc < 256 ) { |
1895
|
630211
|
|
|
|
|
charid = trie->charmap[ uvc ]; |
1896
|
|
|
|
|
|
} else { |
1897
|
630211
|
|
|
|
|
SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); |
1898
|
630211
|
|
|
|
|
if ( !svpp ) { |
1899
|
|
|
|
|
|
charid = 0; |
1900
|
|
|
|
|
|
} else { |
1901
|
259168
|
|
|
|
|
charid=(U16)SvIV( *svpp ); |
1902
|
|
|
|
|
|
} |
1903
|
|
|
|
|
|
} |
1904
|
|
|
|
|
|
/* charid is now 0 if we dont know the char read, or nonzero if we do */ |
1905
|
630211
|
|
|
|
|
if ( charid ) { |
1906
|
|
|
|
|
|
|
1907
|
|
|
|
|
|
U16 check; |
1908
|
|
|
|
|
|
U32 newstate = 0; |
1909
|
|
|
|
|
|
|
1910
|
630211
|
|
|
|
|
charid--; |
1911
|
193429
|
|
|
|
|
if ( !trie->states[ state ].trans.list ) { |
1912
|
364702
|
|
|
|
|
TRIE_LIST_NEW( state ); |
1913
|
|
|
|
|
|
} |
1914
|
364702
|
|
|
|
|
for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { |
1915
|
1237152
|
|
|
|
|
if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { |
1916
|
2062410
|
|
|
|
|
newstate = TRIE_LIST_ITEM( state, check ).newstate; |
1917
|
2062404
|
|
|
|
|
break; |
1918
|
|
|
|
|
|
} |
1919
|
|
|
|
|
|
} |
1920
|
1387048
|
|
|
|
|
if ( ! newstate ) { |
1921
|
706821
|
|
|
|
|
newstate = next_alloc++; |
1922
|
348827
|
|
|
|
|
prev_states[newstate] = state; |
1923
|
348827
|
|
|
|
|
TRIE_LIST_PUSH( state, charid, newstate ); |
1924
|
348827
|
|
|
|
|
transcount++; |
1925
|
|
|
|
|
|
} |
1926
|
|
|
|
|
|
state = newstate; |
1927
|
|
|
|
|
|
} else { |
1928
|
13335
|
|
|
|
|
Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); |
1929
|
|
|
|
|
|
} |
1930
|
|
|
|
|
|
} |
1931
|
|
|
|
|
|
} |
1932
|
12815
|
|
|
|
|
TRIE_HANDLE_WORD(state); |
1933
|
|
|
|
|
|
|
1934
|
|
|
|
|
|
} /* end second pass */ |
1935
|
|
|
|
|
|
|
1936
|
|
|
|
|
|
/* next alloc is the NEXT state to be allocated */ |
1937
|
12815
|
|
|
|
|
trie->statecount = next_alloc; |
1938
|
11429
|
|
|
|
|
trie->states = (reg_trie_state *) |
1939
|
11429
|
|
|
|
|
PerlMemShared_realloc( trie->states, |
1940
|
|
|
|
|
|
next_alloc |
1941
|
|
|
|
|
|
* sizeof(reg_trie_state) ); |
1942
|
|
|
|
|
|
|
1943
|
|
|
|
|
|
/* and now dump it out before we compress it */ |
1944
|
11429
|
|
|
|
|
DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, |
1945
|
|
|
|
|
|
revcharmap, next_alloc, |
1946
|
|
|
|
|
|
depth+1) |
1947
|
|
|
|
|
|
); |
1948
|
|
|
|
|
|
|
1949
|
11429
|
|
|
|
|
trie->trans = (reg_trie_trans *) |
1950
|
0
|
|
|
|
|
PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); |
1951
|
|
|
|
|
|
{ |
1952
|
|
|
|
|
|
U32 state; |
1953
|
|
|
|
|
|
U32 tp = 0; |
1954
|
|
|
|
|
|
U32 zp = 0; |
1955
|
|
|
|
|
|
|
1956
|
|
|
|
|
|
|
1957
|
13335
|
|
|
|
|
for( state=1 ; state < next_alloc ; state ++ ) { |
1958
|
|
|
|
|
|
U32 base=0; |
1959
|
|
|
|
|
|
|
1960
|
|
|
|
|
|
/* |
1961
|
|
|
|
|
|
DEBUG_TRIE_COMPILE_MORE_r( |
1962
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) |
1963
|
|
|
|
|
|
); |
1964
|
|
|
|
|
|
*/ |
1965
|
|
|
|
|
|
|
1966
|
13335
|
|
|
|
|
if (trie->states[state].trans.list) { |
1967
|
0
|
|
|
|
|
U16 minid=TRIE_LIST_ITEM( state, 1).forid; |
1968
|
|
|
|
|
|
U16 maxid=minid; |
1969
|
|
|
|
|
|
U16 idx; |
1970
|
|
|
|
|
|
|
1971
|
371329
|
|
|
|
|
for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { |
1972
|
364702
|
|
|
|
|
const U16 forid = TRIE_LIST_ITEM( state, idx).forid; |
1973
|
16395
|
|
|
|
|
if ( forid < minid ) { |
1974
|
|
|
|
|
|
minid=forid; |
1975
|
16395
|
|
|
|
|
} else if ( forid > maxid ) { |
1976
|
|
|
|
|
|
maxid=forid; |
1977
|
|
|
|
|
|
} |
1978
|
|
|
|
|
|
} |
1979
|
16395
|
|
|
|
|
if ( transcount < tp + maxid - minid + 1) { |
1980
|
12817
|
|
|
|
|
transcount *= 2; |
1981
|
12817
|
|
|
|
|
trie->trans = (reg_trie_trans *) |
1982
|
12817
|
|
|
|
|
PerlMemShared_realloc( trie->trans, |
1983
|
|
|
|
|
|
transcount |
1984
|
|
|
|
|
|
* sizeof(reg_trie_trans) ); |
1985
|
16395
|
|
|
|
|
Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); |
1986
|
|
|
|
|
|
} |
1987
|
40630
|
|
|
|
|
base = trie->uniquecharcount + tp - minid; |
1988
|
16397
|
|
|
|
|
if ( maxid == minid ) { |
1989
|
|
|
|
|
|
U32 set = 0; |
1990
|
348309
|
|
|
|
|
for ( ; zp < tp ; zp++ ) { |
1991
|
348309
|
|
|
|
|
if ( ! trie->trans[ zp ].next ) { |
1992
|
12817
|
|
|
|
|
base = trie->uniquecharcount + zp - minid; |
1993
|
12817
|
|
|
|
|
trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; |
1994
|
12817
|
|
|
|
|
trie->trans[ zp ].check = state; |
1995
|
|
|
|
|
|
set = 1; |
1996
|
12817
|
|
|
|
|
break; |
1997
|
|
|
|
|
|
} |
1998
|
|
|
|
|
|
} |
1999
|
12817
|
|
|
|
|
if ( !set ) { |
2000
|
12817
|
|
|
|
|
trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; |
2001
|
2
|
|
|
|
|
trie->trans[ tp ].check = state; |
2002
|
630211
|
|
|
|
|
tp++; |
2003
|
|
|
|
|
|
zp = tp; |
2004
|
|
|
|
|
|
} |
2005
|
|
|
|
|
|
} else { |
2006
|
630211
|
|
|
|
|
for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { |
2007
|
630209
|
|
|
|
|
const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; |
2008
|
630209
|
|
|
|
|
trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; |
2009
|
630209
|
|
|
|
|
trie->trans[ tid ].check = state; |
2010
|
|
|
|
|
|
} |
2011
|
191162
|
|
|
|
|
tp += ( maxid - minid + 1 ); |
2012
|
|
|
|
|
|
} |
2013
|
630209
|
|
|
|
|
Safefree(trie->states[ state ].trans.list); |
2014
|
|
|
|
|
|
} |
2015
|
|
|
|
|
|
/* |
2016
|
|
|
|
|
|
DEBUG_TRIE_COMPILE_MORE_r( |
2017
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, " base: %d\n",base); |
2018
|
|
|
|
|
|
); |
2019
|
|
|
|
|
|
*/ |
2020
|
623187
|
|
|
|
|
trie->states[ state ].trans.base=base; |
2021
|
|
|
|
|
|
} |
2022
|
531169
|
|
|
|
|
trie->lasttrans = tp + 1; |
2023
|
|
|
|
|
|
} |
2024
|
|
|
|
|
|
} else { |
2025
|
|
|
|
|
|
/* |
2026
|
|
|
|
|
|
Second Pass -- Flat Table Representation. |
2027
|
|
|
|
|
|
|
2028
|
|
|
|
|
|
we dont use the 0 slot of either trans[] or states[] so we add 1 to each. |
2029
|
|
|
|
|
|
We know that we will need Charcount+1 trans at most to store the data |
2030
|
|
|
|
|
|
(one row per char at worst case) So we preallocate both structures |
2031
|
|
|
|
|
|
assuming worst case. |
2032
|
|
|
|
|
|
|
2033
|
|
|
|
|
|
We then construct the trie using only the .next slots of the entry |
2034
|
|
|
|
|
|
structs. |
2035
|
|
|
|
|
|
|
2036
|
|
|
|
|
|
We use the .check field of the first entry of the node temporarily to |
2037
|
|
|
|
|
|
make compression both faster and easier by keeping track of how many non |
2038
|
|
|
|
|
|
zero fields are in the node. |
2039
|
|
|
|
|
|
|
2040
|
|
|
|
|
|
Since trans are numbered from 1 any 0 pointer in the table is a FAIL |
2041
|
|
|
|
|
|
transition. |
2042
|
|
|
|
|
|
|
2043
|
|
|
|
|
|
There are two terms at use here: state as a TRIE_NODEIDX() which is a |
2044
|
|
|
|
|
|
number representing the first entry of the node, and state as a |
2045
|
|
|
|
|
|
TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and |
2046
|
|
|
|
|
|
TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there |
2047
|
|
|
|
|
|
are 2 entrys per node. eg: |
2048
|
|
|
|
|
|
|
2049
|
|
|
|
|
|
A B A B |
2050
|
|
|
|
|
|
1. 2 4 1. 3 7 |
2051
|
|
|
|
|
|
2. 0 3 3. 0 5 |
2052
|
|
|
|
|
|
3. 0 0 5. 0 0 |
2053
|
|
|
|
|
|
4. 0 0 7. 0 0 |
2054
|
|
|
|
|
|
|
2055
|
|
|
|
|
|
The table is internally in the right hand, idx form. However as we also |
2056
|
|
|
|
|
|
have to deal with the states array which is indexed by nodenum we have to |
2057
|
|
|
|
|
|
use TRIE_NODENUM() to convert. |
2058
|
|
|
|
|
|
|
2059
|
|
|
|
|
|
*/ |
2060
|
90126
|
|
|
|
|
DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, |
2061
|
|
|
|
|
|
"%*sCompiling trie using table compiler\n", |
2062
|
|
|
|
|
|
(int)depth * 2 + 2, "")); |
2063
|
|
|
|
|
|
|
2064
|
90126
|
|
|
|
|
trie->trans = (reg_trie_trans *) |
2065
|
90126
|
|
|
|
|
PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) |
2066
|
|
|
|
|
|
* trie->uniquecharcount + 1, |
2067
|
|
|
|
|
|
sizeof(reg_trie_trans) ); |
2068
|
90126
|
|
|
|
|
trie->states = (reg_trie_state *) |
2069
|
540099
|
|
|
|
|
PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, |
2070
|
|
|
|
|
|
sizeof(reg_trie_state) ); |
2071
|
630218
|
|
|
|
|
next_alloc = trie->uniquecharcount + 1; |
2072
|
|
|
|
|
|
|
2073
|
|
|
|
|
|
|
2074
|
2247419
|
|
|
|
|
for ( cur = first ; cur < last ; cur = regnext( cur ) ) { |
2075
|
|
|
|
|
|
|
2076
|
1927806
|
|
|
|
|
regnode *noper = NEXTOPER( cur ); |
2077
|
5990
|
|
|
|
|
const U8 *uc = (U8*)STRING( noper ); |
2078
|
1921872
|
|
|
|
|
const U8 *e = uc + STR_LEN( noper ); |
2079
|
|
|
|
|
|
|
2080
|
|
|
|
|
|
U32 state = 1; /* required init */ |
2081
|
|
|
|
|
|
|
2082
|
|
|
|
|
|
U16 charid = 0; /* sanity init */ |
2083
|
|
|
|
|
|
U32 accept_state = 0; /* sanity init */ |
2084
|
|
|
|
|
|
|
2085
|
|
|
|
|
|
U32 wordlen = 0; /* required init */ |
2086
|
|
|
|
|
|
|
2087
|
8727781
|
|
|
|
|
if (OP(noper) == NOTHING) { |
2088
|
7782298
|
|
|
|
|
regnode *noper_next= regnext(noper); |
2089
|
7782298
|
|
|
|
|
if (noper_next != tail && OP(noper_next) == flags) { |
2090
|
|
|
|
|
|
noper = noper_next; |
2091
|
6010852
|
|
|
|
|
uc= (U8*)STRING(noper); |
2092
|
6010852
|
|
|
|
|
e= uc + STR_LEN(noper); |
2093
|
|
|
|
|
|
} |
2094
|
|
|
|
|
|
} |
2095
|
|
|
|
|
|
|
2096
|
1921872
|
|
|
|
|
if ( OP(noper) != NOTHING ) { |
2097
|
630633
|
|
|
|
|
for ( ; uc < e ; uc += len ) { |
2098
|
|
|
|
|
|
|
2099
|
630395
|
|
|
|
|
TRIE_READ_CHAR; |
2100
|
|
|
|
|
|
|
2101
|
630395
|
|
|
|
|
if ( uvc < 256 ) { |
2102
|
941001
|
|
|
|
|
charid = trie->charmap[ uvc ]; |
2103
|
|
|
|
|
|
} else { |
2104
|
630211
|
|
|
|
|
SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); |
2105
|
25678
|
|
|
|
|
charid = svpp ? (U16)SvIV(*svpp) : 0; |
2106
|
|
|
|
|
|
} |
2107
|
25862
|
|
|
|
|
if ( charid ) { |
2108
|
25862
|
|
|
|
|
charid--; |
2109
|
25862
|
|
|
|
|
if ( !trie->trans[ state + charid ].next ) { |
2110
|
25778
|
|
|
|
|
trie->trans[ state + charid ].next = next_alloc; |
2111
|
25778
|
|
|
|
|
trie->trans[ state ].check++; |
2112
|
25778
|
|
|
|
|
prev_states[TRIE_NODENUM(next_alloc)] |
2113
|
25778
|
|
|
|
|
= TRIE_NODENUM(state); |
2114
|
25778
|
|
|
|
|
next_alloc += trie->uniquecharcount; |
2115
|
|
|
|
|
|
} |
2116
|
25862
|
|
|
|
|
state = trie->trans[ state + charid ].next; |
2117
|
|
|
|
|
|
} else { |
2118
|
25678
|
|
|
|
|
Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); |
2119
|
|
|
|
|
|
} |
2120
|
|
|
|
|
|
/* charid is now 0 if we dont know the char read, or nonzero if we do */ |
2121
|
|
|
|
|
|
} |
2122
|
|
|
|
|
|
} |
2123
|
25734
|
|
|
|
|
accept_state = TRIE_NODENUM( state ); |
2124
|
25734
|
|
|
|
|
TRIE_HANDLE_WORD(accept_state); |
2125
|
|
|
|
|
|
|
2126
|
|
|
|
|
|
} /* end second pass */ |
2127
|
|
|
|
|
|
|
2128
|
|
|
|
|
|
/* and now dump it out before we compress it */ |
2129
|
25686
|
|
|
|
|
DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, |
2130
|
|
|
|
|
|
revcharmap, |
2131
|
|
|
|
|
|
next_alloc, depth+1)); |
2132
|
|
|
|
|
|
|
2133
|
|
|
|
|
|
{ |
2134
|
|
|
|
|
|
/* |
2135
|
|
|
|
|
|
* Inplace compress the table.* |
2136
|
|
|
|
|
|
|
2137
|
|
|
|
|
|
For sparse data sets the table constructed by the trie algorithm will |
2138
|
|
|
|
|
|
be mostly 0/FAIL transitions or to put it another way mostly empty. |
2139
|
|
|
|
|
|
(Note that leaf nodes will not contain any transitions.) |
2140
|
|
|
|
|
|
|
2141
|
|
|
|
|
|
This algorithm compresses the tables by eliminating most such |
2142
|
|
|
|
|
|
transitions, at the cost of a modest bit of extra work during lookup: |
2143
|
|
|
|
|
|
|
2144
|
|
|
|
|
|
- Each states[] entry contains a .base field which indicates the |
2145
|
|
|
|
|
|
index in the state[] array wheres its transition data is stored. |
2146
|
|
|
|
|
|
|
2147
|
|
|
|
|
|
- If .base is 0 there are no valid transitions from that node. |
2148
|
|
|
|
|
|
|
2149
|
|
|
|
|
|
- If .base is nonzero then charid is added to it to find an entry in |
2150
|
|
|
|
|
|
the trans array. |
2151
|
|
|
|
|
|
|
2152
|
|
|
|
|
|
-If trans[states[state].base+charid].check!=state then the |
2153
|
|
|
|
|
|
transition is taken to be a 0/Fail transition. Thus if there are fail |
2154
|
|
|
|
|
|
transitions at the front of the node then the .base offset will point |
2155
|
|
|
|
|
|
somewhere inside the previous nodes data (or maybe even into a node |
2156
|
|
|
|
|
|
even earlier), but the .check field determines if the transition is |
2157
|
|
|
|
|
|
valid. |
2158
|
|
|
|
|
|
|
2159
|
|
|
|
|
|
XXX - wrong maybe? |
2160
|
|
|
|
|
|
The following process inplace converts the table to the compressed |
2161
|
|
|
|
|
|
table: We first do not compress the root node 1,and mark all its |
2162
|
|
|
|
|
|
.check pointers as 1 and set its .base pointer as 1 as well. This |
2163
|
|
|
|
|
|
allows us to do a DFA construction from the compressed table later, |
2164
|
|
|
|
|
|
and ensures that any .base pointers we calculate later are greater |
2165
|
|
|
|
|
|
than 0. |
2166
|
|
|
|
|
|
|
2167
|
|
|
|
|
|
- We set 'pos' to indicate the first entry of the second node. |
2168
|
|
|
|
|
|
|
2169
|
|
|
|
|
|
- We then iterate over the columns of the node, finding the first and |
2170
|
|
|
|
|
|
last used entry at l and m. We then copy l..m into pos..(pos+m-l), |
2171
|
|
|
|
|
|
and set the .check pointers accordingly, and advance pos |
2172
|
|
|
|
|
|
appropriately and repreat for the next node. Note that when we copy |
2173
|
|
|
|
|
|
the next pointers we have to convert them from the original |
2174
|
|
|
|
|
|
NODEIDX form to NODENUM form as the former is not valid post |
2175
|
|
|
|
|
|
compression. |
2176
|
|
|
|
|
|
|
2177
|
|
|
|
|
|
- If a node has no transitions used we mark its base as 0 and do not |
2178
|
|
|
|
|
|
advance the pos pointer. |
2179
|
|
|
|
|
|
|
2180
|
|
|
|
|
|
- If a node only has one transition we use a second pointer into the |
2181
|
|
|
|
|
|
structure to fill in allocated fail transitions from other states. |
2182
|
|
|
|
|
|
This pointer is independent of the main pointer and scans forward |
2183
|
|
|
|
|
|
looking for null transitions that are allocated to a state. When it |
2184
|
|
|
|
|
|
finds one it writes the single transition into the "hole". If the |
2185
|
|
|
|
|
|
pointer doesnt find one the single transition is appended as normal. |
2186
|
|
|
|
|
|
|
2187
|
|
|
|
|
|
- Once compressed we can Renew/realloc the structures to release the |
2188
|
|
|
|
|
|
excess space. |
2189
|
|
|
|
|
|
|
2190
|
|
|
|
|
|
See "Table-Compression Methods" in sec 3.9 of the Red Dragon, |
2191
|
|
|
|
|
|
specifically Fig 3.47 and the associated pseudocode. |
2192
|
|
|
|
|
|
|
2193
|
|
|
|
|
|
demq |
2194
|
|
|
|
|
|
*/ |
2195
|
25686
|
|
|
|
|
const U32 laststate = TRIE_NODENUM( next_alloc ); |
2196
|
|
|
|
|
|
U32 state, charid; |
2197
|
|
|
|
|
|
U32 pos = 0, zp=0; |
2198
|
25686
|
|
|
|
|
trie->statecount = laststate; |
2199
|
|
|
|
|
|
|
2200
|
25794
|
|
|
|
|
for ( state = 1 ; state < laststate ; state++ ) { |
2201
|
|
|
|
|
|
U8 flag = 0; |
2202
|
25786
|
|
|
|
|
const U32 stateidx = TRIE_NODEIDX( state ); |
2203
|
25786
|
|
|
|
|
const U32 o_used = trie->trans[ stateidx ].check; |
2204
|
330712
|
|
|
|
|
U32 used = trie->trans[ stateidx ].check; |
2205
|
305034
|
|
|
|
|
trie->trans[ stateidx ].check = 0; |
2206
|
|
|
|
|
|
|
2207
|
305684
|
|
|
|
|
for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { |
2208
|
145700
|
|
|
|
|
if ( flag || trie->trans[ stateidx + charid ].next ) { |
2209
|
145242
|
|
|
|
|
if ( trie->trans[ stateidx + charid ].next ) { |
2210
|
436852
|
|
|
|
|
if (o_used == 1) { |
2211
|
411092
|
|
|
|
|
for ( ; zp < pos ; zp++ ) { |
2212
|
411122
|
|
|
|
|
if ( ! trie->trans[ zp ].next ) { |
2213
|
|
|
|
|
|
break; |
2214
|
|
|
|
|
|
} |
2215
|
|
|
|
|
|
} |
2216
|
1566276129
|
|
|
|
|
trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; |
2217
|
1565852036
|
|
|
|
|
trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); |
2218
|
1565852036
|
|
|
|
|
trie->trans[ zp ].check = state; |
2219
|
310038
|
|
|
|
|
if ( ++zp > pos ) pos = zp; |
2220
|
|
|
|
|
|
break; |
2221
|
|
|
|
|
|
} |
2222
|
310058
|
|
|
|
|
used--; |
2223
|
|
|
|
|
|
} |
2224
|
310190
|
|
|
|
|
if ( !flag ) { |
2225
|
|
|
|
|
|
flag = 1; |
2226
|
266082
|
|
|
|
|
trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; |
2227
|
|
|
|
|
|
} |
2228
|
266256
|
|
|
|
|
trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); |
2229
|
266256
|
|
|
|
|
trie->trans[ pos ].check = state; |
2230
|
216
|
|
|
|
|
pos++; |
2231
|
|
|
|
|
|
} |
2232
|
|
|
|
|
|
} |
2233
|
|
|
|
|
|
} |
2234
|
266072
|
|
|
|
|
trie->lasttrans = pos + 1; |
2235
|
25686
|
|
|
|
|
trie->states = (reg_trie_state *) |
2236
|
25686
|
|
|
|
|
PerlMemShared_realloc( trie->states, laststate |
2237
|
|
|
|
|
|
* sizeof(reg_trie_state) ); |
2238
|
25686
|
|
|
|
|
DEBUG_TRIE_COMPILE_MORE_r( |
2239
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
2240
|
|
|
|
|
|
"%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", |
2241
|
|
|
|
|
|
(int)depth * 2 + 2,"", |
2242
|
|
|
|
|
|
(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), |
2243
|
|
|
|
|
|
(IV)next_alloc, |
2244
|
|
|
|
|
|
(IV)pos, |
2245
|
|
|
|
|
|
( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); |
2246
|
|
|
|
|
|
); |
2247
|
|
|
|
|
|
|
2248
|
|
|
|
|
|
} /* end table compress */ |
2249
|
|
|
|
|
|
} |
2250
|
14511669
|
|
|
|
|
DEBUG_TRIE_COMPILE_MORE_r( |
2251
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", |
2252
|
|
|
|
|
|
(int)depth * 2 + 2, "", |
2253
|
|
|
|
|
|
(UV)trie->statecount, |
2254
|
|
|
|
|
|
(UV)trie->lasttrans) |
2255
|
|
|
|
|
|
); |
2256
|
|
|
|
|
|
/* resize the trans array to remove unused space */ |
2257
|
14511669
|
|
|
|
|
trie->trans = (reg_trie_trans *) |
2258
|
14511669
|
|
|
|
|
PerlMemShared_realloc( trie->trans, trie->lasttrans |
2259
|
|
|
|
|
|
* sizeof(reg_trie_trans) ); |
2260
|
|
|
|
|
|
|
2261
|
|
|
|
|
|
{ /* Modify the program and insert the new TRIE node */ |
2262
|
22779147
|
|
|
|
|
U8 nodetype =(U8)(flags & 0xFF); |
2263
|
|
|
|
|
|
char *str=NULL; |
2264
|
|
|
|
|
|
|
2265
|
|
|
|
|
|
#ifdef DEBUGGING |
2266
|
|
|
|
|
|
regnode *optimize = NULL; |
2267
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
2268
|
|
|
|
|
|
|
2269
|
|
|
|
|
|
U32 mjd_offset = 0; |
2270
|
|
|
|
|
|
U32 mjd_nodelen = 0; |
2271
|
|
|
|
|
|
#endif /* RE_TRACK_PATTERN_OFFSETS */ |
2272
|
|
|
|
|
|
#endif /* DEBUGGING */ |
2273
|
|
|
|
|
|
/* |
2274
|
|
|
|
|
|
This means we convert either the first branch or the first Exact, |
2275
|
|
|
|
|
|
depending on whether the thing following (in 'last') is a branch |
2276
|
|
|
|
|
|
or not and whther first is the startbranch (ie is it a sub part of |
2277
|
|
|
|
|
|
the alternation or is it the whole thing.) |
2278
|
|
|
|
|
|
Assuming its a sub part we convert the EXACT otherwise we convert |
2279
|
|
|
|
|
|
the whole branch sequence, including the first. |
2280
|
|
|
|
|
|
*/ |
2281
|
|
|
|
|
|
/* Find the node we are going to overwrite */ |
2282
|
13896921
|
|
|
|
|
if ( first != startbranch || OP( last ) == BRANCH ) { |
2283
|
|
|
|
|
|
/* branch sub-chain */ |
2284
|
12795297
|
|
|
|
|
NEXT_OFF( first ) = (U16)(last - first); |
2285
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
2286
|
1111076
|
|
|
|
|
DEBUG_r({ |
2287
|
|
|
|
|
|
mjd_offset= Node_Offset((convert)); |
2288
|
|
|
|
|
|
mjd_nodelen= Node_Length((convert)); |
2289
|
|
|
|
|
|
}); |
2290
|
|
|
|
|
|
#endif |
2291
|
|
|
|
|
|
/* whole branch chain */ |
2292
|
|
|
|
|
|
} |
2293
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
2294
|
|
|
|
|
|
else { |
2295
|
1111084
|
|
|
|
|
DEBUG_r({ |
2296
|
|
|
|
|
|
const regnode *nop = NEXTOPER( convert ); |
2297
|
|
|
|
|
|
mjd_offset= Node_Offset((nop)); |
2298
|
|
|
|
|
|
mjd_nodelen= Node_Length((nop)); |
2299
|
|
|
|
|
|
}); |
2300
|
|
|
|
|
|
} |
2301
|
1111084
|
|
|
|
|
DEBUG_OPTIMISE_r( |
2302
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", |
2303
|
|
|
|
|
|
(int)depth * 2 + 2, "", |
2304
|
|
|
|
|
|
(UV)mjd_offset, (UV)mjd_nodelen) |
2305
|
|
|
|
|
|
); |
2306
|
|
|
|
|
|
#endif |
2307
|
|
|
|
|
|
/* But first we check to see if there is a common prefix we can |
2308
|
|
|
|
|
|
split out as an EXACT and put in front of the TRIE node. */ |
2309
|
1111084
|
|
|
|
|
trie->startstate= 1; |
2310
|
1101624
|
|
|
|
|
if ( trie->bitmap && !widecharmap && !trie->jump ) { |
2311
|
|
|
|
|
|
U32 state; |
2312
|
1101622
|
|
|
|
|
for ( state = 1 ; state < trie->statecount-1 ; state++ ) { |
2313
|
|
|
|
|
|
U32 ofs = 0; |
2314
|
|
|
|
|
|
I32 idx = -1; |
2315
|
|
|
|
|
|
U32 count = 0; |
2316
|
1101628
|
|
|
|
|
const U32 base = trie->states[ state ].trans.base; |
2317
|
|
|
|
|
|
|
2318
|
9472
|
|
|
|
|
if ( trie->states[state].wordnum ) |
2319
|
|
|
|
|
|
count = 1; |
2320
|
|
|
|
|
|
|
2321
|
9488
|
|
|
|
|
for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { |
2322
|
9556
|
|
|
|
|
if ( ( base + ofs >= trie->uniquecharcount ) && |
2323
|
9556
|
|
|
|
|
( base + ofs - trie->uniquecharcount < trie->lasttrans ) && |
2324
|
9050
|
|
|
|
|
trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) |
2325
|
|
|
|
|
|
{ |
2326
|
9012
|
|
|
|
|
if ( ++count > 1 ) { |
2327
|
9002
|
|
|
|
|
SV **tmp = av_fetch( revcharmap, ofs, 0); |
2328
|
9002
|
|
|
|
|
const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); |
2329
|
14511677
|
|
|
|
|
if ( state == 1 ) break; |
2330
|
14511673
|
|
|
|
|
if ( count == 2 ) { |
2331
|
14511663
|
|
|
|
|
Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); |
2332
|
1832660
|
|
|
|
|
DEBUG_OPTIMISE_r( |
2333
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2334
|
|
|
|
|
|
"%*sNew Start State=%"UVuf" Class: [", |
2335
|
|
|
|
|
|
(int)depth * 2 + 2, "", |
2336
|
|
|
|
|
|
(UV)state)); |
2337
|
1832660
|
|
|
|
|
if (idx >= 0) { |
2338
|
1832660
|
|
|
|
|
SV ** const tmp = av_fetch( revcharmap, idx, 0); |
2339
|
1327376
|
|
|
|
|
const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); |
2340
|
|
|
|
|
|
|
2341
|
529534
|
|
|
|
|
TRIE_BITMAP_SET(trie,*ch); |
2342
|
529534
|
|
|
|
|
if ( folder ) |
2343
|
303114
|
|
|
|
|
TRIE_BITMAP_SET(trie, folder[ *ch ]); |
2344
|
303116
|
|
|
|
|
DEBUG_OPTIMISE_r( |
2345
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s", (char*)ch) |
2346
|
|
|
|
|
|
); |
2347
|
|
|
|
|
|
} |
2348
|
|
|
|
|
|
} |
2349
|
226430
|
|
|
|
|
TRIE_BITMAP_SET(trie,*ch); |
2350
|
48448
|
|
|
|
|
if ( folder ) |
2351
|
47878
|
|
|
|
|
TRIE_BITMAP_SET(trie,folder[ *ch ]); |
2352
|
46774
|
|
|
|
|
DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); |
2353
|
|
|
|
|
|
} |
2354
|
46784
|
|
|
|
|
idx = ofs; |
2355
|
|
|
|
|
|
} |
2356
|
|
|
|
|
|
} |
2357
|
46774
|
|
|
|
|
if ( count == 1 ) { |
2358
|
179662
|
|
|
|
|
SV **tmp = av_fetch( revcharmap, idx, 0); |
2359
|
|
|
|
|
|
STRLEN len; |
2360
|
269490
|
|
|
|
|
char *ch = SvPV( *tmp, len ); |
2361
|
179662
|
|
|
|
|
DEBUG_OPTIMISE_r({ |
2362
|
|
|
|
|
|
SV *sv=sv_newmortal(); |
2363
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
2364
|
|
|
|
|
|
"%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", |
2365
|
|
|
|
|
|
(int)depth * 2 + 2, "", |
2366
|
|
|
|
|
|
(UV)state, (UV)idx, |
2367
|
|
|
|
|
|
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, |
2368
|
|
|
|
|
|
PL_colors[0], PL_colors[1], |
2369
|
|
|
|
|
|
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | |
2370
|
|
|
|
|
|
PERL_PV_ESCAPE_FIRSTCHAR |
2371
|
|
|
|
|
|
) |
2372
|
|
|
|
|
|
); |
2373
|
|
|
|
|
|
}); |
2374
|
158640
|
|
|
|
|
if ( state==1 ) { |
2375
|
137614
|
|
|
|
|
OP( convert ) = nodetype; |
2376
|
137614
|
|
|
|
|
str=STRING(convert); |
2377
|
104372
|
|
|
|
|
STR_LEN(convert)=0; |
2378
|
|
|
|
|
|
} |
2379
|
73492
|
|
|
|
|
STR_LEN(convert) += len; |
2380
|
11178
|
|
|
|
|
while (len--) |
2381
|
11166
|
|
|
|
|
*str++ = *ch++; |
2382
|
|
|
|
|
|
} else { |
2383
|
|
|
|
|
|
#ifdef DEBUGGING |
2384
|
62332
|
|
|
|
|
if (state>1) |
2385
|
62328
|
|
|
|
|
DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); |
2386
|
|
|
|
|
|
#endif |
2387
|
|
|
|
|
|
break; |
2388
|
|
|
|
|
|
} |
2389
|
|
|
|
|
|
} |
2390
|
372401
|
|
|
|
|
trie->prefixlen = (state-1); |
2391
|
1034822
|
|
|
|
|
if (str) { |
2392
|
100918
|
|
|
|
|
regnode *n = convert+NODE_SZ_STR(convert); |
2393
|
64648
|
|
|
|
|
NEXT_OFF(convert) = NODE_SZ_STR(convert); |
2394
|
17018
|
|
|
|
|
trie->startstate = state; |
2395
|
17018
|
|
|
|
|
trie->minlen -= (state - 1); |
2396
|
17018
|
|
|
|
|
trie->maxlen -= (state - 1); |
2397
|
|
|
|
|
|
#ifdef DEBUGGING |
2398
|
|
|
|
|
|
/* At least the UNICOS C compiler choked on this |
2399
|
|
|
|
|
|
* being argument to DEBUG_r(), so let's just have |
2400
|
|
|
|
|
|
* it right here. */ |
2401
|
|
|
|
|
|
if ( |
2402
|
|
|
|
|
|
#ifdef PERL_EXT_RE_BUILD |
2403
|
|
|
|
|
|
1 |
2404
|
|
|
|
|
|
#else |
2405
|
|
|
|
|
|
DEBUG_r_TEST |
2406
|
|
|
|
|
|
#endif |
2407
|
|
|
|
|
|
) { |
2408
|
|
|
|
|
|
regnode *fix = convert; |
2409
|
47632
|
|
|
|
|
U32 word = trie->wordcount; |
2410
|
47632
|
|
|
|
|
mjd_nodelen++; |
2411
|
981532
|
|
|
|
|
Set_Node_Offset_Length(convert, mjd_offset, state - 1); |
2412
|
526450
|
|
|
|
|
while( ++fix < n ) { |
2413
|
1994023
|
|
|
|
|
Set_Node_Offset_Length(fix, 0, 0); |
2414
|
|
|
|
|
|
} |
2415
|
1224706
|
|
|
|
|
while (word--) { |
2416
|
1224704
|
|
|
|
|
SV ** const tmp = av_fetch( trie_words, word, 0 ); |
2417
|
1172530
|
|
|
|
|
if (tmp) { |
2418
|
18088
|
|
|
|
|
if ( STR_LEN(convert) <= SvCUR(*tmp) ) |
2419
|
1172530
|
|
|
|
|
sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); |
2420
|
|
|
|
|
|
else |
2421
|
1172518
|
|
|
|
|
sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); |
2422
|
|
|
|
|
|
} |
2423
|
|
|
|
|
|
} |
2424
|
|
|
|
|
|
} |
2425
|
|
|
|
|
|
#endif |
2426
|
52176
|
|
|
|
|
if (trie->maxlen) { |
2427
|
|
|
|
|
|
convert = n; |
2428
|
|
|
|
|
|
} else { |
2429
|
49618
|
|
|
|
|
NEXT_OFF(convert) = (U16)(tail - convert); |
2430
|
|
|
|
|
|
DEBUG_r(optimize= n); |
2431
|
|
|
|
|
|
} |
2432
|
|
|
|
|
|
} |
2433
|
|
|
|
|
|
} |
2434
|
46010
|
|
|
|
|
if (!jumper) |
2435
|
|
|
|
|
|
jumper = last; |
2436
|
41578
|
|
|
|
|
if ( trie->maxlen ) { |
2437
|
39484
|
|
|
|
|
NEXT_OFF( convert ) = (U16)(tail - convert); |
2438
|
52182
|
|
|
|
|
ARG_SET( convert, data_slot ); |
2439
|
|
|
|
|
|
/* Store the offset to the first unabsorbed branch in |
2440
|
|
|
|
|
|
jump[0], which is otherwise unused by the jump logic. |
2441
|
|
|
|
|
|
We use this when dumping a trie and during optimisation. */ |
2442
|
703241
|
|
|
|
|
if (trie->jump) |
2443
|
14511661
|
|
|
|
|
trie->jump[0] = (U16)(nextbranch - convert); |
2444
|
|
|
|
|
|
|
2445
|
|
|
|
|
|
/* If the start state is not accepting (meaning there is no empty string/NOTHING) |
2446
|
|
|
|
|
|
* and there is a bitmap |
2447
|
|
|
|
|
|
* and the first "jump target" node we found leaves enough room |
2448
|
|
|
|
|
|
* then convert the TRIE node into a TRIEC node, with the bitmap |
2449
|
|
|
|
|
|
* embedded inline in the opcode - this is hypothetically faster. |
2450
|
|
|
|
|
|
*/ |
2451
|
17472848
|
|
|
|
|
if ( !trie->states[trie->startstate].wordnum |
2452
|
17472846
|
|
|
|
|
&& trie->bitmap |
2453
|
17472844
|
|
|
|
|
&& ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) |
2454
|
|
|
|
|
|
{ |
2455
|
17472842
|
|
|
|
|
OP( convert ) = TRIEC; |
2456
|
17472842
|
|
|
|
|
Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); |
2457
|
21805354
|
|
|
|
|
PerlMemShared_free(trie->bitmap); |
2458
|
9141715
|
|
|
|
|
trie->bitmap= NULL; |
2459
|
|
|
|
|
|
} else |
2460
|
643351
|
|
|
|
|
OP( convert ) = TRIE; |
2461
|
|
|
|
|
|
|
2462
|
|
|
|
|
|
/* store the type in the flags */ |
2463
|
68931023
|
|
|
|
|
convert->flags = nodetype; |
2464
|
53853883
|
|
|
|
|
DEBUG_r({ |
2465
|
|
|
|
|
|
optimize = convert |
2466
|
|
|
|
|
|
+ NODE_STEP_REGNODE |
2467
|
|
|
|
|
|
+ regarglen[ OP( convert ) ]; |
2468
|
|
|
|
|
|
}); |
2469
|
|
|
|
|
|
/* XXX We really should free up the resource in trie now, |
2470
|
|
|
|
|
|
as we won't use them - (which resources?) dmq */ |
2471
|
|
|
|
|
|
} |
2472
|
|
|
|
|
|
/* needed for dumping*/ |
2473
|
53853883
|
|
|
|
|
DEBUG_r(if (optimize) { |
2474
|
|
|
|
|
|
regnode *opt = convert; |
2475
|
|
|
|
|
|
|
2476
|
|
|
|
|
|
while ( ++opt < optimize) { |
2477
|
|
|
|
|
|
Set_Node_Offset_Length(opt,0,0); |
2478
|
|
|
|
|
|
} |
2479
|
|
|
|
|
|
/* |
2480
|
|
|
|
|
|
Try to clean up some of the debris left after the |
2481
|
|
|
|
|
|
optimisation. |
2482
|
|
|
|
|
|
*/ |
2483
|
|
|
|
|
|
while( optimize < jumper ) { |
2484
|
|
|
|
|
|
mjd_nodelen += Node_Length((optimize)); |
2485
|
|
|
|
|
|
OP( optimize ) = OPTIMIZED; |
2486
|
|
|
|
|
|
Set_Node_Offset_Length(optimize,0,0); |
2487
|
|
|
|
|
|
optimize++; |
2488
|
|
|
|
|
|
} |
2489
|
|
|
|
|
|
Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); |
2490
|
|
|
|
|
|
}); |
2491
|
|
|
|
|
|
} /* end node insert */ |
2492
|
|
|
|
|
|
|
2493
|
|
|
|
|
|
/* Finish populating the prev field of the wordinfo array. Walk back |
2494
|
|
|
|
|
|
* from each accept state until we find another accept state, and if |
2495
|
|
|
|
|
|
* so, point the first word's .prev field at the second word. If the |
2496
|
|
|
|
|
|
* second already has a .prev field set, stop now. This will be the |
2497
|
|
|
|
|
|
* case either if we've already processed that word's accept state, |
2498
|
|
|
|
|
|
* or that state had multiple words, and the overspill words were |
2499
|
|
|
|
|
|
* already linked up earlier. |
2500
|
|
|
|
|
|
*/ |
2501
|
|
|
|
|
|
{ |
2502
|
|
|
|
|
|
U16 word; |
2503
|
|
|
|
|
|
U32 state; |
2504
|
|
|
|
|
|
U16 prev; |
2505
|
|
|
|
|
|
|
2506
|
53853931
|
|
|
|
|
for (word=1; word <= trie->wordcount; word++) { |
2507
|
|
|
|
|
|
prev = 0; |
2508
|
53853931
|
|
|
|
|
if (trie->wordinfo[word].prev) |
2509
|
102878242
|
|
|
|
|
continue; |
2510
|
51439177
|
|
|
|
|
state = trie->wordinfo[word].accept; |
2511
|
51439377
|
|
|
|
|
while (state) { |
2512
|
51439321
|
|
|
|
|
state = prev_states[state]; |
2513
|
51439321
|
|
|
|
|
if (!state) |
2514
|
|
|
|
|
|
break; |
2515
|
77481289
|
|
|
|
|
prev = trie->states[state].wordnum; |
2516
|
46795196
|
|
|
|
|
if (prev) |
2517
|
|
|
|
|
|
break; |
2518
|
|
|
|
|
|
} |
2519
|
46051257
|
|
|
|
|
trie->wordinfo[word].prev = prev; |
2520
|
|
|
|
|
|
} |
2521
|
747025
|
|
|
|
|
Safefree(prev_states); |
2522
|
|
|
|
|
|
} |
2523
|
|
|
|
|
|
|
2524
|
|
|
|
|
|
|
2525
|
|
|
|
|
|
/* and now dump out the compressed format */ |
2526
|
747025
|
|
|
|
|
DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); |
2527
|
|
|
|
|
|
|
2528
|
51439129
|
|
|
|
|
RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; |
2529
|
|
|
|
|
|
#ifdef DEBUGGING |
2530
|
238911
|
|
|
|
|
RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; |
2531
|
51200226
|
|
|
|
|
RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; |
2532
|
|
|
|
|
|
#else |
2533
|
|
|
|
|
|
SvREFCNT_dec_NN(revcharmap); |
2534
|
|
|
|
|
|
#endif |
2535
|
53853891
|
|
|
|
|
return trie->jump |
2536
|
|
|
|
|
|
? MADE_JUMP_TRIE |
2537
|
52728374
|
|
|
|
|
: trie->startstate>1 |
2538
|
|
|
|
|
|
? MADE_EXACT_TRIE |
2539
|
|
|
|
|
|
: MADE_TRIE; |
2540
|
|
|
|
|
|
} |
2541
|
|
|
|
|
|
|
2542
|
|
|
|
|
|
STATIC void |
2543
|
1129043
|
|
|
|
|
S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) |
2544
|
|
|
|
|
|
{ |
2545
|
|
|
|
|
|
/* The Trie is constructed and compressed now so we can build a fail array if it's needed |
2546
|
|
|
|
|
|
|
2547
|
|
|
|
|
|
This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the |
2548
|
|
|
|
|
|
"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 |
2549
|
|
|
|
|
|
ISBN 0-201-10088-6 |
2550
|
|
|
|
|
|
|
2551
|
|
|
|
|
|
We find the fail state for each state in the trie, this state is the longest proper |
2552
|
|
|
|
|
|
suffix of the current state's 'word' that is also a proper prefix of another word in our |
2553
|
|
|
|
|
|
trie. State 1 represents the word '' and is thus the default fail state. This allows |
2554
|
|
|
|
|
|
the DFA not to have to restart after its tried and failed a word at a given point, it |
2555
|
|
|
|
|
|
simply continues as though it had been matching the other word in the first place. |
2556
|
|
|
|
|
|
Consider |
2557
|
|
|
|
|
|
'abcdgu'=~/abcdefg|cdgu/ |
2558
|
|
|
|
|
|
When we get to 'd' we are still matching the first word, we would encounter 'g' which would |
2559
|
|
|
|
|
|
fail, which would bring us to the state representing 'd' in the second word where we would |
2560
|
|
|
|
|
|
try 'g' and succeed, proceeding to match 'cdgu'. |
2561
|
|
|
|
|
|
*/ |
2562
|
|
|
|
|
|
/* add a fail transition */ |
2563
|
1129043
|
|
|
|
|
const U32 trie_offset = ARG(source); |
2564
|
1129043
|
|
|
|
|
reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; |
2565
|
|
|
|
|
|
U32 *q; |
2566
|
1129043
|
|
|
|
|
const U32 ucharcount = trie->uniquecharcount; |
2567
|
1129043
|
|
|
|
|
const U32 numstates = trie->statecount; |
2568
|
723937
|
|
|
|
|
const U32 ubound = trie->lasttrans + ucharcount; |
2569
|
|
|
|
|
|
U32 q_read = 0; |
2570
|
|
|
|
|
|
U32 q_write = 0; |
2571
|
|
|
|
|
|
U32 charid; |
2572
|
1129043
|
|
|
|
|
U32 base = trie->states[ 1 ].trans.base; |
2573
|
|
|
|
|
|
U32 *fail; |
2574
|
|
|
|
|
|
reg_ac_data *aho; |
2575
|
929480
|
|
|
|
|
const U32 data_slot = add_data( pRExC_state, 1, "T" ); |
2576
|
4607645
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
2577
|
|
|
|
|
|
|
2578
|
3478606
|
|
|
|
|
PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; |
2579
|
|
|
|
|
|
#ifndef DEBUGGING |
2580
|
|
|
|
|
|
PERL_UNUSED_ARG(depth); |
2581
|
|
|
|
|
|
#endif |
2582
|
|
|
|
|
|
|
2583
|
|
|
|
|
|
|
2584
|
3478606
|
|
|
|
|
ARG_SET( stclass, data_slot ); |
2585
|
3478606
|
|
|
|
|
aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); |
2586
|
3468846
|
|
|
|
|
RExC_rxi->data->data[ data_slot ] = (void*)aho; |
2587
|
3468846
|
|
|
|
|
aho->trie=trie_offset; |
2588
|
9764
|
|
|
|
|
aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); |
2589
|
3478606
|
|
|
|
|
Copy( trie->states, aho->states, numstates, reg_trie_state ); |
2590
|
3478606
|
|
|
|
|
Newxz( q, numstates, U32); |
2591
|
3478606
|
|
|
|
|
aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); |
2592
|
3478606
|
|
|
|
|
aho->refcount = 1; |
2593
|
5398
|
|
|
|
|
fail = aho->fail; |
2594
|
|
|
|
|
|
/* initialize fail[0..1] to be 1 so that we always have |
2595
|
|
|
|
|
|
a valid final fail state */ |
2596
|
3478606
|
|
|
|
|
fail[ 0 ] = fail[ 1 ] = 1; |
2597
|
|
|
|
|
|
|
2598
|
2227890
|
|
|
|
|
for ( charid = 0; charid < ucharcount ; charid++ ) { |
2599
|
2227886
|
|
|
|
|
const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); |
2600
|
3478684
|
|
|
|
|
if ( newstate ) { |
2601
|
3413278
|
|
|
|
|
q[ q_write ] = newstate; |
2602
|
|
|
|
|
|
/* set to point at the root */ |
2603
|
3478608
|
|
|
|
|
fail[ q[ q_write++ ] ]=1; |
2604
|
|
|
|
|
|
} |
2605
|
|
|
|
|
|
} |
2606
|
3478678
|
|
|
|
|
while ( q_read < q_write) { |
2607
|
3478674
|
|
|
|
|
const U32 cur = q[ q_read++ % numstates ]; |
2608
|
3130034
|
|
|
|
|
base = trie->states[ cur ].trans.base; |
2609
|
|
|
|
|
|
|
2610
|
1334762
|
|
|
|
|
for ( charid = 0 ; charid < ucharcount ; charid++ ) { |
2611
|
3480658
|
|
|
|
|
const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); |
2612
|
3480658
|
|
|
|
|
if (ch_state) { |
2613
|
|
|
|
|
|
U32 fail_state = cur; |
2614
|
|
|
|
|
|
U32 fail_base; |
2615
|
|
|
|
|
|
do { |
2616
|
182392
|
|
|
|
|
fail_state = fail[ fail_state ]; |
2617
|
3478668
|
|
|
|
|
fail_base = aho->states[ fail_state ].trans.base; |
2618
|
182
|
|
|
|
|
} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); |
2619
|
|
|
|
|
|
|
2620
|
182
|
|
|
|
|
fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); |
2621
|
182
|
|
|
|
|
fail[ ch_state ] = fail_state; |
2622
|
182
|
|
|
|
|
if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) |
2623
|
|
|
|
|
|
{ |
2624
|
116
|
|
|
|
|
aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; |
2625
|
|
|
|
|
|
} |
2626
|
3478668
|
|
|
|
|
q[ q_write++ % numstates] = ch_state; |
2627
|
|
|
|
|
|
} |
2628
|
|
|
|
|
|
} |
2629
|
|
|
|
|
|
} |
2630
|
|
|
|
|
|
/* restore fail[0..1] to 0 so that we "fall out" of the AC loop |
2631
|
|
|
|
|
|
when we fail in state 1, this allows us to use the |
2632
|
|
|
|
|
|
charclass scan to find a valid start char. This is based on the principle |
2633
|
|
|
|
|
|
that theres a good chance the string being searched contains lots of stuff |
2634
|
|
|
|
|
|
that cant be a start char. |
2635
|
|
|
|
|
|
*/ |
2636
|
3468846
|
|
|
|
|
fail[ 0 ] = fail[ 1 ] = 0; |
2637
|
210992
|
|
|
|
|
DEBUG_TRIE_COMPILE_r({ |
2638
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
2639
|
|
|
|
|
|
"%*sStclass Failtable (%"UVuf" states): 0", |
2640
|
|
|
|
|
|
(int)(depth * 2), "", (UV)numstates |
2641
|
|
|
|
|
|
); |
2642
|
|
|
|
|
|
for( q_read=1; q_read
|
2643
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]); |
2644
|
|
|
|
|
|
} |
2645
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "\n"); |
2646
|
|
|
|
|
|
}); |
2647
|
3468846
|
|
|
|
|
Safefree(q); |
2648
|
|
|
|
|
|
/*RExC_seen |= REG_SEEN_TRIEDFA;*/ |
2649
|
3478606
|
|
|
|
|
} |
2650
|
|
|
|
|
|
|
2651
|
|
|
|
|
|
|
2652
|
|
|
|
|
|
/* |
2653
|
|
|
|
|
|
* There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. |
2654
|
|
|
|
|
|
* These need to be revisited when a newer toolchain becomes available. |
2655
|
|
|
|
|
|
*/ |
2656
|
|
|
|
|
|
#if defined(__sparc64__) && defined(__GNUC__) |
2657
|
|
|
|
|
|
# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) |
2658
|
|
|
|
|
|
# undef SPARC64_GCC_WORKAROUND |
2659
|
|
|
|
|
|
# define SPARC64_GCC_WORKAROUND 1 |
2660
|
|
|
|
|
|
# endif |
2661
|
|
|
|
|
|
#endif |
2662
|
|
|
|
|
|
|
2663
|
|
|
|
|
|
#define DEBUG_PEEP(str,scan,depth) \ |
2664
|
|
|
|
|
|
DEBUG_OPTIMISE_r({if (scan){ \ |
2665
|
|
|
|
|
|
SV * const mysv=sv_newmortal(); \ |
2666
|
|
|
|
|
|
regnode *Next = regnext(scan); \ |
2667
|
|
|
|
|
|
regprop(RExC_rx, mysv, scan); \ |
2668
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ |
2669
|
|
|
|
|
|
(int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ |
2670
|
|
|
|
|
|
Next ? (REG_NODE_NUM(Next)) : 0 ); \ |
2671
|
|
|
|
|
|
}}); |
2672
|
|
|
|
|
|
|
2673
|
|
|
|
|
|
|
2674
|
|
|
|
|
|
/* The below joins as many adjacent EXACTish nodes as possible into a single |
2675
|
|
|
|
|
|
* one. The regop may be changed if the node(s) contain certain sequences that |
2676
|
|
|
|
|
|
* require special handling. The joining is only done if: |
2677
|
|
|
|
|
|
* 1) there is room in the current conglomerated node to entirely contain the |
2678
|
|
|
|
|
|
* next one. |
2679
|
|
|
|
|
|
* 2) they are the exact same node type |
2680
|
|
|
|
|
|
* |
2681
|
|
|
|
|
|
* The adjacent nodes actually may be separated by NOTHING-kind nodes, and |
2682
|
|
|
|
|
|
* these get optimized out |
2683
|
|
|
|
|
|
* |
2684
|
|
|
|
|
|
* If a node is to match under /i (folded), the number of characters it matches |
2685
|
|
|
|
|
|
* can be different than its character length if it contains a multi-character |
2686
|
|
|
|
|
|
* fold. *min_subtract is set to the total delta of the input nodes. |
2687
|
|
|
|
|
|
* |
2688
|
|
|
|
|
|
* And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF |
2689
|
|
|
|
|
|
* and contains LATIN SMALL LETTER SHARP S |
2690
|
|
|
|
|
|
* |
2691
|
|
|
|
|
|
* This is as good a place as any to discuss the design of handling these |
2692
|
|
|
|
|
|
* multi-character fold sequences. It's been wrong in Perl for a very long |
2693
|
|
|
|
|
|
* time. There are three code points in Unicode whose multi-character folds |
2694
|
|
|
|
|
|
* were long ago discovered to mess things up. The previous designs for |
2695
|
|
|
|
|
|
* dealing with these involved assigning a special node for them. This |
2696
|
|
|
|
|
|
* approach doesn't work, as evidenced by this example: |
2697
|
|
|
|
|
|
* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches |
2698
|
|
|
|
|
|
* Both these fold to "sss", but if the pattern is parsed to create a node that |
2699
|
|
|
|
|
|
* would match just the \xDF, it won't be able to handle the case where a |
2700
|
|
|
|
|
|
* successful match would have to cross the node's boundary. The new approach |
2701
|
|
|
|
|
|
* that hopefully generally solves the problem generates an EXACTFU_SS node |
2702
|
|
|
|
|
|
* that is "sss". |
2703
|
|
|
|
|
|
* |
2704
|
|
|
|
|
|
* It turns out that there are problems with all multi-character folds, and not |
2705
|
|
|
|
|
|
* just these three. Now the code is general, for all such cases. The |
2706
|
|
|
|
|
|
* approach taken is: |
2707
|
|
|
|
|
|
* 1) This routine examines each EXACTFish node that could contain multi- |
2708
|
|
|
|
|
|
* character fold sequences. It returns in *min_subtract how much to |
2709
|
|
|
|
|
|
* subtract from the the actual length of the string to get a real minimum |
2710
|
|
|
|
|
|
* match length; it is 0 if there are no multi-char folds. This delta is |
2711
|
|
|
|
|
|
* used by the caller to adjust the min length of the match, and the delta |
2712
|
|
|
|
|
|
* between min and max, so that the optimizer doesn't reject these |
2713
|
|
|
|
|
|
* possibilities based on size constraints. |
2714
|
|
|
|
|
|
* 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS |
2715
|
|
|
|
|
|
* is used for an EXACTFU node that contains at least one "ss" sequence in |
2716
|
|
|
|
|
|
* it. For non-UTF-8 patterns and strings, this is the only case where |
2717
|
|
|
|
|
|
* there is a possible fold length change. That means that a regular |
2718
|
|
|
|
|
|
* EXACTFU node without UTF-8 involvement doesn't have to concern itself |
2719
|
|
|
|
|
|
* with length changes, and so can be processed faster. regexec.c takes |
2720
|
|
|
|
|
|
* advantage of this. Generally, an EXACTFish node that is in UTF-8 is |
2721
|
|
|
|
|
|
* pre-folded by regcomp.c. This saves effort in regex matching. |
2722
|
|
|
|
|
|
* However, the pre-folding isn't done for non-UTF8 patterns because the |
2723
|
|
|
|
|
|
* fold of the MICRO SIGN requires UTF-8, and we don't want to slow things |
2724
|
|
|
|
|
|
* down by forcing the pattern into UTF8 unless necessary. Also what |
2725
|
|
|
|
|
|
* EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold |
2726
|
|
|
|
|
|
* possibilities for the non-UTF8 patterns are quite simple, except for |
2727
|
|
|
|
|
|
* the sharp s. All the ones that don't involve a UTF-8 target string are |
2728
|
|
|
|
|
|
* members of a fold-pair, and arrays are set up for all of them so that |
2729
|
|
|
|
|
|
* the other member of the pair can be found quickly. Code elsewhere in |
2730
|
|
|
|
|
|
* this file makes sure that in EXACTFU nodes, the sharp s gets folded to |
2731
|
|
|
|
|
|
* 'ss', even if the pattern isn't UTF-8. This avoids the issues |
2732
|
|
|
|
|
|
* described in the next item. |
2733
|
|
|
|
|
|
* 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the |
2734
|
|
|
|
|
|
* pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a |
2735
|
|
|
|
|
|
* UTF-8 pattern.) An assumption that the optimizer part of regexec.c |
2736
|
|
|
|
|
|
* (probably unwittingly, in Perl_regexec_flags()) makes is that a |
2737
|
|
|
|
|
|
* character in the pattern corresponds to at most a single character in |
2738
|
|
|
|
|
|
* the target string. (And I do mean character, and not byte here, unlike |
2739
|
|
|
|
|
|
* other parts of the documentation that have never been updated to |
2740
|
|
|
|
|
|
* account for multibyte Unicode.) sharp s in EXACTF nodes can match the |
2741
|
|
|
|
|
|
* two character string 'ss'; in EXACTFA nodes it can match |
2742
|
|
|
|
|
|
* "\x{17F}\x{17F}". These violate the assumption, and they are the only |
2743
|
|
|
|
|
|
* instances where it is violated. I'm reluctant to try to change the |
2744
|
|
|
|
|
|
* assumption, as the code involved is impenetrable to me (khw), so |
2745
|
|
|
|
|
|
* instead the code here punts. This routine examines (when the pattern |
2746
|
|
|
|
|
|
* isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a |
2747
|
|
|
|
|
|
* boolean indicating whether or not the node contains a sharp s. When it |
2748
|
|
|
|
|
|
* is true, the caller sets a flag that later causes the optimizer in this |
2749
|
|
|
|
|
|
* file to not set values for the floating and fixed string lengths, and |
2750
|
|
|
|
|
|
* thus avoids the optimizer code in regexec.c that makes the invalid |
2751
|
|
|
|
|
|
* assumption. Thus, there is no optimization based on string lengths for |
2752
|
|
|
|
|
|
* non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s. |
2753
|
|
|
|
|
|
* (The reason the assumption is wrong only in these two cases is that all |
2754
|
|
|
|
|
|
* other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all |
2755
|
|
|
|
|
|
* other folds to their expanded versions. We can't prefold sharp s to |
2756
|
|
|
|
|
|
* 'ss' in EXACTF nodes because we don't know at compile time if it |
2757
|
|
|
|
|
|
* actually matches 'ss' or not. It will match iff the target string is |
2758
|
|
|
|
|
|
* in UTF-8, unlike the EXACTFU nodes, where it always matches; and |
2759
|
|
|
|
|
|
* EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8 |
2760
|
|
|
|
|
|
* pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem; |
2761
|
|
|
|
|
|
* but in a non-UTF8 pattern, folding it to that above-Latin1 string would |
2762
|
|
|
|
|
|
* require the pattern to be forced into UTF-8, the overhead of which we |
2763
|
|
|
|
|
|
* want to avoid.) |
2764
|
|
|
|
|
|
* |
2765
|
|
|
|
|
|
* Similarly, the code that generates tries doesn't currently handle |
2766
|
|
|
|
|
|
* not-already-folded multi-char folds, and it looks like a pain to change |
2767
|
|
|
|
|
|
* that. Therefore, trie generation of EXACTFA nodes with the sharp s |
2768
|
|
|
|
|
|
* doesn't work. Instead, such an EXACTFA is turned into a new regnode, |
2769
|
|
|
|
|
|
* EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people |
2770
|
|
|
|
|
|
* using /iaa matching will be doing so almost entirely with ASCII |
2771
|
|
|
|
|
|
* strings, so this should rarely be encountered in practice */ |
2772
|
|
|
|
|
|
|
2773
|
|
|
|
|
|
#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ |
2774
|
|
|
|
|
|
if (PL_regkind[OP(scan)] == EXACT) \ |
2775
|
|
|
|
|
|
join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) |
2776
|
|
|
|
|
|
|
2777
|
|
|
|
|
|
STATIC U32 |
2778
|
2862741
|
|
|
|
|
S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { |
2779
|
|
|
|
|
|
/* Merge several consecutive EXACTish nodes into one. */ |
2780
|
1129217
|
|
|
|
|
regnode *n = regnext(scan); |
2781
|
|
|
|
|
|
U32 stringok = 1; |
2782
|
1129217
|
|
|
|
|
regnode *next = scan + NODE_SZ_STR(scan); |
2783
|
|
|
|
|
|
U32 merged = 0; |
2784
|
|
|
|
|
|
U32 stopnow = 0; |
2785
|
|
|
|
|
|
#ifdef DEBUGGING |
2786
|
|
|
|
|
|
regnode *stop = scan; |
2787
|
723995
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
2788
|
|
|
|
|
|
#else |
2789
|
|
|
|
|
|
PERL_UNUSED_ARG(depth); |
2790
|
|
|
|
|
|
#endif |
2791
|
|
|
|
|
|
|
2792
|
723995
|
|
|
|
|
PERL_ARGS_ASSERT_JOIN_EXACT; |
2793
|
|
|
|
|
|
#ifndef EXPERIMENTAL_INPLACESCAN |
2794
|
|
|
|
|
|
PERL_UNUSED_ARG(flags); |
2795
|
|
|
|
|
|
PERL_UNUSED_ARG(val); |
2796
|
|
|
|
|
|
#endif |
2797
|
122836
|
|
|
|
|
DEBUG_PEEP("join",scan,depth); |
2798
|
|
|
|
|
|
|
2799
|
|
|
|
|
|
/* Look through the subsequent nodes in the chain. Skip NOTHING, merge |
2800
|
|
|
|
|
|
* EXACT ones that are mergeable to the current one. */ |
2801
|
601377
|
|
|
|
|
while (n |
2802
|
724035
|
|
|
|
|
&& (PL_regkind[OP(n)] == NOTHING |
2803
|
650941
|
|
|
|
|
|| (stringok && OP(n) == OP(scan))) |
2804
|
1129079
|
|
|
|
|
&& NEXT_OFF(n) |
2805
|
1129079
|
|
|
|
|
&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) |
2806
|
|
|
|
|
|
{ |
2807
|
|
|
|
|
|
|
2808
|
974993
|
|
|
|
|
if (OP(n) == TAIL || n > next) |
2809
|
|
|
|
|
|
stringok = 0; |
2810
|
969927
|
|
|
|
|
if (PL_regkind[OP(n)] == NOTHING) { |
2811
|
1129051
|
|
|
|
|
DEBUG_PEEP("skip:",n,depth); |
2812
|
136894
|
|
|
|
|
NEXT_OFF(scan) += NEXT_OFF(n); |
2813
|
136894
|
|
|
|
|
next = n + NODE_STEP_REGNODE; |
2814
|
|
|
|
|
|
#ifdef DEBUGGING |
2815
|
126964
|
|
|
|
|
if (stringok) |
2816
|
|
|
|
|
|
stop = n; |
2817
|
|
|
|
|
|
#endif |
2818
|
126964
|
|
|
|
|
n = regnext(n); |
2819
|
|
|
|
|
|
} |
2820
|
992185
|
|
|
|
|
else if (stringok) { |
2821
|
586219
|
|
|
|
|
const unsigned int oldl = STR_LEN(scan); |
2822
|
413109
|
|
|
|
|
regnode * const nnext = regnext(n); |
2823
|
|
|
|
|
|
|
2824
|
|
|
|
|
|
/* XXX I (khw) kind of doubt that this works on platforms where |
2825
|
|
|
|
|
|
* U8_MAX is above 255 because of lots of other assumptions */ |
2826
|
|
|
|
|
|
/* Don't join if the sum can't fit into a single node */ |
2827
|
413109
|
|
|
|
|
if (oldl + STR_LEN(n) > U8_MAX) |
2828
|
|
|
|
|
|
break; |
2829
|
|
|
|
|
|
|
2830
|
173138
|
|
|
|
|
DEBUG_PEEP("merg",n,depth); |
2831
|
173138
|
|
|
|
|
merged++; |
2832
|
|
|
|
|
|
|
2833
|
173138
|
|
|
|
|
NEXT_OFF(scan) += NEXT_OFF(n); |
2834
|
173138
|
|
|
|
|
STR_LEN(scan) += STR_LEN(n); |
2835
|
173138
|
|
|
|
|
next = n + NODE_SZ_STR(n); |
2836
|
|
|
|
|
|
/* Now we can overwrite *n : */ |
2837
|
173138
|
|
|
|
|
Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); |
2838
|
|
|
|
|
|
#ifdef DEBUGGING |
2839
|
1129067
|
|
|
|
|
stop = next - 1; |
2840
|
|
|
|
|
|
#endif |
2841
|
|
|
|
|
|
n = nnext; |
2842
|
|
|
|
|
|
if (stopnow) break; |
2843
|
|
|
|
|
|
} |
2844
|
|
|
|
|
|
|
2845
|
|
|
|
|
|
#ifdef EXPERIMENTAL_INPLACESCAN |
2846
|
|
|
|
|
|
if (flags && !NEXT_OFF(n)) { |
2847
|
|
|
|
|
|
DEBUG_PEEP("atch", val, depth); |
2848
|
|
|
|
|
|
if (reg_off_by_arg[OP(n)]) { |
2849
|
|
|
|
|
|
ARG_SET(n, val - n); |
2850
|
|
|
|
|
|
} |
2851
|
|
|
|
|
|
else { |
2852
|
|
|
|
|
|
NEXT_OFF(n) = val - n; |
2853
|
|
|
|
|
|
} |
2854
|
|
|
|
|
|
stopnow = 1; |
2855
|
|
|
|
|
|
} |
2856
|
|
|
|
|
|
#endif |
2857
|
|
|
|
|
|
} |
2858
|
|
|
|
|
|
|
2859
|
1125687
|
|
|
|
|
*min_subtract = 0; |
2860
|
1011573
|
|
|
|
|
*has_exactf_sharp_s = FALSE; |
2861
|
|
|
|
|
|
|
2862
|
|
|
|
|
|
/* Here, all the adjacent mergeable EXACTish nodes have been merged. We |
2863
|
|
|
|
|
|
* can now analyze for sequences of problematic code points. (Prior to |
2864
|
|
|
|
|
|
* this final joining, sequences could have been split over boundaries, and |
2865
|
|
|
|
|
|
* hence missed). The sequences only happen in folding, hence for any |
2866
|
|
|
|
|
|
* non-EXACT EXACTish node */ |
2867
|
1011573
|
|
|
|
|
if (OP(scan) != EXACT) { |
2868
|
18868
|
|
|
|
|
const U8 * const s0 = (U8*) STRING(scan); |
2869
|
|
|
|
|
|
const U8 * s = s0; |
2870
|
1125543
|
|
|
|
|
const U8 * const s_end = s0 + STR_LEN(scan); |
2871
|
|
|
|
|
|
|
2872
|
|
|
|
|
|
/* One pass is made over the node's string looking for all the |
2873
|
|
|
|
|
|
* possibilities. to avoid some tests in the loop, there are two main |
2874
|
|
|
|
|
|
* cases, for UTF-8 patterns (which can't have EXACTF nodes) and |
2875
|
|
|
|
|
|
* non-UTF-8 */ |
2876
|
1122151
|
|
|
|
|
if (UTF) { |
2877
|
|
|
|
|
|
|
2878
|
|
|
|
|
|
/* Examine the string for a multi-character fold sequence. UTF-8 |
2879
|
|
|
|
|
|
* patterns have all characters pre-folded by the time this code is |
2880
|
|
|
|
|
|
* executed */ |
2881
|
2170020
|
|
|
|
|
while (s < s_end - 1) /* Can stop 1 before the end, as minimum |
2882
|
|
|
|
|
|
length sequence we are looking for is 2 */ |
2883
|
|
|
|
|
|
{ |
2884
|
|
|
|
|
|
int count = 0; |
2885
|
494764
|
|
|
|
|
int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); |
2886
|
4033070
|
|
|
|
|
if (! len) { /* Not a multi-char fold: get next char */ |
2887
|
3464092
|
|
|
|
|
s += UTF8SKIP(s); |
2888
|
3464092
|
|
|
|
|
continue; |
2889
|
|
|
|
|
|
} |
2890
|
|
|
|
|
|
|
2891
|
|
|
|
|
|
/* Nodes with 'ss' require special handling, except for EXACTFL |
2892
|
|
|
|
|
|
* and EXACTFA-ish for which there is no multi-char fold to |
2893
|
|
|
|
|
|
* this */ |
2894
|
3464092
|
|
|
|
|
if (len == 2 && *s == 's' && *(s+1) == 's' |
2895
|
3464092
|
|
|
|
|
&& OP(scan) != EXACTFL |
2896
|
2206944
|
|
|
|
|
&& OP(scan) != EXACTFA |
2897
|
2206944
|
|
|
|
|
&& OP(scan) != EXACTFA_NO_TRIE) |
2898
|
|
|
|
|
|
{ |
2899
|
|
|
|
|
|
count = 2; |
2900
|
2206944
|
|
|
|
|
OP(scan) = EXACTFU_SS; |
2901
|
2200916
|
|
|
|
|
s += 2; |
2902
|
|
|
|
|
|
} |
2903
|
|
|
|
|
|
else { /* Here is a generic multi-char fold. */ |
2904
|
1297931
|
|
|
|
|
const U8* multi_end = s + len; |
2905
|
|
|
|
|
|
|
2906
|
|
|
|
|
|
/* Count how many characters in it. In the case of /l and |
2907
|
|
|
|
|
|
* /aa, no folds which contain ASCII code points are |
2908
|
|
|
|
|
|
* allowed, so check for those, and skip if found. (In |
2909
|
|
|
|
|
|
* EXACTFL, no folds are allowed to any Latin1 code point, |
2910
|
|
|
|
|
|
* not just ASCII. But there aren't any of these |
2911
|
|
|
|
|
|
* currently, nor ever likely, so don't take the time to |
2912
|
|
|
|
|
|
* test for them. The code that generates the |
2913
|
|
|
|
|
|
* is_MULTI_foo() macros croaks should one actually get put |
2914
|
|
|
|
|
|
* into Unicode .) */ |
2915
|
1297931
|
|
|
|
|
if (OP(scan) != EXACTFL |
2916
|
376
|
|
|
|
|
&& OP(scan) != EXACTFA |
2917
|
376
|
|
|
|
|
&& OP(scan) != EXACTFA_NO_TRIE) |
2918
|
|
|
|
|
|
{ |
2919
|
376
|
|
|
|
|
count = utf8_length(s, multi_end); |
2920
|
376
|
|
|
|
|
s = multi_end; |
2921
|
|
|
|
|
|
} |
2922
|
|
|
|
|
|
else { |
2923
|
376
|
|
|
|
|
while (s < multi_end) { |
2924
|
324
|
|
|
|
|
if (isASCII(*s)) { |
2925
|
1297555
|
|
|
|
|
s++; |
2926
|
1297931
|
|
|
|
|
goto next_iteration; |
2927
|
|
|
|
|
|
} |
2928
|
|
|
|
|
|
else { |
2929
|
1297875
|
|
|
|
|
s += UTF8SKIP(s); |
2930
|
|
|
|
|
|
} |
2931
|
2166161
|
|
|
|
|
count++; |
2932
|
|
|
|
|
|
} |
2933
|
|
|
|
|
|
} |
2934
|
|
|
|
|
|
} |
2935
|
|
|
|
|
|
|
2936
|
|
|
|
|
|
/* The delta is how long the sequence is minus 1 (1 is how long |
2937
|
|
|
|
|
|
* the character that folds to the sequence is) */ |
2938
|
111824
|
|
|
|
|
*min_subtract += count - 1; |
2939
|
|
|
|
|
|
next_iteration: ; |
2940
|
|
|
|
|
|
} |
2941
|
|
|
|
|
|
} |
2942
|
111854
|
|
|
|
|
else if (OP(scan) == EXACTFA) { |
2943
|
|
|
|
|
|
|
2944
|
|
|
|
|
|
/* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char |
2945
|
|
|
|
|
|
* fold to the ASCII range (and there are no existing ones in the |
2946
|
|
|
|
|
|
* upper latin1 range). But, as outlined in the comments preceding |
2947
|
|
|
|
|
|
* this function, we need to flag any occurrences of the sharp s. |
2948
|
|
|
|
|
|
* This character forbids trie formation (because of added |
2949
|
|
|
|
|
|
* complexity) */ |
2950
|
2166161
|
|
|
|
|
while (s < s_end) { |
2951
|
1257148
|
|
|
|
|
if (*s == LATIN_SMALL_LETTER_SHARP_S) { |
2952
|
1122117
|
|
|
|
|
OP(scan) = EXACTFA_NO_TRIE; |
2953
|
518391
|
|
|
|
|
*has_exactf_sharp_s = TRUE; |
2954
|
518391
|
|
|
|
|
break; |
2955
|
|
|
|
|
|
} |
2956
|
773267
|
|
|
|
|
s++; |
2957
|
518391
|
|
|
|
|
continue; |
2958
|
|
|
|
|
|
} |
2959
|
|
|
|
|
|
} |
2960
|
512773
|
|
|
|
|
else if (OP(scan) != EXACTFL) { |
2961
|
|
|
|
|
|
|
2962
|
|
|
|
|
|
/* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the |
2963
|
|
|
|
|
|
* multi-char folds that are all Latin1. (This code knows that |
2964
|
|
|
|
|
|
* there are no current multi-char folds possible with EXACTFL, |
2965
|
|
|
|
|
|
* relying on fold_grind.t to catch any errors if the very unlikely |
2966
|
|
|
|
|
|
* event happens that some get added in future Unicode versions.) |
2967
|
|
|
|
|
|
* As explained in the comments preceding this function, we look |
2968
|
|
|
|
|
|
* also for the sharp s in EXACTF nodes; it can be in the final |
2969
|
|
|
|
|
|
* position. Otherwise we can stop looking 1 byte earlier because |
2970
|
|
|
|
|
|
* have to find at least two characters for a multi-fold */ |
2971
|
32651
|
|
|
|
|
const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; |
2972
|
|
|
|
|
|
|
2973
|
32765
|
|
|
|
|
while (s < upper) { |
2974
|
32697
|
|
|
|
|
int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); |
2975
|
23193
|
|
|
|
|
if (! len) { /* Not a multi-char fold. */ |
2976
|
80
|
|
|
|
|
if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) |
2977
|
|
|
|
|
|
{ |
2978
|
0
|
|
|
|
|
*has_exactf_sharp_s = TRUE; |
2979
|
|
|
|
|
|
} |
2980
|
80
|
|
|
|
|
s++; |
2981
|
80
|
|
|
|
|
continue; |
2982
|
|
|
|
|
|
} |
2983
|
|
|
|
|
|
|
2984
|
0
|
|
|
|
|
if (len == 2 |
2985
|
0
|
|
|
|
|
&& isARG2_lower_or_UPPER_ARG1('s', *s) |
2986
|
0
|
|
|
|
|
&& isARG2_lower_or_UPPER_ARG1('s', *(s+1))) |
2987
|
|
|
|
|
|
{ |
2988
|
|
|
|
|
|
|
2989
|
|
|
|
|
|
/* EXACTF nodes need to know that the minimum length |
2990
|
|
|
|
|
|
* changed so that a sharp s in the string can match this |
2991
|
|
|
|
|
|
* ss in the pattern, but they remain EXACTF nodes, as they |
2992
|
|
|
|
|
|
* won't match this unless the target string is is UTF-8, |
2993
|
|
|
|
|
|
* which we don't know until runtime */ |
2994
|
0
|
|
|
|
|
if (OP(scan) != EXACTF) { |
2995
|
1129039
|
|
|
|
|
OP(scan) = EXACTFU_SS; |
2996
|
|
|
|
|
|
} |
2997
|
|
|
|
|
|
} |
2998
|
|
|
|
|
|
|
2999
|
52724836
|
|
|
|
|
*min_subtract += len - 1; |
3000
|
21780
|
|
|
|
|
s += len; |
3001
|
|
|
|
|
|
} |
3002
|
|
|
|
|
|
} |
3003
|
|
|
|
|
|
} |
3004
|
|
|
|
|
|
|
3005
|
|
|
|
|
|
#ifdef DEBUGGING |
3006
|
|
|
|
|
|
/* Allow dumping but overwriting the collection of skipped |
3007
|
|
|
|
|
|
* ops and/or strings with fake optimized ops */ |
3008
|
6470
|
|
|
|
|
n = scan + NODE_SZ_STR(scan); |
3009
|
6574
|
|
|
|
|
while (n <= stop) { |
3010
|
6218
|
|
|
|
|
OP(n) = OPTIMIZED; |
3011
|
6218
|
|
|
|
|
FLAGS(n) = 0; |
3012
|
6218
|
|
|
|
|
NEXT_OFF(n) = 0; |
3013
|
182
|
|
|
|
|
n++; |
3014
|
|
|
|
|
|
} |
3015
|
|
|
|
|
|
#endif |
3016
|
306
|
|
|
|
|
DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); |
3017
|
6470
|
|
|
|
|
return stopnow; |
3018
|
|
|
|
|
|
} |
3019
|
|
|
|
|
|
|
3020
|
|
|
|
|
|
/* REx optimizer. Converts nodes into quicker variants "in place". |
3021
|
|
|
|
|
|
Finds fixed substrings. */ |
3022
|
|
|
|
|
|
|
3023
|
|
|
|
|
|
/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set |
3024
|
|
|
|
|
|
to the position after last scanned or to NULL. */ |
3025
|
|
|
|
|
|
|
3026
|
|
|
|
|
|
#define INIT_AND_WITHP \ |
3027
|
|
|
|
|
|
assert(!and_withp); \ |
3028
|
|
|
|
|
|
Newx(and_withp,1,struct regnode_charclass_class); \ |
3029
|
|
|
|
|
|
SAVEFREEPV(and_withp) |
3030
|
|
|
|
|
|
|
3031
|
|
|
|
|
|
/* this is a chain of data about sub patterns we are processing that |
3032
|
|
|
|
|
|
need to be handled separately/specially in study_chunk. Its so |
3033
|
|
|
|
|
|
we can simulate recursion without losing state. */ |
3034
|
|
|
|
|
|
struct scan_frame; |
3035
|
|
|
|
|
|
typedef struct scan_frame { |
3036
|
|
|
|
|
|
regnode *last; /* last node to process in this frame */ |
3037
|
|
|
|
|
|
regnode *next; /* next node to process when last is reached */ |
3038
|
|
|
|
|
|
struct scan_frame *prev; /*previous frame*/ |
3039
|
|
|
|
|
|
I32 stop; /* what stopparen do we use */ |
3040
|
|
|
|
|
|
} scan_frame; |
3041
|
|
|
|
|
|
|
3042
|
|
|
|
|
|
|
3043
|
|
|
|
|
|
#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) |
3044
|
|
|
|
|
|
|
3045
|
|
|
|
|
|
STATIC SSize_t |
3046
|
3306
|
|
|
|
|
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, |
3047
|
|
|
|
|
|
SSize_t *minlenp, SSize_t *deltap, |
3048
|
|
|
|
|
|
regnode *last, |
3049
|
|
|
|
|
|
scan_data_t *data, |
3050
|
|
|
|
|
|
I32 stopparen, |
3051
|
|
|
|
|
|
U8* recursed, |
3052
|
|
|
|
|
|
struct regnode_charclass_class *and_withp, |
3053
|
|
|
|
|
|
U32 flags, U32 depth) |
3054
|
|
|
|
|
|
/* scanp: Start here (read-write). */ |
3055
|
|
|
|
|
|
/* deltap: Write maxlen-minlen here. */ |
3056
|
|
|
|
|
|
/* last: Stop before this one. */ |
3057
|
|
|
|
|
|
/* data: string data about the pattern */ |
3058
|
|
|
|
|
|
/* stopparen: treat close N as END */ |
3059
|
|
|
|
|
|
/* recursed: which subroutines have we recursed into */ |
3060
|
|
|
|
|
|
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ |
3061
|
|
|
|
|
|
{ |
3062
|
|
|
|
|
|
dVAR; |
3063
|
|
|
|
|
|
/* There must be at least this number of characters to match */ |
3064
|
|
|
|
|
|
SSize_t min = 0; |
3065
|
|
|
|
|
|
I32 pars = 0, code; |
3066
|
3306
|
|
|
|
|
regnode *scan = *scanp, *next; |
3067
|
|
|
|
|
|
SSize_t delta = 0; |
3068
|
6488
|
|
|
|
|
int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); |
3069
|
|
|
|
|
|
int is_inf_internal = 0; /* The studied chunk is infinite */ |
3070
|
4366
|
|
|
|
|
I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; |
3071
|
|
|
|
|
|
scan_data_t data_fake; |
3072
|
|
|
|
|
|
SV *re_trie_maxbuff = NULL; |
3073
|
4366
|
|
|
|
|
regnode *first_non_open = scan; |
3074
|
|
|
|
|
|
SSize_t stopmin = SSize_t_MAX; |
3075
|
|
|
|
|
|
scan_frame *frame = NULL; |
3076
|
2318
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
3077
|
|
|
|
|
|
|
3078
|
196
|
|
|
|
|
PERL_ARGS_ASSERT_STUDY_CHUNK; |
3079
|
|
|
|
|
|
|
3080
|
|
|
|
|
|
#ifdef DEBUGGING |
3081
|
196
|
|
|
|
|
StructCopy(&zero_scan_data, &data_fake, scan_data_t); |
3082
|
|
|
|
|
|
#endif |
3083
|
|
|
|
|
|
|
3084
|
2318
|
|
|
|
|
if ( depth == 0 ) { |
3085
|
168
|
|
|
|
|
while (first_non_open && OP(first_non_open) == OPEN) |
3086
|
2130
|
|
|
|
|
first_non_open=regnext(first_non_open); |
3087
|
|
|
|
|
|
} |
3088
|
|
|
|
|
|
|
3089
|
|
|
|
|
|
|
3090
|
|
|
|
|
|
fake_study_recurse: |
3091
|
15992
|
|
|
|
|
while ( scan && OP(scan) != END && scan < last ){ |
3092
|
15796
|
|
|
|
|
UV min_subtract = 0; /* How mmany chars to subtract from the minimum |
3093
|
|
|
|
|
|
node length to get a real minimum (because |
3094
|
|
|
|
|
|
the folded version may be shorter) */ |
3095
|
15796
|
|
|
|
|
bool has_exactf_sharp_s = FALSE; |
3096
|
|
|
|
|
|
/* Peephole optimizer: */ |
3097
|
22088
|
|
|
|
|
DEBUG_STUDYDATA("Peep:", data,depth); |
3098
|
19966
|
|
|
|
|
DEBUG_PEEP("Peep",scan,depth); |
3099
|
|
|
|
|
|
|
3100
|
|
|
|
|
|
/* Its not clear to khw or hv why this is done here, and not in the |
3101
|
|
|
|
|
|
* clauses that deal with EXACT nodes. khw's guess is that it's |
3102
|
|
|
|
|
|
* because of a previous design */ |
3103
|
19966
|
|
|
|
|
JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); |
3104
|
|
|
|
|
|
|
3105
|
|
|
|
|
|
/* Follow the next-chain of the current node and optimize |
3106
|
|
|
|
|
|
away all the NOTHINGs from it. */ |
3107
|
19966
|
|
|
|
|
if (OP(scan) != CURLYX) { |
3108
|
20274
|
|
|
|
|
const int max = (reg_off_by_arg[OP(scan)] |
3109
|
19966
|
|
|
|
|
? I32_MAX |
3110
|
|
|
|
|
|
/* I32 may be smaller than U16 on CRAYs! */ |
3111
|
19966
|
|
|
|
|
: (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); |
3112
|
19966
|
|
|
|
|
int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); |
3113
|
|
|
|
|
|
int noff; |
3114
|
52703364
|
|
|
|
|
regnode *n = scan; |
3115
|
|
|
|
|
|
|
3116
|
|
|
|
|
|
/* Skip NOTHING and LONGJMP. */ |
3117
|
12679619
|
|
|
|
|
while ((n = regnext(n)) |
3118
|
12679311
|
|
|
|
|
&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) |
3119
|
928774
|
|
|
|
|
|| ((OP(n) == LONGJMP) && (noff = ARG(n)))) |
3120
|
928466
|
|
|
|
|
&& off + noff < max) |
3121
|
928466
|
|
|
|
|
off += noff; |
3122
|
11750845
|
|
|
|
|
if (reg_off_by_arg[OP(scan)]) |
3123
|
12679003
|
|
|
|
|
ARG(scan) = off; |
3124
|
|
|
|
|
|
else |
3125
|
12679311
|
|
|
|
|
NEXT_OFF(scan) = off; |
3126
|
|
|
|
|
|
} |
3127
|
|
|
|
|
|
|
3128
|
|
|
|
|
|
|
3129
|
|
|
|
|
|
|
3130
|
|
|
|
|
|
/* The principal pseudo-switch. Cannot be a switch, since we |
3131
|
|
|
|
|
|
look into several different things. */ |
3132
|
7981987
|
|
|
|
|
if (OP(scan) == BRANCH || OP(scan) == BRANCHJ |
3133
|
3494223
|
|
|
|
|
|| OP(scan) == IFTHEN) { |
3134
|
3493931
|
|
|
|
|
next = regnext(scan); |
3135
|
3493931
|
|
|
|
|
code = OP(scan); |
3136
|
|
|
|
|
|
/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ |
3137
|
|
|
|
|
|
|
3138
|
7981687
|
|
|
|
|
if (OP(next) == code || code == IFTHEN) { |
3139
|
|
|
|
|
|
/* NOTE - There is similar code to this block below for handling |
3140
|
|
|
|
|
|
TRIE nodes on a re-study. If you change stuff here check there |
3141
|
|
|
|
|
|
too. */ |
3142
|
|
|
|
|
|
SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; |
3143
|
|
|
|
|
|
struct regnode_charclass_class accum; |
3144
|
7981687
|
|
|
|
|
regnode * const startbranch=scan; |
3145
|
|
|
|
|
|
|
3146
|
511202
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) |
3147
|
7981687
|
|
|
|
|
SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ |
3148
|
4457770
|
|
|
|
|
if (flags & SCF_DO_STCLASS) |
3149
|
8112063
|
|
|
|
|
cl_init_zero(pRExC_state, &accum); |
3150
|
|
|
|
|
|
|
3151
|
7981743
|
|
|
|
|
while (OP(scan) == code) { |
3152
|
|
|
|
|
|
SSize_t deltanext, minnext, fake; |
3153
|
|
|
|
|
|
I32 f = 0; |
3154
|
|
|
|
|
|
struct regnode_charclass_class this_class; |
3155
|
|
|
|
|
|
|
3156
|
129794
|
|
|
|
|
num++; |
3157
|
7981735
|
|
|
|
|
data_fake.flags = 0; |
3158
|
7981735
|
|
|
|
|
if (data) { |
3159
|
7981735
|
|
|
|
|
data_fake.whilem_c = data->whilem_c; |
3160
|
12679059
|
|
|
|
|
data_fake.last_closep = data->last_closep; |
3161
|
|
|
|
|
|
} |
3162
|
|
|
|
|
|
else |
3163
|
9116401
|
|
|
|
|
data_fake.last_closep = &fake; |
3164
|
|
|
|
|
|
|
3165
|
6053568
|
|
|
|
|
data_fake.pos_delta = delta; |
3166
|
6049736
|
|
|
|
|
next = regnext(scan); |
3167
|
56
|
|
|
|
|
scan = NEXTOPER(scan); |
3168
|
40798
|
|
|
|
|
if (code != BRANCH) |
3169
|
6134996
|
|
|
|
|
scan = NEXTOPER(scan); |
3170
|
6135052
|
|
|
|
|
if (flags & SCF_DO_STCLASS) { |
3171
|
6135042
|
|
|
|
|
cl_init(pRExC_state, &this_class); |
3172
|
6053558
|
|
|
|
|
data_fake.start_class = &this_class; |
3173
|
|
|
|
|
|
f = SCF_DO_STCLASS_AND; |
3174
|
|
|
|
|
|
} |
3175
|
81540
|
|
|
|
|
if (flags & SCF_WHILEM_VISITED_POS) |
3176
|
20900702
|
|
|
|
|
f |= SCF_WHILEM_VISITED_POS; |
3177
|
|
|
|
|
|
|
3178
|
|
|
|
|
|
/* we suppose the run is continuous, last=next...*/ |
3179
|
20859960
|
|
|
|
|
minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, |
3180
|
|
|
|
|
|
next, &data_fake, |
3181
|
|
|
|
|
|
stopparen, recursed, NULL, f,depth+1); |
3182
|
2363092
|
|
|
|
|
if (min1 > minnext) |
3183
|
|
|
|
|
|
min1 = minnext; |
3184
|
6135052
|
|
|
|
|
if (deltanext == SSize_t_MAX) { |
3185
|
|
|
|
|
|
is_inf = is_inf_internal = 1; |
3186
|
|
|
|
|
|
max1 = SSize_t_MAX; |
3187
|
6135052
|
|
|
|
|
} else if (max1 < minnext + deltanext) |
3188
|
6053522
|
|
|
|
|
max1 = minnext + deltanext; |
3189
|
6544063
|
|
|
|
|
scan = next; |
3190
|
432984
|
|
|
|
|
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) |
3191
|
432848
|
|
|
|
|
pars++; |
3192
|
136
|
|
|
|
|
if (data_fake.flags & SCF_SEEN_ACCEPT) { |
3193
|
432928
|
|
|
|
|
if ( stopmin > minnext) |
3194
|
432928
|
|
|
|
|
stopmin = min + min1; |
3195
|
12679003
|
|
|
|
|
flags &= ~SCF_DO_SUBSTR; |
3196
|
40024053
|
|
|
|
|
if (data) |
3197
|
1832658
|
|
|
|
|
data->flags |= SCF_SEEN_ACCEPT; |
3198
|
|
|
|
|
|
} |
3199
|
1832714
|
|
|
|
|
if (data) { |
3200
|
1832714
|
|
|
|
|
if (data_fake.flags & SF_HAS_EVAL) |
3201
|
1018818
|
|
|
|
|
data->flags |= SF_HAS_EVAL; |
3202
|
1832714
|
|
|
|
|
data->whilem_c = data_fake.whilem_c; |
3203
|
|
|
|
|
|
} |
3204
|
797898
|
|
|
|
|
if (flags & SCF_DO_STCLASS) |
3205
|
797888
|
|
|
|
|
cl_or(pRExC_state, &accum, &this_class); |
3206
|
|
|
|
|
|
} |
3207
|
797850
|
|
|
|
|
if (code == IFTHEN && num < 2) /* Empty ELSE branch */ |
3208
|
|
|
|
|
|
min1 = 0; |
3209
|
1832666
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
3210
|
35100
|
|
|
|
|
data->pos_min += min1; |
3211
|
1832666
|
|
|
|
|
if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) |
3212
|
1832658
|
|
|
|
|
data->pos_delta = SSize_t_MAX; |
3213
|
|
|
|
|
|
else |
3214
|
1832666
|
|
|
|
|
data->pos_delta += max1 - min1; |
3215
|
1018826
|
|
|
|
|
if (max1 != min1 || is_inf) |
3216
|
1018822
|
|
|
|
|
data->longest = &(data->longest_float); |
3217
|
|
|
|
|
|
} |
3218
|
8
|
|
|
|
|
min += min1; |
3219
|
1018826
|
|
|
|
|
if (delta == SSize_t_MAX |
3220
|
1018826
|
|
|
|
|
|| SSize_t_MAX - delta - (max1 - min1) < 0) |
3221
|
|
|
|
|
|
delta = SSize_t_MAX; |
3222
|
|
|
|
|
|
else |
3223
|
127476
|
|
|
|
|
delta += max1 - min1; |
3224
|
1832666
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) { |
3225
|
482397
|
|
|
|
|
cl_or(pRExC_state, data->start_class, &accum); |
3226
|
274802
|
|
|
|
|
if (min1) { |
3227
|
229976
|
|
|
|
|
cl_and(data->start_class, and_withp); |
3228
|
35727
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
3229
|
|
|
|
|
|
} |
3230
|
|
|
|
|
|
} |
3231
|
346264
|
|
|
|
|
else if (flags & SCF_DO_STCLASS_AND) { |
3232
|
346262
|
|
|
|
|
if (min1) { |
3233
|
346262
|
|
|
|
|
cl_and(data->start_class, &accum); |
3234
|
274808
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
3235
|
|
|
|
|
|
} |
3236
|
|
|
|
|
|
else { |
3237
|
|
|
|
|
|
/* Switch to OR mode: cache the old value of |
3238
|
|
|
|
|
|
* data->start_class */ |
3239
|
274802
|
|
|
|
|
INIT_AND_WITHP; |
3240
|
274802
|
|
|
|
|
StructCopy(data->start_class, and_withp, |
3241
|
|
|
|
|
|
struct regnode_charclass_class); |
3242
|
44826
|
|
|
|
|
flags &= ~SCF_DO_STCLASS_AND; |
3243
|
229976
|
|
|
|
|
StructCopy(&accum, data->start_class, |
3244
|
|
|
|
|
|
struct regnode_charclass_class); |
3245
|
229976
|
|
|
|
|
flags |= SCF_DO_STCLASS_OR; |
3246
|
202190
|
|
|
|
|
SET_SSC_EOS(data->start_class); |
3247
|
|
|
|
|
|
} |
3248
|
|
|
|
|
|
} |
3249
|
|
|
|
|
|
|
3250
|
48912
|
|
|
|
|
if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { |
3251
|
|
|
|
|
|
/* demq. |
3252
|
|
|
|
|
|
|
3253
|
|
|
|
|
|
Assuming this was/is a branch we are dealing with: 'scan' now |
3254
|
|
|
|
|
|
points at the item that follows the branch sequence, whatever |
3255
|
|
|
|
|
|
it is. We now start at the beginning of the sequence and look |
3256
|
|
|
|
|
|
for subsequences of |
3257
|
|
|
|
|
|
|
3258
|
|
|
|
|
|
BRANCH->EXACT=>x1 |
3259
|
|
|
|
|
|
BRANCH->EXACT=>x2 |
3260
|
|
|
|
|
|
tail |
3261
|
|
|
|
|
|
|
3262
|
|
|
|
|
|
which would be constructed from a pattern like /A|LIST|OF|WORDS/ |
3263
|
|
|
|
|
|
|
3264
|
|
|
|
|
|
If we can find such a subsequence we need to turn the first |
3265
|
|
|
|
|
|
element into a trie and then add the subsequent branch exact |
3266
|
|
|
|
|
|
strings to the trie. |
3267
|
|
|
|
|
|
|
3268
|
|
|
|
|
|
We have two cases |
3269
|
|
|
|
|
|
|
3270
|
|
|
|
|
|
1. patterns where the whole set of branches can be converted. |
3271
|
|
|
|
|
|
|
3272
|
|
|
|
|
|
2. patterns where only a subset can be converted. |
3273
|
|
|
|
|
|
|
3274
|
|
|
|
|
|
In case 1 we can replace the whole set with a single regop |
3275
|
|
|
|
|
|
for the trie. In case 2 we need to keep the start and end |
3276
|
|
|
|
|
|
branches so |
3277
|
|
|
|
|
|
|
3278
|
|
|
|
|
|
'BRANCH EXACT; BRANCH EXACT; BRANCH X' |
3279
|
|
|
|
|
|
becomes BRANCH TRIE; BRANCH X; |
3280
|
|
|
|
|
|
|
3281
|
|
|
|
|
|
There is an additional case, that being where there is a |
3282
|
|
|
|
|
|
common prefix, which gets split out into an EXACT like node |
3283
|
|
|
|
|
|
preceding the TRIE node. |
3284
|
|
|
|
|
|
|
3285
|
|
|
|
|
|
If x(1..n)==tail then we can do a simple trie, if not we make |
3286
|
|
|
|
|
|
a "jump" trie, such that when we match the appropriate word |
3287
|
|
|
|
|
|
we "jump" to the appropriate tail node. Essentially we turn |
3288
|
|
|
|
|
|
a nested if into a case structure of sorts. |
3289
|
|
|
|
|
|
|
3290
|
|
|
|
|
|
*/ |
3291
|
|
|
|
|
|
|
3292
|
|
|
|
|
|
int made=0; |
3293
|
153294
|
|
|
|
|
if (!re_trie_maxbuff) { |
3294
|
6912
|
|
|
|
|
re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); |
3295
|
6912
|
|
|
|
|
if (!SvIOK(re_trie_maxbuff)) |
3296
|
71454
|
|
|
|
|
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); |
3297
|
|
|
|
|
|
} |
3298
|
18327959
|
|
|
|
|
if ( SvIV(re_trie_maxbuff)>=0 ) { |
3299
|
|
|
|
|
|
regnode *cur; |
3300
|
|
|
|
|
|
regnode *first = (regnode *)NULL; |
3301
|
|
|
|
|
|
regnode *last = (regnode *)NULL; |
3302
|
18292232
|
|
|
|
|
regnode *tail = scan; |
3303
|
|
|
|
|
|
U8 trietype = 0; |
3304
|
|
|
|
|
|
U32 count=0; |
3305
|
|
|
|
|
|
|
3306
|
|
|
|
|
|
#ifdef DEBUGGING |
3307
|
2072174
|
|
|
|
|
SV * const mysv = sv_newmortal(); /* for dumping */ |
3308
|
|
|
|
|
|
#endif |
3309
|
|
|
|
|
|
/* var tail is used because there may be a TAIL |
3310
|
|
|
|
|
|
regop in the way. Ie, the exacts will point to the |
3311
|
|
|
|
|
|
thing following the TAIL, but the last branch will |
3312
|
|
|
|
|
|
point at the TAIL. So we advance tail. If we |
3313
|
|
|
|
|
|
have nested (?:) we may have to move through several |
3314
|
|
|
|
|
|
tails. |
3315
|
|
|
|
|
|
*/ |
3316
|
|
|
|
|
|
|
3317
|
1486420
|
|
|
|
|
while ( OP( tail ) == TAIL ) { |
3318
|
|
|
|
|
|
/* this is the TAIL generated by (?:) */ |
3319
|
59536
|
|
|
|
|
tail = regnext( tail ); |
3320
|
|
|
|
|
|
} |
3321
|
|
|
|
|
|
|
3322
|
|
|
|
|
|
|
3323
|
39232
|
|
|
|
|
DEBUG_TRIE_COMPILE_r({ |
3324
|
|
|
|
|
|
regprop(RExC_rx, mysv, tail ); |
3325
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*s%s%s\n", |
3326
|
|
|
|
|
|
(int)depth * 2 + 2, "", |
3327
|
|
|
|
|
|
"Looking for TRIE'able sequences. Tail node is: ", |
3328
|
|
|
|
|
|
SvPV_nolen_const( mysv ) |
3329
|
|
|
|
|
|
); |
3330
|
|
|
|
|
|
}); |
3331
|
|
|
|
|
|
|
3332
|
|
|
|
|
|
/* |
3333
|
|
|
|
|
|
|
3334
|
|
|
|
|
|
Step through the branches |
3335
|
|
|
|
|
|
cur represents each branch, |
3336
|
|
|
|
|
|
noper is the first thing to be matched as part of that branch |
3337
|
|
|
|
|
|
noper_next is the regnext() of that node. |
3338
|
|
|
|
|
|
|
3339
|
|
|
|
|
|
We normally handle a case like this /FOO[xyz]|BAR[pqr]/ |
3340
|
|
|
|
|
|
via a "jump trie" but we also support building with NOJUMPTRIE, |
3341
|
|
|
|
|
|
which restricts the trie logic to structures like /FOO|BAR/. |
3342
|
|
|
|
|
|
|
3343
|
|
|
|
|
|
If noper is a trieable nodetype then the branch is a possible optimization |
3344
|
|
|
|
|
|
target. If we are building under NOJUMPTRIE then we require that noper_next |
3345
|
|
|
|
|
|
is the same as scan (our current position in the regex program). |
3346
|
|
|
|
|
|
|
3347
|
|
|
|
|
|
Once we have two or more consecutive such branches we can create a |
3348
|
|
|
|
|
|
trie of the EXACT's contents and stitch it in place into the program. |
3349
|
|
|
|
|
|
|
3350
|
|
|
|
|
|
If the sequence represents all of the branches in the alternation we |
3351
|
|
|
|
|
|
replace the entire thing with a single TRIE node. |
3352
|
|
|
|
|
|
|
3353
|
|
|
|
|
|
Otherwise when it is a subsequence we need to stitch it in place and |
3354
|
|
|
|
|
|
replace only the relevant branches. This means the first branch has |
3355
|
|
|
|
|
|
to remain as it is used by the alternation logic, and its next pointer, |
3356
|
|
|
|
|
|
and needs to be repointed at the item on the branch chain following |
3357
|
|
|
|
|
|
the last branch we have optimized away. |
3358
|
|
|
|
|
|
|
3359
|
|
|
|
|
|
This could be either a BRANCH, in which case the subsequence is internal, |
3360
|
|
|
|
|
|
or it could be the item following the branch sequence in which case the |
3361
|
|
|
|
|
|
subsequence is at the end (which does not necessarily mean the first node |
3362
|
|
|
|
|
|
is the start of the alternation). |
3363
|
|
|
|
|
|
|
3364
|
|
|
|
|
|
TRIE_TYPE(X) is a define which maps the optype to a trietype. |
3365
|
|
|
|
|
|
|
3366
|
|
|
|
|
|
optype | trietype |
3367
|
|
|
|
|
|
----------------+----------- |
3368
|
|
|
|
|
|
NOTHING | NOTHING |
3369
|
|
|
|
|
|
EXACT | EXACT |
3370
|
|
|
|
|
|
EXACTFU | EXACTFU |
3371
|
|
|
|
|
|
EXACTFU_SS | EXACTFU |
3372
|
|
|
|
|
|
EXACTFA | EXACTFA |
3373
|
|
|
|
|
|
|
3374
|
|
|
|
|
|
|
3375
|
|
|
|
|
|
*/ |
3376
|
|
|
|
|
|
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ |
3377
|
|
|
|
|
|
( EXACT == (X) ) ? EXACT : \ |
3378
|
|
|
|
|
|
( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ |
3379
|
|
|
|
|
|
( EXACTFA == (X) ) ? EXACTFA : \ |
3380
|
|
|
|
|
|
0 ) |
3381
|
|
|
|
|
|
|
3382
|
|
|
|
|
|
/* dont use tail as the end marker for this traverse */ |
3383
|
37628
|
|
|
|
|
for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { |
3384
|
37620
|
|
|
|
|
regnode * const noper = NEXTOPER( cur ); |
3385
|
56
|
|
|
|
|
U8 noper_type = OP( noper ); |
3386
|
56
|
|
|
|
|
U8 noper_trietype = TRIE_TYPE( noper_type ); |
3387
|
|
|
|
|
|
#if defined(DEBUGGING) || defined(NOJUMPTRIE) |
3388
|
56
|
|
|
|
|
regnode * const noper_next = regnext( noper ); |
3389
|
56
|
|
|
|
|
U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0; |
3390
|
56
|
|
|
|
|
U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0; |
3391
|
|
|
|
|
|
#endif |
3392
|
|
|
|
|
|
|
3393
|
56
|
|
|
|
|
DEBUG_TRIE_COMPILE_r({ |
3394
|
|
|
|
|
|
regprop(RExC_rx, mysv, cur); |
3395
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", |
3396
|
|
|
|
|
|
(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); |
3397
|
|
|
|
|
|
|
3398
|
|
|
|
|
|
regprop(RExC_rx, mysv, noper); |
3399
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, " -> %s", |
3400
|
|
|
|
|
|
SvPV_nolen_const(mysv)); |
3401
|
|
|
|
|
|
|
3402
|
|
|
|
|
|
if ( noper_next ) { |
3403
|
|
|
|
|
|
regprop(RExC_rx, mysv, noper_next ); |
3404
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log,"\t=> %s\t", |
3405
|
|
|
|
|
|
SvPV_nolen_const(mysv)); |
3406
|
|
|
|
|
|
} |
3407
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", |
3408
|
|
|
|
|
|
REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), |
3409
|
|
|
|
|
|
PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] |
3410
|
|
|
|
|
|
); |
3411
|
|
|
|
|
|
}); |
3412
|
|
|
|
|
|
|
3413
|
|
|
|
|
|
/* Is noper a trieable nodetype that can be merged with the |
3414
|
|
|
|
|
|
* current trie (if there is one)? */ |
3415
|
56
|
|
|
|
|
if ( noper_trietype |
3416
|
56
|
|
|
|
|
&& |
3417
|
|
|
|
|
|
( |
3418
|
39280
|
|
|
|
|
( noper_trietype == NOTHING) |
3419
|
59590
|
|
|
|
|
|| ( trietype == NOTHING ) |
3420
|
1832712
|
|
|
|
|
|| ( trietype == noper_trietype ) |
3421
|
|
|
|
|
|
) |
3422
|
|
|
|
|
|
#ifdef NOJUMPTRIE |
3423
|
|
|
|
|
|
&& noper_next == tail |
3424
|
|
|
|
|
|
#endif |
3425
|
38191443
|
|
|
|
|
&& count < U16_MAX) |
3426
|
|
|
|
|
|
{ |
3427
|
|
|
|
|
|
/* Handle mergable triable node |
3428
|
|
|
|
|
|
* Either we are the first node in a new trieable sequence, |
3429
|
|
|
|
|
|
* in which case we do some bookkeeping, otherwise we update |
3430
|
|
|
|
|
|
* the end pointer. */ |
3431
|
10896442
|
|
|
|
|
if ( !first ) { |
3432
|
|
|
|
|
|
first = cur; |
3433
|
10896394
|
|
|
|
|
if ( noper_trietype == NOTHING ) { |
3434
|
|
|
|
|
|
#if !defined(DEBUGGING) && !defined(NOJUMPTRIE) |
3435
|
|
|
|
|
|
regnode * const noper_next = regnext( noper ); |
3436
|
|
|
|
|
|
U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0; |
3437
|
|
|
|
|
|
U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; |
3438
|
|
|
|
|
|
#endif |
3439
|
|
|
|
|
|
|
3440
|
10896394
|
|
|
|
|
if ( noper_next_trietype ) { |
3441
|
|
|
|
|
|
trietype = noper_next_trietype; |
3442
|
2414754
|
|
|
|
|
} else if (noper_next_type) { |
3443
|
|
|
|
|
|
/* a NOTHING regop is 1 regop wide. We need at least two |
3444
|
|
|
|
|
|
* for a trie so we can't merge this in */ |
3445
|
|
|
|
|
|
first = NULL; |
3446
|
|
|
|
|
|
} |
3447
|
|
|
|
|
|
} else { |
3448
|
|
|
|
|
|
trietype = noper_trietype; |
3449
|
|
|
|
|
|
} |
3450
|
|
|
|
|
|
} else { |
3451
|
2414802
|
|
|
|
|
if ( trietype == NOTHING ) |
3452
|
|
|
|
|
|
trietype = noper_trietype; |
3453
|
|
|
|
|
|
last = cur; |
3454
|
|
|
|
|
|
} |
3455
|
1144186
|
|
|
|
|
if (first) |
3456
|
963624
|
|
|
|
|
count++; |
3457
|
|
|
|
|
|
} /* end handle mergable triable node */ |
3458
|
|
|
|
|
|
else { |
3459
|
|
|
|
|
|
/* handle unmergable node - |
3460
|
|
|
|
|
|
* noper may either be a triable node which can not be tried |
3461
|
|
|
|
|
|
* together with the current trie, or a non triable node */ |
3462
|
963584
|
|
|
|
|
if ( last ) { |
3463
|
|
|
|
|
|
/* If last is set and trietype is not NOTHING then we have found |
3464
|
|
|
|
|
|
* at least two triable branch sequences in a row of a similar |
3465
|
|
|
|
|
|
* trietype so we can turn them into a trie. If/when we |
3466
|
|
|
|
|
|
* allow NOTHING to start a trie sequence this condition will be |
3467
|
|
|
|
|
|
* required, and it isn't expensive so we leave it in for now. */ |
3468
|
296026
|
|
|
|
|
if ( trietype && trietype != NOTHING ) |
3469
|
296026
|
|
|
|
|
make_trie( pRExC_state, |
3470
|
|
|
|
|
|
startbranch, first, cur, tail, count, |
3471
|
|
|
|
|
|
trietype, depth+1 ); |
3472
|
|
|
|
|
|
last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ |
3473
|
|
|
|
|
|
} |
3474
|
296034
|
|
|
|
|
if ( noper_trietype |
3475
|
|
|
|
|
|
#ifdef NOJUMPTRIE |
3476
|
|
|
|
|
|
&& noper_next == tail |
3477
|
|
|
|
|
|
#endif |
3478
|
|
|
|
|
|
){ |
3479
|
|
|
|
|
|
/* noper is triable, so we can start a new trie sequence */ |
3480
|
|
|
|
|
|
count = 1; |
3481
|
|
|
|
|
|
first = cur; |
3482
|
|
|
|
|
|
trietype = noper_trietype; |
3483
|
848112
|
|
|
|
|
} else if (first) { |
3484
|
|
|
|
|
|
/* if we already saw a first but the current node is not triable then we have |
3485
|
|
|
|
|
|
* to reset the first information. */ |
3486
|
|
|
|
|
|
count = 0; |
3487
|
|
|
|
|
|
first = NULL; |
3488
|
|
|
|
|
|
trietype = 0; |
3489
|
|
|
|
|
|
} |
3490
|
|
|
|
|
|
} /* end handle unmergable node */ |
3491
|
|
|
|
|
|
} /* loop over branches */ |
3492
|
667558
|
|
|
|
|
DEBUG_TRIE_COMPILE_r({ |
3493
|
|
|
|
|
|
regprop(RExC_rx, mysv, cur); |
3494
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
3495
|
|
|
|
|
|
"%*s- %s (%d) \n", (int)depth * 2 + 2, |
3496
|
|
|
|
|
|
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); |
3497
|
|
|
|
|
|
|
3498
|
|
|
|
|
|
}); |
3499
|
848120
|
|
|
|
|
if ( last && trietype ) { |
3500
|
3711545
|
|
|
|
|
if ( trietype != NOTHING ) { |
3501
|
|
|
|
|
|
/* the last branch of the sequence was part of a trie, |
3502
|
|
|
|
|
|
* so we have to construct it here outside of the loop |
3503
|
|
|
|
|
|
*/ |
3504
|
418977
|
|
|
|
|
made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); |
3505
|
|
|
|
|
|
#ifdef TRIE_STUDY_OPT |
3506
|
418985
|
|
|
|
|
if ( ((made == MADE_EXACT_TRIE && |
3507
|
418977
|
|
|
|
|
startbranch == first) |
3508
|
3292574
|
|
|
|
|
|| ( first_non_open == first )) && |
3509
|
|
|
|
|
|
depth==0 ) { |
3510
|
3292574
|
|
|
|
|
flags |= SCF_TRIE_RESTUDY; |
3511
|
2975736
|
|
|
|
|
if ( startbranch == first |
3512
|
2975736
|
|
|
|
|
&& scan == tail ) |
3513
|
|
|
|
|
|
{ |
3514
|
4441028
|
|
|
|
|
RExC_seen &=~REG_TOP_LEVEL_BRANCHES; |
3515
|
|
|
|
|
|
} |
3516
|
|
|
|
|
|
} |
3517
|
|
|
|
|
|
#endif |
3518
|
|
|
|
|
|
} else { |
3519
|
|
|
|
|
|
/* at this point we know whatever we have is a NOTHING sequence/branch |
3520
|
|
|
|
|
|
* AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING |
3521
|
|
|
|
|
|
*/ |
3522
|
806
|
|
|
|
|
if ( startbranch == first ) { |
3523
|
|
|
|
|
|
regnode *opt; |
3524
|
|
|
|
|
|
/* the entire thing is a NOTHING sequence, something like this: |
3525
|
|
|
|
|
|
* (?:|) So we can turn it into a plain NOTHING op. */ |
3526
|
4440386
|
|
|
|
|
DEBUG_TRIE_COMPILE_r({ |
3527
|
|
|
|
|
|
regprop(RExC_rx, mysv, cur); |
3528
|
|
|
|
|
|
PerlIO_printf( Perl_debug_log, |
3529
|
|
|
|
|
|
"%*s- %s (%d) \n", (int)depth * 2 + 2, |
3530
|
|
|
|
|
|
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); |
3531
|
|
|
|
|
|
|
3532
|
|
|
|
|
|
}); |
3533
|
4440386
|
|
|
|
|
OP(startbranch)= NOTHING; |
3534
|
4441024
|
|
|
|
|
NEXT_OFF(startbranch)= tail - startbranch; |
3535
|
4441024
|
|
|
|
|
for ( opt= startbranch + 1; opt < tail ; opt++ ) |
3536
|
2414754
|
|
|
|
|
OP(opt)= OPTIMIZED; |
3537
|
|
|
|
|
|
} |
3538
|
|
|
|
|
|
} |
3539
|
|
|
|
|
|
} /* end if ( last) */ |
3540
|
|
|
|
|
|
} /* TRIE_MAXBUF is non zero */ |
3541
|
|
|
|
|
|
|
3542
|
|
|
|
|
|
} /* do trie */ |
3543
|
|
|
|
|
|
|
3544
|
|
|
|
|
|
} |
3545
|
2414754
|
|
|
|
|
else if ( code == BRANCHJ ) { /* single branch is optimized. */ |
3546
|
4441024
|
|
|
|
|
scan = NEXTOPER(NEXTOPER(scan)); |
3547
|
|
|
|
|
|
} else /* single branch is optimized. */ |
3548
|
4441024
|
|
|
|
|
scan = NEXTOPER(scan); |
3549
|
5156027
|
|
|
|
|
continue; |
3550
|
3965620
|
|
|
|
|
} else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { |
3551
|
|
|
|
|
|
scan_frame *newframe = NULL; |
3552
|
|
|
|
|
|
I32 paren; |
3553
|
|
|
|
|
|
regnode *start; |
3554
|
|
|
|
|
|
regnode *end; |
3555
|
|
|
|
|
|
|
3556
|
3965320
|
|
|
|
|
if (OP(scan) != SUSPEND) { |
3557
|
|
|
|
|
|
/* set the pointer */ |
3558
|
5156019
|
|
|
|
|
if (OP(scan) == GOSUB) { |
3559
|
5155657
|
|
|
|
|
paren = ARG(scan); |
3560
|
5155657
|
|
|
|
|
RExC_recurse[ARG2L(scan)] = scan; |
3561
|
5155657
|
|
|
|
|
start = RExC_open_parens[paren-1]; |
3562
|
860149
|
|
|
|
|
end = RExC_close_parens[paren-1]; |
3563
|
|
|
|
|
|
} else { |
3564
|
|
|
|
|
|
paren = 0; |
3565
|
5156019
|
|
|
|
|
start = RExC_rxi->program + 1; |
3566
|
1939909
|
|
|
|
|
end = RExC_opend; |
3567
|
|
|
|
|
|
} |
3568
|
1939909
|
|
|
|
|
if (!recursed) { |
3569
|
1939909
|
|
|
|
|
Newxz(recursed, (((RExC_npar)>>3) +1), U8); |
3570
|
1939909
|
|
|
|
|
SAVEFREEPV(recursed); |
3571
|
|
|
|
|
|
} |
3572
|
1939909
|
|
|
|
|
if (!PAREN_TEST(recursed,paren+1)) { |
3573
|
5156019
|
|
|
|
|
PAREN_SET(recursed,paren+1); |
3574
|
426292
|
|
|
|
|
Newx(newframe,1,scan_frame); |
3575
|
|
|
|
|
|
} else { |
3576
|
5156019
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
3577
|
5156019
|
|
|
|
|
SCAN_COMMIT(pRExC_state,data,minlenp); |
3578
|
1939909
|
|
|
|
|
data->longest = &(data->longest_float); |
3579
|
|
|
|
|
|
} |
3580
|
|
|
|
|
|
is_inf = is_inf_internal = 1; |
3581
|
5156019
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ |
3582
|
3606443
|
|
|
|
|
cl_anything(pRExC_state, data->start_class); |
3583
|
245366
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
3584
|
|
|
|
|
|
} |
3585
|
|
|
|
|
|
} else { |
3586
|
3361077
|
|
|
|
|
Newx(newframe,1,scan_frame); |
3587
|
|
|
|
|
|
paren = stopparen; |
3588
|
985875
|
|
|
|
|
start = scan+2; |
3589
|
985875
|
|
|
|
|
end = regnext(scan); |
3590
|
|
|
|
|
|
} |
3591
|
985875
|
|
|
|
|
if (newframe) { |
3592
|
985875
|
|
|
|
|
assert(start); |
3593
|
985875
|
|
|
|
|
assert(end); |
3594
|
985875
|
|
|
|
|
SAVEFREEPV(newframe); |
3595
|
1549576
|
|
|
|
|
newframe->next = regnext(scan); |
3596
|
61936
|
|
|
|
|
newframe->last = last; |
3597
|
61936
|
|
|
|
|
newframe->stop = stopparen; |
3598
|
1487640
|
|
|
|
|
newframe->prev = frame; |
3599
|
|
|
|
|
|
|
3600
|
|
|
|
|
|
frame = newframe; |
3601
|
646732
|
|
|
|
|
scan = start; |
3602
|
|
|
|
|
|
stopparen = paren; |
3603
|
|
|
|
|
|
last = end; |
3604
|
|
|
|
|
|
|
3605
|
1549576
|
|
|
|
|
continue; |
3606
|
|
|
|
|
|
} |
3607
|
|
|
|
|
|
} |
3608
|
5156319
|
|
|
|
|
else if (OP(scan) == EXACT) { |
3609
|
2741119
|
|
|
|
|
SSize_t l = STR_LEN(scan); |
3610
|
|
|
|
|
|
UV uc; |
3611
|
5156163
|
|
|
|
|
if (UTF) { |
3612
|
5154387
|
|
|
|
|
const U8 * const s = (U8*)STRING(scan); |
3613
|
2482219
|
|
|
|
|
uc = utf8_to_uvchr_buf(s, s + l, NULL); |
3614
|
7076
|
|
|
|
|
l = utf8_length(s, s + l); |
3615
|
|
|
|
|
|
} else { |
3616
|
338
|
|
|
|
|
uc = *((U8*)STRING(scan)); |
3617
|
|
|
|
|
|
} |
3618
|
166
|
|
|
|
|
min += l; |
3619
|
166
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ |
3620
|
|
|
|
|
|
/* The code below prefers earlier match for fixed |
3621
|
|
|
|
|
|
offset, later match for variable offset. */ |
3622
|
136
|
|
|
|
|
if (data->last_end == -1) { /* Update the start info. */ |
3623
|
5156017
|
|
|
|
|
data->last_start_min = data->pos_min; |
3624
|
7688130
|
|
|
|
|
data->last_start_max = is_inf |
3625
|
4320789
|
|
|
|
|
? SSize_t_MAX : data->pos_min + data->pos_delta; |
3626
|
|
|
|
|
|
} |
3627
|
5156133
|
|
|
|
|
sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); |
3628
|
5156133
|
|
|
|
|
if (UTF) |
3629
|
2014207
|
|
|
|
|
SvUTF8_on(data->last_found); |
3630
|
|
|
|
|
|
{ |
3631
|
5156133
|
|
|
|
|
SV * const sv = data->last_found; |
3632
|
2414888
|
|
|
|
|
MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? |
3633
|
1179612
|
|
|
|
|
mg_find(sv, PERL_MAGIC_utf8) : NULL; |
3634
|
478270
|
|
|
|
|
if (mg && mg->mg_len >= 0) |
3635
|
478152
|
|
|
|
|
mg->mg_len += utf8_length((U8*)STRING(scan), |
3636
|
|
|
|
|
|
(U8*)STRING(scan)+STR_LEN(scan)); |
3637
|
|
|
|
|
|
} |
3638
|
478270
|
|
|
|
|
data->last_end = data->pos_min + l; |
3639
|
282985
|
|
|
|
|
data->pos_min += l; /* As in the first entry. */ |
3640
|
182450
|
|
|
|
|
data->flags &= ~SF_BEFORE_EOL; |
3641
|
|
|
|
|
|
} |
3642
|
412500
|
|
|
|
|
if (flags & SCF_DO_STCLASS_AND) { |
3643
|
|
|
|
|
|
/* Check whether it is compatible with what we know already! */ |
3644
|
|
|
|
|
|
int compat = 1; |
3645
|
|
|
|
|
|
|
3646
|
|
|
|
|
|
|
3647
|
|
|
|
|
|
/* If compatible, we or it in below. It is compatible if is |
3648
|
|
|
|
|
|
* in the bitmp and either 1) its bit or its fold is set, or 2) |
3649
|
|
|
|
|
|
* it's for a locale. Even if there isn't unicode semantics |
3650
|
|
|
|
|
|
* here, at runtime there may be because of matching against a |
3651
|
|
|
|
|
|
* utf8 string, so accept a possible false positive for |
3652
|
|
|
|
|
|
* latin1-range folds */ |
3653
|
412612
|
|
|
|
|
if (uc >= 0x100 || |
3654
|
412478
|
|
|
|
|
(!(data->start_class->flags & ANYOF_LOCALE) |
3655
|
470
|
|
|
|
|
&& !ANYOF_BITMAP_TEST(data->start_class, uc) |
3656
|
348
|
|
|
|
|
&& (!(data->start_class->flags & ANYOF_LOC_FOLD) |
3657
|
412356
|
|
|
|
|
|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) |
3658
|
|
|
|
|
|
) |
3659
|
|
|
|
|
|
{ |
3660
|
|
|
|
|
|
compat = 0; |
3661
|
|
|
|
|
|
} |
3662
|
412490
|
|
|
|
|
ANYOF_CLASS_ZERO(data->start_class); |
3663
|
412490
|
|
|
|
|
ANYOF_BITMAP_ZERO(data->start_class); |
3664
|
5156149
|
|
|
|
|
if (compat) |
3665
|
2002516
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, uc); |
3666
|
1453778
|
|
|
|
|
else if (uc >= 0x100) { |
3667
|
|
|
|
|
|
int i; |
3668
|
|
|
|
|
|
|
3669
|
|
|
|
|
|
/* Some Unicode code points fold to the Latin1 range; as |
3670
|
|
|
|
|
|
* XXX temporary code, instead of figuring out if this is |
3671
|
|
|
|
|
|
* one, just assume it is and set all the start class bits |
3672
|
|
|
|
|
|
* that could be some such above 255 code point's fold |
3673
|
|
|
|
|
|
* which will generate fals positives. As the code |
3674
|
|
|
|
|
|
* elsewhere that does compute the fold settles down, it |
3675
|
|
|
|
|
|
* can be extracted out and re-used here */ |
3676
|
1456838
|
|
|
|
|
for (i = 0; i < 256; i++){ |
3677
|
527818
|
|
|
|
|
if (HAS_NONLATIN1_FOLD_CLOSURE(i)) { |
3678
|
515052
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, i); |
3679
|
|
|
|
|
|
} |
3680
|
|
|
|
|
|
} |
3681
|
|
|
|
|
|
} |
3682
|
514838
|
|
|
|
|
CLEAR_SSC_EOS(data->start_class); |
3683
|
657695
|
|
|
|
|
if (uc < 0x100) |
3684
|
794420
|
|
|
|
|
data->start_class->flags &= ~ANYOF_UNICODE_ALL; |
3685
|
|
|
|
|
|
} |
3686
|
514714
|
|
|
|
|
else if (flags & SCF_DO_STCLASS_OR) { |
3687
|
|
|
|
|
|
/* false positive possible if the class is case-folded */ |
3688
|
630904
|
|
|
|
|
if (uc < 0x100) |
3689
|
116200
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, uc); |
3690
|
|
|
|
|
|
else |
3691
|
116200
|
|
|
|
|
data->start_class->flags |= ANYOF_UNICODE_ALL; |
3692
|
116200
|
|
|
|
|
CLEAR_SSC_EOS(data->start_class); |
3693
|
348
|
|
|
|
|
cl_and(data->start_class, and_withp); |
3694
|
|
|
|
|
|
} |
3695
|
492
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
3696
|
|
|
|
|
|
} |
3697
|
116356
|
|
|
|
|
else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ |
3698
|
116234
|
|
|
|
|
SSize_t l = STR_LEN(scan); |
3699
|
116234
|
|
|
|
|
UV uc = *((U8*)STRING(scan)); |
3700
|
|
|
|
|
|
|
3701
|
|
|
|
|
|
/* Search for fixed substrings supports EXACT only. */ |
3702
|
398538
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
3703
|
4641317
|
|
|
|
|
assert(data); |
3704
|
1487696
|
|
|
|
|
SCAN_COMMIT(pRExC_state, data, minlenp); |
3705
|
|
|
|
|
|
} |
3706
|
1487724
|
|
|
|
|
if (UTF) { |
3707
|
709004
|
|
|
|
|
const U8 * const s = (U8 *)STRING(scan); |
3708
|
708790
|
|
|
|
|
uc = utf8_to_uvchr_buf(s, s + l, NULL); |
3709
|
708790
|
|
|
|
|
l = utf8_length(s, s + l); |
3710
|
|
|
|
|
|
} |
3711
|
34
|
|
|
|
|
if (has_exactf_sharp_s) { |
3712
|
1062285
|
|
|
|
|
RExC_seen |= REG_SEEN_EXACTF_SHARP_S; |
3713
|
|
|
|
|
|
} |
3714
|
708824
|
|
|
|
|
min += l - min_subtract; |
3715
|
5156049
|
|
|
|
|
assert (min >= 0); |
3716
|
631579
|
|
|
|
|
delta += min_subtract; |
3717
|
5156049
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
3718
|
3965322
|
|
|
|
|
data->pos_min += l - min_subtract; |
3719
|
3965322
|
|
|
|
|
if (data->pos_min < 0) { |
3720
|
118234
|
|
|
|
|
data->pos_min = 0; |
3721
|
|
|
|
|
|
} |
3722
|
118240
|
|
|
|
|
data->pos_delta += min_subtract; |
3723
|
118240
|
|
|
|
|
if (min_subtract) { |
3724
|
118234
|
|
|
|
|
data->longest = &(data->longest_float); |
3725
|
|
|
|
|
|
} |
3726
|
|
|
|
|
|
} |
3727
|
118268
|
|
|
|
|
if (flags & SCF_DO_STCLASS_AND) { |
3728
|
|
|
|
|
|
/* Check whether it is compatible with what we know already! */ |
3729
|
|
|
|
|
|
int compat = 1; |
3730
|
3272
|
|
|
|
|
if (uc >= 0x100 || |
3731
|
118262
|
|
|
|
|
(!(data->start_class->flags & ANYOF_LOCALE) |
3732
|
118262
|
|
|
|
|
&& !ANYOF_BITMAP_TEST(data->start_class, uc) |
3733
|
118234
|
|
|
|
|
&& !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) |
3734
|
|
|
|
|
|
{ |
3735
|
|
|
|
|
|
compat = 0; |
3736
|
|
|
|
|
|
} |
3737
|
111634
|
|
|
|
|
ANYOF_CLASS_ZERO(data->start_class); |
3738
|
14296
|
|
|
|
|
ANYOF_BITMAP_ZERO(data->start_class); |
3739
|
14296
|
|
|
|
|
if (compat) { |
3740
|
14296
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, uc); |
3741
|
14296
|
|
|
|
|
CLEAR_SSC_EOS(data->start_class); |
3742
|
14296
|
|
|
|
|
if (OP(scan) == EXACTFL) { |
3743
|
|
|
|
|
|
/* XXX This set is probably no longer necessary, and |
3744
|
|
|
|
|
|
* probably wrong as LOCALE now is on in the initial |
3745
|
|
|
|
|
|
* state */ |
3746
|
14268
|
|
|
|
|
data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; |
3747
|
|
|
|
|
|
} |
3748
|
|
|
|
|
|
else { |
3749
|
|
|
|
|
|
|
3750
|
|
|
|
|
|
/* Also set the other member of the fold pair. In case |
3751
|
|
|
|
|
|
* that unicode semantics is called for at runtime, use |
3752
|
|
|
|
|
|
* the full latin1 fold. (Can't do this for locale, |
3753
|
|
|
|
|
|
* because not known until runtime) */ |
3754
|
6988
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); |
3755
|
|
|
|
|
|
|
3756
|
|
|
|
|
|
/* All other (EXACTFL handled above) folds except under |
3757
|
|
|
|
|
|
* /iaa that include s, S, and sharp_s also may include |
3758
|
|
|
|
|
|
* the others */ |
3759
|
14296
|
|
|
|
|
if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) |
3760
|
|
|
|
|
|
{ |
3761
|
14296
|
|
|
|
|
if (uc == 's' || uc == 'S') { |
3762
|
0
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, |
3763
|
|
|
|
|
|
LATIN_SMALL_LETTER_SHARP_S); |
3764
|
|
|
|
|
|
} |
3765
|
14296
|
|
|
|
|
else if (uc == LATIN_SMALL_LETTER_SHARP_S) { |
3766
|
6628
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, 's'); |
3767
|
9942
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, 'S'); |
3768
|
|
|
|
|
|
} |
3769
|
|
|
|
|
|
} |
3770
|
|
|
|
|
|
} |
3771
|
|
|
|
|
|
} |
3772
|
6628
|
|
|
|
|
else if (uc >= 0x100) { |
3773
|
|
|
|
|
|
int i; |
3774
|
3965316
|
|
|
|
|
for (i = 0; i < 256; i++){ |
3775
|
5570619
|
|
|
|
|
if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { |
3776
|
3277556
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, i); |
3777
|
|
|
|
|
|
} |
3778
|
|
|
|
|
|
} |
3779
|
|
|
|
|
|
} |
3780
|
|
|
|
|
|
} |
3781
|
794028
|
|
|
|
|
else if (flags & SCF_DO_STCLASS_OR) { |
3782
|
6283198
|
|
|
|
|
if (data->start_class->flags & ANYOF_LOC_FOLD) { |
3783
|
|
|
|
|
|
/* false positive possible if the class is case-folded. |
3784
|
|
|
|
|
|
Assume that the locale settings are the same... */ |
3785
|
6342588
|
|
|
|
|
if (uc < 0x100) { |
3786
|
3965316
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, uc); |
3787
|
3943698
|
|
|
|
|
if (OP(scan) != EXACTFL) { |
3788
|
|
|
|
|
|
|
3789
|
|
|
|
|
|
/* And set the other member of the fold pair, but |
3790
|
|
|
|
|
|
* can't do that in locale because not known until |
3791
|
|
|
|
|
|
* run-time */ |
3792
|
3943698
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, |
3793
|
|
|
|
|
|
PL_fold_latin1[uc]); |
3794
|
|
|
|
|
|
|
3795
|
|
|
|
|
|
/* All folds except under /iaa that include s, S, |
3796
|
|
|
|
|
|
* and sharp_s also may include the others */ |
3797
|
113028
|
|
|
|
|
if (OP(scan) != EXACTFA |
3798
|
57382
|
|
|
|
|
&& OP(scan) != EXACTFA_NO_TRIE) |
3799
|
|
|
|
|
|
{ |
3800
|
116236
|
|
|
|
|
if (uc == 's' || uc == 'S') { |
3801
|
113028
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, |
3802
|
|
|
|
|
|
LATIN_SMALL_LETTER_SHARP_S); |
3803
|
|
|
|
|
|
} |
3804
|
3208
|
|
|
|
|
else if (uc == LATIN_SMALL_LETTER_SHARP_S) { |
3805
|
113028
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, 's'); |
3806
|
113028
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, 'S'); |
3807
|
|
|
|
|
|
} |
3808
|
|
|
|
|
|
} |
3809
|
|
|
|
|
|
} |
3810
|
|
|
|
|
|
} |
3811
|
113028
|
|
|
|
|
CLEAR_SSC_EOS(data->start_class); |
3812
|
|
|
|
|
|
} |
3813
|
113028
|
|
|
|
|
cl_and(data->start_class, and_withp); |
3814
|
|
|
|
|
|
} |
3815
|
127210
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
3816
|
|
|
|
|
|
} |
3817
|
14270
|
|
|
|
|
else if (REGNODE_VARIES(OP(scan))) { |
3818
|
|
|
|
|
|
SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; |
3819
|
14148
|
|
|
|
|
I32 fl = 0, f = flags; |
3820
|
3943698
|
|
|
|
|
regnode * const oscan = scan; |
3821
|
|
|
|
|
|
struct regnode_charclass_class this_class; |
3822
|
|
|
|
|
|
struct regnode_charclass_class *oclass = NULL; |
3823
|
|
|
|
|
|
I32 next_is_eval = 0; |
3824
|
|
|
|
|
|
|
3825
|
3965316
|
|
|
|
|
switch (PL_regkind[OP(scan)]) { |
3826
|
|
|
|
|
|
case WHILEM: /* End of (?:...)* . */ |
3827
|
5156015
|
|
|
|
|
scan = NEXTOPER(scan); |
3828
|
672
|
|
|
|
|
goto finish; |
3829
|
|
|
|
|
|
case PLUS: |
3830
|
8448583
|
|
|
|
|
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { |
3831
|
7893119
|
|
|
|
|
next = NEXTOPER(scan); |
3832
|
932226
|
|
|
|
|
if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { |
3833
|
|
|
|
|
|
mincount = 1; |
3834
|
|
|
|
|
|
maxcount = REG_INFTY; |
3835
|
932226
|
|
|
|
|
next = regnext(scan); |
3836
|
8448583
|
|
|
|
|
scan = NEXTOPER(scan); |
3837
|
33053
|
|
|
|
|
goto do_curly; |
3838
|
|
|
|
|
|
} |
3839
|
|
|
|
|
|
} |
3840
|
25923
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) |
3841
|
25923
|
|
|
|
|
data->pos_min++; |
3842
|
33053
|
|
|
|
|
min++; |
3843
|
|
|
|
|
|
/* Fall through. */ |
3844
|
|
|
|
|
|
case STAR: |
3845
|
360
|
|
|
|
|
if (flags & SCF_DO_STCLASS) { |
3846
|
|
|
|
|
|
mincount = 0; |
3847
|
|
|
|
|
|
maxcount = REG_INFTY; |
3848
|
33053
|
|
|
|
|
next = regnext(scan); |
3849
|
27295001
|
|
|
|
|
scan = NEXTOPER(scan); |
3850
|
18188
|
|
|
|
|
goto do_curly; |
3851
|
|
|
|
|
|
} |
3852
|
|
|
|
|
|
is_inf = is_inf_internal = 1; |
3853
|
17490
|
|
|
|
|
scan = regnext(scan); |
3854
|
17490
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
3855
|
4486185
|
|
|
|
|
SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ |
3856
|
4477440
|
|
|
|
|
data->longest = &(data->longest_float); |
3857
|
|
|
|
|
|
} |
3858
|
|
|
|
|
|
goto optimize_curly_tail; |
3859
|
|
|
|
|
|
case CURLY: |
3860
|
4389990
|
|
|
|
|
if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) |
3861
|
0
|
|
|
|
|
&& (scan->flags == stopparen)) |
3862
|
|
|
|
|
|
{ |
3863
|
|
|
|
|
|
mincount = 1; |
3864
|
|
|
|
|
|
maxcount = 1; |
3865
|
|
|
|
|
|
} else { |
3866
|
0
|
|
|
|
|
mincount = ARG1(scan); |
3867
|
0
|
|
|
|
|
maxcount = ARG2(scan); |
3868
|
|
|
|
|
|
} |
3869
|
17490
|
|
|
|
|
next = regnext(scan); |
3870
|
0
|
|
|
|
|
if (OP(scan) == CURLYX) { |
3871
|
17490
|
|
|
|
|
I32 lp = (data ? *(data->last_closep) : 0); |
3872
|
18188
|
|
|
|
|
scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); |
3873
|
|
|
|
|
|
} |
3874
|
18188
|
|
|
|
|
scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; |
3875
|
18188
|
|
|
|
|
next_is_eval = (OP(scan) == EVAL); |
3876
|
|
|
|
|
|
do_curly: |
3877
|
1116
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
3878
|
1116
|
|
|
|
|
if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ |
3879
|
1116
|
|
|
|
|
pos_before = data->pos_min; |
3880
|
|
|
|
|
|
} |
3881
|
1116
|
|
|
|
|
if (data) { |
3882
|
27276813
|
|
|
|
|
fl = data->flags; |
3883
|
4266835
|
|
|
|
|
data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); |
3884
|
2184871
|
|
|
|
|
if (is_inf) |
3885
|
2184871
|
|
|
|
|
data->flags |= SF_IS_INF; |
3886
|
|
|
|
|
|
} |
3887
|
4266835
|
|
|
|
|
if (flags & SCF_DO_STCLASS) { |
3888
|
4266835
|
|
|
|
|
cl_init(pRExC_state, &this_class); |
3889
|
1456087
|
|
|
|
|
oclass = data->start_class; |
3890
|
1456087
|
|
|
|
|
data->start_class = &this_class; |
3891
|
167964
|
|
|
|
|
f |= SCF_DO_STCLASS_AND; |
3892
|
0
|
|
|
|
|
f &= ~SCF_DO_STCLASS_OR; |
3893
|
|
|
|
|
|
} |
3894
|
|
|
|
|
|
/* Exclude from super-linear cache processing any {n,m} |
3895
|
|
|
|
|
|
regops for which the combination of input pos and regex |
3896
|
|
|
|
|
|
pos is not enough information to determine if a match |
3897
|
|
|
|
|
|
will be possible. |
3898
|
|
|
|
|
|
|
3899
|
|
|
|
|
|
For example, in the regex /foo(bar\s*){4,8}baz/ with the |
3900
|
|
|
|
|
|
regex pos at the \s*, the prospects for a match depend not |
3901
|
|
|
|
|
|
only on the input position but also on how many (bar\s*) |
3902
|
|
|
|
|
|
repeats into the {4,8} we are. */ |
3903
|
279325
|
|
|
|
|
if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) |
3904
|
111361
|
|
|
|
|
f &= ~SCF_WHILEM_VISITED_POS; |
3905
|
|
|
|
|
|
|
3906
|
|
|
|
|
|
/* This will finish on WHILEM, setting scan, or on NULL: */ |
3907
|
442
|
|
|
|
|
minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, |
3908
|
|
|
|
|
|
last, data, stopparen, recursed, NULL, |
3909
|
|
|
|
|
|
(mincount == 0 |
3910
|
|
|
|
|
|
? (f & ~SCF_DO_SUBSTR) : f),depth+1); |
3911
|
|
|
|
|
|
|
3912
|
28
|
|
|
|
|
if (flags & SCF_DO_STCLASS) |
3913
|
442
|
|
|
|
|
data->start_class = oclass; |
3914
|
111361
|
|
|
|
|
if (mincount == 0 || minnext == 0) { |
3915
|
110947
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) { |
3916
|
856944
|
|
|
|
|
cl_or(pRExC_state, data->start_class, &this_class); |
3917
|
|
|
|
|
|
} |
3918
|
768424
|
|
|
|
|
else if (flags & SCF_DO_STCLASS_AND) { |
3919
|
|
|
|
|
|
/* Switch to OR mode: cache the old value of |
3920
|
|
|
|
|
|
* data->start_class */ |
3921
|
88520
|
|
|
|
|
INIT_AND_WITHP; |
3922
|
294204
|
|
|
|
|
StructCopy(data->start_class, and_withp, |
3923
|
|
|
|
|
|
struct regnode_charclass_class); |
3924
|
294204
|
|
|
|
|
flags &= ~SCF_DO_STCLASS_AND; |
3925
|
291516
|
|
|
|
|
StructCopy(&this_class, data->start_class, |
3926
|
|
|
|
|
|
struct regnode_charclass_class); |
3927
|
290952
|
|
|
|
|
flags |= SCF_DO_STCLASS_OR; |
3928
|
74774664
|
|
|
|
|
SET_SSC_EOS(data->start_class); |
3929
|
|
|
|
|
|
} |
3930
|
|
|
|
|
|
} else { /* Non-zero len */ |
3931
|
74483712
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) { |
3932
|
64631706
|
|
|
|
|
cl_or(pRExC_state, data->start_class, &this_class); |
3933
|
2688
|
|
|
|
|
cl_and(data->start_class, and_withp); |
3934
|
|
|
|
|
|
} |
3935
|
58
|
|
|
|
|
else if (flags & SCF_DO_STCLASS_AND) |
3936
|
674595
|
|
|
|
|
cl_and(data->start_class, &this_class); |
3937
|
673280
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
3938
|
|
|
|
|
|
} |
3939
|
59340
|
|
|
|
|
if (!scan) /* It was not CURLYX, but CURLY. */ |
3940
|
25614
|
|
|
|
|
scan = next; |
3941
|
25614
|
|
|
|
|
if (!(flags & SCF_TRIE_DOING_RESTUDY) |
3942
|
|
|
|
|
|
/* ? quantifier ok, except for (?{ ... }) */ |
3943
|
22470
|
|
|
|
|
&& (next_is_eval || !(mincount == 0 && maxcount == 1)) |
3944
|
22470
|
|
|
|
|
&& (minnext == 0) && (deltanext == 0) |
3945
|
5774790
|
|
|
|
|
&& data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) |
3946
|
5752320
|
|
|
|
|
&& maxcount <= REG_INFTY/3) /* Complement check for big count */ |
3947
|
|
|
|
|
|
{ |
3948
|
|
|
|
|
|
/* Fatal warnings may leak the regexp without this: */ |
3949
|
1325032
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); |
3950
|
3144
|
|
|
|
|
ckWARNreg(RExC_parse, |
3951
|
|
|
|
|
|
"Quantifier unexpected on zero-length expression"); |
3952
|
0
|
|
|
|
|
(void)ReREFCNT_inc(RExC_rx_sv); |
3953
|
|
|
|
|
|
} |
3954
|
|
|
|
|
|
|
3955
|
806616
|
|
|
|
|
min += minnext * mincount; |
3956
|
804864
|
|
|
|
|
is_inf_internal |= deltanext == SSize_t_MAX |
3957
|
765560
|
|
|
|
|
|| (maxcount == REG_INFTY && minnext + deltanext > 0); |
3958
|
3144
|
|
|
|
|
is_inf |= is_inf_internal; |
3959
|
3144
|
|
|
|
|
if (is_inf) |
3960
|
|
|
|
|
|
delta = SSize_t_MAX; |
3961
|
|
|
|
|
|
else |
3962
|
1456087
|
|
|
|
|
delta += (minnext + deltanext) * maxcount - minnext * mincount; |
3963
|
|
|
|
|
|
|
3964
|
|
|
|
|
|
/* Try powerful optimization CURLYX => CURLYN. */ |
3965
|
94794
|
|
|
|
|
if ( OP(oscan) == CURLYX && data |
3966
|
|
|
|
|
|
&& data->flags & SF_IN_PAR |
3967
|
1456087
|
|
|
|
|
&& !(data->flags & SF_HAS_EVAL) |
3968
|
23009978
|
|
|
|
|
&& !deltanext && minnext == 1 ) { |
3969
|
|
|
|
|
|
/* Try to optimize to CURLYN. */ |
3970
|
4741990
|
|
|
|
|
regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; |
3971
|
|
|
|
|
|
regnode * const nxt1 = nxt; |
3972
|
|
|
|
|
|
#ifdef DEBUGGING |
3973
|
|
|
|
|
|
regnode *nxt2; |
3974
|
|
|
|
|
|
#endif |
3975
|
|
|
|
|
|
|
3976
|
|
|
|
|
|
/* Skip open. */ |
3977
|
2370995
|
|
|
|
|
nxt = regnext(nxt); |
3978
|
2370995
|
|
|
|
|
if (!REGNODE_SIMPLE(OP(nxt)) |
3979
|
20638983
|
|
|
|
|
&& !(PL_regkind[OP(nxt)] == EXACT |
3980
|
219885
|
|
|
|
|
&& STR_LEN(nxt) == 1)) |
3981
|
|
|
|
|
|
goto nogo; |
3982
|
|
|
|
|
|
#ifdef DEBUGGING |
3983
|
|
|
|
|
|
nxt2 = nxt; |
3984
|
|
|
|
|
|
#endif |
3985
|
219885
|
|
|
|
|
nxt = regnext(nxt); |
3986
|
219885
|
|
|
|
|
if (OP(nxt) != CLOSE) |
3987
|
|
|
|
|
|
goto nogo; |
3988
|
144920
|
|
|
|
|
if (RExC_open_parens) { |
3989
|
71920
|
|
|
|
|
RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ |
3990
|
0
|
|
|
|
|
RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ |
3991
|
|
|
|
|
|
} |
3992
|
|
|
|
|
|
/* Now we know that nxt2 is the only contents: */ |
3993
|
0
|
|
|
|
|
oscan->flags = (U8)ARG(nxt); |
3994
|
0
|
|
|
|
|
OP(oscan) = CURLYN; |
3995
|
0
|
|
|
|
|
OP(nxt1) = NOTHING; /* was OPEN. */ |
3996
|
|
|
|
|
|
|
3997
|
|
|
|
|
|
#ifdef DEBUGGING |
3998
|
0
|
|
|
|
|
OP(nxt1 + 1) = OPTIMIZED; /* was count. */ |
3999
|
0
|
|
|
|
|
NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ |
4000
|
0
|
|
|
|
|
NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ |
4001
|
0
|
|
|
|
|
OP(nxt) = OPTIMIZED; /* was CLOSE. */ |
4002
|
219885
|
|
|
|
|
OP(nxt + 1) = OPTIMIZED; /* was count. */ |
4003
|
219885
|
|
|
|
|
NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ |
4004
|
|
|
|
|
|
#endif |
4005
|
|
|
|
|
|
} |
4006
|
|
|
|
|
|
nogo: |
4007
|
|
|
|
|
|
|
4008
|
|
|
|
|
|
/* Try optimization CURLYX => CURLYM. */ |
4009
|
219885
|
|
|
|
|
if ( OP(oscan) == CURLYX && data |
4010
|
|
|
|
|
|
&& !(data->flags & SF_HAS_PAR) |
4011
|
219885
|
|
|
|
|
&& !(data->flags & SF_HAS_EVAL) |
4012
|
219885
|
|
|
|
|
&& !deltanext /* atom is fixed width */ |
4013
|
0
|
|
|
|
|
&& minnext != 0 /* CURLYM can't handle zero width */ |
4014
|
219885
|
|
|
|
|
&& ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ |
4015
|
|
|
|
|
|
) { |
4016
|
|
|
|
|
|
/* XXXX How to optimize if data == 0? */ |
4017
|
|
|
|
|
|
/* Optimize to a simpler form. */ |
4018
|
219885
|
|
|
|
|
regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ |
4019
|
|
|
|
|
|
regnode *nxt2; |
4020
|
|
|
|
|
|
|
4021
|
64270
|
|
|
|
|
OP(oscan) = CURLYM; |
4022
|
9444
|
|
|
|
|
while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ |
4023
|
9444
|
|
|
|
|
&& (OP(nxt2) != WHILEM)) |
4024
|
|
|
|
|
|
nxt = nxt2; |
4025
|
219885
|
|
|
|
|
OP(nxt2) = SUCCEED; /* Whas WHILEM */ |
4026
|
|
|
|
|
|
/* Need to optimize away parenths. */ |
4027
|
217387
|
|
|
|
|
if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { |
4028
|
|
|
|
|
|
/* Set the parenth number. */ |
4029
|
219885
|
|
|
|
|
regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ |
4030
|
|
|
|
|
|
|
4031
|
219885
|
|
|
|
|
oscan->flags = (U8)ARG(nxt); |
4032
|
219885
|
|
|
|
|
if (RExC_open_parens) { |
4033
|
219885
|
|
|
|
|
RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ |
4034
|
46724
|
|
|
|
|
RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ |
4035
|
|
|
|
|
|
} |
4036
|
16
|
|
|
|
|
OP(nxt1) = OPTIMIZED; /* was OPEN. */ |
4037
|
46708
|
|
|
|
|
OP(nxt) = OPTIMIZED; /* was CLOSE. */ |
4038
|
|
|
|
|
|
|
4039
|
|
|
|
|
|
#ifdef DEBUGGING |
4040
|
2
|
|
|
|
|
OP(nxt1 + 1) = OPTIMIZED; /* was count. */ |
4041
|
46706
|
|
|
|
|
OP(nxt + 1) = OPTIMIZED; /* was count. */ |
4042
|
219867
|
|
|
|
|
NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ |
4043
|
219867
|
|
|
|
|
NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ |
4044
|
|
|
|
|
|
#endif |
4045
|
|
|
|
|
|
#if 0 |
4046
|
|
|
|
|
|
while ( nxt1 && (OP(nxt1) != WHILEM)) { |
4047
|
|
|
|
|
|
regnode *nnxt = regnext(nxt1); |
4048
|
|
|
|
|
|
if (nnxt == nxt) { |
4049
|
|
|
|
|
|
if (reg_off_by_arg[OP(nxt1)]) |
4050
|
|
|
|
|
|
ARG_SET(nxt1, nxt2 - nxt1); |
4051
|
|
|
|
|
|
else if (nxt2 - nxt1 < U16_MAX) |
4052
|
|
|
|
|
|
NEXT_OFF(nxt1) = nxt2 - nxt1; |
4053
|
|
|
|
|
|
else |
4054
|
|
|
|
|
|
OP(nxt) = NOTHING; /* Cannot beautify */ |
4055
|
|
|
|
|
|
} |
4056
|
|
|
|
|
|
nxt1 = nnxt; |
4057
|
|
|
|
|
|
} |
4058
|
|
|
|
|
|
#endif |
4059
|
|
|
|
|
|
/* Optimize again: */ |
4060
|
1634
|
|
|
|
|
study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, |
4061
|
|
|
|
|
|
NULL, stopparen, recursed, NULL, 0,depth+1); |
4062
|
|
|
|
|
|
} |
4063
|
|
|
|
|
|
else |
4064
|
219867
|
|
|
|
|
oscan->flags = 0; |
4065
|
|
|
|
|
|
} |
4066
|
0
|
|
|
|
|
else if ((OP(oscan) == CURLYX) |
4067
|
219867
|
|
|
|
|
&& (flags & SCF_WHILEM_VISITED_POS) |
4068
|
|
|
|
|
|
/* See the comment on a similar expression above. |
4069
|
|
|
|
|
|
However, this time it's not a subexpression |
4070
|
|
|
|
|
|
we care about, but the expression itself. */ |
4071
|
219867
|
|
|
|
|
&& (maxcount == REG_INFTY) |
4072
|
9444
|
|
|
|
|
&& data && ++data->whilem_c < 16) { |
4073
|
|
|
|
|
|
/* This stays as CURLYX, we can put the count/of pair. */ |
4074
|
|
|
|
|
|
/* Find WHILEM (as in regexec.c) */ |
4075
|
178
|
|
|
|
|
regnode *nxt = oscan + NEXT_OFF(oscan); |
4076
|
|
|
|
|
|
|
4077
|
9266
|
|
|
|
|
if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ |
4078
|
9266
|
|
|
|
|
nxt += ARG(nxt); |
4079
|
9266
|
|
|
|
|
PREVOPER(nxt)->flags = (U8)(data->whilem_c |
4080
|
9266
|
|
|
|
|
| (RExC_whilem_seen << 4)); /* On WHILEM */ |
4081
|
|
|
|
|
|
} |
4082
|
20419098
|
|
|
|
|
if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) |
4083
|
5604918
|
|
|
|
|
pars++; |
4084
|
5601450
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
4085
|
|
|
|
|
|
SV *last_str = NULL; |
4086
|
14814180
|
|
|
|
|
int counted = mincount != 0; |
4087
|
|
|
|
|
|
|
4088
|
5602032
|
|
|
|
|
if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ |
4089
|
|
|
|
|
|
#if defined(SPARC64_GCC_WORKAROUND) |
4090
|
|
|
|
|
|
SSize_t b = 0; |
4091
|
|
|
|
|
|
STRLEN l = 0; |
4092
|
|
|
|
|
|
const char *s = NULL; |
4093
|
|
|
|
|
|
SSize_t old = 0; |
4094
|
|
|
|
|
|
|
4095
|
|
|
|
|
|
if (pos_before >= data->last_start_min) |
4096
|
|
|
|
|
|
b = pos_before; |
4097
|
|
|
|
|
|
else |
4098
|
|
|
|
|
|
b = data->last_start_min; |
4099
|
|
|
|
|
|
|
4100
|
|
|
|
|
|
l = 0; |
4101
|
|
|
|
|
|
s = SvPV_const(data->last_found, l); |
4102
|
|
|
|
|
|
old = b - data->last_start_min; |
4103
|
|
|
|
|
|
|
4104
|
|
|
|
|
|
#else |
4105
|
5601450
|
|
|
|
|
SSize_t b = pos_before >= data->last_start_min |
4106
|
2388919
|
|
|
|
|
? pos_before : data->last_start_min; |
4107
|
|
|
|
|
|
STRLEN l; |
4108
|
2388919
|
|
|
|
|
const char * const s = SvPV_const(data->last_found, l); |
4109
|
5601450
|
|
|
|
|
SSize_t old = b - data->last_start_min; |
4110
|
|
|
|
|
|
#endif |
4111
|
|
|
|
|
|
|
4112
|
5601450
|
|
|
|
|
if (UTF) |
4113
|
9212148
|
|
|
|
|
old = utf8_hop((U8*)s, old) - (U8*)s; |
4114
|
114456
|
|
|
|
|
l -= old; |
4115
|
|
|
|
|
|
/* Get the added string: */ |
4116
|
114456
|
|
|
|
|
last_str = newSVpvn_utf8(s + old, l, UTF); |
4117
|
9097692
|
|
|
|
|
if (deltanext == 0 && pos_before == b) { |
4118
|
|
|
|
|
|
/* What was added is a constant string */ |
4119
|
746
|
|
|
|
|
if (mincount > 1) { |
4120
|
506
|
|
|
|
|
SvGROW(last_str, (mincount * l) + 1); |
4121
|
506
|
|
|
|
|
repeatcpy(SvPVX(last_str) + l, |
4122
|
746
|
|
|
|
|
SvPVX_const(last_str), l, mincount - 1); |
4123
|
116
|
|
|
|
|
SvCUR_set(last_str, SvCUR(last_str) * mincount); |
4124
|
|
|
|
|
|
/* Add additional parts. */ |
4125
|
116
|
|
|
|
|
SvCUR_set(data->last_found, |
4126
|
|
|
|
|
|
SvCUR(data->last_found) - l); |
4127
|
9096946
|
|
|
|
|
sv_catsv(data->last_found, last_str); |
4128
|
|
|
|
|
|
{ |
4129
|
111760
|
|
|
|
|
SV * sv = data->last_found; |
4130
|
|
|
|
|
|
MAGIC *mg = |
4131
|
2856
|
|
|
|
|
SvUTF8(sv) && SvMAGICAL(sv) ? |
4132
|
2856
|
|
|
|
|
mg_find(sv, PERL_MAGIC_utf8) : NULL; |
4133
|
111760
|
|
|
|
|
if (mg && mg->mg_len >= 0) |
4134
|
158
|
|
|
|
|
mg->mg_len += CHR_SVLEN(last_str) - l; |
4135
|
|
|
|
|
|
} |
4136
|
111760
|
|
|
|
|
data->last_end += l * (mincount - 1); |
4137
|
|
|
|
|
|
} |
4138
|
|
|
|
|
|
} else { |
4139
|
|
|
|
|
|
/* start offset must point into the last copy */ |
4140
|
8985186
|
|
|
|
|
data->last_start_min += minnext * (mincount - 1); |
4141
|
184878
|
|
|
|
|
data->last_start_max += is_inf ? SSize_t_MAX |
4142
|
184878
|
|
|
|
|
: (maxcount - 1) * (minnext + data->pos_delta); |
4143
|
|
|
|
|
|
} |
4144
|
|
|
|
|
|
} |
4145
|
|
|
|
|
|
/* It is counted once already... */ |
4146
|
123612
|
|
|
|
|
data->pos_min += minnext * (mincount - counted); |
4147
|
|
|
|
|
|
#if 0 |
4148
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf |
4149
|
|
|
|
|
|
" SSize_t_MAX=%"UVdf" minnext=%"UVdf |
4150
|
|
|
|
|
|
" maxcount=%"UVdf" mincount=%"UVdf"\n", |
4151
|
|
|
|
|
|
(UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, |
4152
|
|
|
|
|
|
(UV)mincount); |
4153
|
|
|
|
|
|
if (deltanext != SSize_t_MAX) |
4154
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", |
4155
|
|
|
|
|
|
(UV)(-counted * deltanext + (minnext + deltanext) * maxcount |
4156
|
|
|
|
|
|
- minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); |
4157
|
|
|
|
|
|
#endif |
4158
|
28
|
|
|
|
|
if (deltanext == SSize_t_MAX || |
4159
|
123612
|
|
|
|
|
-counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) |
4160
|
60
|
|
|
|
|
data->pos_delta = SSize_t_MAX; |
4161
|
|
|
|
|
|
else |
4162
|
0
|
|
|
|
|
data->pos_delta += - counted * deltanext + |
4163
|
0
|
|
|
|
|
(minnext + deltanext) * maxcount - minnext * mincount; |
4164
|
8861574
|
|
|
|
|
if (mincount != maxcount) { |
4165
|
|
|
|
|
|
/* Cannot extend fixed substrings found inside |
4166
|
|
|
|
|
|
the group. */ |
4167
|
36473
|
|
|
|
|
SCAN_COMMIT(pRExC_state,data,minlenp); |
4168
|
36473
|
|
|
|
|
if (mincount && last_str) { |
4169
|
36473
|
|
|
|
|
SV * const sv = data->last_found; |
4170
|
36473
|
|
|
|
|
MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? |
4171
|
32863
|
|
|
|
|
mg_find(sv, PERL_MAGIC_utf8) : NULL; |
4172
|
|
|
|
|
|
|
4173
|
36473
|
|
|
|
|
if (mg) |
4174
|
214
|
|
|
|
|
mg->mg_len = -1; |
4175
|
36473
|
|
|
|
|
sv_setsv(sv, last_str); |
4176
|
34725
|
|
|
|
|
data->last_end = data->pos_min; |
4177
|
34725
|
|
|
|
|
data->last_start_min = |
4178
|
6016
|
|
|
|
|
data->pos_min - CHR_SVLEN(last_str); |
4179
|
5142
|
|
|
|
|
data->last_start_max = is_inf |
4180
|
|
|
|
|
|
? SSize_t_MAX |
4181
|
5142
|
|
|
|
|
: data->pos_min + data->pos_delta |
4182
|
5142
|
|
|
|
|
- CHR_SVLEN(last_str); |
4183
|
|
|
|
|
|
} |
4184
|
4470
|
|
|
|
|
data->longest = &(data->longest_float); |
4185
|
|
|
|
|
|
} |
4186
|
4470
|
|
|
|
|
SvREFCNT_dec(last_str); |
4187
|
|
|
|
|
|
} |
4188
|
672
|
|
|
|
|
if (data && (fl & SF_HAS_EVAL)) |
4189
|
5142
|
|
|
|
|
data->flags |= SF_HAS_EVAL; |
4190
|
|
|
|
|
|
optimize_curly_tail: |
4191
|
5142
|
|
|
|
|
if (OP(oscan) != CURLYX) { |
4192
|
0
|
|
|
|
|
while (PL_regkind[OP(next = regnext(oscan))] == NOTHING |
4193
|
0
|
|
|
|
|
&& NEXT_OFF(next)) |
4194
|
5142
|
|
|
|
|
NEXT_OFF(oscan) += NEXT_OFF(next); |
4195
|
|
|
|
|
|
} |
4196
|
4470
|
|
|
|
|
continue; |
4197
|
|
|
|
|
|
default: /* REF, and CLUMP only? */ |
4198
|
5142
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
4199
|
3766
|
|
|
|
|
SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ |
4200
|
1748
|
|
|
|
|
data->longest = &(data->longest_float); |
4201
|
|
|
|
|
|
} |
4202
|
|
|
|
|
|
is_inf = is_inf_internal = 1; |
4203
|
3766
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) |
4204
|
3766
|
|
|
|
|
cl_anything(pRExC_state, data->start_class); |
4205
|
5142
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
4206
|
|
|
|
|
|
break; |
4207
|
|
|
|
|
|
} |
4208
|
|
|
|
|
|
} |
4209
|
3100
|
|
|
|
|
else if (OP(scan) == LNBREAK) { |
4210
|
5142
|
|
|
|
|
if (flags & SCF_DO_STCLASS) { |
4211
|
|
|
|
|
|
int value = 0; |
4212
|
2298
|
|
|
|
|
CLEAR_SSC_EOS(data->start_class); /* No match on empty */ |
4213
|
5142
|
|
|
|
|
if (flags & SCF_DO_STCLASS_AND) { |
4214
|
3970
|
|
|
|
|
for (value = 0; value < 256; value++) |
4215
|
1440
|
|
|
|
|
if (!is_VERTWS_cp(value)) |
4216
|
5142
|
|
|
|
|
ANYOF_BITMAP_CLEAR(data->start_class, value); |
4217
|
|
|
|
|
|
} |
4218
|
|
|
|
|
|
else { |
4219
|
834
|
|
|
|
|
for (value = 0; value < 256; value++) |
4220
|
5142
|
|
|
|
|
if (is_VERTWS_cp(value)) |
4221
|
0
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, value); |
4222
|
|
|
|
|
|
} |
4223
|
0
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) |
4224
|
0
|
|
|
|
|
cl_and(data->start_class, and_withp); |
4225
|
0
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
4226
|
|
|
|
|
|
} |
4227
|
0
|
|
|
|
|
min++; |
4228
|
5142
|
|
|
|
|
delta++; /* Because of the 2 char string cr-lf */ |
4229
|
4470
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
4230
|
24
|
|
|
|
|
SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ |
4231
|
4470
|
|
|
|
|
data->pos_min += 1; |
4232
|
5142
|
|
|
|
|
data->pos_delta += 1; |
4233
|
0
|
|
|
|
|
data->longest = &(data->longest_float); |
4234
|
|
|
|
|
|
} |
4235
|
|
|
|
|
|
} |
4236
|
36595
|
|
|
|
|
else if (REGNODE_SIMPLE(OP(scan))) { |
4237
|
|
|
|
|
|
int value = 0; |
4238
|
|
|
|
|
|
|
4239
|
32869
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
4240
|
32869
|
|
|
|
|
SCAN_COMMIT(pRExC_state,data,minlenp); |
4241
|
32869
|
|
|
|
|
data->pos_min++; |
4242
|
|
|
|
|
|
} |
4243
|
30019
|
|
|
|
|
min++; |
4244
|
36479
|
|
|
|
|
if (flags & SCF_DO_STCLASS) { |
4245
|
|
|
|
|
|
int loop_max = 256; |
4246
|
36473
|
|
|
|
|
CLEAR_SSC_EOS(data->start_class); /* No match on empty */ |
4247
|
|
|
|
|
|
|
4248
|
|
|
|
|
|
/* Some of the logic below assumes that switching |
4249
|
|
|
|
|
|
locale on will only add false positives. */ |
4250
|
36473
|
|
|
|
|
switch (PL_regkind[OP(scan)]) { |
4251
|
|
|
|
|
|
U8 classnum; |
4252
|
|
|
|
|
|
|
4253
|
|
|
|
|
|
case SANY: |
4254
|
|
|
|
|
|
default: |
4255
|
|
|
|
|
|
#ifdef DEBUGGING |
4256
|
42
|
|
|
|
|
Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); |
4257
|
|
|
|
|
|
#endif |
4258
|
|
|
|
|
|
do_default: |
4259
|
42
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ |
4260
|
42
|
|
|
|
|
cl_anything(pRExC_state, data->start_class); |
4261
|
|
|
|
|
|
break; |
4262
|
|
|
|
|
|
case REG_ANY: |
4263
|
42
|
|
|
|
|
if (OP(scan) == SANY) |
4264
|
|
|
|
|
|
goto do_default; |
4265
|
36431
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ |
4266
|
172
|
|
|
|
|
value = (ANYOF_BITMAP_TEST(data->start_class,'\n') |
4267
|
0
|
|
|
|
|
|| ANYOF_CLASS_TEST_ANY_SET(data->start_class)); |
4268
|
0
|
|
|
|
|
cl_anything(pRExC_state, data->start_class); |
4269
|
|
|
|
|
|
} |
4270
|
172
|
|
|
|
|
if (flags & SCF_DO_STCLASS_AND || !value) |
4271
|
172
|
|
|
|
|
ANYOF_BITMAP_CLEAR(data->start_class,'\n'); |
4272
|
|
|
|
|
|
break; |
4273
|
|
|
|
|
|
case ANYOF: |
4274
|
172
|
|
|
|
|
if (flags & SCF_DO_STCLASS_AND) |
4275
|
172
|
|
|
|
|
cl_and(data->start_class, |
4276
|
|
|
|
|
|
(struct regnode_charclass_class*)scan); |
4277
|
|
|
|
|
|
else |
4278
|
172
|
|
|
|
|
cl_or(pRExC_state, data->start_class, |
4279
|
|
|
|
|
|
(struct regnode_charclass_class*)scan); |
4280
|
|
|
|
|
|
break; |
4281
|
|
|
|
|
|
case POSIXA: |
4282
|
|
|
|
|
|
loop_max = 128; |
4283
|
|
|
|
|
|
/* FALL THROUGH */ |
4284
|
|
|
|
|
|
case POSIXL: |
4285
|
|
|
|
|
|
case POSIXD: |
4286
|
|
|
|
|
|
case POSIXU: |
4287
|
172
|
|
|
|
|
classnum = FLAGS(scan); |
4288
|
36473
|
|
|
|
|
if (flags & SCF_DO_STCLASS_AND) { |
4289
|
36473
|
|
|
|
|
if (!(data->start_class->flags & ANYOF_LOCALE)) { |
4290
|
46720021
|
|
|
|
|
ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1); |
4291
|
15077722
|
|
|
|
|
for (value = 0; value < loop_max; value++) { |
4292
|
19658
|
|
|
|
|
if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { |
4293
|
19658
|
|
|
|
|
ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value)); |
4294
|
|
|
|
|
|
} |
4295
|
|
|
|
|
|
} |
4296
|
|
|
|
|
|
} |
4297
|
|
|
|
|
|
} |
4298
|
|
|
|
|
|
else { |
4299
|
19658
|
|
|
|
|
if (data->start_class->flags & ANYOF_LOCALE) { |
4300
|
19658
|
|
|
|
|
ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum)); |
4301
|
|
|
|
|
|
} |
4302
|
|
|
|
|
|
else { |
4303
|
|
|
|
|
|
|
4304
|
|
|
|
|
|
/* Even if under locale, set the bits for non-locale |
4305
|
|
|
|
|
|
* in case it isn't a true locale-node. This will |
4306
|
|
|
|
|
|
* create false positives if it truly is locale */ |
4307
|
19658
|
|
|
|
|
for (value = 0; value < loop_max; value++) { |
4308
|
17472818
|
|
|
|
|
if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { |
4309
|
17472818
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value)); |
4310
|
|
|
|
|
|
} |
4311
|
|
|
|
|
|
} |
4312
|
|
|
|
|
|
} |
4313
|
|
|
|
|
|
} |
4314
|
|
|
|
|
|
break; |
4315
|
|
|
|
|
|
case NPOSIXA: |
4316
|
|
|
|
|
|
loop_max = 128; |
4317
|
|
|
|
|
|
/* FALL THROUGH */ |
4318
|
|
|
|
|
|
case NPOSIXL: |
4319
|
|
|
|
|
|
case NPOSIXU: |
4320
|
|
|
|
|
|
case NPOSIXD: |
4321
|
17472818
|
|
|
|
|
classnum = FLAGS(scan); |
4322
|
4343511
|
|
|
|
|
if (flags & SCF_DO_STCLASS_AND) { |
4323
|
17472818
|
|
|
|
|
if (!(data->start_class->flags & ANYOF_LOCALE)) { |
4324
|
17472818
|
|
|
|
|
ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum)); |
4325
|
1306136
|
|
|
|
|
for (value = 0; value < loop_max; value++) { |
4326
|
1306136
|
|
|
|
|
if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { |
4327
|
16166682
|
|
|
|
|
ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value)); |
4328
|
|
|
|
|
|
} |
4329
|
|
|
|
|
|
} |
4330
|
|
|
|
|
|
} |
4331
|
|
|
|
|
|
} |
4332
|
|
|
|
|
|
else { |
4333
|
3838307
|
|
|
|
|
if (data->start_class->flags & ANYOF_LOCALE) { |
4334
|
3838307
|
|
|
|
|
ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1); |
4335
|
|
|
|
|
|
} |
4336
|
|
|
|
|
|
else { |
4337
|
|
|
|
|
|
|
4338
|
|
|
|
|
|
/* Even if under locale, set the bits for non-locale in |
4339
|
|
|
|
|
|
* case it isn't a true locale-node. This will create |
4340
|
|
|
|
|
|
* false positives if it truly is locale */ |
4341
|
17472818
|
|
|
|
|
for (value = 0; value < loop_max; value++) { |
4342
|
382429
|
|
|
|
|
if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) { |
4343
|
17472818
|
|
|
|
|
ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value)); |
4344
|
|
|
|
|
|
} |
4345
|
|
|
|
|
|
} |
4346
|
33895
|
|
|
|
|
if (PL_regkind[OP(scan)] == NPOSIXD) { |
4347
|
17472818
|
|
|
|
|
data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL; |
4348
|
|
|
|
|
|
} |
4349
|
|
|
|
|
|
} |
4350
|
|
|
|
|
|
} |
4351
|
|
|
|
|
|
break; |
4352
|
|
|
|
|
|
} |
4353
|
2753131
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) |
4354
|
2753131
|
|
|
|
|
cl_and(data->start_class, and_withp); |
4355
|
2753131
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
4356
|
|
|
|
|
|
} |
4357
|
|
|
|
|
|
} |
4358
|
2753247
|
|
|
|
|
else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { |
4359
|
666260
|
|
|
|
|
data->flags |= (OP(scan) == MEOL |
4360
|
|
|
|
|
|
? SF_BEFORE_MEOL |
4361
|
2086883
|
|
|
|
|
: SF_BEFORE_SEOL); |
4362
|
2753135
|
|
|
|
|
SCAN_COMMIT(pRExC_state, data, minlenp); |
4363
|
|
|
|
|
|
|
4364
|
|
|
|
|
|
} |
4365
|
2753243
|
|
|
|
|
else if ( PL_regkind[OP(scan)] == BRANCHJ |
4366
|
|
|
|
|
|
/* Lookbehind, or need to calculate parens/evals/stclass: */ |
4367
|
2753131
|
|
|
|
|
&& (scan->flags || data || (flags & SCF_DO_STCLASS)) |
4368
|
48
|
|
|
|
|
&& (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { |
4369
|
48
|
|
|
|
|
if ( OP(scan) == UNLESSM && |
4370
|
48
|
|
|
|
|
scan->flags == 0 && |
4371
|
0
|
|
|
|
|
OP(NEXTOPER(NEXTOPER(scan))) == NOTHING && |
4372
|
0
|
|
|
|
|
OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED |
4373
|
|
|
|
|
|
) { |
4374
|
|
|
|
|
|
regnode *opt; |
4375
|
0
|
|
|
|
|
regnode *upto= regnext(scan); |
4376
|
0
|
|
|
|
|
DEBUG_PARSE_r({ |
4377
|
|
|
|
|
|
SV * const mysv_val=sv_newmortal(); |
4378
|
|
|
|
|
|
DEBUG_STUDYDATA("OPFAIL",data,depth); |
4379
|
|
|
|
|
|
|
4380
|
|
|
|
|
|
/*DEBUG_PARSE_MSG("opfail");*/ |
4381
|
|
|
|
|
|
regprop(RExC_rx, mysv_val, upto); |
4382
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", |
4383
|
|
|
|
|
|
SvPV_nolen_const(mysv_val), |
4384
|
|
|
|
|
|
(IV)REG_NODE_NUM(upto), |
4385
|
|
|
|
|
|
(IV)(upto - scan) |
4386
|
|
|
|
|
|
); |
4387
|
|
|
|
|
|
}); |
4388
|
0
|
|
|
|
|
OP(scan) = OPFAIL; |
4389
|
0
|
|
|
|
|
NEXT_OFF(scan) = upto - scan; |
4390
|
0
|
|
|
|
|
for (opt= scan + 1; opt < upto ; opt++) |
4391
|
0
|
|
|
|
|
OP(opt) = OPTIMIZED; |
4392
|
336
|
|
|
|
|
scan= upto; |
4393
|
288
|
|
|
|
|
continue; |
4394
|
|
|
|
|
|
} |
4395
|
|
|
|
|
|
if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY |
4396
|
|
|
|
|
|
|| OP(scan) == UNLESSM ) |
4397
|
|
|
|
|
|
{ |
4398
|
|
|
|
|
|
/* Negative Lookahead/lookbehind |
4399
|
|
|
|
|
|
In this case we can't do fixed string optimisation. |
4400
|
|
|
|
|
|
*/ |
4401
|
|
|
|
|
|
|
4402
|
48
|
|
|
|
|
SSize_t deltanext, minnext, fake = 0; |
4403
|
|
|
|
|
|
regnode *nscan; |
4404
|
|
|
|
|
|
struct regnode_charclass_class intrnl; |
4405
|
|
|
|
|
|
int f = 0; |
4406
|
|
|
|
|
|
|
4407
|
48
|
|
|
|
|
data_fake.flags = 0; |
4408
|
4931834
|
|
|
|
|
if (data) { |
4409
|
4931834
|
|
|
|
|
data_fake.whilem_c = data->whilem_c; |
4410
|
4477820
|
|
|
|
|
data_fake.last_closep = data->last_closep; |
4411
|
|
|
|
|
|
} |
4412
|
|
|
|
|
|
else |
4413
|
4477820
|
|
|
|
|
data_fake.last_closep = &fake; |
4414
|
4477814
|
|
|
|
|
data_fake.pos_delta = delta; |
4415
|
4477814
|
|
|
|
|
if ( flags & SCF_DO_STCLASS && !scan->flags |
4416
|
56
|
|
|
|
|
&& OP(scan) == IFMATCH ) { /* Lookahead */ |
4417
|
454014
|
|
|
|
|
cl_init(pRExC_state, &intrnl); |
4418
|
50512
|
|
|
|
|
data_fake.start_class = &intrnl; |
4419
|
|
|
|
|
|
f |= SCF_DO_STCLASS_AND; |
4420
|
|
|
|
|
|
} |
4421
|
50512
|
|
|
|
|
if (flags & SCF_WHILEM_VISITED_POS) |
4422
|
2547655
|
|
|
|
|
f |= SCF_WHILEM_VISITED_POS; |
4423
|
0
|
|
|
|
|
next = regnext(scan); |
4424
|
0
|
|
|
|
|
nscan = NEXTOPER(NEXTOPER(scan)); |
4425
|
0
|
|
|
|
|
minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, |
4426
|
|
|
|
|
|
last, &data_fake, stopparen, recursed, NULL, f, depth+1); |
4427
|
0
|
|
|
|
|
if (scan->flags) { |
4428
|
0
|
|
|
|
|
if (deltanext) { |
4429
|
0
|
|
|
|
|
FAIL("Variable length lookbehind not implemented"); |
4430
|
|
|
|
|
|
} |
4431
|
312412
|
|
|
|
|
else if (minnext > (I32)U8_MAX) { |
4432
|
312412
|
|
|
|
|
FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); |
4433
|
|
|
|
|
|
} |
4434
|
312412
|
|
|
|
|
scan->flags = (U8)minnext; |
4435
|
|
|
|
|
|
} |
4436
|
7322298
|
|
|
|
|
if (data) { |
4437
|
6853680
|
|
|
|
|
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) |
4438
|
6853676
|
|
|
|
|
pars++; |
4439
|
4
|
|
|
|
|
if (data_fake.flags & SF_HAS_EVAL) |
4440
|
4
|
|
|
|
|
data->flags |= SF_HAS_EVAL; |
4441
|
6853680
|
|
|
|
|
data->whilem_c = data_fake.whilem_c; |
4442
|
|
|
|
|
|
} |
4443
|
3482
|
|
|
|
|
if (f & SCF_DO_STCLASS_AND) { |
4444
|
202
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) { |
4445
|
|
|
|
|
|
/* OR before, AND after: ideally we would recurse with |
4446
|
|
|
|
|
|
* data_fake to get the AND applied by study of the |
4447
|
|
|
|
|
|
* remainder of the pattern, and then derecurse; |
4448
|
|
|
|
|
|
* *** HACK *** for now just treat as "no information". |
4449
|
|
|
|
|
|
* See [perl #56690]. |
4450
|
|
|
|
|
|
*/ |
4451
|
202
|
|
|
|
|
cl_init(pRExC_state, data->start_class); |
4452
|
|
|
|
|
|
} else { |
4453
|
|
|
|
|
|
/* AND before and after: combine and continue */ |
4454
|
3280
|
|
|
|
|
const int was = TEST_SSC_EOS(data->start_class); |
4455
|
|
|
|
|
|
|
4456
|
202
|
|
|
|
|
cl_and(data->start_class, &intrnl); |
4457
|
202
|
|
|
|
|
if (was) |
4458
|
6853680
|
|
|
|
|
SET_SSC_EOS(data->start_class); |
4459
|
|
|
|
|
|
} |
4460
|
|
|
|
|
|
} |
4461
|
|
|
|
|
|
} |
4462
|
|
|
|
|
|
#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY |
4463
|
|
|
|
|
|
else { |
4464
|
|
|
|
|
|
/* Positive Lookahead/lookbehind |
4465
|
|
|
|
|
|
In this case we can do fixed string optimisation, |
4466
|
|
|
|
|
|
but we must be careful about it. Note in the case of |
4467
|
|
|
|
|
|
lookbehind the positions will be offset by the minimum |
4468
|
|
|
|
|
|
length of the pattern, something we won't know about |
4469
|
|
|
|
|
|
until after the recurse. |
4470
|
|
|
|
|
|
*/ |
4471
|
|
|
|
|
|
SSize_t deltanext; |
4472
|
|
|
|
|
|
I32 fake = 0; |
4473
|
|
|
|
|
|
regnode *nscan; |
4474
|
|
|
|
|
|
struct regnode_charclass_class intrnl; |
4475
|
|
|
|
|
|
int f = 0; |
4476
|
|
|
|
|
|
/* We use SAVEFREEPV so that when the full compile |
4477
|
|
|
|
|
|
is finished perl will clean up the allocated |
4478
|
|
|
|
|
|
minlens when it's all done. This way we don't |
4479
|
|
|
|
|
|
have to worry about freeing them when we know |
4480
|
|
|
|
|
|
they wont be used, which would be a pain. |
4481
|
|
|
|
|
|
*/ |
4482
|
|
|
|
|
|
SSize_t *minnextp; |
4483
|
|
|
|
|
|
Newx( minnextp, 1, SSize_t ); |
4484
|
|
|
|
|
|
SAVEFREEPV(minnextp); |
4485
|
|
|
|
|
|
|
4486
|
|
|
|
|
|
if (data) { |
4487
|
|
|
|
|
|
StructCopy(data, &data_fake, scan_data_t); |
4488
|
|
|
|
|
|
if ((flags & SCF_DO_SUBSTR) && data->last_found) { |
4489
|
|
|
|
|
|
f |= SCF_DO_SUBSTR; |
4490
|
|
|
|
|
|
if (scan->flags) |
4491
|
|
|
|
|
|
SCAN_COMMIT(pRExC_state, &data_fake,minlenp); |
4492
|
|
|
|
|
|
data_fake.last_found=newSVsv(data->last_found); |
4493
|
|
|
|
|
|
} |
4494
|
|
|
|
|
|
} |
4495
|
|
|
|
|
|
else |
4496
|
|
|
|
|
|
data_fake.last_closep = &fake; |
4497
|
|
|
|
|
|
data_fake.flags = 0; |
4498
|
|
|
|
|
|
data_fake.pos_delta = delta; |
4499
|
|
|
|
|
|
if (is_inf) |
4500
|
|
|
|
|
|
data_fake.flags |= SF_IS_INF; |
4501
|
|
|
|
|
|
if ( flags & SCF_DO_STCLASS && !scan->flags |
4502
|
|
|
|
|
|
&& OP(scan) == IFMATCH ) { /* Lookahead */ |
4503
|
|
|
|
|
|
cl_init(pRExC_state, &intrnl); |
4504
|
|
|
|
|
|
data_fake.start_class = &intrnl; |
4505
|
|
|
|
|
|
f |= SCF_DO_STCLASS_AND; |
4506
|
|
|
|
|
|
} |
4507
|
|
|
|
|
|
if (flags & SCF_WHILEM_VISITED_POS) |
4508
|
|
|
|
|
|
f |= SCF_WHILEM_VISITED_POS; |
4509
|
|
|
|
|
|
next = regnext(scan); |
4510
|
|
|
|
|
|
nscan = NEXTOPER(NEXTOPER(scan)); |
4511
|
|
|
|
|
|
|
4512
|
|
|
|
|
|
*minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, |
4513
|
|
|
|
|
|
last, &data_fake, stopparen, recursed, NULL, f,depth+1); |
4514
|
|
|
|
|
|
if (scan->flags) { |
4515
|
|
|
|
|
|
if (deltanext) { |
4516
|
|
|
|
|
|
FAIL("Variable length lookbehind not implemented"); |
4517
|
|
|
|
|
|
} |
4518
|
|
|
|
|
|
else if (*minnextp > (I32)U8_MAX) { |
4519
|
|
|
|
|
|
FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); |
4520
|
|
|
|
|
|
} |
4521
|
|
|
|
|
|
scan->flags = (U8)*minnextp; |
4522
|
|
|
|
|
|
} |
4523
|
|
|
|
|
|
|
4524
|
|
|
|
|
|
*minnextp += min; |
4525
|
|
|
|
|
|
|
4526
|
|
|
|
|
|
if (f & SCF_DO_STCLASS_AND) { |
4527
|
|
|
|
|
|
const int was = TEST_SSC_EOS(data.start_class); |
4528
|
|
|
|
|
|
|
4529
|
|
|
|
|
|
cl_and(data->start_class, &intrnl); |
4530
|
|
|
|
|
|
if (was) |
4531
|
|
|
|
|
|
SET_SSC_EOS(data->start_class); |
4532
|
|
|
|
|
|
} |
4533
|
|
|
|
|
|
if (data) { |
4534
|
|
|
|
|
|
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) |
4535
|
|
|
|
|
|
pars++; |
4536
|
|
|
|
|
|
if (data_fake.flags & SF_HAS_EVAL) |
4537
|
|
|
|
|
|
data->flags |= SF_HAS_EVAL; |
4538
|
|
|
|
|
|
data->whilem_c = data_fake.whilem_c; |
4539
|
|
|
|
|
|
if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { |
4540
|
|
|
|
|
|
if (RExC_rx->minlen<*minnextp) |
4541
|
|
|
|
|
|
RExC_rx->minlen=*minnextp; |
4542
|
|
|
|
|
|
SCAN_COMMIT(pRExC_state, &data_fake, minnextp); |
4543
|
|
|
|
|
|
SvREFCNT_dec_NN(data_fake.last_found); |
4544
|
|
|
|
|
|
|
4545
|
|
|
|
|
|
if ( data_fake.minlen_fixed != minlenp ) |
4546
|
|
|
|
|
|
{ |
4547
|
|
|
|
|
|
data->offset_fixed= data_fake.offset_fixed; |
4548
|
|
|
|
|
|
data->minlen_fixed= data_fake.minlen_fixed; |
4549
|
|
|
|
|
|
data->lookbehind_fixed+= scan->flags; |
4550
|
|
|
|
|
|
} |
4551
|
|
|
|
|
|
if ( data_fake.minlen_float != minlenp ) |
4552
|
|
|
|
|
|
{ |
4553
|
|
|
|
|
|
data->minlen_float= data_fake.minlen_float; |
4554
|
|
|
|
|
|
data->offset_float_min=data_fake.offset_float_min; |
4555
|
|
|
|
|
|
data->offset_float_max=data_fake.offset_float_max; |
4556
|
|
|
|
|
|
data->lookbehind_float+= scan->flags; |
4557
|
|
|
|
|
|
} |
4558
|
|
|
|
|
|
} |
4559
|
|
|
|
|
|
} |
4560
|
|
|
|
|
|
} |
4561
|
|
|
|
|
|
#endif |
4562
|
|
|
|
|
|
} |
4563
|
6853792
|
|
|
|
|
else if (OP(scan) == OPEN) { |
4564
|
312422
|
|
|
|
|
if (stopparen != (I32)ARG(scan)) |
4565
|
312422
|
|
|
|
|
pars++; |
4566
|
|
|
|
|
|
} |
4567
|
312514
|
|
|
|
|
else if (OP(scan) == CLOSE) { |
4568
|
312422
|
|
|
|
|
if (stopparen == (I32)ARG(scan)) { |
4569
|
|
|
|
|
|
break; |
4570
|
|
|
|
|
|
} |
4571
|
312422
|
|
|
|
|
if ((I32)ARG(scan) == is_par) { |
4572
|
312420
|
|
|
|
|
next = regnext(scan); |
4573
|
|
|
|
|
|
|
4574
|
26813582
|
|
|
|
|
if ( next && (OP(next) != WHILEM) && next < last) |
4575
|
|
|
|
|
|
is_par = 0; /* Disable optimization */ |
4576
|
|
|
|
|
|
} |
4577
|
26813584
|
|
|
|
|
if (data) |
4578
|
6931925
|
|
|
|
|
*(data->last_closep) = ARG(scan); |
4579
|
|
|
|
|
|
} |
4580
|
6932007
|
|
|
|
|
else if (OP(scan) == EVAL) { |
4581
|
116880278
|
|
|
|
|
if (data) |
4582
|
45033410
|
|
|
|
|
data->flags |= SF_HAS_EVAL; |
4583
|
|
|
|
|
|
} |
4584
|
45033342
|
|
|
|
|
else if ( PL_regkind[OP(scan)] == ENDLIKE ) { |
4585
|
16
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
4586
|
45033314
|
|
|
|
|
SCAN_COMMIT(pRExC_state,data,minlenp); |
4587
|
45033330
|
|
|
|
|
flags &= ~SCF_DO_SUBSTR; |
4588
|
|
|
|
|
|
} |
4589
|
36
|
|
|
|
|
if (data && OP(scan)==ACCEPT) { |
4590
|
36
|
|
|
|
|
data->flags |= SCF_SEEN_ACCEPT; |
4591
|
0
|
|
|
|
|
if (stopmin > min) |
4592
|
|
|
|
|
|
stopmin = min; |
4593
|
|
|
|
|
|
} |
4594
|
|
|
|
|
|
} |
4595
|
48
|
|
|
|
|
else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ |
4596
|
|
|
|
|
|
{ |
4597
|
8
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
4598
|
8
|
|
|
|
|
SCAN_COMMIT(pRExC_state,data,minlenp); |
4599
|
8
|
|
|
|
|
data->longest = &(data->longest_float); |
4600
|
|
|
|
|
|
} |
4601
|
|
|
|
|
|
is_inf = is_inf_internal = 1; |
4602
|
0
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ |
4603
|
0
|
|
|
|
|
cl_anything(pRExC_state, data->start_class); |
4604
|
28
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
4605
|
|
|
|
|
|
} |
4606
|
48
|
|
|
|
|
else if (OP(scan) == GPOS) { |
4607
|
36
|
|
|
|
|
if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && |
4608
|
36
|
|
|
|
|
!(delta || is_inf || (data && data->pos_delta))) |
4609
|
|
|
|
|
|
{ |
4610
|
45033294
|
|
|
|
|
if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) |
4611
|
4498516
|
|
|
|
|
RExC_rx->extflags |= RXf_ANCH_GPOS; |
4612
|
11068
|
|
|
|
|
if (RExC_rx->gofs < (STRLEN)min) |
4613
|
11066
|
|
|
|
|
RExC_rx->gofs = min; |
4614
|
|
|
|
|
|
} else { |
4615
|
11066
|
|
|
|
|
RExC_rx->extflags |= RXf_GPOS_FLOAT; |
4616
|
11066
|
|
|
|
|
RExC_rx->gofs = 0; |
4617
|
|
|
|
|
|
} |
4618
|
|
|
|
|
|
} |
4619
|
|
|
|
|
|
#ifdef TRIE_STUDY_OPT |
4620
|
|
|
|
|
|
#ifdef FULL_TRIE_STUDY |
4621
|
11078
|
|
|
|
|
else if (PL_regkind[OP(scan)] == TRIE) { |
4622
|
|
|
|
|
|
/* NOTE - There is similar code to this block above for handling |
4623
|
|
|
|
|
|
BRANCH nodes on the initial study. If you change stuff here |
4624
|
|
|
|
|
|
check there too. */ |
4625
|
11072
|
|
|
|
|
regnode *trie_node= scan; |
4626
|
4498522
|
|
|
|
|
regnode *tail= regnext(scan); |
4627
|
22249075
|
|
|
|
|
reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; |
4628
|
|
|
|
|
|
SSize_t max1 = 0, min1 = SSize_t_MAX; |
4629
|
|
|
|
|
|
struct regnode_charclass_class accum; |
4630
|
|
|
|
|
|
|
4631
|
45033300
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ |
4632
|
15504
|
|
|
|
|
SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ |
4633
|
15504
|
|
|
|
|
if (flags & SCF_DO_STCLASS) |
4634
|
4
|
|
|
|
|
cl_init_zero(pRExC_state, &accum); |
4635
|
|
|
|
|
|
|
4636
|
10
|
|
|
|
|
if (!trie->jump) { |
4637
|
10
|
|
|
|
|
min1= trie->minlen; |
4638
|
6
|
|
|
|
|
max1= trie->maxlen; |
4639
|
|
|
|
|
|
} else { |
4640
|
|
|
|
|
|
const regnode *nextbranch= NULL; |
4641
|
|
|
|
|
|
U32 word; |
4642
|
|
|
|
|
|
|
4643
|
45033392
|
|
|
|
|
for ( word=1 ; word <= trie->wordcount ; word++) |
4644
|
|
|
|
|
|
{ |
4645
|
14
|
|
|
|
|
SSize_t deltanext=0, minnext=0, f = 0, fake; |
4646
|
|
|
|
|
|
struct regnode_charclass_class this_class; |
4647
|
|
|
|
|
|
|
4648
|
14
|
|
|
|
|
data_fake.flags = 0; |
4649
|
14
|
|
|
|
|
if (data) { |
4650
|
45033378
|
|
|
|
|
data_fake.whilem_c = data->whilem_c; |
4651
|
102
|
|
|
|
|
data_fake.last_closep = data->last_closep; |
4652
|
|
|
|
|
|
} |
4653
|
|
|
|
|
|
else |
4654
|
102
|
|
|
|
|
data_fake.last_closep = &fake; |
4655
|
110
|
|
|
|
|
data_fake.pos_delta = delta; |
4656
|
102
|
|
|
|
|
if (flags & SCF_DO_STCLASS) { |
4657
|
8
|
|
|
|
|
cl_init(pRExC_state, &this_class); |
4658
|
98
|
|
|
|
|
data_fake.start_class = &this_class; |
4659
|
|
|
|
|
|
f = SCF_DO_STCLASS_AND; |
4660
|
|
|
|
|
|
} |
4661
|
45033280
|
|
|
|
|
if (flags & SCF_WHILEM_VISITED_POS) |
4662
|
15021019
|
|
|
|
|
f |= SCF_WHILEM_VISITED_POS; |
4663
|
|
|
|
|
|
|
4664
|
45033280
|
|
|
|
|
if (trie->jump[word]) { |
4665
|
25151657
|
|
|
|
|
if (!nextbranch) |
4666
|
25151657
|
|
|
|
|
nextbranch = trie_node + trie->jump[0]; |
4667
|
25151657
|
|
|
|
|
scan= trie_node + trie->jump[word]; |
4668
|
|
|
|
|
|
/* We go from the jump point to the branch that follows |
4669
|
|
|
|
|
|
it. Note this means we need the vestigal unused branches |
4670
|
|
|
|
|
|
even though they arent otherwise used. |
4671
|
|
|
|
|
|
*/ |
4672
|
2132
|
|
|
|
|
minnext = study_chunk(pRExC_state, &scan, minlenp, |
4673
|
|
|
|
|
|
&deltanext, (regnode *)nextbranch, &data_fake, |
4674
|
|
|
|
|
|
stopparen, recursed, NULL, f,depth+1); |
4675
|
|
|
|
|
|
} |
4676
|
2132
|
|
|
|
|
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) |
4677
|
2132
|
|
|
|
|
nextbranch= regnext((regnode*)nextbranch); |
4678
|
|
|
|
|
|
|
4679
|
25151657
|
|
|
|
|
if (min1 > (SSize_t)(minnext + trie->minlen)) |
4680
|
45033280
|
|
|
|
|
min1 = minnext + trie->minlen; |
4681
|
11062
|
|
|
|
|
if (deltanext == SSize_t_MAX) { |
4682
|
|
|
|
|
|
is_inf = is_inf_internal = 1; |
4683
|
|
|
|
|
|
max1 = SSize_t_MAX; |
4684
|
45033294
|
|
|
|
|
} else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) |
4685
|
1593078
|
|
|
|
|
max1 = minnext + deltanext + trie->maxlen; |
4686
|
|
|
|
|
|
|
4687
|
1593078
|
|
|
|
|
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) |
4688
|
1593078
|
|
|
|
|
pars++; |
4689
|
103156
|
|
|
|
|
if (data_fake.flags & SCF_SEEN_ACCEPT) { |
4690
|
103156
|
|
|
|
|
if ( stopmin > min + min1) |
4691
|
103156
|
|
|
|
|
stopmin = min + min1; |
4692
|
206370
|
|
|
|
|
flags &= ~SCF_DO_SUBSTR; |
4693
|
103214
|
|
|
|
|
if (data) |
4694
|
103214
|
|
|
|
|
data->flags |= SCF_SEEN_ACCEPT; |
4695
|
|
|
|
|
|
} |
4696
|
103214
|
|
|
|
|
if (data) { |
4697
|
103214
|
|
|
|
|
if (data_fake.flags & SF_HAS_EVAL) |
4698
|
103214
|
|
|
|
|
data->flags |= SF_HAS_EVAL; |
4699
|
103214
|
|
|
|
|
data->whilem_c = data_fake.whilem_c; |
4700
|
|
|
|
|
|
} |
4701
|
103214
|
|
|
|
|
if (flags & SCF_DO_STCLASS) |
4702
|
154821
|
|
|
|
|
cl_or(pRExC_state, &accum, &this_class); |
4703
|
|
|
|
|
|
} |
4704
|
|
|
|
|
|
} |
4705
|
103220
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
4706
|
26813580
|
|
|
|
|
data->pos_min += min1; |
4707
|
6931921
|
|
|
|
|
data->pos_delta += max1 - min1; |
4708
|
26813580
|
|
|
|
|
if (max1 != min1 || is_inf) |
4709
|
5172
|
|
|
|
|
data->longest = &(data->longest_float); |
4710
|
|
|
|
|
|
} |
4711
|
391241
|
|
|
|
|
min += min1; |
4712
|
388688
|
|
|
|
|
delta += max1 - min1; |
4713
|
304954
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) { |
4714
|
5184
|
|
|
|
|
cl_or(pRExC_state, data->start_class, &accum); |
4715
|
5184
|
|
|
|
|
if (min1) { |
4716
|
5184
|
|
|
|
|
cl_and(data->start_class, and_withp); |
4717
|
386131
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
4718
|
|
|
|
|
|
} |
4719
|
|
|
|
|
|
} |
4720
|
5272
|
|
|
|
|
else if (flags & SCF_DO_STCLASS_AND) { |
4721
|
5210
|
|
|
|
|
if (min1) { |
4722
|
64
|
|
|
|
|
cl_and(data->start_class, &accum); |
4723
|
64
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; |
4724
|
|
|
|
|
|
} |
4725
|
|
|
|
|
|
else { |
4726
|
|
|
|
|
|
/* Switch to OR mode: cache the old value of |
4727
|
|
|
|
|
|
* data->start_class */ |
4728
|
0
|
|
|
|
|
INIT_AND_WITHP; |
4729
|
0
|
|
|
|
|
StructCopy(data->start_class, and_withp, |
4730
|
|
|
|
|
|
struct regnode_charclass_class); |
4731
|
64
|
|
|
|
|
flags &= ~SCF_DO_STCLASS_AND; |
4732
|
64
|
|
|
|
|
StructCopy(&accum, data->start_class, |
4733
|
|
|
|
|
|
struct regnode_charclass_class); |
4734
|
2730
|
|
|
|
|
flags |= SCF_DO_STCLASS_OR; |
4735
|
2666
|
|
|
|
|
SET_SSC_EOS(data->start_class); |
4736
|
|
|
|
|
|
} |
4737
|
|
|
|
|
|
} |
4738
|
6
|
|
|
|
|
scan= tail; |
4739
|
70
|
|
|
|
|
continue; |
4740
|
|
|
|
|
|
} |
4741
|
|
|
|
|
|
#else |
4742
|
|
|
|
|
|
else if (PL_regkind[OP(scan)] == TRIE) { |
4743
|
|
|
|
|
|
reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; |
4744
|
|
|
|
|
|
U8*bang=NULL; |
4745
|
|
|
|
|
|
|
4746
|
|
|
|
|
|
min += trie->minlen; |
4747
|
|
|
|
|
|
delta += (trie->maxlen - trie->minlen); |
4748
|
|
|
|
|
|
flags &= ~SCF_DO_STCLASS; /* xxx */ |
4749
|
|
|
|
|
|
if (flags & SCF_DO_SUBSTR) { |
4750
|
|
|
|
|
|
SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ |
4751
|
|
|
|
|
|
data->pos_min += trie->minlen; |
4752
|
|
|
|
|
|
data->pos_delta += (trie->maxlen - trie->minlen); |
4753
|
|
|
|
|
|
if (trie->maxlen != trie->minlen) |
4754
|
|
|
|
|
|
data->longest = &(data->longest_float); |
4755
|
|
|
|
|
|
} |
4756
|
|
|
|
|
|
if (trie->jump) /* no more substrings -- for now /grr*/ |
4757
|
|
|
|
|
|
flags &= ~SCF_DO_SUBSTR; |
4758
|
|
|
|
|
|
} |
4759
|
|
|
|
|
|
#endif /* old or new */ |
4760
|
|
|
|
|
|
#endif /* TRIE_STUDY_OPT */ |
4761
|
|
|
|
|
|
|
4762
|
|
|
|
|
|
/* Else: zero-length, ignore. */ |
4763
|
358
|
|
|
|
|
scan = regnext(scan); |
4764
|
|
|
|
|
|
} |
4765
|
2730
|
|
|
|
|
if (frame) { |
4766
|
2470
|
|
|
|
|
last = frame->last; |
4767
|
352
|
|
|
|
|
scan = frame->next; |
4768
|
240
|
|
|
|
|
stopparen = frame->stop; |
4769
|
218
|
|
|
|
|
frame = frame->prev; |
4770
|
218
|
|
|
|
|
goto fake_study_recurse; |
4771
|
|
|
|
|
|
} |
4772
|
|
|
|
|
|
|
4773
|
|
|
|
|
|
finish: |
4774
|
218
|
|
|
|
|
assert(!frame); |
4775
|
218
|
|
|
|
|
DEBUG_STUDYDATA("pre-fin:",data,depth); |
4776
|
|
|
|
|
|
|
4777
|
218
|
|
|
|
|
*scanp = scan; |
4778
|
2644
|
|
|
|
|
*deltap = is_inf_internal ? SSize_t_MAX : delta; |
4779
|
196
|
|
|
|
|
if (flags & SCF_DO_SUBSTR && is_inf) |
4780
|
2448
|
|
|
|
|
data->pos_delta = SSize_t_MAX - data->pos_min; |
4781
|
260
|
|
|
|
|
if (is_par > (I32)U8_MAX) |
4782
|
|
|
|
|
|
is_par = 0; |
4783
|
260
|
|
|
|
|
if (is_par && pars==1 && data) { |
4784
|
14
|
|
|
|
|
data->flags |= SF_IN_PAR; |
4785
|
64
|
|
|
|
|
data->flags &= ~SF_HAS_PAR; |
4786
|
|
|
|
|
|
} |
4787
|
260
|
|
|
|
|
else if (pars && data) { |
4788
|
74
|
|
|
|
|
data->flags |= SF_HAS_PAR; |
4789
|
74
|
|
|
|
|
data->flags &= ~SF_IN_PAR; |
4790
|
|
|
|
|
|
} |
4791
|
260
|
|
|
|
|
if (flags & SCF_DO_STCLASS_OR) |
4792
|
64
|
|
|
|
|
cl_and(data->start_class, and_withp); |
4793
|
260
|
|
|
|
|
if (flags & SCF_TRIE_RESTUDY) |
4794
|
70
|
|
|
|
|
data->flags |= SCF_TRIE_RESTUDY; |
4795
|
|
|
|
|
|
|
4796
|
260
|
|
|
|
|
DEBUG_STUDYDATA("post-fin:",data,depth); |
4797
|
|
|
|
|
|
|
4798
|
260
|
|
|
|
|
return min < stopmin ? min : stopmin; |
4799
|
|
|
|
|
|
} |
4800
|
|
|
|
|
|
|
4801
|
|
|
|
|
|
STATIC U32 |
4802
|
156
|
|
|
|
|
S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) |
4803
|
|
|
|
|
|
{ |
4804
|
156
|
|
|
|
|
U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; |
4805
|
|
|
|
|
|
|
4806
|
156
|
|
|
|
|
PERL_ARGS_ASSERT_ADD_DATA; |
4807
|
|
|
|
|
|
|
4808
|
156
|
|
|
|
|
Renewc(RExC_rxi->data, |
4809
|
|
|
|
|
|
sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), |
4810
|
|
|
|
|
|
char, struct reg_data); |
4811
|
92
|
|
|
|
|
if(count) |
4812
|
16
|
|
|
|
|
Renew(RExC_rxi->data->what, count + n, U8); |
4813
|
|
|
|
|
|
else |
4814
|
140
|
|
|
|
|
Newx(RExC_rxi->data->what, n, U8); |
4815
|
156
|
|
|
|
|
RExC_rxi->data->count = count + n; |
4816
|
156
|
|
|
|
|
Copy(s, RExC_rxi->data->what + count, n, U8); |
4817
|
156
|
|
|
|
|
return count; |
4818
|
|
|
|
|
|
} |
4819
|
|
|
|
|
|
|
4820
|
|
|
|
|
|
/*XXX: todo make this not included in a non debugging perl */ |
4821
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
4822
|
|
|
|
|
|
void |
4823
|
|
|
|
|
|
Perl_reginitcolors(pTHX) |
4824
|
|
|
|
|
|
{ |
4825
|
|
|
|
|
|
dVAR; |
4826
|
|
|
|
|
|
const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); |
4827
|
|
|
|
|
|
if (s) { |
4828
|
|
|
|
|
|
char *t = savepv(s); |
4829
|
|
|
|
|
|
int i = 0; |
4830
|
|
|
|
|
|
PL_colors[0] = t; |
4831
|
|
|
|
|
|
while (++i < 6) { |
4832
|
|
|
|
|
|
t = strchr(t, '\t'); |
4833
|
|
|
|
|
|
if (t) { |
4834
|
|
|
|
|
|
*t = '\0'; |
4835
|
|
|
|
|
|
PL_colors[i] = ++t; |
4836
|
|
|
|
|
|
} |
4837
|
|
|
|
|
|
else |
4838
|
|
|
|
|
|
PL_colors[i] = t = (char *)""; |
4839
|
|
|
|
|
|
} |
4840
|
|
|
|
|
|
} else { |
4841
|
|
|
|
|
|
int i = 0; |
4842
|
|
|
|
|
|
while (i < 6) |
4843
|
|
|
|
|
|
PL_colors[i++] = (char *)""; |
4844
|
|
|
|
|
|
} |
4845
|
|
|
|
|
|
PL_colorset = 1; |
4846
|
|
|
|
|
|
} |
4847
|
|
|
|
|
|
#endif |
4848
|
|
|
|
|
|
|
4849
|
|
|
|
|
|
|
4850
|
|
|
|
|
|
#ifdef TRIE_STUDY_OPT |
4851
|
|
|
|
|
|
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ |
4852
|
|
|
|
|
|
STMT_START { \ |
4853
|
|
|
|
|
|
if ( \ |
4854
|
|
|
|
|
|
(data.flags & SCF_TRIE_RESTUDY) \ |
4855
|
|
|
|
|
|
&& ! restudied++ \ |
4856
|
|
|
|
|
|
) { \ |
4857
|
|
|
|
|
|
dOsomething; \ |
4858
|
|
|
|
|
|
goto reStudy; \ |
4859
|
|
|
|
|
|
} \ |
4860
|
|
|
|
|
|
} STMT_END |
4861
|
|
|
|
|
|
#else |
4862
|
|
|
|
|
|
#define CHECK_RESTUDY_GOTO_butfirst |
4863
|
|
|
|
|
|
#endif |
4864
|
|
|
|
|
|
|
4865
|
|
|
|
|
|
/* |
4866
|
|
|
|
|
|
* pregcomp - compile a regular expression into internal code |
4867
|
|
|
|
|
|
* |
4868
|
|
|
|
|
|
* Decides which engine's compiler to call based on the hint currently in |
4869
|
|
|
|
|
|
* scope |
4870
|
|
|
|
|
|
*/ |
4871
|
|
|
|
|
|
|
4872
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
4873
|
|
|
|
|
|
|
4874
|
|
|
|
|
|
/* return the currently in-scope regex engine (or the default if none) */ |
4875
|
|
|
|
|
|
|
4876
|
|
|
|
|
|
regexp_engine const * |
4877
|
|
|
|
|
|
Perl_current_re_engine(pTHX) |
4878
|
|
|
|
|
|
{ |
4879
|
|
|
|
|
|
dVAR; |
4880
|
|
|
|
|
|
|
4881
|
|
|
|
|
|
if (IN_PERL_COMPILETIME) { |
4882
|
|
|
|
|
|
HV * const table = GvHV(PL_hintgv); |
4883
|
|
|
|
|
|
SV **ptr; |
4884
|
|
|
|
|
|
|
4885
|
|
|
|
|
|
if (!table) |
4886
|
|
|
|
|
|
return &PL_core_reg_engine; |
4887
|
|
|
|
|
|
ptr = hv_fetchs(table, "regcomp", FALSE); |
4888
|
|
|
|
|
|
if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) |
4889
|
|
|
|
|
|
return &PL_core_reg_engine; |
4890
|
|
|
|
|
|
return INT2PTR(regexp_engine*,SvIV(*ptr)); |
4891
|
|
|
|
|
|
} |
4892
|
|
|
|
|
|
else { |
4893
|
|
|
|
|
|
SV *ptr; |
4894
|
|
|
|
|
|
if (!PL_curcop->cop_hints_hash) |
4895
|
|
|
|
|
|
return &PL_core_reg_engine; |
4896
|
|
|
|
|
|
ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); |
4897
|
|
|
|
|
|
if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) |
4898
|
|
|
|
|
|
return &PL_core_reg_engine; |
4899
|
|
|
|
|
|
return INT2PTR(regexp_engine*,SvIV(ptr)); |
4900
|
|
|
|
|
|
} |
4901
|
|
|
|
|
|
} |
4902
|
|
|
|
|
|
|
4903
|
|
|
|
|
|
|
4904
|
|
|
|
|
|
REGEXP * |
4905
|
|
|
|
|
|
Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) |
4906
|
|
|
|
|
|
{ |
4907
|
|
|
|
|
|
dVAR; |
4908
|
|
|
|
|
|
regexp_engine const *eng = current_re_engine(); |
4909
|
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
4910
|
|
|
|
|
|
|
4911
|
|
|
|
|
|
PERL_ARGS_ASSERT_PREGCOMP; |
4912
|
|
|
|
|
|
|
4913
|
|
|
|
|
|
/* Dispatch a request to compile a regexp to correct regexp engine. */ |
4914
|
|
|
|
|
|
DEBUG_COMPILE_r({ |
4915
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", |
4916
|
|
|
|
|
|
PTR2UV(eng)); |
4917
|
|
|
|
|
|
}); |
4918
|
|
|
|
|
|
return CALLREGCOMP_ENG(eng, pattern, flags); |
4919
|
|
|
|
|
|
} |
4920
|
|
|
|
|
|
#endif |
4921
|
|
|
|
|
|
|
4922
|
|
|
|
|
|
/* public(ish) entry point for the perl core's own regex compiling code. |
4923
|
|
|
|
|
|
* It's actually a wrapper for Perl_re_op_compile that only takes an SV |
4924
|
|
|
|
|
|
* pattern rather than a list of OPs, and uses the internal engine rather |
4925
|
|
|
|
|
|
* than the current one */ |
4926
|
|
|
|
|
|
|
4927
|
|
|
|
|
|
REGEXP * |
4928
|
64
|
|
|
|
|
Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) |
4929
|
|
|
|
|
|
{ |
4930
|
0
|
|
|
|
|
SV *pat = pattern; /* defeat constness! */ |
4931
|
0
|
|
|
|
|
PERL_ARGS_ASSERT_RE_COMPILE; |
4932
|
64
|
|
|
|
|
return Perl_re_op_compile(aTHX_ &pat, 1, NULL, |
4933
|
|
|
|
|
|
#ifdef PERL_IN_XSUB_RE |
4934
|
|
|
|
|
|
&my_reg_engine, |
4935
|
|
|
|
|
|
#else |
4936
|
|
|
|
|
|
&PL_core_reg_engine, |
4937
|
|
|
|
|
|
#endif |
4938
|
|
|
|
|
|
NULL, NULL, rx_flags, 0); |
4939
|
|
|
|
|
|
} |
4940
|
|
|
|
|
|
|
4941
|
|
|
|
|
|
|
4942
|
|
|
|
|
|
/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code |
4943
|
|
|
|
|
|
* blocks, recalculate the indices. Update pat_p and plen_p in-place to |
4944
|
|
|
|
|
|
* point to the realloced string and length. |
4945
|
|
|
|
|
|
* |
4946
|
|
|
|
|
|
* This is essentially a copy of Perl_bytes_to_utf8() with the code index |
4947
|
|
|
|
|
|
* stuff added */ |
4948
|
|
|
|
|
|
|
4949
|
|
|
|
|
|
static void |
4950
|
100
|
|
|
|
|
S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, |
4951
|
|
|
|
|
|
char **pat_p, STRLEN *plen_p, int num_code_blocks) |
4952
|
|
|
|
|
|
{ |
4953
|
44
|
|
|
|
|
U8 *const src = (U8*)*pat_p; |
4954
|
|
|
|
|
|
U8 *dst; |
4955
|
|
|
|
|
|
int n=0; |
4956
|
|
|
|
|
|
STRLEN s = 0, d = 0; |
4957
|
|
|
|
|
|
bool do_end = 0; |
4958
|
44
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
4959
|
|
|
|
|
|
|
4960
|
92
|
|
|
|
|
DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, |
4961
|
|
|
|
|
|
"UTF8 mismatch! Converting to utf8 for resizing and compile\n")); |
4962
|
|
|
|
|
|
|
4963
|
210
|
|
|
|
|
Newx(dst, *plen_p * 2 + 1, U8); |
4964
|
|
|
|
|
|
|
4965
|
380
|
|
|
|
|
while (s < *plen_p) { |
4966
|
282
|
|
|
|
|
if (NATIVE_IS_INVARIANT(src[s])) |
4967
|
228
|
|
|
|
|
dst[d] = src[s]; |
4968
|
|
|
|
|
|
else { |
4969
|
54
|
|
|
|
|
dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); |
4970
|
32
|
|
|
|
|
dst[d] = UTF8_EIGHT_BIT_LO(src[s]); |
4971
|
|
|
|
|
|
} |
4972
|
225
|
|
|
|
|
if (n < num_code_blocks) { |
4973
|
22
|
|
|
|
|
if (!do_end && pRExC_state->code_blocks[n].start == s) { |
4974
|
10
|
|
|
|
|
pRExC_state->code_blocks[n].start = d; |
4975
|
12
|
|
|
|
|
assert(dst[d] == '('); |
4976
|
|
|
|
|
|
do_end = 1; |
4977
|
|
|
|
|
|
} |
4978
|
90
|
|
|
|
|
else if (do_end && pRExC_state->code_blocks[n].end == s) { |
4979
|
90
|
|
|
|
|
pRExC_state->code_blocks[n].end = d; |
4980
|
90
|
|
|
|
|
assert(dst[d] == ')'); |
4981
|
|
|
|
|
|
do_end = 0; |
4982
|
90
|
|
|
|
|
n++; |
4983
|
|
|
|
|
|
} |
4984
|
|
|
|
|
|
} |
4985
|
291
|
|
|
|
|
s++; |
4986
|
282
|
|
|
|
|
d++; |
4987
|
|
|
|
|
|
} |
4988
|
92
|
|
|
|
|
dst[d] = '\0'; |
4989
|
92
|
|
|
|
|
*plen_p = d; |
4990
|
92
|
|
|
|
|
*pat_p = (char*) dst; |
4991
|
92
|
|
|
|
|
SAVEFREEPV(*pat_p); |
4992
|
96
|
|
|
|
|
RExC_orig_utf8 = RExC_utf8 = 1; |
4993
|
16779664
|
|
|
|
|
} |
4994
|
|
|
|
|
|
|
4995
|
|
|
|
|
|
|
4996
|
|
|
|
|
|
|
4997
|
|
|
|
|
|
/* S_concat_pat(): concatenate a list of args to the pattern string pat, |
4998
|
|
|
|
|
|
* while recording any code block indices, and handling overloading, |
4999
|
|
|
|
|
|
* nested qr// objects etc. If pat is null, it will allocate a new |
5000
|
|
|
|
|
|
* string, or just return the first arg, if there's only one. |
5001
|
|
|
|
|
|
* |
5002
|
|
|
|
|
|
* Returns the malloced/updated pat. |
5003
|
|
|
|
|
|
* patternp and pat_count is the array of SVs to be concatted; |
5004
|
|
|
|
|
|
* oplist is the optional list of ops that generated the SVs; |
5005
|
|
|
|
|
|
* recompile_p is a pointer to a boolean that will be set if |
5006
|
|
|
|
|
|
* the regex will need to be recompiled. |
5007
|
|
|
|
|
|
* delim, if non-null is an SV that will be inserted between each element |
5008
|
|
|
|
|
|
*/ |
5009
|
|
|
|
|
|
|
5010
|
|
|
|
|
|
static SV* |
5011
|
16779864
|
|
|
|
|
S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, |
5012
|
|
|
|
|
|
SV *pat, SV ** const patternp, int pat_count, |
5013
|
|
|
|
|
|
OP *oplist, bool *recompile_p, SV *delim) |
5014
|
|
|
|
|
|
{ |
5015
|
|
|
|
|
|
SV **svp; |
5016
|
|
|
|
|
|
int n = 0; |
5017
|
|
|
|
|
|
bool use_delim = FALSE; |
5018
|
|
|
|
|
|
bool alloced = FALSE; |
5019
|
|
|
|
|
|
|
5020
|
|
|
|
|
|
/* if we know we have at least two args, create an empty string, |
5021
|
|
|
|
|
|
* then concatenate args to that. For no args, return an empty string */ |
5022
|
1323509
|
|
|
|
|
if (!pat && pat_count != 1) { |
5023
|
8832072
|
|
|
|
|
pat = newSVpvn("", 0); |
5024
|
8791262
|
|
|
|
|
SAVEFREESV(pat); |
5025
|
|
|
|
|
|
alloced = TRUE; |
5026
|
|
|
|
|
|
} |
5027
|
|
|
|
|
|
|
5028
|
555330
|
|
|
|
|
for (svp = patternp; svp < patternp + pat_count; svp++) { |
5029
|
|
|
|
|
|
SV *sv; |
5030
|
|
|
|
|
|
SV *rx = NULL; |
5031
|
|
|
|
|
|
STRLEN orig_patlen = 0; |
5032
|
|
|
|
|
|
bool code = 0; |
5033
|
554750
|
|
|
|
|
SV *msv = use_delim ? delim : *svp; |
5034
|
|
|
|
|
|
|
5035
|
|
|
|
|
|
/* if we've got a delimiter, we go round the loop twice for each |
5036
|
|
|
|
|
|
* svp slot (except the last), using the delimiter the second |
5037
|
|
|
|
|
|
* time round */ |
5038
|
8237106
|
|
|
|
|
if (use_delim) { |
5039
|
8236762
|
|
|
|
|
svp--; |
5040
|
|
|
|
|
|
use_delim = FALSE; |
5041
|
|
|
|
|
|
} |
5042
|
8791512
|
|
|
|
|
else if (delim) |
5043
|
|
|
|
|
|
use_delim = TRUE; |
5044
|
|
|
|
|
|
|
5045
|
17582680
|
|
|
|
|
if (SvTYPE(msv) == SVt_PVAV) { |
5046
|
|
|
|
|
|
/* we've encountered an interpolated array within |
5047
|
|
|
|
|
|
* the pattern, e.g. /...@a..../. Expand the list of elements, |
5048
|
|
|
|
|
|
* then recursively append elements. |
5049
|
|
|
|
|
|
* The code in this block is based on S_pushav() */ |
5050
|
|
|
|
|
|
|
5051
|
|
|
|
|
|
AV *const av = (AV*)msv; |
5052
|
8791168
|
|
|
|
|
const I32 maxarg = AvFILL(av) + 1; |
5053
|
|
|
|
|
|
SV **array; |
5054
|
|
|
|
|
|
|
5055
|
8791168
|
|
|
|
|
if (oplist) { |
5056
|
8791168
|
|
|
|
|
assert(oplist->op_type == OP_PADAV |
5057
|
|
|
|
|
|
|| oplist->op_type == OP_RV2AV); |
5058
|
5508759
|
|
|
|
|
oplist = oplist->op_sibling;; |
5059
|
|
|
|
|
|
} |
5060
|
|
|
|
|
|
|
5061
|
12882224
|
|
|
|
|
if (SvRMAGICAL(av)) { |
5062
|
|
|
|
|
|
U32 i; |
5063
|
|
|
|
|
|
|
5064
|
0
|
|
|
|
|
Newx(array, maxarg, SV*); |
5065
|
26813538
|
|
|
|
|
SAVEFREEPV(array); |
5066
|
26813538
|
|
|
|
|
for (i=0; i < (U32)maxarg; i++) { |
5067
|
26813538
|
|
|
|
|
SV ** const svp = av_fetch(av, i, FALSE); |
5068
|
26813538
|
|
|
|
|
array[i] = svp ? *svp : &PL_sv_undef; |
5069
|
|
|
|
|
|
} |
5070
|
|
|
|
|
|
} |
5071
|
|
|
|
|
|
else |
5072
|
20210
|
|
|
|
|
array = AvARRAY(av); |
5073
|
|
|
|
|
|
|
5074
|
20210
|
|
|
|
|
pat = S_concat_pat(aTHX_ pRExC_state, pat, |
5075
|
|
|
|
|
|
array, maxarg, NULL, recompile_p, |
5076
|
|
|
|
|
|
/* $" */ |
5077
|
20210
|
|
|
|
|
GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV)))); |
5078
|
|
|
|
|
|
|
5079
|
20210
|
|
|
|
|
continue; |
5080
|
|
|
|
|
|
} |
5081
|
|
|
|
|
|
|
5082
|
|
|
|
|
|
|
5083
|
|
|
|
|
|
/* we make the assumption here that each op in the list of |
5084
|
|
|
|
|
|
* op_siblings maps to one SV pushed onto the stack, |
5085
|
|
|
|
|
|
* except for code blocks, with have both an OP_NULL and |
5086
|
|
|
|
|
|
* and OP_CONST. |
5087
|
|
|
|
|
|
* This allows us to match up the list of SVs against the |
5088
|
|
|
|
|
|
* list of OPs to find the next code block. |
5089
|
|
|
|
|
|
* |
5090
|
|
|
|
|
|
* Note that PUSHMARK PADSV PADSV .. |
5091
|
|
|
|
|
|
* is optimised to |
5092
|
|
|
|
|
|
* PADRANGE PADSV PADSV .. |
5093
|
|
|
|
|
|
* so the alignment still works. */ |
5094
|
|
|
|
|
|
|
5095
|
20554
|
|
|
|
|
if (oplist) { |
5096
|
20290
|
|
|
|
|
if (oplist->op_type == OP_NULL |
5097
|
20242
|
|
|
|
|
&& (oplist->op_flags & OPf_SPECIAL)) |
5098
|
|
|
|
|
|
{ |
5099
|
20242
|
|
|
|
|
assert(n < pRExC_state->num_code_blocks); |
5100
|
20242
|
|
|
|
|
pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0; |
5101
|
20242
|
|
|
|
|
pRExC_state->code_blocks[n].block = oplist; |
5102
|
20242
|
|
|
|
|
pRExC_state->code_blocks[n].src_regex = NULL; |
5103
|
20242
|
|
|
|
|
n++; |
5104
|
|
|
|
|
|
code = 1; |
5105
|
20242
|
|
|
|
|
oplist = oplist->op_sibling; /* skip CONST */ |
5106
|
20242
|
|
|
|
|
assert(oplist); |
5107
|
|
|
|
|
|
} |
5108
|
20290
|
|
|
|
|
oplist = oplist->op_sibling;; |
5109
|
|
|
|
|
|
} |
5110
|
|
|
|
|
|
|
5111
|
|
|
|
|
|
/* apply magic and QR overloading to arg */ |
5112
|
|
|
|
|
|
|
5113
|
20554
|
|
|
|
|
SvGETMAGIC(msv); |
5114
|
20554
|
|
|
|
|
if (SvROK(msv) && SvAMAGIC(msv)) { |
5115
|
20234
|
|
|
|
|
SV *sv = AMG_CALLunary(msv, regexp_amg); |
5116
|
20234
|
|
|
|
|
if (sv) { |
5117
|
20210
|
|
|
|
|
if (SvROK(sv)) |
5118
|
20210
|
|
|
|
|
sv = SvRV(sv); |
5119
|
20210
|
|
|
|
|
if (SvTYPE(sv) != SVt_REGEXP) |
5120
|
20210
|
|
|
|
|
Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); |
5121
|
|
|
|
|
|
msv = sv; |
5122
|
|
|
|
|
|
} |
5123
|
|
|
|
|
|
} |
5124
|
|
|
|
|
|
|
5125
|
|
|
|
|
|
/* try concatenation overload ... */ |
5126
|
20554
|
|
|
|
|
if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) && |
5127
|
|
|
|
|
|
(sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) |
5128
|
|
|
|
|
|
{ |
5129
|
20210
|
|
|
|
|
sv_setsv(pat, sv); |
5130
|
|
|
|
|
|
/* overloading involved: all bets are off over literal |
5131
|
|
|
|
|
|
* code. Pretend we haven't seen it */ |
5132
|
20210
|
|
|
|
|
pRExC_state->num_code_blocks -= n; |
5133
|
20210
|
|
|
|
|
n = 0; |
5134
|
|
|
|
|
|
} |
5135
|
|
|
|
|
|
else { |
5136
|
|
|
|
|
|
/* ... or failing that, try "" overload */ |
5137
|
20554
|
|
|
|
|
while (SvAMAGIC(msv) |
5138
|
20210
|
|
|
|
|
&& (sv = AMG_CALLunary(msv, string_amg)) |
5139
|
20210
|
|
|
|
|
&& sv != msv |
5140
|
20210
|
|
|
|
|
&& !( SvROK(msv) |
5141
|
20210
|
|
|
|
|
&& SvROK(sv) |
5142
|
20210
|
|
|
|
|
&& SvRV(msv) == SvRV(sv)) |
5143
|
|
|
|
|
|
) { |
5144
|
|
|
|
|
|
msv = sv; |
5145
|
20210
|
|
|
|
|
SvGETMAGIC(msv); |
5146
|
|
|
|
|
|
} |
5147
|
20554
|
|
|
|
|
if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) |
5148
|
26813632
|
|
|
|
|
msv = SvRV(msv); |
5149
|
|
|
|
|
|
|
5150
|
26813882
|
|
|
|
|
if (pat) { |
5151
|
|
|
|
|
|
/* this is a partially unrolled |
5152
|
|
|
|
|
|
* sv_catsv_nomg(pat, msv); |
5153
|
|
|
|
|
|
* that allows us to adjust code block indices if |
5154
|
|
|
|
|
|
* needed */ |
5155
|
|
|
|
|
|
STRLEN dlen; |
5156
|
26813740
|
|
|
|
|
char *dst = SvPV_force_nomg(pat, dlen); |
5157
|
22326156
|
|
|
|
|
orig_patlen = dlen; |
5158
|
28280369
|
|
|
|
|
if (SvUTF8(msv) && !SvUTF8(pat)) { |
5159
|
1466683
|
|
|
|
|
S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); |
5160
|
4328895
|
|
|
|
|
sv_setpvn(pat, dst, dlen); |
5161
|
2896618
|
|
|
|
|
SvUTF8_on(pat); |
5162
|
|
|
|
|
|
} |
5163
|
11268
|
|
|
|
|
sv_catsv_nomg(pat, msv); |
5164
|
|
|
|
|
|
rx = msv; |
5165
|
|
|
|
|
|
} |
5166
|
|
|
|
|
|
else |
5167
|
|
|
|
|
|
pat = msv; |
5168
|
|
|
|
|
|
|
5169
|
1432621
|
|
|
|
|
if (code) |
5170
|
10604
|
|
|
|
|
pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; |
5171
|
|
|
|
|
|
} |
5172
|
|
|
|
|
|
|
5173
|
|
|
|
|
|
/* extract any code blocks within any embedded qr//'s */ |
5174
|
10916
|
|
|
|
|
if (rx && SvTYPE(rx) == SVt_REGEXP |
5175
|
26813570
|
|
|
|
|
&& RX_ENGINE((REGEXP*)rx)->op_comp) |
5176
|
|
|
|
|
|
{ |
5177
|
|
|
|
|
|
|
5178
|
4477796
|
|
|
|
|
RXi_GET_DECL(ReANY((REGEXP *)rx), ri); |
5179
|
4328657
|
|
|
|
|
if (ri->num_code_blocks) { |
5180
|
|
|
|
|
|
int i; |
5181
|
|
|
|
|
|
/* the presence of an embedded qr// with code means |
5182
|
|
|
|
|
|
* we should always recompile: the text of the |
5183
|
|
|
|
|
|
* qr// may not have changed, but it may be a |
5184
|
|
|
|
|
|
* different closure than last time */ |
5185
|
2896414
|
|
|
|
|
*recompile_p = 1; |
5186
|
1453155
|
|
|
|
|
Renew(pRExC_state->code_blocks, |
5187
|
|
|
|
|
|
pRExC_state->num_code_blocks + ri->num_code_blocks, |
5188
|
|
|
|
|
|
struct reg_code_block); |
5189
|
4477788
|
|
|
|
|
pRExC_state->num_code_blocks += ri->num_code_blocks; |
5190
|
|
|
|
|
|
|
5191
|
4477812
|
|
|
|
|
for (i=0; i < ri->num_code_blocks; i++) { |
5192
|
|
|
|
|
|
struct reg_code_block *src, *dst; |
5193
|
4477788
|
|
|
|
|
STRLEN offset = orig_patlen |
5194
|
3045553
|
|
|
|
|
+ ReANY((REGEXP *)rx)->pre_prefix; |
5195
|
4328649
|
|
|
|
|
assert(n < pRExC_state->num_code_blocks); |
5196
|
2896414
|
|
|
|
|
src = &ri->code_blocks[i]; |
5197
|
1453155
|
|
|
|
|
dst = &pRExC_state->code_blocks[n]; |
5198
|
26813562
|
|
|
|
|
dst->start = src->start + offset; |
5199
|
10596
|
|
|
|
|
dst->end = src->end + offset; |
5200
|
10596
|
|
|
|
|
dst->block = src->block; |
5201
|
10620
|
|
|
|
|
dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) |
5202
|
|
|
|
|
|
src->src_regex |
5203
|
|
|
|
|
|
? src->src_regex |
5204
|
|
|
|
|
|
: (REGEXP*)rx); |
5205
|
26813562
|
|
|
|
|
n++; |
5206
|
|
|
|
|
|
} |
5207
|
|
|
|
|
|
} |
5208
|
|
|
|
|
|
} |
5209
|
|
|
|
|
|
} |
5210
|
|
|
|
|
|
/* avoid calling magic multiple times on a single element e.g. =~ $qr */ |
5211
|
26813774
|
|
|
|
|
if (alloced) |
5212
|
102
|
|
|
|
|
SvSETMAGIC(pat); |
5213
|
|
|
|
|
|
|
5214
|
26813774
|
|
|
|
|
return pat; |
5215
|
|
|
|
|
|
} |
5216
|
|
|
|
|
|
|
5217
|
|
|
|
|
|
|
5218
|
|
|
|
|
|
|
5219
|
|
|
|
|
|
/* see if there are any run-time code blocks in the pattern. |
5220
|
|
|
|
|
|
* False positives are allowed */ |
5221
|
|
|
|
|
|
|
5222
|
|
|
|
|
|
static bool |
5223
|
13427977
|
|
|
|
|
S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, |
5224
|
|
|
|
|
|
char *pat, STRLEN plen) |
5225
|
|
|
|
|
|
{ |
5226
|
|
|
|
|
|
int n = 0; |
5227
|
|
|
|
|
|
STRLEN s; |
5228
|
|
|
|
|
|
|
5229
|
13428009
|
|
|
|
|
for (s = 0; s < plen; s++) { |
5230
|
13428073
|
|
|
|
|
if (n < pRExC_state->num_code_blocks |
5231
|
13428025
|
|
|
|
|
&& s == pRExC_state->code_blocks[n].start) |
5232
|
|
|
|
|
|
{ |
5233
|
13385597
|
|
|
|
|
s = pRExC_state->code_blocks[n].end; |
5234
|
13385597
|
|
|
|
|
n++; |
5235
|
12
|
|
|
|
|
continue; |
5236
|
|
|
|
|
|
} |
5237
|
|
|
|
|
|
/* TODO ideally should handle [..], (#..), /#.../x to reduce false |
5238
|
|
|
|
|
|
* positives here */ |
5239
|
144
|
|
|
|
|
if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' && |
5240
|
36
|
|
|
|
|
(pat[s+2] == '{' |
5241
|
12
|
|
|
|
|
|| (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{')) |
5242
|
|
|
|
|
|
) |
5243
|
|
|
|
|
|
return 1; |
5244
|
|
|
|
|
|
} |
5245
|
|
|
|
|
|
return 0; |
5246
|
|
|
|
|
|
} |
5247
|
|
|
|
|
|
|
5248
|
|
|
|
|
|
/* Handle run-time code blocks. We will already have compiled any direct |
5249
|
|
|
|
|
|
* or indirect literal code blocks. Now, take the pattern 'pat' and make a |
5250
|
|
|
|
|
|
* copy of it, but with any literal code blocks blanked out and |
5251
|
|
|
|
|
|
* appropriate chars escaped; then feed it into |
5252
|
|
|
|
|
|
* |
5253
|
|
|
|
|
|
* eval "qr'modified_pattern'" |
5254
|
|
|
|
|
|
* |
5255
|
|
|
|
|
|
* For example, |
5256
|
|
|
|
|
|
* |
5257
|
|
|
|
|
|
* a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno |
5258
|
|
|
|
|
|
* |
5259
|
|
|
|
|
|
* becomes |
5260
|
|
|
|
|
|
* |
5261
|
|
|
|
|
|
* qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' |
5262
|
|
|
|
|
|
* |
5263
|
|
|
|
|
|
* After eval_sv()-ing that, grab any new code blocks from the returned qr |
5264
|
|
|
|
|
|
* and merge them with any code blocks of the original regexp. |
5265
|
|
|
|
|
|
* |
5266
|
|
|
|
|
|
* If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; |
5267
|
|
|
|
|
|
* instead, just save the qr and return FALSE; this tells our caller that |
5268
|
|
|
|
|
|
* the original pattern needs upgrading to utf8. |
5269
|
|
|
|
|
|
*/ |
5270
|
|
|
|
|
|
|
5271
|
|
|
|
|
|
static bool |
5272
|
24
|
|
|
|
|
S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, |
5273
|
|
|
|
|
|
char *pat, STRLEN plen) |
5274
|
|
|
|
|
|
{ |
5275
|
|
|
|
|
|
SV *qr; |
5276
|
|
|
|
|
|
|
5277
|
13385609
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
5278
|
|
|
|
|
|
|
5279
|
13385609
|
|
|
|
|
if (pRExC_state->runtime_code_qr) { |
5280
|
|
|
|
|
|
/* this is the second time we've been called; this should |
5281
|
|
|
|
|
|
* only happen if the main pattern got upgraded to utf8 |
5282
|
|
|
|
|
|
* during compilation; re-use the qr we compiled first time |
5283
|
|
|
|
|
|
* round (which should be utf8 too) |
5284
|
|
|
|
|
|
*/ |
5285
|
13385585
|
|
|
|
|
qr = pRExC_state->runtime_code_qr; |
5286
|
13385585
|
|
|
|
|
pRExC_state->runtime_code_qr = NULL; |
5287
|
13695865
|
|
|
|
|
assert(RExC_utf8 && SvUTF8(qr)); |
5288
|
|
|
|
|
|
} |
5289
|
|
|
|
|
|
else { |
5290
|
|
|
|
|
|
int n = 0; |
5291
|
|
|
|
|
|
STRLEN s; |
5292
|
|
|
|
|
|
char *p, *newpat; |
5293
|
13690723
|
|
|
|
|
int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ |
5294
|
|
|
|
|
|
SV *sv, *qr_ref; |
5295
|
5194
|
|
|
|
|
dSP; |
5296
|
|
|
|
|
|
|
5297
|
|
|
|
|
|
/* determine how many extra chars we need for ' and \ escaping */ |
5298
|
13696189
|
|
|
|
|
for (s = 0; s < plen; s++) { |
5299
|
9032686
|
|
|
|
|
if (pat[s] == '\'' || pat[s] == '\\') |
5300
|
8935362
|
|
|
|
|
newlen++; |
5301
|
|
|
|
|
|
} |
5302
|
|
|
|
|
|
|
5303
|
8650018
|
|
|
|
|
Newx(newpat, newlen, char); |
5304
|
|
|
|
|
|
p = newpat; |
5305
|
12969417
|
|
|
|
|
*p++ = 'q'; *p++ = 'r'; *p++ = '\''; |
5306
|
|
|
|
|
|
|
5307
|
12969657
|
|
|
|
|
for (s = 0; s < plen; s++) { |
5308
|
11564764
|
|
|
|
|
if (n < pRExC_state->num_code_blocks |
5309
|
4916190
|
|
|
|
|
&& s == pRExC_state->code_blocks[n].start) |
5310
|
|
|
|
|
|
{ |
5311
|
|
|
|
|
|
/* blank out literal code block */ |
5312
|
4916122
|
|
|
|
|
assert(pat[s] == '('); |
5313
|
4916194
|
|
|
|
|
while (s <= pRExC_state->code_blocks[n].end) { |
5314
|
8779827
|
|
|
|
|
*p++ = '_'; |
5315
|
6462
|
|
|
|
|
s++; |
5316
|
|
|
|
|
|
} |
5317
|
8773377
|
|
|
|
|
s--; |
5318
|
8779767
|
|
|
|
|
n++; |
5319
|
8779767
|
|
|
|
|
continue; |
5320
|
|
|
|
|
|
} |
5321
|
8779983
|
|
|
|
|
if (pat[s] == '\'' || pat[s] == '\\') |
5322
|
8779755
|
|
|
|
|
*p++ = '\\'; |
5323
|
292
|
|
|
|
|
*p++ = pat[s]; |
5324
|
|
|
|
|
|
} |
5325
|
24
|
|
|
|
|
*p++ = '\''; |
5326
|
88
|
|
|
|
|
if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) |
5327
|
0
|
|
|
|
|
*p++ = 'x'; |
5328
|
155164
|
|
|
|
|
*p++ = '\0'; |
5329
|
8779779
|
|
|
|
|
DEBUG_COMPILE_r({ |
5330
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
5331
|
|
|
|
|
|
"%sre-parsing pattern for runtime code:%s %s\n", |
5332
|
|
|
|
|
|
PL_colors[4],PL_colors[5],newpat); |
5333
|
|
|
|
|
|
}); |
5334
|
|
|
|
|
|
|
5335
|
8779779
|
|
|
|
|
sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); |
5336
|
8779779
|
|
|
|
|
Safefree(newpat); |
5337
|
|
|
|
|
|
|
5338
|
8779779
|
|
|
|
|
ENTER; |
5339
|
8779779
|
|
|
|
|
SAVETMPS; |
5340
|
8779779
|
|
|
|
|
save_re_context(); |
5341
|
8779779
|
|
|
|
|
PUSHSTACKi(PERLSI_REQUIRE); |
5342
|
|
|
|
|
|
/* G_RE_REPARSING causes the toker to collapse \\ into \ when |
5343
|
|
|
|
|
|
* parsing qr''; normally only q'' does this. It also alters |
5344
|
|
|
|
|
|
* hints handling */ |
5345
|
8779779
|
|
|
|
|
eval_sv(sv, G_SCALAR|G_RE_REPARSING); |
5346
|
8779779
|
|
|
|
|
SvREFCNT_dec_NN(sv); |
5347
|
8779779
|
|
|
|
|
SPAGAIN; |
5348
|
8779779
|
|
|
|
|
qr_ref = POPs; |
5349
|
8779779
|
|
|
|
|
PUTBACK; |
5350
|
|
|
|
|
|
{ |
5351
|
8779779
|
|
|
|
|
SV * const errsv = ERRSV; |
5352
|
8779779
|
|
|
|
|
if (SvTRUE_NN(errsv)) |
5353
|
|
|
|
|
|
{ |
5354
|
8779755
|
|
|
|
|
Safefree(pRExC_state->code_blocks); |
5355
|
|
|
|
|
|
/* use croak_sv ? */ |
5356
|
8779755
|
|
|
|
|
Perl_croak_nocontext("%s", SvPV_nolen_const(errsv)); |
5357
|
|
|
|
|
|
} |
5358
|
|
|
|
|
|
} |
5359
|
8779779
|
|
|
|
|
assert(SvROK(qr_ref)); |
5360
|
8779779
|
|
|
|
|
qr = SvRV(qr_ref); |
5361
|
8779803
|
|
|
|
|
assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); |
5362
|
|
|
|
|
|
/* the leaving below frees the tmp qr_ref. |
5363
|
|
|
|
|
|
* Give qr a life of its own */ |
5364
|
|
|
|
|
|
SvREFCNT_inc(qr); |
5365
|
8779779
|
|
|
|
|
POPSTACK; |
5366
|
8779779
|
|
|
|
|
FREETMPS; |
5367
|
8779779
|
|
|
|
|
LEAVE; |
5368
|
|
|
|
|
|
|
5369
|
|
|
|
|
|
} |
5370
|
|
|
|
|
|
|
5371
|
8779779
|
|
|
|
|
if (!RExC_utf8 && SvUTF8(qr)) { |
5372
|
|
|
|
|
|
/* first time through; the pattern got upgraded; save the |
5373
|
|
|
|
|
|
* qr for the next time through */ |
5374
|
8779755
|
|
|
|
|
assert(!pRExC_state->runtime_code_qr); |
5375
|
112836
|
|
|
|
|
pRExC_state->runtime_code_qr = qr; |
5376
|
112836
|
|
|
|
|
return 0; |
5377
|
|
|
|
|
|
} |
5378
|
|
|
|
|
|
|
5379
|
|
|
|
|
|
|
5380
|
|
|
|
|
|
/* extract any code blocks within the returned qr// */ |
5381
|
|
|
|
|
|
|
5382
|
|
|
|
|
|
|
5383
|
|
|
|
|
|
/* merge the main (r1) and run-time (r2) code blocks into one */ |
5384
|
|
|
|
|
|
{ |
5385
|
112860
|
|
|
|
|
RXi_GET_DECL(ReANY((REGEXP *)qr), r2); |
5386
|
|
|
|
|
|
struct reg_code_block *new_block, *dst; |
5387
|
|
|
|
|
|
RExC_state_t * const r1 = pRExC_state; /* convenient alias */ |
5388
|
|
|
|
|
|
int i1 = 0, i2 = 0; |
5389
|
|
|
|
|
|
|
5390
|
112860
|
|
|
|
|
if (!r2->num_code_blocks) /* we guessed wrong */ |
5391
|
|
|
|
|
|
{ |
5392
|
8779755
|
|
|
|
|
SvREFCNT_dec_NN(qr); |
5393
|
310280
|
|
|
|
|
return 1; |
5394
|
|
|
|
|
|
} |
5395
|
|
|
|
|
|
|
5396
|
310304
|
|
|
|
|
Newx(new_block, |
5397
|
|
|
|
|
|
r1->num_code_blocks + r2->num_code_blocks, |
5398
|
|
|
|
|
|
struct reg_code_block); |
5399
|
|
|
|
|
|
dst = new_block; |
5400
|
|
|
|
|
|
|
5401
|
310364
|
|
|
|
|
while ( i1 < r1->num_code_blocks |
5402
|
48
|
|
|
|
|
|| i2 < r2->num_code_blocks) |
5403
|
|
|
|
|
|
{ |
5404
|
|
|
|
|
|
struct reg_code_block *src; |
5405
|
|
|
|
|
|
bool is_qr = 0; |
5406
|
|
|
|
|
|
|
5407
|
8466107
|
|
|
|
|
if (i1 == r1->num_code_blocks) { |
5408
|
112848
|
|
|
|
|
src = &r2->code_blocks[i2++]; |
5409
|
|
|
|
|
|
is_qr = 1; |
5410
|
|
|
|
|
|
} |
5411
|
8466083
|
|
|
|
|
else if (i2 == r2->num_code_blocks) |
5412
|
1222622
|
|
|
|
|
src = &r1->code_blocks[i1++]; |
5413
|
8466095
|
|
|
|
|
else if ( r1->code_blocks[i1].start |
5414
|
12
|
|
|
|
|
< r2->code_blocks[i2].start) |
5415
|
|
|
|
|
|
{ |
5416
|
8466083
|
|
|
|
|
src = &r1->code_blocks[i1++]; |
5417
|
8466083
|
|
|
|
|
assert(src->end < r2->code_blocks[i2].start); |
5418
|
|
|
|
|
|
} |
5419
|
|
|
|
|
|
else { |
5420
|
4944
|
|
|
|
|
assert( r1->code_blocks[i1].start |
5421
|
|
|
|
|
|
> r2->code_blocks[i2].start); |
5422
|
8466071
|
|
|
|
|
src = &r2->code_blocks[i2++]; |
5423
|
|
|
|
|
|
is_qr = 1; |
5424
|
8466071
|
|
|
|
|
assert(src->end < r1->code_blocks[i1].start); |
5425
|
|
|
|
|
|
} |
5426
|
|
|
|
|
|
|
5427
|
8466107
|
|
|
|
|
assert(pat[src->start] == '('); |
5428
|
36
|
|
|
|
|
assert(pat[src->end] == ')'); |
5429
|
8466107
|
|
|
|
|
dst->start = src->start; |
5430
|
8466107
|
|
|
|
|
dst->end = src->end; |
5431
|
8466107
|
|
|
|
|
dst->block = src->block; |
5432
|
8466107
|
|
|
|
|
dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) |
5433
|
8466119
|
|
|
|
|
: src->src_regex; |
5434
|
1846538
|
|
|
|
|
dst++; |
5435
|
|
|
|
|
|
} |
5436
|
1846526
|
|
|
|
|
r1->num_code_blocks += r2->num_code_blocks; |
5437
|
3488637
|
|
|
|
|
Safefree(r1->code_blocks); |
5438
|
105424
|
|
|
|
|
r1->code_blocks = new_block; |
5439
|
|
|
|
|
|
} |
5440
|
|
|
|
|
|
|
5441
|
103292
|
|
|
|
|
SvREFCNT_dec_NN(qr); |
5442
|
6619593
|
|
|
|
|
return 1; |
5443
|
|
|
|
|
|
} |
5444
|
|
|
|
|
|
|
5445
|
|
|
|
|
|
|
5446
|
|
|
|
|
|
STATIC bool |
5447
|
8466339
|
|
|
|
|
S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, |
5448
|
|
|
|
|
|
SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol) |
5449
|
|
|
|
|
|
{ |
5450
|
|
|
|
|
|
/* This is the common code for setting up the floating and fixed length |
5451
|
|
|
|
|
|
* string data extracted from Perl_re_op_compile() below. Returns a boolean |
5452
|
|
|
|
|
|
* as to whether succeeded or not */ |
5453
|
|
|
|
|
|
|
5454
|
|
|
|
|
|
I32 t; |
5455
|
|
|
|
|
|
SSize_t ml; |
5456
|
|
|
|
|
|
|
5457
|
16932410
|
|
|
|
|
if (! (longest_length |
5458
|
|
|
|
|
|
|| (eol /* Can't have SEOL and MULTI */ |
5459
|
8466073
|
|
|
|
|
&& (! meol || (RExC_flags & RXf_PMf_MULTILINE))) |
5460
|
|
|
|
|
|
) |
5461
|
|
|
|
|
|
/* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ |
5462
|
8466191
|
|
|
|
|
|| (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) |
5463
|
|
|
|
|
|
{ |
5464
|
|
|
|
|
|
return FALSE; |
5465
|
|
|
|
|
|
} |
5466
|
|
|
|
|
|
|
5467
|
|
|
|
|
|
/* copy the information about the longest from the reg_scan_data |
5468
|
|
|
|
|
|
over to the program. */ |
5469
|
8466191
|
|
|
|
|
if (SvUTF8(sv_longest)) { |
5470
|
8466091
|
|
|
|
|
*rx_utf8 = sv_longest; |
5471
|
8466091
|
|
|
|
|
*rx_substr = NULL; |
5472
|
|
|
|
|
|
} else { |
5473
|
8466171
|
|
|
|
|
*rx_substr = sv_longest; |
5474
|
8466171
|
|
|
|
|
*rx_utf8 = NULL; |
5475
|
|
|
|
|
|
} |
5476
|
|
|
|
|
|
/* end_shift is how many chars that must be matched that |
5477
|
|
|
|
|
|
follow this item. We calculate it ahead of time as once the |
5478
|
|
|
|
|
|
lookbehind offset is added in we lose the ability to correctly |
5479
|
|
|
|
|
|
calculate it.*/ |
5480
|
8466191
|
|
|
|
|
ml = minlen ? *(minlen) : (SSize_t)longest_length; |
5481
|
8466311
|
|
|
|
|
*rx_end_shift = ml - offset |
5482
|
8466191
|
|
|
|
|
- longest_length + (SvTAIL(sv_longest) != 0) |
5483
|
8466191
|
|
|
|
|
+ lookbehind; |
5484
|
|
|
|
|
|
|
5485
|
682354
|
|
|
|
|
t = (eol/* Can't have SEOL and MULTI */ |
5486
|
8466073
|
|
|
|
|
&& (! meol || (RExC_flags & RXf_PMf_MULTILINE))); |
5487
|
8466191
|
|
|
|
|
fbm_compile(sv_longest, t ? FBMcf_TAIL : 0); |
5488
|
|
|
|
|
|
|
5489
|
8466071
|
|
|
|
|
return TRUE; |
5490
|
|
|
|
|
|
} |
5491
|
|
|
|
|
|
|
5492
|
|
|
|
|
|
/* |
5493
|
|
|
|
|
|
* Perl_re_op_compile - the perl internal RE engine's function to compile a |
5494
|
|
|
|
|
|
* regular expression into internal code. |
5495
|
|
|
|
|
|
* The pattern may be passed either as: |
5496
|
|
|
|
|
|
* a list of SVs (patternp plus pat_count) |
5497
|
|
|
|
|
|
* a list of OPs (expr) |
5498
|
|
|
|
|
|
* If both are passed, the SV list is used, but the OP list indicates |
5499
|
|
|
|
|
|
* which SVs are actually pre-compiled code blocks |
5500
|
|
|
|
|
|
* |
5501
|
|
|
|
|
|
* The SVs in the list have magic and qr overloading applied to them (and |
5502
|
|
|
|
|
|
* the list may be modified in-place with replacement SVs in the latter |
5503
|
|
|
|
|
|
* case). |
5504
|
|
|
|
|
|
* |
5505
|
|
|
|
|
|
* If the pattern hasn't changed from old_re, then old_re will be |
5506
|
|
|
|
|
|
* returned. |
5507
|
|
|
|
|
|
* |
5508
|
|
|
|
|
|
* eng is the current engine. If that engine has an op_comp method, then |
5509
|
|
|
|
|
|
* handle directly (i.e. we assume that op_comp was us); otherwise, just |
5510
|
|
|
|
|
|
* do the initial concatenation of arguments and pass on to the external |
5511
|
|
|
|
|
|
* engine. |
5512
|
|
|
|
|
|
* |
5513
|
|
|
|
|
|
* If is_bare_re is not null, set it to a boolean indicating whether the |
5514
|
|
|
|
|
|
* arg list reduced (after overloading) to a single bare regex which has |
5515
|
|
|
|
|
|
* been returned (i.e. /$qr/). |
5516
|
|
|
|
|
|
* |
5517
|
|
|
|
|
|
* orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. |
5518
|
|
|
|
|
|
* |
5519
|
|
|
|
|
|
* pm_flags contains the PMf_* flags, typically based on those from the |
5520
|
|
|
|
|
|
* pm_flags field of the related PMOP. Currently we're only interested in |
5521
|
|
|
|
|
|
* PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. |
5522
|
|
|
|
|
|
* |
5523
|
|
|
|
|
|
* We can't allocate space until we know how big the compiled form will be, |
5524
|
|
|
|
|
|
* but we can't compile it (and thus know how big it is) until we've got a |
5525
|
|
|
|
|
|
* place to put the code. So we cheat: we compile it twice, once with code |
5526
|
|
|
|
|
|
* generation turned off and size counting turned on, and once "for real". |
5527
|
|
|
|
|
|
* This also means that we don't allocate space until we are sure that the |
5528
|
|
|
|
|
|
* thing really will compile successfully, and we never have to move the |
5529
|
|
|
|
|
|
* code and thus invalidate pointers into it. (Note that it has to be in |
5530
|
|
|
|
|
|
* one piece because free() must be able to free it all.) [NB: not true in perl] |
5531
|
|
|
|
|
|
* |
5532
|
|
|
|
|
|
* Beware that the optimization-preparation code in here knows about some |
5533
|
|
|
|
|
|
* of the structure of the compiled regexp. [I'll say.] |
5534
|
|
|
|
|
|
*/ |
5535
|
|
|
|
|
|
|
5536
|
|
|
|
|
|
REGEXP * |
5537
|
8466307
|
|
|
|
|
Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, |
5538
|
|
|
|
|
|
OP *expr, const regexp_engine* eng, REGEXP *old_re, |
5539
|
|
|
|
|
|
bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) |
5540
|
|
|
|
|
|
{ |
5541
|
|
|
|
|
|
dVAR; |
5542
|
|
|
|
|
|
REGEXP *rx; |
5543
|
|
|
|
|
|
struct regexp *r; |
5544
|
|
|
|
|
|
regexp_internal *ri; |
5545
|
|
|
|
|
|
STRLEN plen; |
5546
|
|
|
|
|
|
char *exp; |
5547
|
|
|
|
|
|
regnode *scan; |
5548
|
|
|
|
|
|
I32 flags; |
5549
|
1450628
|
|
|
|
|
SSize_t minlen = 0; |
5550
|
|
|
|
|
|
U32 rx_flags; |
5551
|
|
|
|
|
|
SV *pat; |
5552
|
|
|
|
|
|
SV *code_blocksv = NULL; |
5553
|
|
|
|
|
|
SV** new_patternp = patternp; |
5554
|
|
|
|
|
|
|
5555
|
|
|
|
|
|
/* these are all flags - maybe they should be turned |
5556
|
|
|
|
|
|
* into a single int with different bit masks */ |
5557
|
|
|
|
|
|
I32 sawlookahead = 0; |
5558
|
|
|
|
|
|
I32 sawplus = 0; |
5559
|
|
|
|
|
|
I32 sawopen = 0; |
5560
|
|
|
|
|
|
I32 sawminmod = 0; |
5561
|
|
|
|
|
|
|
5562
|
|
|
|
|
|
regex_charset initial_charset = get_regex_charset(orig_rx_flags); |
5563
|
1450628
|
|
|
|
|
bool recompile = 0; |
5564
|
|
|
|
|
|
bool runtime_code = 0; |
5565
|
|
|
|
|
|
scan_data_t data; |
5566
|
|
|
|
|
|
RExC_state_t RExC_state; |
5567
|
|
|
|
|
|
RExC_state_t * const pRExC_state = &RExC_state; |
5568
|
|
|
|
|
|
#ifdef TRIE_STUDY_OPT |
5569
|
|
|
|
|
|
int restudied = 0; |
5570
|
|
|
|
|
|
RExC_state_t copyRExC_state; |
5571
|
|
|
|
|
|
#endif |
5572
|
1450628
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
5573
|
|
|
|
|
|
|
5574
|
8466307
|
|
|
|
|
PERL_ARGS_ASSERT_RE_OP_COMPILE; |
5575
|
|
|
|
|
|
|
5576
|
4316342
|
|
|
|
|
DEBUG_r(if (!PL_colorset) reginitcolors()); |
5577
|
|
|
|
|
|
|
5578
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
5579
|
|
|
|
|
|
/* Initialize these here instead of as-needed, as is quick and avoids |
5580
|
|
|
|
|
|
* having to test them each time otherwise */ |
5581
|
|
|
|
|
|
if (! PL_AboveLatin1) { |
5582
|
|
|
|
|
|
PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); |
5583
|
|
|
|
|
|
PL_ASCII = _new_invlist_C_array(ASCII_invlist); |
5584
|
|
|
|
|
|
PL_Latin1 = _new_invlist_C_array(Latin1_invlist); |
5585
|
|
|
|
|
|
|
5586
|
|
|
|
|
|
PL_L1Posix_ptrs[_CC_ALPHANUMERIC] |
5587
|
|
|
|
|
|
= _new_invlist_C_array(L1PosixAlnum_invlist); |
5588
|
|
|
|
|
|
PL_Posix_ptrs[_CC_ALPHANUMERIC] |
5589
|
|
|
|
|
|
= _new_invlist_C_array(PosixAlnum_invlist); |
5590
|
|
|
|
|
|
|
5591
|
|
|
|
|
|
PL_L1Posix_ptrs[_CC_ALPHA] |
5592
|
|
|
|
|
|
= _new_invlist_C_array(L1PosixAlpha_invlist); |
5593
|
|
|
|
|
|
PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist); |
5594
|
|
|
|
|
|
|
5595
|
|
|
|
|
|
PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist); |
5596
|
|
|
|
|
|
PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist); |
5597
|
|
|
|
|
|
|
5598
|
|
|
|
|
|
/* Cased is the same as Alpha in the ASCII range */ |
5599
|
|
|
|
|
|
PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist); |
5600
|
|
|
|
|
|
PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist); |
5601
|
|
|
|
|
|
|
5602
|
|
|
|
|
|
PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist); |
5603
|
|
|
|
|
|
PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist); |
5604
|
|
|
|
|
|
|
5605
|
|
|
|
|
|
PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); |
5606
|
|
|
|
|
|
PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); |
5607
|
|
|
|
|
|
|
5608
|
|
|
|
|
|
PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist); |
5609
|
|
|
|
|
|
PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist); |
5610
|
|
|
|
|
|
|
5611
|
|
|
|
|
|
PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist); |
5612
|
|
|
|
|
|
PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist); |
5613
|
|
|
|
|
|
|
5614
|
|
|
|
|
|
PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist); |
5615
|
|
|
|
|
|
PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist); |
5616
|
|
|
|
|
|
|
5617
|
|
|
|
|
|
PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist); |
5618
|
|
|
|
|
|
PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist); |
5619
|
|
|
|
|
|
|
5620
|
|
|
|
|
|
PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist); |
5621
|
|
|
|
|
|
PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist); |
5622
|
|
|
|
|
|
PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist); |
5623
|
|
|
|
|
|
PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist); |
5624
|
|
|
|
|
|
|
5625
|
|
|
|
|
|
PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist); |
5626
|
|
|
|
|
|
PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist); |
5627
|
|
|
|
|
|
|
5628
|
|
|
|
|
|
PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist); |
5629
|
|
|
|
|
|
|
5630
|
|
|
|
|
|
PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist); |
5631
|
|
|
|
|
|
PL_L1Posix_ptrs[_CC_WORDCHAR] |
5632
|
|
|
|
|
|
= _new_invlist_C_array(L1PosixWord_invlist); |
5633
|
|
|
|
|
|
|
5634
|
|
|
|
|
|
PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist); |
5635
|
|
|
|
|
|
PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist); |
5636
|
|
|
|
|
|
|
5637
|
|
|
|
|
|
PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); |
5638
|
|
|
|
|
|
} |
5639
|
|
|
|
|
|
#endif |
5640
|
|
|
|
|
|
|
5641
|
42330591
|
|
|
|
|
pRExC_state->code_blocks = NULL; |
5642
|
33864520
|
|
|
|
|
pRExC_state->num_code_blocks = 0; |
5643
|
|
|
|
|
|
|
5644
|
3895847
|
|
|
|
|
if (is_bare_re) |
5645
|
33864464
|
|
|
|
|
*is_bare_re = FALSE; |
5646
|
|
|
|
|
|
|
5647
|
8466331
|
|
|
|
|
if (expr && (expr->op_type == OP_LIST || |
5648
|
8466095
|
|
|
|
|
(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { |
5649
|
|
|
|
|
|
/* allocate code_blocks if needed */ |
5650
|
|
|
|
|
|
OP *o; |
5651
|
|
|
|
|
|
int ncode = 0; |
5652
|
|
|
|
|
|
|
5653
|
16932294
|
|
|
|
|
for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) |
5654
|
8466191
|
|
|
|
|
if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) |
5655
|
8466103
|
|
|
|
|
ncode++; /* count of DO blocks */ |
5656
|
84
|
|
|
|
|
if (ncode) { |
5657
|
8466103
|
|
|
|
|
pRExC_state->num_code_blocks = ncode; |
5658
|
8466103
|
|
|
|
|
Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); |
5659
|
|
|
|
|
|
} |
5660
|
|
|
|
|
|
} |
5661
|
|
|
|
|
|
|
5662
|
16932378
|
|
|
|
|
if (!pat_count) { |
5663
|
|
|
|
|
|
/* compile-time pattern with just OP_CONSTs and DO blocks */ |
5664
|
|
|
|
|
|
|
5665
|
|
|
|
|
|
int n; |
5666
|
|
|
|
|
|
OP *o; |
5667
|
|
|
|
|
|
|
5668
|
|
|
|
|
|
/* find how many CONSTs there are */ |
5669
|
8466127
|
|
|
|
|
assert(expr); |
5670
|
|
|
|
|
|
n = 0; |
5671
|
8466127
|
|
|
|
|
if (expr->op_type == OP_CONST) |
5672
|
|
|
|
|
|
n = 1; |
5673
|
|
|
|
|
|
else |
5674
|
8466223
|
|
|
|
|
for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { |
5675
|
2722
|
|
|
|
|
if (o->op_type == OP_CONST) |
5676
|
2658
|
|
|
|
|
n++; |
5677
|
|
|
|
|
|
} |
5678
|
|
|
|
|
|
|
5679
|
|
|
|
|
|
/* fake up an SV array */ |
5680
|
|
|
|
|
|
|
5681
|
2658
|
|
|
|
|
assert(!new_patternp); |
5682
|
2658
|
|
|
|
|
Newx(new_patternp, n, SV*); |
5683
|
8466127
|
|
|
|
|
SAVEFREEPV(new_patternp); |
5684
|
|
|
|
|
|
pat_count = n; |
5685
|
|
|
|
|
|
|
5686
|
|
|
|
|
|
n = 0; |
5687
|
8466127
|
|
|
|
|
if (expr->op_type == OP_CONST) |
5688
|
8466095
|
|
|
|
|
new_patternp[n] = cSVOPx_sv(expr); |
5689
|
|
|
|
|
|
else |
5690
|
8466223
|
|
|
|
|
for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { |
5691
|
8466191
|
|
|
|
|
if (o->op_type == OP_CONST) |
5692
|
8466127
|
|
|
|
|
new_patternp[n++] = cSVOPo_sv; |
5693
|
|
|
|
|
|
} |
5694
|
|
|
|
|
|
|
5695
|
|
|
|
|
|
} |
5696
|
|
|
|
|
|
|
5697
|
8466307
|
|
|
|
|
DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, |
5698
|
|
|
|
|
|
"Assembling pattern from %d elements%s\n", pat_count, |
5699
|
|
|
|
|
|
orig_rx_flags & RXf_SPLIT ? " for split" : "")); |
5700
|
|
|
|
|
|
|
5701
|
|
|
|
|
|
/* set expr to the first arg op */ |
5702
|
|
|
|
|
|
|
5703
|
8466307
|
|
|
|
|
if (pRExC_state->num_code_blocks |
5704
|
8466103
|
|
|
|
|
&& expr->op_type != OP_CONST) |
5705
|
|
|
|
|
|
{ |
5706
|
8466103
|
|
|
|
|
expr = cLISTOPx(expr)->op_first; |
5707
|
8466103
|
|
|
|
|
assert( expr->op_type == OP_PUSHMARK |
5708
|
|
|
|
|
|
|| (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) |
5709
|
|
|
|
|
|
|| expr->op_type == OP_PADRANGE); |
5710
|
8466103
|
|
|
|
|
expr = expr->op_sibling; |
5711
|
|
|
|
|
|
} |
5712
|
|
|
|
|
|
|
5713
|
8466307
|
|
|
|
|
pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, |
5714
|
|
|
|
|
|
expr, &recompile, NULL); |
5715
|
|
|
|
|
|
|
5716
|
|
|
|
|
|
/* handle bare (possibly after overloading) regex: foo =~ $re */ |
5717
|
|
|
|
|
|
{ |
5718
|
|
|
|
|
|
SV *re = pat; |
5719
|
8466307
|
|
|
|
|
if (SvROK(re)) |
5720
|
8466071
|
|
|
|
|
re = SvRV(re); |
5721
|
8466307
|
|
|
|
|
if (SvTYPE(re) == SVt_REGEXP) { |
5722
|
62
|
|
|
|
|
if (is_bare_re) |
5723
|
62
|
|
|
|
|
*is_bare_re = TRUE; |
5724
|
|
|
|
|
|
SvREFCNT_inc(re); |
5725
|
8465813
|
|
|
|
|
Safefree(pRExC_state->code_blocks); |
5726
|
8465813
|
|
|
|
|
DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, |
5727
|
|
|
|
|
|
"Precompiled pattern%s\n", |
5728
|
|
|
|
|
|
orig_rx_flags & RXf_SPLIT ? " for split" : "")); |
5729
|
|
|
|
|
|
|
5730
|
|
|
|
|
|
return (REGEXP*)re; |
5731
|
|
|
|
|
|
} |
5732
|
|
|
|
|
|
} |
5733
|
|
|
|
|
|
|
5734
|
2776
|
|
|
|
|
exp = SvPV_nomg(pat, plen); |
5735
|
|
|
|
|
|
|
5736
|
4333987
|
|
|
|
|
if (!eng->op_comp) { |
5737
|
8498368
|
|
|
|
|
if ((SvUTF8(pat) && IN_BYTES) |
5738
|
8498368
|
|
|
|
|
|| SvGMAGICAL(pat) || SvAMAGIC(pat)) |
5739
|
|
|
|
|
|
{ |
5740
|
|
|
|
|
|
/* make a temporary copy; either to convert to bytes, |
5741
|
|
|
|
|
|
* or to avoid repeating get-magic / overloaded stringify */ |
5742
|
8498368
|
|
|
|
|
pat = newSVpvn_flags(exp, plen, SVs_TEMP | |
5743
|
|
|
|
|
|
(IN_BYTES ? 0 : SvUTF8(pat))); |
5744
|
|
|
|
|
|
} |
5745
|
8465751
|
|
|
|
|
Safefree(pRExC_state->code_blocks); |
5746
|
8465751
|
|
|
|
|
return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); |
5747
|
|
|
|
|
|
} |
5748
|
|
|
|
|
|
|
5749
|
|
|
|
|
|
/* ignore the utf8ness if the pattern is 0 length */ |
5750
|
32791
|
|
|
|
|
RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); |
5751
|
32791
|
|
|
|
|
RExC_uni_semantics = 0; |
5752
|
32791
|
|
|
|
|
RExC_contains_locale = 0; |
5753
|
174
|
|
|
|
|
pRExC_state->runtime_code_qr = NULL; |
5754
|
|
|
|
|
|
|
5755
|
32791
|
|
|
|
|
DEBUG_COMPILE_r({ |
5756
|
|
|
|
|
|
SV *dsv= sv_newmortal(); |
5757
|
|
|
|
|
|
RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60); |
5758
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", |
5759
|
|
|
|
|
|
PL_colors[4],PL_colors[5],s); |
5760
|
|
|
|
|
|
}); |
5761
|
|
|
|
|
|
|
5762
|
|
|
|
|
|
redo_first_pass: |
5763
|
|
|
|
|
|
/* we jump here if we upgrade the pattern to utf8 and have to |
5764
|
|
|
|
|
|
* recompile */ |
5765
|
|
|
|
|
|
|
5766
|
32815
|
|
|
|
|
if ((pm_flags & PMf_USE_RE_EVAL) |
5767
|
|
|
|
|
|
/* this second condition covers the non-regex literal case, |
5768
|
|
|
|
|
|
* i.e. $foo =~ '(?{})'. */ |
5769
|
8498542
|
|
|
|
|
|| (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL)) |
5770
|
|
|
|
|
|
) |
5771
|
8498392
|
|
|
|
|
runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); |
5772
|
|
|
|
|
|
|
5773
|
|
|
|
|
|
/* return old regex if pattern hasn't changed */ |
5774
|
|
|
|
|
|
/* XXX: note in the below we have to check the flags as well as the pattern. |
5775
|
|
|
|
|
|
* |
5776
|
|
|
|
|
|
* Things get a touch tricky as we have to compare the utf8 flag independently |
5777
|
|
|
|
|
|
* from the compile flags. |
5778
|
|
|
|
|
|
*/ |
5779
|
|
|
|
|
|
|
5780
|
685962
|
|
|
|
|
if ( old_re |
5781
|
8498462
|
|
|
|
|
&& !recompile |
5782
|
8498446
|
|
|
|
|
&& !!RX_UTF8(old_re) == !!RExC_utf8 |
5783
|
1712960
|
|
|
|
|
&& ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) |
5784
|
8498476
|
|
|
|
|
&& RX_PRECOMP(old_re) |
5785
|
8498476
|
|
|
|
|
&& RX_PRELEN(old_re) == plen |
5786
|
8457216
|
|
|
|
|
&& memEQ(RX_PRECOMP(old_re), exp, plen) |
5787
|
8457120
|
|
|
|
|
&& !runtime_code /* with runtime code, always recompile */ ) |
5788
|
|
|
|
|
|
{ |
5789
|
8457112
|
|
|
|
|
Safefree(pRExC_state->code_blocks); |
5790
|
18342017
|
|
|
|
|
return old_re; |
5791
|
|
|
|
|
|
} |
5792
|
|
|
|
|
|
|
5793
|
|
|
|
|
|
rx_flags = orig_rx_flags; |
5794
|
|
|
|
|
|
|
5795
|
13459962
|
|
|
|
|
if (initial_charset == REGEX_LOCALE_CHARSET) { |
5796
|
13345125
|
|
|
|
|
RExC_contains_locale = 1; |
5797
|
|
|
|
|
|
} |
5798
|
13174618
|
|
|
|
|
else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { |
5799
|
|
|
|
|
|
|
5800
|
|
|
|
|
|
/* Set to use unicode semantics if the pattern is in utf8 and has the |
5801
|
|
|
|
|
|
* 'depends' charset specified, as it means unicode when utf8 */ |
5802
|
|
|
|
|
|
set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); |
5803
|
|
|
|
|
|
} |
5804
|
|
|
|
|
|
|
5805
|
13061762
|
|
|
|
|
RExC_precomp = exp; |
5806
|
13186260
|
|
|
|
|
RExC_flags = rx_flags; |
5807
|
5806115
|
|
|
|
|
RExC_pm_flags = pm_flags; |
5808
|
|
|
|
|
|
|
5809
|
1347418
|
|
|
|
|
if (runtime_code) { |
5810
|
1180809
|
|
|
|
|
if (TAINTING_get && TAINT_get) |
5811
|
1180785
|
|
|
|
|
Perl_croak(aTHX_ "Eval-group in insecure regular expression"); |
5812
|
|
|
|
|
|
|
5813
|
1347284
|
|
|
|
|
if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { |
5814
|
|
|
|
|
|
/* whoops, we have a non-utf8 pattern, whilst run-time code |
5815
|
|
|
|
|
|
* got compiled as utf8. Try again with a utf8 pattern */ |
5816
|
1347260
|
|
|
|
|
S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, |
5817
|
|
|
|
|
|
pRExC_state->num_code_blocks); |
5818
|
11574721
|
|
|
|
|
goto redo_first_pass; |
5819
|
|
|
|
|
|
} |
5820
|
|
|
|
|
|
} |
5821
|
4788639
|
|
|
|
|
assert(!pRExC_state->runtime_code_qr); |
5822
|
|
|
|
|
|
|
5823
|
487292
|
|
|
|
|
RExC_sawback = 0; |
5824
|
|
|
|
|
|
|
5825
|
6799143
|
|
|
|
|
RExC_seen = 0; |
5826
|
26008
|
|
|
|
|
RExC_in_lookbehind = 0; |
5827
|
25836
|
|
|
|
|
RExC_seen_zerolen = *exp == '^' ? -1 : 0; |
5828
|
25836
|
|
|
|
|
RExC_extralen = 0; |
5829
|
15622
|
|
|
|
|
RExC_override_recoding = 0; |
5830
|
15622
|
|
|
|
|
RExC_in_multi_char_class = 0; |
5831
|
|
|
|
|
|
|
5832
|
|
|
|
|
|
/* First pass: determine size, legality. */ |
5833
|
10372
|
|
|
|
|
RExC_parse = exp; |
5834
|
10372
|
|
|
|
|
RExC_start = exp; |
5835
|
25836
|
|
|
|
|
RExC_end = exp + plen; |
5836
|
25836
|
|
|
|
|
RExC_naughty = 0; |
5837
|
25836
|
|
|
|
|
RExC_npar = 1; |
5838
|
6760720
|
|
|
|
|
RExC_nestroot = 0; |
5839
|
1467975
|
|
|
|
|
RExC_size = 0L; |
5840
|
5292903
|
|
|
|
|
RExC_emit = &RExC_emit_dummy; |
5841
|
260520
|
|
|
|
|
RExC_whilem_seen = 0; |
5842
|
5032541
|
|
|
|
|
RExC_open_parens = NULL; |
5843
|
4401362
|
|
|
|
|
RExC_close_parens = NULL; |
5844
|
2929895
|
|
|
|
|
RExC_opend = NULL; |
5845
|
2977711
|
|
|
|
|
RExC_paren_names = NULL; |
5846
|
|
|
|
|
|
#ifdef DEBUGGING |
5847
|
2977711
|
|
|
|
|
RExC_paren_name_list = NULL; |
5848
|
|
|
|
|
|
#endif |
5849
|
2054988
|
|
|
|
|
RExC_recurse = NULL; |
5850
|
17720
|
|
|
|
|
RExC_recurse_count = 0; |
5851
|
17720
|
|
|
|
|
pRExC_state->code_index = 0; |
5852
|
|
|
|
|
|
|
5853
|
|
|
|
|
|
#if 0 /* REGC() is (currently) a NOP at the first pass. |
5854
|
|
|
|
|
|
* Clever compilers notice this and complain. --jhi */ |
5855
|
|
|
|
|
|
REGC((U8)REG_MAGIC, (char*)RExC_emit); |
5856
|
|
|
|
|
|
#endif |
5857
|
17720
|
|
|
|
|
DEBUG_PARSE_r( |
5858
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); |
5859
|
|
|
|
|
|
RExC_lastnum=0; |
5860
|
|
|
|
|
|
RExC_lastparse=NULL; |
5861
|
|
|
|
|
|
); |
5862
|
|
|
|
|
|
/* reg may croak on us, not giving us a chance to free |
5863
|
|
|
|
|
|
pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may |
5864
|
|
|
|
|
|
need it to survive as long as the regexp (qr/(?{})/). |
5865
|
|
|
|
|
|
We must check that code_blocksv is not already set, because we may |
5866
|
|
|
|
|
|
have jumped back to restart the sizing pass. */ |
5867
|
3029214
|
|
|
|
|
if (pRExC_state->code_blocks && !code_blocksv) { |
5868
|
1251520
|
|
|
|
|
code_blocksv = newSV_type(SVt_PV); |
5869
|
260060
|
|
|
|
|
SAVEFREESV(code_blocksv); |
5870
|
188944
|
|
|
|
|
SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks); |
5871
|
122602
|
|
|
|
|
SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/ |
5872
|
|
|
|
|
|
} |
5873
|
122692
|
|
|
|
|
if (reg(pRExC_state, 0, &flags,1) == NULL) { |
5874
|
|
|
|
|
|
/* It's possible to write a regexp in ascii that represents Unicode |
5875
|
|
|
|
|
|
codepoints outside of the byte range, such as via \x{100}. If we |
5876
|
|
|
|
|
|
detect such a sequence we have to convert the entire pattern to utf8 |
5877
|
|
|
|
|
|
and then recompile, as our sizing calculation will have been based |
5878
|
|
|
|
|
|
on 1 byte == 1 character, but we will need to use utf8 to encode |
5879
|
|
|
|
|
|
at least some part of the pattern, and therefore must convert the whole |
5880
|
|
|
|
|
|
thing. |
5881
|
|
|
|
|
|
-- dmq */ |
5882
|
122558
|
|
|
|
|
if (flags & RESTART_UTF8) { |
5883
|
122558
|
|
|
|
|
S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, |
5884
|
|
|
|
|
|
pRExC_state->num_code_blocks); |
5885
|
122558
|
|
|
|
|
goto redo_first_pass; |
5886
|
|
|
|
|
|
} |
5887
|
122534
|
|
|
|
|
Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); |
5888
|
|
|
|
|
|
} |
5889
|
8457206
|
|
|
|
|
if (code_blocksv) |
5890
|
163841
|
|
|
|
|
SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ |
5891
|
|
|
|
|
|
|
5892
|
163701
|
|
|
|
|
DEBUG_PARSE_r({ |
5893
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
5894
|
|
|
|
|
|
"Required size %"IVdf" nodes\n" |
5895
|
|
|
|
|
|
"Starting second pass (creation)\n", |
5896
|
|
|
|
|
|
(IV)RExC_size); |
5897
|
|
|
|
|
|
RExC_lastnum=0; |
5898
|
|
|
|
|
|
RExC_lastparse=NULL; |
5899
|
|
|
|
|
|
}); |
5900
|
|
|
|
|
|
|
5901
|
|
|
|
|
|
/* The first pass could have found things that force Unicode semantics */ |
5902
|
8457206
|
|
|
|
|
if ((RExC_utf8 || RExC_uni_semantics) |
5903
|
8457092
|
|
|
|
|
&& get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) |
5904
|
|
|
|
|
|
{ |
5905
|
|
|
|
|
|
set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); |
5906
|
|
|
|
|
|
} |
5907
|
|
|
|
|
|
|
5908
|
|
|
|
|
|
/* Small enough for pointer-storage convention? |
5909
|
|
|
|
|
|
If extralen==0, this means that we will not need long jumps. */ |
5910
|
8457206
|
|
|
|
|
if (RExC_size >= 0x10000L && RExC_extralen) |
5911
|
8457072
|
|
|
|
|
RExC_size += RExC_extralen; |
5912
|
|
|
|
|
|
else |
5913
|
8457206
|
|
|
|
|
RExC_extralen = 0; |
5914
|
8457206
|
|
|
|
|
if (RExC_whilem_seen > 15) |
5915
|
8457072
|
|
|
|
|
RExC_whilem_seen = 15; |
5916
|
|
|
|
|
|
|
5917
|
|
|
|
|
|
/* Allocate space and zero-initialize. Note, the two step process |
5918
|
|
|
|
|
|
of zeroing when in debug mode, thus anything assigned has to |
5919
|
|
|
|
|
|
happen after that */ |
5920
|
8457206
|
|
|
|
|
rx = (REGEXP*) newSV_type(SVt_REGEXP); |
5921
|
8457206
|
|
|
|
|
r = ReANY(rx); |
5922
|
8457206
|
|
|
|
|
Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), |
5923
|
|
|
|
|
|
char, regexp_internal); |
5924
|
6216215
|
|
|
|
|
if ( r == NULL || ri == NULL ) |
5925
|
6216081
|
|
|
|
|
FAIL("Regexp out of space"); |
5926
|
|
|
|
|
|
#ifdef DEBUGGING |
5927
|
|
|
|
|
|
/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ |
5928
|
8457206
|
|
|
|
|
Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); |
5929
|
|
|
|
|
|
#else |
5930
|
|
|
|
|
|
/* bulk initialize base fields with 0. */ |
5931
|
|
|
|
|
|
Zero(ri, sizeof(regexp_internal), char); |
5932
|
|
|
|
|
|
#endif |
5933
|
|
|
|
|
|
|
5934
|
|
|
|
|
|
/* non-zero initialization begins here */ |
5935
|
8457206
|
|
|
|
|
RXi_SET( r, ri ); |
5936
|
8457184
|
|
|
|
|
r->engine= eng; |
5937
|
8429185
|
|
|
|
|
r->extflags = rx_flags; |
5938
|
2749792
|
|
|
|
|
RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; |
5939
|
|
|
|
|
|
|
5940
|
1307114
|
|
|
|
|
if (pm_flags & PMf_IS_QR) { |
5941
|
756375
|
|
|
|
|
ri->code_blocks = pRExC_state->code_blocks; |
5942
|
756373
|
|
|
|
|
ri->num_code_blocks = pRExC_state->num_code_blocks; |
5943
|
|
|
|
|
|
} |
5944
|
|
|
|
|
|
else |
5945
|
|
|
|
|
|
{ |
5946
|
|
|
|
|
|
int n; |
5947
|
755647
|
|
|
|
|
for (n = 0; n < pRExC_state->num_code_blocks; n++) |
5948
|
8429075
|
|
|
|
|
if (pRExC_state->code_blocks[n].src_regex) |
5949
|
8429075
|
|
|
|
|
SAVEFREESV(pRExC_state->code_blocks[n].src_regex); |
5950
|
10521380
|
|
|
|
|
SAVEFREEPV(pRExC_state->code_blocks); |
5951
|
|
|
|
|
|
} |
5952
|
|
|
|
|
|
|
5953
|
|
|
|
|
|
{ |
5954
|
4541188
|
|
|
|
|
bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); |
5955
|
2092543
|
|
|
|
|
bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); |
5956
|
|
|
|
|
|
|
5957
|
|
|
|
|
|
/* The caret is output if there are any defaults: if not all the STD |
5958
|
|
|
|
|
|
* flags are set, or if no character set specifier is needed */ |
5959
|
24727911
|
|
|
|
|
bool has_default = |
5960
|
8350711
|
|
|
|
|
(((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) |
5961
|
8350711
|
|
|
|
|
|| ! has_charset); |
5962
|
8350711
|
|
|
|
|
bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); |
5963
|
8350711
|
|
|
|
|
U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) |
5964
|
|
|
|
|
|
>> RXf_PMf_STD_PMMOD_SHIFT); |
5965
|
|
|
|
|
|
const char *fptr = STD_PAT_MODS; /*"msix"*/ |
5966
|
|
|
|
|
|
char *p; |
5967
|
|
|
|
|
|
/* Allocate for the worst case, which is all the std flags are turned |
5968
|
|
|
|
|
|
* on. If more precision is desired, we could do a population count of |
5969
|
|
|
|
|
|
* the flags set. This could be done with a small lookup table, or by |
5970
|
|
|
|
|
|
* shifting, masking and adding, or even, when available, assembly |
5971
|
|
|
|
|
|
* language for a machine-language population count. |
5972
|
|
|
|
|
|
* We never output a minus, as all those are defaults, so are |
5973
|
|
|
|
|
|
* covered by the caret */ |
5974
|
8350711
|
|
|
|
|
const STRLEN wraplen = plen + has_p + has_runon |
5975
|
4113228
|
|
|
|
|
+ has_default /* If needs a caret */ |
5976
|
|
|
|
|
|
|
5977
|
|
|
|
|
|
/* If needs a character set specifier */ |
5978
|
4113228
|
|
|
|
|
+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) |
5979
|
|
|
|
|
|
+ (sizeof(STD_PAT_MODS) - 1) |
5980
|
|
|
|
|
|
+ (sizeof("(?:)") - 1); |
5981
|
|
|
|
|
|
|
5982
|
4113228
|
|
|
|
|
Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ |
5983
|
987909
|
|
|
|
|
r->xpv_len_u.xpvlenu_pv = p; |
5984
|
4113228
|
|
|
|
|
if (RExC_utf8) |
5985
|
4315977
|
|
|
|
|
SvFLAGS(rx) |= SVf_UTF8; |
5986
|
8429185
|
|
|
|
|
*p++='('; *p++='?'; |
5987
|
|
|
|
|
|
|
5988
|
|
|
|
|
|
/* If a default, cover it using the caret */ |
5989
|
29087870
|
|
|
|
|
if (has_default) { |
5990
|
8429185
|
|
|
|
|
*p++= DEFAULT_PAT_MOD; |
5991
|
|
|
|
|
|
} |
5992
|
8429185
|
|
|
|
|
if (has_charset) { |
5993
|
|
|
|
|
|
STRLEN len; |
5994
|
8429071
|
|
|
|
|
const char* const name = get_regex_charset_name(r->extflags, &len); |
5995
|
8429071
|
|
|
|
|
Copy(name, p, len, char); |
5996
|
8429071
|
|
|
|
|
p += len; |
5997
|
|
|
|
|
|
} |
5998
|
8429185
|
|
|
|
|
if (has_p) |
5999
|
4678074
|
|
|
|
|
*p++ = KEEPCOPY_PAT_MOD; /*'p'*/ |
6000
|
|
|
|
|
|
{ |
6001
|
|
|
|
|
|
char ch; |
6002
|
4678744
|
|
|
|
|
while((ch = *fptr++)) { |
6003
|
3751513
|
|
|
|
|
if(reganch & 1) |
6004
|
8429055
|
|
|
|
|
*p++ = ch; |
6005
|
8429587
|
|
|
|
|
reganch >>= 1; |
6006
|
|
|
|
|
|
} |
6007
|
|
|
|
|
|
} |
6008
|
|
|
|
|
|
|
6009
|
2240949
|
|
|
|
|
*p++ = ':'; |
6010
|
206347
|
|
|
|
|
Copy(RExC_precomp, p, plen, char); |
6011
|
8429185
|
|
|
|
|
assert ((RX_WRAPPED(rx) - p) < 16); |
6012
|
4018439
|
|
|
|
|
r->pre_prefix = p - RX_WRAPPED(rx); |
6013
|
1808186
|
|
|
|
|
p += plen; |
6014
|
561398
|
|
|
|
|
if (has_runon) |
6015
|
450060
|
|
|
|
|
*p++ = '\n'; |
6016
|
450194
|
|
|
|
|
*p++ = ')'; |
6017
|
450194
|
|
|
|
|
*p = 0; |
6018
|
450328
|
|
|
|
|
SvCUR_set(rx, p - RX_WRAPPED(rx)); |
6019
|
|
|
|
|
|
} |
6020
|
|
|
|
|
|
|
6021
|
450194
|
|
|
|
|
r->intflags = 0; |
6022
|
450194
|
|
|
|
|
r->nparens = RExC_npar - 1; /* set early to validate backrefs */ |
6023
|
|
|
|
|
|
|
6024
|
8429185
|
|
|
|
|
if (RExC_seen & REG_SEEN_RECURSE) { |
6025
|
4126792
|
|
|
|
|
Newxz(RExC_open_parens, RExC_npar,regnode *); |
6026
|
4126792
|
|
|
|
|
SAVEFREEPV(RExC_open_parens); |
6027
|
4126792
|
|
|
|
|
Newxz(RExC_close_parens,RExC_npar,regnode *); |
6028
|
4126792
|
|
|
|
|
SAVEFREEPV(RExC_close_parens); |
6029
|
|
|
|
|
|
} |
6030
|
|
|
|
|
|
|
6031
|
|
|
|
|
|
/* Useful during FAIL. */ |
6032
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
6033
|
4126926
|
|
|
|
|
Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ |
6034
|
117512
|
|
|
|
|
DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, |
6035
|
|
|
|
|
|
"%s %"UVuf" bytes for offset annotations.\n", |
6036
|
|
|
|
|
|
ri->u.offsets ? "Got" : "Couldn't get", |
6037
|
|
|
|
|
|
(UV)((2*RExC_size+1) * sizeof(U32)))); |
6038
|
|
|
|
|
|
#endif |
6039
|
4302393
|
|
|
|
|
SetProgLen(ri,RExC_size); |
6040
|
4302393
|
|
|
|
|
RExC_rx_sv = rx; |
6041
|
4302393
|
|
|
|
|
RExC_rx = r; |
6042
|
4302393
|
|
|
|
|
RExC_rxi = ri; |
6043
|
|
|
|
|
|
|
6044
|
|
|
|
|
|
/* Second pass: emit code. */ |
6045
|
4302393
|
|
|
|
|
RExC_flags = rx_flags; /* don't let top level (?i) bleed */ |
6046
|
8429185
|
|
|
|
|
RExC_pm_flags = pm_flags; |
6047
|
6308887
|
|
|
|
|
RExC_parse = exp; |
6048
|
6308887
|
|
|
|
|
RExC_end = exp + plen; |
6049
|
1676307
|
|
|
|
|
RExC_naughty = 0; |
6050
|
41430
|
|
|
|
|
RExC_npar = 1; |
6051
|
41430
|
|
|
|
|
RExC_emit_start = ri->program; |
6052
|
41430
|
|
|
|
|
RExC_emit = ri->program; |
6053
|
41430
|
|
|
|
|
RExC_emit_bound = ri->program + RExC_size + 1; |
6054
|
41430
|
|
|
|
|
pRExC_state->code_index = 0; |
6055
|
|
|
|
|
|
|
6056
|
41430
|
|
|
|
|
REGC((U8)REG_MAGIC, (char*) RExC_emit++); |
6057
|
41430
|
|
|
|
|
if (reg(pRExC_state, 0, &flags,1) == NULL) { |
6058
|
36678
|
|
|
|
|
ReREFCNT_dec(rx); |
6059
|
36678
|
|
|
|
|
Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); |
6060
|
|
|
|
|
|
} |
6061
|
|
|
|
|
|
/* XXXX To minimize changes to RE engine we always allocate |
6062
|
|
|
|
|
|
3-units-long substrs field. */ |
6063
|
36812
|
|
|
|
|
Newx(r->substrs, 1, struct reg_substr_data); |
6064
|
134
|
|
|
|
|
if (RExC_recurse_count) { |
6065
|
0
|
|
|
|
|
Newxz(RExC_recurse,RExC_recurse_count,regnode *); |
6066
|
0
|
|
|
|
|
SAVEFREEPV(RExC_recurse); |
6067
|
|
|
|
|
|
} |
6068
|
|
|
|
|
|
|
6069
|
|
|
|
|
|
reStudy: |
6070
|
140
|
|
|
|
|
r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; |
6071
|
140
|
|
|
|
|
Zero(r->substrs, 1, struct reg_substr_data); |
6072
|
|
|
|
|
|
|
6073
|
|
|
|
|
|
#ifdef TRIE_STUDY_OPT |
6074
|
140
|
|
|
|
|
if (!restudied) { |
6075
|
134
|
|
|
|
|
StructCopy(&zero_scan_data, &data, scan_data_t); |
6076
|
8465863
|
|
|
|
|
copyRExC_state = RExC_state; |
6077
|
|
|
|
|
|
} else { |
6078
|
8465735
|
|
|
|
|
U32 seen=RExC_seen; |
6079
|
7956857
|
|
|
|
|
DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); |
6080
|
|
|
|
|
|
|
6081
|
8465735
|
|
|
|
|
RExC_state = copyRExC_state; |
6082
|
123524
|
|
|
|
|
if (seen & REG_TOP_LEVEL_BRANCHES) |
6083
|
8465729
|
|
|
|
|
RExC_seen |= REG_TOP_LEVEL_BRANCHES; |
6084
|
|
|
|
|
|
else |
6085
|
329634
|
|
|
|
|
RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; |
6086
|
8465735
|
|
|
|
|
StructCopy(&zero_scan_data, &data, scan_data_t); |
6087
|
|
|
|
|
|
} |
6088
|
|
|
|
|
|
#else |
6089
|
|
|
|
|
|
StructCopy(&zero_scan_data, &data, scan_data_t); |
6090
|
|
|
|
|
|
#endif |
6091
|
|
|
|
|
|
|
6092
|
|
|
|
|
|
/* Dig out information for optimizations. */ |
6093
|
112956
|
|
|
|
|
r->extflags = RExC_flags; /* was pm_op */ |
6094
|
|
|
|
|
|
/*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ |
6095
|
|
|
|
|
|
|
6096
|
8465869
|
|
|
|
|
if (UTF) |
6097
|
126
|
|
|
|
|
SvUTF8_on(rx); /* Unicode in it? */ |
6098
|
8465869
|
|
|
|
|
ri->regstclass = NULL; |
6099
|
1374
|
|
|
|
|
if (RExC_naughty >= 10) /* Probably an expensive pattern. */ |
6100
|
1234
|
|
|
|
|
r->intflags |= PREGf_NAUGHTY; |
6101
|
8465869
|
|
|
|
|
scan = ri->program + 1; /* First BRANCH. */ |
6102
|
|
|
|
|
|
|
6103
|
|
|
|
|
|
/* testing for BRANCH here tells us whether there is "must appear" |
6104
|
|
|
|
|
|
data in the pattern. If there is then we can use it for optimisations */ |
6105
|
398
|
|
|
|
|
if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ |
6106
|
|
|
|
|
|
SSize_t fake; |
6107
|
|
|
|
|
|
STRLEN longest_float_length, longest_fixed_length; |
6108
|
|
|
|
|
|
struct regnode_charclass_class ch_class; /* pointed to by data */ |
6109
|
|
|
|
|
|
int stclass_flag; |
6110
|
8465869
|
|
|
|
|
SSize_t last_close = 0; /* pointed to by data */ |
6111
|
5306
|
|
|
|
|
regnode *first= scan; |
6112
|
8465869
|
|
|
|
|
regnode *first_next= regnext(first); |
6113
|
|
|
|
|
|
/* |
6114
|
|
|
|
|
|
* Skip introductions and multiplicators >= 1 |
6115
|
|
|
|
|
|
* so that we can extract the 'meat' of the pattern that must |
6116
|
|
|
|
|
|
* match in the large if() sequence following. |
6117
|
|
|
|
|
|
* NOTE that EXACT is NOT covered here, as it is normally |
6118
|
|
|
|
|
|
* picked up by the optimiser separately. |
6119
|
|
|
|
|
|
* |
6120
|
|
|
|
|
|
* This is unfortunate as the optimiser isnt handling lookahead |
6121
|
|
|
|
|
|
* properly currently. |
6122
|
|
|
|
|
|
* |
6123
|
|
|
|
|
|
*/ |
6124
|
13643
|
|
|
|
|
while ((OP(first) == OPEN && (sawopen = 1)) || |
6125
|
|
|
|
|
|
/* An OR of *one* alternative - should not happen now. */ |
6126
|
8457205
|
|
|
|
|
(OP(first) == BRANCH && OP(first_next) != BRANCH) || |
6127
|
|
|
|
|
|
/* for now we can't handle lookbehind IFMATCH*/ |
6128
|
8466009
|
|
|
|
|
(OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) || |
6129
|
8466009
|
|
|
|
|
(OP(first) == PLUS) || |
6130
|
8466009
|
|
|
|
|
(OP(first) == MINMOD) || |
6131
|
|
|
|
|
|
/* An {n,m} with n>0 */ |
6132
|
8466009
|
|
|
|
|
(PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || |
6133
|
39310
|
|
|
|
|
(OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) |
6134
|
|
|
|
|
|
{ |
6135
|
|
|
|
|
|
/* |
6136
|
|
|
|
|
|
* the only op that could be a regnode is PLUS, all the rest |
6137
|
|
|
|
|
|
* will be regnode_1 or regnode_2. |
6138
|
|
|
|
|
|
* |
6139
|
|
|
|
|
|
* (yves doesn't think this is true) |
6140
|
|
|
|
|
|
*/ |
6141
|
8426569
|
|
|
|
|
if (OP(first) == PLUS) |
6142
|
|
|
|
|
|
sawplus = 1; |
6143
|
|
|
|
|
|
else { |
6144
|
17803
|
|
|
|
|
if (OP(first) == MINMOD) |
6145
|
|
|
|
|
|
sawminmod = 1; |
6146
|
8408774
|
|
|
|
|
first += regarglen[OP(first)]; |
6147
|
|
|
|
|
|
} |
6148
|
36966
|
|
|
|
|
first = NEXTOPER(first); |
6149
|
8371816
|
|
|
|
|
first_next= regnext(first); |
6150
|
|
|
|
|
|
} |
6151
|
|
|
|
|
|
|
6152
|
|
|
|
|
|
/* Starting-point info. */ |
6153
|
|
|
|
|
|
again: |
6154
|
67727
|
|
|
|
|
DEBUG_PEEP("first:",first,0); |
6155
|
|
|
|
|
|
/* Ignore EXACT as we deal with it later. */ |
6156
|
8465869
|
|
|
|
|
if (PL_regkind[OP(first)] == EXACT) { |
6157
|
8465847
|
|
|
|
|
if (OP(first) == EXACT) |
6158
|
|
|
|
|
|
NOOP; /* Empty, get anchored substr later. */ |
6159
|
|
|
|
|
|
else |
6160
|
4601
|
|
|
|
|
ri->regstclass = first; |
6161
|
|
|
|
|
|
} |
6162
|
|
|
|
|
|
#ifdef TRIE_STCLASS |
6163
|
3324
|
|
|
|
|
else if (PL_regkind[OP(first)] == TRIE && |
6164
|
3302
|
|
|
|
|
((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) |
6165
|
8465733
|
|
|
|
|
{ |
6166
|
|
|
|
|
|
regnode *trie_op; |
6167
|
|
|
|
|
|
/* this can happen only on restudy */ |
6168
|
17830745
|
|
|
|
|
if ( OP(first) == TRIE ) { |
6169
|
8648
|
|
|
|
|
struct regnode_1 *trieop = (struct regnode_1 *) |
6170
|
|
|
|
|
|
PerlMemShared_calloc(1, sizeof(struct regnode_1)); |
6171
|
8648
|
|
|
|
|
StructCopy(first,trieop,struct regnode_1); |
6172
|
|
|
|
|
|
trie_op=(regnode *)trieop; |
6173
|
|
|
|
|
|
} else { |
6174
|
8562
|
|
|
|
|
struct regnode_charclass *trieop = (struct regnode_charclass *) |
6175
|
|
|
|
|
|
PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); |
6176
|
82
|
|
|
|
|
StructCopy(first,trieop,struct regnode_charclass); |
6177
|
|
|
|
|
|
trie_op=(regnode *)trieop; |
6178
|
|
|
|
|
|
} |
6179
|
10
|
|
|
|
|
OP(trie_op)+=2; |
6180
|
80
|
|
|
|
|
make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); |
6181
|
84
|
|
|
|
|
ri->regstclass = trie_op; |
6182
|
|
|
|
|
|
} |
6183
|
|
|
|
|
|
#endif |
6184
|
58
|
|
|
|
|
else if (REGNODE_SIMPLE(OP(first))) |
6185
|
38
|
|
|
|
|
ri->regstclass = first; |
6186
|
32
|
|
|
|
|
else if (PL_regkind[OP(first)] == BOUND || |
6187
|
|
|
|
|
|
PL_regkind[OP(first)] == NBOUND) |
6188
|
20
|
|
|
|
|
ri->regstclass = first; |
6189
|
36
|
|
|
|
|
else if (PL_regkind[OP(first)] == BOL) { |
6190
|
4319
|
|
|
|
|
r->extflags |= (OP(first) == MBOL |
6191
|
|
|
|
|
|
? RXf_ANCH_MBOL |
6192
|
226
|
|
|
|
|
: (OP(first) == SBOL |
6193
|
|
|
|
|
|
? RXf_ANCH_SBOL |
6194
|
|
|
|
|
|
: RXf_ANCH_BOL)); |
6195
|
226
|
|
|
|
|
first = NEXTOPER(first); |
6196
|
56
|
|
|
|
|
goto again; |
6197
|
|
|
|
|
|
} |
6198
|
186
|
|
|
|
|
else if (OP(first) == GPOS) { |
6199
|
170
|
|
|
|
|
r->extflags |= RXf_ANCH_GPOS; |
6200
|
113
|
|
|
|
|
first = NEXTOPER(first); |
6201
|
8562
|
|
|
|
|
goto again; |
6202
|
|
|
|
|
|
} |
6203
|
8594
|
|
|
|
|
else if ((!sawopen || !RExC_sawback) && |
6204
|
58
|
|
|
|
|
(OP(first) == STAR && |
6205
|
8562
|
|
|
|
|
PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && |
6206
|
8562
|
|
|
|
|
!(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) |
6207
|
|
|
|
|
|
{ |
6208
|
|
|
|
|
|
/* turn .* into ^.* with an implied $*=1 */ |
6209
|
|
|
|
|
|
const int type = |
6210
|
8562
|
|
|
|
|
(OP(NEXTOPER(first)) == REG_ANY) |
6211
|
|
|
|
|
|
? RXf_ANCH_MBOL |
6212
|
8554
|
|
|
|
|
: RXf_ANCH_SBOL; |
6213
|
8554
|
|
|
|
|
r->extflags |= type; |
6214
|
10146
|
|
|
|
|
r->intflags |= PREGf_IMPLICIT; |
6215
|
8782
|
|
|
|
|
first = NEXTOPER(first); |
6216
|
8782
|
|
|
|
|
goto again; |
6217
|
|
|
|
|
|
} |
6218
|
7394
|
|
|
|
|
if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback) |
6219
|
7254
|
|
|
|
|
&& !pRExC_state->num_code_blocks) /* May examine pos and $& */ |
6220
|
|
|
|
|
|
/* x+ must match at the 1st pos of run of x's */ |
6221
|
7254
|
|
|
|
|
r->intflags |= PREGf_SKIP; |
6222
|
|
|
|
|
|
|
6223
|
|
|
|
|
|
/* Scan is after the zeroth branch, first is atomic matcher. */ |
6224
|
|
|
|
|
|
#ifdef TRIE_STUDY_OPT |
6225
|
7394
|
|
|
|
|
DEBUG_PARSE_r( |
6226
|
|
|
|
|
|
if (!restudied) |
6227
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", |
6228
|
|
|
|
|
|
(IV)(first - scan + 1)) |
6229
|
|
|
|
|
|
); |
6230
|
|
|
|
|
|
#else |
6231
|
|
|
|
|
|
DEBUG_PARSE_r( |
6232
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", |
6233
|
|
|
|
|
|
(IV)(first - scan + 1)) |
6234
|
|
|
|
|
|
); |
6235
|
|
|
|
|
|
#endif |
6236
|
|
|
|
|
|
|
6237
|
|
|
|
|
|
|
6238
|
|
|
|
|
|
/* |
6239
|
|
|
|
|
|
* If there's something expensive in the r.e., find the |
6240
|
|
|
|
|
|
* longest literal string that must appear and make it the |
6241
|
|
|
|
|
|
* regmust. Resolve ties in favor of later strings, since |
6242
|
|
|
|
|
|
* the regstart check works with the beginning of the r.e. |
6243
|
|
|
|
|
|
* and avoiding duplication strengthens checking. Not a |
6244
|
|
|
|
|
|
* strong reason, but sufficient in the absence of others. |
6245
|
|
|
|
|
|
* [Now we resolve ties in favor of the earlier string if |
6246
|
|
|
|
|
|
* it happens that c_offset_min has been invalidated, since the |
6247
|
|
|
|
|
|
* earlier string may buy us something the later one won't.] |
6248
|
|
|
|
|
|
*/ |
6249
|
|
|
|
|
|
|
6250
|
1668
|
|
|
|
|
data.longest_fixed = newSVpvs(""); |
6251
|
144
|
|
|
|
|
data.longest_float = newSVpvs(""); |
6252
|
1732
|
|
|
|
|
data.last_found = newSVpvs(""); |
6253
|
208
|
|
|
|
|
data.longest = &(data.longest_fixed); |
6254
|
1504
|
|
|
|
|
ENTER_with_name("study_chunk"); |
6255
|
4441
|
|
|
|
|
SAVEFREESV(data.longest_fixed); |
6256
|
180
|
|
|
|
|
SAVEFREESV(data.longest_float); |
6257
|
180
|
|
|
|
|
SAVEFREESV(data.last_found); |
6258
|
180
|
|
|
|
|
first = scan; |
6259
|
156
|
|
|
|
|
if (!ri->regstclass) { |
6260
|
156
|
|
|
|
|
cl_init(pRExC_state, &ch_class); |
6261
|
156
|
|
|
|
|
data.start_class = &ch_class; |
6262
|
|
|
|
|
|
stclass_flag = SCF_DO_STCLASS_AND; |
6263
|
|
|
|
|
|
} else /* XXXX Check for BOUND? */ |
6264
|
|
|
|
|
|
stclass_flag = 0; |
6265
|
160
|
|
|
|
|
data.last_closep = &last_close; |
6266
|
|
|
|
|
|
|
6267
|
170
|
|
|
|
|
minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ |
6268
|
|
|
|
|
|
&data, -1, NULL, NULL, |
6269
|
|
|
|
|
|
SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag |
6270
|
|
|
|
|
|
| (restudied ? SCF_TRIE_DOING_RESTUDY : 0), |
6271
|
|
|
|
|
|
0); |
6272
|
|
|
|
|
|
|
6273
|
|
|
|
|
|
|
6274
|
196
|
|
|
|
|
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); |
6275
|
|
|
|
|
|
|
6276
|
|
|
|
|
|
|
6277
|
190
|
|
|
|
|
if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) |
6278
|
182
|
|
|
|
|
&& data.last_start_min == 0 && data.last_end > 0 |
6279
|
163
|
|
|
|
|
&& !RExC_seen_zerolen |
6280
|
218
|
|
|
|
|
&& !(RExC_seen & REG_SEEN_VERBARG) |
6281
|
218
|
|
|
|
|
&& !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) |
6282
|
218
|
|
|
|
|
r->extflags |= RXf_CHECK_ALL; |
6283
|
451
|
|
|
|
|
scan_commit(pRExC_state, &data,&minlen,0); |
6284
|
|
|
|
|
|
|
6285
|
254
|
|
|
|
|
longest_float_length = CHR_SVLEN(data.longest_float); |
6286
|
|
|
|
|
|
|
6287
|
368
|
|
|
|
|
if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */ |
6288
|
244
|
|
|
|
|
&& data.offset_fixed == data.offset_float_min |
6289
|
290
|
|
|
|
|
&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) |
6290
|
838
|
|
|
|
|
&& S_setup_longest (aTHX_ pRExC_state, |
6291
|
|
|
|
|
|
data.longest_float, |
6292
|
246
|
|
|
|
|
&(r->float_utf8), |
6293
|
246
|
|
|
|
|
&(r->float_substr), |
6294
|
246
|
|
|
|
|
&(r->float_end_shift), |
6295
|
|
|
|
|
|
data.lookbehind_float, |
6296
|
|
|
|
|
|
data.offset_float_min, |
6297
|
|
|
|
|
|
data.minlen_float, |
6298
|
|
|
|
|
|
longest_float_length, |
6299
|
254
|
|
|
|
|
cBOOL(data.flags & SF_FL_BEFORE_EOL), |
6300
|
250
|
|
|
|
|
cBOOL(data.flags & SF_FL_BEFORE_MEOL))) |
6301
|
|
|
|
|
|
{ |
6302
|
24
|
|
|
|
|
r->float_min_offset = data.offset_float_min - data.lookbehind_float; |
6303
|
24
|
|
|
|
|
r->float_max_offset = data.offset_float_max; |
6304
|
24
|
|
|
|
|
if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ |
6305
|
16
|
|
|
|
|
r->float_max_offset -= data.lookbehind_float; |
6306
|
12
|
|
|
|
|
SvREFCNT_inc_simple_void_NN(data.longest_float); |
6307
|
|
|
|
|
|
} |
6308
|
|
|
|
|
|
else { |
6309
|
138
|
|
|
|
|
r->float_substr = r->float_utf8 = NULL; |
6310
|
|
|
|
|
|
longest_float_length = 0; |
6311
|
|
|
|
|
|
} |
6312
|
|
|
|
|
|
|
6313
|
142
|
|
|
|
|
longest_fixed_length = CHR_SVLEN(data.longest_fixed); |
6314
|
|
|
|
|
|
|
6315
|
812
|
|
|
|
|
if (S_setup_longest (aTHX_ pRExC_state, |
6316
|
|
|
|
|
|
data.longest_fixed, |
6317
|
142
|
|
|
|
|
&(r->anchored_utf8), |
6318
|
142
|
|
|
|
|
&(r->anchored_substr), |
6319
|
144
|
|
|
|
|
&(r->anchored_end_shift), |
6320
|
150
|
|
|
|
|
data.lookbehind_fixed, |
6321
|
|
|
|
|
|
data.offset_fixed, |
6322
|
|
|
|
|
|
data.minlen_fixed, |
6323
|
|
|
|
|
|
longest_fixed_length, |
6324
|
150
|
|
|
|
|
cBOOL(data.flags & SF_FIX_BEFORE_EOL), |
6325
|
150
|
|
|
|
|
cBOOL(data.flags & SF_FIX_BEFORE_MEOL))) |
6326
|
|
|
|
|
|
{ |
6327
|
132
|
|
|
|
|
r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; |
6328
|
132
|
|
|
|
|
SvREFCNT_inc_simple_void_NN(data.longest_fixed); |
6329
|
|
|
|
|
|
} |
6330
|
|
|
|
|
|
else { |
6331
|
78
|
|
|
|
|
r->anchored_substr = r->anchored_utf8 = NULL; |
6332
|
|
|
|
|
|
longest_fixed_length = 0; |
6333
|
|
|
|
|
|
} |
6334
|
170
|
|
|
|
|
LEAVE_with_name("study_chunk"); |
6335
|
|
|
|
|
|
|
6336
|
170
|
|
|
|
|
if (ri->regstclass |
6337
|
56
|
|
|
|
|
&& (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) |
6338
|
54
|
|
|
|
|
ri->regstclass = NULL; |
6339
|
|
|
|
|
|
|
6340
|
174
|
|
|
|
|
if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) |
6341
|
42
|
|
|
|
|
&& stclass_flag |
6342
|
34
|
|
|
|
|
&& ! TEST_SSC_EOS(data.start_class) |
6343
|
24
|
|
|
|
|
&& !cl_is_anything(data.start_class)) |
6344
|
|
|
|
|
|
{ |
6345
|
36
|
|
|
|
|
const U32 n = add_data(pRExC_state, 1, "f"); |
6346
|
31
|
|
|
|
|
OP(data.start_class) = ANYOF_SYNTHETIC; |
6347
|
|
|
|
|
|
|
6348
|
16
|
|
|
|
|
Newx(RExC_rxi->data->data[n], 1, |
6349
|
|
|
|
|
|
struct regnode_charclass_class); |
6350
|
46427967
|
|
|
|
|
StructCopy(data.start_class, |
6351
|
|
|
|
|
|
(struct regnode_charclass_class*)RExC_rxi->data->data[n], |
6352
|
|
|
|
|
|
struct regnode_charclass_class); |
6353
|
69095412
|
|
|
|
|
ri->regstclass = (regnode*)RExC_rxi->data->data[n]; |
6354
|
46427967
|
|
|
|
|
r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ |
6355
|
46427607
|
|
|
|
|
DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); |
6356
|
|
|
|
|
|
regprop(r, sv, (regnode*)data.start_class); |
6357
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
6358
|
|
|
|
|
|
"synthetic stclass \"%s\".\n", |
6359
|
|
|
|
|
|
SvPVX_const(sv));}); |
6360
|
|
|
|
|
|
} |
6361
|
|
|
|
|
|
|
6362
|
|
|
|
|
|
/* A temporary algorithm prefers floated substr to fixed one to dig more info. */ |
6363
|
542
|
|
|
|
|
if (longest_fixed_length > longest_float_length) { |
6364
|
522
|
|
|
|
|
r->check_end_shift = r->anchored_end_shift; |
6365
|
186
|
|
|
|
|
r->check_substr = r->anchored_substr; |
6366
|
186
|
|
|
|
|
r->check_utf8 = r->anchored_utf8; |
6367
|
522
|
|
|
|
|
r->check_offset_min = r->check_offset_max = r->anchored_offset; |
6368
|
46428033
|
|
|
|
|
if (r->extflags & RXf_ANCH_SINGLE) |
6369
|
46427847
|
|
|
|
|
r->extflags |= RXf_NOSCAN; |
6370
|
|
|
|
|
|
} |
6371
|
|
|
|
|
|
else { |
6372
|
46427867
|
|
|
|
|
r->check_end_shift = r->float_end_shift; |
6373
|
388
|
|
|
|
|
r->check_substr = r->float_substr; |
6374
|
388
|
|
|
|
|
r->check_utf8 = r->float_utf8; |
6375
|
388
|
|
|
|
|
r->check_offset_min = r->float_min_offset; |
6376
|
46427499
|
|
|
|
|
r->check_offset_max = r->float_max_offset; |
6377
|
|
|
|
|
|
} |
6378
|
1042
|
|
|
|
|
if ((r->check_substr || r->check_utf8) ) { |
6379
|
1026
|
|
|
|
|
r->extflags |= RXf_USE_INTUIT; |
6380
|
1026
|
|
|
|
|
if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) |
6381
|
69087923
|
|
|
|
|
r->extflags |= RXf_INTUIT_TAIL; |
6382
|
|
|
|
|
|
} |
6383
|
|
|
|
|
|
/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) |
6384
|
|
|
|
|
|
if ( (STRLEN)minlen < longest_float_length ) |
6385
|
|
|
|
|
|
minlen= longest_float_length; |
6386
|
|
|
|
|
|
if ( (STRLEN)minlen < longest_fixed_length ) |
6387
|
|
|
|
|
|
minlen= longest_fixed_length; |
6388
|
|
|
|
|
|
*/ |
6389
|
|
|
|
|
|
} |
6390
|
|
|
|
|
|
else { |
6391
|
|
|
|
|
|
/* Several toplevels. Best we can is to set minlen. */ |
6392
|
|
|
|
|
|
SSize_t fake; |
6393
|
|
|
|
|
|
struct regnode_charclass_class ch_class; |
6394
|
65417605
|
|
|
|
|
SSize_t last_close = 0; |
6395
|
|
|
|
|
|
|
6396
|
42756255
|
|
|
|
|
DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); |
6397
|
|
|
|
|
|
|
6398
|
42755587
|
|
|
|
|
scan = ri->program + 1; |
6399
|
42755587
|
|
|
|
|
cl_init(pRExC_state, &ch_class); |
6400
|
42756863
|
|
|
|
|
data.start_class = &ch_class; |
6401
|
42756863
|
|
|
|
|
data.last_closep = &last_close; |
6402
|
|
|
|
|
|
|
6403
|
|
|
|
|
|
|
6404
|
42756863
|
|
|
|
|
minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, |
6405
|
|
|
|
|
|
&data, -1, NULL, NULL, |
6406
|
|
|
|
|
|
SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS |
6407
|
|
|
|
|
|
|(restudied ? SCF_TRIE_DOING_RESTUDY : 0), |
6408
|
|
|
|
|
|
0); |
6409
|
|
|
|
|
|
|
6410
|
42756863
|
|
|
|
|
CHECK_RESTUDY_GOTO_butfirst(NOOP); |
6411
|
|
|
|
|
|
|
6412
|
42756863
|
|
|
|
|
r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 |
6413
|
63589333
|
|
|
|
|
= r->float_substr = r->float_utf8 = NULL; |
6414
|
|
|
|
|
|
|
6415
|
58
|
|
|
|
|
if (! TEST_SSC_EOS(data.start_class) |
6416
|
16
|
|
|
|
|
&& !cl_is_anything(data.start_class)) |
6417
|
|
|
|
|
|
{ |
6418
|
42756805
|
|
|
|
|
const U32 n = add_data(pRExC_state, 1, "f"); |
6419
|
238428
|
|
|
|
|
OP(data.start_class) = ANYOF_SYNTHETIC; |
6420
|
|
|
|
|
|
|
6421
|
42518435
|
|
|
|
|
Newx(RExC_rxi->data->data[n], 1, |
6422
|
|
|
|
|
|
struct regnode_charclass_class); |
6423
|
42756863
|
|
|
|
|
StructCopy(data.start_class, |
6424
|
|
|
|
|
|
(struct regnode_charclass_class*)RExC_rxi->data->data[n], |
6425
|
|
|
|
|
|
struct regnode_charclass_class); |
6426
|
2298
|
|
|
|
|
ri->regstclass = (regnode*)RExC_rxi->data->data[n]; |
6427
|
0
|
|
|
|
|
r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ |
6428
|
0
|
|
|
|
|
DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); |
6429
|
|
|
|
|
|
regprop(r, sv, (regnode*)data.start_class); |
6430
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
6431
|
|
|
|
|
|
"synthetic stclass \"%s\".\n", |
6432
|
|
|
|
|
|
SvPVX_const(sv));}); |
6433
|
|
|
|
|
|
} |
6434
|
|
|
|
|
|
} |
6435
|
|
|
|
|
|
|
6436
|
|
|
|
|
|
/* Guard against an embedded (?=) or (?<=) with a longer minlen than |
6437
|
|
|
|
|
|
the "real" pattern. */ |
6438
|
134
|
|
|
|
|
DEBUG_OPTIMISE_r({ |
6439
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", |
6440
|
|
|
|
|
|
(IV)minlen, (IV)r->minlen); |
6441
|
|
|
|
|
|
}); |
6442
|
134
|
|
|
|
|
r->minlenret = minlen; |
6443
|
134
|
|
|
|
|
if (r->minlen < minlen) |
6444
|
124
|
|
|
|
|
r->minlen = minlen; |
6445
|
|
|
|
|
|
|
6446
|
134
|
|
|
|
|
if (RExC_seen & REG_SEEN_GPOS) |
6447
|
0
|
|
|
|
|
r->extflags |= RXf_GPOS_SEEN; |
6448
|
134
|
|
|
|
|
if (RExC_seen & REG_SEEN_LOOKBEHIND) |
6449
|
0
|
|
|
|
|
r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */ |
6450
|
2432
|
|
|
|
|
if (pRExC_state->num_code_blocks) |
6451
|
3671172
|
|
|
|
|
r->extflags |= RXf_EVAL_SEEN; |
6452
|
25595668
|
|
|
|
|
if (RExC_seen & REG_SEEN_CANY) |
6453
|
294292
|
|
|
|
|
r->extflags |= RXf_CANY_SEEN; |
6454
|
294426
|
|
|
|
|
if (RExC_seen & REG_SEEN_VERBARG) |
6455
|
|
|
|
|
|
{ |
6456
|
0
|
|
|
|
|
r->intflags |= PREGf_VERBARG_SEEN; |
6457
|
294292
|
|
|
|
|
r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ |
6458
|
|
|
|
|
|
} |
6459
|
134
|
|
|
|
|
if (RExC_seen & REG_SEEN_CUTGROUP) |
6460
|
0
|
|
|
|
|
r->intflags |= PREGf_CUTGROUP_SEEN; |
6461
|
134
|
|
|
|
|
if (pm_flags & PMf_USE_RE_EVAL) |
6462
|
24
|
|
|
|
|
r->intflags |= PREGf_USE_RE_EVAL; |
6463
|
134
|
|
|
|
|
if (RExC_paren_names) |
6464
|
0
|
|
|
|
|
RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); |
6465
|
|
|
|
|
|
else |
6466
|
134
|
|
|
|
|
RXp_PAREN_NAMES(r) = NULL; |
6467
|
|
|
|
|
|
|
6468
|
|
|
|
|
|
{ |
6469
|
134
|
|
|
|
|
regnode *first = ri->program + 1; |
6470
|
134
|
|
|
|
|
U8 fop = OP(first); |
6471
|
|
|
|
|
|
regnode *next = NEXTOPER(first); |
6472
|
134
|
|
|
|
|
U8 nop = OP(next); |
6473
|
|
|
|
|
|
|
6474
|
134
|
|
|
|
|
if (PL_regkind[fop] == NOTHING && nop == END) |
6475
|
2
|
|
|
|
|
r->extflags |= RXf_NULL; |
6476
|
132
|
|
|
|
|
else if (PL_regkind[fop] == BOL && nop == END) |
6477
|
0
|
|
|
|
|
r->extflags |= RXf_START_ONLY; |
6478
|
132
|
|
|
|
|
else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END) |
6479
|
0
|
|
|
|
|
r->extflags |= RXf_WHITE; |
6480
|
132
|
|
|
|
|
else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) |
6481
|
0
|
|
|
|
|
r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); |
6482
|
|
|
|
|
|
|
6483
|
|
|
|
|
|
} |
6484
|
|
|
|
|
|
#ifdef DEBUGGING |
6485
|
134
|
|
|
|
|
if (RExC_paren_names) { |
6486
|
0
|
|
|
|
|
ri->name_list_idx = add_data( pRExC_state, 1, "a" ); |
6487
|
0
|
|
|
|
|
ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); |
6488
|
|
|
|
|
|
} else |
6489
|
|
|
|
|
|
#endif |
6490
|
134
|
|
|
|
|
ri->name_list_idx = 0; |
6491
|
|
|
|
|
|
|
6492
|
134
|
|
|
|
|
if (RExC_recurse_count) { |
6493
|
0
|
|
|
|
|
for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { |
6494
|
0
|
|
|
|
|
const regnode *scan = RExC_recurse[RExC_recurse_count-1]; |
6495
|
0
|
|
|
|
|
ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); |
6496
|
|
|
|
|
|
} |
6497
|
|
|
|
|
|
} |
6498
|
134
|
|
|
|
|
Newxz(r->offs, RExC_npar, regexp_paren_pair); |
6499
|
|
|
|
|
|
/* assume we don't need to swap parens around before we match */ |
6500
|
|
|
|
|
|
|
6501
|
134
|
|
|
|
|
DEBUG_DUMP_r({ |
6502
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,"Final program:\n"); |
6503
|
|
|
|
|
|
regdump(r); |
6504
|
|
|
|
|
|
}); |
6505
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
6506
|
134
|
|
|
|
|
DEBUG_OFFSETS_r(if (ri->u.offsets) { |
6507
|
|
|
|
|
|
const STRLEN len = ri->u.offsets[0]; |
6508
|
|
|
|
|
|
STRLEN i; |
6509
|
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
6510
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); |
6511
|
|
|
|
|
|
for (i = 1; i <= len; i++) { |
6512
|
|
|
|
|
|
if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) |
6513
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", |
6514
|
|
|
|
|
|
(UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); |
6515
|
|
|
|
|
|
} |
6516
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "\n"); |
6517
|
|
|
|
|
|
}); |
6518
|
|
|
|
|
|
#endif |
6519
|
|
|
|
|
|
|
6520
|
|
|
|
|
|
#ifdef USE_ITHREADS |
6521
|
|
|
|
|
|
/* under ithreads the ?pat? PMf_USED flag on the pmop is simulated |
6522
|
|
|
|
|
|
* by setting the regexp SV to readonly-only instead. If the |
6523
|
|
|
|
|
|
* pattern's been recompiled, the USEDness should remain. */ |
6524
|
|
|
|
|
|
if (old_re && SvREADONLY(old_re)) |
6525
|
|
|
|
|
|
SvREADONLY_on(rx); |
6526
|
|
|
|
|
|
#endif |
6527
|
|
|
|
|
|
return rx; |
6528
|
|
|
|
|
|
} |
6529
|
|
|
|
|
|
|
6530
|
|
|
|
|
|
|
6531
|
|
|
|
|
|
SV* |
6532
|
0
|
|
|
|
|
Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, |
6533
|
|
|
|
|
|
const U32 flags) |
6534
|
|
|
|
|
|
{ |
6535
|
0
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NAMED_BUFF; |
6536
|
|
|
|
|
|
|
6537
|
|
|
|
|
|
PERL_UNUSED_ARG(value); |
6538
|
|
|
|
|
|
|
6539
|
0
|
|
|
|
|
if (flags & RXapif_FETCH) { |
6540
|
2086952
|
|
|
|
|
return reg_named_buff_fetch(rx, key, flags); |
6541
|
2086952
|
|
|
|
|
} else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { |
6542
|
39214
|
|
|
|
|
Perl_croak_no_modify(); |
6543
|
|
|
|
|
|
return NULL; |
6544
|
39214
|
|
|
|
|
} else if (flags & RXapif_EXISTS) { |
6545
|
39214
|
|
|
|
|
return reg_named_buff_exists(rx, key, flags) |
6546
|
|
|
|
|
|
? &PL_sv_yes |
6547
|
38946
|
|
|
|
|
: &PL_sv_no; |
6548
|
49992
|
|
|
|
|
} else if (flags & RXapif_REGNAMES) { |
6549
|
64429
|
|
|
|
|
return reg_named_buff_all(rx, flags); |
6550
|
91290
|
|
|
|
|
} else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { |
6551
|
91290
|
|
|
|
|
return reg_named_buff_scalar(rx, flags); |
6552
|
|
|
|
|
|
} else { |
6553
|
268
|
|
|
|
|
Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); |
6554
|
|
|
|
|
|
return NULL; |
6555
|
|
|
|
|
|
} |
6556
|
|
|
|
|
|
} |
6557
|
|
|
|
|
|
|
6558
|
|
|
|
|
|
SV* |
6559
|
268
|
|
|
|
|
Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, |
6560
|
|
|
|
|
|
const U32 flags) |
6561
|
|
|
|
|
|
{ |
6562
|
38946
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; |
6563
|
|
|
|
|
|
PERL_UNUSED_ARG(lastkey); |
6564
|
|
|
|
|
|
|
6565
|
20306
|
|
|
|
|
if (flags & RXapif_FIRSTKEY) |
6566
|
20306
|
|
|
|
|
return reg_named_buff_firstkey(rx, flags); |
6567
|
20306
|
|
|
|
|
else if (flags & RXapif_NEXTKEY) |
6568
|
8070
|
|
|
|
|
return reg_named_buff_nextkey(rx, flags); |
6569
|
|
|
|
|
|
else { |
6570
|
8070
|
|
|
|
|
Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); |
6571
|
|
|
|
|
|
return NULL; |
6572
|
|
|
|
|
|
} |
6573
|
|
|
|
|
|
} |
6574
|
|
|
|
|
|
|
6575
|
|
|
|
|
|
SV* |
6576
|
0
|
|
|
|
|
Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, |
6577
|
|
|
|
|
|
const U32 flags) |
6578
|
|
|
|
|
|
{ |
6579
|
|
|
|
|
|
AV *retarray = NULL; |
6580
|
|
|
|
|
|
SV *ret; |
6581
|
8070
|
|
|
|
|
struct regexp *const rx = ReANY(r); |
6582
|
|
|
|
|
|
|
6583
|
7958
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; |
6584
|
|
|
|
|
|
|
6585
|
8070
|
|
|
|
|
if (flags & RXapif_ALL) |
6586
|
7958
|
|
|
|
|
retarray=newAV(); |
6587
|
|
|
|
|
|
|
6588
|
8070
|
|
|
|
|
if (rx && RXp_PAREN_NAMES(rx)) { |
6589
|
112
|
|
|
|
|
HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); |
6590
|
19417
|
|
|
|
|
if (he_str) { |
6591
|
|
|
|
|
|
IV i; |
6592
|
14672152
|
|
|
|
|
SV* sv_dat=HeVAL(he_str); |
6593
|
14672152
|
|
|
|
|
I32 *nums=(I32*)SvPVX(sv_dat); |
6594
|
14672152
|
|
|
|
|
for ( i=0; i
|
6595
|
10708566
|
|
|
|
|
if ((I32)(rx->nparens) >= nums[i] |
6596
|
52907588
|
|
|
|
|
&& rx->offs[nums[i]].start != -1 |
6597
|
24765565
|
|
|
|
|
&& rx->offs[nums[i]].end != -1) |
6598
|
|
|
|
|
|
{ |
6599
|
19295878
|
|
|
|
|
ret = newSVpvs(""); |
6600
|
3349824
|
|
|
|
|
CALLREG_NUMBUF_FETCH(r,nums[i],ret); |
6601
|
987586
|
|
|
|
|
if (!retarray) |
6602
|
|
|
|
|
|
return ret; |
6603
|
|
|
|
|
|
} else { |
6604
|
11342222
|
|
|
|
|
if (retarray) |
6605
|
0
|
|
|
|
|
ret = newSVsv(&PL_sv_undef); |
6606
|
|
|
|
|
|
} |
6607
|
11342222
|
|
|
|
|
if (retarray) |
6608
|
14958468
|
|
|
|
|
av_push(retarray, ret); |
6609
|
|
|
|
|
|
} |
6610
|
14958468
|
|
|
|
|
if (retarray) |
6611
|
14958468
|
|
|
|
|
return newRV_noinc(MUTABLE_SV(retarray)); |
6612
|
|
|
|
|
|
} |
6613
|
|
|
|
|
|
} |
6614
|
|
|
|
|
|
return NULL; |
6615
|
|
|
|
|
|
} |
6616
|
|
|
|
|
|
|
6617
|
|
|
|
|
|
bool |
6618
|
14958468
|
|
|
|
|
Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, |
6619
|
|
|
|
|
|
const U32 flags) |
6620
|
|
|
|
|
|
{ |
6621
|
14958468
|
|
|
|
|
struct regexp *const rx = ReANY(r); |
6622
|
|
|
|
|
|
|
6623
|
14958468
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; |
6624
|
|
|
|
|
|
|
6625
|
14958468
|
|
|
|
|
if (rx && RXp_PAREN_NAMES(rx)) { |
6626
|
707352
|
|
|
|
|
if (flags & RXapif_ALL) { |
6627
|
707352
|
|
|
|
|
return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); |
6628
|
|
|
|
|
|
} else { |
6629
|
707352
|
|
|
|
|
SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); |
6630
|
707352
|
|
|
|
|
if (sv) { |
6631
|
707352
|
|
|
|
|
SvREFCNT_dec_NN(sv); |
6632
|
707352
|
|
|
|
|
return TRUE; |
6633
|
|
|
|
|
|
} else { |
6634
|
|
|
|
|
|
return FALSE; |
6635
|
|
|
|
|
|
} |
6636
|
|
|
|
|
|
} |
6637
|
|
|
|
|
|
} else { |
6638
|
|
|
|
|
|
return FALSE; |
6639
|
|
|
|
|
|
} |
6640
|
|
|
|
|
|
} |
6641
|
|
|
|
|
|
|
6642
|
|
|
|
|
|
SV* |
6643
|
0
|
|
|
|
|
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) |
6644
|
|
|
|
|
|
{ |
6645
|
707352
|
|
|
|
|
struct regexp *const rx = ReANY(r); |
6646
|
|
|
|
|
|
|
6647
|
707352
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; |
6648
|
|
|
|
|
|
|
6649
|
707352
|
|
|
|
|
if ( rx && RXp_PAREN_NAMES(rx) ) { |
6650
|
707352
|
|
|
|
|
(void)hv_iterinit(RXp_PAREN_NAMES(rx)); |
6651
|
|
|
|
|
|
|
6652
|
707352
|
|
|
|
|
return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); |
6653
|
|
|
|
|
|
} else { |
6654
|
|
|
|
|
|
return FALSE; |
6655
|
|
|
|
|
|
} |
6656
|
|
|
|
|
|
} |
6657
|
|
|
|
|
|
|
6658
|
|
|
|
|
|
SV* |
6659
|
1830678
|
|
|
|
|
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) |
6660
|
|
|
|
|
|
{ |
6661
|
1830678
|
|
|
|
|
struct regexp *const rx = ReANY(r); |
6662
|
1830678
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
6663
|
|
|
|
|
|
|
6664
|
5783079
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; |
6665
|
|
|
|
|
|
|
6666
|
11342132
|
|
|
|
|
if (rx && RXp_PAREN_NAMES(rx)) { |
6667
|
11342132
|
|
|
|
|
HV *hv = RXp_PAREN_NAMES(rx); |
6668
|
|
|
|
|
|
HE *temphe; |
6669
|
6896898
|
|
|
|
|
while ( (temphe = hv_iternext_flags(hv,0)) ) { |
6670
|
|
|
|
|
|
IV i; |
6671
|
|
|
|
|
|
IV parno = 0; |
6672
|
6896898
|
|
|
|
|
SV* sv_dat = HeVAL(temphe); |
6673
|
4445234
|
|
|
|
|
I32 *nums = (I32*)SvPVX(sv_dat); |
6674
|
4445234
|
|
|
|
|
for ( i = 0; i < SvIVX(sv_dat); i++ ) { |
6675
|
4445234
|
|
|
|
|
if ((I32)(rx->lastparen) >= nums[i] && |
6676
|
0
|
|
|
|
|
rx->offs[nums[i]].start != -1 && |
6677
|
0
|
|
|
|
|
rx->offs[nums[i]].end != -1) |
6678
|
|
|
|
|
|
{ |
6679
|
0
|
|
|
|
|
parno = nums[i]; |
6680
|
4445234
|
|
|
|
|
break; |
6681
|
|
|
|
|
|
} |
6682
|
|
|
|
|
|
} |
6683
|
4445234
|
|
|
|
|
if (parno || flags & RXapif_ALL) { |
6684
|
687818
|
|
|
|
|
return newSVhek(HeKEY_hek(temphe)); |
6685
|
|
|
|
|
|
} |
6686
|
|
|
|
|
|
} |
6687
|
|
|
|
|
|
} |
6688
|
|
|
|
|
|
return NULL; |
6689
|
|
|
|
|
|
} |
6690
|
|
|
|
|
|
|
6691
|
|
|
|
|
|
SV* |
6692
|
687818
|
|
|
|
|
Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) |
6693
|
|
|
|
|
|
{ |
6694
|
|
|
|
|
|
SV *ret; |
6695
|
|
|
|
|
|
AV *av; |
6696
|
|
|
|
|
|
SSize_t length; |
6697
|
0
|
|
|
|
|
struct regexp *const rx = ReANY(r); |
6698
|
|
|
|
|
|
|
6699
|
11342132
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; |
6700
|
|
|
|
|
|
|
6701
|
10654314
|
|
|
|
|
if (rx && RXp_PAREN_NAMES(rx)) { |
6702
|
10654314
|
|
|
|
|
if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { |
6703
|
1830588
|
|
|
|
|
return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); |
6704
|
1830588
|
|
|
|
|
} else if (flags & RXapif_ONE) { |
6705
|
8823726
|
|
|
|
|
ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); |
6706
|
10654314
|
|
|
|
|
av = MUTABLE_AV(SvRV(ret)); |
6707
|
10654314
|
|
|
|
|
length = av_len(av); |
6708
|
10653628
|
|
|
|
|
SvREFCNT_dec_NN(ret); |
6709
|
686
|
|
|
|
|
return newSViv(length + 1); |
6710
|
|
|
|
|
|
} else { |
6711
|
3349824
|
|
|
|
|
Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); |
6712
|
|
|
|
|
|
return NULL; |
6713
|
|
|
|
|
|
} |
6714
|
|
|
|
|
|
} |
6715
|
|
|
|
|
|
return &PL_sv_undef; |
6716
|
|
|
|
|
|
} |
6717
|
|
|
|
|
|
|
6718
|
|
|
|
|
|
SV* |
6719
|
3349824
|
|
|
|
|
Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) |
6720
|
|
|
|
|
|
{ |
6721
|
3349824
|
|
|
|
|
struct regexp *const rx = ReANY(r); |
6722
|
3349824
|
|
|
|
|
AV *av = newAV(); |
6723
|
|
|
|
|
|
|
6724
|
3349824
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; |
6725
|
|
|
|
|
|
|
6726
|
2217452
|
|
|
|
|
if (rx && RXp_PAREN_NAMES(rx)) { |
6727
|
2129444
|
|
|
|
|
HV *hv= RXp_PAREN_NAMES(rx); |
6728
|
|
|
|
|
|
HE *temphe; |
6729
|
227640
|
|
|
|
|
(void)hv_iterinit(hv); |
6730
|
248976
|
|
|
|
|
while ( (temphe = hv_iternext_flags(hv,0)) ) { |
6731
|
|
|
|
|
|
IV i; |
6732
|
|
|
|
|
|
IV parno = 0; |
6733
|
1132372
|
|
|
|
|
SV* sv_dat = HeVAL(temphe); |
6734
|
52594
|
|
|
|
|
I32 *nums = (I32*)SvPVX(sv_dat); |
6735
|
1330892
|
|
|
|
|
for ( i = 0; i < SvIVX(sv_dat); i++ ) { |
6736
|
1060580
|
|
|
|
|
if ((I32)(rx->lastparen) >= nums[i] && |
6737
|
1060580
|
|
|
|
|
rx->offs[nums[i]].start != -1 && |
6738
|
807960
|
|
|
|
|
rx->offs[nums[i]].end != -1) |
6739
|
|
|
|
|
|
{ |
6740
|
280234
|
|
|
|
|
parno = nums[i]; |
6741
|
1815029
|
|
|
|
|
break; |
6742
|
|
|
|
|
|
} |
6743
|
|
|
|
|
|
} |
6744
|
20230
|
|
|
|
|
if (parno || flags & RXapif_ALL) { |
6745
|
20230
|
|
|
|
|
av_push(av, newSVhek(HeKEY_hek(temphe))); |
6746
|
|
|
|
|
|
} |
6747
|
|
|
|
|
|
} |
6748
|
|
|
|
|
|
} |
6749
|
|
|
|
|
|
|
6750
|
20230
|
|
|
|
|
return newRV_noinc(MUTABLE_SV(av)); |
6751
|
|
|
|
|
|
} |
6752
|
|
|
|
|
|
|
6753
|
|
|
|
|
|
void |
6754
|
20230
|
|
|
|
|
Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, |
6755
|
|
|
|
|
|
SV * const sv) |
6756
|
|
|
|
|
|
{ |
6757
|
65613
|
|
|
|
|
struct regexp *const rx = ReANY(r); |
6758
|
|
|
|
|
|
char *s = NULL; |
6759
|
|
|
|
|
|
SSize_t i = 0; |
6760
|
|
|
|
|
|
SSize_t s1, t1; |
6761
|
|
|
|
|
|
I32 n = paren; |
6762
|
|
|
|
|
|
|
6763
|
40362
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; |
6764
|
|
|
|
|
|
|
6765
|
27610
|
|
|
|
|
if ( n == RX_BUFF_IDX_CARET_PREMATCH |
6766
|
27610
|
|
|
|
|
|| n == RX_BUFF_IDX_CARET_FULLMATCH |
6767
|
25822
|
|
|
|
|
|| n == RX_BUFF_IDX_CARET_POSTMATCH |
6768
|
|
|
|
|
|
) |
6769
|
|
|
|
|
|
{ |
6770
|
25822
|
|
|
|
|
bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); |
6771
|
4602
|
|
|
|
|
if (!keepcopy) { |
6772
|
|
|
|
|
|
/* on something like |
6773
|
|
|
|
|
|
* $r = qr/.../; |
6774
|
|
|
|
|
|
* /$qr/p; |
6775
|
|
|
|
|
|
* the KEEPCOPY is set on the PMOP rather than the regex */ |
6776
|
33972
|
|
|
|
|
if (PL_curpm && r == PM_GETRE(PL_curpm)) |
6777
|
33972
|
|
|
|
|
keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); |
6778
|
|
|
|
|
|
} |
6779
|
976008
|
|
|
|
|
if (!keepcopy) |
6780
|
|
|
|
|
|
goto ret_undef; |
6781
|
|
|
|
|
|
} |
6782
|
|
|
|
|
|
|
6783
|
942036
|
|
|
|
|
if (!rx->subbeg) |
6784
|
|
|
|
|
|
goto ret_undef; |
6785
|
|
|
|
|
|
|
6786
|
942036
|
|
|
|
|
if (n == RX_BUFF_IDX_CARET_FULLMATCH) |
6787
|
|
|
|
|
|
/* no need to distinguish between them any more */ |
6788
|
|
|
|
|
|
n = RX_BUFF_IDX_FULLMATCH; |
6789
|
|
|
|
|
|
|
6790
|
33972
|
|
|
|
|
if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) |
6791
|
3798
|
|
|
|
|
&& rx->offs[0].start != -1) |
6792
|
|
|
|
|
|
{ |
6793
|
|
|
|
|
|
/* $`, ${^PREMATCH} */ |
6794
|
4
|
|
|
|
|
i = rx->offs[0].start; |
6795
|
4
|
|
|
|
|
s = rx->subbeg; |
6796
|
|
|
|
|
|
} |
6797
|
|
|
|
|
|
else |
6798
|
30174
|
|
|
|
|
if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) |
6799
|
3633929
|
|
|
|
|
&& rx->offs[0].end != -1) |
6800
|
|
|
|
|
|
{ |
6801
|
|
|
|
|
|
/* $', ${^POSTMATCH} */ |
6802
|
6983254
|
|
|
|
|
s = rx->subbeg - rx->suboffset + rx->offs[0].end; |
6803
|
285628
|
|
|
|
|
i = rx->sublen + rx->suboffset - rx->offs[0].end; |
6804
|
|
|
|
|
|
} |
6805
|
|
|
|
|
|
else |
6806
|
285520
|
|
|
|
|
if ( 0 <= n && n <= (I32)rx->nparens && |
6807
|
916
|
|
|
|
|
(s1 = rx->offs[n].start) != -1 && |
6808
|
285628
|
|
|
|
|
(t1 = rx->offs[n].end) != -1) |
6809
|
|
|
|
|
|
{ |
6810
|
|
|
|
|
|
/* $&, ${^MATCH}, $1 ... */ |
6811
|
285520
|
|
|
|
|
i = t1 - s1; |
6812
|
285520
|
|
|
|
|
s = rx->subbeg + s1 - rx->suboffset; |
6813
|
|
|
|
|
|
} else { |
6814
|
|
|
|
|
|
goto ret_undef; |
6815
|
|
|
|
|
|
} |
6816
|
|
|
|
|
|
|
6817
|
95060
|
|
|
|
|
assert(s >= rx->subbeg); |
6818
|
3348301
|
|
|
|
|
assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); |
6819
|
52
|
|
|
|
|
if (i >= 0) { |
6820
|
|
|
|
|
|
#if NO_TAINT_SUPPORT |
6821
|
|
|
|
|
|
sv_setpvn(sv, s, i); |
6822
|
|
|
|
|
|
#else |
6823
|
0
|
|
|
|
|
const int oldtainted = TAINT_get; |
6824
|
52
|
|
|
|
|
TAINT_NOT; |
6825
|
0
|
|
|
|
|
sv_setpvn(sv, s, i); |
6826
|
0
|
|
|
|
|
TAINT_set(oldtainted); |
6827
|
|
|
|
|
|
#endif |
6828
|
0
|
|
|
|
|
if ( (rx->extflags & RXf_CANY_SEEN) |
6829
|
0
|
|
|
|
|
? (RXp_MATCH_UTF8(rx) |
6830
|
52
|
|
|
|
|
&& (!i || is_utf8_string((U8*)s, i))) |
6831
|
0
|
|
|
|
|
: (RXp_MATCH_UTF8(rx)) ) |
6832
|
|
|
|
|
|
{ |
6833
|
3348249
|
|
|
|
|
SvUTF8_on(sv); |
6834
|
|
|
|
|
|
} |
6835
|
|
|
|
|
|
else |
6836
|
2
|
|
|
|
|
SvUTF8_off(sv); |
6837
|
2
|
|
|
|
|
if (TAINTING_get) { |
6838
|
2
|
|
|
|
|
if (RXp_MATCH_TAINTED(rx)) { |
6839
|
0
|
|
|
|
|
if (SvTYPE(sv) >= SVt_PVMG) { |
6840
|
0
|
|
|
|
|
MAGIC* const mg = SvMAGIC(sv); |
6841
|
|
|
|
|
|
MAGIC* mgt; |
6842
|
3348249
|
|
|
|
|
TAINT; |
6843
|
3348249
|
|
|
|
|
SvMAGIC_set(sv, mg->mg_moremagic); |
6844
|
17489700
|
|
|
|
|
SvTAINT(sv); |
6845
|
12500446
|
|
|
|
|
if ((mgt = SvMAGIC(sv))) { |
6846
|
8162610
|
|
|
|
|
mg->mg_moremagic = mgt; |
6847
|
1404116
|
|
|
|
|
SvMAGIC_set(sv, mg); |
6848
|
|
|
|
|
|
} |
6849
|
|
|
|
|
|
} else { |
6850
|
4898992
|
|
|
|
|
TAINT; |
6851
|
4898992
|
|
|
|
|
SvTAINT(sv); |
6852
|
|
|
|
|
|
} |
6853
|
|
|
|
|
|
} else |
6854
|
7601454
|
|
|
|
|
SvTAINTED_off(sv); |
6855
|
|
|
|
|
|
} |
6856
|
|
|
|
|
|
} else { |
6857
|
|
|
|
|
|
ret_undef: |
6858
|
7601454
|
|
|
|
|
sv_setsv(sv,&PL_sv_undef); |
6859
|
12500446
|
|
|
|
|
return; |
6860
|
|
|
|
|
|
} |
6861
|
|
|
|
|
|
} |
6862
|
|
|
|
|
|
|
6863
|
|
|
|
|
|
void |
6864
|
6504669
|
|
|
|
|
Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, |
6865
|
|
|
|
|
|
SV const * const value) |
6866
|
|
|
|
|
|
{ |
6867
|
5327917
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; |
6868
|
|
|
|
|
|
|
6869
|
|
|
|
|
|
PERL_UNUSED_ARG(rx); |
6870
|
|
|
|
|
|
PERL_UNUSED_ARG(paren); |
6871
|
|
|
|
|
|
PERL_UNUSED_ARG(value); |
6872
|
|
|
|
|
|
|
6873
|
6504669
|
|
|
|
|
if (!PL_localizing) |
6874
|
5995777
|
|
|
|
|
Perl_croak_no_modify(); |
6875
|
5995777
|
|
|
|
|
} |
6876
|
|
|
|
|
|
|
6877
|
|
|
|
|
|
I32 |
6878
|
8749986
|
|
|
|
|
Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, |
6879
|
|
|
|
|
|
const I32 paren) |
6880
|
|
|
|
|
|
{ |
6881
|
3348249
|
|
|
|
|
struct regexp *const rx = ReANY(r); |
6882
|
|
|
|
|
|
I32 i; |
6883
|
|
|
|
|
|
I32 s1, t1; |
6884
|
|
|
|
|
|
|
6885
|
2861965
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; |
6886
|
|
|
|
|
|
|
6887
|
508394
|
|
|
|
|
if ( paren == RX_BUFF_IDX_CARET_PREMATCH |
6888
|
3348249
|
|
|
|
|
|| paren == RX_BUFF_IDX_CARET_FULLMATCH |
6889
|
3347751
|
|
|
|
|
|| paren == RX_BUFF_IDX_CARET_POSTMATCH |
6890
|
|
|
|
|
|
) |
6891
|
|
|
|
|
|
{ |
6892
|
3348249
|
|
|
|
|
bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); |
6893
|
3348249
|
|
|
|
|
if (!keepcopy) { |
6894
|
|
|
|
|
|
/* on something like |
6895
|
|
|
|
|
|
* $r = qr/.../; |
6896
|
|
|
|
|
|
* /$qr/p; |
6897
|
|
|
|
|
|
* the KEEPCOPY is set on the PMOP rather than the regex */ |
6898
|
3348249
|
|
|
|
|
if (PL_curpm && r == PM_GETRE(PL_curpm)) |
6899
|
3347751
|
|
|
|
|
keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); |
6900
|
|
|
|
|
|
} |
6901
|
3264485
|
|
|
|
|
if (!keepcopy) |
6902
|
|
|
|
|
|
goto warn_undef; |
6903
|
|
|
|
|
|
} |
6904
|
|
|
|
|
|
|
6905
|
|
|
|
|
|
/* Some of this code was originally in C in F */ |
6906
|
83266
|
|
|
|
|
switch (paren) { |
6907
|
|
|
|
|
|
case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ |
6908
|
|
|
|
|
|
case RX_BUFF_IDX_PREMATCH: /* $` */ |
6909
|
83266
|
|
|
|
|
if (rx->offs[0].start != -1) { |
6910
|
3348249
|
|
|
|
|
i = rx->offs[0].start; |
6911
|
3348239
|
|
|
|
|
if (i > 0) { |
6912
|
|
|
|
|
|
s1 = 0; |
6913
|
|
|
|
|
|
t1 = i; |
6914
|
|
|
|
|
|
goto getlen; |
6915
|
|
|
|
|
|
} |
6916
|
|
|
|
|
|
} |
6917
|
|
|
|
|
|
return 0; |
6918
|
|
|
|
|
|
|
6919
|
|
|
|
|
|
case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ |
6920
|
|
|
|
|
|
case RX_BUFF_IDX_POSTMATCH: /* $' */ |
6921
|
3348249
|
|
|
|
|
if (rx->offs[0].end != -1) { |
6922
|
3492349
|
|
|
|
|
i = rx->sublen - rx->offs[0].end; |
6923
|
4439715
|
|
|
|
|
if (i > 0) { |
6924
|
4439715
|
|
|
|
|
s1 = rx->offs[0].end; |
6925
|
8879430
|
|
|
|
|
t1 = rx->sublen; |
6926
|
12710
|
|
|
|
|
goto getlen; |
6927
|
|
|
|
|
|
} |
6928
|
|
|
|
|
|
} |
6929
|
|
|
|
|
|
return 0; |
6930
|
|
|
|
|
|
|
6931
|
|
|
|
|
|
default: /* $& / ${^MATCH}, $1, $2, ... */ |
6932
|
12710
|
|
|
|
|
if (paren <= (I32)rx->nparens && |
6933
|
4
|
|
|
|
|
(s1 = rx->offs[paren].start) != -1 && |
6934
|
0
|
|
|
|
|
(t1 = rx->offs[paren].end) != -1) |
6935
|
|
|
|
|
|
{ |
6936
|
4
|
|
|
|
|
i = t1 - s1; |
6937
|
0
|
|
|
|
|
goto getlen; |
6938
|
|
|
|
|
|
} else { |
6939
|
|
|
|
|
|
warn_undef: |
6940
|
0
|
|
|
|
|
if (ckWARN(WARN_UNINITIALIZED)) |
6941
|
0
|
|
|
|
|
report_uninit((const SV *)sv); |
6942
|
|
|
|
|
|
return 0; |
6943
|
|
|
|
|
|
} |
6944
|
|
|
|
|
|
} |
6945
|
|
|
|
|
|
getlen: |
6946
|
0
|
|
|
|
|
if (i > 0 && RXp_MATCH_UTF8(rx)) { |
6947
|
0
|
|
|
|
|
const char * const s = rx->subbeg - rx->suboffset + s1; |
6948
|
|
|
|
|
|
const U8 *ep; |
6949
|
|
|
|
|
|
STRLEN el; |
6950
|
|
|
|
|
|
|
6951
|
0
|
|
|
|
|
i = t1 - s1; |
6952
|
4427005
|
|
|
|
|
if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) |
6953
|
3397265
|
|
|
|
|
i = el; |
6954
|
|
|
|
|
|
} |
6955
|
3274319
|
|
|
|
|
return i; |
6956
|
|
|
|
|
|
} |
6957
|
|
|
|
|
|
|
6958
|
|
|
|
|
|
SV* |
6959
|
3274419
|
|
|
|
|
Perl_reg_qr_package(pTHX_ REGEXP * const rx) |
6960
|
|
|
|
|
|
{ |
6961
|
123046
|
|
|
|
|
PERL_ARGS_ASSERT_REG_QR_PACKAGE; |
6962
|
|
|
|
|
|
PERL_UNUSED_ARG(rx); |
6963
|
|
|
|
|
|
if (0) |
6964
|
|
|
|
|
|
return NULL; |
6965
|
|
|
|
|
|
else |
6966
|
123046
|
|
|
|
|
return newSVpvs("Regexp"); |
6967
|
|
|
|
|
|
} |
6968
|
|
|
|
|
|
|
6969
|
|
|
|
|
|
/* Scans the name of a named buffer from the pattern. |
6970
|
|
|
|
|
|
* If flags is REG_RSN_RETURN_NULL returns null. |
6971
|
|
|
|
|
|
* If flags is REG_RSN_RETURN_NAME returns an SV* containing the name |
6972
|
|
|
|
|
|
* If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding |
6973
|
|
|
|
|
|
* to the parsed name as looked up in the RExC_paren_names hash. |
6974
|
|
|
|
|
|
* If there is an error throws a vFAIL().. type exception. |
6975
|
|
|
|
|
|
*/ |
6976
|
|
|
|
|
|
|
6977
|
|
|
|
|
|
#define REG_RSN_RETURN_NULL 0 |
6978
|
|
|
|
|
|
#define REG_RSN_RETURN_NAME 1 |
6979
|
|
|
|
|
|
#define REG_RSN_RETURN_DATA 2 |
6980
|
|
|
|
|
|
|
6981
|
|
|
|
|
|
STATIC SV* |
6982
|
4427005
|
|
|
|
|
S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) |
6983
|
|
|
|
|
|
{ |
6984
|
4427005
|
|
|
|
|
char *name_start = RExC_parse; |
6985
|
|
|
|
|
|
|
6986
|
75944686
|
|
|
|
|
PERL_ARGS_ASSERT_REG_SCAN_NAME; |
6987
|
|
|
|
|
|
|
6988
|
69347018
|
|
|
|
|
if (isIDFIRST_lazy_if(RExC_parse, UTF)) { |
6989
|
|
|
|
|
|
/* skip IDFIRST by using do...while */ |
6990
|
6042912
|
|
|
|
|
if (UTF) |
6991
|
|
|
|
|
|
do { |
6992
|
1481680
|
|
|
|
|
RExC_parse += UTF8SKIP(RExC_parse); |
6993
|
63899850
|
|
|
|
|
} while (isWORDCHAR_utf8((U8*)RExC_parse)); |
6994
|
|
|
|
|
|
else |
6995
|
|
|
|
|
|
do { |
6996
|
63899850
|
|
|
|
|
RExC_parse++; |
6997
|
5447168
|
|
|
|
|
} while (isWORDCHAR(*RExC_parse)); |
6998
|
|
|
|
|
|
} else { |
6999
|
5447168
|
|
|
|
|
RExC_parse++; /* so the <- from the vFAIL is after the offending character */ |
7000
|
69347018
|
|
|
|
|
vFAIL("Group name must start with a non-digit word character"); |
7001
|
|
|
|
|
|
} |
7002
|
35958424
|
|
|
|
|
if ( flags ) { |
7003
|
35958424
|
|
|
|
|
SV* sv_name |
7004
|
2579080
|
|
|
|
|
= newSVpvn_flags(name_start, (int)(RExC_parse - name_start), |
7005
|
|
|
|
|
|
SVs_TEMP | (UTF ? SVf_UTF8 : 0)); |
7006
|
33388594
|
|
|
|
|
if ( flags == REG_RSN_RETURN_NAME) |
7007
|
|
|
|
|
|
return sv_name; |
7008
|
2192632
|
|
|
|
|
else if (flags==REG_RSN_RETURN_DATA) { |
7009
|
|
|
|
|
|
HE *he_str = NULL; |
7010
|
|
|
|
|
|
SV *sv_dat = NULL; |
7011
|
51496145
|
|
|
|
|
if ( ! sv_name ) /* should not happen*/ |
7012
|
4427005
|
|
|
|
|
Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); |
7013
|
3773243
|
|
|
|
|
if (RExC_paren_names) |
7014
|
1467660
|
|
|
|
|
he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); |
7015
|
4427005
|
|
|
|
|
if ( he_str ) |
7016
|
1467660
|
|
|
|
|
sv_dat = HeVAL(he_str); |
7017
|
4427005
|
|
|
|
|
if ( ! sv_dat ) |
7018
|
2434830
|
|
|
|
|
vFAIL("Reference to nonexistent named group"); |
7019
|
|
|
|
|
|
return sv_dat; |
7020
|
|
|
|
|
|
} |
7021
|
|
|
|
|
|
else { |
7022
|
4427005
|
|
|
|
|
Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", |
7023
|
|
|
|
|
|
(unsigned long) flags); |
7024
|
|
|
|
|
|
} |
7025
|
|
|
|
|
|
assert(0); /* NOT REACHED */ |
7026
|
|
|
|
|
|
} |
7027
|
|
|
|
|
|
return NULL; |
7028
|
|
|
|
|
|
} |
7029
|
|
|
|
|
|
|
7030
|
|
|
|
|
|
#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ |
7031
|
|
|
|
|
|
int rem=(int)(RExC_end - RExC_parse); \ |
7032
|
|
|
|
|
|
int cut; \ |
7033
|
|
|
|
|
|
int num; \ |
7034
|
|
|
|
|
|
int iscut=0; \ |
7035
|
|
|
|
|
|
if (rem>10) { \ |
7036
|
|
|
|
|
|
rem=10; \ |
7037
|
|
|
|
|
|
iscut=1; \ |
7038
|
|
|
|
|
|
} \ |
7039
|
|
|
|
|
|
cut=10-rem; \ |
7040
|
|
|
|
|
|
if (RExC_lastparse!=RExC_parse) \ |
7041
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ |
7042
|
|
|
|
|
|
rem, RExC_parse, \ |
7043
|
|
|
|
|
|
cut + 4, \ |
7044
|
|
|
|
|
|
iscut ? "..." : "<" \ |
7045
|
|
|
|
|
|
); \ |
7046
|
|
|
|
|
|
else \ |
7047
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,"%16s",""); \ |
7048
|
|
|
|
|
|
\ |
7049
|
|
|
|
|
|
if (SIZE_ONLY) \ |
7050
|
|
|
|
|
|
num = RExC_size + 1; \ |
7051
|
|
|
|
|
|
else \ |
7052
|
|
|
|
|
|
num=REG_NODE_NUM(RExC_emit); \ |
7053
|
|
|
|
|
|
if (RExC_lastnum!=num) \ |
7054
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,"|%4d",num); \ |
7055
|
|
|
|
|
|
else \ |
7056
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,"|%4s",""); \ |
7057
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ |
7058
|
|
|
|
|
|
(int)((depth*2)), "", \ |
7059
|
|
|
|
|
|
(funcname) \ |
7060
|
|
|
|
|
|
); \ |
7061
|
|
|
|
|
|
RExC_lastnum=num; \ |
7062
|
|
|
|
|
|
RExC_lastparse=RExC_parse; \ |
7063
|
|
|
|
|
|
}) |
7064
|
|
|
|
|
|
|
7065
|
|
|
|
|
|
|
7066
|
|
|
|
|
|
|
7067
|
|
|
|
|
|
#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ |
7068
|
|
|
|
|
|
DEBUG_PARSE_MSG((funcname)); \ |
7069
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,"%4s","\n"); \ |
7070
|
|
|
|
|
|
}) |
7071
|
|
|
|
|
|
#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ |
7072
|
|
|
|
|
|
DEBUG_PARSE_MSG((funcname)); \ |
7073
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,fmt "\n",args); \ |
7074
|
|
|
|
|
|
}) |
7075
|
|
|
|
|
|
|
7076
|
|
|
|
|
|
/* This section of code defines the inversion list object and its methods. The |
7077
|
|
|
|
|
|
* interfaces are highly subject to change, so as much as possible is static to |
7078
|
|
|
|
|
|
* this file. An inversion list is here implemented as a malloc'd C UV array |
7079
|
|
|
|
|
|
* as an SVt_INVLIST scalar. |
7080
|
|
|
|
|
|
* |
7081
|
|
|
|
|
|
* An inversion list for Unicode is an array of code points, sorted by ordinal |
7082
|
|
|
|
|
|
* number. The zeroth element is the first code point in the list. The 1th |
7083
|
|
|
|
|
|
* element is the first element beyond that not in the list. In other words, |
7084
|
|
|
|
|
|
* the first range is |
7085
|
|
|
|
|
|
* invlist[0]..(invlist[1]-1) |
7086
|
|
|
|
|
|
* The other ranges follow. Thus every element whose index is divisible by two |
7087
|
|
|
|
|
|
* marks the beginning of a range that is in the list, and every element not |
7088
|
|
|
|
|
|
* divisible by two marks the beginning of a range not in the list. A single |
7089
|
|
|
|
|
|
* element inversion list that contains the single code point N generally |
7090
|
|
|
|
|
|
* consists of two elements |
7091
|
|
|
|
|
|
* invlist[0] == N |
7092
|
|
|
|
|
|
* invlist[1] == N+1 |
7093
|
|
|
|
|
|
* (The exception is when N is the highest representable value on the |
7094
|
|
|
|
|
|
* machine, in which case the list containing just it would be a single |
7095
|
|
|
|
|
|
* element, itself. By extension, if the last range in the list extends to |
7096
|
|
|
|
|
|
* infinity, then the first element of that range will be in the inversion list |
7097
|
|
|
|
|
|
* at a position that is divisible by two, and is the final element in the |
7098
|
|
|
|
|
|
* list.) |
7099
|
|
|
|
|
|
* Taking the complement (inverting) an inversion list is quite simple, if the |
7100
|
|
|
|
|
|
* first element is 0, remove it; otherwise add a 0 element at the beginning. |
7101
|
|
|
|
|
|
* This implementation reserves an element at the beginning of each inversion |
7102
|
|
|
|
|
|
* list to always contain 0; there is an additional flag in the header which |
7103
|
|
|
|
|
|
* indicates if the list begins at the 0, or is offset to begin at the next |
7104
|
|
|
|
|
|
* element. |
7105
|
|
|
|
|
|
* |
7106
|
|
|
|
|
|
* More about inversion lists can be found in "Unicode Demystified" |
7107
|
|
|
|
|
|
* Chapter 13 by Richard Gillam, published by Addison-Wesley. |
7108
|
|
|
|
|
|
* More will be coming when functionality is added later. |
7109
|
|
|
|
|
|
* |
7110
|
|
|
|
|
|
* The inversion list data structure is currently implemented as an SV pointing |
7111
|
|
|
|
|
|
* to an array of UVs that the SV thinks are bytes. This allows us to have an |
7112
|
|
|
|
|
|
* array of UV whose memory management is automatically handled by the existing |
7113
|
|
|
|
|
|
* facilities for SV's. |
7114
|
|
|
|
|
|
* |
7115
|
|
|
|
|
|
* Some of the methods should always be private to the implementation, and some |
7116
|
|
|
|
|
|
* should eventually be made public */ |
7117
|
|
|
|
|
|
|
7118
|
|
|
|
|
|
/* The header definitions are in F */ |
7119
|
|
|
|
|
|
|
7120
|
|
|
|
|
|
PERL_STATIC_INLINE UV* |
7121
|
|
|
|
|
|
S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) |
7122
|
|
|
|
|
|
{ |
7123
|
|
|
|
|
|
/* Returns a pointer to the first element in the inversion list's array. |
7124
|
|
|
|
|
|
* This is called upon initialization of an inversion list. Where the |
7125
|
|
|
|
|
|
* array begins depends on whether the list has the code point U+0000 in it |
7126
|
|
|
|
|
|
* or not. The other parameter tells it whether the code that follows this |
7127
|
|
|
|
|
|
* call is about to put a 0 in the inversion list or not. The first |
7128
|
|
|
|
|
|
* element is either the element reserved for 0, if TRUE, or the element |
7129
|
|
|
|
|
|
* after it, if FALSE */ |
7130
|
|
|
|
|
|
|
7131
|
|
|
|
|
|
bool* offset = get_invlist_offset_addr(invlist); |
7132
|
|
|
|
|
|
UV* zero_addr = (UV *) SvPVX(invlist); |
7133
|
|
|
|
|
|
|
7134
|
|
|
|
|
|
PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; |
7135
|
|
|
|
|
|
|
7136
|
|
|
|
|
|
/* Must be empty */ |
7137
|
|
|
|
|
|
assert(! _invlist_len(invlist)); |
7138
|
|
|
|
|
|
|
7139
|
|
|
|
|
|
*zero_addr = 0; |
7140
|
|
|
|
|
|
|
7141
|
|
|
|
|
|
/* 1^1 = 0; 1^0 = 1 */ |
7142
|
|
|
|
|
|
*offset = 1 ^ will_have_0; |
7143
|
|
|
|
|
|
return zero_addr + *offset; |
7144
|
|
|
|
|
|
} |
7145
|
|
|
|
|
|
|
7146
|
|
|
|
|
|
PERL_STATIC_INLINE UV* |
7147
|
1467664
|
|
|
|
|
S_invlist_array(pTHX_ SV* const invlist) |
7148
|
|
|
|
|
|
{ |
7149
|
|
|
|
|
|
/* Returns the pointer to the inversion list's array. Every time the |
7150
|
|
|
|
|
|
* length changes, this needs to be called in case malloc or realloc moved |
7151
|
|
|
|
|
|
* it */ |
7152
|
|
|
|
|
|
|
7153
|
813902
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_ARRAY; |
7154
|
|
|
|
|
|
|
7155
|
|
|
|
|
|
/* Must not be empty. If these fail, you probably didn't check for |
7156
|
|
|
|
|
|
* being non-zero before trying to get the array */ |
7157
|
653766
|
|
|
|
|
assert(_invlist_len(invlist)); |
7158
|
|
|
|
|
|
|
7159
|
|
|
|
|
|
/* The very first element always contains zero, The array begins either |
7160
|
|
|
|
|
|
* there, or if the inversion list is offset, at the element after it. |
7161
|
|
|
|
|
|
* The offset header field determines which; it contains 0 or 1 to indicate |
7162
|
|
|
|
|
|
* how much additionally to add */ |
7163
|
653766
|
|
|
|
|
assert(0 == *(SvPVX(invlist))); |
7164
|
4427009
|
|
|
|
|
return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); |
7165
|
|
|
|
|
|
} |
7166
|
|
|
|
|
|
|
7167
|
|
|
|
|
|
PERL_STATIC_INLINE void |
7168
|
|
|
|
|
|
S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) |
7169
|
|
|
|
|
|
{ |
7170
|
|
|
|
|
|
/* Sets the current number of elements stored in the inversion list. |
7171
|
|
|
|
|
|
* Updates SvCUR correspondingly */ |
7172
|
|
|
|
|
|
|
7173
|
3359727
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_SET_LEN; |
7174
|
|
|
|
|
|
|
7175
|
4427005
|
|
|
|
|
assert(SvTYPE(invlist) == SVt_INVLIST); |
7176
|
|
|
|
|
|
|
7177
|
4433360
|
|
|
|
|
SvCUR_set(invlist, |
7178
|
|
|
|
|
|
(len == 0) |
7179
|
|
|
|
|
|
? 0 |
7180
|
|
|
|
|
|
: TO_INTERNAL_SIZE(len + offset)); |
7181
|
11342008
|
|
|
|
|
assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); |
7182
|
|
|
|
|
|
} |
7183
|
|
|
|
|
|
|
7184
|
|
|
|
|
|
PERL_STATIC_INLINE IV* |
7185
|
|
|
|
|
|
S_get_invlist_previous_index_addr(pTHX_ SV* invlist) |
7186
|
|
|
|
|
|
{ |
7187
|
|
|
|
|
|
/* Return the address of the IV that is reserved to hold the cached index |
7188
|
|
|
|
|
|
* */ |
7189
|
|
|
|
|
|
|
7190
|
11342008
|
|
|
|
|
PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; |
7191
|
|
|
|
|
|
|
7192
|
3747235
|
|
|
|
|
assert(SvTYPE(invlist) == SVt_INVLIST); |
7193
|
|
|
|
|
|
|
7194
|
7594773
|
|
|
|
|
return &(((XINVLIST*) SvANY(invlist))->prev_index); |
7195
|
|
|
|
|
|
} |
7196
|
|
|
|
|
|
|
7197
|
|
|
|
|
|
PERL_STATIC_INLINE IV |
7198
|
|
|
|
|
|
S_invlist_previous_index(pTHX_ SV* const invlist) |
7199
|
|
|
|
|
|
{ |
7200
|
|
|
|
|
|
/* Returns cached index of previous search */ |
7201
|
|
|
|
|
|
|
7202
|
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; |
7203
|
|
|
|
|
|
|
7204
|
|
|
|
|
|
return *get_invlist_previous_index_addr(invlist); |
7205
|
|
|
|
|
|
} |
7206
|
|
|
|
|
|
|
7207
|
|
|
|
|
|
PERL_STATIC_INLINE void |
7208
|
|
|
|
|
|
S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index) |
7209
|
|
|
|
|
|
{ |
7210
|
|
|
|
|
|
/* Caches for later retrieval */ |
7211
|
|
|
|
|
|
|
7212
|
11342008
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; |
7213
|
|
|
|
|
|
|
7214
|
|
|
|
|
|
assert(index == 0 || index < (int) _invlist_len(invlist)); |
7215
|
|
|
|
|
|
|
7216
|
7589037
|
|
|
|
|
*get_invlist_previous_index_addr(invlist) = index; |
7217
|
|
|
|
|
|
} |
7218
|
|
|
|
|
|
|
7219
|
|
|
|
|
|
PERL_STATIC_INLINE UV |
7220
|
|
|
|
|
|
S_invlist_max(pTHX_ SV* const invlist) |
7221
|
|
|
|
|
|
{ |
7222
|
|
|
|
|
|
/* Returns the maximum number of elements storable in the inversion list's |
7223
|
|
|
|
|
|
* array, without having to realloc() */ |
7224
|
|
|
|
|
|
|
7225
|
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_MAX; |
7226
|
|
|
|
|
|
|
7227
|
|
|
|
|
|
assert(SvTYPE(invlist) == SVt_INVLIST); |
7228
|
|
|
|
|
|
|
7229
|
|
|
|
|
|
/* Assumes worst case, in which the 0 element is not counted in the |
7230
|
|
|
|
|
|
* inversion list, so subtracts 1 for that */ |
7231
|
|
|
|
|
|
return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ |
7232
|
|
|
|
|
|
? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 |
7233
|
|
|
|
|
|
: FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; |
7234
|
|
|
|
|
|
} |
7235
|
|
|
|
|
|
|
7236
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
7237
|
|
|
|
|
|
SV* |
7238
|
|
|
|
|
|
Perl__new_invlist(pTHX_ IV initial_size) |
7239
|
|
|
|
|
|
{ |
7240
|
|
|
|
|
|
|
7241
|
|
|
|
|
|
/* Return a pointer to a newly constructed inversion list, with enough |
7242
|
|
|
|
|
|
* space to store 'initial_size' elements. If that number is negative, a |
7243
|
|
|
|
|
|
* system default is used instead */ |
7244
|
|
|
|
|
|
|
7245
|
|
|
|
|
|
SV* new_list; |
7246
|
|
|
|
|
|
|
7247
|
|
|
|
|
|
if (initial_size < 0) { |
7248
|
|
|
|
|
|
initial_size = 10; |
7249
|
|
|
|
|
|
} |
7250
|
|
|
|
|
|
|
7251
|
|
|
|
|
|
/* Allocate the initial space */ |
7252
|
|
|
|
|
|
new_list = newSV_type(SVt_INVLIST); |
7253
|
|
|
|
|
|
|
7254
|
|
|
|
|
|
/* First 1 is in case the zero element isn't in the list; second 1 is for |
7255
|
|
|
|
|
|
* trailing NUL */ |
7256
|
|
|
|
|
|
SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); |
7257
|
|
|
|
|
|
invlist_set_len(new_list, 0, 0); |
7258
|
|
|
|
|
|
|
7259
|
|
|
|
|
|
/* Force iterinit() to be used to get iteration to work */ |
7260
|
|
|
|
|
|
*get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; |
7261
|
|
|
|
|
|
|
7262
|
|
|
|
|
|
*get_invlist_previous_index_addr(new_list) = 0; |
7263
|
|
|
|
|
|
|
7264
|
|
|
|
|
|
return new_list; |
7265
|
|
|
|
|
|
} |
7266
|
|
|
|
|
|
#endif |
7267
|
|
|
|
|
|
|
7268
|
|
|
|
|
|
STATIC SV* |
7269
|
11327036
|
|
|
|
|
S__new_invlist_C_array(pTHX_ const UV* const list) |
7270
|
|
|
|
|
|
{ |
7271
|
|
|
|
|
|
/* Return a pointer to a newly constructed inversion list, initialized to |
7272
|
|
|
|
|
|
* point to , which has to be in the exact correct inversion list |
7273
|
|
|
|
|
|
* form, including internal fields. Thus this is a dangerous routine that |
7274
|
|
|
|
|
|
* should not be used in the wrong hands. The passed in 'list' contains |
7275
|
|
|
|
|
|
* several header fields at the beginning that are not part of the |
7276
|
|
|
|
|
|
* inversion list body proper */ |
7277
|
|
|
|
|
|
|
7278
|
8198205
|
|
|
|
|
const STRLEN length = (STRLEN) list[0]; |
7279
|
8198205
|
|
|
|
|
const UV version_id = list[1]; |
7280
|
3143803
|
|
|
|
|
const bool offset = cBOOL(list[2]); |
7281
|
|
|
|
|
|
#define HEADER_LENGTH 3 |
7282
|
|
|
|
|
|
/* If any of the above changes in any way, you must change HEADER_LENGTH |
7283
|
|
|
|
|
|
* (if appropriate) and regenerate INVLIST_VERSION_ID by running |
7284
|
|
|
|
|
|
* perl -E 'say int(rand 2**31-1)' |
7285
|
|
|
|
|
|
*/ |
7286
|
|
|
|
|
|
#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and |
7287
|
|
|
|
|
|
data structure type, so that one being |
7288
|
|
|
|
|
|
passed in can be validated to be an |
7289
|
|
|
|
|
|
inversion list of the correct vintage. |
7290
|
|
|
|
|
|
*/ |
7291
|
|
|
|
|
|
|
7292
|
3143803
|
|
|
|
|
SV* invlist = newSV_type(SVt_INVLIST); |
7293
|
|
|
|
|
|
|
7294
|
3143803
|
|
|
|
|
PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; |
7295
|
|
|
|
|
|
|
7296
|
3143803
|
|
|
|
|
if (version_id != INVLIST_VERSION_ID) { |
7297
|
7306085
|
|
|
|
|
Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); |
7298
|
|
|
|
|
|
} |
7299
|
|
|
|
|
|
|
7300
|
|
|
|
|
|
/* The generated array passed in includes header elements that aren't part |
7301
|
|
|
|
|
|
* of the list proper, so start it just after them */ |
7302
|
1641696
|
|
|
|
|
SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); |
7303
|
|
|
|
|
|
|
7304
|
1641696
|
|
|
|
|
SvLEN_set(invlist, 0); /* Means we own the contents, and the system |
7305
|
|
|
|
|
|
shouldn't touch it */ |
7306
|
|
|
|
|
|
|
7307
|
980910
|
|
|
|
|
*(get_invlist_offset_addr(invlist)) = offset; |
7308
|
|
|
|
|
|
|
7309
|
|
|
|
|
|
/* The 'length' passed to us is the physical number of elements in the |
7310
|
|
|
|
|
|
* inversion list. But if there is an offset the logical number is one |
7311
|
|
|
|
|
|
* less than that */ |
7312
|
980910
|
|
|
|
|
invlist_set_len(invlist, length - offset, offset); |
7313
|
|
|
|
|
|
|
7314
|
|
|
|
|
|
invlist_set_previous_index(invlist, 0); |
7315
|
|
|
|
|
|
|
7316
|
|
|
|
|
|
/* Initialize the iteration pointer. */ |
7317
|
124
|
|
|
|
|
invlist_iterfinish(invlist); |
7318
|
|
|
|
|
|
|
7319
|
980972
|
|
|
|
|
return invlist; |
7320
|
|
|
|
|
|
} |
7321
|
|
|
|
|
|
|
7322
|
|
|
|
|
|
STATIC void |
7323
|
|
|
|
|
|
S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) |
7324
|
|
|
|
|
|
{ |
7325
|
|
|
|
|
|
/* Grow the maximum size of an inversion list */ |
7326
|
|
|
|
|
|
|
7327
|
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_EXTEND; |
7328
|
|
|
|
|
|
|
7329
|
|
|
|
|
|
assert(SvTYPE(invlist) == SVt_INVLIST); |
7330
|
|
|
|
|
|
|
7331
|
|
|
|
|
|
/* Add one to account for the zero element at the beginning which may not |
7332
|
|
|
|
|
|
* be counted by the calling parameters */ |
7333
|
|
|
|
|
|
SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); |
7334
|
|
|
|
|
|
} |
7335
|
|
|
|
|
|
|
7336
|
|
|
|
|
|
PERL_STATIC_INLINE void |
7337
|
|
|
|
|
|
S_invlist_trim(pTHX_ SV* const invlist) |
7338
|
|
|
|
|
|
{ |
7339
|
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_TRIM; |
7340
|
|
|
|
|
|
|
7341
|
|
|
|
|
|
assert(SvTYPE(invlist) == SVt_INVLIST); |
7342
|
|
|
|
|
|
|
7343
|
|
|
|
|
|
/* Change the length of the inversion list to how many entries it currently |
7344
|
|
|
|
|
|
* has */ |
7345
|
|
|
|
|
|
SvPV_shrink_to_cur((SV *) invlist); |
7346
|
|
|
|
|
|
} |
7347
|
|
|
|
|
|
|
7348
|
|
|
|
|
|
#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) |
7349
|
|
|
|
|
|
|
7350
|
|
|
|
|
|
STATIC void |
7351
|
|
|
|
|
|
S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) |
7352
|
|
|
|
|
|
{ |
7353
|
|
|
|
|
|
/* Subject to change or removal. Append the range from 'start' to 'end' at |
7354
|
|
|
|
|
|
* the end of the inversion list. The range must be above any existing |
7355
|
|
|
|
|
|
* ones. */ |
7356
|
|
|
|
|
|
|
7357
|
|
|
|
|
|
UV* array; |
7358
|
|
|
|
|
|
UV max = invlist_max(invlist); |
7359
|
|
|
|
|
|
UV len = _invlist_len(invlist); |
7360
|
|
|
|
|
|
bool offset; |
7361
|
|
|
|
|
|
|
7362
|
|
|
|
|
|
PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; |
7363
|
|
|
|
|
|
|
7364
|
|
|
|
|
|
if (len == 0) { /* Empty lists must be initialized */ |
7365
|
|
|
|
|
|
offset = start != 0; |
7366
|
|
|
|
|
|
array = _invlist_array_init(invlist, ! offset); |
7367
|
|
|
|
|
|
} |
7368
|
|
|
|
|
|
else { |
7369
|
|
|
|
|
|
/* Here, the existing list is non-empty. The current max entry in the |
7370
|
|
|
|
|
|
* list is generally the first value not in the set, except when the |
7371
|
|
|
|
|
|
* set extends to the end of permissible values, in which case it is |
7372
|
|
|
|
|
|
* the first entry in that final set, and so this call is an attempt to |
7373
|
|
|
|
|
|
* append out-of-order */ |
7374
|
|
|
|
|
|
|
7375
|
|
|
|
|
|
UV final_element = len - 1; |
7376
|
|
|
|
|
|
array = invlist_array(invlist); |
7377
|
|
|
|
|
|
if (array[final_element] > start |
7378
|
|
|
|
|
|
|| ELEMENT_RANGE_MATCHES_INVLIST(final_element)) |
7379
|
|
|
|
|
|
{ |
7380
|
|
|
|
|
|
Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", |
7381
|
|
|
|
|
|
array[final_element], start, |
7382
|
|
|
|
|
|
ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); |
7383
|
|
|
|
|
|
} |
7384
|
|
|
|
|
|
|
7385
|
|
|
|
|
|
/* Here, it is a legal append. If the new range begins with the first |
7386
|
|
|
|
|
|
* value not in the set, it is extending the set, so the new first |
7387
|
|
|
|
|
|
* value not in the set is one greater than the newly extended range. |
7388
|
|
|
|
|
|
* */ |
7389
|
|
|
|
|
|
offset = *get_invlist_offset_addr(invlist); |
7390
|
|
|
|
|
|
if (array[final_element] == start) { |
7391
|
|
|
|
|
|
if (end != UV_MAX) { |
7392
|
|
|
|
|
|
array[final_element] = end + 1; |
7393
|
|
|
|
|
|
} |
7394
|
|
|
|
|
|
else { |
7395
|
|
|
|
|
|
/* But if the end is the maximum representable on the machine, |
7396
|
|
|
|
|
|
* just let the range that this would extend to have no end */ |
7397
|
|
|
|
|
|
invlist_set_len(invlist, len - 1, offset); |
7398
|
|
|
|
|
|
} |
7399
|
|
|
|
|
|
return; |
7400
|
|
|
|
|
|
} |
7401
|
|
|
|
|
|
} |
7402
|
|
|
|
|
|
|
7403
|
|
|
|
|
|
/* Here the new range doesn't extend any existing set. Add it */ |
7404
|
|
|
|
|
|
|
7405
|
|
|
|
|
|
len += 2; /* Includes an element each for the start and end of range */ |
7406
|
|
|
|
|
|
|
7407
|
|
|
|
|
|
/* If wll overflow the existing space, extend, which may cause the array to |
7408
|
|
|
|
|
|
* be moved */ |
7409
|
|
|
|
|
|
if (max < len) { |
7410
|
|
|
|
|
|
invlist_extend(invlist, len); |
7411
|
|
|
|
|
|
|
7412
|
|
|
|
|
|
/* Have to set len here to avoid assert failure in invlist_array() */ |
7413
|
|
|
|
|
|
invlist_set_len(invlist, len, offset); |
7414
|
|
|
|
|
|
|
7415
|
|
|
|
|
|
array = invlist_array(invlist); |
7416
|
|
|
|
|
|
} |
7417
|
|
|
|
|
|
else { |
7418
|
|
|
|
|
|
invlist_set_len(invlist, len, offset); |
7419
|
|
|
|
|
|
} |
7420
|
|
|
|
|
|
|
7421
|
|
|
|
|
|
/* The next item on the list starts the range, the one after that is |
7422
|
|
|
|
|
|
* one past the new range. */ |
7423
|
|
|
|
|
|
array[len - 2] = start; |
7424
|
|
|
|
|
|
if (end != UV_MAX) { |
7425
|
|
|
|
|
|
array[len - 1] = end + 1; |
7426
|
|
|
|
|
|
} |
7427
|
|
|
|
|
|
else { |
7428
|
|
|
|
|
|
/* But if the end is the maximum representable on the machine, just let |
7429
|
|
|
|
|
|
* the range have no end */ |
7430
|
|
|
|
|
|
invlist_set_len(invlist, len - 1, offset); |
7431
|
|
|
|
|
|
} |
7432
|
|
|
|
|
|
} |
7433
|
|
|
|
|
|
|
7434
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
7435
|
|
|
|
|
|
|
7436
|
|
|
|
|
|
IV |
7437
|
|
|
|
|
|
Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) |
7438
|
|
|
|
|
|
{ |
7439
|
|
|
|
|
|
/* Searches the inversion list for the entry that contains the input code |
7440
|
|
|
|
|
|
* point . If is not in the list, -1 is returned. Otherwise, the |
7441
|
|
|
|
|
|
* return value is the index into the list's array of the range that |
7442
|
|
|
|
|
|
* contains */ |
7443
|
|
|
|
|
|
|
7444
|
|
|
|
|
|
IV low = 0; |
7445
|
|
|
|
|
|
IV mid; |
7446
|
|
|
|
|
|
IV high = _invlist_len(invlist); |
7447
|
|
|
|
|
|
const IV highest_element = high - 1; |
7448
|
|
|
|
|
|
const UV* array; |
7449
|
|
|
|
|
|
|
7450
|
|
|
|
|
|
PERL_ARGS_ASSERT__INVLIST_SEARCH; |
7451
|
|
|
|
|
|
|
7452
|
|
|
|
|
|
/* If list is empty, return failure. */ |
7453
|
|
|
|
|
|
if (high == 0) { |
7454
|
|
|
|
|
|
return -1; |
7455
|
|
|
|
|
|
} |
7456
|
|
|
|
|
|
|
7457
|
|
|
|
|
|
/* (We can't get the array unless we know the list is non-empty) */ |
7458
|
|
|
|
|
|
array = invlist_array(invlist); |
7459
|
|
|
|
|
|
|
7460
|
|
|
|
|
|
mid = invlist_previous_index(invlist); |
7461
|
|
|
|
|
|
assert(mid >=0 && mid <= highest_element); |
7462
|
|
|
|
|
|
|
7463
|
|
|
|
|
|
/* contains the cache of the result of the previous call to this |
7464
|
|
|
|
|
|
* function (0 the first time). See if this call is for the same result, |
7465
|
|
|
|
|
|
* or if it is for mid-1. This is under the theory that calls to this |
7466
|
|
|
|
|
|
* function will often be for related code points that are near each other. |
7467
|
|
|
|
|
|
* And benchmarks show that caching gives better results. We also test |
7468
|
|
|
|
|
|
* here if the code point is within the bounds of the list. These tests |
7469
|
|
|
|
|
|
* replace others that would have had to be made anyway to make sure that |
7470
|
|
|
|
|
|
* the array bounds were not exceeded, and these give us extra information |
7471
|
|
|
|
|
|
* at the same time */ |
7472
|
|
|
|
|
|
if (cp >= array[mid]) { |
7473
|
|
|
|
|
|
if (cp >= array[highest_element]) { |
7474
|
|
|
|
|
|
return highest_element; |
7475
|
|
|
|
|
|
} |
7476
|
|
|
|
|
|
|
7477
|
|
|
|
|
|
/* Here, array[mid] <= cp < array[highest_element]. This means that |
7478
|
|
|
|
|
|
* the final element is not the answer, so can exclude it; it also |
7479
|
|
|
|
|
|
* means that is not the final element, so can refer to 'mid + 1' |
7480
|
|
|
|
|
|
* safely */ |
7481
|
|
|
|
|
|
if (cp < array[mid + 1]) { |
7482
|
|
|
|
|
|
return mid; |
7483
|
|
|
|
|
|
} |
7484
|
|
|
|
|
|
high--; |
7485
|
|
|
|
|
|
low = mid + 1; |
7486
|
|
|
|
|
|
} |
7487
|
|
|
|
|
|
else { /* cp < aray[mid] */ |
7488
|
|
|
|
|
|
if (cp < array[0]) { /* Fail if outside the array */ |
7489
|
|
|
|
|
|
return -1; |
7490
|
|
|
|
|
|
} |
7491
|
|
|
|
|
|
high = mid; |
7492
|
|
|
|
|
|
if (cp >= array[mid - 1]) { |
7493
|
|
|
|
|
|
goto found_entry; |
7494
|
|
|
|
|
|
} |
7495
|
|
|
|
|
|
} |
7496
|
|
|
|
|
|
|
7497
|
|
|
|
|
|
/* Binary search. What we are looking for is such that |
7498
|
|
|
|
|
|
* array[i] <= cp < array[i+1] |
7499
|
|
|
|
|
|
* The loop below converges on the i+1. Note that there may not be an |
7500
|
|
|
|
|
|
* (i+1)th element in the array, and things work nonetheless */ |
7501
|
|
|
|
|
|
while (low < high) { |
7502
|
|
|
|
|
|
mid = (low + high) / 2; |
7503
|
|
|
|
|
|
assert(mid <= highest_element); |
7504
|
|
|
|
|
|
if (array[mid] <= cp) { /* cp >= array[mid] */ |
7505
|
|
|
|
|
|
low = mid + 1; |
7506
|
|
|
|
|
|
|
7507
|
|
|
|
|
|
/* We could do this extra test to exit the loop early. |
7508
|
|
|
|
|
|
if (cp < array[low]) { |
7509
|
|
|
|
|
|
return mid; |
7510
|
|
|
|
|
|
} |
7511
|
|
|
|
|
|
*/ |
7512
|
|
|
|
|
|
} |
7513
|
|
|
|
|
|
else { /* cp < array[mid] */ |
7514
|
|
|
|
|
|
high = mid; |
7515
|
|
|
|
|
|
} |
7516
|
|
|
|
|
|
} |
7517
|
|
|
|
|
|
|
7518
|
|
|
|
|
|
found_entry: |
7519
|
|
|
|
|
|
high--; |
7520
|
|
|
|
|
|
invlist_set_previous_index(invlist, high); |
7521
|
|
|
|
|
|
return high; |
7522
|
|
|
|
|
|
} |
7523
|
|
|
|
|
|
|
7524
|
|
|
|
|
|
void |
7525
|
|
|
|
|
|
Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch) |
7526
|
|
|
|
|
|
{ |
7527
|
|
|
|
|
|
/* populates a swatch of a swash the same way swatch_get() does in utf8.c, |
7528
|
|
|
|
|
|
* but is used when the swash has an inversion list. This makes this much |
7529
|
|
|
|
|
|
* faster, as it uses a binary search instead of a linear one. This is |
7530
|
|
|
|
|
|
* intimately tied to that function, and perhaps should be in utf8.c, |
7531
|
|
|
|
|
|
* except it is intimately tied to inversion lists as well. It assumes |
7532
|
|
|
|
|
|
* that is all 0's on input */ |
7533
|
|
|
|
|
|
|
7534
|
|
|
|
|
|
UV current = start; |
7535
|
|
|
|
|
|
const IV len = _invlist_len(invlist); |
7536
|
|
|
|
|
|
IV i; |
7537
|
|
|
|
|
|
const UV * array; |
7538
|
|
|
|
|
|
|
7539
|
|
|
|
|
|
PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH; |
7540
|
|
|
|
|
|
|
7541
|
|
|
|
|
|
if (len == 0) { /* Empty inversion list */ |
7542
|
|
|
|
|
|
return; |
7543
|
|
|
|
|
|
} |
7544
|
|
|
|
|
|
|
7545
|
|
|
|
|
|
array = invlist_array(invlist); |
7546
|
|
|
|
|
|
|
7547
|
|
|
|
|
|
/* Find which element it is */ |
7548
|
|
|
|
|
|
i = _invlist_search(invlist, start); |
7549
|
|
|
|
|
|
|
7550
|
|
|
|
|
|
/* We populate from to */ |
7551
|
|
|
|
|
|
while (current < end) { |
7552
|
|
|
|
|
|
UV upper; |
7553
|
|
|
|
|
|
|
7554
|
|
|
|
|
|
/* The inversion list gives the results for every possible code point |
7555
|
|
|
|
|
|
* after the first one in the list. Only those ranges whose index is |
7556
|
|
|
|
|
|
* even are ones that the inversion list matches. For the odd ones, |
7557
|
|
|
|
|
|
* and if the initial code point is not in the list, we have to skip |
7558
|
|
|
|
|
|
* forward to the next element */ |
7559
|
|
|
|
|
|
if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) { |
7560
|
|
|
|
|
|
i++; |
7561
|
|
|
|
|
|
if (i >= len) { /* Finished if beyond the end of the array */ |
7562
|
|
|
|
|
|
return; |
7563
|
|
|
|
|
|
} |
7564
|
|
|
|
|
|
current = array[i]; |
7565
|
|
|
|
|
|
if (current >= end) { /* Finished if beyond the end of what we |
7566
|
|
|
|
|
|
are populating */ |
7567
|
|
|
|
|
|
if (LIKELY(end < UV_MAX)) { |
7568
|
|
|
|
|
|
return; |
7569
|
|
|
|
|
|
} |
7570
|
|
|
|
|
|
|
7571
|
|
|
|
|
|
/* We get here when the upper bound is the maximum |
7572
|
|
|
|
|
|
* representable on the machine, and we are looking for just |
7573
|
|
|
|
|
|
* that code point. Have to special case it */ |
7574
|
|
|
|
|
|
i = len; |
7575
|
|
|
|
|
|
goto join_end_of_list; |
7576
|
|
|
|
|
|
} |
7577
|
|
|
|
|
|
} |
7578
|
|
|
|
|
|
assert(current >= start); |
7579
|
|
|
|
|
|
|
7580
|
|
|
|
|
|
/* The current range ends one below the next one, except don't go past |
7581
|
|
|
|
|
|
* */ |
7582
|
|
|
|
|
|
i++; |
7583
|
|
|
|
|
|
upper = (i < len && array[i] < end) ? array[i] : end; |
7584
|
|
|
|
|
|
|
7585
|
|
|
|
|
|
/* Here we are in a range that matches. Populate a bit in the 3-bit U8 |
7586
|
|
|
|
|
|
* for each code point in it */ |
7587
|
|
|
|
|
|
for (; current < upper; current++) { |
7588
|
|
|
|
|
|
const STRLEN offset = (STRLEN)(current - start); |
7589
|
|
|
|
|
|
swatch[offset >> 3] |= 1 << (offset & 7); |
7590
|
|
|
|
|
|
} |
7591
|
|
|
|
|
|
|
7592
|
|
|
|
|
|
join_end_of_list: |
7593
|
|
|
|
|
|
|
7594
|
|
|
|
|
|
/* Quit if at the end of the list */ |
7595
|
|
|
|
|
|
if (i >= len) { |
7596
|
|
|
|
|
|
|
7597
|
|
|
|
|
|
/* But first, have to deal with the highest possible code point on |
7598
|
|
|
|
|
|
* the platform. The previous code assumes that is one |
7599
|
|
|
|
|
|
* beyond where we want to populate, but that is impossible at the |
7600
|
|
|
|
|
|
* platform's infinity, so have to handle it specially */ |
7601
|
|
|
|
|
|
if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1))) |
7602
|
|
|
|
|
|
{ |
7603
|
|
|
|
|
|
const STRLEN offset = (STRLEN)(end - start); |
7604
|
|
|
|
|
|
swatch[offset >> 3] |= 1 << (offset & 7); |
7605
|
|
|
|
|
|
} |
7606
|
|
|
|
|
|
return; |
7607
|
|
|
|
|
|
} |
7608
|
|
|
|
|
|
|
7609
|
|
|
|
|
|
/* Advance to the next range, which will be for code points not in the |
7610
|
|
|
|
|
|
* inversion list */ |
7611
|
|
|
|
|
|
current = array[i]; |
7612
|
|
|
|
|
|
} |
7613
|
|
|
|
|
|
|
7614
|
|
|
|
|
|
return; |
7615
|
|
|
|
|
|
} |
7616
|
|
|
|
|
|
|
7617
|
|
|
|
|
|
void |
7618
|
|
|
|
|
|
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output) |
7619
|
|
|
|
|
|
{ |
7620
|
|
|
|
|
|
/* Take the union of two inversion lists and point |
7621
|
|
|
|
|
|
* SHOULD BE DEFINED upon input, and if it points to one of the two lists, |
7622
|
|
|
|
|
|
* the reference count to that list will be decremented. The first list, |
7623
|
|
|
|
|
|
* , may be NULL, in which case a copy of the second list is returned. |
7624
|
|
|
|
|
|
* If is TRUE, the union is taken of the complement |
7625
|
|
|
|
|
|
* (inversion) of instead of b itself. |
7626
|
|
|
|
|
|
* |
7627
|
|
|
|
|
|
* The basis for this comes from "Unicode Demystified" Chapter 13 by |
7628
|
|
|
|
|
|
* Richard Gillam, published by Addison-Wesley, and explained at some |
7629
|
|
|
|
|
|
* length there. The preface says to incorporate its examples into your |
7630
|
|
|
|
|
|
* code at your own risk. |
7631
|
|
|
|
|
|
* |
7632
|
|
|
|
|
|
* The algorithm is like a merge sort. |
7633
|
|
|
|
|
|
* |
7634
|
|
|
|
|
|
* XXX A potential performance improvement is to keep track as we go along |
7635
|
|
|
|
|
|
* if only one of the inputs contributes to the result, meaning the other |
7636
|
|
|
|
|
|
* is a subset of that one. In that case, we can skip the final copy and |
7637
|
|
|
|
|
|
* return the larger of the input lists, but then outside code might need |
7638
|
|
|
|
|
|
* to keep track of whether to free the input list or not */ |
7639
|
|
|
|
|
|
|
7640
|
|
|
|
|
|
const UV* array_a; /* a's array */ |
7641
|
|
|
|
|
|
const UV* array_b; |
7642
|
|
|
|
|
|
UV len_a; /* length of a's array */ |
7643
|
|
|
|
|
|
UV len_b; |
7644
|
|
|
|
|
|
|
7645
|
|
|
|
|
|
SV* u; /* the resulting union */ |
7646
|
|
|
|
|
|
UV* array_u; |
7647
|
|
|
|
|
|
UV len_u; |
7648
|
|
|
|
|
|
|
7649
|
|
|
|
|
|
UV i_a = 0; /* current index into a's array */ |
7650
|
|
|
|
|
|
UV i_b = 0; |
7651
|
|
|
|
|
|
UV i_u = 0; |
7652
|
|
|
|
|
|
|
7653
|
|
|
|
|
|
/* running count, as explained in the algorithm source book; items are |
7654
|
|
|
|
|
|
* stopped accumulating and are output when the count changes to/from 0. |
7655
|
|
|
|
|
|
* The count is incremented when we start a range that's in the set, and |
7656
|
|
|
|
|
|
* decremented when we start a range that's not in the set. So its range |
7657
|
|
|
|
|
|
* is 0 to 2. Only when the count is zero is something not in the set. |
7658
|
|
|
|
|
|
*/ |
7659
|
|
|
|
|
|
UV count = 0; |
7660
|
|
|
|
|
|
|
7661
|
|
|
|
|
|
PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; |
7662
|
|
|
|
|
|
assert(a != b); |
7663
|
|
|
|
|
|
|
7664
|
|
|
|
|
|
/* If either one is empty, the union is the other one */ |
7665
|
|
|
|
|
|
if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { |
7666
|
|
|
|
|
|
if (*output == a) { |
7667
|
|
|
|
|
|
if (a != NULL) { |
7668
|
|
|
|
|
|
SvREFCNT_dec_NN(a); |
7669
|
|
|
|
|
|
} |
7670
|
|
|
|
|
|
} |
7671
|
|
|
|
|
|
if (*output != b) { |
7672
|
|
|
|
|
|
*output = invlist_clone(b); |
7673
|
|
|
|
|
|
if (complement_b) { |
7674
|
|
|
|
|
|
_invlist_invert(*output); |
7675
|
|
|
|
|
|
} |
7676
|
|
|
|
|
|
} /* else *output already = b; */ |
7677
|
|
|
|
|
|
return; |
7678
|
|
|
|
|
|
} |
7679
|
|
|
|
|
|
else if ((len_b = _invlist_len(b)) == 0) { |
7680
|
|
|
|
|
|
if (*output == b) { |
7681
|
|
|
|
|
|
SvREFCNT_dec_NN(b); |
7682
|
|
|
|
|
|
} |
7683
|
|
|
|
|
|
|
7684
|
|
|
|
|
|
/* The complement of an empty list is a list that has everything in it, |
7685
|
|
|
|
|
|
* so the union with includes everything too */ |
7686
|
|
|
|
|
|
if (complement_b) { |
7687
|
|
|
|
|
|
if (a == *output) { |
7688
|
|
|
|
|
|
SvREFCNT_dec_NN(a); |
7689
|
|
|
|
|
|
} |
7690
|
|
|
|
|
|
*output = _new_invlist(1); |
7691
|
|
|
|
|
|
_append_range_to_invlist(*output, 0, UV_MAX); |
7692
|
|
|
|
|
|
} |
7693
|
|
|
|
|
|
else if (*output != a) { |
7694
|
|
|
|
|
|
*output = invlist_clone(a); |
7695
|
|
|
|
|
|
} |
7696
|
|
|
|
|
|
/* else *output already = a; */ |
7697
|
|
|
|
|
|
return; |
7698
|
|
|
|
|
|
} |
7699
|
|
|
|
|
|
|
7700
|
|
|
|
|
|
/* Here both lists exist and are non-empty */ |
7701
|
|
|
|
|
|
array_a = invlist_array(a); |
7702
|
|
|
|
|
|
array_b = invlist_array(b); |
7703
|
|
|
|
|
|
|
7704
|
|
|
|
|
|
/* If are to take the union of 'a' with the complement of b, set it |
7705
|
|
|
|
|
|
* up so are looking at b's complement. */ |
7706
|
|
|
|
|
|
if (complement_b) { |
7707
|
|
|
|
|
|
|
7708
|
|
|
|
|
|
/* To complement, we invert: if the first element is 0, remove it. To |
7709
|
|
|
|
|
|
* do this, we just pretend the array starts one later */ |
7710
|
|
|
|
|
|
if (array_b[0] == 0) { |
7711
|
|
|
|
|
|
array_b++; |
7712
|
|
|
|
|
|
len_b--; |
7713
|
|
|
|
|
|
} |
7714
|
|
|
|
|
|
else { |
7715
|
|
|
|
|
|
|
7716
|
|
|
|
|
|
/* But if the first element is not zero, we pretend the list starts |
7717
|
|
|
|
|
|
* at the 0 that is always stored immediately before the array. */ |
7718
|
|
|
|
|
|
array_b--; |
7719
|
|
|
|
|
|
len_b++; |
7720
|
|
|
|
|
|
} |
7721
|
|
|
|
|
|
} |
7722
|
|
|
|
|
|
|
7723
|
|
|
|
|
|
/* Size the union for the worst case: that the sets are completely |
7724
|
|
|
|
|
|
* disjoint */ |
7725
|
|
|
|
|
|
u = _new_invlist(len_a + len_b); |
7726
|
|
|
|
|
|
|
7727
|
|
|
|
|
|
/* Will contain U+0000 if either component does */ |
7728
|
|
|
|
|
|
array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) |
7729
|
|
|
|
|
|
|| (len_b > 0 && array_b[0] == 0)); |
7730
|
|
|
|
|
|
|
7731
|
|
|
|
|
|
/* Go through each list item by item, stopping when exhausted one of |
7732
|
|
|
|
|
|
* them */ |
7733
|
|
|
|
|
|
while (i_a < len_a && i_b < len_b) { |
7734
|
|
|
|
|
|
UV cp; /* The element to potentially add to the union's array */ |
7735
|
|
|
|
|
|
bool cp_in_set; /* is it in the the input list's set or not */ |
7736
|
|
|
|
|
|
|
7737
|
|
|
|
|
|
/* We need to take one or the other of the two inputs for the union. |
7738
|
|
|
|
|
|
* Since we are merging two sorted lists, we take the smaller of the |
7739
|
|
|
|
|
|
* next items. In case of a tie, we take the one that is in its set |
7740
|
|
|
|
|
|
* first. If we took one not in the set first, it would decrement the |
7741
|
|
|
|
|
|
* count, possibly to 0 which would cause it to be output as ending the |
7742
|
|
|
|
|
|
* range, and the next time through we would take the same number, and |
7743
|
|
|
|
|
|
* output it again as beginning the next range. By doing it the |
7744
|
|
|
|
|
|
* opposite way, there is no possibility that the count will be |
7745
|
|
|
|
|
|
* momentarily decremented to 0, and thus the two adjoining ranges will |
7746
|
|
|
|
|
|
* be seamlessly merged. (In a tie and both are in the set or both not |
7747
|
|
|
|
|
|
* in the set, it doesn't matter which we take first.) */ |
7748
|
|
|
|
|
|
if (array_a[i_a] < array_b[i_b] |
7749
|
|
|
|
|
|
|| (array_a[i_a] == array_b[i_b] |
7750
|
|
|
|
|
|
&& ELEMENT_RANGE_MATCHES_INVLIST(i_a))) |
7751
|
|
|
|
|
|
{ |
7752
|
|
|
|
|
|
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); |
7753
|
|
|
|
|
|
cp= array_a[i_a++]; |
7754
|
|
|
|
|
|
} |
7755
|
|
|
|
|
|
else { |
7756
|
|
|
|
|
|
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); |
7757
|
|
|
|
|
|
cp = array_b[i_b++]; |
7758
|
|
|
|
|
|
} |
7759
|
|
|
|
|
|
|
7760
|
|
|
|
|
|
/* Here, have chosen which of the two inputs to look at. Only output |
7761
|
|
|
|
|
|
* if the running count changes to/from 0, which marks the |
7762
|
|
|
|
|
|
* beginning/end of a range in that's in the set */ |
7763
|
|
|
|
|
|
if (cp_in_set) { |
7764
|
|
|
|
|
|
if (count == 0) { |
7765
|
|
|
|
|
|
array_u[i_u++] = cp; |
7766
|
|
|
|
|
|
} |
7767
|
|
|
|
|
|
count++; |
7768
|
|
|
|
|
|
} |
7769
|
|
|
|
|
|
else { |
7770
|
|
|
|
|
|
count--; |
7771
|
|
|
|
|
|
if (count == 0) { |
7772
|
|
|
|
|
|
array_u[i_u++] = cp; |
7773
|
|
|
|
|
|
} |
7774
|
|
|
|
|
|
} |
7775
|
|
|
|
|
|
} |
7776
|
|
|
|
|
|
|
7777
|
|
|
|
|
|
/* Here, we are finished going through at least one of the lists, which |
7778
|
|
|
|
|
|
* means there is something remaining in at most one. We check if the list |
7779
|
|
|
|
|
|
* that hasn't been exhausted is positioned such that we are in the middle |
7780
|
|
|
|
|
|
* of a range in its set or not. (i_a and i_b point to the element beyond |
7781
|
|
|
|
|
|
* the one we care about.) If in the set, we decrement 'count'; if 0, there |
7782
|
|
|
|
|
|
* is potentially more to output. |
7783
|
|
|
|
|
|
* There are four cases: |
7784
|
|
|
|
|
|
* 1) Both weren't in their sets, count is 0, and remains 0. What's left |
7785
|
|
|
|
|
|
* in the union is entirely from the non-exhausted set. |
7786
|
|
|
|
|
|
* 2) Both were in their sets, count is 2. Nothing further should |
7787
|
|
|
|
|
|
* be output, as everything that remains will be in the exhausted |
7788
|
|
|
|
|
|
* list's set, hence in the union; decrementing to 1 but not 0 insures |
7789
|
|
|
|
|
|
* that |
7790
|
|
|
|
|
|
* 3) the exhausted was in its set, non-exhausted isn't, count is 1. |
7791
|
|
|
|
|
|
* Nothing further should be output because the union includes |
7792
|
|
|
|
|
|
* everything from the exhausted set. Not decrementing ensures that. |
7793
|
|
|
|
|
|
* 4) the exhausted wasn't in its set, non-exhausted is, count is 1; |
7794
|
|
|
|
|
|
* decrementing to 0 insures that we look at the remainder of the |
7795
|
|
|
|
|
|
* non-exhausted set */ |
7796
|
|
|
|
|
|
if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) |
7797
|
|
|
|
|
|
|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) |
7798
|
|
|
|
|
|
{ |
7799
|
|
|
|
|
|
count--; |
7800
|
|
|
|
|
|
} |
7801
|
|
|
|
|
|
|
7802
|
|
|
|
|
|
/* The final length is what we've output so far, plus what else is about to |
7803
|
|
|
|
|
|
* be output. (If 'count' is non-zero, then the input list we exhausted |
7804
|
|
|
|
|
|
* has everything remaining up to the machine's limit in its set, and hence |
7805
|
|
|
|
|
|
* in the union, so there will be no further output. */ |
7806
|
|
|
|
|
|
len_u = i_u; |
7807
|
|
|
|
|
|
if (count == 0) { |
7808
|
|
|
|
|
|
/* At most one of the subexpressions will be non-zero */ |
7809
|
|
|
|
|
|
len_u += (len_a - i_a) + (len_b - i_b); |
7810
|
|
|
|
|
|
} |
7811
|
|
|
|
|
|
|
7812
|
|
|
|
|
|
/* Set result to final length, which can change the pointer to array_u, so |
7813
|
|
|
|
|
|
* re-find it */ |
7814
|
|
|
|
|
|
if (len_u != _invlist_len(u)) { |
7815
|
|
|
|
|
|
invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); |
7816
|
|
|
|
|
|
invlist_trim(u); |
7817
|
|
|
|
|
|
array_u = invlist_array(u); |
7818
|
|
|
|
|
|
} |
7819
|
|
|
|
|
|
|
7820
|
|
|
|
|
|
/* When 'count' is 0, the list that was exhausted (if one was shorter than |
7821
|
|
|
|
|
|
* the other) ended with everything above it not in its set. That means |
7822
|
|
|
|
|
|
* that the remaining part of the union is precisely the same as the |
7823
|
|
|
|
|
|
* non-exhausted list, so can just copy it unchanged. (If both list were |
7824
|
|
|
|
|
|
* exhausted at the same time, then the operations below will be both 0.) |
7825
|
|
|
|
|
|
*/ |
7826
|
|
|
|
|
|
if (count == 0) { |
7827
|
|
|
|
|
|
IV copy_count; /* At most one will have a non-zero copy count */ |
7828
|
|
|
|
|
|
if ((copy_count = len_a - i_a) > 0) { |
7829
|
|
|
|
|
|
Copy(array_a + i_a, array_u + i_u, copy_count, UV); |
7830
|
|
|
|
|
|
} |
7831
|
|
|
|
|
|
else if ((copy_count = len_b - i_b) > 0) { |
7832
|
|
|
|
|
|
Copy(array_b + i_b, array_u + i_u, copy_count, UV); |
7833
|
|
|
|
|
|
} |
7834
|
|
|
|
|
|
} |
7835
|
|
|
|
|
|
|
7836
|
|
|
|
|
|
/* We may be removing a reference to one of the inputs */ |
7837
|
|
|
|
|
|
if (a == *output || b == *output) { |
7838
|
|
|
|
|
|
assert(! invlist_is_iterating(*output)); |
7839
|
|
|
|
|
|
SvREFCNT_dec_NN(*output); |
7840
|
|
|
|
|
|
} |
7841
|
|
|
|
|
|
|
7842
|
|
|
|
|
|
*output = u; |
7843
|
|
|
|
|
|
return; |
7844
|
|
|
|
|
|
} |
7845
|
|
|
|
|
|
|
7846
|
|
|
|
|
|
void |
7847
|
|
|
|
|
|
Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i) |
7848
|
|
|
|
|
|
{ |
7849
|
|
|
|
|
|
/* Take the intersection of two inversion lists and point to it. *i |
7850
|
|
|
|
|
|
* SHOULD BE DEFINED upon input, and if it points to one of the two lists, |
7851
|
|
|
|
|
|
* the reference count to that list will be decremented. |
7852
|
|
|
|
|
|
* If is TRUE, the result will be the intersection of |
7853
|
|
|
|
|
|
* and the complement (or inversion) of instead of directly. |
7854
|
|
|
|
|
|
* |
7855
|
|
|
|
|
|
* The basis for this comes from "Unicode Demystified" Chapter 13 by |
7856
|
|
|
|
|
|
* Richard Gillam, published by Addison-Wesley, and explained at some |
7857
|
|
|
|
|
|
* length there. The preface says to incorporate its examples into your |
7858
|
|
|
|
|
|
* code at your own risk. In fact, it had bugs |
7859
|
|
|
|
|
|
* |
7860
|
|
|
|
|
|
* The algorithm is like a merge sort, and is essentially the same as the |
7861
|
|
|
|
|
|
* union above |
7862
|
|
|
|
|
|
*/ |
7863
|
|
|
|
|
|
|
7864
|
|
|
|
|
|
const UV* array_a; /* a's array */ |
7865
|
|
|
|
|
|
const UV* array_b; |
7866
|
|
|
|
|
|
UV len_a; /* length of a's array */ |
7867
|
|
|
|
|
|
UV len_b; |
7868
|
|
|
|
|
|
|
7869
|
|
|
|
|
|
SV* r; /* the resulting intersection */ |
7870
|
|
|
|
|
|
UV* array_r; |
7871
|
|
|
|
|
|
UV len_r; |
7872
|
|
|
|
|
|
|
7873
|
|
|
|
|
|
UV i_a = 0; /* current index into a's array */ |
7874
|
|
|
|
|
|
UV i_b = 0; |
7875
|
|
|
|
|
|
UV i_r = 0; |
7876
|
|
|
|
|
|
|
7877
|
|
|
|
|
|
/* running count, as explained in the algorithm source book; items are |
7878
|
|
|
|
|
|
* stopped accumulating and are output when the count changes to/from 2. |
7879
|
|
|
|
|
|
* The count is incremented when we start a range that's in the set, and |
7880
|
|
|
|
|
|
* decremented when we start a range that's not in the set. So its range |
7881
|
|
|
|
|
|
* is 0 to 2. Only when the count is 2 is something in the intersection. |
7882
|
|
|
|
|
|
*/ |
7883
|
|
|
|
|
|
UV count = 0; |
7884
|
|
|
|
|
|
|
7885
|
|
|
|
|
|
PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; |
7886
|
|
|
|
|
|
assert(a != b); |
7887
|
|
|
|
|
|
|
7888
|
|
|
|
|
|
/* Special case if either one is empty */ |
7889
|
|
|
|
|
|
len_a = (a == NULL) ? 0 : _invlist_len(a); |
7890
|
|
|
|
|
|
if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { |
7891
|
|
|
|
|
|
|
7892
|
|
|
|
|
|
if (len_a != 0 && complement_b) { |
7893
|
|
|
|
|
|
|
7894
|
|
|
|
|
|
/* Here, 'a' is not empty, therefore from the above 'if', 'b' must |
7895
|
|
|
|
|
|
* be empty. Here, also we are using 'b's complement, which hence |
7896
|
|
|
|
|
|
* must be every possible code point. Thus the intersection is |
7897
|
|
|
|
|
|
* simply 'a'. */ |
7898
|
|
|
|
|
|
if (*i != a) { |
7899
|
|
|
|
|
|
if (*i == b) { |
7900
|
|
|
|
|
|
SvREFCNT_dec_NN(b); |
7901
|
|
|
|
|
|
} |
7902
|
|
|
|
|
|
|
7903
|
|
|
|
|
|
*i = invlist_clone(a); |
7904
|
|
|
|
|
|
} |
7905
|
|
|
|
|
|
/* else *i is already 'a' */ |
7906
|
|
|
|
|
|
return; |
7907
|
|
|
|
|
|
} |
7908
|
|
|
|
|
|
|
7909
|
|
|
|
|
|
/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The |
7910
|
|
|
|
|
|
* intersection must be empty */ |
7911
|
|
|
|
|
|
if (*i == a) { |
7912
|
|
|
|
|
|
SvREFCNT_dec_NN(a); |
7913
|
|
|
|
|
|
} |
7914
|
|
|
|
|
|
else if (*i == b) { |
7915
|
|
|
|
|
|
SvREFCNT_dec_NN(b); |
7916
|
|
|
|
|
|
} |
7917
|
|
|
|
|
|
*i = _new_invlist(0); |
7918
|
|
|
|
|
|
return; |
7919
|
|
|
|
|
|
} |
7920
|
|
|
|
|
|
|
7921
|
|
|
|
|
|
/* Here both lists exist and are non-empty */ |
7922
|
|
|
|
|
|
array_a = invlist_array(a); |
7923
|
|
|
|
|
|
array_b = invlist_array(b); |
7924
|
|
|
|
|
|
|
7925
|
|
|
|
|
|
/* If are to take the intersection of 'a' with the complement of b, set it |
7926
|
|
|
|
|
|
* up so are looking at b's complement. */ |
7927
|
|
|
|
|
|
if (complement_b) { |
7928
|
|
|
|
|
|
|
7929
|
|
|
|
|
|
/* To complement, we invert: if the first element is 0, remove it. To |
7930
|
|
|
|
|
|
* do this, we just pretend the array starts one later */ |
7931
|
|
|
|
|
|
if (array_b[0] == 0) { |
7932
|
|
|
|
|
|
array_b++; |
7933
|
|
|
|
|
|
len_b--; |
7934
|
|
|
|
|
|
} |
7935
|
|
|
|
|
|
else { |
7936
|
|
|
|
|
|
|
7937
|
|
|
|
|
|
/* But if the first element is not zero, we pretend the list starts |
7938
|
|
|
|
|
|
* at the 0 that is always stored immediately before the array. */ |
7939
|
|
|
|
|
|
array_b--; |
7940
|
|
|
|
|
|
len_b++; |
7941
|
|
|
|
|
|
} |
7942
|
|
|
|
|
|
} |
7943
|
|
|
|
|
|
|
7944
|
|
|
|
|
|
/* Size the intersection for the worst case: that the intersection ends up |
7945
|
|
|
|
|
|
* fragmenting everything to be completely disjoint */ |
7946
|
|
|
|
|
|
r= _new_invlist(len_a + len_b); |
7947
|
|
|
|
|
|
|
7948
|
|
|
|
|
|
/* Will contain U+0000 iff both components do */ |
7949
|
|
|
|
|
|
array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 |
7950
|
|
|
|
|
|
&& len_b > 0 && array_b[0] == 0); |
7951
|
|
|
|
|
|
|
7952
|
|
|
|
|
|
/* Go through each list item by item, stopping when exhausted one of |
7953
|
|
|
|
|
|
* them */ |
7954
|
|
|
|
|
|
while (i_a < len_a && i_b < len_b) { |
7955
|
|
|
|
|
|
UV cp; /* The element to potentially add to the intersection's |
7956
|
|
|
|
|
|
array */ |
7957
|
|
|
|
|
|
bool cp_in_set; /* Is it in the input list's set or not */ |
7958
|
|
|
|
|
|
|
7959
|
|
|
|
|
|
/* We need to take one or the other of the two inputs for the |
7960
|
|
|
|
|
|
* intersection. Since we are merging two sorted lists, we take the |
7961
|
|
|
|
|
|
* smaller of the next items. In case of a tie, we take the one that |
7962
|
|
|
|
|
|
* is not in its set first (a difference from the union algorithm). If |
7963
|
|
|
|
|
|
* we took one in the set first, it would increment the count, possibly |
7964
|
|
|
|
|
|
* to 2 which would cause it to be output as starting a range in the |
7965
|
|
|
|
|
|
* intersection, and the next time through we would take that same |
7966
|
|
|
|
|
|
* number, and output it again as ending the set. By doing it the |
7967
|
|
|
|
|
|
* opposite of this, there is no possibility that the count will be |
7968
|
|
|
|
|
|
* momentarily incremented to 2. (In a tie and both are in the set or |
7969
|
|
|
|
|
|
* both not in the set, it doesn't matter which we take first.) */ |
7970
|
|
|
|
|
|
if (array_a[i_a] < array_b[i_b] |
7971
|
|
|
|
|
|
|| (array_a[i_a] == array_b[i_b] |
7972
|
|
|
|
|
|
&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) |
7973
|
|
|
|
|
|
{ |
7974
|
|
|
|
|
|
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); |
7975
|
|
|
|
|
|
cp= array_a[i_a++]; |
7976
|
|
|
|
|
|
} |
7977
|
|
|
|
|
|
else { |
7978
|
|
|
|
|
|
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); |
7979
|
|
|
|
|
|
cp= array_b[i_b++]; |
7980
|
|
|
|
|
|
} |
7981
|
|
|
|
|
|
|
7982
|
|
|
|
|
|
/* Here, have chosen which of the two inputs to look at. Only output |
7983
|
|
|
|
|
|
* if the running count changes to/from 2, which marks the |
7984
|
|
|
|
|
|
* beginning/end of a range that's in the intersection */ |
7985
|
|
|
|
|
|
if (cp_in_set) { |
7986
|
|
|
|
|
|
count++; |
7987
|
|
|
|
|
|
if (count == 2) { |
7988
|
|
|
|
|
|
array_r[i_r++] = cp; |
7989
|
|
|
|
|
|
} |
7990
|
|
|
|
|
|
} |
7991
|
|
|
|
|
|
else { |
7992
|
|
|
|
|
|
if (count == 2) { |
7993
|
|
|
|
|
|
array_r[i_r++] = cp; |
7994
|
|
|
|
|
|
} |
7995
|
|
|
|
|
|
count--; |
7996
|
|
|
|
|
|
} |
7997
|
|
|
|
|
|
} |
7998
|
|
|
|
|
|
|
7999
|
|
|
|
|
|
/* Here, we are finished going through at least one of the lists, which |
8000
|
|
|
|
|
|
* means there is something remaining in at most one. We check if the list |
8001
|
|
|
|
|
|
* that has been exhausted is positioned such that we are in the middle |
8002
|
|
|
|
|
|
* of a range in its set or not. (i_a and i_b point to elements 1 beyond |
8003
|
|
|
|
|
|
* the ones we care about.) There are four cases: |
8004
|
|
|
|
|
|
* 1) Both weren't in their sets, count is 0, and remains 0. There's |
8005
|
|
|
|
|
|
* nothing left in the intersection. |
8006
|
|
|
|
|
|
* 2) Both were in their sets, count is 2 and perhaps is incremented to |
8007
|
|
|
|
|
|
* above 2. What should be output is exactly that which is in the |
8008
|
|
|
|
|
|
* non-exhausted set, as everything it has is also in the intersection |
8009
|
|
|
|
|
|
* set, and everything it doesn't have can't be in the intersection |
8010
|
|
|
|
|
|
* 3) The exhausted was in its set, non-exhausted isn't, count is 1, and |
8011
|
|
|
|
|
|
* gets incremented to 2. Like the previous case, the intersection is |
8012
|
|
|
|
|
|
* everything that remains in the non-exhausted set. |
8013
|
|
|
|
|
|
* 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and |
8014
|
|
|
|
|
|
* remains 1. And the intersection has nothing more. */ |
8015
|
|
|
|
|
|
if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) |
8016
|
|
|
|
|
|
|| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) |
8017
|
|
|
|
|
|
{ |
8018
|
|
|
|
|
|
count++; |
8019
|
|
|
|
|
|
} |
8020
|
|
|
|
|
|
|
8021
|
|
|
|
|
|
/* The final length is what we've output so far plus what else is in the |
8022
|
|
|
|
|
|
* intersection. At most one of the subexpressions below will be non-zero */ |
8023
|
|
|
|
|
|
len_r = i_r; |
8024
|
|
|
|
|
|
if (count >= 2) { |
8025
|
|
|
|
|
|
len_r += (len_a - i_a) + (len_b - i_b); |
8026
|
|
|
|
|
|
} |
8027
|
|
|
|
|
|
|
8028
|
|
|
|
|
|
/* Set result to final length, which can change the pointer to array_r, so |
8029
|
|
|
|
|
|
* re-find it */ |
8030
|
|
|
|
|
|
if (len_r != _invlist_len(r)) { |
8031
|
|
|
|
|
|
invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); |
8032
|
|
|
|
|
|
invlist_trim(r); |
8033
|
|
|
|
|
|
array_r = invlist_array(r); |
8034
|
|
|
|
|
|
} |
8035
|
|
|
|
|
|
|
8036
|
|
|
|
|
|
/* Finish outputting any remaining */ |
8037
|
|
|
|
|
|
if (count >= 2) { /* At most one will have a non-zero copy count */ |
8038
|
|
|
|
|
|
IV copy_count; |
8039
|
|
|
|
|
|
if ((copy_count = len_a - i_a) > 0) { |
8040
|
|
|
|
|
|
Copy(array_a + i_a, array_r + i_r, copy_count, UV); |
8041
|
|
|
|
|
|
} |
8042
|
|
|
|
|
|
else if ((copy_count = len_b - i_b) > 0) { |
8043
|
|
|
|
|
|
Copy(array_b + i_b, array_r + i_r, copy_count, UV); |
8044
|
|
|
|
|
|
} |
8045
|
|
|
|
|
|
} |
8046
|
|
|
|
|
|
|
8047
|
|
|
|
|
|
/* We may be removing a reference to one of the inputs */ |
8048
|
|
|
|
|
|
if (a == *i || b == *i) { |
8049
|
|
|
|
|
|
assert(! invlist_is_iterating(*i)); |
8050
|
|
|
|
|
|
SvREFCNT_dec_NN(*i); |
8051
|
|
|
|
|
|
} |
8052
|
|
|
|
|
|
|
8053
|
|
|
|
|
|
*i = r; |
8054
|
|
|
|
|
|
return; |
8055
|
|
|
|
|
|
} |
8056
|
|
|
|
|
|
|
8057
|
|
|
|
|
|
SV* |
8058
|
|
|
|
|
|
Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) |
8059
|
|
|
|
|
|
{ |
8060
|
|
|
|
|
|
/* Add the range from 'start' to 'end' inclusive to the inversion list's |
8061
|
|
|
|
|
|
* set. A pointer to the inversion list is returned. This may actually be |
8062
|
|
|
|
|
|
* a new list, in which case the passed in one has been destroyed. The |
8063
|
|
|
|
|
|
* passed in inversion list can be NULL, in which case a new one is created |
8064
|
|
|
|
|
|
* with just the one range in it */ |
8065
|
|
|
|
|
|
|
8066
|
|
|
|
|
|
SV* range_invlist; |
8067
|
|
|
|
|
|
UV len; |
8068
|
|
|
|
|
|
|
8069
|
|
|
|
|
|
if (invlist == NULL) { |
8070
|
|
|
|
|
|
invlist = _new_invlist(2); |
8071
|
|
|
|
|
|
len = 0; |
8072
|
|
|
|
|
|
} |
8073
|
|
|
|
|
|
else { |
8074
|
|
|
|
|
|
len = _invlist_len(invlist); |
8075
|
|
|
|
|
|
} |
8076
|
|
|
|
|
|
|
8077
|
|
|
|
|
|
/* If comes after the final entry actually in the list, can just append it |
8078
|
|
|
|
|
|
* to the end, */ |
8079
|
|
|
|
|
|
if (len == 0 |
8080
|
|
|
|
|
|
|| (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1) |
8081
|
|
|
|
|
|
&& start >= invlist_array(invlist)[len - 1])) |
8082
|
|
|
|
|
|
{ |
8083
|
|
|
|
|
|
_append_range_to_invlist(invlist, start, end); |
8084
|
|
|
|
|
|
return invlist; |
8085
|
|
|
|
|
|
} |
8086
|
|
|
|
|
|
|
8087
|
|
|
|
|
|
/* Here, can't just append things, create and return a new inversion list |
8088
|
|
|
|
|
|
* which is the union of this range and the existing inversion list */ |
8089
|
|
|
|
|
|
range_invlist = _new_invlist(2); |
8090
|
|
|
|
|
|
_append_range_to_invlist(range_invlist, start, end); |
8091
|
|
|
|
|
|
|
8092
|
|
|
|
|
|
_invlist_union(invlist, range_invlist, &invlist); |
8093
|
|
|
|
|
|
|
8094
|
|
|
|
|
|
/* The temporary can be freed */ |
8095
|
|
|
|
|
|
SvREFCNT_dec_NN(range_invlist); |
8096
|
|
|
|
|
|
|
8097
|
|
|
|
|
|
return invlist; |
8098
|
|
|
|
|
|
} |
8099
|
|
|
|
|
|
|
8100
|
|
|
|
|
|
#endif |
8101
|
|
|
|
|
|
|
8102
|
|
|
|
|
|
PERL_STATIC_INLINE SV* |
8103
|
980786
|
|
|
|
|
S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { |
8104
|
94
|
|
|
|
|
return _add_range_to_invlist(invlist, cp, cp); |
8105
|
|
|
|
|
|
} |
8106
|
|
|
|
|
|
|
8107
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
8108
|
|
|
|
|
|
void |
8109
|
|
|
|
|
|
Perl__invlist_invert(pTHX_ SV* const invlist) |
8110
|
|
|
|
|
|
{ |
8111
|
|
|
|
|
|
/* Complement the input inversion list. This adds a 0 if the list didn't |
8112
|
|
|
|
|
|
* have a zero; removes it otherwise. As described above, the data |
8113
|
|
|
|
|
|
* structure is set up so that this is very efficient */ |
8114
|
|
|
|
|
|
|
8115
|
|
|
|
|
|
PERL_ARGS_ASSERT__INVLIST_INVERT; |
8116
|
|
|
|
|
|
|
8117
|
|
|
|
|
|
assert(! invlist_is_iterating(invlist)); |
8118
|
|
|
|
|
|
|
8119
|
|
|
|
|
|
/* The inverse of matching nothing is matching everything */ |
8120
|
|
|
|
|
|
if (_invlist_len(invlist) == 0) { |
8121
|
|
|
|
|
|
_append_range_to_invlist(invlist, 0, UV_MAX); |
8122
|
|
|
|
|
|
return; |
8123
|
|
|
|
|
|
} |
8124
|
|
|
|
|
|
|
8125
|
|
|
|
|
|
*get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); |
8126
|
|
|
|
|
|
} |
8127
|
|
|
|
|
|
|
8128
|
|
|
|
|
|
void |
8129
|
|
|
|
|
|
Perl__invlist_invert_prop(pTHX_ SV* const invlist) |
8130
|
|
|
|
|
|
{ |
8131
|
|
|
|
|
|
/* Complement the input inversion list (which must be a Unicode property, |
8132
|
|
|
|
|
|
* all of which don't match above the Unicode maximum code point.) And |
8133
|
|
|
|
|
|
* Perl has chosen to not have the inversion match above that either. This |
8134
|
|
|
|
|
|
* adds a 0x110000 if the list didn't end with it, and removes it if it did |
8135
|
|
|
|
|
|
*/ |
8136
|
|
|
|
|
|
|
8137
|
|
|
|
|
|
UV len; |
8138
|
|
|
|
|
|
UV* array; |
8139
|
|
|
|
|
|
|
8140
|
|
|
|
|
|
PERL_ARGS_ASSERT__INVLIST_INVERT_PROP; |
8141
|
|
|
|
|
|
|
8142
|
|
|
|
|
|
_invlist_invert(invlist); |
8143
|
|
|
|
|
|
|
8144
|
|
|
|
|
|
len = _invlist_len(invlist); |
8145
|
|
|
|
|
|
|
8146
|
|
|
|
|
|
if (len != 0) { /* If empty do nothing */ |
8147
|
|
|
|
|
|
array = invlist_array(invlist); |
8148
|
|
|
|
|
|
if (array[len - 1] != PERL_UNICODE_MAX + 1) { |
8149
|
|
|
|
|
|
/* Add 0x110000. First, grow if necessary */ |
8150
|
|
|
|
|
|
len++; |
8151
|
|
|
|
|
|
if (invlist_max(invlist) < len) { |
8152
|
|
|
|
|
|
invlist_extend(invlist, len); |
8153
|
|
|
|
|
|
array = invlist_array(invlist); |
8154
|
|
|
|
|
|
} |
8155
|
|
|
|
|
|
invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist)); |
8156
|
|
|
|
|
|
array[len - 1] = PERL_UNICODE_MAX + 1; |
8157
|
|
|
|
|
|
} |
8158
|
|
|
|
|
|
else { /* Remove the 0x110000 */ |
8159
|
|
|
|
|
|
invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist)); |
8160
|
|
|
|
|
|
} |
8161
|
|
|
|
|
|
} |
8162
|
|
|
|
|
|
|
8163
|
|
|
|
|
|
return; |
8164
|
|
|
|
|
|
} |
8165
|
|
|
|
|
|
#endif |
8166
|
|
|
|
|
|
|
8167
|
|
|
|
|
|
PERL_STATIC_INLINE SV* |
8168
|
|
|
|
|
|
S_invlist_clone(pTHX_ SV* const invlist) |
8169
|
|
|
|
|
|
{ |
8170
|
|
|
|
|
|
|
8171
|
|
|
|
|
|
/* Return a new inversion list that is a copy of the input one, which is |
8172
|
|
|
|
|
|
* unchanged */ |
8173
|
|
|
|
|
|
|
8174
|
|
|
|
|
|
/* Need to allocate extra space to accommodate Perl's addition of a |
8175
|
|
|
|
|
|
* trailing NUL to SvPV's, since it thinks they are always strings */ |
8176
|
|
|
|
|
|
SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); |
8177
|
|
|
|
|
|
STRLEN physical_length = SvCUR(invlist); |
8178
|
|
|
|
|
|
bool offset = *(get_invlist_offset_addr(invlist)); |
8179
|
|
|
|
|
|
|
8180
|
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_CLONE; |
8181
|
|
|
|
|
|
|
8182
|
|
|
|
|
|
*(get_invlist_offset_addr(new_invlist)) = offset; |
8183
|
|
|
|
|
|
invlist_set_len(new_invlist, _invlist_len(invlist), offset); |
8184
|
|
|
|
|
|
Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); |
8185
|
|
|
|
|
|
|
8186
|
|
|
|
|
|
return new_invlist; |
8187
|
|
|
|
|
|
} |
8188
|
|
|
|
|
|
|
8189
|
|
|
|
|
|
PERL_STATIC_INLINE STRLEN* |
8190
|
108
|
|
|
|
|
S_get_invlist_iter_addr(pTHX_ SV* invlist) |
8191
|
|
|
|
|
|
{ |
8192
|
|
|
|
|
|
/* Return the address of the UV that contains the current iteration |
8193
|
|
|
|
|
|
* position */ |
8194
|
|
|
|
|
|
|
8195
|
108
|
|
|
|
|
PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; |
8196
|
|
|
|
|
|
|
8197
|
108
|
|
|
|
|
assert(SvTYPE(invlist) == SVt_INVLIST); |
8198
|
|
|
|
|
|
|
8199
|
104
|
|
|
|
|
return &(((XINVLIST*) SvANY(invlist))->iterator); |
8200
|
|
|
|
|
|
} |
8201
|
|
|
|
|
|
|
8202
|
|
|
|
|
|
PERL_STATIC_INLINE void |
8203
|
94
|
|
|
|
|
S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ |
8204
|
|
|
|
|
|
{ |
8205
|
94
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_ITERINIT; |
8206
|
|
|
|
|
|
|
8207
|
94
|
|
|
|
|
*get_invlist_iter_addr(invlist) = 0; |
8208
|
94
|
|
|
|
|
} |
8209
|
|
|
|
|
|
|
8210
|
|
|
|
|
|
PERL_STATIC_INLINE void |
8211
|
8
|
|
|
|
|
S_invlist_iterfinish(pTHX_ SV* invlist) |
8212
|
|
|
|
|
|
{ |
8213
|
|
|
|
|
|
/* Terminate iterator for invlist. This is to catch development errors. |
8214
|
|
|
|
|
|
* Any iteration that is interrupted before completed should call this |
8215
|
|
|
|
|
|
* function. Functions that add code points anywhere else but to the end |
8216
|
|
|
|
|
|
* of an inversion list assert that they are not in the middle of an |
8217
|
|
|
|
|
|
* iteration. If they were, the addition would make the iteration |
8218
|
|
|
|
|
|
* problematical: if the iteration hadn't reached the place where things |
8219
|
|
|
|
|
|
* were being added, it would be ok */ |
8220
|
|
|
|
|
|
|
8221
|
98
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_ITERFINISH; |
8222
|
|
|
|
|
|
|
8223
|
285528
|
|
|
|
|
*get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; |
8224
|
285528
|
|
|
|
|
} |
8225
|
|
|
|
|
|
|
8226
|
|
|
|
|
|
STATIC bool |
8227
|
285530
|
|
|
|
|
S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) |
8228
|
|
|
|
|
|
{ |
8229
|
|
|
|
|
|
/* An C call on must be used to set this up. |
8230
|
|
|
|
|
|
* This call sets in <*start> and <*end>, the next range in . |
8231
|
|
|
|
|
|
* Returns if successful and the next call will return the next |
8232
|
|
|
|
|
|
* range; if was already at the end of the list. If the latter, |
8233
|
|
|
|
|
|
* <*start> and <*end> are unchanged, and the next call to this function |
8234
|
|
|
|
|
|
* will start over at the beginning of the list */ |
8235
|
|
|
|
|
|
|
8236
|
285530
|
|
|
|
|
STRLEN* pos = get_invlist_iter_addr(invlist); |
8237
|
285530
|
|
|
|
|
UV len = _invlist_len(invlist); |
8238
|
|
|
|
|
|
UV *array; |
8239
|
|
|
|
|
|
|
8240
|
285530
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_ITERNEXT; |
8241
|
|
|
|
|
|
|
8242
|
285530
|
|
|
|
|
if (*pos >= len) { |
8243
|
285526
|
|
|
|
|
*pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ |
8244
|
47151425
|
|
|
|
|
return FALSE; |
8245
|
|
|
|
|
|
} |
8246
|
|
|
|
|
|
|
8247
|
5770047
|
|
|
|
|
array = invlist_array(invlist); |
8248
|
|
|
|
|
|
|
8249
|
7121754
|
|
|
|
|
*start = array[(*pos)++]; |
8250
|
|
|
|
|
|
|
8251
|
17722335
|
|
|
|
|
if (*pos >= len) { |
8252
|
17722331
|
|
|
|
|
*end = UV_MAX; |
8253
|
|
|
|
|
|
} |
8254
|
|
|
|
|
|
else { |
8255
|
3510147
|
|
|
|
|
*end = array[(*pos)++] - 1; |
8256
|
|
|
|
|
|
} |
8257
|
|
|
|
|
|
|
8258
|
|
|
|
|
|
return TRUE; |
8259
|
|
|
|
|
|
} |
8260
|
|
|
|
|
|
|
8261
|
|
|
|
|
|
PERL_STATIC_INLINE bool |
8262
|
|
|
|
|
|
S_invlist_is_iterating(pTHX_ SV* const invlist) |
8263
|
|
|
|
|
|
{ |
8264
|
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; |
8265
|
|
|
|
|
|
|
8266
|
|
|
|
|
|
return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; |
8267
|
|
|
|
|
|
} |
8268
|
|
|
|
|
|
|
8269
|
|
|
|
|
|
PERL_STATIC_INLINE UV |
8270
|
3510143
|
|
|
|
|
S_invlist_highest(pTHX_ SV* const invlist) |
8271
|
|
|
|
|
|
{ |
8272
|
|
|
|
|
|
/* Returns the highest code point that matches an inversion list. This API |
8273
|
|
|
|
|
|
* has an ambiguity, as it returns 0 under either the highest is actually |
8274
|
|
|
|
|
|
* 0, or if the list is empty. If this distinction matters to you, check |
8275
|
|
|
|
|
|
* for emptiness before calling this function */ |
8276
|
|
|
|
|
|
|
8277
|
14212188
|
|
|
|
|
UV len = _invlist_len(invlist); |
8278
|
|
|
|
|
|
UV *array; |
8279
|
|
|
|
|
|
|
8280
|
14212188
|
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_HIGHEST; |
8281
|
|
|
|
|
|
|
8282
|
868314
|
|
|
|
|
if (len == 0) { |
8283
|
|
|
|
|
|
return 0; |
8284
|
|
|
|
|
|
} |
8285
|
|
|
|
|
|
|
8286
|
15577922
|
|
|
|
|
array = invlist_array(invlist); |
8287
|
|
|
|
|
|
|
8288
|
|
|
|
|
|
/* The last element in the array in the inversion list always starts a |
8289
|
|
|
|
|
|
* range that goes to infinity. That range may be for code points that are |
8290
|
|
|
|
|
|
* matched in the inversion list, or it may be for ones that aren't |
8291
|
|
|
|
|
|
* matched. In the latter case, the highest code point in the set is one |
8292
|
|
|
|
|
|
* less than the beginning of this range; otherwise it is the final element |
8293
|
|
|
|
|
|
* of this range: infinity */ |
8294
|
0
|
|
|
|
|
return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) |
8295
|
|
|
|
|
|
? UV_MAX |
8296
|
931956
|
|
|
|
|
: array[len - 1] - 1; |
8297
|
|
|
|
|
|
} |
8298
|
|
|
|
|
|
|
8299
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
8300
|
|
|
|
|
|
SV * |
8301
|
|
|
|
|
|
Perl__invlist_contents(pTHX_ SV* const invlist) |
8302
|
|
|
|
|
|
{ |
8303
|
|
|
|
|
|
/* Get the contents of an inversion list into a string SV so that they can |
8304
|
|
|
|
|
|
* be printed out. It uses the format traditionally done for debug tracing |
8305
|
|
|
|
|
|
*/ |
8306
|
|
|
|
|
|
|
8307
|
|
|
|
|
|
UV start, end; |
8308
|
|
|
|
|
|
SV* output = newSVpvs("\n"); |
8309
|
|
|
|
|
|
|
8310
|
|
|
|
|
|
PERL_ARGS_ASSERT__INVLIST_CONTENTS; |
8311
|
|
|
|
|
|
|
8312
|
|
|
|
|
|
assert(! invlist_is_iterating(invlist)); |
8313
|
|
|
|
|
|
|
8314
|
|
|
|
|
|
invlist_iterinit(invlist); |
8315
|
|
|
|
|
|
while (invlist_iternext(invlist, &start, &end)) { |
8316
|
|
|
|
|
|
if (end == UV_MAX) { |
8317
|
|
|
|
|
|
Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); |
8318
|
|
|
|
|
|
} |
8319
|
|
|
|
|
|
else if (end != start) { |
8320
|
|
|
|
|
|
Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", |
8321
|
|
|
|
|
|
start, end); |
8322
|
|
|
|
|
|
} |
8323
|
|
|
|
|
|
else { |
8324
|
|
|
|
|
|
Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); |
8325
|
|
|
|
|
|
} |
8326
|
|
|
|
|
|
} |
8327
|
|
|
|
|
|
|
8328
|
|
|
|
|
|
return output; |
8329
|
|
|
|
|
|
} |
8330
|
|
|
|
|
|
#endif |
8331
|
|
|
|
|
|
|
8332
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
8333
|
|
|
|
|
|
void |
8334
|
|
|
|
|
|
Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist) |
8335
|
|
|
|
|
|
{ |
8336
|
|
|
|
|
|
/* Designed to be called only by do_sv_dump(). Dumps out the ranges of the |
8337
|
|
|
|
|
|
* inversion list 'invlist' to 'file' at 'level' Each line is prefixed by |
8338
|
|
|
|
|
|
* the string 'indent'. The output looks like this: |
8339
|
|
|
|
|
|
[0] 0x000A .. 0x000D |
8340
|
|
|
|
|
|
[2] 0x0085 |
8341
|
|
|
|
|
|
[4] 0x2028 .. 0x2029 |
8342
|
|
|
|
|
|
[6] 0x3104 .. INFINITY |
8343
|
|
|
|
|
|
* This means that the first range of code points matched by the list are |
8344
|
|
|
|
|
|
* 0xA through 0xD; the second range contains only the single code point |
8345
|
|
|
|
|
|
* 0x85, etc. An inversion list is an array of UVs. Two array elements |
8346
|
|
|
|
|
|
* are used to define each range (except if the final range extends to |
8347
|
|
|
|
|
|
* infinity, only a single element is needed). The array index of the |
8348
|
|
|
|
|
|
* first element for the corresponding range is given in brackets. */ |
8349
|
|
|
|
|
|
|
8350
|
|
|
|
|
|
UV start, end; |
8351
|
|
|
|
|
|
STRLEN count = 0; |
8352
|
|
|
|
|
|
|
8353
|
|
|
|
|
|
PERL_ARGS_ASSERT__INVLIST_DUMP; |
8354
|
|
|
|
|
|
|
8355
|
|
|
|
|
|
if (invlist_is_iterating(invlist)) { |
8356
|
|
|
|
|
|
Perl_dump_indent(aTHX_ level, file, |
8357
|
|
|
|
|
|
"%sCan't dump inversion list because is in middle of iterating\n", |
8358
|
|
|
|
|
|
indent); |
8359
|
|
|
|
|
|
return; |
8360
|
|
|
|
|
|
} |
8361
|
|
|
|
|
|
|
8362
|
|
|
|
|
|
invlist_iterinit(invlist); |
8363
|
|
|
|
|
|
while (invlist_iternext(invlist, &start, &end)) { |
8364
|
|
|
|
|
|
if (end == UV_MAX) { |
8365
|
|
|
|
|
|
Perl_dump_indent(aTHX_ level, file, |
8366
|
|
|
|
|
|
"%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", |
8367
|
|
|
|
|
|
indent, (UV)count, start); |
8368
|
|
|
|
|
|
} |
8369
|
|
|
|
|
|
else if (end != start) { |
8370
|
|
|
|
|
|
Perl_dump_indent(aTHX_ level, file, |
8371
|
|
|
|
|
|
"%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", |
8372
|
|
|
|
|
|
indent, (UV)count, start, end); |
8373
|
|
|
|
|
|
} |
8374
|
|
|
|
|
|
else { |
8375
|
|
|
|
|
|
Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", |
8376
|
|
|
|
|
|
indent, (UV)count, start); |
8377
|
|
|
|
|
|
} |
8378
|
|
|
|
|
|
count += 2; |
8379
|
|
|
|
|
|
} |
8380
|
|
|
|
|
|
} |
8381
|
|
|
|
|
|
#endif |
8382
|
|
|
|
|
|
|
8383
|
|
|
|
|
|
#ifdef PERL_ARGS_ASSERT__INVLISTEQ |
8384
|
|
|
|
|
|
bool |
8385
|
|
|
|
|
|
S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) |
8386
|
|
|
|
|
|
{ |
8387
|
|
|
|
|
|
/* Return a boolean as to if the two passed in inversion lists are |
8388
|
|
|
|
|
|
* identical. The final argument, if TRUE, says to take the complement of |
8389
|
|
|
|
|
|
* the second inversion list before doing the comparison */ |
8390
|
|
|
|
|
|
|
8391
|
|
|
|
|
|
const UV* array_a = invlist_array(a); |
8392
|
|
|
|
|
|
const UV* array_b = invlist_array(b); |
8393
|
|
|
|
|
|
UV len_a = _invlist_len(a); |
8394
|
|
|
|
|
|
UV len_b = _invlist_len(b); |
8395
|
|
|
|
|
|
|
8396
|
|
|
|
|
|
UV i = 0; /* current index into the arrays */ |
8397
|
|
|
|
|
|
bool retval = TRUE; /* Assume are identical until proven otherwise */ |
8398
|
|
|
|
|
|
|
8399
|
|
|
|
|
|
PERL_ARGS_ASSERT__INVLISTEQ; |
8400
|
|
|
|
|
|
|
8401
|
|
|
|
|
|
/* If are to compare 'a' with the complement of b, set it |
8402
|
|
|
|
|
|
* up so are looking at b's complement. */ |
8403
|
|
|
|
|
|
if (complement_b) { |
8404
|
|
|
|
|
|
|
8405
|
|
|
|
|
|
/* The complement of nothing is everything, so would have to have |
8406
|
|
|
|
|
|
* just one element, starting at zero (ending at infinity) */ |
8407
|
|
|
|
|
|
if (len_b == 0) { |
8408
|
|
|
|
|
|
return (len_a == 1 && array_a[0] == 0); |
8409
|
|
|
|
|
|
} |
8410
|
|
|
|
|
|
else if (array_b[0] == 0) { |
8411
|
|
|
|
|
|
|
8412
|
|
|
|
|
|
/* Otherwise, to complement, we invert. Here, the first element is |
8413
|
|
|
|
|
|
* 0, just remove it. To do this, we just pretend the array starts |
8414
|
|
|
|
|
|
* one later */ |
8415
|
|
|
|
|
|
|
8416
|
|
|
|
|
|
array_b++; |
8417
|
|
|
|
|
|
len_b--; |
8418
|
|
|
|
|
|
} |
8419
|
|
|
|
|
|
else { |
8420
|
|
|
|
|
|
|
8421
|
|
|
|
|
|
/* But if the first element is not zero, we pretend the list starts |
8422
|
|
|
|
|
|
* at the 0 that is always stored immediately before the array. */ |
8423
|
|
|
|
|
|
array_b--; |
8424
|
|
|
|
|
|
len_b++; |
8425
|
|
|
|
|
|
} |
8426
|
|
|
|
|
|
} |
8427
|
|
|
|
|
|
|
8428
|
|
|
|
|
|
/* Make sure that the lengths are the same, as well as the final element |
8429
|
|
|
|
|
|
* before looping through the remainder. (Thus we test the length, final, |
8430
|
|
|
|
|
|
* and first elements right off the bat) */ |
8431
|
|
|
|
|
|
if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { |
8432
|
|
|
|
|
|
retval = FALSE; |
8433
|
|
|
|
|
|
} |
8434
|
|
|
|
|
|
else for (i = 0; i < len_a - 1; i++) { |
8435
|
|
|
|
|
|
if (array_a[i] != array_b[i]) { |
8436
|
|
|
|
|
|
retval = FALSE; |
8437
|
|
|
|
|
|
break; |
8438
|
|
|
|
|
|
} |
8439
|
|
|
|
|
|
} |
8440
|
|
|
|
|
|
|
8441
|
|
|
|
|
|
return retval; |
8442
|
|
|
|
|
|
} |
8443
|
|
|
|
|
|
#endif |
8444
|
|
|
|
|
|
|
8445
|
|
|
|
|
|
#undef HEADER_LENGTH |
8446
|
|
|
|
|
|
#undef TO_INTERNAL_SIZE |
8447
|
|
|
|
|
|
#undef FROM_INTERNAL_SIZE |
8448
|
|
|
|
|
|
#undef INVLIST_VERSION_ID |
8449
|
|
|
|
|
|
|
8450
|
|
|
|
|
|
/* End of inversion list object */ |
8451
|
|
|
|
|
|
|
8452
|
|
|
|
|
|
STATIC void |
8453
|
932044
|
|
|
|
|
S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) |
8454
|
|
|
|
|
|
{ |
8455
|
|
|
|
|
|
/* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' |
8456
|
|
|
|
|
|
* constructs, and updates RExC_flags with them. On input, RExC_parse |
8457
|
|
|
|
|
|
* should point to the first flag; it is updated on output to point to the |
8458
|
|
|
|
|
|
* final ')' or ':'. There needs to be at least one flag, or this will |
8459
|
|
|
|
|
|
* abort */ |
8460
|
|
|
|
|
|
|
8461
|
|
|
|
|
|
/* for (?g), (?gc), and (?o) warnings; warning |
8462
|
|
|
|
|
|
about (?c) will warn about (?g) -- japhy */ |
8463
|
|
|
|
|
|
|
8464
|
|
|
|
|
|
#define WASTED_O 0x01 |
8465
|
|
|
|
|
|
#define WASTED_G 0x02 |
8466
|
|
|
|
|
|
#define WASTED_C 0x04 |
8467
|
|
|
|
|
|
#define WASTED_GC (WASTED_G|WASTED_C) |
8468
|
|
|
|
|
|
I32 wastedflags = 0x00; |
8469
|
1394422
|
|
|
|
|
U32 posflags = 0, negflags = 0; |
8470
|
|
|
|
|
|
U32 *flagsp = &posflags; |
8471
|
|
|
|
|
|
char has_charset_modifier = '\0'; |
8472
|
|
|
|
|
|
regex_charset cs; |
8473
|
|
|
|
|
|
bool has_use_defaults = FALSE; |
8474
|
932044
|
|
|
|
|
const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ |
8475
|
|
|
|
|
|
|
8476
|
88
|
|
|
|
|
PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; |
8477
|
|
|
|
|
|
|
8478
|
|
|
|
|
|
/* '^' as an initial flag sets certain defaults */ |
8479
|
88
|
|
|
|
|
if (UCHARAT(RExC_parse) == '^') { |
8480
|
88
|
|
|
|
|
RExC_parse++; |
8481
|
|
|
|
|
|
has_use_defaults = TRUE; |
8482
|
88
|
|
|
|
|
STD_PMMOD_FLAGS_CLEAR(&RExC_flags); |
8483
|
88
|
|
|
|
|
set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics) |
8484
|
|
|
|
|
|
? REGEX_UNICODE_CHARSET |
8485
|
|
|
|
|
|
: REGEX_DEPENDS_CHARSET); |
8486
|
|
|
|
|
|
} |
8487
|
|
|
|
|
|
|
8488
|
88
|
|
|
|
|
cs = get_regex_charset(RExC_flags); |
8489
|
88
|
|
|
|
|
if (cs == REGEX_DEPENDS_CHARSET |
8490
|
88
|
|
|
|
|
&& (RExC_utf8 || RExC_uni_semantics)) |
8491
|
|
|
|
|
|
{ |
8492
|
|
|
|
|
|
cs = REGEX_UNICODE_CHARSET; |
8493
|
|
|
|
|
|
} |
8494
|
|
|
|
|
|
|
8495
|
96
|
|
|
|
|
while (*RExC_parse) { |
8496
|
|
|
|
|
|
/* && strchr("iogcmsx", *RExC_parse) */ |
8497
|
|
|
|
|
|
/* (?g), (?gc) and (?o) are useless here |
8498
|
|
|
|
|
|
and must be globally applied -- japhy */ |
8499
|
96
|
|
|
|
|
switch (*RExC_parse) { |
8500
|
|
|
|
|
|
|
8501
|
|
|
|
|
|
/* Code for the imsx flags */ |
8502
|
8
|
|
|
|
|
CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); |
8503
|
|
|
|
|
|
|
8504
|
|
|
|
|
|
case LOCALE_PAT_MOD: |
8505
|
0
|
|
|
|
|
if (has_charset_modifier) { |
8506
|
|
|
|
|
|
goto excess_modifier; |
8507
|
|
|
|
|
|
} |
8508
|
0
|
|
|
|
|
else if (flagsp == &negflags) { |
8509
|
|
|
|
|
|
goto neg_modifier; |
8510
|
|
|
|
|
|
} |
8511
|
|
|
|
|
|
cs = REGEX_LOCALE_CHARSET; |
8512
|
|
|
|
|
|
has_charset_modifier = LOCALE_PAT_MOD; |
8513
|
0
|
|
|
|
|
RExC_contains_locale = 1; |
8514
|
0
|
|
|
|
|
break; |
8515
|
|
|
|
|
|
case UNICODE_PAT_MOD: |
8516
|
0
|
|
|
|
|
if (has_charset_modifier) { |
8517
|
|
|
|
|
|
goto excess_modifier; |
8518
|
|
|
|
|
|
} |
8519
|
0
|
|
|
|
|
else if (flagsp == &negflags) { |
8520
|
|
|
|
|
|
goto neg_modifier; |
8521
|
|
|
|
|
|
} |
8522
|
|
|
|
|
|
cs = REGEX_UNICODE_CHARSET; |
8523
|
|
|
|
|
|
has_charset_modifier = UNICODE_PAT_MOD; |
8524
|
|
|
|
|
|
break; |
8525
|
|
|
|
|
|
case ASCII_RESTRICT_PAT_MOD: |
8526
|
0
|
|
|
|
|
if (flagsp == &negflags) { |
8527
|
|
|
|
|
|
goto neg_modifier; |
8528
|
|
|
|
|
|
} |
8529
|
0
|
|
|
|
|
if (has_charset_modifier) { |
8530
|
0
|
|
|
|
|
if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { |
8531
|
|
|
|
|
|
goto excess_modifier; |
8532
|
|
|
|
|
|
} |
8533
|
|
|
|
|
|
/* Doubled modifier implies more restricted */ |
8534
|
|
|
|
|
|
cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; |
8535
|
|
|
|
|
|
} |
8536
|
|
|
|
|
|
else { |
8537
|
|
|
|
|
|
cs = REGEX_ASCII_RESTRICTED_CHARSET; |
8538
|
|
|
|
|
|
} |
8539
|
|
|
|
|
|
has_charset_modifier = ASCII_RESTRICT_PAT_MOD; |
8540
|
|
|
|
|
|
break; |
8541
|
|
|
|
|
|
case DEPENDS_PAT_MOD: |
8542
|
4759122
|
|
|
|
|
if (has_use_defaults) { |
8543
|
|
|
|
|
|
goto fail_modifiers; |
8544
|
|
|
|
|
|
} |
8545
|
4759122
|
|
|
|
|
else if (flagsp == &negflags) { |
8546
|
|
|
|
|
|
goto neg_modifier; |
8547
|
|
|
|
|
|
} |
8548
|
4759122
|
|
|
|
|
else if (has_charset_modifier) { |
8549
|
|
|
|
|
|
goto excess_modifier; |
8550
|
|
|
|
|
|
} |
8551
|
|
|
|
|
|
|
8552
|
|
|
|
|
|
/* The dual charset means unicode semantics if the |
8553
|
|
|
|
|
|
* pattern (or target, not known until runtime) are |
8554
|
|
|
|
|
|
* utf8, or something in the pattern indicates unicode |
8555
|
|
|
|
|
|
* semantics */ |
8556
|
4759122
|
|
|
|
|
cs = (RExC_utf8 || RExC_uni_semantics) |
8557
|
|
|
|
|
|
? REGEX_UNICODE_CHARSET |
8558
|
|
|
|
|
|
: REGEX_DEPENDS_CHARSET; |
8559
|
|
|
|
|
|
has_charset_modifier = DEPENDS_PAT_MOD; |
8560
|
1075156
|
|
|
|
|
break; |
8561
|
|
|
|
|
|
excess_modifier: |
8562
|
1075156
|
|
|
|
|
RExC_parse++; |
8563
|
1075156
|
|
|
|
|
if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { |
8564
|
4759122
|
|
|
|
|
vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); |
8565
|
|
|
|
|
|
} |
8566
|
4759122
|
|
|
|
|
else if (has_charset_modifier == *(RExC_parse - 1)) { |
8567
|
3591854
|
|
|
|
|
vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1)); |
8568
|
|
|
|
|
|
} |
8569
|
|
|
|
|
|
else { |
8570
|
9979322
|
|
|
|
|
vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); |
8571
|
|
|
|
|
|
} |
8572
|
|
|
|
|
|
/*NOTREACHED*/ |
8573
|
|
|
|
|
|
neg_modifier: |
8574
|
9979322
|
|
|
|
|
RExC_parse++; |
8575
|
827054
|
|
|
|
|
vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); |
8576
|
|
|
|
|
|
/*NOTREACHED*/ |
8577
|
|
|
|
|
|
case ONCE_PAT_MOD: /* 'o' */ |
8578
|
|
|
|
|
|
case GLOBAL_PAT_MOD: /* 'g' */ |
8579
|
786790
|
|
|
|
|
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { |
8580
|
786788
|
|
|
|
|
const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; |
8581
|
786786
|
|
|
|
|
if (! (wastedflags & wflagbit) ) { |
8582
|
786786
|
|
|
|
|
wastedflags |= wflagbit; |
8583
|
|
|
|
|
|
/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ |
8584
|
1275096
|
|
|
|
|
vWARN5( |
8585
|
|
|
|
|
|
RExC_parse + 1, |
8586
|
|
|
|
|
|
"Useless (%s%c) - %suse /%c modifier", |
8587
|
|
|
|
|
|
flagsp == &negflags ? "?-" : "?", |
8588
|
|
|
|
|
|
*RExC_parse, |
8589
|
|
|
|
|
|
flagsp == &negflags ? "don't " : "", |
8590
|
|
|
|
|
|
*RExC_parse |
8591
|
|
|
|
|
|
); |
8592
|
|
|
|
|
|
} |
8593
|
|
|
|
|
|
} |
8594
|
|
|
|
|
|
break; |
8595
|
|
|
|
|
|
|
8596
|
|
|
|
|
|
case CONTINUE_PAT_MOD: /* 'c' */ |
8597
|
1275092
|
|
|
|
|
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { |
8598
|
1698322
|
|
|
|
|
if (! (wastedflags & WASTED_C) ) { |
8599
|
1698322
|
|
|
|
|
wastedflags |= WASTED_GC; |
8600
|
|
|
|
|
|
/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ |
8601
|
593498
|
|
|
|
|
vWARN3( |
8602
|
|
|
|
|
|
RExC_parse + 1, |
8603
|
|
|
|
|
|
"Useless (%sc) - %suse /gc modifier", |
8604
|
|
|
|
|
|
flagsp == &negflags ? "?-" : "?", |
8605
|
|
|
|
|
|
flagsp == &negflags ? "don't " : "" |
8606
|
|
|
|
|
|
); |
8607
|
|
|
|
|
|
} |
8608
|
|
|
|
|
|
} |
8609
|
|
|
|
|
|
break; |
8610
|
|
|
|
|
|
case KEEPCOPY_PAT_MOD: /* 'p' */ |
8611
|
511642
|
|
|
|
|
if (flagsp == &negflags) { |
8612
|
511638
|
|
|
|
|
if (SIZE_ONLY) |
8613
|
511638
|
|
|
|
|
ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); |
8614
|
|
|
|
|
|
} else { |
8615
|
511638
|
|
|
|
|
*flagsp |= RXf_PMf_KEEPCOPY; |
8616
|
|
|
|
|
|
} |
8617
|
|
|
|
|
|
break; |
8618
|
|
|
|
|
|
case '-': |
8619
|
|
|
|
|
|
/* A flag is a default iff it is following a minus, so |
8620
|
|
|
|
|
|
* if there is a minus, it means will be trying to |
8621
|
|
|
|
|
|
* re-specify a default which is an error */ |
8622
|
511638
|
|
|
|
|
if (has_use_defaults || flagsp == &negflags) { |
8623
|
|
|
|
|
|
goto fail_modifiers; |
8624
|
|
|
|
|
|
} |
8625
|
|
|
|
|
|
flagsp = &negflags; |
8626
|
|
|
|
|
|
wastedflags = 0; /* reset so (?g-c) warns twice */ |
8627
|
|
|
|
|
|
break; |
8628
|
|
|
|
|
|
case ':': |
8629
|
|
|
|
|
|
case ')': |
8630
|
98
|
|
|
|
|
RExC_flags |= posflags; |
8631
|
98
|
|
|
|
|
RExC_flags &= ~negflags; |
8632
|
|
|
|
|
|
set_regex_charset(&RExC_flags, cs); |
8633
|
90
|
|
|
|
|
return; |
8634
|
|
|
|
|
|
/*NOTREACHED*/ |
8635
|
|
|
|
|
|
default: |
8636
|
|
|
|
|
|
fail_modifiers: |
8637
|
8
|
|
|
|
|
RExC_parse++; |
8638
|
2
|
|
|
|
|
vFAIL3("Sequence (%.*s...) not recognized", |
8639
|
|
|
|
|
|
RExC_parse-seqstart, seqstart); |
8640
|
|
|
|
|
|
/*NOTREACHED*/ |
8641
|
|
|
|
|
|
} |
8642
|
|
|
|
|
|
|
8643
|
14
|
|
|
|
|
++RExC_parse; |
8644
|
|
|
|
|
|
} |
8645
|
|
|
|
|
|
} |
8646
|
|
|
|
|
|
|
8647
|
|
|
|
|
|
/* |
8648
|
|
|
|
|
|
- reg - regular expression, i.e. main body or parenthesized thing |
8649
|
|
|
|
|
|
* |
8650
|
|
|
|
|
|
* Caller must absorb opening parenthesis. |
8651
|
|
|
|
|
|
* |
8652
|
|
|
|
|
|
* Combining parenthesis handling with the base level of regular expression |
8653
|
|
|
|
|
|
* is a trifle forced, but the need to tie the tails of the branches to what |
8654
|
|
|
|
|
|
* follows makes it hard to avoid. |
8655
|
|
|
|
|
|
*/ |
8656
|
|
|
|
|
|
#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) |
8657
|
|
|
|
|
|
#ifdef DEBUGGING |
8658
|
|
|
|
|
|
#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) |
8659
|
|
|
|
|
|
#else |
8660
|
|
|
|
|
|
#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) |
8661
|
|
|
|
|
|
#endif |
8662
|
|
|
|
|
|
|
8663
|
|
|
|
|
|
/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets |
8664
|
|
|
|
|
|
flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan |
8665
|
|
|
|
|
|
needs to be restarted. |
8666
|
|
|
|
|
|
Otherwise would only return NULL if regbranch() returns NULL, which |
8667
|
|
|
|
|
|
cannot happen. */ |
8668
|
|
|
|
|
|
STATIC regnode * |
8669
|
558
|
|
|
|
|
S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) |
8670
|
|
|
|
|
|
/* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. |
8671
|
|
|
|
|
|
* 2 is like 1, but indicates that nextchar() has been called to advance |
8672
|
|
|
|
|
|
* RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and |
8673
|
|
|
|
|
|
* this flag alerts us to the need to check for that */ |
8674
|
|
|
|
|
|
{ |
8675
|
|
|
|
|
|
dVAR; |
8676
|
|
|
|
|
|
regnode *ret; /* Will be the head of the group. */ |
8677
|
|
|
|
|
|
regnode *br; |
8678
|
|
|
|
|
|
regnode *lastbr; |
8679
|
|
|
|
|
|
regnode *ender = NULL; |
8680
|
|
|
|
|
|
I32 parno = 0; |
8681
|
|
|
|
|
|
I32 flags; |
8682
|
558
|
|
|
|
|
U32 oregflags = RExC_flags; |
8683
|
|
|
|
|
|
bool have_branch = 0; |
8684
|
|
|
|
|
|
bool is_open = 0; |
8685
|
|
|
|
|
|
I32 freeze_paren = 0; |
8686
|
|
|
|
|
|
I32 after_freeze = 0; |
8687
|
|
|
|
|
|
|
8688
|
848
|
|
|
|
|
char * parse_start = RExC_parse; /* MJD */ |
8689
|
600
|
|
|
|
|
char * const oregcomp_parse = RExC_parse; |
8690
|
|
|
|
|
|
|
8691
|
600
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
8692
|
|
|
|
|
|
|
8693
|
596
|
|
|
|
|
PERL_ARGS_ASSERT_REG; |
8694
|
596
|
|
|
|
|
DEBUG_PARSE("reg "); |
8695
|
|
|
|
|
|
|
8696
|
688
|
|
|
|
|
*flagp = 0; /* Tentatively. */ |
8697
|
|
|
|
|
|
|
8698
|
|
|
|
|
|
|
8699
|
|
|
|
|
|
/* Make an OPEN node, if parenthesized. */ |
8700
|
576
|
|
|
|
|
if (paren) { |
8701
|
|
|
|
|
|
|
8702
|
|
|
|
|
|
/* Under /x, space and comments can be gobbled up between the '(' and |
8703
|
|
|
|
|
|
* here (if paren ==2). The forms '(*VERB' and '(?...' disallow such |
8704
|
|
|
|
|
|
* intervening space, as the sequence is a token, and a token should be |
8705
|
|
|
|
|
|
* indivisible */ |
8706
|
284
|
|
|
|
|
bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '('; |
8707
|
|
|
|
|
|
|
8708
|
284
|
|
|
|
|
if ( *RExC_parse == '*') { /* (*VERB:ARG) */ |
8709
|
20
|
|
|
|
|
char *start_verb = RExC_parse; |
8710
|
|
|
|
|
|
STRLEN verb_len = 0; |
8711
|
|
|
|
|
|
char *start_arg = NULL; |
8712
|
|
|
|
|
|
unsigned char op = 0; |
8713
|
|
|
|
|
|
int argok = 1; |
8714
|
|
|
|
|
|
int internal_argval = 0; /* internal_argval is only useful if !argok */ |
8715
|
|
|
|
|
|
|
8716
|
0
|
|
|
|
|
if (has_intervening_patws && SIZE_ONLY) { |
8717
|
0
|
|
|
|
|
ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); |
8718
|
|
|
|
|
|
} |
8719
|
20
|
|
|
|
|
while ( *RExC_parse && *RExC_parse != ')' ) { |
8720
|
3532
|
|
|
|
|
if ( *RExC_parse == ':' ) { |
8721
|
4759080
|
|
|
|
|
start_arg = RExC_parse + 1; |
8722
|
4759080
|
|
|
|
|
break; |
8723
|
|
|
|
|
|
} |
8724
|
4759080
|
|
|
|
|
RExC_parse++; |
8725
|
|
|
|
|
|
} |
8726
|
30
|
|
|
|
|
++start_verb; |
8727
|
30
|
|
|
|
|
verb_len = RExC_parse - start_verb; |
8728
|
5220200
|
|
|
|
|
if ( start_arg ) { |
8729
|
36693902
|
|
|
|
|
RExC_parse++; |
8730
|
36693902
|
|
|
|
|
while ( *RExC_parse && *RExC_parse != ')' ) |
8731
|
36693902
|
|
|
|
|
RExC_parse++; |
8732
|
36693902
|
|
|
|
|
if ( *RExC_parse != ')' ) |
8733
|
36693902
|
|
|
|
|
vFAIL("Unterminated verb pattern argument"); |
8734
|
36693902
|
|
|
|
|
if ( RExC_parse == start_arg ) |
8735
|
|
|
|
|
|
start_arg = NULL; |
8736
|
|
|
|
|
|
} else { |
8737
|
19448076
|
|
|
|
|
if ( *RExC_parse != ')' ) |
8738
|
19448076
|
|
|
|
|
vFAIL("Unterminated verb pattern"); |
8739
|
|
|
|
|
|
} |
8740
|
|
|
|
|
|
|
8741
|
4380
|
|
|
|
|
switch ( *start_verb ) { |
8742
|
|
|
|
|
|
case 'A': /* (*ACCEPT) */ |
8743
|
4380
|
|
|
|
|
if ( memEQs(start_verb,verb_len,"ACCEPT") ) { |
8744
|
|
|
|
|
|
op = ACCEPT; |
8745
|
2193
|
|
|
|
|
internal_argval = RExC_nestroot; |
8746
|
|
|
|
|
|
} |
8747
|
|
|
|
|
|
break; |
8748
|
|
|
|
|
|
case 'C': /* (*COMMIT) */ |
8749
|
28824
|
|
|
|
|
if ( memEQs(start_verb,verb_len,"COMMIT") ) |
8750
|
|
|
|
|
|
op = COMMIT; |
8751
|
|
|
|
|
|
break; |
8752
|
|
|
|
|
|
case 'F': /* (*FAIL) */ |
8753
|
24832
|
|
|
|
|
if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { |
8754
|
|
|
|
|
|
op = OPFAIL; |
8755
|
|
|
|
|
|
argok = 0; |
8756
|
|
|
|
|
|
} |
8757
|
|
|
|
|
|
break; |
8758
|
|
|
|
|
|
case ':': /* (*:NAME) */ |
8759
|
|
|
|
|
|
case 'M': /* (*MARK:NAME) */ |
8760
|
388
|
|
|
|
|
if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { |
8761
|
|
|
|
|
|
op = MARKPOINT; |
8762
|
|
|
|
|
|
argok = -1; |
8763
|
|
|
|
|
|
} |
8764
|
|
|
|
|
|
break; |
8765
|
|
|
|
|
|
case 'P': /* (*PRUNE) */ |
8766
|
388
|
|
|
|
|
if ( memEQs(start_verb,verb_len,"PRUNE") ) |
8767
|
|
|
|
|
|
op = PRUNE; |
8768
|
|
|
|
|
|
break; |
8769
|
|
|
|
|
|
case 'S': /* (*SKIP) */ |
8770
|
24444
|
|
|
|
|
if ( memEQs(start_verb,verb_len,"SKIP") ) |
8771
|
|
|
|
|
|
op = SKIP; |
8772
|
|
|
|
|
|
break; |
8773
|
|
|
|
|
|
case 'T': /* (*THEN) */ |
8774
|
|
|
|
|
|
/* [19:06] :: is then */ |
8775
|
4380
|
|
|
|
|
if ( memEQs(start_verb,verb_len,"THEN") ) { |
8776
|
|
|
|
|
|
op = CUTGROUP; |
8777
|
4380
|
|
|
|
|
RExC_seen |= REG_SEEN_CUTGROUP; |
8778
|
|
|
|
|
|
} |
8779
|
|
|
|
|
|
break; |
8780
|
|
|
|
|
|
} |
8781
|
4380
|
|
|
|
|
if ( ! op ) { |
8782
|
388
|
|
|
|
|
RExC_parse++; |
8783
|
1462
|
|
|
|
|
vFAIL3("Unknown verb pattern '%.*s'", |
8784
|
|
|
|
|
|
verb_len, start_verb); |
8785
|
|
|
|
|
|
} |
8786
|
880
|
|
|
|
|
if ( argok ) { |
8787
|
388
|
|
|
|
|
if ( start_arg && internal_argval ) { |
8788
|
0
|
|
|
|
|
vFAIL3("Verb pattern '%.*s' may not have an argument", |
8789
|
|
|
|
|
|
verb_len, start_verb); |
8790
|
388
|
|
|
|
|
} else if ( argok < 0 && !start_arg ) { |
8791
|
3992
|
|
|
|
|
vFAIL3("Verb pattern '%.*s' has a mandatory argument", |
8792
|
|
|
|
|
|
verb_len, start_verb); |
8793
|
|
|
|
|
|
} else { |
8794
|
0
|
|
|
|
|
ret = reganode(pRExC_state, op, internal_argval); |
8795
|
4380
|
|
|
|
|
if ( ! internal_argval && ! SIZE_ONLY ) { |
8796
|
232
|
|
|
|
|
if (start_arg) { |
8797
|
232
|
|
|
|
|
SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); |
8798
|
108
|
|
|
|
|
ARG(ret) = add_data( pRExC_state, 1, "S" ); |
8799
|
508
|
|
|
|
|
RExC_rxi->data->data[ARG(ret)]=(void*)sv; |
8800
|
224
|
|
|
|
|
ret->flags = 0; |
8801
|
|
|
|
|
|
} else { |
8802
|
2436
|
|
|
|
|
ret->flags = 1; |
8803
|
|
|
|
|
|
} |
8804
|
|
|
|
|
|
} |
8805
|
|
|
|
|
|
} |
8806
|
324
|
|
|
|
|
if (!internal_argval) |
8807
|
520
|
|
|
|
|
RExC_seen |= REG_SEEN_VERBARG; |
8808
|
520
|
|
|
|
|
} else if ( start_arg ) { |
8809
|
4380
|
|
|
|
|
vFAIL3("Verb pattern '%.*s' may not have an argument", |
8810
|
|
|
|
|
|
verb_len, start_verb); |
8811
|
|
|
|
|
|
} else { |
8812
|
28
|
|
|
|
|
ret = reg_node(pRExC_state, op); |
8813
|
|
|
|
|
|
} |
8814
|
28
|
|
|
|
|
nextchar(pRExC_state); |
8815
|
4352
|
|
|
|
|
return ret; |
8816
|
|
|
|
|
|
} |
8817
|
4108
|
|
|
|
|
else if (*RExC_parse == '?') { /* (?...) */ |
8818
|
|
|
|
|
|
bool is_logical = 0; |
8819
|
252
|
|
|
|
|
const char * const seqstart = RExC_parse; |
8820
|
4096
|
|
|
|
|
if (has_intervening_patws && SIZE_ONLY) { |
8821
|
0
|
|
|
|
|
ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated"); |
8822
|
|
|
|
|
|
} |
8823
|
|
|
|
|
|
|
8824
|
4096
|
|
|
|
|
RExC_parse++; |
8825
|
4096
|
|
|
|
|
paren = *RExC_parse++; |
8826
|
|
|
|
|
|
ret = NULL; /* For look-ahead/behind. */ |
8827
|
2058
|
|
|
|
|
switch (paren) { |
8828
|
|
|
|
|
|
|
8829
|
|
|
|
|
|
case 'P': /* (?P...) variants for those used to PCRE/Python */ |
8830
|
194
|
|
|
|
|
paren = *RExC_parse++; |
8831
|
194
|
|
|
|
|
if ( paren == '<') /* (?P<...>) named capture */ |
8832
|
|
|
|
|
|
goto named_capture; |
8833
|
194
|
|
|
|
|
else if (paren == '>') { /* (?P>name) named recursion */ |
8834
|
|
|
|
|
|
goto named_recursion; |
8835
|
|
|
|
|
|
} |
8836
|
194
|
|
|
|
|
else if (paren == '=') { /* (?P=...) named backref */ |
8837
|
|
|
|
|
|
/* this pretty much dupes the code for \k in regatom(), if |
8838
|
|
|
|
|
|
you change this make sure you change that */ |
8839
|
1612
|
|
|
|
|
char* name_start = RExC_parse; |
8840
|
|
|
|
|
|
U32 num = 0; |
8841
|
3844
|
|
|
|
|
SV *sv_dat = reg_scan_name(pRExC_state, |
8842
|
|
|
|
|
|
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); |
8843
|
3728
|
|
|
|
|
if (RExC_parse == name_start || *RExC_parse != ')') |
8844
|
508
|
|
|
|
|
vFAIL2("Sequence %.3s... not terminated",parse_start); |
8845
|
|
|
|
|
|
|
8846
|
0
|
|
|
|
|
if (!SIZE_ONLY) { |
8847
|
508
|
|
|
|
|
num = add_data( pRExC_state, 1, "S" ); |
8848
|
4352
|
|
|
|
|
RExC_rxi->data->data[num]=(void*)sv_dat; |
8849
|
4352
|
|
|
|
|
SvREFCNT_inc_simple_void(sv_dat); |
8850
|
|
|
|
|
|
} |
8851
|
19443696
|
|
|
|
|
RExC_sawback = 1; |
8852
|
8024360
|
|
|
|
|
ret = reganode(pRExC_state, |
8853
|
|
|
|
|
|
((! FOLD) |
8854
|
|
|
|
|
|
? NREF |
8855
|
|
|
|
|
|
: (ASCII_FOLD_RESTRICTED) |
8856
|
|
|
|
|
|
? NREFFA |
8857
|
|
|
|
|
|
: (AT_LEAST_UNI_SEMANTICS) |
8858
|
|
|
|
|
|
? NREFFU |
8859
|
|
|
|
|
|
: (LOC) |
8860
|
|
|
|
|
|
? NREFFL |
8861
|
|
|
|
|
|
: NREFF), |
8862
|
|
|
|
|
|
num); |
8863
|
8024360
|
|
|
|
|
*flagp |= HASWIDTH; |
8864
|
|
|
|
|
|
|
8865
|
6
|
|
|
|
|
Set_Node_Offset(ret, parse_start+1); |
8866
|
8024360
|
|
|
|
|
Set_Node_Cur_Length(ret, parse_start); |
8867
|
|
|
|
|
|
|
8868
|
8024360
|
|
|
|
|
nextchar(pRExC_state); |
8869
|
8024360
|
|
|
|
|
return ret; |
8870
|
|
|
|
|
|
} |
8871
|
3686
|
|
|
|
|
RExC_parse++; |
8872
|
3686
|
|
|
|
|
vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); |
8873
|
|
|
|
|
|
/*NOTREACHED*/ |
8874
|
|
|
|
|
|
case '<': /* (?<...) */ |
8875
|
1174
|
|
|
|
|
if (*RExC_parse == '!') |
8876
|
|
|
|
|
|
paren = ','; |
8877
|
942
|
|
|
|
|
else if (*RExC_parse != '=') |
8878
|
|
|
|
|
|
named_capture: |
8879
|
|
|
|
|
|
{ /* (?<...>) */ |
8880
|
|
|
|
|
|
char *name_start; |
8881
|
|
|
|
|
|
SV *svname; |
8882
|
|
|
|
|
|
paren= '>'; |
8883
|
|
|
|
|
|
case '\'': /* (?'...') */ |
8884
|
928
|
|
|
|
|
name_start= RExC_parse; |
8885
|
928
|
|
|
|
|
svname = reg_scan_name(pRExC_state, |
8886
|
|
|
|
|
|
SIZE_ONLY ? /* reverse test from the others */ |
8887
|
|
|
|
|
|
REG_RSN_RETURN_NAME : |
8888
|
|
|
|
|
|
REG_RSN_RETURN_NULL); |
8889
|
928
|
|
|
|
|
if (RExC_parse == name_start) { |
8890
|
0
|
|
|
|
|
RExC_parse++; |
8891
|
928
|
|
|
|
|
vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); |
8892
|
|
|
|
|
|
/*NOTREACHED*/ |
8893
|
|
|
|
|
|
} |
8894
|
464
|
|
|
|
|
if (*RExC_parse != paren) |
8895
|
464
|
|
|
|
|
vFAIL2("Sequence (?%c... not terminated", |
8896
|
|
|
|
|
|
paren=='>' ? '<' : paren); |
8897
|
464
|
|
|
|
|
if (SIZE_ONLY) { |
8898
|
|
|
|
|
|
HE *he_str; |
8899
|
|
|
|
|
|
SV *sv_dat = NULL; |
8900
|
928
|
|
|
|
|
if (!svname) /* shouldn't happen */ |
8901
|
928
|
|
|
|
|
Perl_croak(aTHX_ |
8902
|
|
|
|
|
|
"panic: reg_scan_name returned NULL"); |
8903
|
928
|
|
|
|
|
if (!RExC_paren_names) { |
8904
|
928
|
|
|
|
|
RExC_paren_names= newHV(); |
8905
|
928
|
|
|
|
|
sv_2mortal(MUTABLE_SV(RExC_paren_names)); |
8906
|
|
|
|
|
|
#ifdef DEBUGGING |
8907
|
14
|
|
|
|
|
RExC_paren_name_list= newAV(); |
8908
|
14
|
|
|
|
|
sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); |
8909
|
|
|
|
|
|
#endif |
8910
|
|
|
|
|
|
} |
8911
|
110910
|
|
|
|
|
he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); |
8912
|
47126
|
|
|
|
|
if ( he_str ) |
8913
|
22962
|
|
|
|
|
sv_dat = HeVAL(he_str); |
8914
|
22962
|
|
|
|
|
if ( ! sv_dat ) { |
8915
|
|
|
|
|
|
/* croak baby croak */ |
8916
|
22806
|
|
|
|
|
Perl_croak(aTHX_ |
8917
|
|
|
|
|
|
"panic: paren_name hash element allocation failed"); |
8918
|
0
|
|
|
|
|
} else if ( SvPOK(sv_dat) ) { |
8919
|
|
|
|
|
|
/* (?|...) can mean we have dupes so scan to check |
8920
|
|
|
|
|
|
its already been stored. Maybe a flag indicating |
8921
|
|
|
|
|
|
we are inside such a construct would be useful, |
8922
|
|
|
|
|
|
but the arrays are likely to be quite small, so |
8923
|
|
|
|
|
|
for now we punt -- dmq */ |
8924
|
0
|
|
|
|
|
IV count = SvIV(sv_dat); |
8925
|
22806
|
|
|
|
|
I32 *pv = (I32*)SvPVX(sv_dat); |
8926
|
|
|
|
|
|
IV i; |
8927
|
0
|
|
|
|
|
for ( i = 0 ; i < count ; i++ ) { |
8928
|
22806
|
|
|
|
|
if ( pv[i] == RExC_npar ) { |
8929
|
|
|
|
|
|
count = 0; |
8930
|
|
|
|
|
|
break; |
8931
|
|
|
|
|
|
} |
8932
|
|
|
|
|
|
} |
8933
|
12236
|
|
|
|
|
if ( count ) { |
8934
|
0
|
|
|
|
|
pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); |
8935
|
12236
|
|
|
|
|
SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); |
8936
|
10476
|
|
|
|
|
pv[count] = RExC_npar; |
8937
|
10476
|
|
|
|
|
SvIV_set(sv_dat, SvIVX(sv_dat) + 1); |
8938
|
|
|
|
|
|
} |
8939
|
|
|
|
|
|
} else { |
8940
|
12236
|
|
|
|
|
(void)SvUPGRADE(sv_dat,SVt_PVNV); |
8941
|
12236
|
|
|
|
|
sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); |
8942
|
12236
|
|
|
|
|
SvIOK_on(sv_dat); |
8943
|
12236
|
|
|
|
|
SvIV_set(sv_dat, 1); |
8944
|
|
|
|
|
|
} |
8945
|
|
|
|
|
|
#ifdef DEBUGGING |
8946
|
|
|
|
|
|
/* Yes this does cause a memory leak in debugging Perls */ |
8947
|
0
|
|
|
|
|
if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) |
8948
|
18110
|
|
|
|
|
SvREFCNT_dec_NN(svname); |
8949
|
|
|
|
|
|
#endif |
8950
|
|
|
|
|
|
|
8951
|
|
|
|
|
|
/*sv_dump(sv_dat);*/ |
8952
|
|
|
|
|
|
} |
8953
|
488
|
|
|
|
|
nextchar(pRExC_state); |
8954
|
|
|
|
|
|
paren = 1; |
8955
|
488
|
|
|
|
|
goto capturing_parens; |
8956
|
|
|
|
|
|
} |
8957
|
1146
|
|
|
|
|
RExC_seen |= REG_SEEN_LOOKBEHIND; |
8958
|
662
|
|
|
|
|
RExC_in_lookbehind++; |
8959
|
488
|
|
|
|
|
RExC_parse++; |
8960
|
|
|
|
|
|
case '=': /* (?=...) */ |
8961
|
484
|
|
|
|
|
RExC_seen_zerolen++; |
8962
|
484
|
|
|
|
|
break; |
8963
|
|
|
|
|
|
case '!': /* (?!...) */ |
8964
|
484
|
|
|
|
|
RExC_seen_zerolen++; |
8965
|
484
|
|
|
|
|
if (*RExC_parse == ')') { |
8966
|
17622
|
|
|
|
|
ret=reg_node(pRExC_state, OPFAIL); |
8967
|
11748
|
|
|
|
|
nextchar(pRExC_state); |
8968
|
11748
|
|
|
|
|
return ret; |
8969
|
|
|
|
|
|
} |
8970
|
|
|
|
|
|
break; |
8971
|
|
|
|
|
|
case '|': /* (?|...) */ |
8972
|
|
|
|
|
|
/* branch reset, behave like a (?:...) except that |
8973
|
|
|
|
|
|
buffers in alternations share the same numbers */ |
8974
|
|
|
|
|
|
paren = ':'; |
8975
|
11748
|
|
|
|
|
after_freeze = freeze_paren = RExC_npar; |
8976
|
22806
|
|
|
|
|
break; |
8977
|
|
|
|
|
|
case ':': /* (?:...) */ |
8978
|
|
|
|
|
|
case '>': /* (?>...) */ |
8979
|
|
|
|
|
|
break; |
8980
|
|
|
|
|
|
case '$': /* (?$...) */ |
8981
|
|
|
|
|
|
case '@': /* (?@...) */ |
8982
|
22806
|
|
|
|
|
vFAIL2("Sequence (?%c...) not implemented", (int)paren); |
8983
|
|
|
|
|
|
break; |
8984
|
|
|
|
|
|
case '#': /* (?#...) */ |
8985
|
|
|
|
|
|
/* XXX As soon as we disallow separating the '?' and '*' (by |
8986
|
|
|
|
|
|
* spaces or (?#...) comment), it is believed that this case |
8987
|
|
|
|
|
|
* will be unreachable and can be removed. See |
8988
|
|
|
|
|
|
* [perl #117327] */ |
8989
|
93452
|
|
|
|
|
while (*RExC_parse && *RExC_parse != ')') |
8990
|
93452
|
|
|
|
|
RExC_parse++; |
8991
|
93452
|
|
|
|
|
if (*RExC_parse != ')') |
8992
|
149934
|
|
|
|
|
FAIL("Sequence (?#... not terminated"); |
8993
|
149934
|
|
|
|
|
nextchar(pRExC_state); |
8994
|
290252
|
|
|
|
|
*flagp = TRYAGAIN; |
8995
|
290252
|
|
|
|
|
return NULL; |
8996
|
|
|
|
|
|
case '0' : /* (?0) */ |
8997
|
|
|
|
|
|
case 'R' : /* (?R) */ |
8998
|
348
|
|
|
|
|
if (*RExC_parse != ')') |
8999
|
348
|
|
|
|
|
FAIL("Sequence (?R) not terminated"); |
9000
|
348
|
|
|
|
|
ret = reg_node(pRExC_state, GOSTART); |
9001
|
2676
|
|
|
|
|
*flagp |= POSTPONED; |
9002
|
2676
|
|
|
|
|
nextchar(pRExC_state); |
9003
|
2
|
|
|
|
|
return ret; |
9004
|
|
|
|
|
|
/*notreached*/ |
9005
|
|
|
|
|
|
{ /* named and numeric backreferences */ |
9006
|
|
|
|
|
|
I32 num; |
9007
|
|
|
|
|
|
case '&': /* (?&NAME) */ |
9008
|
0
|
|
|
|
|
parse_start = RExC_parse - 1; |
9009
|
|
|
|
|
|
named_recursion: |
9010
|
|
|
|
|
|
{ |
9011
|
0
|
|
|
|
|
SV *sv_dat = reg_scan_name(pRExC_state, |
9012
|
|
|
|
|
|
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); |
9013
|
0
|
|
|
|
|
num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; |
9014
|
|
|
|
|
|
} |
9015
|
0
|
|
|
|
|
goto gen_recurse_regop; |
9016
|
|
|
|
|
|
assert(0); /* NOT REACHED */ |
9017
|
|
|
|
|
|
case '+': |
9018
|
0
|
|
|
|
|
if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { |
9019
|
0
|
|
|
|
|
RExC_parse++; |
9020
|
0
|
|
|
|
|
vFAIL("Illegal pattern"); |
9021
|
|
|
|
|
|
} |
9022
|
|
|
|
|
|
goto parse_recursion; |
9023
|
|
|
|
|
|
/* NOT REACHED*/ |
9024
|
|
|
|
|
|
case '-': /* (?-1) */ |
9025
|
128
|
|
|
|
|
if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { |
9026
|
0
|
|
|
|
|
RExC_parse--; /* rewind to let it be handled later */ |
9027
|
128
|
|
|
|
|
goto parse_flags; |
9028
|
|
|
|
|
|
} |
9029
|
|
|
|
|
|
/*FALLTHROUGH */ |
9030
|
|
|
|
|
|
case '1': case '2': case '3': case '4': /* (?1) */ |
9031
|
|
|
|
|
|
case '5': case '6': case '7': case '8': case '9': |
9032
|
128
|
|
|
|
|
RExC_parse--; |
9033
|
|
|
|
|
|
parse_recursion: |
9034
|
128
|
|
|
|
|
num = atoi(RExC_parse); |
9035
|
128
|
|
|
|
|
parse_start = RExC_parse - 1; /* MJD */ |
9036
|
3516
|
|
|
|
|
if (*RExC_parse == '-') |
9037
|
3516
|
|
|
|
|
RExC_parse++; |
9038
|
3516
|
|
|
|
|
while (isDIGIT(*RExC_parse)) |
9039
|
348
|
|
|
|
|
RExC_parse++; |
9040
|
0
|
|
|
|
|
if (*RExC_parse!=')') |
9041
|
0
|
|
|
|
|
vFAIL("Expecting close bracket"); |
9042
|
|
|
|
|
|
|
9043
|
|
|
|
|
|
gen_recurse_regop: |
9044
|
3304
|
|
|
|
|
if ( paren == '-' ) { |
9045
|
|
|
|
|
|
/* |
9046
|
|
|
|
|
|
Diagram of capture buffer numbering. |
9047
|
|
|
|
|
|
Top line is the normal capture buffer numbers |
9048
|
|
|
|
|
|
Bottom line is the negative indexing as from |
9049
|
|
|
|
|
|
the X (the (?-2)) |
9050
|
|
|
|
|
|
|
9051
|
|
|
|
|
|
+ 1 2 3 4 5 X 6 7 |
9052
|
|
|
|
|
|
/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ |
9053
|
|
|
|
|
|
- 5 4 3 2 1 X x x |
9054
|
|
|
|
|
|
|
9055
|
|
|
|
|
|
*/ |
9056
|
2944
|
|
|
|
|
num = RExC_npar + num; |
9057
|
2944
|
|
|
|
|
if (num < 1) { |
9058
|
2732
|
|
|
|
|
RExC_parse++; |
9059
|
3080
|
|
|
|
|
vFAIL("Reference to nonexistent group"); |
9060
|
|
|
|
|
|
} |
9061
|
3080
|
|
|
|
|
} else if ( paren == '+' ) { |
9062
|
1720
|
|
|
|
|
num = RExC_npar + num - 1; |
9063
|
|
|
|
|
|
} |
9064
|
|
|
|
|
|
|
9065
|
6160
|
|
|
|
|
ret = reganode(pRExC_state, GOSUB, num); |
9066
|
3080
|
|
|
|
|
if (!SIZE_ONLY) { |
9067
|
3080
|
|
|
|
|
if (num > (I32)RExC_rx->nparens) { |
9068
|
0
|
|
|
|
|
RExC_parse++; |
9069
|
6596
|
|
|
|
|
vFAIL("Reference to nonexistent group"); |
9070
|
|
|
|
|
|
} |
9071
|
360
|
|
|
|
|
ARG2L_SET( ret, RExC_recurse_count++); |
9072
|
360
|
|
|
|
|
RExC_emit++; |
9073
|
0
|
|
|
|
|
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, |
9074
|
|
|
|
|
|
"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); |
9075
|
|
|
|
|
|
} else { |
9076
|
0
|
|
|
|
|
RExC_size++; |
9077
|
|
|
|
|
|
} |
9078
|
6236
|
|
|
|
|
RExC_seen |= REG_SEEN_RECURSE; |
9079
|
348
|
|
|
|
|
Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ |
9080
|
6596
|
|
|
|
|
Set_Node_Offset(ret, parse_start); /* MJD */ |
9081
|
|
|
|
|
|
|
9082
|
6596
|
|
|
|
|
*flagp |= POSTPONED; |
9083
|
3298
|
|
|
|
|
nextchar(pRExC_state); |
9084
|
0
|
|
|
|
|
return ret; |
9085
|
|
|
|
|
|
} /* named and numeric backreferences */ |
9086
|
|
|
|
|
|
assert(0); /* NOT REACHED */ |
9087
|
|
|
|
|
|
|
9088
|
|
|
|
|
|
case '?': /* (??...) */ |
9089
|
|
|
|
|
|
is_logical = 1; |
9090
|
0
|
|
|
|
|
if (*RExC_parse != '{') { |
9091
|
3298
|
|
|
|
|
RExC_parse++; |
9092
|
3298
|
|
|
|
|
vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); |
9093
|
|
|
|
|
|
/*NOTREACHED*/ |
9094
|
|
|
|
|
|
} |
9095
|
3298
|
|
|
|
|
*flagp |= POSTPONED; |
9096
|
6596
|
|
|
|
|
paren = *RExC_parse++; |
9097
|
|
|
|
|
|
/* FALL THROUGH */ |
9098
|
|
|
|
|
|
case '{': /* (?{...}) */ |
9099
|
|
|
|
|
|
{ |
9100
|
|
|
|
|
|
U32 n = 0; |
9101
|
|
|
|
|
|
struct reg_code_block *cb; |
9102
|
|
|
|
|
|
|
9103
|
6756
|
|
|
|
|
RExC_seen_zerolen++; |
9104
|
|
|
|
|
|
|
9105
|
6756
|
|
|
|
|
if ( !pRExC_state->num_code_blocks |
9106
|
6756
|
|
|
|
|
|| pRExC_state->code_index >= pRExC_state->num_code_blocks |
9107
|
223756
|
|
|
|
|
|| pRExC_state->code_blocks[pRExC_state->code_index].start |
9108
|
320
|
|
|
|
|
!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) |
9109
|
160
|
|
|
|
|
- RExC_start) |
9110
|
|
|
|
|
|
) { |
9111
|
223436
|
|
|
|
|
if (RExC_pm_flags & PMf_USE_RE_EVAL) |
9112
|
223436
|
|
|
|
|
FAIL("panic: Sequence (?{...}): no code block found\n"); |
9113
|
228792
|
|
|
|
|
FAIL("Eval-group not allowed at runtime, use re 'eval'"); |
9114
|
|
|
|
|
|
} |
9115
|
|
|
|
|
|
/* this is a pre-compiled code block (?{...}) */ |
9116
|
228952
|
|
|
|
|
cb = &pRExC_state->code_blocks[pRExC_state->code_index]; |
9117
|
228940
|
|
|
|
|
RExC_parse = RExC_start + cb->end; |
9118
|
343330
|
|
|
|
|
if (!SIZE_ONLY) { |
9119
|
343250
|
|
|
|
|
OP *o = cb->block; |
9120
|
228860
|
|
|
|
|
if (cb->src_regex) { |
9121
|
72
|
|
|
|
|
n = add_data(pRExC_state, 2, "rl"); |
9122
|
96
|
|
|
|
|
RExC_rxi->data->data[n] = |
9123
|
72
|
|
|
|
|
(void*)SvREFCNT_inc((SV*)cb->src_regex); |
9124
|
228816
|
|
|
|
|
RExC_rxi->data->data[n+1] = (void*)o; |
9125
|
|
|
|
|
|
} |
9126
|
|
|
|
|
|
else { |
9127
|
228800
|
|
|
|
|
n = add_data(pRExC_state, 1, |
9128
|
228800
|
|
|
|
|
(RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); |
9129
|
114366
|
|
|
|
|
RExC_rxi->data->data[n] = (void*)o; |
9130
|
|
|
|
|
|
} |
9131
|
|
|
|
|
|
} |
9132
|
114494
|
|
|
|
|
pRExC_state->code_index++; |
9133
|
103446
|
|
|
|
|
nextchar(pRExC_state); |
9134
|
|
|
|
|
|
|
9135
|
206732
|
|
|
|
|
if (is_logical) { |
9136
|
|
|
|
|
|
regnode *eval; |
9137
|
103286
|
|
|
|
|
ret = reg_node(pRExC_state, LOGICAL); |
9138
|
103286
|
|
|
|
|
eval = reganode(pRExC_state, EVAL, n); |
9139
|
11048
|
|
|
|
|
if (!SIZE_ONLY) { |
9140
|
11048
|
|
|
|
|
ret->flags = 2; |
9141
|
|
|
|
|
|
/* for later propagation into (??{}) return value */ |
9142
|
11048
|
|
|
|
|
eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); |
9143
|
|
|
|
|
|
} |
9144
|
228768
|
|
|
|
|
REGTAIL(pRExC_state, ret, eval); |
9145
|
|
|
|
|
|
/* deal with the length of this later - MJD */ |
9146
|
228768
|
|
|
|
|
return ret; |
9147
|
|
|
|
|
|
} |
9148
|
228928
|
|
|
|
|
ret = reganode(pRExC_state, EVAL, n); |
9149
|
223596
|
|
|
|
|
Set_Node_Length(ret, RExC_parse - parse_start + 1); |
9150
|
223596
|
|
|
|
|
Set_Node_Offset(ret, parse_start); |
9151
|
|
|
|
|
|
return ret; |
9152
|
|
|
|
|
|
} |
9153
|
|
|
|
|
|
case '(': /* (?(?{...})...) and (?(?=...)...) */ |
9154
|
|
|
|
|
|
{ |
9155
|
|
|
|
|
|
int is_define= 0; |
9156
|
223436
|
|
|
|
|
if (RExC_parse[0] == '?') { /* (?(?...)) */ |
9157
|
111668
|
|
|
|
|
if (RExC_parse[1] == '=' || RExC_parse[1] == '!' |
9158
|
111668
|
|
|
|
|
|| RExC_parse[1] == '<' |
9159
|
223436
|
|
|
|
|
|| RExC_parse[1] == '{') { /* Lookahead or eval. */ |
9160
|
|
|
|
|
|
I32 flag; |
9161
|
|
|
|
|
|
regnode *tail; |
9162
|
|
|
|
|
|
|
9163
|
223436
|
|
|
|
|
ret = reg_node(pRExC_state, LOGICAL); |
9164
|
5332
|
|
|
|
|
if (!SIZE_ONLY) |
9165
|
5332
|
|
|
|
|
ret->flags = 1; |
9166
|
|
|
|
|
|
|
9167
|
6598
|
|
|
|
|
tail = reg(pRExC_state, 1, &flag, depth+1); |
9168
|
2188
|
|
|
|
|
if (flag & RESTART_UTF8) { |
9169
|
944
|
|
|
|
|
*flagp = RESTART_UTF8; |
9170
|
944
|
|
|
|
|
return NULL; |
9171
|
|
|
|
|
|
} |
9172
|
2188
|
|
|
|
|
REGTAIL(pRExC_state, ret, tail); |
9173
|
2188
|
|
|
|
|
goto insert_if; |
9174
|
|
|
|
|
|
} |
9175
|
|
|
|
|
|
} |
9176
|
994
|
|
|
|
|
else if ( RExC_parse[0] == '<' /* (?()...) */ |
9177
|
2188
|
|
|
|
|
|| RExC_parse[0] == '\'' ) /* (?('NAME')...) */ |
9178
|
|
|
|
|
|
{ |
9179
|
2188
|
|
|
|
|
char ch = RExC_parse[0] == '<' ? '>' : '\''; |
9180
|
200
|
|
|
|
|
char *name_start= RExC_parse++; |
9181
|
|
|
|
|
|
U32 num = 0; |
9182
|
200
|
|
|
|
|
SV *sv_dat=reg_scan_name(pRExC_state, |
9183
|
|
|
|
|
|
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); |
9184
|
1988
|
|
|
|
|
if (RExC_parse == name_start || *RExC_parse != ch) |
9185
|
1988
|
|
|
|
|
vFAIL2("Sequence (?(%c... not terminated", |
9186
|
|
|
|
|
|
(ch == '>' ? '<' : ch)); |
9187
|
6615
|
|
|
|
|
RExC_parse++; |
9188
|
4410
|
|
|
|
|
if (!SIZE_ONLY) { |
9189
|
232
|
|
|
|
|
num = add_data( pRExC_state, 1, "S" ); |
9190
|
232
|
|
|
|
|
RExC_rxi->data->data[num]=(void*)sv_dat; |
9191
|
232
|
|
|
|
|
SvREFCNT_inc_simple_void(sv_dat); |
9192
|
|
|
|
|
|
} |
9193
|
232
|
|
|
|
|
ret = reganode(pRExC_state,NGROUPP,num); |
9194
|
0
|
|
|
|
|
goto insert_if_check_paren; |
9195
|
|
|
|
|
|
} |
9196
|
232
|
|
|
|
|
else if (RExC_parse[0] == 'D' && |
9197
|
232
|
|
|
|
|
RExC_parse[1] == 'E' && |
9198
|
116
|
|
|
|
|
RExC_parse[2] == 'F' && |
9199
|
116
|
|
|
|
|
RExC_parse[3] == 'I' && |
9200
|
116
|
|
|
|
|
RExC_parse[4] == 'N' && |
9201
|
232
|
|
|
|
|
RExC_parse[5] == 'E') |
9202
|
|
|
|
|
|
{ |
9203
|
232
|
|
|
|
|
ret = reganode(pRExC_state,DEFINEP,0); |
9204
|
4356
|
|
|
|
|
RExC_parse +=6 ; |
9205
|
|
|
|
|
|
is_define = 1; |
9206
|
534
|
|
|
|
|
goto insert_if_check_paren; |
9207
|
|
|
|
|
|
} |
9208
|
534
|
|
|
|
|
else if (RExC_parse[0] == 'R') { |
9209
|
534
|
|
|
|
|
RExC_parse++; |
9210
|
|
|
|
|
|
parno = 0; |
9211
|
534
|
|
|
|
|
if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { |
9212
|
356
|
|
|
|
|
parno = atoi(RExC_parse++); |
9213
|
356
|
|
|
|
|
while (isDIGIT(*RExC_parse)) |
9214
|
356
|
|
|
|
|
RExC_parse++; |
9215
|
356
|
|
|
|
|
} else if (RExC_parse[0] == '&') { |
9216
|
|
|
|
|
|
SV *sv_dat; |
9217
|
3822
|
|
|
|
|
RExC_parse++; |
9218
|
696
|
|
|
|
|
sv_dat = reg_scan_name(pRExC_state, |
9219
|
|
|
|
|
|
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); |
9220
|
696
|
|
|
|
|
parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; |
9221
|
|
|
|
|
|
} |
9222
|
232
|
|
|
|
|
ret = reganode(pRExC_state,INSUBP,parno); |
9223
|
348
|
|
|
|
|
goto insert_if_check_paren; |
9224
|
|
|
|
|
|
} |
9225
|
0
|
|
|
|
|
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { |
9226
|
|
|
|
|
|
/* (?(1)...) */ |
9227
|
|
|
|
|
|
char c; |
9228
|
464
|
|
|
|
|
parno = atoi(RExC_parse++); |
9229
|
|
|
|
|
|
|
9230
|
232
|
|
|
|
|
while (isDIGIT(*RExC_parse)) |
9231
|
232
|
|
|
|
|
RExC_parse++; |
9232
|
232
|
|
|
|
|
ret = reganode(pRExC_state, GROUPP, parno); |
9233
|
|
|
|
|
|
|
9234
|
|
|
|
|
|
insert_if_check_paren: |
9235
|
696
|
|
|
|
|
if ((c = *nextchar(pRExC_state)) != ')') |
9236
|
696
|
|
|
|
|
vFAIL("Switch condition not recognized"); |
9237
|
|
|
|
|
|
insert_if: |
9238
|
3126
|
|
|
|
|
REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); |
9239
|
3124
|
|
|
|
|
br = regbranch(pRExC_state, &flags, 1,depth+1); |
9240
|
4686
|
|
|
|
|
if (br == NULL) { |
9241
|
0
|
|
|
|
|
if (flags & RESTART_UTF8) { |
9242
|
3124
|
|
|
|
|
*flagp = RESTART_UTF8; |
9243
|
4408
|
|
|
|
|
return NULL; |
9244
|
|
|
|
|
|
} |
9245
|
16
|
|
|
|
|
FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", |
9246
|
|
|
|
|
|
(UV) flags); |
9247
|
|
|
|
|
|
} else |
9248
|
6380
|
|
|
|
|
REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); |
9249
|
6380
|
|
|
|
|
c = *nextchar(pRExC_state); |
9250
|
6380
|
|
|
|
|
if (flags&HASWIDTH) |
9251
|
0
|
|
|
|
|
*flagp |= HASWIDTH; |
9252
|
0
|
|
|
|
|
if (c == '|') { |
9253
|
0
|
|
|
|
|
if (is_define) |
9254
|
0
|
|
|
|
|
vFAIL("(?(DEFINE)....) does not allow branches"); |
9255
|
6380
|
|
|
|
|
lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ |
9256
|
6380
|
|
|
|
|
if (!regbranch(pRExC_state, &flags, 1,depth+1)) { |
9257
|
6380
|
|
|
|
|
if (flags & RESTART_UTF8) { |
9258
|
6056
|
|
|
|
|
*flagp = RESTART_UTF8; |
9259
|
6380
|
|
|
|
|
return NULL; |
9260
|
|
|
|
|
|
} |
9261
|
3744
|
|
|
|
|
FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", |
9262
|
|
|
|
|
|
(UV) flags); |
9263
|
|
|
|
|
|
} |
9264
|
0
|
|
|
|
|
REGTAIL(pRExC_state, ret, lastbr); |
9265
|
3744
|
|
|
|
|
if (flags&HASWIDTH) |
9266
|
3744
|
|
|
|
|
*flagp |= HASWIDTH; |
9267
|
0
|
|
|
|
|
c = *nextchar(pRExC_state); |
9268
|
|
|
|
|
|
} |
9269
|
|
|
|
|
|
else |
9270
|
|
|
|
|
|
lastbr = NULL; |
9271
|
0
|
|
|
|
|
if (c != ')') |
9272
|
0
|
|
|
|
|
vFAIL("Switch (?(condition)... contains too many branches"); |
9273
|
0
|
|
|
|
|
ender = reg_node(pRExC_state, TAIL); |
9274
|
3744
|
|
|
|
|
REGTAIL(pRExC_state, br, ender); |
9275
|
3744
|
|
|
|
|
if (lastbr) { |
9276
|
3728
|
|
|
|
|
REGTAIL(pRExC_state, lastbr, ender); |
9277
|
3744
|
|
|
|
|
REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); |
9278
|
|
|
|
|
|
} |
9279
|
|
|
|
|
|
else |
9280
|
6380
|
|
|
|
|
REGTAIL(pRExC_state, ret, ender); |
9281
|
16
|
|
|
|
|
RExC_size++; /* XXX WHY do we need this?!! |
9282
|
|
|
|
|
|
For large programs it seems to be required |
9283
|
|
|
|
|
|
but I can't figure out why. -- dmq*/ |
9284
|
6364
|
|
|
|
|
return ret; |
9285
|
|
|
|
|
|
} |
9286
|
|
|
|
|
|
else { |
9287
|
6364
|
|
|
|
|
vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); |
9288
|
|
|
|
|
|
} |
9289
|
|
|
|
|
|
} |
9290
|
|
|
|
|
|
case '[': /* (?[ ... ]) */ |
9291
|
6364
|
|
|
|
|
return handle_regex_sets(pRExC_state, NULL, flagp, depth, |
9292
|
|
|
|
|
|
oregcomp_parse); |
9293
|
|
|
|
|
|
case 0: |
9294
|
3728
|
|
|
|
|
RExC_parse--; /* for vFAIL to print correctly */ |
9295
|
3728
|
|
|
|
|
vFAIL("Sequence (? incomplete"); |
9296
|
|
|
|
|
|
break; |
9297
|
|
|
|
|
|
default: /* e.g., (?i) */ |
9298
|
2724
|
|
|
|
|
--RExC_parse; |
9299
|
|
|
|
|
|
parse_flags: |
9300
|
6452
|
|
|
|
|
parse_lparen_question_flags(pRExC_state); |
9301
|
6452
|
|
|
|
|
if (UCHARAT(RExC_parse) != ':') { |
9302
|
2
|
|
|
|
|
nextchar(pRExC_state); |
9303
|
6250
|
|
|
|
|
*flagp = TRYAGAIN; |
9304
|
16
|
|
|
|
|
return NULL; |
9305
|
|
|
|
|
|
} |
9306
|
|
|
|
|
|
paren = ':'; |
9307
|
104
|
|
|
|
|
nextchar(pRExC_state); |
9308
|
|
|
|
|
|
ret = NULL; |
9309
|
4756252
|
|
|
|
|
goto parse_rest; |
9310
|
|
|
|
|
|
} /* end switch */ |
9311
|
|
|
|
|
|
} |
9312
|
|
|
|
|
|
else { /* (...) */ |
9313
|
|
|
|
|
|
capturing_parens: |
9314
|
4759120
|
|
|
|
|
parno = RExC_npar; |
9315
|
4759080
|
|
|
|
|
RExC_npar++; |
9316
|
|
|
|
|
|
|
9317
|
4752
|
|
|
|
|
ret = reganode(pRExC_state, OPEN, parno); |
9318
|
4752
|
|
|
|
|
if (!SIZE_ONLY ){ |
9319
|
4746
|
|
|
|
|
if (!RExC_nestroot) |
9320
|
4754334
|
|
|
|
|
RExC_nestroot = parno; |
9321
|
4754334
|
|
|
|
|
if (RExC_seen & REG_SEEN_RECURSE |
9322
|
11442142
|
|
|
|
|
&& !RExC_open_parens[parno-1]) |
9323
|
|
|
|
|
|
{ |
9324
|
11442142
|
|
|
|
|
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, |
9325
|
|
|
|
|
|
"Setting open paren #%"IVdf" to %d\n", |
9326
|
|
|
|
|
|
(IV)parno, REG_NODE_NUM(ret))); |
9327
|
11442142
|
|
|
|
|
RExC_open_parens[parno-1]= ret; |
9328
|
|
|
|
|
|
} |
9329
|
|
|
|
|
|
} |
9330
|
11442154
|
|
|
|
|
Set_Node_Length(ret, 1); /* MJD */ |
9331
|
5579271
|
|
|
|
|
Set_Node_Offset(ret, RExC_parse); /* MJD */ |
9332
|
|
|
|
|
|
is_open = 1; |
9333
|
|
|
|
|
|
} |
9334
|
|
|
|
|
|
} |
9335
|
|
|
|
|
|
else /* ! paren */ |
9336
|
|
|
|
|
|
ret = NULL; |
9337
|
|
|
|
|
|
|
9338
|
|
|
|
|
|
parse_rest: |
9339
|
|
|
|
|
|
/* Pick up the branches, linking them together. */ |
9340
|
4247294
|
|
|
|
|
parse_start = RExC_parse; /* MJD */ |
9341
|
5579655
|
|
|
|
|
br = regbranch(pRExC_state, &flags, 1,depth+1); |
9342
|
|
|
|
|
|
|
9343
|
|
|
|
|
|
/* branch_len = (paren != 0); */ |
9344
|
|
|
|
|
|
|
9345
|
5722
|
|
|
|
|
if (br == NULL) { |
9346
|
5348
|
|
|
|
|
if (flags & RESTART_UTF8) { |
9347
|
36434938
|
|
|
|
|
*flagp = RESTART_UTF8; |
9348
|
36429002
|
|
|
|
|
return NULL; |
9349
|
|
|
|
|
|
} |
9350
|
1014112
|
|
|
|
|
FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); |
9351
|
|
|
|
|
|
} |
9352
|
1014484
|
|
|
|
|
if (*RExC_parse == '|') { |
9353
|
1014128
|
|
|
|
|
if (!SIZE_ONLY && RExC_extralen) { |
9354
|
0
|
|
|
|
|
reginsert(pRExC_state, BRANCHJ, br, depth+1); |
9355
|
|
|
|
|
|
} |
9356
|
|
|
|
|
|
else { /* MJD */ |
9357
|
35414882
|
|
|
|
|
reginsert(pRExC_state, BRANCH, br, depth+1); |
9358
|
2245784
|
|
|
|
|
Set_Node_Length(br, paren != 0); |
9359
|
16
|
|
|
|
|
Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); |
9360
|
|
|
|
|
|
} |
9361
|
|
|
|
|
|
have_branch = 1; |
9362
|
2245784
|
|
|
|
|
if (SIZE_ONLY) |
9363
|
2245776
|
|
|
|
|
RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ |
9364
|
|
|
|
|
|
} |
9365
|
1126611
|
|
|
|
|
else if (paren == ':') { |
9366
|
33169186
|
|
|
|
|
*flagp |= flags&SIMPLE; |
9367
|
|
|
|
|
|
} |
9368
|
5906420
|
|
|
|
|
if (is_open) { /* Starts with OPEN. */ |
9369
|
35414878
|
|
|
|
|
REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ |
9370
|
|
|
|
|
|
} |
9371
|
11158994
|
|
|
|
|
else if (paren != '?') /* Not Conditional */ |
9372
|
|
|
|
|
|
ret = br; |
9373
|
24256604
|
|
|
|
|
*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); |
9374
|
|
|
|
|
|
lastbr = br; |
9375
|
35415706
|
|
|
|
|
while (*RExC_parse == '|') { |
9376
|
57545076
|
|
|
|
|
if (!SIZE_ONLY && RExC_extralen) { |
9377
|
4689456
|
|
|
|
|
ender = reganode(pRExC_state, LONGJMP,0); |
9378
|
0
|
|
|
|
|
REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ |
9379
|
|
|
|
|
|
} |
9380
|
96
|
|
|
|
|
if (SIZE_ONLY) |
9381
|
4689504
|
|
|
|
|
RExC_extralen += 2; /* Account for LONGJMP. */ |
9382
|
2348195
|
|
|
|
|
nextchar(pRExC_state); |
9383
|
4689552
|
|
|
|
|
if (freeze_paren) { |
9384
|
4689456
|
|
|
|
|
if (RExC_npar > after_freeze) |
9385
|
2676
|
|
|
|
|
after_freeze = RExC_npar; |
9386
|
2676
|
|
|
|
|
RExC_npar = freeze_paren; |
9387
|
|
|
|
|
|
} |
9388
|
2772
|
|
|
|
|
br = regbranch(pRExC_state, &flags, 0, depth+1); |
9389
|
|
|
|
|
|
|
9390
|
4689552
|
|
|
|
|
if (br == NULL) { |
9391
|
4689414
|
|
|
|
|
if (flags & RESTART_UTF8) { |
9392
|
10
|
|
|
|
|
*flagp = RESTART_UTF8; |
9393
|
10
|
|
|
|
|
return NULL; |
9394
|
|
|
|
|
|
} |
9395
|
10
|
|
|
|
|
FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); |
9396
|
|
|
|
|
|
} |
9397
|
96
|
|
|
|
|
REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ |
9398
|
|
|
|
|
|
lastbr = br; |
9399
|
4689500
|
|
|
|
|
*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); |
9400
|
|
|
|
|
|
} |
9401
|
|
|
|
|
|
|
9402
|
4689776
|
|
|
|
|
if (have_branch || paren != ':') { |
9403
|
|
|
|
|
|
/* Make a closing node, and hook it on the end. */ |
9404
|
35415098
|
|
|
|
|
switch (paren) { |
9405
|
|
|
|
|
|
case ':': |
9406
|
29508770
|
|
|
|
|
ender = reg_node(pRExC_state, TAIL); |
9407
|
964548
|
|
|
|
|
break; |
9408
|
|
|
|
|
|
case 1: case 2: |
9409
|
964556
|
|
|
|
|
ender = reganode(pRExC_state, CLOSE, parno); |
9410
|
11158610
|
|
|
|
|
if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { |
9411
|
11158598
|
|
|
|
|
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, |
9412
|
|
|
|
|
|
"Setting close paren #%"IVdf" to %d\n", |
9413
|
|
|
|
|
|
(IV)parno, REG_NODE_NUM(ender))); |
9414
|
5326
|
|
|
|
|
RExC_close_parens[parno-1]= ender; |
9415
|
5326
|
|
|
|
|
if (RExC_nestroot == parno) |
9416
|
4834
|
|
|
|
|
RExC_nestroot = 0; |
9417
|
|
|
|
|
|
} |
9418
|
439646
|
|
|
|
|
Set_Node_Offset(ender,RExC_parse+1); /* MJD */ |
9419
|
453758
|
|
|
|
|
Set_Node_Length(ender,1); /* MJD */ |
9420
|
|
|
|
|
|
break; |
9421
|
|
|
|
|
|
case '<': |
9422
|
|
|
|
|
|
case ',': |
9423
|
|
|
|
|
|
case '=': |
9424
|
|
|
|
|
|
case '!': |
9425
|
453746
|
|
|
|
|
*flagp &= ~HASWIDTH; |
9426
|
|
|
|
|
|
/* FALL THROUGH */ |
9427
|
|
|
|
|
|
case '>': |
9428
|
16931878
|
|
|
|
|
ender = reg_node(pRExC_state, SUCCEED); |
9429
|
16931878
|
|
|
|
|
break; |
9430
|
|
|
|
|
|
case 0: |
9431
|
8466019
|
|
|
|
|
ender = reg_node(pRExC_state, END); |
9432
|
29509034
|
|
|
|
|
if (!SIZE_ONLY) { |
9433
|
29508900
|
|
|
|
|
assert(!RExC_opend); /* there can only be one! */ |
9434
|
1119633
|
|
|
|
|
RExC_opend = ender; |
9435
|
|
|
|
|
|
} |
9436
|
|
|
|
|
|
break; |
9437
|
|
|
|
|
|
} |
9438
|
41580
|
|
|
|
|
DEBUG_PARSE_r(if (!SIZE_ONLY) { |
9439
|
|
|
|
|
|
SV * const mysv_val1=sv_newmortal(); |
9440
|
|
|
|
|
|
SV * const mysv_val2=sv_newmortal(); |
9441
|
|
|
|
|
|
DEBUG_PARSE_MSG("lsbr"); |
9442
|
|
|
|
|
|
regprop(RExC_rx, mysv_val1, lastbr); |
9443
|
|
|
|
|
|
regprop(RExC_rx, mysv_val2, ender); |
9444
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", |
9445
|
|
|
|
|
|
SvPV_nolen_const(mysv_val1), |
9446
|
|
|
|
|
|
(IV)REG_NODE_NUM(lastbr), |
9447
|
|
|
|
|
|
SvPV_nolen_const(mysv_val2), |
9448
|
|
|
|
|
|
(IV)REG_NODE_NUM(ender), |
9449
|
|
|
|
|
|
(IV)(ender - lastbr) |
9450
|
|
|
|
|
|
); |
9451
|
|
|
|
|
|
}); |
9452
|
6196541
|
|
|
|
|
REGTAIL(pRExC_state, lastbr, ender); |
9453
|
|
|
|
|
|
|
9454
|
5077042
|
|
|
|
|
if (have_branch && !SIZE_ONLY) { |
9455
|
|
|
|
|
|
char is_nothing= 1; |
9456
|
5076766
|
|
|
|
|
if (depth==1) |
9457
|
3460842
|
|
|
|
|
RExC_seen |= REG_TOP_LEVEL_BRANCHES; |
9458
|
|
|
|
|
|
|
9459
|
|
|
|
|
|
/* Hook the tails of the branches to the closing node. */ |
9460
|
3460920
|
|
|
|
|
for (br = ret; br; br = regnext(br)) { |
9461
|
1615986
|
|
|
|
|
const U8 op = PL_regkind[OP(br)]; |
9462
|
70
|
|
|
|
|
if (op == BRANCH) { |
9463
|
1119555
|
|
|
|
|
REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); |
9464
|
252
|
|
|
|
|
if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender) |
9465
|
|
|
|
|
|
is_nothing= 0; |
9466
|
|
|
|
|
|
} |
9467
|
210
|
|
|
|
|
else if (op == BRANCHJ) { |
9468
|
196
|
|
|
|
|
REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); |
9469
|
|
|
|
|
|
/* for now we always disable this optimisation * / |
9470
|
|
|
|
|
|
if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender) |
9471
|
|
|
|
|
|
*/ |
9472
|
|
|
|
|
|
is_nothing= 0; |
9473
|
|
|
|
|
|
} |
9474
|
|
|
|
|
|
} |
9475
|
108
|
|
|
|
|
if (is_nothing) { |
9476
|
100
|
|
|
|
|
br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret; |
9477
|
384
|
|
|
|
|
DEBUG_PARSE_r(if (!SIZE_ONLY) { |
9478
|
|
|
|
|
|
SV * const mysv_val1=sv_newmortal(); |
9479
|
|
|
|
|
|
SV * const mysv_val2=sv_newmortal(); |
9480
|
|
|
|
|
|
DEBUG_PARSE_MSG("NADA"); |
9481
|
|
|
|
|
|
regprop(RExC_rx, mysv_val1, ret); |
9482
|
|
|
|
|
|
regprop(RExC_rx, mysv_val2, ender); |
9483
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", |
9484
|
|
|
|
|
|
SvPV_nolen_const(mysv_val1), |
9485
|
|
|
|
|
|
(IV)REG_NODE_NUM(ret), |
9486
|
|
|
|
|
|
SvPV_nolen_const(mysv_val2), |
9487
|
|
|
|
|
|
(IV)REG_NODE_NUM(ender), |
9488
|
|
|
|
|
|
(IV)(ender - ret) |
9489
|
|
|
|
|
|
); |
9490
|
|
|
|
|
|
}); |
9491
|
288
|
|
|
|
|
OP(br)= NOTHING; |
9492
|
96
|
|
|
|
|
if (OP(ender) == TAIL) { |
9493
|
35414814
|
|
|
|
|
NEXT_OFF(br)= 0; |
9494
|
453746
|
|
|
|
|
RExC_emit= br + 1; |
9495
|
|
|
|
|
|
} else { |
9496
|
|
|
|
|
|
regnode *opt; |
9497
|
453746
|
|
|
|
|
for ( opt= br + 1; opt < ender ; opt++ ) |
9498
|
453746
|
|
|
|
|
OP(opt)= OPTIMIZED; |
9499
|
453746
|
|
|
|
|
NEXT_OFF(br)= ender - br; |
9500
|
|
|
|
|
|
} |
9501
|
|
|
|
|
|
} |
9502
|
|
|
|
|
|
} |
9503
|
|
|
|
|
|
} |
9504
|
|
|
|
|
|
|
9505
|
|
|
|
|
|
{ |
9506
|
|
|
|
|
|
const char *p; |
9507
|
|
|
|
|
|
static const char parens[] = "=!<,>"; |
9508
|
|
|
|
|
|
|
9509
|
454118
|
|
|
|
|
if (paren && (p = strchr(parens, paren))) { |
9510
|
453746
|
|
|
|
|
U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; |
9511
|
35414814
|
|
|
|
|
int flag = (p - parens) > 1; |
9512
|
|
|
|
|
|
|
9513
|
18482936
|
|
|
|
|
if (paren == '>') |
9514
|
|
|
|
|
|
node = SUSPEND, flag = 0; |
9515
|
18482936
|
|
|
|
|
reginsert(pRExC_state, node,ret, depth+1); |
9516
|
30
|
|
|
|
|
Set_Node_Cur_Length(ret, parse_start); |
9517
|
30
|
|
|
|
|
Set_Node_Offset(ret, parse_start + 1); |
9518
|
16931878
|
|
|
|
|
ret->flags = flag; |
9519
|
56
|
|
|
|
|
REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); |
9520
|
|
|
|
|
|
} |
9521
|
|
|
|
|
|
} |
9522
|
|
|
|
|
|
|
9523
|
|
|
|
|
|
/* Check for proper termination. */ |
9524
|
428
|
|
|
|
|
if (paren) { |
9525
|
|
|
|
|
|
/* restore original flags, but keep (?p) */ |
9526
|
160
|
|
|
|
|
RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); |
9527
|
104
|
|
|
|
|
if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { |
9528
|
35414728
|
|
|
|
|
RExC_parse = oregcomp_parse; |
9529
|
93448
|
|
|
|
|
vFAIL("Unmatched ("); |
9530
|
|
|
|
|
|
} |
9531
|
|
|
|
|
|
} |
9532
|
35414996
|
|
|
|
|
else if (!paren && RExC_parse < RExC_end) { |
9533
|
18610531
|
|
|
|
|
if (*RExC_parse == ')') { |
9534
|
41134494
|
|
|
|
|
RExC_parse++; |
9535
|
41134494
|
|
|
|
|
vFAIL("Unmatched )"); |
9536
|
|
|
|
|
|
} |
9537
|
|
|
|
|
|
else |
9538
|
41134494
|
|
|
|
|
FAIL("Junk on end of regexp"); /* "Can't happen". */ |
9539
|
|
|
|
|
|
assert(0); /* NOTREACHED */ |
9540
|
|
|
|
|
|
} |
9541
|
|
|
|
|
|
|
9542
|
4689828
|
|
|
|
|
if (RExC_in_lookbehind) { |
9543
|
0
|
|
|
|
|
RExC_in_lookbehind--; |
9544
|
|
|
|
|
|
} |
9545
|
4689828
|
|
|
|
|
if (after_freeze > RExC_npar) |
9546
|
41134494
|
|
|
|
|
RExC_npar = after_freeze; |
9547
|
|
|
|
|
|
return(ret); |
9548
|
|
|
|
|
|
} |
9549
|
|
|
|
|
|
|
9550
|
|
|
|
|
|
/* |
9551
|
|
|
|
|
|
- regbranch - one alternative of an | operator |
9552
|
|
|
|
|
|
* |
9553
|
|
|
|
|
|
* Implements the concatenation operator. |
9554
|
|
|
|
|
|
* |
9555
|
|
|
|
|
|
* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be |
9556
|
|
|
|
|
|
* restarted. |
9557
|
|
|
|
|
|
*/ |
9558
|
|
|
|
|
|
STATIC regnode * |
9559
|
2348591
|
|
|
|
|
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) |
9560
|
|
|
|
|
|
{ |
9561
|
|
|
|
|
|
dVAR; |
9562
|
|
|
|
|
|
regnode *ret; |
9563
|
|
|
|
|
|
regnode *chain = NULL; |
9564
|
|
|
|
|
|
regnode *latest; |
9565
|
41134986
|
|
|
|
|
I32 flags = 0, c = 0; |
9566
|
41134986
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
9567
|
|
|
|
|
|
|
9568
|
41134986
|
|
|
|
|
PERL_ARGS_ASSERT_REGBRANCH; |
9569
|
|
|
|
|
|
|
9570
|
137092971
|
|
|
|
|
DEBUG_PARSE("brnc"); |
9571
|
|
|
|
|
|
|
9572
|
76713332
|
|
|
|
|
if (first) |
9573
|
|
|
|
|
|
ret = NULL; |
9574
|
|
|
|
|
|
else { |
9575
|
76712936
|
|
|
|
|
if (!SIZE_ONLY && RExC_extralen) |
9576
|
76706862
|
|
|
|
|
ret = reganode(pRExC_state, BRANCHJ,0); |
9577
|
|
|
|
|
|
else { |
9578
|
1014574
|
|
|
|
|
ret = reg_node(pRExC_state, BRANCH); |
9579
|
452
|
|
|
|
|
Set_Node_Length(ret, 1); |
9580
|
|
|
|
|
|
} |
9581
|
|
|
|
|
|
} |
9582
|
|
|
|
|
|
|
9583
|
1014614
|
|
|
|
|
if (!first && SIZE_ONLY) |
9584
|
1014170
|
|
|
|
|
RExC_extralen += 1; /* BRANCHJ */ |
9585
|
|
|
|
|
|
|
9586
|
1014614
|
|
|
|
|
*flagp = WORST; /* Tentatively. */ |
9587
|
|
|
|
|
|
|
9588
|
492
|
|
|
|
|
RExC_parse--; |
9589
|
75692876
|
|
|
|
|
nextchar(pRExC_state); |
9590
|
75694056
|
|
|
|
|
while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { |
9591
|
75693096
|
|
|
|
|
flags &= ~TRYAGAIN; |
9592
|
40178458
|
|
|
|
|
latest = regpiece(pRExC_state, &flags,depth+1); |
9593
|
35515350
|
|
|
|
|
if (latest == NULL) { |
9594
|
35514662
|
|
|
|
|
if (flags & TRYAGAIN) |
9595
|
75692562
|
|
|
|
|
continue; |
9596
|
40114418
|
|
|
|
|
if (flags & RESTART_UTF8) { |
9597
|
164082
|
|
|
|
|
*flagp = RESTART_UTF8; |
9598
|
164082
|
|
|
|
|
return NULL; |
9599
|
|
|
|
|
|
} |
9600
|
40114394
|
|
|
|
|
FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); |
9601
|
|
|
|
|
|
} |
9602
|
31592139
|
|
|
|
|
else if (ret == NULL) |
9603
|
|
|
|
|
|
ret = latest; |
9604
|
76713528
|
|
|
|
|
*flagp |= flags&(HASWIDTH|POSTPONED); |
9605
|
76713528
|
|
|
|
|
if (chain == NULL) /* First piece. */ |
9606
|
76713300
|
|
|
|
|
*flagp |= flags&SPSTART; |
9607
|
|
|
|
|
|
else { |
9608
|
76713068
|
|
|
|
|
RExC_naughty++; |
9609
|
76707126
|
|
|
|
|
REGTAIL(pRExC_state, chain, latest); |
9610
|
|
|
|
|
|
} |
9611
|
|
|
|
|
|
chain = latest; |
9612
|
1015166
|
|
|
|
|
c++; |
9613
|
|
|
|
|
|
} |
9614
|
1014946
|
|
|
|
|
if (chain == NULL) { /* Loop ran zero times. */ |
9615
|
8
|
|
|
|
|
chain = reg_node(pRExC_state, NOTHING); |
9616
|
1014486
|
|
|
|
|
if (ret == NULL) |
9617
|
|
|
|
|
|
ret = chain; |
9618
|
|
|
|
|
|
} |
9619
|
75692888
|
|
|
|
|
if (c == 1) { |
9620
|
75692724
|
|
|
|
|
*flagp |= flags&SIMPLE; |
9621
|
|
|
|
|
|
} |
9622
|
|
|
|
|
|
|
9623
|
|
|
|
|
|
return ret; |
9624
|
|
|
|
|
|
} |
9625
|
|
|
|
|
|
|
9626
|
|
|
|
|
|
/* |
9627
|
|
|
|
|
|
- regpiece - something followed by possible [*+?] |
9628
|
|
|
|
|
|
* |
9629
|
|
|
|
|
|
* Note that the branching code sequences used for ? and the general cases |
9630
|
|
|
|
|
|
* of * and + are somewhat optimized: they use the same NOTHING node as |
9631
|
|
|
|
|
|
* both the endmarker for their branch list and the body of the last branch. |
9632
|
|
|
|
|
|
* It might seem that this node could be dispensed with entirely, but the |
9633
|
|
|
|
|
|
* endmarker role is not redundant. |
9634
|
|
|
|
|
|
* |
9635
|
|
|
|
|
|
* Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with |
9636
|
|
|
|
|
|
* TRYAGAIN. |
9637
|
|
|
|
|
|
* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be |
9638
|
|
|
|
|
|
* restarted. |
9639
|
|
|
|
|
|
*/ |
9640
|
|
|
|
|
|
STATIC regnode * |
9641
|
864014
|
|
|
|
|
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) |
9642
|
|
|
|
|
|
{ |
9643
|
|
|
|
|
|
dVAR; |
9644
|
|
|
|
|
|
regnode *ret; |
9645
|
|
|
|
|
|
char op; |
9646
|
|
|
|
|
|
char *next; |
9647
|
|
|
|
|
|
I32 flags; |
9648
|
3764533
|
|
|
|
|
const char * const origparse = RExC_parse; |
9649
|
|
|
|
|
|
I32 min; |
9650
|
|
|
|
|
|
I32 max = REG_INFTY; |
9651
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
9652
|
|
|
|
|
|
char *parse_start; |
9653
|
|
|
|
|
|
#endif |
9654
|
|
|
|
|
|
const char *maxpos = NULL; |
9655
|
|
|
|
|
|
|
9656
|
|
|
|
|
|
/* Save the original in case we change the emitted regop to a FAIL. */ |
9657
|
2473538
|
|
|
|
|
regnode * const orig_emit = RExC_emit; |
9658
|
|
|
|
|
|
|
9659
|
800934
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
9660
|
|
|
|
|
|
|
9661
|
2473538
|
|
|
|
|
PERL_ARGS_ASSERT_REGPIECE; |
9662
|
|
|
|
|
|
|
9663
|
864014
|
|
|
|
|
DEBUG_PARSE("piec"); |
9664
|
|
|
|
|
|
|
9665
|
864014
|
|
|
|
|
ret = regatom(pRExC_state, &flags,depth+1); |
9666
|
864014
|
|
|
|
|
if (ret == NULL) { |
9667
|
863326
|
|
|
|
|
if (flags & (TRYAGAIN|RESTART_UTF8)) |
9668
|
863326
|
|
|
|
|
*flagp |= flags & (TRYAGAIN|RESTART_UTF8); |
9669
|
|
|
|
|
|
else |
9670
|
800222
|
|
|
|
|
FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); |
9671
|
63104
|
|
|
|
|
return(NULL); |
9672
|
|
|
|
|
|
} |
9673
|
|
|
|
|
|
|
9674
|
863990
|
|
|
|
|
op = *RExC_parse; |
9675
|
|
|
|
|
|
|
9676
|
823670
|
|
|
|
|
if (op == '{' && regcurly(RExC_parse, FALSE)) { |
9677
|
|
|
|
|
|
maxpos = NULL; |
9678
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
9679
|
2
|
|
|
|
|
parse_start = RExC_parse; /* MJD */ |
9680
|
|
|
|
|
|
#endif |
9681
|
863300
|
|
|
|
|
next = RExC_parse + 1; |
9682
|
863300
|
|
|
|
|
while (isDIGIT(*next) || *next == ',') { |
9683
|
863300
|
|
|
|
|
if (*next == ',') { |
9684
|
132
|
|
|
|
|
if (maxpos) |
9685
|
|
|
|
|
|
break; |
9686
|
|
|
|
|
|
else |
9687
|
|
|
|
|
|
maxpos = next; |
9688
|
|
|
|
|
|
} |
9689
|
66
|
|
|
|
|
next++; |
9690
|
|
|
|
|
|
} |
9691
|
66
|
|
|
|
|
if (*next == '}') { /* got one */ |
9692
|
66
|
|
|
|
|
if (!maxpos) |
9693
|
|
|
|
|
|
maxpos = next; |
9694
|
132
|
|
|
|
|
RExC_parse++; |
9695
|
38999632
|
|
|
|
|
min = atoi(RExC_parse); |
9696
|
9368024
|
|
|
|
|
if (*maxpos == ',') |
9697
|
3714528
|
|
|
|
|
maxpos++; |
9698
|
|
|
|
|
|
else |
9699
|
3714528
|
|
|
|
|
maxpos = RExC_parse; |
9700
|
|
|
|
|
|
max = atoi(maxpos); |
9701
|
5653496
|
|
|
|
|
if (!max && *maxpos != '0') |
9702
|
|
|
|
|
|
max = REG_INFTY; /* meaning "infinity" */ |
9703
|
5653496
|
|
|
|
|
else if (max >= REG_INFTY) |
9704
|
5653496
|
|
|
|
|
vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); |
9705
|
5653496
|
|
|
|
|
RExC_parse = next; |
9706
|
0
|
|
|
|
|
nextchar(pRExC_state); |
9707
|
0
|
|
|
|
|
if (max < min) { /* If can't match, warn and optimize to fail |
9708
|
|
|
|
|
|
unconditionally */ |
9709
|
0
|
|
|
|
|
if (SIZE_ONLY) { |
9710
|
5653496
|
|
|
|
|
ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); |
9711
|
|
|
|
|
|
|
9712
|
|
|
|
|
|
/* We can't back off the size because we have to reserve |
9713
|
|
|
|
|
|
* enough space for all the things we are about to throw |
9714
|
|
|
|
|
|
* away, but we can shrink it by the ammount we are about |
9715
|
|
|
|
|
|
* to re-use here */ |
9716
|
5653496
|
|
|
|
|
RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; |
9717
|
|
|
|
|
|
} |
9718
|
|
|
|
|
|
else { |
9719
|
0
|
|
|
|
|
RExC_emit = orig_emit; |
9720
|
|
|
|
|
|
} |
9721
|
5653496
|
|
|
|
|
ret = reg_node(pRExC_state, OPFAIL); |
9722
|
5653496
|
|
|
|
|
return ret; |
9723
|
|
|
|
|
|
} |
9724
|
|
|
|
|
|
|
9725
|
|
|
|
|
|
do_curly: |
9726
|
3241146
|
|
|
|
|
if ((flags&SIMPLE)) { |
9727
|
5653496
|
|
|
|
|
RExC_naughty += 2 + RExC_naughty / 2; |
9728
|
9368024
|
|
|
|
|
reginsert(pRExC_state, CURLY, ret, depth+1); |
9729
|
9368024
|
|
|
|
|
Set_Node_Offset(ret, parse_start+1); /* MJD */ |
9730
|
2824626
|
|
|
|
|
Set_Node_Cur_Length(ret, parse_start); |
9731
|
|
|
|
|
|
} |
9732
|
|
|
|
|
|
else { |
9733
|
9368024
|
|
|
|
|
regnode * const w = reg_node(pRExC_state, WHILEM); |
9734
|
|
|
|
|
|
|
9735
|
9367188
|
|
|
|
|
w->flags = 0; |
9736
|
9368024
|
|
|
|
|
REGTAIL(pRExC_state, ret, w); |
9737
|
4435696
|
|
|
|
|
if (!SIZE_ONLY && RExC_extralen) { |
9738
|
4435696
|
|
|
|
|
reginsert(pRExC_state, LONGJMP,ret, depth+1); |
9739
|
74829118
|
|
|
|
|
reginsert(pRExC_state, NOTHING,ret, depth+1); |
9740
|
58802534
|
|
|
|
|
NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ |
9741
|
|
|
|
|
|
} |
9742
|
58802534
|
|
|
|
|
reginsert(pRExC_state, CURLYX,ret, depth+1); |
9743
|
|
|
|
|
|
/* MJD hk */ |
9744
|
16026584
|
|
|
|
|
Set_Node_Offset(ret, parse_start+1); |
9745
|
16026584
|
|
|
|
|
Set_Node_Length(ret, |
9746
|
|
|
|
|
|
op == '{' ? (RExC_parse - parse_start) : 1); |
9747
|
|
|
|
|
|
|
9748
|
16026584
|
|
|
|
|
if (!SIZE_ONLY && RExC_extralen) |
9749
|
5553704
|
|
|
|
|
NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ |
9750
|
5553704
|
|
|
|
|
REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); |
9751
|
5553704
|
|
|
|
|
if (SIZE_ONLY) |
9752
|
10472880
|
|
|
|
|
RExC_whilem_seen++, RExC_extralen += 3; |
9753
|
9297354
|
|
|
|
|
RExC_naughty += 4 + RExC_naughty; /* compound interest */ |
9754
|
|
|
|
|
|
} |
9755
|
1968024
|
|
|
|
|
ret->flags = 0; |
9756
|
|
|
|
|
|
|
9757
|
1968024
|
|
|
|
|
if (min > 0) |
9758
|
1968024
|
|
|
|
|
*flagp = WORST; |
9759
|
7329330
|
|
|
|
|
if (max > 0) |
9760
|
5346952
|
|
|
|
|
*flagp |= HASWIDTH; |
9761
|
16889752
|
|
|
|
|
if (!SIZE_ONLY) { |
9762
|
80
|
|
|
|
|
ARG1_SET(ret, (U16)min); |
9763
|
80
|
|
|
|
|
ARG2_SET(ret, (U16)max); |
9764
|
|
|
|
|
|
} |
9765
|
|
|
|
|
|
|
9766
|
|
|
|
|
|
goto nest_check; |
9767
|
|
|
|
|
|
} |
9768
|
|
|
|
|
|
} |
9769
|
|
|
|
|
|
|
9770
|
764
|
|
|
|
|
if (!ISMULT1(op)) { |
9771
|
16890436
|
|
|
|
|
*flagp = flags; |
9772
|
3258166
|
|
|
|
|
return(ret); |
9773
|
|
|
|
|
|
} |
9774
|
|
|
|
|
|
|
9775
|
|
|
|
|
|
#if 0 /* Now runtime fix should be reliable. */ |
9776
|
|
|
|
|
|
|
9777
|
|
|
|
|
|
/* if this is reinstated, don't forget to put this back into perldiag: |
9778
|
|
|
|
|
|
|
9779
|
|
|
|
|
|
=item Regexp *+ operand could be empty at {#} in regex m/%s/ |
9780
|
|
|
|
|
|
|
9781
|
|
|
|
|
|
(F) The part of the regexp subject to either the * or + quantifier |
9782
|
|
|
|
|
|
could match an empty string. The {#} shows in the regular |
9783
|
|
|
|
|
|
expression about where the problem was discovered. |
9784
|
|
|
|
|
|
|
9785
|
|
|
|
|
|
*/ |
9786
|
|
|
|
|
|
|
9787
|
|
|
|
|
|
if (!(flags&HASWIDTH) && op != '?') |
9788
|
|
|
|
|
|
vFAIL("Regexp *+ operand could be empty"); |
9789
|
|
|
|
|
|
#endif |
9790
|
|
|
|
|
|
|
9791
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
9792
|
3257478
|
|
|
|
|
parse_start = RExC_parse; |
9793
|
|
|
|
|
|
#endif |
9794
|
3257478
|
|
|
|
|
nextchar(pRExC_state); |
9795
|
|
|
|
|
|
|
9796
|
13632270
|
|
|
|
|
*flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); |
9797
|
|
|
|
|
|
|
9798
|
13704
|
|
|
|
|
if (op == '*' && (flags&SIMPLE)) { |
9799
|
13704
|
|
|
|
|
reginsert(pRExC_state, STAR, ret, depth+1); |
9800
|
13704
|
|
|
|
|
ret->flags = 0; |
9801
|
13704
|
|
|
|
|
RExC_naughty += 4; |
9802
|
|
|
|
|
|
} |
9803
|
13704
|
|
|
|
|
else if (op == '*') { |
9804
|
|
|
|
|
|
min = 0; |
9805
|
|
|
|
|
|
goto do_curly; |
9806
|
|
|
|
|
|
} |
9807
|
13704
|
|
|
|
|
else if (op == '+' && (flags&SIMPLE)) { |
9808
|
13704
|
|
|
|
|
reginsert(pRExC_state, PLUS, ret, depth+1); |
9809
|
16889748
|
|
|
|
|
ret->flags = 0; |
9810
|
30
|
|
|
|
|
RExC_naughty += 3; |
9811
|
|
|
|
|
|
} |
9812
|
30
|
|
|
|
|
else if (op == '+') { |
9813
|
|
|
|
|
|
min = 1; |
9814
|
|
|
|
|
|
goto do_curly; |
9815
|
|
|
|
|
|
} |
9816
|
10018
|
|
|
|
|
else if (op == '?') { |
9817
|
|
|
|
|
|
min = 0; max = 1; |
9818
|
|
|
|
|
|
goto do_curly; |
9819
|
|
|
|
|
|
} |
9820
|
|
|
|
|
|
nest_check: |
9821
|
10018
|
|
|
|
|
if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { |
9822
|
1446
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ |
9823
|
10741
|
|
|
|
|
ckWARN3reg(RExC_parse, |
9824
|
|
|
|
|
|
"%.*s matches null string many times", |
9825
|
|
|
|
|
|
(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), |
9826
|
|
|
|
|
|
origparse); |
9827
|
10018
|
|
|
|
|
(void)ReREFCNT_inc(RExC_rx_sv); |
9828
|
|
|
|
|
|
} |
9829
|
|
|
|
|
|
|
9830
|
5962
|
|
|
|
|
if (RExC_parse < RExC_end && *RExC_parse == '?') { |
9831
|
5962
|
|
|
|
|
nextchar(pRExC_state); |
9832
|
1638
|
|
|
|
|
reginsert(pRExC_state, MINMOD, ret, depth+1); |
9833
|
14
|
|
|
|
|
REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); |
9834
|
|
|
|
|
|
} |
9835
|
|
|
|
|
|
else |
9836
|
4324
|
|
|
|
|
if (RExC_parse < RExC_end && *RExC_parse == '+') { |
9837
|
|
|
|
|
|
regnode *ender; |
9838
|
4324
|
|
|
|
|
nextchar(pRExC_state); |
9839
|
4324
|
|
|
|
|
ender = reg_node(pRExC_state, SUCCEED); |
9840
|
4324
|
|
|
|
|
REGTAIL(pRExC_state, ret, ender); |
9841
|
4324
|
|
|
|
|
reginsert(pRExC_state, SUSPEND, ret, depth+1); |
9842
|
4324
|
|
|
|
|
ret->flags = 0; |
9843
|
4056
|
|
|
|
|
ender = reg_node(pRExC_state, TAIL); |
9844
|
28
|
|
|
|
|
REGTAIL(pRExC_state, ret, ender); |
9845
|
|
|
|
|
|
} |
9846
|
|
|
|
|
|
|
9847
|
4028
|
|
|
|
|
if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { |
9848
|
4028
|
|
|
|
|
RExC_parse++; |
9849
|
6008
|
|
|
|
|
vFAIL("Nested quantifiers"); |
9850
|
|
|
|
|
|
} |
9851
|
|
|
|
|
|
|
9852
|
|
|
|
|
|
return(ret); |
9853
|
|
|
|
|
|
} |
9854
|
|
|
|
|
|
|
9855
|
|
|
|
|
|
STATIC bool |
9856
|
3988
|
|
|
|
|
S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, |
9857
|
|
|
|
|
|
const bool strict /* Apply stricter parsing rules? */ |
9858
|
|
|
|
|
|
) |
9859
|
|
|
|
|
|
{ |
9860
|
|
|
|
|
|
|
9861
|
|
|
|
|
|
/* This is expected to be called by a parser routine that has recognized '\N' |
9862
|
|
|
|
|
|
and needs to handle the rest. RExC_parse is expected to point at the first |
9863
|
|
|
|
|
|
char following the N at the time of the call. On successful return, |
9864
|
|
|
|
|
|
RExC_parse has been updated to point to just after the sequence identified |
9865
|
|
|
|
|
|
by this routine, and <*flagp> has been updated. |
9866
|
|
|
|
|
|
|
9867
|
|
|
|
|
|
The \N may be inside (indicated by the boolean ) or outside a |
9868
|
|
|
|
|
|
character class. |
9869
|
|
|
|
|
|
|
9870
|
|
|
|
|
|
\N may begin either a named sequence, or if outside a character class, mean |
9871
|
|
|
|
|
|
to match a non-newline. For non single-quoted regexes, the tokenizer has |
9872
|
|
|
|
|
|
attempted to decide which, and in the case of a named sequence, converted it |
9873
|
|
|
|
|
|
into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, |
9874
|
|
|
|
|
|
where c1... are the characters in the sequence. For single-quoted regexes, |
9875
|
|
|
|
|
|
the tokenizer passes the \N sequence through unchanged; this code will not |
9876
|
|
|
|
|
|
attempt to determine this nor expand those, instead raising a syntax error. |
9877
|
|
|
|
|
|
The net effect is that if the beginning of the passed-in pattern isn't '{U+' |
9878
|
|
|
|
|
|
or there is no '}', it signals that this \N occurrence means to match a |
9879
|
|
|
|
|
|
non-newline. |
9880
|
|
|
|
|
|
|
9881
|
|
|
|
|
|
Only the \N{U+...} form should occur in a character class, for the same |
9882
|
|
|
|
|
|
reason that '.' inside a character class means to just match a period: it |
9883
|
|
|
|
|
|
just doesn't make sense. |
9884
|
|
|
|
|
|
|
9885
|
|
|
|
|
|
The function raises an error (via vFAIL), and doesn't return for various |
9886
|
|
|
|
|
|
syntax errors. Otherwise it returns TRUE and sets or on |
9887
|
|
|
|
|
|
success; it returns FALSE otherwise. Returns FALSE, setting *flagp to |
9888
|
|
|
|
|
|
RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is |
9889
|
|
|
|
|
|
only possible if node_p is non-NULL. |
9890
|
|
|
|
|
|
|
9891
|
|
|
|
|
|
|
9892
|
|
|
|
|
|
If is non-null, it means the caller can accept an input sequence |
9893
|
|
|
|
|
|
consisting of a just a single code point; <*valuep> is set to that value |
9894
|
|
|
|
|
|
if the input is such. |
9895
|
|
|
|
|
|
|
9896
|
|
|
|
|
|
If is non-null it signifies that the caller can accept any other |
9897
|
|
|
|
|
|
legal sequence (i.e., one that isn't just a single code point). <*node_p> |
9898
|
|
|
|
|
|
is set as follows: |
9899
|
|
|
|
|
|
1) \N means not-a-NL: points to a newly created REG_ANY node; |
9900
|
|
|
|
|
|
2) \N{}: points to a new NOTHING node; |
9901
|
|
|
|
|
|
3) otherwise: points to a new EXACT node containing the resolved |
9902
|
|
|
|
|
|
string. |
9903
|
|
|
|
|
|
Note that FALSE is returned for single code point sequences if is |
9904
|
|
|
|
|
|
null. |
9905
|
|
|
|
|
|
*/ |
9906
|
|
|
|
|
|
|
9907
|
|
|
|
|
|
char * endbrace; /* '}' following the name */ |
9908
|
|
|
|
|
|
char* p; |
9909
|
|
|
|
|
|
char *endchar; /* Points to '.' or '}' ending cur char in the input |
9910
|
|
|
|
|
|
stream */ |
9911
|
|
|
|
|
|
bool has_multiple_chars; /* true if the input stream contains a sequence of |
9912
|
|
|
|
|
|
more than one character */ |
9913
|
|
|
|
|
|
|
9914
|
3988
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
9915
|
|
|
|
|
|
|
9916
|
14
|
|
|
|
|
PERL_ARGS_ASSERT_GROK_BSLASH_N; |
9917
|
|
|
|
|
|
|
9918
|
14
|
|
|
|
|
GET_RE_DEBUG_FLAGS; |
9919
|
|
|
|
|
|
|
9920
|
4014
|
|
|
|
|
assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ |
9921
|
|
|
|
|
|
|
9922
|
|
|
|
|
|
/* The [^\n] meaning of \N ignores spaces and comments under the /x |
9923
|
|
|
|
|
|
* modifier. The other meaning does not */ |
9924
|
26
|
|
|
|
|
p = (RExC_flags & RXf_PMf_EXTENDED) |
9925
|
8
|
|
|
|
|
? regwhite( pRExC_state, RExC_parse ) |
9926
|
18
|
|
|
|
|
: RExC_parse; |
9927
|
|
|
|
|
|
|
9928
|
|
|
|
|
|
/* Disambiguate between \N meaning a named character versus \N meaning |
9929
|
|
|
|
|
|
* [^\n]. The former is assumed when it can't be the latter. */ |
9930
|
14
|
|
|
|
|
if (*p != '{' || regcurly(p, FALSE)) { |
9931
|
8
|
|
|
|
|
RExC_parse = p; |
9932
|
2
|
|
|
|
|
if (! node_p) { |
9933
|
|
|
|
|
|
/* no bare \N in a charclass */ |
9934
|
2
|
|
|
|
|
if (in_char_class) { |
9935
|
6
|
|
|
|
|
vFAIL("\\N in a character class must be a named character: \\N{...}"); |
9936
|
|
|
|
|
|
} |
9937
|
|
|
|
|
|
return FALSE; |
9938
|
|
|
|
|
|
} |
9939
|
20
|
|
|
|
|
nextchar(pRExC_state); |
9940
|
20
|
|
|
|
|
*node_p = reg_node(pRExC_state, REG_ANY); |
9941
|
3988
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
9942
|
3988
|
|
|
|
|
RExC_naughty++; |
9943
|
7976
|
|
|
|
|
RExC_parse--; |
9944
|
3988
|
|
|
|
|
Set_Node_Length(*node_p, 1); /* MJD */ |
9945
|
|
|
|
|
|
return TRUE; |
9946
|
|
|
|
|
|
} |
9947
|
|
|
|
|
|
|
9948
|
|
|
|
|
|
/* Here, we have decided it should be a named character or sequence */ |
9949
|
|
|
|
|
|
|
9950
|
|
|
|
|
|
/* The test above made sure that the next real character is a '{', but |
9951
|
|
|
|
|
|
* under the /x modifier, it could be separated by space (or a comment and |
9952
|
|
|
|
|
|
* \n) and this is not allowed (for consistency with \x{...} and the |
9953
|
|
|
|
|
|
* tokenizer handling of \N{NAME}). */ |
9954
|
7458
|
|
|
|
|
if (*RExC_parse != '{') { |
9955
|
3542
|
|
|
|
|
vFAIL("Missing braces on \\N{}"); |
9956
|
|
|
|
|
|
} |
9957
|
|
|
|
|
|
|
9958
|
3542
|
|
|
|
|
RExC_parse++; /* Skip past the '{' */ |
9959
|
|
|
|
|
|
|
9960
|
3542
|
|
|
|
|
if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ |
9961
|
3542
|
|
|
|
|
|| ! (endbrace == RExC_parse /* nothing between the {} */ |
9962
|
3542
|
|
|
|
|
|| (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */ |
9963
|
3514
|
|
|
|
|
&& strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */ |
9964
|
|
|
|
|
|
{ |
9965
|
70
|
|
|
|
|
if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ |
9966
|
70
|
|
|
|
|
vFAIL("\\N{NAME} must be resolved by the lexer"); |
9967
|
|
|
|
|
|
} |
9968
|
|
|
|
|
|
|
9969
|
0
|
|
|
|
|
if (endbrace == RExC_parse) { /* empty: \N{} */ |
9970
|
|
|
|
|
|
bool ret = TRUE; |
9971
|
70
|
|
|
|
|
if (node_p) { |
9972
|
28
|
|
|
|
|
*node_p = reg_node(pRExC_state,NOTHING); |
9973
|
|
|
|
|
|
} |
9974
|
70
|
|
|
|
|
else if (in_char_class) { |
9975
|
3472
|
|
|
|
|
if (SIZE_ONLY && in_char_class) { |
9976
|
6
|
|
|
|
|
if (strict) { |
9977
|
2
|
|
|
|
|
RExC_parse++; /* Position after the "}" */ |
9978
|
2
|
|
|
|
|
vFAIL("Zero length \\N{}"); |
9979
|
|
|
|
|
|
} |
9980
|
|
|
|
|
|
else { |
9981
|
4
|
|
|
|
|
ckWARNreg(RExC_parse, |
9982
|
|
|
|
|
|
"Ignoring zero length \\N{} in character class"); |
9983
|
|
|
|
|
|
} |
9984
|
|
|
|
|
|
} |
9985
|
|
|
|
|
|
ret = FALSE; |
9986
|
|
|
|
|
|
} |
9987
|
|
|
|
|
|
else { |
9988
|
|
|
|
|
|
return FALSE; |
9989
|
|
|
|
|
|
} |
9990
|
3470
|
|
|
|
|
nextchar(pRExC_state); |
9991
|
446
|
|
|
|
|
return ret; |
9992
|
|
|
|
|
|
} |
9993
|
|
|
|
|
|
|
9994
|
412
|
|
|
|
|
RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ |
9995
|
412
|
|
|
|
|
RExC_parse += 2; /* Skip past the 'U+' */ |
9996
|
|
|
|
|
|
|
9997
|
34
|
|
|
|
|
endchar = RExC_parse + strcspn(RExC_parse, ".}"); |
9998
|
|
|
|
|
|
|
9999
|
|
|
|
|
|
/* Code points are separated by dots. If none, there is only one code |
10000
|
|
|
|
|
|
* point, and is terminated by the brace */ |
10001
|
34
|
|
|
|
|
has_multiple_chars = (endchar < endbrace); |
10002
|
|
|
|
|
|
|
10003
|
3175
|
|
|
|
|
if (valuep && (! has_multiple_chars || in_char_class)) { |
10004
|
|
|
|
|
|
/* We only pay attention to the first char of |
10005
|
|
|
|
|
|
multichar strings being returned in char classes. I kinda wonder |
10006
|
|
|
|
|
|
if this makes sense as it does change the behaviour |
10007
|
|
|
|
|
|
from earlier versions, OTOH that behaviour was broken |
10008
|
|
|
|
|
|
as well. XXX Solution is to recharacterize as |
10009
|
|
|
|
|
|
[rest-of-class]|multi1|multi2... */ |
10010
|
|
|
|
|
|
|
10011
|
3124
|
|
|
|
|
STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); |
10012
|
3124
|
|
|
|
|
I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES |
10013
|
|
|
|
|
|
| PERL_SCAN_DISALLOW_PREFIX |
10014
|
3124
|
|
|
|
|
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); |
10015
|
|
|
|
|
|
|
10016
|
3124
|
|
|
|
|
*valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); |
10017
|
|
|
|
|
|
|
10018
|
|
|
|
|
|
/* The tokenizer should have guaranteed validity, but it's possible to |
10019
|
|
|
|
|
|
* bypass it by using single quoting, so check */ |
10020
|
6248
|
|
|
|
|
if (length_of_hex == 0 |
10021
|
34
|
|
|
|
|
|| length_of_hex != (STRLEN)(endchar - RExC_parse) ) |
10022
|
|
|
|
|
|
{ |
10023
|
34
|
|
|
|
|
RExC_parse += length_of_hex; /* Includes all the valid */ |
10024
|
34
|
|
|
|
|
RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ |
10025
|
0
|
|
|
|
|
? UTF8SKIP(RExC_parse) |
10026
|
|
|
|
|
|
: 1; |
10027
|
|
|
|
|
|
/* Guard against malformed utf8 */ |
10028
|
34
|
|
|
|
|
if (RExC_parse >= endchar) { |
10029
|
34
|
|
|
|
|
RExC_parse = endchar; |
10030
|
|
|
|
|
|
} |
10031
|
34
|
|
|
|
|
vFAIL("Invalid hexadecimal number in \\N{U+...}"); |
10032
|
|
|
|
|
|
} |
10033
|
|
|
|
|
|
|
10034
|
2
|
|
|
|
|
if (in_char_class && has_multiple_chars) { |
10035
|
2
|
|
|
|
|
if (strict) { |
10036
|
2
|
|
|
|
|
RExC_parse = endbrace; |
10037
|
0
|
|
|
|
|
vFAIL("\\N{} in character class restricted to one character"); |
10038
|
|
|
|
|
|
} |
10039
|
|
|
|
|
|
else { |
10040
|
32
|
|
|
|
|
ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); |
10041
|
|
|
|
|
|
} |
10042
|
|
|
|
|
|
} |
10043
|
|
|
|
|
|
|
10044
|
32
|
|
|
|
|
RExC_parse = endbrace + 1; |
10045
|
|
|
|
|
|
} |
10046
|
32
|
|
|
|
|
else if (! node_p || ! has_multiple_chars) { |
10047
|
|
|
|
|
|
|
10048
|
|
|
|
|
|
/* Here, the input is legal, but not according to the caller's |
10049
|
|
|
|
|
|
* options. We fail without advancing the parse, so that the |
10050
|
|
|
|
|
|
* caller can try again */ |
10051
|
32
|
|
|
|
|
RExC_parse = p; |
10052
|
4960
|
|
|
|
|
return FALSE; |
10053
|
|
|
|
|
|
} |
10054
|
|
|
|
|
|
else { |
10055
|
|
|
|
|
|
|
10056
|
|
|
|
|
|
/* What is done here is to convert this to a sub-pattern of the form |
10057
|
|
|
|
|
|
* (?:\x{char1}\x{char2}...) |
10058
|
|
|
|
|
|
* and then call reg recursively. That way, it retains its atomicness, |
10059
|
|
|
|
|
|
* while not having to worry about special handling that some code |
10060
|
|
|
|
|
|
* points may have. toke.c has converted the original Unicode values |
10061
|
|
|
|
|
|
* to native, so that we can just pass on the hex values unchanged. We |
10062
|
|
|
|
|
|
* do have to set a flag to keep recoding from happening in the |
10063
|
|
|
|
|
|
* recursion */ |
10064
|
|
|
|
|
|
|
10065
|
464
|
|
|
|
|
SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); |
10066
|
|
|
|
|
|
STRLEN len; |
10067
|
464
|
|
|
|
|
char *orig_end = RExC_end; |
10068
|
|
|
|
|
|
I32 flags; |
10069
|
|
|
|
|
|
|
10070
|
464
|
|
|
|
|
while (RExC_parse < endbrace) { |
10071
|
|
|
|
|
|
|
10072
|
|
|
|
|
|
/* Convert to notation the rest of the code understands */ |
10073
|
464
|
|
|
|
|
sv_catpv(substitute_parse, "\\x{"); |
10074
|
464
|
|
|
|
|
sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); |
10075
|
464
|
|
|
|
|
sv_catpv(substitute_parse, "}"); |
10076
|
|
|
|
|
|
|
10077
|
|
|
|
|
|
/* Point to the beginning of the next character in the sequence. */ |
10078
|
464
|
|
|
|
|
RExC_parse = endchar + 1; |
10079
|
464
|
|
|
|
|
endchar = RExC_parse + strcspn(RExC_parse, ".}"); |
10080
|
|
|
|
|
|
} |
10081
|
928
|
|
|
|
|
sv_catpv(substitute_parse, ")"); |
10082
|
|
|
|
|
|
|
10083
|
464
|
|
|
|
|
RExC_parse = SvPV(substitute_parse, len); |
10084
|
|
|
|
|
|
|
10085
|
|
|
|
|
|
/* Don't allow empty number */ |
10086
|
0
|
|
|
|
|
if (len < 8) { |
10087
|
464
|
|
|
|
|
vFAIL("Invalid hexadecimal number in \\N{U+...}"); |
10088
|
|
|
|
|
|
} |
10089
|
29313254
|
|
|
|
|
RExC_end = RExC_parse + len; |
10090
|
|
|
|
|
|
|
10091
|
|
|
|
|
|
/* The values are Unicode, and therefore not subject to recoding */ |
10092
|
16821824
|
|
|
|
|
RExC_override_recoding = 1; |
10093
|
|
|
|
|
|
|
10094
|
8410912
|
|
|
|
|
if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { |
10095
|
2265568
|
|
|
|
|
if (flags & RESTART_UTF8) { |
10096
|
8462674
|
|
|
|
|
*flagp = RESTART_UTF8; |
10097
|
29003628
|
|
|
|
|
return FALSE; |
10098
|
|
|
|
|
|
} |
10099
|
29003628
|
|
|
|
|
FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", |
10100
|
|
|
|
|
|
(UV) flags); |
10101
|
|
|
|
|
|
} |
10102
|
29003628
|
|
|
|
|
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); |
10103
|
|
|
|
|
|
|
10104
|
643952
|
|
|
|
|
RExC_parse = endbrace; |
10105
|
389050
|
|
|
|
|
RExC_end = orig_end; |
10106
|
434032
|
|
|
|
|
RExC_override_recoding = 0; |
10107
|
|
|
|
|
|
|
10108
|
42406
|
|
|
|
|
nextchar(pRExC_state); |
10109
|
|
|
|
|
|
} |
10110
|
|
|
|
|
|
|
10111
|
|
|
|
|
|
return TRUE; |
10112
|
|
|
|
|
|
} |
10113
|
|
|
|
|
|
|
10114
|
|
|
|
|
|
|
10115
|
|
|
|
|
|
/* |
10116
|
|
|
|
|
|
* reg_recode |
10117
|
|
|
|
|
|
* |
10118
|
|
|
|
|
|
* It returns the code point in utf8 for the value in *encp. |
10119
|
|
|
|
|
|
* value: a code value in the source encoding |
10120
|
|
|
|
|
|
* encp: a pointer to an Encode object |
10121
|
|
|
|
|
|
* |
10122
|
|
|
|
|
|
* If the result from Encode is not a single character, |
10123
|
|
|
|
|
|
* it returns U+FFFD (Replacement character) and sets *encp to NULL. |
10124
|
|
|
|
|
|
*/ |
10125
|
|
|
|
|
|
STATIC UV |
10126
|
42406
|
|
|
|
|
S_reg_recode(pTHX_ const char value, SV **encp) |
10127
|
|
|
|
|
|
{ |
10128
|
573915
|
|
|
|
|
STRLEN numlen = 1; |
10129
|
384530
|
|
|
|
|
SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); |
10130
|
0
|
|
|
|
|
const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); |
10131
|
0
|
|
|
|
|
const STRLEN newlen = SvCUR(sv); |
10132
|
|
|
|
|
|
UV uv = UNICODE_REPLACEMENT; |
10133
|
|
|
|
|
|
|
10134
|
384530
|
|
|
|
|
PERL_ARGS_ASSERT_REG_RECODE; |
10135
|
|
|
|
|
|
|
10136
|
384530
|
|
|
|
|
if (newlen) |
10137
|
0
|
|
|
|
|
uv = SvUTF8(sv) |
10138
|
0
|
|
|
|
|
? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) |
10139
|
0
|
|
|
|
|
: *(U8*)s; |
10140
|
|
|
|
|
|
|
10141
|
29003628
|
|
|
|
|
if (!newlen || numlen != newlen) { |
10142
|
|
|
|
|
|
uv = UNICODE_REPLACEMENT; |
10143
|
14557868
|
|
|
|
|
*encp = NULL; |
10144
|
|
|
|
|
|
} |
10145
|
14445760
|
|
|
|
|
return uv; |
10146
|
|
|
|
|
|
} |
10147
|
|
|
|
|
|
|
10148
|
|
|
|
|
|
PERL_STATIC_INLINE U8 |
10149
|
14446192
|
|
|
|
|
S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) |
10150
|
|
|
|
|
|
{ |
10151
|
|
|
|
|
|
U8 op; |
10152
|
|
|
|
|
|
|
10153
|
14446192
|
|
|
|
|
PERL_ARGS_ASSERT_COMPUTE_EXACTISH; |
10154
|
|
|
|
|
|
|
10155
|
322992
|
|
|
|
|
if (! FOLD) { |
10156
|
|
|
|
|
|
return EXACT; |
10157
|
|
|
|
|
|
} |
10158
|
|
|
|
|
|
|
10159
|
29003764
|
|
|
|
|
op = get_regex_charset(RExC_flags); |
10160
|
29003696
|
|
|
|
|
if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { |
10161
|
14434854
|
|
|
|
|
op--; /* /a is same as /u, and map /aa's offset to what /a's would have |
10162
|
|
|
|
|
|
been, so there is no hole */ |
10163
|
|
|
|
|
|
} |
10164
|
|
|
|
|
|
|
10165
|
183677
|
|
|
|
|
return op + EXACTF; |
10166
|
|
|
|
|
|
} |
10167
|
|
|
|
|
|
|
10168
|
|
|
|
|
|
PERL_STATIC_INLINE void |
10169
|
14401972
|
|
|
|
|
S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point) |
10170
|
|
|
|
|
|
{ |
10171
|
|
|
|
|
|
/* This knows the details about sizing an EXACTish node, setting flags for |
10172
|
|
|
|
|
|
* it (by setting <*flagp>, and potentially populating it with a single |
10173
|
|
|
|
|
|
* character. |
10174
|
|
|
|
|
|
* |
10175
|
|
|
|
|
|
* If (the length in bytes) is non-zero, this function assumes that |
10176
|
|
|
|
|
|
* the node has already been populated, and just does the sizing. In this |
10177
|
|
|
|
|
|
* case should be the final code point that has already been |
10178
|
|
|
|
|
|
* placed into the node. This value will be ignored except that under some |
10179
|
|
|
|
|
|
* circumstances <*flagp> is set based on it. |
10180
|
|
|
|
|
|
* |
10181
|
|
|
|
|
|
* If is zero, the function assumes that the node is to contain only |
10182
|
|
|
|
|
|
* the single character given by and calculates what |
10183
|
|
|
|
|
|
* should be. In pass 1, it sizes the node appropriately. In pass 2, it |
10184
|
|
|
|
|
|
* additionally will populate the node's STRING with , if |
10185
|
|
|
|
|
|
* is 0. In both cases <*flagp> is appropriately set |
10186
|
|
|
|
|
|
* |
10187
|
|
|
|
|
|
* It knows that under FOLD, the Latin Sharp S and UTF characters above |
10188
|
|
|
|
|
|
* 255, must be folded (the former only when the rules indicate it can |
10189
|
|
|
|
|
|
* match 'ss') */ |
10190
|
|
|
|
|
|
|
10191
|
29004036
|
|
|
|
|
bool len_passed_in = cBOOL(len != 0); |
10192
|
|
|
|
|
|
U8 character[UTF8_MAXBYTES_CASE+1]; |
10193
|
|
|
|
|
|
|
10194
|
76713248
|
|
|
|
|
PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; |
10195
|
|
|
|
|
|
|
10196
|
76713248
|
|
|
|
|
if (! len_passed_in) { |
10197
|
76712872
|
|
|
|
|
if (UTF) { |
10198
|
76712840
|
|
|
|
|
if (FOLD && (! LOC || code_point > 255)) { |
10199
|
76715040
|
|
|
|
|
_to_uni_fold_flags(code_point, |
10200
|
|
|
|
|
|
character, |
10201
|
|
|
|
|
|
&len, |
10202
|
|
|
|
|
|
FOLD_FLAGS_FULL | ((LOC) |
10203
|
|
|
|
|
|
? FOLD_FLAGS_LOCALE |
10204
|
|
|
|
|
|
: (ASCII_FOLD_RESTRICTED) |
10205
|
|
|
|
|
|
? FOLD_FLAGS_NOMIX_ASCII |
10206
|
|
|
|
|
|
: 0)); |
10207
|
|
|
|
|
|
} |
10208
|
|
|
|
|
|
else { |
10209
|
76717240
|
|
|
|
|
uvchr_to_utf8( character, code_point); |
10210
|
6310938
|
|
|
|
|
len = UTF8SKIP(character); |
10211
|
|
|
|
|
|
} |
10212
|
|
|
|
|
|
} |
10213
|
6311002
|
|
|
|
|
else if (! FOLD |
10214
|
6310970
|
|
|
|
|
|| code_point != LATIN_SMALL_LETTER_SHARP_S |
10215
|
109760
|
|
|
|
|
|| ASCII_FOLD_RESTRICTED |
10216
|
6201178
|
|
|
|
|
|| ! AT_LEAST_UNI_SEMANTICS) |
10217
|
|
|
|
|
|
{ |
10218
|
478512
|
|
|
|
|
*character = (U8) code_point; |
10219
|
5722730
|
|
|
|
|
len = 1; |
10220
|
|
|
|
|
|
} |
10221
|
|
|
|
|
|
else { |
10222
|
4163218
|
|
|
|
|
*character = 's'; |
10223
|
4163218
|
|
|
|
|
*(character + 1) = 's'; |
10224
|
1943588
|
|
|
|
|
len = 2; |
10225
|
|
|
|
|
|
} |
10226
|
|
|
|
|
|
} |
10227
|
|
|
|
|
|
|
10228
|
4163626
|
|
|
|
|
if (SIZE_ONLY) { |
10229
|
59804
|
|
|
|
|
RExC_size += STR_SZ(len); |
10230
|
|
|
|
|
|
} |
10231
|
|
|
|
|
|
else { |
10232
|
4103822
|
|
|
|
|
RExC_emit += STR_SZ(len); |
10233
|
90444
|
|
|
|
|
STR_LEN(node) = len; |
10234
|
4013582
|
|
|
|
|
if (! len_passed_in) { |
10235
|
5133946
|
|
|
|
|
Copy((char *) character, STRING(node), len, char); |
10236
|
|
|
|
|
|
} |
10237
|
|
|
|
|
|
} |
10238
|
|
|
|
|
|
|
10239
|
5134338
|
|
|
|
|
*flagp |= HASWIDTH; |
10240
|
|
|
|
|
|
|
10241
|
|
|
|
|
|
/* A single character node is SIMPLE, except for the special-cased SHARP S |
10242
|
|
|
|
|
|
* under /di. */ |
10243
|
3633514
|
|
|
|
|
if ((len == 1 || (UTF && len == UNISKIP(code_point))) |
10244
|
1501072
|
|
|
|
|
&& (code_point != LATIN_SMALL_LETTER_SHARP_S |
10245
|
5133930
|
|
|
|
|
|| ! FOLD || ! DEPENDS_SEMANTICS)) |
10246
|
|
|
|
|
|
{ |
10247
|
5134178
|
|
|
|
|
*flagp |= SIMPLE; |
10248
|
|
|
|
|
|
} |
10249
|
5134338
|
|
|
|
|
} |
10250
|
|
|
|
|
|
|
10251
|
|
|
|
|
|
/* |
10252
|
|
|
|
|
|
- regatom - the lowest level |
10253
|
|
|
|
|
|
|
10254
|
|
|
|
|
|
Try to identify anything special at the start of the pattern. If there |
10255
|
|
|
|
|
|
is, then handle it as required. This may involve generating a single regop, |
10256
|
|
|
|
|
|
such as for an assertion; or it may involve recursing, such as to |
10257
|
|
|
|
|
|
handle a () structure. |
10258
|
|
|
|
|
|
|
10259
|
|
|
|
|
|
If the string doesn't start with something special then we gobble up |
10260
|
|
|
|
|
|
as much literal text as we can. |
10261
|
|
|
|
|
|
|
10262
|
|
|
|
|
|
Once we have been able to handle whatever type of thing started the |
10263
|
|
|
|
|
|
sequence, we return. |
10264
|
|
|
|
|
|
|
10265
|
|
|
|
|
|
Note: we have to be careful with escapes, as they can be both literal |
10266
|
|
|
|
|
|
and special, and in the case of \10 and friends, context determines which. |
10267
|
|
|
|
|
|
|
10268
|
|
|
|
|
|
A summary of the code structure is: |
10269
|
|
|
|
|
|
|
10270
|
|
|
|
|
|
switch (first_byte) { |
10271
|
|
|
|
|
|
cases for each special: |
10272
|
|
|
|
|
|
handle this special; |
10273
|
|
|
|
|
|
break; |
10274
|
|
|
|
|
|
case '\\': |
10275
|
|
|
|
|
|
switch (2nd byte) { |
10276
|
|
|
|
|
|
cases for each unambiguous special: |
10277
|
|
|
|
|
|
handle this special; |
10278
|
|
|
|
|
|
break; |
10279
|
|
|
|
|
|
cases for each ambigous special/literal: |
10280
|
|
|
|
|
|
disambiguate; |
10281
|
|
|
|
|
|
if (special) handle here |
10282
|
|
|
|
|
|
else goto defchar; |
10283
|
|
|
|
|
|
default: // unambiguously literal: |
10284
|
|
|
|
|
|
goto defchar; |
10285
|
|
|
|
|
|
} |
10286
|
|
|
|
|
|
default: // is a literal char |
10287
|
|
|
|
|
|
// FALL THROUGH |
10288
|
|
|
|
|
|
defchar: |
10289
|
|
|
|
|
|
create EXACTish node for literal; |
10290
|
|
|
|
|
|
while (more input and node isn't full) { |
10291
|
|
|
|
|
|
switch (input_byte) { |
10292
|
|
|
|
|
|
cases for each special; |
10293
|
|
|
|
|
|
make sure parse pointer is set so that the next call to |
10294
|
|
|
|
|
|
regatom will see this special first |
10295
|
|
|
|
|
|
goto loopdone; // EXACTish node terminated by prev. char |
10296
|
|
|
|
|
|
default: |
10297
|
|
|
|
|
|
append char to EXACTISH node; |
10298
|
|
|
|
|
|
} |
10299
|
|
|
|
|
|
get next input byte; |
10300
|
|
|
|
|
|
} |
10301
|
|
|
|
|
|
loopdone: |
10302
|
|
|
|
|
|
} |
10303
|
|
|
|
|
|
return the generated node; |
10304
|
|
|
|
|
|
|
10305
|
|
|
|
|
|
Specifically there are two separate switches for handling |
10306
|
|
|
|
|
|
escape sequences, with the one for handling literal escapes requiring |
10307
|
|
|
|
|
|
a dummy entry for all of the special escapes that are actually handled |
10308
|
|
|
|
|
|
by the other. |
10309
|
|
|
|
|
|
|
10310
|
|
|
|
|
|
Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with |
10311
|
|
|
|
|
|
TRYAGAIN. |
10312
|
|
|
|
|
|
Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be |
10313
|
|
|
|
|
|
restarted. |
10314
|
|
|
|
|
|
Otherwise does not return NULL. |
10315
|
|
|
|
|
|
*/ |
10316
|
|
|
|
|
|
|
10317
|
|
|
|
|
|
STATIC regnode * |
10318
|
7827542
|
|
|
|
|
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) |
10319
|
|
|
|
|
|
{ |
10320
|
|
|
|
|
|
dVAR; |
10321
|
7827542
|
|
|
|
|
regnode *ret = NULL; |
10322
|
7827392
|
|
|
|
|
I32 flags = 0; |
10323
|
800
|
|
|
|
|
char *parse_start = RExC_parse; |
10324
|
|
|
|
|
|
U8 op; |
10325
|
|
|
|
|
|
int invert = 0; |
10326
|
|
|
|
|
|
|
10327
|
800
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
10328
|
|
|
|
|
|
|
10329
|
7827304
|
|
|
|
|
*flagp = WORST; /* Tentatively. */ |
10330
|
|
|
|
|
|
|
10331
|
117096
|
|
|
|
|
DEBUG_PARSE("atom"); |
10332
|
|
|
|
|
|
|
10333
|
712
|
|
|
|
|
PERL_ARGS_ASSERT_REGATOM; |
10334
|
|
|
|
|
|
|
10335
|
|
|
|
|
|
tryagain: |
10336
|
7710920
|
|
|
|
|
switch ((U8)*RExC_parse) { |
10337
|
|
|
|
|
|
case '^': |
10338
|
7710208
|
|
|
|
|
RExC_seen_zerolen++; |
10339
|
18922790
|
|
|
|
|
nextchar(pRExC_state); |
10340
|
18922790
|
|
|
|
|
if (RExC_flags & RXf_PMf_MULTILINE) |
10341
|
18919996
|
|
|
|
|
ret = reg_node(pRExC_state, MBOL); |
10342
|
592196
|
|
|
|
|
else if (RExC_flags & RXf_PMf_SINGLELINE) |
10343
|
4740
|
|
|
|
|
ret = reg_node(pRExC_state, SBOL); |
10344
|
|
|
|
|
|
else |
10345
|
340
|
|
|
|
|
ret = reg_node(pRExC_state, BOL); |
10346
|
340
|
|
|
|
|
Set_Node_Length(ret, 1); /* MJD */ |
10347
|
|
|
|
|
|
break; |
10348
|
|
|
|
|
|
case '$': |
10349
|
587460
|
|
|
|
|
nextchar(pRExC_state); |
10350
|
587460
|
|
|
|
|
if (*RExC_parse) |
10351
|
587456
|
|
|
|
|
RExC_seen_zerolen++; |
10352
|
4
|
|
|
|
|
if (RExC_flags & RXf_PMf_MULTILINE) |
10353
|
18327800
|
|
|
|
|
ret = reg_node(pRExC_state, MEOL); |
10354
|
18327804
|
|
|
|
|
else if (RExC_flags & RXf_PMf_SINGLELINE) |
10355
|
16
|
|
|
|
|
ret = reg_node(pRExC_state, SEOL); |
10356
|
|
|
|
|
|
else |
10357
|
20
|
|
|
|
|
ret = reg_node(pRExC_state, EOL); |
10358
|
20
|
|
|
|
|
Set_Node_Length(ret, 1); /* MJD */ |
10359
|
|
|
|
|
|
break; |
10360
|
|
|
|
|
|
case '.': |
10361
|
8
|
|
|
|
|
nextchar(pRExC_state); |
10362
|
21320
|
|
|
|
|
if (RExC_flags & RXf_PMf_SINGLELINE) |
10363
|
21312
|
|
|
|
|
ret = reg_node(pRExC_state, SANY); |
10364
|
|
|
|
|
|
else |
10365
|
21320
|
|
|
|
|
ret = reg_node(pRExC_state, REG_ANY); |
10366
|
2322
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
10367
|
2322
|
|
|
|
|
RExC_naughty++; |
10368
|
18803348
|
|
|
|
|
Set_Node_Length(ret, 1); /* MJD */ |
10369
|
|
|
|
|
|
break; |
10370
|
|
|
|
|
|
case '[': |
10371
|
|
|
|
|
|
{ |
10372
|
156956
|
|
|
|
|
char * const oregcomp_parse = ++RExC_parse; |
10373
|
156956
|
|
|
|
|
ret = regclass(pRExC_state, flagp,depth+1, |
10374
|
|
|
|
|
|
FALSE, /* means parse the whole char class */ |
10375
|
|
|
|
|
|
TRUE, /* allow multi-char folds */ |
10376
|
|
|
|
|
|
FALSE, /* don't silence non-portable warnings. */ |
10377
|
|
|
|
|
|
NULL); |
10378
|
156956
|
|
|
|
|
if (*RExC_parse != ']') { |
10379
|
156920
|
|
|
|
|
RExC_parse = oregcomp_parse; |
10380
|
247160
|
|
|
|
|
vFAIL("Unmatched ["); |
10381
|
|
|
|
|
|
} |
10382
|
247196
|
|
|
|
|
if (ret == NULL) { |
10383
|
247160
|
|
|
|
|
if (*flagp & RESTART_UTF8) |
10384
|
|
|
|
|
|
return NULL; |
10385
|
247160
|
|
|
|
|
FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", |
10386
|
|
|
|
|
|
(UV) *flagp); |
10387
|
|
|
|
|
|
} |
10388
|
56
|
|
|
|
|
nextchar(pRExC_state); |
10389
|
56
|
|
|
|
|
Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ |
10390
|
|
|
|
|
|
break; |
10391
|
|
|
|
|
|
} |
10392
|
|
|
|
|
|
case '(': |
10393
|
284
|
|
|
|
|
nextchar(pRExC_state); |
10394
|
284
|
|
|
|
|
ret = reg(pRExC_state, 2, &flags,depth+1); |
10395
|
284
|
|
|
|
|
if (ret == NULL) { |
10396
|
32616
|
|
|
|
|
if (flags & TRYAGAIN) { |
10397
|
32616
|
|
|
|
|
if (RExC_parse == RExC_end) { |
10398
|
|
|
|
|
|
/* Make parent create an empty node if needed. */ |
10399
|
32616
|
|
|
|
|
*flagp |= TRYAGAIN; |
10400
|
32616
|
|
|
|
|
return(NULL); |
10401
|
|
|
|
|
|
} |
10402
|
|
|
|
|
|
goto tryagain; |
10403
|
|
|
|
|
|
} |
10404
|
904084
|
|
|
|
|
if (flags & RESTART_UTF8) { |
10405
|
904084
|
|
|
|
|
*flagp = RESTART_UTF8; |
10406
|
904084
|
|
|
|
|
return NULL; |
10407
|
|
|
|
|
|
} |
10408
|
904084
|
|
|
|
|
FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags); |
10409
|
|
|
|
|
|
} |
10410
|
584
|
|
|
|
|
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); |
10411
|
584
|
|
|
|
|
break; |
10412
|
|
|
|
|
|
case '|': |
10413
|
|
|
|
|
|
case ')': |
10414
|
320
|
|
|
|
|
if (flags & TRYAGAIN) { |
10415
|
320
|
|
|
|
|
*flagp |= TRYAGAIN; |
10416
|
11212
|
|
|
|
|
return NULL; |
10417
|
|
|
|
|
|
} |
10418
|
11212
|
|
|
|
|
vFAIL("Internal urp"); |
10419
|
|
|
|
|
|
/* Supposed to be caught earlier. */ |
10420
|
|
|
|
|
|
break; |
10421
|
|
|
|
|
|
case '{': |
10422
|
118900
|
|
|
|
|
if (!regcurly(RExC_parse, FALSE)) { |
10423
|
1049454
|
|
|
|
|
RExC_parse++; |
10424
|
1049454
|
|
|
|
|
goto defchar; |
10425
|
|
|
|
|
|
} |
10426
|
|
|
|
|
|
/* FALL THROUGH */ |
10427
|
|
|
|
|
|
case '?': |
10428
|
|
|
|
|
|
case '+': |
10429
|
|
|
|
|
|
case '*': |
10430
|
2098908
|
|
|
|
|
RExC_parse++; |
10431
|
1049454
|
|
|
|
|
vFAIL("Quantifier follows nothing"); |
10432
|
|
|
|
|
|
break; |
10433
|
|
|
|
|
|
case '\\': |
10434
|
|
|
|
|
|
/* Special Escapes |
10435
|
|
|
|
|
|
|
10436
|
|
|
|
|
|
This switch handles escape sequences that resolve to some kind |
10437
|
|
|
|
|
|
of special regop and not to literal text. Escape sequnces that |
10438
|
|
|
|
|
|
resolve to literal text are handled below in the switch marked |
10439
|
|
|
|
|
|
"Literal Escapes". |
10440
|
|
|
|
|
|
|
10441
|
|
|
|
|
|
Every entry in this switch *must* have a corresponding entry |
10442
|
|
|
|
|
|
in the literal escape switch. However, the opposite is not |
10443
|
|
|
|
|
|
required, as the default for this switch is to jump to the |
10444
|
|
|
|
|
|
literal text handling code. |
10445
|
|
|
|
|
|
*/ |
10446
|
1049530
|
|
|
|
|
switch ((U8)*++RExC_parse) { |
10447
|
|
|
|
|
|
U8 arg; |
10448
|
|
|
|
|
|
/* Special Escapes */ |
10449
|
|
|
|
|
|
case 'A': |
10450
|
2098908
|
|
|
|
|
RExC_seen_zerolen++; |
10451
|
1049454
|
|
|
|
|
ret = reg_node(pRExC_state, SBOL); |
10452
|
1049454
|
|
|
|
|
*flagp |= SIMPLE; |
10453
|
6
|
|
|
|
|
goto finish_meta_pat; |
10454
|
|
|
|
|
|
case 'G': |
10455
|
5768
|
|
|
|
|
ret = reg_node(pRExC_state, GPOS); |
10456
|
5768
|
|
|
|
|
RExC_seen |= REG_SEEN_GPOS; |
10457
|
11536
|
|
|
|
|
*flagp |= SIMPLE; |
10458
|
5768
|
|
|
|
|
goto finish_meta_pat; |
10459
|
|
|
|
|
|
case 'K': |
10460
|
5768
|
|
|
|
|
RExC_seen_zerolen++; |
10461
|
11536
|
|
|
|
|
ret = reg_node(pRExC_state, KEEPS); |
10462
|
5768
|
|
|
|
|
*flagp |= SIMPLE; |
10463
|
|
|
|
|
|
/* XXX:dmq : disabling in-place substitution seems to |
10464
|
|
|
|
|
|
* be necessary here to avoid cases of memory corruption, as |
10465
|
|
|
|
|
|
* with: C<$_="x" x 80; s/x\K/y/> -- rgs |
10466
|
|
|
|
|
|
*/ |
10467
|
5768
|
|
|
|
|
RExC_seen |= REG_SEEN_LOOKBEHIND; |
10468
|
37124
|
|
|
|
|
goto finish_meta_pat; |
10469
|
|
|
|
|
|
case 'Z': |
10470
|
43196
|
|
|
|
|
ret = reg_node(pRExC_state, SEOL); |
10471
|
43196
|
|
|
|
|
*flagp |= SIMPLE; |
10472
|
109796
|
|
|
|
|
RExC_seen_zerolen++; /* Do not optimize RE away */ |
10473
|
5524732
|
|
|
|
|
goto finish_meta_pat; |
10474
|
|
|
|
|
|
case 'z': |
10475
|
2762366
|
|
|
|
|
ret = reg_node(pRExC_state, EOS); |
10476
|
2767498
|
|
|
|
|
*flagp |= SIMPLE; |
10477
|
398334
|
|
|
|
|
RExC_seen_zerolen++; /* Do not optimize RE away */ |
10478
|
2767498
|
|
|
|
|
goto finish_meta_pat; |
10479
|
|
|
|
|
|
case 'C': |
10480
|
2767498
|
|
|
|
|
ret = reg_node(pRExC_state, CANY); |
10481
|
1383749
|
|
|
|
|
RExC_seen |= REG_SEEN_CANY; |
10482
|
2767498
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
10483
|
5218248
|
|
|
|
|
goto finish_meta_pat; |
10484
|
|
|
|
|
|
case 'X': |
10485
|
5218248
|
|
|
|
|
ret = reg_node(pRExC_state, CLUMP); |
10486
|
410346
|
|
|
|
|
*flagp |= HASWIDTH; |
10487
|
410346
|
|
|
|
|
goto finish_meta_pat; |
10488
|
|
|
|
|
|
|
10489
|
|
|
|
|
|
case 'W': |
10490
|
|
|
|
|
|
invert = 1; |
10491
|
|
|
|
|
|
/* FALLTHROUGH */ |
10492
|
|
|
|
|
|
case 'w': |
10493
|
|
|
|
|
|
arg = ANYOF_WORDCHAR; |
10494
|
|
|
|
|
|
goto join_posix; |
10495
|
|
|
|
|
|
|
10496
|
|
|
|
|
|
case 'b': |
10497
|
410340
|
|
|
|
|
RExC_seen_zerolen++; |
10498
|
0
|
|
|
|
|
RExC_seen |= REG_SEEN_LOOKBEHIND; |
10499
|
410340
|
|
|
|
|
op = BOUND + get_regex_charset(RExC_flags); |
10500
|
410340
|
|
|
|
|
if (op > BOUNDA) { /* /aa is same as /a */ |
10501
|
|
|
|
|
|
op = BOUNDA; |
10502
|
|
|
|
|
|
} |
10503
|
410340
|
|
|
|
|
ret = reg_node(pRExC_state, op); |
10504
|
4774
|
|
|
|
|
FLAGS(ret) = get_regex_charset(RExC_flags); |
10505
|
4774
|
|
|
|
|
*flagp |= SIMPLE; |
10506
|
410
|
|
|
|
|
if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { |
10507
|
408
|
|
|
|
|
ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead"); |
10508
|
|
|
|
|
|
} |
10509
|
|
|
|
|
|
goto finish_meta_pat; |
10510
|
|
|
|
|
|
case 'B': |
10511
|
408
|
|
|
|
|
RExC_seen_zerolen++; |
10512
|
11346
|
|
|
|
|
RExC_seen |= REG_SEEN_LOOKBEHIND; |
10513
|
11346
|
|
|
|
|
op = NBOUND + get_regex_charset(RExC_flags); |
10514
|
2
|
|
|
|
|
if (op > NBOUNDA) { /* /aa is same as /a */ |
10515
|
|
|
|
|
|
op = NBOUNDA; |
10516
|
|
|
|
|
|
} |
10517
|
2
|
|
|
|
|
ret = reg_node(pRExC_state, op); |
10518
|
11344
|
|
|
|
|
FLAGS(ret) = get_regex_charset(RExC_flags); |
10519
|
11344
|
|
|
|
|
*flagp |= SIMPLE; |
10520
|
11120
|
|
|
|
|
if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { |
10521
|
11120
|
|
|
|
|
ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead"); |
10522
|
|
|
|
|
|
} |
10523
|
|
|
|
|
|
goto finish_meta_pat; |
10524
|
|
|
|
|
|
|
10525
|
|
|
|
|
|
case 'D': |
10526
|
|
|
|
|
|
invert = 1; |
10527
|
|
|
|
|
|
/* FALLTHROUGH */ |
10528
|
|
|
|
|
|
case 'd': |
10529
|
|
|
|
|
|
arg = ANYOF_DIGIT; |
10530
|
|
|
|
|
|
goto join_posix; |
10531
|
|
|
|
|
|
|
10532
|
|
|
|
|
|
case 'R': |
10533
|
0
|
|
|
|
|
ret = reg_node(pRExC_state, LNBREAK); |
10534
|
11120
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
10535
|
5504
|
|
|
|
|
goto finish_meta_pat; |
10536
|
|
|
|
|
|
|
10537
|
|
|
|
|
|
case 'H': |
10538
|
|
|
|
|
|
invert = 1; |
10539
|
|
|
|
|
|
/* FALLTHROUGH */ |
10540
|
|
|
|
|
|
case 'h': |
10541
|
|
|
|
|
|
arg = ANYOF_BLANK; |
10542
|
|
|
|
|
|
op = POSIXU; |
10543
|
|
|
|
|
|
goto join_posix_op_known; |
10544
|
|
|
|
|
|
|
10545
|
|
|
|
|
|
case 'V': |
10546
|
|
|
|
|
|
invert = 1; |
10547
|
|
|
|
|
|
/* FALLTHROUGH */ |
10548
|
|
|
|
|
|
case 'v': |
10549
|
|
|
|
|
|
arg = ANYOF_VERTWS; |
10550
|
|
|
|
|
|
op = POSIXU; |
10551
|
|
|
|
|
|
goto join_posix_op_known; |
10552
|
|
|
|
|
|
|
10553
|
|
|
|
|
|
case 'S': |
10554
|
|
|
|
|
|
invert = 1; |
10555
|
|
|
|
|
|
/* FALLTHROUGH */ |
10556
|
|
|
|
|
|
case 's': |
10557
|
|
|
|
|
|
arg = ANYOF_SPACE; |
10558
|
|
|
|
|
|
|
10559
|
|
|
|
|
|
join_posix: |
10560
|
|
|
|
|
|
|
10561
|
5504
|
|
|
|
|
op = POSIXD + get_regex_charset(RExC_flags); |
10562
|
5504
|
|
|
|
|
if (op > POSIXA) { /* /aa is same as /a */ |
10563
|
|
|
|
|
|
op = POSIXA; |
10564
|
|
|
|
|
|
} |
10565
|
|
|
|
|
|
|
10566
|
|
|
|
|
|
join_posix_op_known: |
10567
|
|
|
|
|
|
|
10568
|
11120
|
|
|
|
|
if (invert) { |
10569
|
19800
|
|
|
|
|
op += NPOSIXD - POSIXD; |
10570
|
|
|
|
|
|
} |
10571
|
|
|
|
|
|
|
10572
|
11120
|
|
|
|
|
ret = reg_node(pRExC_state, op); |
10573
|
11120
|
|
|
|
|
if (! SIZE_ONLY) { |
10574
|
11120
|
|
|
|
|
FLAGS(ret) = namedclass_to_classnum(arg); |
10575
|
|
|
|
|
|
} |
10576
|
|
|
|
|
|
|
10577
|
47912
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
10578
|
|
|
|
|
|
/* FALL THROUGH */ |
10579
|
|
|
|
|
|
|
10580
|
|
|
|
|
|
finish_meta_pat: |
10581
|
47912
|
|
|
|
|
nextchar(pRExC_state); |
10582
|
1700
|
|
|
|
|
Set_Node_Length(ret, 2); /* MJD */ |
10583
|
|
|
|
|
|
break; |
10584
|
|
|
|
|
|
case 'p': |
10585
|
|
|
|
|
|
case 'P': |
10586
|
|
|
|
|
|
{ |
10587
|
|
|
|
|
|
#ifdef DEBUGGING |
10588
|
1700
|
|
|
|
|
char* parse_start = RExC_parse - 2; |
10589
|
|
|
|
|
|
#endif |
10590
|
|
|
|
|
|
|
10591
|
788
|
|
|
|
|
RExC_parse--; |
10592
|
|
|
|
|
|
|
10593
|
1700
|
|
|
|
|
ret = regclass(pRExC_state, flagp,depth+1, |
10594
|
|
|
|
|
|
TRUE, /* means just parse this element */ |
10595
|
|
|
|
|
|
FALSE, /* don't allow multi-char folds */ |
10596
|
|
|
|
|
|
FALSE, /* don't silence non-portable warnings. |
10597
|
|
|
|
|
|
It would be a bug if these returned |
10598
|
|
|
|
|
|
non-portables */ |
10599
|
|
|
|
|
|
NULL); |
10600
|
|
|
|
|
|
/* regclass() can only return RESTART_UTF8 if multi-char folds |
10601
|
|
|
|
|
|
are allowed. */ |
10602
|
684
|
|
|
|
|
if (!ret) |
10603
|
1700
|
|
|
|
|
FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", |
10604
|
|
|
|
|
|
(UV) *flagp); |
10605
|
|
|
|
|
|
|
10606
|
580
|
|
|
|
|
RExC_parse--; |
10607
|
|
|
|
|
|
|
10608
|
580
|
|
|
|
|
Set_Node_Offset(ret, parse_start + 2); |
10609
|
580
|
|
|
|
|
Set_Node_Cur_Length(ret, parse_start); |
10610
|
47332
|
|
|
|
|
nextchar(pRExC_state); |
10611
|
|
|
|
|
|
} |
10612
|
47332
|
|
|
|
|
break; |
10613
|
|
|
|
|
|
case 'N': |
10614
|
|
|
|
|
|
/* Handle \N and \N{NAME} with multiple code points here and not |
10615
|
|
|
|
|
|
* below because it can be multicharacter. join_exact() will join |
10616
|
|
|
|
|
|
* them up later on. Also this makes sure that things like |
10617
|
|
|
|
|
|
* /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. |
10618
|
|
|
|
|
|
* The options to the grok function call causes it to fail if the |
10619
|
|
|
|
|
|
* sequence is just a single code point. We then go treat it as |
10620
|
|
|
|
|
|
* just another character in the current EXACT node, and hence it |
10621
|
|
|
|
|
|
* gets uniform treatment with all the other characters. The |
10622
|
|
|
|
|
|
* special treatment for quantifiers is not needed for such single |
10623
|
|
|
|
|
|
* character sequences */ |
10624
|
58
|
|
|
|
|
++RExC_parse; |
10625
|
56
|
|
|
|
|
if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE, |
10626
|
|
|
|
|
|
FALSE /* not strict */ )) { |
10627
|
2
|
|
|
|
|
if (*flagp & RESTART_UTF8) |
10628
|
|
|
|
|
|
return NULL; |
10629
|
47274
|
|
|
|
|
RExC_parse--; |
10630
|
656
|
|
|
|
|
goto defchar; |
10631
|
|
|
|
|
|
} |
10632
|
|
|
|
|
|
break; |
10633
|
|
|
|
|
|
case 'k': /* Handle \k and \k'NAME' */ |
10634
|
|
|
|
|
|
parse_named_seq: |
10635
|
|
|
|
|
|
{ |
10636
|
656
|
|
|
|
|
char ch= RExC_parse[1]; |
10637
|
28
|
|
|
|
|
if (ch != '<' && ch != '\'' && ch != '{') { |
10638
|
47246
|
|
|
|
|
RExC_parse++; |
10639
|
85784
|
|
|
|
|
vFAIL2("Sequence %.2s... not terminated",parse_start); |
10640
|
|
|
|
|
|
} else { |
10641
|
|
|
|
|
|
/* this pretty much dupes the code for (?P=...) in reg(), if |
10642
|
|
|
|
|
|
you change this make sure you change that */ |
10643
|
43056
|
|
|
|
|
char* name_start = (RExC_parse += 2); |
10644
|
|
|
|
|
|
U32 num = 0; |
10645
|
42728
|
|
|
|
|
SV *sv_dat = reg_scan_name(pRExC_state, |
10646
|
|
|
|
|
|
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); |
10647
|
166
|
|
|
|
|
ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; |
10648
|
2
|
|
|
|
|
if (RExC_parse == name_start || *RExC_parse != ch) |
10649
|
164
|
|
|
|
|
vFAIL2("Sequence %.3s... not terminated",parse_start); |
10650
|
|
|
|
|
|
|
10651
|
42726
|
|
|
|
|
if (!SIZE_ONLY) { |
10652
|
21363
|
|
|
|
|
num = add_data( pRExC_state, 1, "S" ); |
10653
|
120
|
|
|
|
|
RExC_rxi->data->data[num]=(void*)sv_dat; |
10654
|
42606
|
|
|
|
|
SvREFCNT_inc_simple_void(sv_dat); |
10655
|
|
|
|
|
|
} |
10656
|
|
|
|
|
|
|
10657
|
52344
|
|
|
|
|
RExC_sawback = 1; |
10658
|
42606
|
|
|
|
|
ret = reganode(pRExC_state, |
10659
|
|
|
|
|
|
((! FOLD) |
10660
|
|
|
|
|
|
? NREF |
10661
|
|
|
|
|
|
: (ASCII_FOLD_RESTRICTED) |
10662
|
|
|
|
|
|
? NREFFA |
10663
|
|
|
|
|
|
: (AT_LEAST_UNI_SEMANTICS) |
10664
|
|
|
|
|
|
? NREFFU |
10665
|
|
|
|
|
|
: (LOC) |
10666
|
|
|
|
|
|
? NREFFL |
10667
|
|
|
|
|
|
: NREFF), |
10668
|
|
|
|
|
|
num); |
10669
|
42606
|
|
|
|
|
*flagp |= HASWIDTH; |
10670
|
|
|
|
|
|
|
10671
|
|
|
|
|
|
/* override incorrect value set in reganode MJD */ |
10672
|
42606
|
|
|
|
|
Set_Node_Offset(ret, parse_start+1); |
10673
|
42606
|
|
|
|
|
Set_Node_Cur_Length(ret, parse_start); |
10674
|
42
|
|
|
|
|
nextchar(pRExC_state); |
10675
|
|
|
|
|
|
|
10676
|
|
|
|
|
|
} |
10677
|
2
|
|
|
|
|
break; |
10678
|
|
|
|
|
|
} |
10679
|
|
|
|
|
|
case 'g': |
10680
|
|
|
|
|
|
case '1': case '2': case '3': case '4': |
10681
|
|
|
|
|
|
case '5': case '6': case '7': case '8': case '9': |
10682
|
|
|
|
|
|
{ |
10683
|
|
|
|
|
|
I32 num; |
10684
|
101836
|
|
|
|
|
bool isg = *RExC_parse == 'g'; |
10685
|
|
|
|
|
|
bool isrel = 0; |
10686
|
|
|
|
|
|
bool hasbrace = 0; |
10687
|
0
|
|
|
|
|
if (isg) { |
10688
|
15532552
|
|
|
|
|
RExC_parse++; |
10689
|
28670082
|
|
|
|
|
if (*RExC_parse == '{') { |
10690
|
28670082
|
|
|
|
|
RExC_parse++; |
10691
|
|
|
|
|
|
hasbrace = 1; |
10692
|
|
|
|
|
|
} |
10693
|
28670082
|
|
|
|
|
if (*RExC_parse == '-') { |
10694
|
28670082
|
|
|
|
|
RExC_parse++; |
10695
|
|
|
|
|
|
isrel = 1; |
10696
|
|
|
|
|
|
} |
10697
|
32548546
|
|
|
|
|
if (hasbrace && !isDIGIT(*RExC_parse)) { |
10698
|
150672493
|
|
|
|
|
if (isrel) RExC_parse--; |
10699
|
204049195
|
|
|
|
|
RExC_parse -= 2; |
10700
|
107861312
|
|
|
|
|
goto parse_named_seq; |
10701
|
|
|
|
|
|
} } |
10702
|
130149556
|
|
|
|
|
num = atoi(RExC_parse); |
10703
|
130149556
|
|
|
|
|
if (isg && num == 0) { |
10704
|
11881294
|
|
|
|
|
if (*RExC_parse == '0') { |
10705
|
130149556
|
|
|
|
|
vFAIL("Reference to invalid group 0"); |
10706
|
|
|
|
|
|
} |
10707
|
|
|
|
|
|
else { |
10708
|
24485600
|
|
|
|
|
vFAIL("Unterminated \\g... pattern"); |
10709
|
|
|
|
|
|
} |
10710
|
|
|
|
|
|
} |
10711
|
1208504
|
|
|
|
|
if (isrel) { |
10712
|
1208504
|
|
|
|
|
num = RExC_npar - num; |
10713
|
6341112
|
|
|
|
|
if (num < 1) |
10714
|
6341112
|
|
|
|
|
vFAIL("Reference to nonexistent or unclosed group"); |
10715
|
|
|
|
|
|
} |
10716
|
6341112
|
|
|
|
|
if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9') |
10717
|
|
|
|
|
|
/* Probably a character specified in octal, e.g. \35 */ |
10718
|
|
|
|
|
|
goto defchar; |
10719
|
|
|
|
|
|
else { |
10720
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
10721
|
4718
|
|
|
|
|
char * const parse_start = RExC_parse - 1; /* MJD */ |
10722
|
|
|
|
|
|
#endif |
10723
|
4718
|
|
|
|
|
while (isDIGIT(*RExC_parse)) |
10724
|
1632
|
|
|
|
|
RExC_parse++; |
10725
|
0
|
|
|
|
|
if (hasbrace) { |
10726
|
1632
|
|
|
|
|
if (*RExC_parse != '}') |
10727
|
1632
|
|
|
|
|
vFAIL("Unterminated \\g{...} pattern"); |
10728
|
3002
|
|
|
|
|
RExC_parse++; |
10729
|
|
|
|
|
|
} |
10730
|
3002
|
|
|
|
|
if (!SIZE_ONLY) { |
10731
|
390
|
|
|
|
|
if (num > (I32)RExC_rx->nparens) |
10732
|
40684
|
|
|
|
|
vFAIL("Reference to nonexistent group"); |
10733
|
|
|
|
|
|
} |
10734
|
40684
|
|
|
|
|
RExC_sawback = 1; |
10735
|
40684
|
|
|
|
|
ret = reganode(pRExC_state, |
10736
|
|
|
|
|
|
((! FOLD) |
10737
|
|
|
|
|
|
? REF |
10738
|
|
|
|
|
|
: (ASCII_FOLD_RESTRICTED) |
10739
|
|
|
|
|
|
? REFFA |
10740
|
|
|
|
|
|
: (AT_LEAST_UNI_SEMANTICS) |
10741
|
|
|
|
|
|
? REFFU |
10742
|
|
|
|
|
|
: (LOC) |
10743
|
|
|
|
|
|
? REFFL |
10744
|
|
|
|
|
|
: REFF), |
10745
|
|
|
|
|
|
num); |
10746
|
102352
|
|
|
|
|
*flagp |= HASWIDTH; |
10747
|
|
|
|
|
|
|
10748
|
|
|
|
|
|
/* override incorrect value set in reganode MJD */ |
10749
|
102352
|
|
|
|
|
Set_Node_Offset(ret, parse_start+1); |
10750
|
102352
|
|
|
|
|
Set_Node_Cur_Length(ret, parse_start); |
10751
|
19276
|
|
|
|
|
RExC_parse--; |
10752
|
19276
|
|
|
|
|
nextchar(pRExC_state); |
10753
|
|
|
|
|
|
} |
10754
|
|
|
|
|
|
} |
10755
|
19276
|
|
|
|
|
break; |
10756
|
|
|
|
|
|
case '\0': |
10757
|
10560
|
|
|
|
|
if (RExC_parse >= RExC_end) |
10758
|
10560
|
|
|
|
|
FAIL("Trailing \\"); |
10759
|
|
|
|
|
|
/* FALL THROUGH */ |
10760
|
|
|
|
|
|
default: |
10761
|
|
|
|
|
|
/* Do not generate "unrecognized" warnings here, we fall |
10762
|
|
|
|
|
|
back into the quick-grab loop below */ |
10763
|
10636
|
|
|
|
|
parse_start--; |
10764
|
10464
|
|
|
|
|
goto defchar; |
10765
|
|
|
|
|
|
} |
10766
|
|
|
|
|
|
break; |
10767
|
|
|
|
|
|
|
10768
|
|
|
|
|
|
case '#': |
10769
|
10388
|
|
|
|
|
if (RExC_flags & RXf_PMf_EXTENDED) { |
10770
|
10388
|
|
|
|
|
if ( reg_skipcomment( pRExC_state ) ) |
10771
|
|
|
|
|
|
goto tryagain; |
10772
|
|
|
|
|
|
} |
10773
|
|
|
|
|
|
/* FALL THROUGH */ |
10774
|
|
|
|
|
|
|
10775
|
|
|
|
|
|
default: |
10776
|
|
|
|
|
|
|
10777
|
782
|
|
|
|
|
parse_start = RExC_parse - 1; |
10778
|
|
|
|
|
|
|
10779
|
782
|
|
|
|
|
RExC_parse++; |
10780
|
|
|
|
|
|
|
10781
|
|
|
|
|
|
defchar: { |
10782
|
|
|
|
|
|
STRLEN len = 0; |
10783
|
410
|
|
|
|
|
UV ender = 0; |
10784
|
|
|
|
|
|
char *p; |
10785
|
|
|
|
|
|
char *s; |
10786
|
|
|
|
|
|
#define MAX_NODE_STRING_SIZE 127 |
10787
|
|
|
|
|
|
char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; |
10788
|
|
|
|
|
|
char *s0; |
10789
|
|
|
|
|
|
U8 upper_parse = MAX_NODE_STRING_SIZE; |
10790
|
|
|
|
|
|
STRLEN foldlen; |
10791
|
410
|
|
|
|
|
U8 node_type = compute_EXACTish(pRExC_state); |
10792
|
|
|
|
|
|
bool next_is_quantifier; |
10793
|
|
|
|
|
|
char * oldp = NULL; |
10794
|
|
|
|
|
|
|
10795
|
|
|
|
|
|
/* We can convert EXACTF nodes to EXACTFU if they contain only |
10796
|
|
|
|
|
|
* characters that match identically regardless of the target |
10797
|
|
|
|
|
|
* string's UTF8ness. The reason to do this is that EXACTF is not |
10798
|
|
|
|
|
|
* trie-able, EXACTFU is. (We don't need to figure this out until |
10799
|
|
|
|
|
|
* pass 2) */ |
10800
|
848
|
|
|
|
|
bool maybe_exactfu = node_type == EXACTF && PASS2; |
10801
|
|
|
|
|
|
|
10802
|
|
|
|
|
|
/* If a folding node contains only code points that don't |
10803
|
|
|
|
|
|
* participate in folds, it can be changed into an EXACT node, |
10804
|
|
|
|
|
|
* which allows the optimizer more things to look for */ |
10805
|
|
|
|
|
|
bool maybe_exact; |
10806
|
|
|
|
|
|
|
10807
|
848
|
|
|
|
|
ret = reg_node(pRExC_state, node_type); |
10808
|
|
|
|
|
|
|
10809
|
|
|
|
|
|
/* In pass1, folded, we use a temporary buffer instead of the |
10810
|
|
|
|
|
|
* actual node, as the node doesn't exist yet */ |
10811
|
848
|
|
|
|
|
s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret); |
10812
|
|
|
|
|
|
|
10813
|
|
|
|
|
|
s0 = s; |
10814
|
|
|
|
|
|
|
10815
|
|
|
|
|
|
reparse: |
10816
|
|
|
|
|
|
|
10817
|
|
|
|
|
|
/* We do the EXACTFish to EXACT node only if folding, and not if in |
10818
|
|
|
|
|
|
* locale, as whether a character folds or not isn't known until |
10819
|
|
|
|
|
|
* runtime. (And we don't need to figure this out until pass 2) */ |
10820
|
800
|
|
|
|
|
maybe_exact = FOLD && ! LOC && PASS2; |
10821
|
|
|
|
|
|
|
10822
|
|
|
|
|
|
/* XXX The node can hold up to 255 bytes, yet this only goes to |
10823
|
|
|
|
|
|
* 127. I (khw) do not know why. Keeping it somewhat less than |
10824
|
|
|
|
|
|
* 255 allows us to not have to worry about overflow due to |
10825
|
|
|
|
|
|
* converting to utf8 and fold expansion, but that value is |
10826
|
|
|
|
|
|
* 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes |
10827
|
|
|
|
|
|
* split up by this limit into a single one using the real max of |
10828
|
|
|
|
|
|
* 255. Even at 127, this breaks under rare circumstances. If |
10829
|
|
|
|
|
|
* folding, we do not want to split a node at a character that is a |
10830
|
|
|
|
|
|
* non-final in a multi-char fold, as an input string could just |
10831
|
|
|
|
|
|
* happen to want to match across the node boundary. The join |
10832
|
|
|
|
|
|
* would solve that problem if the join actually happens. But a |
10833
|
|
|
|
|
|
* series of more than two nodes in a row each of 127 would cause |
10834
|
|
|
|
|
|
* the first join to succeed to get to 254, but then there wouldn't |
10835
|
|
|
|
|
|
* be room for the next one, which could at be one of those split |
10836
|
|
|
|
|
|
* multi-char folds. I don't know of any fool-proof solution. One |
10837
|
|
|
|
|
|
* could back off to end with only a code point that isn't such a |
10838
|
|
|
|
|
|
* non-final, but it is possible for there not to be any in the |
10839
|
|
|
|
|
|
* entire node. */ |
10840
|
1886380
|
|
|
|
|
for (p = RExC_parse - 1; |
10841
|
1887200
|
|
|
|
|
len < upper_parse && p < RExC_end; |
10842
|
1885580
|
|
|
|
|
len++) |
10843
|
|
|
|
|
|
{ |
10844
|
1136
|
|
|
|
|
oldp = p; |
10845
|
|
|
|
|
|
|
10846
|
1136
|
|
|
|
|
if (RExC_flags & RXf_PMf_EXTENDED) |
10847
|
1884756
|
|
|
|
|
p = regwhite( pRExC_state, p ); |
10848
|
1885888
|
|
|
|
|
switch ((U8)*p) { |
10849
|
|
|
|
|
|
case '^': |
10850
|
|
|
|
|
|
case '$': |
10851
|
|
|
|
|
|
case '.': |
10852
|
|
|
|
|
|
case '[': |
10853
|
|
|
|
|
|
case '(': |
10854
|
|
|
|
|
|
case ')': |
10855
|
|
|
|
|
|
case '|': |
10856
|
|
|
|
|
|
goto loopdone; |
10857
|
|
|
|
|
|
case '\\': |
10858
|
|
|
|
|
|
/* Literal Escapes Switch |
10859
|
|
|
|
|
|
|
10860
|
|
|
|
|
|
This switch is meant to handle escape sequences that |
10861
|
|
|
|
|
|
resolve to a literal character. |
10862
|
|
|
|
|
|
|
10863
|
|
|
|
|
|
Every escape sequence that represents something |
10864
|
|
|
|
|
|
else, like an assertion or a char class, is handled |
10865
|
|
|
|
|
|
in the switch marked 'Special Escapes' above in this |
10866
|
|
|
|
|
|
routine, but also has an entry here as anything that |
10867
|
|
|
|
|
|
isn't explicitly mentioned here will be treated as |
10868
|
|
|
|
|
|
an unescaped equivalent literal. |
10869
|
|
|
|
|
|
*/ |
10870
|
|
|
|
|
|
|
10871
|
1884584
|
|
|
|
|
switch ((U8)*++p) { |
10872
|
|
|
|
|
|
/* These are all the special escapes. */ |
10873
|
|
|
|
|
|
case 'A': /* Start assertion */ |
10874
|
|
|
|
|
|
case 'b': case 'B': /* Word-boundary assertion*/ |
10875
|
|
|
|
|
|
case 'C': /* Single char !DANGEROUS! */ |
10876
|
|
|
|
|
|
case 'd': case 'D': /* digit class */ |
10877
|
|
|
|
|
|
case 'g': case 'G': /* generic-backref, pos assertion */ |
10878
|
|
|
|
|
|
case 'h': case 'H': /* HORIZWS */ |
10879
|
|
|
|
|
|
case 'k': case 'K': /* named backref, keep marker */ |
10880
|
|
|
|
|
|
case 'p': case 'P': /* Unicode property */ |
10881
|
|
|
|
|
|
case 'R': /* LNBREAK */ |
10882
|
|
|
|
|
|
case 's': case 'S': /* space class */ |
10883
|
|
|
|
|
|
case 'v': case 'V': /* VERTWS */ |
10884
|
|
|
|
|
|
case 'w': case 'W': /* word class */ |
10885
|
|
|
|
|
|
case 'X': /* eXtended Unicode "combining character sequence" */ |
10886
|
|
|
|
|
|
case 'z': case 'Z': /* End of line/string assertion */ |
10887
|
991212
|
|
|
|
|
--p; |
10888
|
37652
|
|
|
|
|
goto loopdone; |
10889
|
|
|
|
|
|
|
10890
|
|
|
|
|
|
/* Anything after here is an escape that resolves to a |
10891
|
|
|
|
|
|
literal. (Except digits, which may or may not) |
10892
|
|
|
|
|
|
*/ |
10893
|
|
|
|
|
|
case 'n': |
10894
|
37652
|
|
|
|
|
ender = '\n'; |
10895
|
37624
|
|
|
|
|
p++; |
10896
|
64
|
|
|
|
|
break; |
10897
|
|
|
|
|
|
case 'N': /* Handle a single-code point named character. */ |
10898
|
|
|
|
|
|
/* The options cause it to fail if a multiple code |
10899
|
|
|
|
|
|
* point sequence. Handle those in the switch() above |
10900
|
|
|
|
|
|
* */ |
10901
|
64
|
|
|
|
|
RExC_parse = p + 1; |
10902
|
22296
|
|
|
|
|
if (! grok_bslash_N(pRExC_state, NULL, &ender, |
10903
|
|
|
|
|
|
flagp, depth, FALSE, |
10904
|
|
|
|
|
|
FALSE /* not strict */ )) |
10905
|
|
|
|
|
|
{ |
10906
|
1764
|
|
|
|
|
if (*flagp & RESTART_UTF8) |
10907
|
1764
|
|
|
|
|
FAIL("panic: grok_bslash_N set RESTART_UTF8"); |
10908
|
65914
|
|
|
|
|
RExC_parse = p = oldp; |
10909
|
65914
|
|
|
|
|
goto loopdone; |
10910
|
|
|
|
|
|
} |
10911
|
65914
|
|
|
|
|
p = RExC_parse; |
10912
|
65914
|
|
|
|
|
if (ender > 0xff) { |
10913
|
514
|
|
|
|
|
REQUIRE_UTF8; |
10914
|
|
|
|
|
|
} |
10915
|
|
|
|
|
|
break; |
10916
|
|
|
|
|
|
case 'r': |
10917
|
65764
|
|
|
|
|
ender = '\r'; |
10918
|
65764
|
|
|
|
|
p++; |
10919
|
32882
|
|
|
|
|
break; |
10920
|
|
|
|
|
|
case 't': |
10921
|
9700
|
|
|
|
|
ender = '\t'; |
10922
|
3414
|
|
|
|
|
p++; |
10923
|
4
|
|
|
|
|
break; |
10924
|
|
|
|
|
|
case 'f': |
10925
|
65764
|
|
|
|
|
ender = '\f'; |
10926
|
300
|
|
|
|
|
p++; |
10927
|
300
|
|
|
|
|
break; |
10928
|
|
|
|
|
|
case 'e': |
10929
|
300
|
|
|
|
|
ender = ASCII_TO_NATIVE('\033'); |
10930
|
300
|
|
|
|
|
p++; |
10931
|
0
|
|
|
|
|
break; |
10932
|
|
|
|
|
|
case 'a': |
10933
|
300
|
|
|
|
|
ender = '\a'; |
10934
|
80
|
|
|
|
|
p++; |
10935
|
0
|
|
|
|
|
break; |
10936
|
|
|
|
|
|
case 'o': |
10937
|
|
|
|
|
|
{ |
10938
|
|
|
|
|
|
UV result; |
10939
|
|
|
|
|
|
const char* error_msg; |
10940
|
|
|
|
|
|
|
10941
|
14757394
|
|
|
|
|
bool valid = grok_bslash_o(&p, |
10942
|
|
|
|
|
|
&result, |
10943
|
|
|
|
|
|
&error_msg, |
10944
|
|
|
|
|
|
TRUE, /* out warnings */ |
10945
|
|
|
|
|
|
FALSE, /* not strict */ |
10946
|
|
|
|
|
|
TRUE, /* Output warnings |
10947
|
|
|
|
|
|
for non- |
10948
|
|
|
|
|
|
portables */ |
10949
|
|
|
|
|
|
UTF); |
10950
|
32
|
|
|
|
|
if (! valid) { |
10951
|
32
|
|
|
|
|
RExC_parse = p; /* going to die anyway; point |
10952
|
|
|
|
|
|
to exact spot of failure */ |
10953
|
89353930
|
|
|
|
|
vFAIL(error_msg); |
10954
|
|
|
|
|
|
} |
10955
|
44592865
|
|
|
|
|
ender = result; |
10956
|
2233732
|
|
|
|
|
if (PL_encoding && ender < 0x100) { |
10957
|
|
|
|
|
|
goto recode_encoding; |
10958
|
|
|
|
|
|
} |
10959
|
2233654
|
|
|
|
|
if (ender > 0xff) { |
10960
|
4
|
|
|
|
|
REQUIRE_UTF8; |
10961
|
|
|
|
|
|
} |
10962
|
|
|
|
|
|
break; |
10963
|
|
|
|
|
|
} |
10964
|
|
|
|
|
|
case 'x': |
10965
|
|
|
|
|
|
{ |
10966
|
104295288
|
|
|
|
|
UV result = UV_MAX; /* initialize to erroneous |
10967
|
|
|
|
|
|
value */ |
10968
|
|
|
|
|
|
const char* error_msg; |
10969
|
|
|
|
|
|
|
10970
|
183964
|
|
|
|
|
bool valid = grok_bslash_x(&p, |
10971
|
|
|
|
|
|
&result, |
10972
|
|
|
|
|
|
&error_msg, |
10973
|
|
|
|
|
|
TRUE, /* out warnings */ |
10974
|
|
|
|
|
|
FALSE, /* not strict */ |
10975
|
|
|
|
|
|
TRUE, /* Output warnings |
10976
|
|
|
|
|
|
for non- |
10977
|
|
|
|
|
|
portables */ |
10978
|
|
|
|
|
|
UTF); |
10979
|
183964
|
|
|
|
|
if (! valid) { |
10980
|
103927400
|
|
|
|
|
RExC_parse = p; /* going to die anyway; point |
10981
|
|
|
|
|
|
to exact spot of failure */ |
10982
|
112317160
|
|
|
|
|
vFAIL(error_msg); |
10983
|
|
|
|
|
|
} |
10984
|
8621502
|
|
|
|
|
ender = result; |
10985
|
|
|
|
|
|
|
10986
|
112317200
|
|
|
|
|
if (PL_encoding && ender < 0x100) { |
10987
|
|
|
|
|
|
goto recode_encoding; |
10988
|
|
|
|
|
|
} |
10989
|
932708
|
|
|
|
|
if (ender > 0xff) { |
10990
|
932708
|
|
|
|
|
REQUIRE_UTF8; |
10991
|
|
|
|
|
|
} |
10992
|
|
|
|
|
|
break; |
10993
|
|
|
|
|
|
} |
10994
|
|
|
|
|
|
case 'c': |
10995
|
111384492
|
|
|
|
|
p++; |
10996
|
100926454
|
|
|
|
|
ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); |
10997
|
264068
|
|
|
|
|
break; |
10998
|
|
|
|
|
|
case '8': case '9': /* must be a backreference */ |
10999
|
264068
|
|
|
|
|
--p; |
11000
|
264068
|
|
|
|
|
goto loopdone; |
11001
|
|
|
|
|
|
case '1': case '2': case '3':case '4': |
11002
|
|
|
|
|
|
case '5': case '6': case '7': |
11003
|
|
|
|
|
|
/* When we parse backslash escapes there is ambiguity |
11004
|
|
|
|
|
|
* between backreferences and octal escapes. Any escape |
11005
|
|
|
|
|
|
* from \1 - \9 is a backreference, any multi-digit |
11006
|
|
|
|
|
|
* escape which does not start with 0 and which when |
11007
|
|
|
|
|
|
* evaluated as decimal could refer to an already |
11008
|
|
|
|
|
|
* parsed capture buffer is a backslash. Anything else |
11009
|
|
|
|
|
|
* is octal. |
11010
|
|
|
|
|
|
* |
11011
|
|
|
|
|
|
* Note this implies that \118 could be interpreted as |
11012
|
|
|
|
|
|
* 118 OR as "\11" . "8" depending on whether there |
11013
|
|
|
|
|
|
* were 118 capture buffers defined already in the |
11014
|
|
|
|
|
|
* pattern. */ |
11015
|
264068
|
|
|
|
|
if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar ) |
11016
|
|
|
|
|
|
{ /* Not to be treated as an octal constant, go |
11017
|
|
|
|
|
|
find backref */ |
11018
|
264068
|
|
|
|
|
--p; |
11019
|
100662386
|
|
|
|
|
goto loopdone; |
11020
|
|
|
|
|
|
} |
11021
|
|
|
|
|
|
case '0': |
11022
|
|
|
|
|
|
{ |
11023
|
10458038
|
|
|
|
|
I32 flags = PERL_SCAN_SILENT_ILLDIGIT; |
11024
|
2056604
|
|
|
|
|
STRLEN numlen = 3; |
11025
|
7137372
|
|
|
|
|
ender = grok_oct(p, &numlen, &flags, NULL); |
11026
|
2858452
|
|
|
|
|
if (ender > 0xff) { |
11027
|
1250202
|
|
|
|
|
REQUIRE_UTF8; |
11028
|
|
|
|
|
|
} |
11029
|
1238714
|
|
|
|
|
p += numlen; |
11030
|
1220638
|
|
|
|
|
if (SIZE_ONLY /* like \08, \178 */ |
11031
|
942760
|
|
|
|
|
&& numlen < 3 |
11032
|
147310
|
|
|
|
|
&& p < RExC_end |
11033
|
7137372
|
|
|
|
|
&& isDIGIT(*p) && ckWARN(WARN_REGEXP)) |
11034
|
|
|
|
|
|
{ |
11035
|
4980999
|
|
|
|
|
reg_warn_non_literal_string( |
11036
|
|
|
|
|
|
p + 1, |
11037
|
|
|
|
|
|
form_short_octal_warning(p, numlen)); |
11038
|
|
|
|
|
|
} |
11039
|
|
|
|
|
|
} |
11040
|
473660
|
|
|
|
|
if (PL_encoding && ender < 0x100) |
11041
|
|
|
|
|
|
goto recode_encoding; |
11042
|
|
|
|
|
|
break; |
11043
|
|
|
|
|
|
recode_encoding: |
11044
|
439560
|
|
|
|
|
if (! RExC_override_recoding) { |
11045
|
439560
|
|
|
|
|
SV* enc = PL_encoding; |
11046
|
34100
|
|
|
|
|
ender = reg_recode((const char)(U8)ender, &enc); |
11047
|
34100
|
|
|
|
|
if (!enc && SIZE_ONLY) |
11048
|
34100
|
|
|
|
|
ckWARNreg(p, "Invalid escape in the specified encoding"); |
11049
|
5636952
|
|
|
|
|
REQUIRE_UTF8; |
11050
|
|
|
|
|
|
} |
11051
|
|
|
|
|
|
break; |
11052
|
|
|
|
|
|
case '\0': |
11053
|
2847006
|
|
|
|
|
if (p >= RExC_end) |
11054
|
1324192
|
|
|
|
|
FAIL("Trailing \\"); |
11055
|
|
|
|
|
|
/* FALL THROUGH */ |
11056
|
|
|
|
|
|
default: |
11057
|
1017300
|
|
|
|
|
if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { |
11058
|
|
|
|
|
|
/* Include any { following the alpha to emphasize |
11059
|
|
|
|
|
|
* that it could be part of an escape at some point |
11060
|
|
|
|
|
|
* in the future */ |
11061
|
40
|
|
|
|
|
int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1; |
11062
|
40
|
|
|
|
|
ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p); |
11063
|
|
|
|
|
|
} |
11064
|
|
|
|
|
|
goto normal_default; |
11065
|
|
|
|
|
|
} /* End of switch on '\' */ |
11066
|
|
|
|
|
|
break; |
11067
|
|
|
|
|
|
default: /* A literal character */ |
11068
|
|
|
|
|
|
|
11069
|
788
|
|
|
|
|
if (! SIZE_ONLY |
11070
|
408
|
|
|
|
|
&& RExC_flags & RXf_PMf_EXTENDED |
11071
|
1017244
|
|
|
|
|
&& ckWARN_d(WARN_DEPRECATED) |
11072
|
2847006
|
|
|
|
|
&& is_PATWS_non_low(p, UTF)) |
11073
|
|
|
|
|
|
{ |
11074
|
3320666
|
|
|
|
|
vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), |
11075
|
|
|
|
|
|
"Escape literal pattern white space under /x"); |
11076
|
|
|
|
|
|
} |
11077
|
|
|
|
|
|
|
11078
|
|
|
|
|
|
normal_default: |
11079
|
3321510
|
|
|
|
|
if (UTF8_IS_START(*p) && UTF) { |
11080
|
|
|
|
|
|
STRLEN numlen; |
11081
|
111384532
|
|
|
|
|
ender = utf8n_to_uvchr((U8*)p, RExC_end - p, |
11082
|
|
|
|
|
|
&numlen, UTF8_ALLOW_DEFAULT); |
11083
|
3523220
|
|
|
|
|
p += numlen; |
11084
|
|
|
|
|
|
} |
11085
|
|
|
|
|
|
else |
11086
|
3523944
|
|
|
|
|
ender = (U8) *p++; |
11087
|
|
|
|
|
|
break; |
11088
|
|
|
|
|
|
} /* End of switch on the literal */ |
11089
|
|
|
|
|
|
|
11090
|
|
|
|
|
|
/* Here, have looked at the literal character and |
11091
|
|
|
|
|
|
* contains its ordinal, points to the character after it |
11092
|
|
|
|
|
|
*/ |
11093
|
|
|
|
|
|
|
11094
|
6382706
|
|
|
|
|
if ( RExC_flags & RXf_PMf_EXTENDED) |
11095
|
3616
|
|
|
|
|
p = regwhite( pRExC_state, p ); |
11096
|
|
|
|
|
|
|
11097
|
|
|
|
|
|
/* If the next thing is a quantifier, it applies to this |
11098
|
|
|
|
|
|
* character only, which means that this character has to be in |
11099
|
|
|
|
|
|
* its own node and can't just be appended to the string in an |
11100
|
|
|
|
|
|
* existing node, so if there are already other characters in |
11101
|
|
|
|
|
|
* the node, close the node with just them, and set up to do |
11102
|
|
|
|
|
|
* this character again next time through, when it will be the |
11103
|
|
|
|
|
|
* only thing in its new node */ |
11104
|
6220
|
|
|
|
|
if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) |
11105
|
|
|
|
|
|
{ |
11106
|
117048
|
|
|
|
|
p = oldp; |
11107
|
2700
|
|
|
|
|
goto loopdone; |
11108
|
|
|
|
|
|
} |
11109
|
|
|
|
|
|
|
11110
|
1736
|
|
|
|
|
if (! FOLD) { |
11111
|
594
|
|
|
|
|
if (UTF) { |
11112
|
972
|
|
|
|
|
const STRLEN unilen = reguni(pRExC_state, ender, s); |
11113
|
1478
|
|
|
|
|
if (unilen > 0) { |
11114
|
1020
|
|
|
|
|
s += unilen; |
11115
|
1430
|
|
|
|
|
len += unilen; |
11116
|
|
|
|
|
|
} |
11117
|
|
|
|
|
|
|
11118
|
|
|
|
|
|
/* The loop increments each time, as all but this |
11119
|
|
|
|
|
|
* path (and one other) through it add a single byte to |
11120
|
|
|
|
|
|
* the EXACTish node. But this one has changed len to |
11121
|
|
|
|
|
|
* be the correct final value, so subtract one to |
11122
|
|
|
|
|
|
* cancel out the increment that follows */ |
11123
|
972
|
|
|
|
|
len--; |
11124
|
|
|
|
|
|
} |
11125
|
|
|
|
|
|
else { |
11126
|
584
|
|
|
|
|
REGC((char)ender, s++); |
11127
|
|
|
|
|
|
} |
11128
|
|
|
|
|
|
} |
11129
|
228
|
|
|
|
|
else /* FOLD */ if (! ( UTF |
11130
|
|
|
|
|
|
/* See comments for join_exact() as to why we fold this |
11131
|
|
|
|
|
|
* non-UTF at compile time */ |
11132
|
|
|
|
|
|
|| (node_type == EXACTFU |
11133
|
0
|
|
|
|
|
&& ender == LATIN_SMALL_LETTER_SHARP_S))) |
11134
|
|
|
|
|
|
{ |
11135
|
276
|
|
|
|
|
if (IS_IN_SOME_FOLD_L1(ender)) { |
11136
|
|
|
|
|
|
maybe_exact = FALSE; |
11137
|
|
|
|
|
|
|
11138
|
|
|
|
|
|
/* See if the character's fold differs between /d and |
11139
|
|
|
|
|
|
* /u. This includes the multi-char fold SHARP S to |
11140
|
|
|
|
|
|
* 'ss' */ |
11141
|
220
|
|
|
|
|
if (maybe_exactfu |
11142
|
1002
|
|
|
|
|
&& (PL_fold[ender] != PL_fold_latin1[ender] |
11143
|
3702
|
|
|
|
|
|| ender == LATIN_SMALL_LETTER_SHARP_S |
11144
|
986
|
|
|
|
|
|| (len > 0 |
11145
|
2796
|
|
|
|
|
&& isARG2_lower_or_UPPER_ARG1('s', ender) |
11146
|
74
|
|
|
|
|
&& isARG2_lower_or_UPPER_ARG1('s', *(s-1))))) |
11147
|
|
|
|
|
|
{ |
11148
|
|
|
|
|
|
maybe_exactfu = FALSE; |
11149
|
|
|
|
|
|
} |
11150
|
|
|
|
|
|
} |
11151
|
244
|
|
|
|
|
*(s++) = (char) ender; |
11152
|
|
|
|
|
|
} |
11153
|
|
|
|
|
|
else { /* UTF */ |
11154
|
|
|
|
|
|
|
11155
|
|
|
|
|
|
/* Prime the casefolded buffer. Locale rules, which apply |
11156
|
|
|
|
|
|
* only to code points < 256, aren't known until execution, |
11157
|
|
|
|
|
|
* so for them, just output the original character using |
11158
|
|
|
|
|
|
* utf8. If we start to fold non-UTF patterns, be sure to |
11159
|
|
|
|
|
|
* update join_exact() */ |
11160
|
16
|
|
|
|
|
if (LOC && ender < 256) { |
11161
|
48
|
|
|
|
|
if (NATIVE_IS_INVARIANT(ender)) { |
11162
|
48
|
|
|
|
|
*s = (U8) ender; |
11163
|
28359676
|
|
|
|
|
foldlen = 1; |
11164
|
|
|
|
|
|
} else { |
11165
|
0
|
|
|
|
|
*s = UTF8_TWO_BYTE_HI(ender); |
11166
|
28359676
|
|
|
|
|
*(s + 1) = UTF8_TWO_BYTE_LO(ender); |
11167
|
7556628
|
|
|
|
|
foldlen = 2; |
11168
|
|
|
|
|
|
} |
11169
|
|
|
|
|
|
} |
11170
|
|
|
|
|
|
else { |
11171
|
2192202
|
|
|
|
|
UV folded = _to_uni_fold_flags( |
11172
|
|
|
|
|
|
ender, |
11173
|
|
|
|
|
|
(U8 *) s, |
11174
|
|
|
|
|
|
&foldlen, |
11175
|
|
|
|
|
|
FOLD_FLAGS_FULL |
11176
|
|
|
|
|
|
| ((LOC) ? FOLD_FLAGS_LOCALE |
11177
|
|
|
|
|
|
: (ASCII_FOLD_RESTRICTED) |
11178
|
|
|
|
|
|
? FOLD_FLAGS_NOMIX_ASCII |
11179
|
|
|
|
|
|
: 0) |
11180
|
|
|
|
|
|
); |
11181
|
|
|
|
|
|
|
11182
|
|
|
|
|
|
/* If this node only contains non-folding code points |
11183
|
|
|
|
|
|
* so far, see if this new one is also non-folding */ |
11184
|
5364426
|
|
|
|
|
if (maybe_exact) { |
11185
|
308424
|
|
|
|
|
if (folded != ender) { |
11186
|
|
|
|
|
|
maybe_exact = FALSE; |
11187
|
|
|
|
|
|
} |
11188
|
|
|
|
|
|
else { |
11189
|
|
|
|
|
|
/* Here the fold is the original; we have |
11190
|
|
|
|
|
|
* to check further to see if anything |
11191
|
|
|
|
|
|
* folds to it */ |
11192
|
28359676
|
|
|
|
|
if (! PL_utf8_foldable) { |
11193
|
28359676
|
|
|
|
|
SV* swash = swash_init("utf8", |
11194
|
|
|
|
|
|
"_Perl_Any_Folds", |
11195
|
|
|
|
|
|
&PL_sv_undef, 1, 0); |
11196
|
28359676
|
|
|
|
|
PL_utf8_foldable = |
11197
|
28359648
|
|
|
|
|
_get_swash_invlist(swash); |
11198
|
28359648
|
|
|
|
|
SvREFCNT_dec_NN(swash); |
11199
|
|
|
|
|
|
} |
11200
|
0
|
|
|
|
|
if (_invlist_contains_cp(PL_utf8_foldable, |
11201
|
|
|
|
|
|
ender)) |
11202
|
|
|
|
|
|
{ |
11203
|
|
|
|
|
|
maybe_exact = FALSE; |
11204
|
|
|
|
|
|
} |
11205
|
|
|
|
|
|
} |
11206
|
|
|
|
|
|
} |
11207
|
76199659
|
|
|
|
|
ender = folded; |
11208
|
|
|
|
|
|
} |
11209
|
20504202
|
|
|
|
|
s += foldlen; |
11210
|
|
|
|
|
|
|
11211
|
|
|
|
|
|
/* The loop increments each time, as all but this |
11212
|
|
|
|
|
|
* path (and one other) through it add a single byte to the |
11213
|
|
|
|
|
|
* EXACTish node. But this one has changed len to be the |
11214
|
|
|
|
|
|
* correct final value, so subtract one to cancel out the |
11215
|
|
|
|
|
|
* increment that follows */ |
11216
|
20504202
|
|
|
|
|
len += foldlen - 1; |
11217
|
|
|
|
|
|
} |
11218
|
|
|
|
|
|
|
11219
|
34974671
|
|
|
|
|
if (next_is_quantifier) { |
11220
|
|
|
|
|
|
|
11221
|
|
|
|
|
|
/* Here, the next input is a quantifier, and to get here, |
11222
|
|
|
|
|
|
* the current character is the only one in the node. |
11223
|
|
|
|
|
|
* Also, here doesn't include the final byte for this |
11224
|
|
|
|
|
|
* character */ |
11225
|
24920534
|
|
|
|
|
len++; |
11226
|
4415500
|
|
|
|
|
goto loopdone; |
11227
|
|
|
|
|
|
} |
11228
|
|
|
|
|
|
|
11229
|
|
|
|
|
|
} /* End of loop through literal characters */ |
11230
|
|
|
|
|
|
|
11231
|
|
|
|
|
|
/* Here we have either exhausted the input or ran out of room in |
11232
|
|
|
|
|
|
* the node. (If we encountered a character that can't be in the |
11233
|
|
|
|
|
|
* node, transfer is made directly to , and so we |
11234
|
|
|
|
|
|
* wouldn't have fallen off the end of the loop.) In the latter |
11235
|
|
|
|
|
|
* case, we artificially have to split the node into two, because |
11236
|
|
|
|
|
|
* we just don't have enough space to hold everything. This |
11237
|
|
|
|
|
|
* creates a problem if the final character participates in a |
11238
|
|
|
|
|
|
* multi-character fold in the non-final position, as a match that |
11239
|
|
|
|
|
|
* should have occurred won't, due to the way nodes are matched, |
11240
|
|
|
|
|
|
* and our artificial boundary. So back off until we find a non- |
11241
|
|
|
|
|
|
* problematic character -- one that isn't at the beginning or |
11242
|
|
|
|
|
|
* middle of such a fold. (Either it doesn't participate in any |
11243
|
|
|
|
|
|
* folds, or appears only in the final position of all the folds it |
11244
|
|
|
|
|
|
* does participate in.) A better solution with far fewer false |
11245
|
|
|
|
|
|
* positives, and that would fill the nodes more completely, would |
11246
|
|
|
|
|
|
* be to actually have available all the multi-character folds to |
11247
|
|
|
|
|
|
* test against, and to back-off only far enough to be sure that |
11248
|
|
|
|
|
|
* this node isn't ending with a partial one. is set |
11249
|
|
|
|
|
|
* further below (if we need to reparse the node) to include just |
11250
|
|
|
|
|
|
* up through that final non-problematic character that this code |
11251
|
|
|
|
|
|
* identifies, so when it is set to less than the full node, we can |
11252
|
|
|
|
|
|
* skip the rest of this */ |
11253
|
20505122
|
|
|
|
|
if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { |
11254
|
|
|
|
|
|
|
11255
|
|
|
|
|
|
const STRLEN full_len = len; |
11256
|
|
|
|
|
|
|
11257
|
644176
|
|
|
|
|
assert(len >= MAX_NODE_STRING_SIZE); |
11258
|
|
|
|
|
|
|
11259
|
|
|
|
|
|
/* Here, points to the final byte of the final character. |
11260
|
|
|
|
|
|
* Look backwards through the string until find a non- |
11261
|
|
|
|
|
|
* problematic character */ |
11262
|
|
|
|
|
|
|
11263
|
618672
|
|
|
|
|
if (! UTF) { |
11264
|
|
|
|
|
|
|
11265
|
|
|
|
|
|
/* These two have no multi-char folds to non-UTF characters |
11266
|
|
|
|
|
|
*/ |
11267
|
25608
|
|
|
|
|
if (ASCII_FOLD_RESTRICTED || LOC) { |
11268
|
|
|
|
|
|
goto loopdone; |
11269
|
|
|
|
|
|
} |
11270
|
|
|
|
|
|
|
11271
|
2237886
|
|
|
|
|
while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } |
11272
|
20504202
|
|
|
|
|
len = s - s0 + 1; |
11273
|
|
|
|
|
|
} |
11274
|
|
|
|
|
|
else { |
11275
|
48952
|
|
|
|
|
if (! PL_NonL1NonFinalFold) { |
11276
|
48952
|
|
|
|
|
PL_NonL1NonFinalFold = _new_invlist_C_array( |
11277
|
|
|
|
|
|
NonL1_Perl_Non_Final_Folds_invlist); |
11278
|
|
|
|
|
|
} |
11279
|
|
|
|
|
|
|
11280
|
|
|
|
|
|
/* Point to the first byte of the final character */ |
11281
|
121586
|
|
|
|
|
s = (char *) utf8_hop((U8 *) s, -1); |
11282
|
|
|
|
|
|
|
11283
|
97098
|
|
|
|
|
while (s >= s0) { /* Search backwards until find |
11284
|
|
|
|
|
|
non-problematic char */ |
11285
|
48148
|
|
|
|
|
if (UTF8_IS_INVARIANT(*s)) { |
11286
|
|
|
|
|
|
|
11287
|
|
|
|
|
|
/* There are no ascii characters that participate |
11288
|
|
|
|
|
|
* in multi-char folds under /aa. In EBCDIC, the |
11289
|
|
|
|
|
|
* non-ascii invariants are all control characters, |
11290
|
|
|
|
|
|
* so don't ever participate in any folds. */ |
11291
|
48950
|
|
|
|
|
if (ASCII_FOLD_RESTRICTED |
11292
|
220
|
|
|
|
|
|| ! IS_NON_FINAL_FOLD(*s)) |
11293
|
|
|
|
|
|
{ |
11294
|
|
|
|
|
|
break; |
11295
|
|
|
|
|
|
} |
11296
|
|
|
|
|
|
} |
11297
|
220
|
|
|
|
|
else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { |
11298
|
|
|
|
|
|
|
11299
|
|
|
|
|
|
/* No Latin1 characters participate in multi-char |
11300
|
|
|
|
|
|
* folds under /l */ |
11301
|
212
|
|
|
|
|
if (LOC |
11302
|
10
|
|
|
|
|
|| ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( |
11303
|
|
|
|
|
|
*s, *(s+1)))) |
11304
|
|
|
|
|
|
{ |
11305
|
|
|
|
|
|
break; |
11306
|
|
|
|
|
|
} |
11307
|
|
|
|
|
|
} |
11308
|
24080
|
|
|
|
|
else if (! _invlist_contains_cp( |
11309
|
|
|
|
|
|
PL_NonL1NonFinalFold, |
11310
|
|
|
|
|
|
valid_utf8_to_uvchr((U8 *) s, NULL))) |
11311
|
|
|
|
|
|
{ |
11312
|
|
|
|
|
|
break; |
11313
|
|
|
|
|
|
} |
11314
|
|
|
|
|
|
|
11315
|
|
|
|
|
|
/* Here, the current character is problematic in that |
11316
|
|
|
|
|
|
* it does occur in the non-final position of some |
11317
|
|
|
|
|
|
* fold, so try the character before it, but have to |
11318
|
|
|
|
|
|
* special case the very first byte in the string, so |
11319
|
|
|
|
|
|
* we don't read outside the string */ |
11320
|
48952
|
|
|
|
|
s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); |
11321
|
|
|
|
|
|
} /* End of loop backwards through the string */ |
11322
|
|
|
|
|
|
|
11323
|
|
|
|
|
|
/* If there were only problematic characters in the string, |
11324
|
|
|
|
|
|
* will point to before s0, in which case the length |
11325
|
|
|
|
|
|
* should be 0, otherwise include the length of the |
11326
|
|
|
|
|
|
* non-problematic character just found */ |
11327
|
89316
|
|
|
|
|
len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); |
11328
|
|
|
|
|
|
} |
11329
|
|
|
|
|
|
|
11330
|
|
|
|
|
|
/* Here, have found the final character, if any, that is |
11331
|
|
|
|
|
|
* non-problematic as far as ending the node without splitting |
11332
|
|
|
|
|
|
* it across a potential multi-char fold. contains the |
11333
|
|
|
|
|
|
* number of bytes in the node up-to and including that |
11334
|
|
|
|
|
|
* character, or is 0 if there is no such character, meaning |
11335
|
|
|
|
|
|
* the whole node contains only problematic characters. In |
11336
|
|
|
|
|
|
* this case, give up and just take the node as-is. We can't |
11337
|
|
|
|
|
|
* do any better */ |
11338
|
30248
|
|
|
|
|
if (len == 0) { |
11339
|
|
|
|
|
|
len = full_len; |
11340
|
|
|
|
|
|
|
11341
|
|
|
|
|
|
/* If the node ends in an 's' we make sure it stays EXACTF, |
11342
|
|
|
|
|
|
* as if it turns into an EXACTFU, it could later get |
11343
|
|
|
|
|
|
* joined with another 's' that would then wrongly match |
11344
|
|
|
|
|
|
* the sharp s */ |
11345
|
59544
|
|
|
|
|
if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) |
11346
|
|
|
|
|
|
{ |
11347
|
|
|
|
|
|
maybe_exactfu = FALSE; |
11348
|
|
|
|
|
|
} |
11349
|
|
|
|
|
|
} else { |
11350
|
|
|
|
|
|
|
11351
|
|
|
|
|
|
/* Here, the node does contain some characters that aren't |
11352
|
|
|
|
|
|
* problematic. If one such is the final character in the |
11353
|
|
|
|
|
|
* node, we are done */ |
11354
|
59544
|
|
|
|
|
if (len == full_len) { |
11355
|
|
|
|
|
|
goto loopdone; |
11356
|
|
|
|
|
|
} |
11357
|
400740
|
|
|
|
|
else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { |
11358
|
|
|
|
|
|
|
11359
|
|
|
|
|
|
/* If the final character is problematic, but the |
11360
|
|
|
|
|
|
* penultimate is not, back-off that last character to |
11361
|
|
|
|
|
|
* later start a new node with it */ |
11362
|
341196
|
|
|
|
|
p = oldp; |
11363
|
59544
|
|
|
|
|
goto loopdone; |
11364
|
|
|
|
|
|
} |
11365
|
|
|
|
|
|
|
11366
|
|
|
|
|
|
/* Here, the final non-problematic character is earlier |
11367
|
|
|
|
|
|
* in the input than the penultimate character. What we do |
11368
|
|
|
|
|
|
* is reparse from the beginning, going up only as far as |
11369
|
|
|
|
|
|
* this final ok one, thus guaranteeing that the node ends |
11370
|
|
|
|
|
|
* in an acceptable character. The reason we reparse is |
11371
|
|
|
|
|
|
* that we know how far in the character is, but we don't |
11372
|
|
|
|
|
|
* know how to correlate its position with the input parse. |
11373
|
|
|
|
|
|
* An alternate implementation would be to build that |
11374
|
|
|
|
|
|
* correlation as we go along during the original parse, |
11375
|
|
|
|
|
|
* but that would entail extra work for every node, whereas |
11376
|
|
|
|
|
|
* this code gets executed only when the string is too |
11377
|
|
|
|
|
|
* large for the node, and the final two characters are |
11378
|
|
|
|
|
|
* problematic, an infrequent occurrence. Yet another |
11379
|
|
|
|
|
|
* possible strategy would be to save the tail of the |
11380
|
|
|
|
|
|
* string, and the next time regatom is called, initialize |
11381
|
|
|
|
|
|
* with that. The problem with this is that unless you |
11382
|
|
|
|
|
|
* back off one more character, you won't be guaranteed |
11383
|
|
|
|
|
|
* regatom will get called again, unless regbranch, |
11384
|
|
|
|
|
|
* regpiece ... are also changed. If you do back off that |
11385
|
|
|
|
|
|
* extra character, so that there is input guaranteed to |
11386
|
|
|
|
|
|
* force calling regatom, you can't handle the case where |
11387
|
|
|
|
|
|
* just the first character in the node is acceptable. I |
11388
|
|
|
|
|
|
* (khw) decided to try this method which doesn't have that |
11389
|
|
|
|
|
|
* pitfall; if performance issues are found, we can do a |
11390
|
|
|
|
|
|
* combination of the current approach plus that one */ |
11391
|
556
|
|
|
|
|
upper_parse = len; |
11392
|
|
|
|
|
|
len = 0; |
11393
|
|
|
|
|
|
s = s0; |
11394
|
4
|
|
|
|
|
goto reparse; |
11395
|
|
|
|
|
|
} |
11396
|
|
|
|
|
|
} /* End of verifying node ends with an appropriate char */ |
11397
|
|
|
|
|
|
|
11398
|
|
|
|
|
|
loopdone: /* Jumped to when encounters something that shouldn't be in |
11399
|
|
|
|
|
|
the node */ |
11400
|
|
|
|
|
|
|
11401
|
|
|
|
|
|
/* I (khw) don't know if you can get here with zero length, but the |
11402
|
|
|
|
|
|
* old code handled this situation by creating a zero-length EXACT |
11403
|
|
|
|
|
|
* node. Might as well be NOTHING instead */ |
11404
|
400
|
|
|
|
|
if (len == 0) { |
11405
|
20
|
|
|
|
|
OP(ret) = NOTHING; |
11406
|
|
|
|
|
|
} |
11407
|
|
|
|
|
|
else { |
11408
|
380
|
|
|
|
|
if (FOLD) { |
11409
|
|
|
|
|
|
/* If 'maybe_exact' is still set here, means there are no |
11410
|
|
|
|
|
|
* code points in the node that participate in folds; |
11411
|
|
|
|
|
|
* similarly for 'maybe_exactfu' and code points that match |
11412
|
|
|
|
|
|
* differently depending on UTF8ness of the target string |
11413
|
|
|
|
|
|
* */ |
11414
|
620
|
|
|
|
|
if (maybe_exact) { |
11415
|
58988
|
|
|
|
|
OP(ret) = EXACT; |
11416
|
|
|
|
|
|
} |
11417
|
59056
|
|
|
|
|
else if (maybe_exactfu) { |
11418
|
58878
|
|
|
|
|
OP(ret) = EXACTFU; |
11419
|
|
|
|
|
|
} |
11420
|
|
|
|
|
|
} |
11421
|
59220
|
|
|
|
|
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); |
11422
|
|
|
|
|
|
} |
11423
|
|
|
|
|
|
|
11424
|
59220
|
|
|
|
|
RExC_parse = p - 1; |
11425
|
59200
|
|
|
|
|
Set_Node_Cur_Length(ret, parse_start); |
11426
|
59200
|
|
|
|
|
nextchar(pRExC_state); |
11427
|
|
|
|
|
|
{ |
11428
|
|
|
|
|
|
/* len is STRLEN which is unsigned, need to copy to signed */ |
11429
|
59200
|
|
|
|
|
IV iv = len; |
11430
|
1350
|
|
|
|
|
if (iv < 0) |
11431
|
56376
|
|
|
|
|
vFAIL("Internal disaster"); |
11432
|
|
|
|
|
|
} |
11433
|
|
|
|
|
|
|
11434
|
|
|
|
|
|
} /* End of label 'defchar:' */ |
11435
|
|
|
|
|
|
break; |
11436
|
|
|
|
|
|
} /* End of giant switch on input character */ |
11437
|
|
|
|
|
|
|
11438
|
1820
|
|
|
|
|
return(ret); |
11439
|
|
|
|
|
|
} |
11440
|
|
|
|
|
|
|
11441
|
|
|
|
|
|
STATIC char * |
11442
|
696
|
|
|
|
|
S_regwhite( RExC_state_t *pRExC_state, char *p ) |
11443
|
|
|
|
|
|
{ |
11444
|
824
|
|
|
|
|
const char *e = RExC_end; |
11445
|
|
|
|
|
|
|
11446
|
468
|
|
|
|
|
PERL_ARGS_ASSERT_REGWHITE; |
11447
|
|
|
|
|
|
|
11448
|
440
|
|
|
|
|
while (p < e) { |
11449
|
5668
|
|
|
|
|
if (isSPACE(*p)) |
11450
|
952
|
|
|
|
|
++p; |
11451
|
3624
|
|
|
|
|
else if (*p == '#') { |
11452
|
|
|
|
|
|
bool ended = 0; |
11453
|
|
|
|
|
|
do { |
11454
|
1764
|
|
|
|
|
if (*p++ == '\n') { |
11455
|
|
|
|
|
|
ended = 1; |
11456
|
|
|
|
|
|
break; |
11457
|
|
|
|
|
|
} |
11458
|
1860
|
|
|
|
|
} while (p < e); |
11459
|
1860
|
|
|
|
|
if (!ended) |
11460
|
42572
|
|
|
|
|
RExC_seen |= REG_SEEN_RUN_ON_COMMENT; |
11461
|
|
|
|
|
|
} |
11462
|
|
|
|
|
|
else |
11463
|
|
|
|
|
|
break; |
11464
|
|
|
|
|
|
} |
11465
|
41540
|
|
|
|
|
return p; |
11466
|
|
|
|
|
|
} |
11467
|
|
|
|
|
|
|
11468
|
|
|
|
|
|
STATIC char * |
11469
|
1124
|
|
|
|
|
S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) |
11470
|
|
|
|
|
|
{ |
11471
|
|
|
|
|
|
/* Returns the next non-pattern-white space, non-comment character (the |
11472
|
|
|
|
|
|
* latter only if 'recognize_comment is true) in the string p, which is |
11473
|
|
|
|
|
|
* ended by RExC_end. If there is no line break ending a comment, |
11474
|
|
|
|
|
|
* RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */ |
11475
|
1412
|
|
|
|
|
const char *e = RExC_end; |
11476
|
|
|
|
|
|
|
11477
|
58824
|
|
|
|
|
PERL_ARGS_ASSERT_REGPATWS; |
11478
|
|
|
|
|
|
|
11479
|
64
|
|
|
|
|
while (p < e) { |
11480
|
|
|
|
|
|
STRLEN len; |
11481
|
58760
|
|
|
|
|
if ((len = is_PATWS_safe(p, e, UTF))) { |
11482
|
43664
|
|
|
|
|
p += len; |
11483
|
|
|
|
|
|
} |
11484
|
20
|
|
|
|
|
else if (recognize_comment && *p == '#') { |
11485
|
|
|
|
|
|
bool ended = 0; |
11486
|
|
|
|
|
|
do { |
11487
|
10
|
|
|
|
|
p++; |
11488
|
0
|
|
|
|
|
if (is_LNBREAK_safe(p, e, UTF)) { |
11489
|
|
|
|
|
|
ended = 1; |
11490
|
|
|
|
|
|
break; |
11491
|
|
|
|
|
|
} |
11492
|
10
|
|
|
|
|
} while (p < e); |
11493
|
144
|
|
|
|
|
if (!ended) |
11494
|
4
|
|
|
|
|
RExC_seen |= REG_SEEN_RUN_ON_COMMENT; |
11495
|
|
|
|
|
|
} |
11496
|
|
|
|
|
|
else |
11497
|
|
|
|
|
|
break; |
11498
|
|
|
|
|
|
} |
11499
|
140
|
|
|
|
|
return p; |
11500
|
|
|
|
|
|
} |
11501
|
|
|
|
|
|
|
11502
|
|
|
|
|
|
/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. |
11503
|
|
|
|
|
|
Character classes ([:foo:]) can also be negated ([:^foo:]). |
11504
|
|
|
|
|
|
Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. |
11505
|
|
|
|
|
|
Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, |
11506
|
|
|
|
|
|
but trigger failures because they are currently unimplemented. */ |
11507
|
|
|
|
|
|
|
11508
|
|
|
|
|
|
#define POSIXCC_DONE(c) ((c) == ':') |
11509
|
|
|
|
|
|
#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') |
11510
|
|
|
|
|
|
#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) |
11511
|
|
|
|
|
|
|
11512
|
|
|
|
|
|
PERL_STATIC_INLINE I32 |
11513
|
|
|
|
|
|
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) |
11514
|
|
|
|
|
|
{ |
11515
|
|
|
|
|
|
dVAR; |
11516
|
|
|
|
|
|
I32 namedclass = OOB_NAMEDCLASS; |
11517
|
|
|
|
|
|
|
11518
|
6138
|
|
|
|
|
PERL_ARGS_ASSERT_REGPPOSIXCC; |
11519
|
|
|
|
|
|
|
11520
|
6138
|
|
|
|
|
if (value == '[' && RExC_parse + 1 < RExC_end && |
11521
|
|
|
|
|
|
/* I smell either [: or [= or [. -- POSIX has been here, right? */ |
11522
|
6138
|
|
|
|
|
POSIXCC(UCHARAT(RExC_parse))) |
11523
|
|
|
|
|
|
{ |
11524
|
6138
|
|
|
|
|
const char c = UCHARAT(RExC_parse); |
11525
|
34
|
|
|
|
|
char* const s = RExC_parse++; |
11526
|
|
|
|
|
|
|
11527
|
98
|
|
|
|
|
while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) |
11528
|
34
|
|
|
|
|
RExC_parse++; |
11529
|
34
|
|
|
|
|
if (RExC_parse == RExC_end) { |
11530
|
30
|
|
|
|
|
if (strict) { |
11531
|
|
|
|
|
|
|
11532
|
|
|
|
|
|
/* Try to give a better location for the error (than the end of |
11533
|
|
|
|
|
|
* the string) by looking for the matching ']' */ |
11534
|
2
|
|
|
|
|
RExC_parse = s; |
11535
|
4
|
|
|
|
|
while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { |
11536
|
3071
|
|
|
|
|
RExC_parse++; |
11537
|
|
|
|
|
|
} |
11538
|
4
|
|
|
|
|
vFAIL2("Unmatched '%c' in POSIX class", c); |
11539
|
|
|
|
|
|
} |
11540
|
|
|
|
|
|
/* Grandfather lone [:, [=, [. */ |
11541
|
6
|
|
|
|
|
RExC_parse = s; |
11542
|
|
|
|
|
|
} |
11543
|
|
|
|
|
|
else { |
11544
|
6260
|
|
|
|
|
const char* const t = RExC_parse++; /* skip over the c */ |
11545
|
6260
|
|
|
|
|
assert(*t == c); |
11546
|
|
|
|
|
|
|
11547
|
9390
|
|
|
|
|
if (UCHARAT(RExC_parse) == ']') { |
11548
|
2
|
|
|
|
|
const char *posixcc = s + 1; |
11549
|
6258
|
|
|
|
|
RExC_parse++; /* skip over the ending ] */ |
11550
|
|
|
|
|
|
|
11551
|
6258
|
|
|
|
|
if (*s == ':') { |
11552
|
3162
|
|
|
|
|
const I32 complement = *posixcc == '^' ? *posixcc++ : 0; |
11553
|
3162
|
|
|
|
|
const I32 skip = t - posixcc; |
11554
|
|
|
|
|
|
|
11555
|
|
|
|
|
|
/* Initially switch on the length of the name. */ |
11556
|
8607
|
|
|
|
|
switch (skip) { |
11557
|
|
|
|
|
|
case 4: |
11558
|
7008
|
|
|
|
|
if (memEQ(posixcc, "word", 4)) /* this is not POSIX, |
11559
|
|
|
|
|
|
this is the Perl \w |
11560
|
|
|
|
|
|
*/ |
11561
|
|
|
|
|
|
namedclass = ANYOF_WORDCHAR; |
11562
|
|
|
|
|
|
break; |
11563
|
|
|
|
|
|
case 5: |
11564
|
|
|
|
|
|
/* Names all of length 5. */ |
11565
|
|
|
|
|
|
/* alnum alpha ascii blank cntrl digit graph lower |
11566
|
|
|
|
|
|
print punct space upper */ |
11567
|
|
|
|
|
|
/* Offset 4 gives the best switch position. */ |
11568
|
7008
|
|
|
|
|
switch (posixcc[4]) { |
11569
|
|
|
|
|
|
case 'a': |
11570
|
7008
|
|
|
|
|
if (memEQ(posixcc, "alph", 4)) /* alpha */ |
11571
|
|
|
|
|
|
namedclass = ANYOF_ALPHA; |
11572
|
|
|
|
|
|
break; |
11573
|
|
|
|
|
|
case 'e': |
11574
|
28
|
|
|
|
|
if (memEQ(posixcc, "spac", 4)) /* space */ |
11575
|
|
|
|
|
|
namedclass = ANYOF_PSXSPC; |
11576
|
|
|
|
|
|
break; |
11577
|
|
|
|
|
|
case 'h': |
11578
|
138
|
|
|
|
|
if (memEQ(posixcc, "grap", 4)) /* graph */ |
11579
|
|
|
|
|
|
namedclass = ANYOF_GRAPH; |
11580
|
|
|
|
|
|
break; |
11581
|
|
|
|
|
|
case 'i': |
11582
|
138
|
|
|
|
|
if (memEQ(posixcc, "asci", 4)) /* ascii */ |
11583
|
|
|
|
|
|
namedclass = ANYOF_ASCII; |
11584
|
|
|
|
|
|
break; |
11585
|
|
|
|
|
|
case 'k': |
11586
|
3104
|
|
|
|
|
if (memEQ(posixcc, "blan", 4)) /* blank */ |
11587
|
|
|
|
|
|
namedclass = ANYOF_BLANK; |
11588
|
|
|
|
|
|
break; |
11589
|
|
|
|
|
|
case 'l': |
11590
|
3104
|
|
|
|
|
if (memEQ(posixcc, "cntr", 4)) /* cntrl */ |
11591
|
|
|
|
|
|
namedclass = ANYOF_CNTRL; |
11592
|
|
|
|
|
|
break; |
11593
|
|
|
|
|
|
case 'm': |
11594
|
3086
|
|
|
|
|
if (memEQ(posixcc, "alnu", 4)) /* alnum */ |
11595
|
|
|
|
|
|
namedclass = ANYOF_ALPHANUMERIC; |
11596
|
|
|
|
|
|
break; |
11597
|
|
|
|
|
|
case 'r': |
11598
|
3104
|
|
|
|
|
if (memEQ(posixcc, "lowe", 4)) /* lower */ |
11599
|
0
|
|
|
|
|
namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; |
11600
|
3050
|
|
|
|
|
else if (memEQ(posixcc, "uppe", 4)) /* upper */ |
11601
|
12
|
|
|
|
|
namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; |
11602
|
|
|
|
|
|
break; |
11603
|
|
|
|
|
|
case 't': |
11604
|
3050
|
|
|
|
|
if (memEQ(posixcc, "digi", 4)) /* digit */ |
11605
|
|
|
|
|
|
namedclass = ANYOF_DIGIT; |
11606
|
3050
|
|
|
|
|
else if (memEQ(posixcc, "prin", 4)) /* print */ |
11607
|
|
|
|
|
|
namedclass = ANYOF_PRINT; |
11608
|
3104
|
|
|
|
|
else if (memEQ(posixcc, "punc", 4)) /* punct */ |
11609
|
|
|
|
|
|
namedclass = ANYOF_PUNCT; |
11610
|
|
|
|
|
|
break; |
11611
|
|
|
|
|
|
} |
11612
|
|
|
|
|
|
break; |
11613
|
|
|
|
|
|
case 6: |
11614
|
3090
|
|
|
|
|
if (memEQ(posixcc, "xdigit", 6)) |
11615
|
|
|
|
|
|
namedclass = ANYOF_XDIGIT; |
11616
|
|
|
|
|
|
break; |
11617
|
|
|
|
|
|
} |
11618
|
|
|
|
|
|
|
11619
|
3090
|
|
|
|
|
if (namedclass == OOB_NAMEDCLASS) |
11620
|
3088
|
|
|
|
|
Simple_vFAIL3("POSIX class [:%.*s:] unknown", |
11621
|
|
|
|
|
|
t - s - 1, s + 1); |
11622
|
|
|
|
|
|
|
11623
|
|
|
|
|
|
/* The #defines are structured so each complement is +1 to |
11624
|
|
|
|
|
|
* the normal one */ |
11625
|
3086
|
|
|
|
|
if (complement) { |
11626
|
3086
|
|
|
|
|
namedclass++; |
11627
|
|
|
|
|
|
} |
11628
|
3086
|
|
|
|
|
assert (posixcc[skip] == ':'); |
11629
|
3086
|
|
|
|
|
assert (posixcc[skip+1] == ']'); |
11630
|
3864
|
|
|
|
|
} else if (!SIZE_ONLY) { |
11631
|
|
|
|
|
|
/* [[=foo=]] and [[.foo.]] are still future. */ |
11632
|
|
|
|
|
|
|
11633
|
|
|
|
|
|
/* adjust RExC_parse so the warning shows after |
11634
|
|
|
|
|
|
the class closes */ |
11635
|
22
|
|
|
|
|
while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') |
11636
|
3096
|
|
|
|
|
RExC_parse++; |
11637
|
7910
|
|
|
|
|
vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); |
11638
|
|
|
|
|
|
} |
11639
|
|
|
|
|
|
} else { |
11640
|
|
|
|
|
|
/* Maternal grandfather: |
11641
|
|
|
|
|
|
* "[:" ending in ":" but not in ":]" */ |
11642
|
6362
|
|
|
|
|
if (strict) { |
11643
|
6362
|
|
|
|
|
vFAIL("Unmatched '[' in POSIX class"); |
11644
|
|
|
|
|
|
} |
11645
|
|
|
|
|
|
|
11646
|
|
|
|
|
|
/* Grandfather lone [:, [=, [. */ |
11647
|
6362
|
|
|
|
|
RExC_parse = s; |
11648
|
|
|
|
|
|
} |
11649
|
|
|
|
|
|
} |
11650
|
|
|
|
|
|
} |
11651
|
|
|
|
|
|
|
11652
|
|
|
|
|
|
return namedclass; |
11653
|
|
|
|
|
|
} |
11654
|
|
|
|
|
|
|
11655
|
|
|
|
|
|
STATIC bool |
11656
|
6362
|
|
|
|
|
S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) |
11657
|
|
|
|
|
|
{ |
11658
|
|
|
|
|
|
/* This applies some heuristics at the current parse position (which should |
11659
|
|
|
|
|
|
* be at a '[') to see if what follows might be intended to be a [:posix:] |
11660
|
|
|
|
|
|
* class. It returns true if it really is a posix class, of course, but it |
11661
|
|
|
|
|
|
* also can return true if it thinks that what was intended was a posix |
11662
|
|
|
|
|
|
* class that didn't quite make it. |
11663
|
|
|
|
|
|
* |
11664
|
|
|
|
|
|
* It will return true for |
11665
|
|
|
|
|
|
* [:alphanumerics: |
11666
|
|
|
|
|
|
* [:alphanumerics] (as long as the ] isn't followed immediately by a |
11667
|
|
|
|
|
|
* ')' indicating the end of the (?[ |
11668
|
|
|
|
|
|
* [:any garbage including %^&$ punctuation:] |
11669
|
|
|
|
|
|
* |
11670
|
|
|
|
|
|
* This is designed to be called only from S_handle_regex_sets; it could be |
11671
|
|
|
|
|
|
* easily adapted to be called from the spot at the beginning of regclass() |
11672
|
|
|
|
|
|
* that checks to see in a normal bracketed class if the surrounding [] |
11673
|
|
|
|
|
|
* have been omitted ([:word:] instead of [[:word:]]). But doing so would |
11674
|
|
|
|
|
|
* change long-standing behavior, so I (khw) didn't do that */ |
11675
|
0
|
|
|
|
|
char* p = RExC_parse + 1; |
11676
|
6362
|
|
|
|
|
char first_char = *p; |
11677
|
|
|
|
|
|
|
11678
|
3320
|
|
|
|
|
PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS; |
11679
|
|
|
|
|
|
|
11680
|
14
|
|
|
|
|
assert(*(p - 1) == '['); |
11681
|
|
|
|
|
|
|
11682
|
14
|
|
|
|
|
if (! POSIXCC(first_char)) { |
11683
|
|
|
|
|
|
return FALSE; |
11684
|
|
|
|
|
|
} |
11685
|
|
|
|
|
|
|
11686
|
14
|
|
|
|
|
p++; |
11687
|
14
|
|
|
|
|
while (p < RExC_end && isWORDCHAR(*p)) p++; |
11688
|
|
|
|
|
|
|
11689
|
14
|
|
|
|
|
if (p >= RExC_end) { |
11690
|
|
|
|
|
|
return FALSE; |
11691
|
|
|
|
|
|
} |
11692
|
|
|
|
|
|
|
11693
|
14
|
|
|
|
|
if (p - RExC_parse > 2 /* Got at least 1 word character */ |
11694
|
12
|
|
|
|
|
&& (*p == first_char |
11695
|
10
|
|
|
|
|
|| (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')'))) |
11696
|
|
|
|
|
|
{ |
11697
|
|
|
|
|
|
return TRUE; |
11698
|
|
|
|
|
|
} |
11699
|
|
|
|
|
|
|
11700
|
10
|
|
|
|
|
p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse); |
11701
|
|
|
|
|
|
|
11702
|
10
|
|
|
|
|
return (p |
11703
|
10
|
|
|
|
|
&& p - RExC_parse > 2 /* [:] evaluates to colon; |
11704
|
|
|
|
|
|
[::] is a bad posix class. */ |
11705
|
10
|
|
|
|
|
&& first_char == *(p - 1)); |
11706
|
|
|
|
|
|
} |
11707
|
|
|
|
|
|
|
11708
|
|
|
|
|
|
STATIC regnode * |
11709
|
2
|
|
|
|
|
S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth, |
11710
|
|
|
|
|
|
char * const oregcomp_parse) |
11711
|
|
|
|
|
|
{ |
11712
|
|
|
|
|
|
/* Handle the (?[...]) construct to do set operations */ |
11713
|
|
|
|
|
|
|
11714
|
|
|
|
|
|
U8 curchar; |
11715
|
|
|
|
|
|
UV start, end; /* End points of code point ranges */ |
11716
|
|
|
|
|
|
SV* result_string; |
11717
|
|
|
|
|
|
char *save_end, *save_parse; |
11718
|
|
|
|
|
|
SV* final; |
11719
|
|
|
|
|
|
STRLEN len; |
11720
|
|
|
|
|
|
regnode* node; |
11721
|
|
|
|
|
|
AV* stack; |
11722
|
0
|
|
|
|
|
const bool save_fold = FOLD; |
11723
|
|
|
|
|
|
|
11724
|
2
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
11725
|
|
|
|
|
|
|
11726
|
0
|
|
|
|
|
PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; |
11727
|
|
|
|
|
|
|
11728
|
2
|
|
|
|
|
if (LOC) { |
11729
|
10
|
|
|
|
|
vFAIL("(?[...]) not valid in locale"); |
11730
|
|
|
|
|
|
} |
11731
|
10
|
|
|
|
|
RExC_uni_semantics = 1; |
11732
|
|
|
|
|
|
|
11733
|
|
|
|
|
|
/* This will return only an ANYOF regnode, or (unlikely) something smaller |
11734
|
|
|
|
|
|
* (such as EXACT). Thus we can skip most everything if just sizing. We |
11735
|
|
|
|
|
|
* call regclass to handle '[]' so as to not have to reinvent its parsing |
11736
|
|
|
|
|
|
* rules here (throwing away the size it computes each time). And, we exit |
11737
|
|
|
|
|
|
* upon an unescaped ']' that isn't one ending a regclass. To do both |
11738
|
|
|
|
|
|
* these things, we need to realize that something preceded by a backslash |
11739
|
|
|
|
|
|
* is escaped, so we have to keep track of backslashes */ |
11740
|
10
|
|
|
|
|
if (SIZE_ONLY) { |
11741
|
|
|
|
|
|
UV depth = 0; /* how many nested (?[...]) constructs */ |
11742
|
|
|
|
|
|
|
11743
|
10
|
|
|
|
|
Perl_ck_warner_d(aTHX_ |
11744
|
|
|
|
|
|
packWARN(WARN_EXPERIMENTAL__REGEX_SETS), |
11745
|
|
|
|
|
|
"The regex_sets feature is experimental" REPORT_LOCATION, |
11746
|
10
|
|
|
|
|
(int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse); |
11747
|
|
|
|
|
|
|
11748
|
2
|
|
|
|
|
while (RExC_parse < RExC_end) { |
11749
|
2
|
|
|
|
|
SV* current = NULL; |
11750
|
116
|
|
|
|
|
RExC_parse = regpatws(pRExC_state, RExC_parse, |
11751
|
|
|
|
|
|
TRUE); /* means recognize comments */ |
11752
|
0
|
|
|
|
|
switch (*RExC_parse) { |
11753
|
|
|
|
|
|
case '?': |
11754
|
96
|
|
|
|
|
if (RExC_parse[1] == '[') depth++, RExC_parse++; |
11755
|
|
|
|
|
|
/* FALL THROUGH */ |
11756
|
|
|
|
|
|
default: |
11757
|
|
|
|
|
|
break; |
11758
|
|
|
|
|
|
case '\\': |
11759
|
|
|
|
|
|
/* Skip the next byte (which could cause us to end up in |
11760
|
|
|
|
|
|
* the middle of a UTF-8 character, but since none of those |
11761
|
|
|
|
|
|
* are confusable with anything we currently handle in this |
11762
|
|
|
|
|
|
* switch (invariants all), it's safe. We'll just hit the |
11763
|
|
|
|
|
|
* default: case next time and keep on incrementing until |
11764
|
|
|
|
|
|
* we find one of the invariants we do handle. */ |
11765
|
96
|
|
|
|
|
RExC_parse++; |
11766
|
3034
|
|
|
|
|
break; |
11767
|
|
|
|
|
|
case '[': |
11768
|
|
|
|
|
|
{ |
11769
|
|
|
|
|
|
/* If this looks like it is a [:posix:] class, leave the |
11770
|
|
|
|
|
|
* parse pointer at the '[' to fool regclass() into |
11771
|
|
|
|
|
|
* thinking it is part of a '[[:posix:]]'. That function |
11772
|
|
|
|
|
|
* will use strict checking to force a syntax error if it |
11773
|
|
|
|
|
|
* doesn't work out to a legitimate class */ |
11774
|
3034
|
|
|
|
|
bool is_posix_class |
11775
|
|
|
|
|
|
= could_it_be_a_POSIX_class(pRExC_state); |
11776
|
3022
|
|
|
|
|
if (! is_posix_class) { |
11777
|
3034
|
|
|
|
|
RExC_parse++; |
11778
|
|
|
|
|
|
} |
11779
|
|
|
|
|
|
|
11780
|
|
|
|
|
|
/* regclass() can only return RESTART_UTF8 if multi-char |
11781
|
|
|
|
|
|
folds are allowed. */ |
11782
|
0
|
|
|
|
|
if (!regclass(pRExC_state, flagp,depth+1, |
11783
|
|
|
|
|
|
is_posix_class, /* parse the whole char |
11784
|
|
|
|
|
|
class only if not a |
11785
|
|
|
|
|
|
posix class */ |
11786
|
|
|
|
|
|
FALSE, /* don't allow multi-char folds */ |
11787
|
|
|
|
|
|
TRUE, /* silence non-portable warnings. */ |
11788
|
|
|
|
|
|
¤t)) |
11789
|
3024
|
|
|
|
|
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", |
11790
|
|
|
|
|
|
(UV) *flagp); |
11791
|
|
|
|
|
|
|
11792
|
|
|
|
|
|
/* function call leaves parse pointing to the ']', except |
11793
|
|
|
|
|
|
* if we faked it */ |
11794
|
12
|
|
|
|
|
if (is_posix_class) { |
11795
|
86
|
|
|
|
|
RExC_parse--; |
11796
|
|
|
|
|
|
} |
11797
|
|
|
|
|
|
|
11798
|
84
|
|
|
|
|
SvREFCNT_dec(current); /* In case it returned something */ |
11799
|
84
|
|
|
|
|
break; |
11800
|
|
|
|
|
|
} |
11801
|
|
|
|
|
|
|
11802
|
|
|
|
|
|
case ']': |
11803
|
4
|
|
|
|
|
if (depth--) break; |
11804
|
4
|
|
|
|
|
RExC_parse++; |
11805
|
82
|
|
|
|
|
if (RExC_parse < RExC_end |
11806
|
82
|
|
|
|
|
&& *RExC_parse == ')') |
11807
|
|
|
|
|
|
{ |
11808
|
10
|
|
|
|
|
node = reganode(pRExC_state, ANYOF, 0); |
11809
|
10
|
|
|
|
|
RExC_size += ANYOF_SKIP; |
11810
|
30
|
|
|
|
|
nextchar(pRExC_state); |
11811
|
20
|
|
|
|
|
Set_Node_Length(node, |
11812
|
|
|
|
|
|
RExC_parse - oregcomp_parse + 1); /* MJD */ |
11813
|
|
|
|
|
|
return node; |
11814
|
|
|
|
|
|
} |
11815
|
|
|
|
|
|
goto no_close; |
11816
|
|
|
|
|
|
} |
11817
|
20
|
|
|
|
|
RExC_parse++; |
11818
|
|
|
|
|
|
} |
11819
|
|
|
|
|
|
|
11820
|
|
|
|
|
|
no_close: |
11821
|
2
|
|
|
|
|
FAIL("Syntax error in (?[...])"); |
11822
|
|
|
|
|
|
} |
11823
|
|
|
|
|
|
|
11824
|
|
|
|
|
|
/* Pass 2 only after this. Everything in this construct is a |
11825
|
|
|
|
|
|
* metacharacter. Operands begin with either a '\' (for an escape |
11826
|
|
|
|
|
|
* sequence), or a '[' for a bracketed character class. Any other |
11827
|
|
|
|
|
|
* character should be an operator, or parenthesis for grouping. Both |
11828
|
|
|
|
|
|
* types of operands are handled by calling regclass() to parse them. It |
11829
|
|
|
|
|
|
* is called with a parameter to indicate to return the computed inversion |
11830
|
|
|
|
|
|
* list. The parsing here is implemented via a stack. Each entry on the |
11831
|
|
|
|
|
|
* stack is a single character representing one of the operators, or the |
11832
|
|
|
|
|
|
* '('; or else a pointer to an operand inversion list. */ |
11833
|
|
|
|
|
|
|
11834
|
|
|
|
|
|
#define IS_OPERAND(a) (! SvIOK(a)) |
11835
|
|
|
|
|
|
|
11836
|
|
|
|
|
|
/* The stack starts empty. It is a syntax error if the first thing parsed |
11837
|
|
|
|
|
|
* is a binary operator; everything else is pushed on the stack. When an |
11838
|
|
|
|
|
|
* operand is parsed, the top of the stack is examined. If it is a binary |
11839
|
|
|
|
|
|
* operator, the item before it should be an operand, and both are replaced |
11840
|
|
|
|
|
|
* by the result of doing that operation on the new operand and the one on |
11841
|
|
|
|
|
|
* the stack. Thus a sequence of binary operands is reduced to a single |
11842
|
|
|
|
|
|
* one before the next one is parsed. |
11843
|
|
|
|
|
|
* |
11844
|
|
|
|
|
|
* A unary operator may immediately follow a binary in the input, for |
11845
|
|
|
|
|
|
* example |
11846
|
|
|
|
|
|
* [a] + ! [b] |
11847
|
|
|
|
|
|
* When an operand is parsed and the top of the stack is a unary operator, |
11848
|
|
|
|
|
|
* the operation is performed, and then the stack is rechecked to see if |
11849
|
|
|
|
|
|
* this new operand is part of a binary operation; if so, it is handled as |
11850
|
|
|
|
|
|
* above. |
11851
|
|
|
|
|
|
* |
11852
|
|
|
|
|
|
* A '(' is simply pushed on the stack; it is valid only if the stack is |
11853
|
|
|
|
|
|
* empty, or the top element of the stack is an operator or another '(' |
11854
|
|
|
|
|
|
* (for which the parenthesized expression will become an operand). By the |
11855
|
|
|
|
|
|
* time the corresponding ')' is parsed everything in between should have |
11856
|
|
|
|
|
|
* been parsed and evaluated to a single operand (or else is a syntax |
11857
|
|
|
|
|
|
* error), and is handled as a regular operand */ |
11858
|
|
|
|
|
|
|
11859
|
2
|
|
|
|
|
sv_2mortal((SV *)(stack = newAV())); |
11860
|
|
|
|
|
|
|
11861
|
28
|
|
|
|
|
while (RExC_parse < RExC_end) { |
11862
|
28
|
|
|
|
|
I32 top_index = av_tindex(stack); |
11863
|
|
|
|
|
|
SV** top_ptr; |
11864
|
50
|
|
|
|
|
SV* current = NULL; |
11865
|
|
|
|
|
|
|
11866
|
|
|
|
|
|
/* Skip white space */ |
11867
|
28
|
|
|
|
|
RExC_parse = regpatws(pRExC_state, RExC_parse, |
11868
|
|
|
|
|
|
TRUE); /* means recognize comments */ |
11869
|
28
|
|
|
|
|
if (RExC_parse >= RExC_end) { |
11870
|
22
|
|
|
|
|
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); |
11871
|
|
|
|
|
|
} |
11872
|
22
|
|
|
|
|
if ((curchar = UCHARAT(RExC_parse)) == ']') { |
11873
|
|
|
|
|
|
break; |
11874
|
|
|
|
|
|
} |
11875
|
|
|
|
|
|
|
11876
|
22
|
|
|
|
|
switch (curchar) { |
11877
|
|
|
|
|
|
|
11878
|
|
|
|
|
|
case '?': |
11879
|
6
|
|
|
|
|
if (av_tindex(stack) >= 0 /* This makes sure that we can |
11880
|
|
|
|
|
|
safely subtract 1 from |
11881
|
|
|
|
|
|
RExC_parse in the next clause. |
11882
|
|
|
|
|
|
If we have something on the |
11883
|
|
|
|
|
|
stack, we have parsed something |
11884
|
|
|
|
|
|
*/ |
11885
|
6
|
|
|
|
|
&& UCHARAT(RExC_parse - 1) == '(' |
11886
|
6
|
|
|
|
|
&& RExC_parse < RExC_end) |
11887
|
|
|
|
|
|
{ |
11888
|
|
|
|
|
|
/* If is a '(?', could be an embedded '(?flags:(?[...])'. |
11889
|
|
|
|
|
|
* This happens when we have some thing like |
11890
|
|
|
|
|
|
* |
11891
|
|
|
|
|
|
* my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; |
11892
|
|
|
|
|
|
* ... |
11893
|
|
|
|
|
|
* qr/(?[ \p{Digit} & $thai_or_lao ])/; |
11894
|
|
|
|
|
|
* |
11895
|
|
|
|
|
|
* Here we would be handling the interpolated |
11896
|
|
|
|
|
|
* '$thai_or_lao'. We handle this by a recursive call to |
11897
|
|
|
|
|
|
* ourselves which returns the inversion list the |
11898
|
|
|
|
|
|
* interpolated expression evaluates to. We use the flags |
11899
|
|
|
|
|
|
* from the interpolated pattern. */ |
11900
|
22
|
|
|
|
|
U32 save_flags = RExC_flags; |
11901
|
1587
|
|
|
|
|
const char * const save_parse = ++RExC_parse; |
11902
|
|
|
|
|
|
|
11903
|
3162
|
|
|
|
|
parse_lparen_question_flags(pRExC_state); |
11904
|
|
|
|
|
|
|
11905
|
3056
|
|
|
|
|
if (RExC_parse == save_parse /* Makes sure there was at |
11906
|
|
|
|
|
|
least one flag (or this |
11907
|
|
|
|
|
|
embedding wasn't compiled) |
11908
|
|
|
|
|
|
*/ |
11909
|
106
|
|
|
|
|
|| RExC_parse >= RExC_end - 4 |
11910
|
106
|
|
|
|
|
|| UCHARAT(RExC_parse) != ':' |
11911
|
6
|
|
|
|
|
|| UCHARAT(++RExC_parse) != '(' |
11912
|
6
|
|
|
|
|
|| UCHARAT(++RExC_parse) != '?' |
11913
|
6
|
|
|
|
|
|| UCHARAT(++RExC_parse) != '[') |
11914
|
|
|
|
|
|
{ |
11915
|
|
|
|
|
|
|
11916
|
|
|
|
|
|
/* In combination with the above, this moves the |
11917
|
|
|
|
|
|
* pointer to the point just after the first erroneous |
11918
|
|
|
|
|
|
* character (or if there are no flags, to where they |
11919
|
|
|
|
|
|
* should have been) */ |
11920
|
100
|
|
|
|
|
if (RExC_parse >= RExC_end - 4) { |
11921
|
100
|
|
|
|
|
RExC_parse = RExC_end; |
11922
|
|
|
|
|
|
} |
11923
|
22
|
|
|
|
|
else if (RExC_parse != save_parse) { |
11924
|
22
|
|
|
|
|
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; |
11925
|
|
|
|
|
|
} |
11926
|
10
|
|
|
|
|
vFAIL("Expecting '(?flags:(?[...'"); |
11927
|
|
|
|
|
|
} |
11928
|
10
|
|
|
|
|
RExC_parse++; |
11929
|
10
|
|
|
|
|
(void) handle_regex_sets(pRExC_state, ¤t, flagp, |
11930
|
|
|
|
|
|
depth+1, oregcomp_parse); |
11931
|
|
|
|
|
|
|
11932
|
|
|
|
|
|
/* Here, 'current' contains the embedded expression's |
11933
|
|
|
|
|
|
* inversion list, and RExC_parse points to the trailing |
11934
|
|
|
|
|
|
* ']'; the next character should be the ')' which will be |
11935
|
|
|
|
|
|
* paired with the '(' that has been put on the stack, so |
11936
|
|
|
|
|
|
* the whole embedded expression reduces to '(operand)' */ |
11937
|
10
|
|
|
|
|
RExC_parse++; |
11938
|
|
|
|
|
|
|
11939
|
18
|
|
|
|
|
RExC_flags = save_flags; |
11940
|
18
|
|
|
|
|
goto handle_operand; |
11941
|
|
|
|
|
|
} |
11942
|
|
|
|
|
|
/* FALL THROUGH */ |
11943
|
|
|
|
|
|
|
11944
|
|
|
|
|
|
default: |
11945
|
18
|
|
|
|
|
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; |
11946
|
18
|
|
|
|
|
vFAIL("Unexpected character"); |
11947
|
|
|
|
|
|
|
11948
|
|
|
|
|
|
case '\\': |
11949
|
|
|
|
|
|
/* regclass() can only return RESTART_UTF8 if multi-char |
11950
|
|
|
|
|
|
folds are allowed. */ |
11951
|
32
|
|
|
|
|
if (!regclass(pRExC_state, flagp,depth+1, |
11952
|
|
|
|
|
|
TRUE, /* means parse just the next thing */ |
11953
|
|
|
|
|
|
FALSE, /* don't allow multi-char folds */ |
11954
|
|
|
|
|
|
FALSE, /* don't silence non-portable warnings. */ |
11955
|
|
|
|
|
|
¤t)) |
11956
|
32
|
|
|
|
|
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", |
11957
|
|
|
|
|
|
(UV) *flagp); |
11958
|
|
|
|
|
|
/* regclass() will return with parsing just the \ sequence, |
11959
|
|
|
|
|
|
* leaving the parse pointer at the next thing to parse */ |
11960
|
32
|
|
|
|
|
RExC_parse--; |
11961
|
32
|
|
|
|
|
goto handle_operand; |
11962
|
|
|
|
|
|
|
11963
|
|
|
|
|
|
case '[': /* Is a bracketed character class */ |
11964
|
|
|
|
|
|
{ |
11965
|
8
|
|
|
|
|
bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state); |
11966
|
|
|
|
|
|
|
11967
|
8
|
|
|
|
|
if (! is_posix_class) { |
11968
|
8
|
|
|
|
|
RExC_parse++; |
11969
|
|
|
|
|
|
} |
11970
|
|
|
|
|
|
|
11971
|
|
|
|
|
|
/* regclass() can only return RESTART_UTF8 if multi-char |
11972
|
|
|
|
|
|
folds are allowed. */ |
11973
|
8
|
|
|
|
|
if(!regclass(pRExC_state, flagp,depth+1, |
11974
|
|
|
|
|
|
is_posix_class, /* parse the whole char class |
11975
|
|
|
|
|
|
only if not a posix class */ |
11976
|
|
|
|
|
|
FALSE, /* don't allow multi-char folds */ |
11977
|
|
|
|
|
|
FALSE, /* don't silence non-portable warnings. */ |
11978
|
|
|
|
|
|
¤t)) |
11979
|
10
|
|
|
|
|
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", |
11980
|
|
|
|
|
|
(UV) *flagp); |
11981
|
|
|
|
|
|
/* function call leaves parse pointing to the ']', except if we |
11982
|
|
|
|
|
|
* faked it */ |
11983
|
10
|
|
|
|
|
if (is_posix_class) { |
11984
|
10
|
|
|
|
|
RExC_parse--; |
11985
|
|
|
|
|
|
} |
11986
|
|
|
|
|
|
|
11987
|
|
|
|
|
|
goto handle_operand; |
11988
|
|
|
|
|
|
} |
11989
|
|
|
|
|
|
|
11990
|
|
|
|
|
|
case '&': |
11991
|
|
|
|
|
|
case '|': |
11992
|
|
|
|
|
|
case '+': |
11993
|
|
|
|
|
|
case '-': |
11994
|
|
|
|
|
|
case '^': |
11995
|
10
|
|
|
|
|
if (top_index < 0 |
11996
|
10
|
|
|
|
|
|| ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) |
11997
|
10
|
|
|
|
|
|| ! IS_OPERAND(*top_ptr)) |
11998
|
|
|
|
|
|
{ |
11999
|
10
|
|
|
|
|
RExC_parse++; |
12000
|
10
|
|
|
|
|
vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar); |
12001
|
|
|
|
|
|
} |
12002
|
10
|
|
|
|
|
av_push(stack, newSVuv(curchar)); |
12003
|
10
|
|
|
|
|
break; |
12004
|
|
|
|
|
|
|
12005
|
|
|
|
|
|
case '!': |
12006
|
10
|
|
|
|
|
av_push(stack, newSVuv(curchar)); |
12007
|
10
|
|
|
|
|
break; |
12008
|
|
|
|
|
|
|
12009
|
|
|
|
|
|
case '(': |
12010
|
0
|
|
|
|
|
if (top_index >= 0) { |
12011
|
90
|
|
|
|
|
top_ptr = av_fetch(stack, top_index, FALSE); |
12012
|
90
|
|
|
|
|
assert(top_ptr); |
12013
|
3266
|
|
|
|
|
if (IS_OPERAND(*top_ptr)) { |
12014
|
3042
|
|
|
|
|
RExC_parse++; |
12015
|
3036
|
|
|
|
|
vFAIL("Unexpected '(' with no preceding operator"); |
12016
|
|
|
|
|
|
} |
12017
|
|
|
|
|
|
} |
12018
|
3036
|
|
|
|
|
av_push(stack, newSVuv(curchar)); |
12019
|
3034
|
|
|
|
|
break; |
12020
|
|
|
|
|
|
|
12021
|
|
|
|
|
|
case ')': |
12022
|
|
|
|
|
|
{ |
12023
|
|
|
|
|
|
SV* lparen; |
12024
|
8
|
|
|
|
|
if (top_index < 1 |
12025
|
3034
|
|
|
|
|
|| ! (current = av_pop(stack)) |
12026
|
10
|
|
|
|
|
|| ! IS_OPERAND(current) |
12027
|
10
|
|
|
|
|
|| ! (lparen = av_pop(stack)) |
12028
|
3024
|
|
|
|
|
|| IS_OPERAND(lparen) |
12029
|
188566
|
|
|
|
|
|| SvUV(lparen) != '(') |
12030
|
|
|
|
|
|
{ |
12031
|
184030
|
|
|
|
|
SvREFCNT_dec(current); |
12032
|
65238
|
|
|
|
|
RExC_parse++; |
12033
|
151411
|
|
|
|
|
vFAIL("Unexpected ')'"); |
12034
|
|
|
|
|
|
} |
12035
|
3024
|
|
|
|
|
top_index -= 2; |
12036
|
3024
|
|
|
|
|
SvREFCNT_dec_NN(lparen); |
12037
|
|
|
|
|
|
|
12038
|
|
|
|
|
|
/* FALL THROUGH */ |
12039
|
|
|
|
|
|
} |
12040
|
|
|
|
|
|
|
12041
|
|
|
|
|
|
handle_operand: |
12042
|
|
|
|
|
|
|
12043
|
|
|
|
|
|
/* Here, we have an operand to process, in 'current' */ |
12044
|
|
|
|
|
|
|
12045
|
3024
|
|
|
|
|
if (top_index < 0) { /* Just push if stack is empty */ |
12046
|
3024
|
|
|
|
|
av_push(stack, current); |
12047
|
|
|
|
|
|
} |
12048
|
|
|
|
|
|
else { |
12049
|
3024
|
|
|
|
|
SV* top = av_pop(stack); |
12050
|
|
|
|
|
|
SV *prev = NULL; |
12051
|
|
|
|
|
|
char current_operator; |
12052
|
|
|
|
|
|
|
12053
|
3024
|
|
|
|
|
if (IS_OPERAND(top)) { |
12054
|
3024
|
|
|
|
|
SvREFCNT_dec_NN(top); |
12055
|
0
|
|
|
|
|
SvREFCNT_dec_NN(current); |
12056
|
3024
|
|
|
|
|
vFAIL("Operand with no preceding operator"); |
12057
|
|
|
|
|
|
} |
12058
|
376
|
|
|
|
|
current_operator = (char) SvUV(top); |
12059
|
3024
|
|
|
|
|
switch (current_operator) { |
12060
|
|
|
|
|
|
case '(': /* Push the '(' back on followed by the new |
12061
|
|
|
|
|
|
operand */ |
12062
|
3024
|
|
|
|
|
av_push(stack, top); |
12063
|
3024
|
|
|
|
|
av_push(stack, current); |
12064
|
|
|
|
|
|
SvREFCNT_inc(top); /* Counters the '_dec' done |
12065
|
|
|
|
|
|
just after the 'break', so |
12066
|
|
|
|
|
|
it doesn't get wrongly freed |
12067
|
|
|
|
|
|
*/ |
12068
|
|
|
|
|
|
break; |
12069
|
|
|
|
|
|
|
12070
|
|
|
|
|
|
case '!': |
12071
|
3024
|
|
|
|
|
_invlist_invert(current); |
12072
|
|
|
|
|
|
|
12073
|
|
|
|
|
|
/* Unlike binary operators, the top of the stack, |
12074
|
|
|
|
|
|
* now that this unary one has been popped off, may |
12075
|
|
|
|
|
|
* legally be an operator, and we now have operand |
12076
|
|
|
|
|
|
* for it. */ |
12077
|
3024
|
|
|
|
|
top_index--; |
12078
|
4572
|
|
|
|
|
SvREFCNT_dec_NN(top); |
12079
|
8246454
|
|
|
|
|
goto handle_operand; |
12080
|
|
|
|
|
|
|
12081
|
|
|
|
|
|
case '&': |
12082
|
8246454
|
|
|
|
|
prev = av_pop(stack); |
12083
|
8246454
|
|
|
|
|
_invlist_intersection(prev, |
12084
|
|
|
|
|
|
current, |
12085
|
|
|
|
|
|
¤t); |
12086
|
8246454
|
|
|
|
|
av_push(stack, current); |
12087
|
8246454
|
|
|
|
|
break; |
12088
|
|
|
|
|
|
|
12089
|
|
|
|
|
|
case '|': |
12090
|
|
|
|
|
|
case '+': |
12091
|
8246454
|
|
|
|
|
prev = av_pop(stack); |
12092
|
8246454
|
|
|
|
|
_invlist_union(prev, current, ¤t); |
12093
|
8246454
|
|
|
|
|
av_push(stack, current); |
12094
|
8246454
|
|
|
|
|
break; |
12095
|
|
|
|
|
|
|
12096
|
|
|
|
|
|
case '-': |
12097
|
8246454
|
|
|
|
|
prev = av_pop(stack);; |
12098
|
8246454
|
|
|
|
|
_invlist_subtract(prev, current, ¤t); |
12099
|
8246454
|
|
|
|
|
av_push(stack, current); |
12100
|
8246454
|
|
|
|
|
break; |
12101
|
|
|
|
|
|
|
12102
|
|
|
|
|
|
case '^': /* The union minus the intersection */ |
12103
|
|
|
|
|
|
{ |
12104
|
8246454
|
|
|
|
|
SV* i = NULL; |
12105
|
4179995
|
|
|
|
|
SV* u = NULL; |
12106
|
|
|
|
|
|
SV* element; |
12107
|
|
|
|
|
|
|
12108
|
4066459
|
|
|
|
|
prev = av_pop(stack); |
12109
|
4066459
|
|
|
|
|
_invlist_union(prev, current, &u); |
12110
|
6062609
|
|
|
|
|
_invlist_intersection(prev, current, &i); |
12111
|
|
|
|
|
|
/* _invlist_subtract will overwrite current |
12112
|
|
|
|
|
|
without freeing what it already contains */ |
12113
|
277276
|
|
|
|
|
element = current; |
12114
|
4066459
|
|
|
|
|
_invlist_subtract(u, i, ¤t); |
12115
|
4066459
|
|
|
|
|
av_push(stack, current); |
12116
|
4066459
|
|
|
|
|
SvREFCNT_dec_NN(i); |
12117
|
8246454
|
|
|
|
|
SvREFCNT_dec_NN(u); |
12118
|
6254
|
|
|
|
|
SvREFCNT_dec_NN(element); |
12119
|
8246454
|
|
|
|
|
break; |
12120
|
|
|
|
|
|
} |
12121
|
|
|
|
|
|
|
12122
|
|
|
|
|
|
default: |
12123
|
2126224
|
|
|
|
|
Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); |
12124
|
|
|
|
|
|
} |
12125
|
2126224
|
|
|
|
|
SvREFCNT_dec_NN(top); |
12126
|
2126224
|
|
|
|
|
SvREFCNT_dec(prev); |
12127
|
|
|
|
|
|
} |
12128
|
|
|
|
|
|
} |
12129
|
|
|
|
|
|
|
12130
|
756
|
|
|
|
|
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; |
12131
|
|
|
|
|
|
} |
12132
|
|
|
|
|
|
|
12133
|
8246454
|
|
|
|
|
if (av_tindex(stack) < 0 /* Was empty */ |
12134
|
144220
|
|
|
|
|
|| ((final = av_pop(stack)) == NULL) |
12135
|
144220
|
|
|
|
|
|| ! IS_OPERAND(final) |
12136
|
226130
|
|
|
|
|
|| av_tindex(stack) >= 0) /* More left on stack */ |
12137
|
|
|
|
|
|
{ |
12138
|
13400
|
|
|
|
|
vFAIL("Incomplete expression within '(?[ ])'"); |
12139
|
|
|
|
|
|
} |
12140
|
|
|
|
|
|
|
12141
|
|
|
|
|
|
/* Here, 'final' is the resultant inversion list from evaluating the |
12142
|
|
|
|
|
|
* expression. Return it if so requested */ |
12143
|
144220
|
|
|
|
|
if (return_invlist) { |
12144
|
40
|
|
|
|
|
*return_invlist = final; |
12145
|
40
|
|
|
|
|
return END; |
12146
|
|
|
|
|
|
} |
12147
|
|
|
|
|
|
|
12148
|
|
|
|
|
|
/* Otherwise generate a resultant node, based on 'final'. regclass() is |
12149
|
|
|
|
|
|
* expecting a string of ranges and individual code points */ |
12150
|
36
|
|
|
|
|
invlist_iterinit(final); |
12151
|
8246450
|
|
|
|
|
result_string = newSVpvs(""); |
12152
|
410492
|
|
|
|
|
while (invlist_iternext(final, &start, &end)) { |
12153
|
8246450
|
|
|
|
|
if (start == end) { |
12154
|
27335836
|
|
|
|
|
Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start); |
12155
|
|
|
|
|
|
} |
12156
|
|
|
|
|
|
else { |
12157
|
26922254
|
|
|
|
|
Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}", |
12158
|
|
|
|
|
|
start, end); |
12159
|
|
|
|
|
|
} |
12160
|
|
|
|
|
|
} |
12161
|
|
|
|
|
|
|
12162
|
17330
|
|
|
|
|
save_parse = RExC_parse; |
12163
|
26922254
|
|
|
|
|
RExC_parse = SvPV(result_string, len); |
12164
|
19146320
|
|
|
|
|
save_end = RExC_end; |
12165
|
19146320
|
|
|
|
|
RExC_end = RExC_parse + len; |
12166
|
|
|
|
|
|
|
12167
|
|
|
|
|
|
/* We turn off folding around the call, as the class we have constructed |
12168
|
|
|
|
|
|
* already has all folding taken into consideration, and we don't want |
12169
|
|
|
|
|
|
* regclass() to add to that */ |
12170
|
17203734
|
|
|
|
|
RExC_flags &= ~RXf_PMf_FOLD; |
12171
|
|
|
|
|
|
/* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. |
12172
|
|
|
|
|
|
*/ |
12173
|
17203734
|
|
|
|
|
node = regclass(pRExC_state, flagp,depth+1, |
12174
|
|
|
|
|
|
FALSE, /* means parse the whole char class */ |
12175
|
|
|
|
|
|
FALSE, /* don't allow multi-char folds */ |
12176
|
|
|
|
|
|
TRUE, /* silence non-portable warnings. The above may very |
12177
|
|
|
|
|
|
well have generated non-portable code points, but |
12178
|
|
|
|
|
|
they're valid on this machine */ |
12179
|
|
|
|
|
|
NULL); |
12180
|
19146320
|
|
|
|
|
if (!node) |
12181
|
1458856
|
|
|
|
|
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, |
12182
|
|
|
|
|
|
PTR2UV(flagp)); |
12183
|
1458856
|
|
|
|
|
if (save_fold) { |
12184
|
17687464
|
|
|
|
|
RExC_flags |= RXf_PMf_FOLD; |
12185
|
|
|
|
|
|
} |
12186
|
19146320
|
|
|
|
|
RExC_parse = save_parse + 1; |
12187
|
126354
|
|
|
|
|
RExC_end = save_end; |
12188
|
126354
|
|
|
|
|
SvREFCNT_dec_NN(final); |
12189
|
89234
|
|
|
|
|
SvREFCNT_dec_NN(result_string); |
12190
|
|
|
|
|
|
|
12191
|
19086776
|
|
|
|
|
nextchar(pRExC_state); |
12192
|
7711468
|
|
|
|
|
Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ |
12193
|
|
|
|
|
|
return node; |
12194
|
|
|
|
|
|
} |
12195
|
|
|
|
|
|
#undef IS_OPERAND |
12196
|
|
|
|
|
|
|
12197
|
|
|
|
|
|
/* The names of properties whose definitions are not known at compile time are |
12198
|
|
|
|
|
|
* stored in this SV, after a constant heading. So if the length has been |
12199
|
|
|
|
|
|
* changed since initialization, then there is a run-time definition. */ |
12200
|
|
|
|
|
|
#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) |
12201
|
|
|
|
|
|
|
12202
|
|
|
|
|
|
STATIC regnode * |
12203
|
898384
|
|
|
|
|
S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, |
12204
|
|
|
|
|
|
const bool stop_at_1, /* Just parse the next thing, don't |
12205
|
|
|
|
|
|
look for a full character class */ |
12206
|
|
|
|
|
|
bool allow_multi_folds, |
12207
|
|
|
|
|
|
const bool silence_non_portable, /* Don't output warnings |
12208
|
|
|
|
|
|
about too large |
12209
|
|
|
|
|
|
characters */ |
12210
|
|
|
|
|
|
SV** ret_invlist) /* Return an inversion list, not a node */ |
12211
|
|
|
|
|
|
{ |
12212
|
|
|
|
|
|
/* parse a bracketed class specification. Most of these will produce an |
12213
|
|
|
|
|
|
* ANYOF node; but something like [a] will produce an EXACT node; [aA], an |
12214
|
|
|
|
|
|
* EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex |
12215
|
|
|
|
|
|
* under /i with multi-character folds: it will be rewritten following the |
12216
|
|
|
|
|
|
* paradigm of this example, where the s are characters which |
12217
|
|
|
|
|
|
* fold to multiple character sequences: |
12218
|
|
|
|
|
|
* /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i |
12219
|
|
|
|
|
|
* gets effectively rewritten as: |
12220
|
|
|
|
|
|
* /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i |
12221
|
|
|
|
|
|
* reg() gets called (recursively) on the rewritten version, and this |
12222
|
|
|
|
|
|
* function will return what it constructs. (Actually the s |
12223
|
|
|
|
|
|
* aren't physically removed from the [abcdefghi], it's just that they are |
12224
|
|
|
|
|
|
* ignored in the recursion by means of a flag: |
12225
|
|
|
|
|
|
* .) |
12226
|
|
|
|
|
|
* |
12227
|
|
|
|
|
|
* ANYOF nodes contain a bit map for the first 256 characters, with the |
12228
|
|
|
|
|
|
* corresponding bit set if that character is in the list. For characters |
12229
|
|
|
|
|
|
* above 255, a range list or swash is used. There are extra bits for \w, |
12230
|
|
|
|
|
|
* etc. in locale ANYOFs, as what these match is not determinable at |
12231
|
|
|
|
|
|
* compile time |
12232
|
|
|
|
|
|
* |
12233
|
|
|
|
|
|
* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs |
12234
|
|
|
|
|
|
* to be restarted. This can only happen if ret_invlist is non-NULL. |
12235
|
|
|
|
|
|
*/ |
12236
|
|
|
|
|
|
|
12237
|
|
|
|
|
|
dVAR; |
12238
|
|
|
|
|
|
UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; |
12239
|
|
|
|
|
|
IV range = 0; |
12240
|
898384
|
|
|
|
|
UV value = OOB_UNICODE, save_value = OOB_UNICODE; |
12241
|
|
|
|
|
|
regnode *ret; |
12242
|
|
|
|
|
|
STRLEN numlen; |
12243
|
|
|
|
|
|
IV namedclass = OOB_NAMEDCLASS; |
12244
|
|
|
|
|
|
char *rangebegin = NULL; |
12245
|
|
|
|
|
|
bool need_class = 0; |
12246
|
|
|
|
|
|
SV *listsv = NULL; |
12247
|
|
|
|
|
|
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more |
12248
|
|
|
|
|
|
than just initialized. */ |
12249
|
6813156
|
|
|
|
|
SV* properties = NULL; /* Code points that match \p{} \P{} */ |
12250
|
7711504
|
|
|
|
|
SV* posixes = NULL; /* Code points that match classes like, [:word:], |
12251
|
|
|
|
|
|
extended beyond the Latin1 range */ |
12252
|
|
|
|
|
|
UV element_count = 0; /* Number of distinct elements in the class. |
12253
|
|
|
|
|
|
Optimizations may be possible if this is tiny */ |
12254
|
|
|
|
|
|
AV * multi_char_matches = NULL; /* Code points that fold to more than one |
12255
|
|
|
|
|
|
character; used under /i */ |
12256
|
|
|
|
|
|
UV n; |
12257
|
34172
|
|
|
|
|
char * stop_ptr = RExC_end; /* where to stop parsing */ |
12258
|
37870
|
|
|
|
|
const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white |
12259
|
|
|
|
|
|
space? */ |
12260
|
13044
|
|
|
|
|
const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */ |
12261
|
|
|
|
|
|
|
12262
|
|
|
|
|
|
/* Unicode properties are stored in a swash; this holds the current one |
12263
|
|
|
|
|
|
* being parsed. If this swash is the only above-latin1 component of the |
12264
|
|
|
|
|
|
* character class, an optimization is to pass it directly on to the |
12265
|
|
|
|
|
|
* execution engine. Otherwise, it is set to NULL to indicate that there |
12266
|
|
|
|
|
|
* are other things in the class that have to be dealt with at execution |
12267
|
|
|
|
|
|
* time */ |
12268
|
|
|
|
|
|
SV* swash = NULL; /* Code points that match \p{} \P{} */ |
12269
|
|
|
|
|
|
|
12270
|
|
|
|
|
|
/* Set if a component of this character class is user-defined; just passed |
12271
|
|
|
|
|
|
* on to the engine */ |
12272
|
|
|
|
|
|
bool has_user_defined_property = FALSE; |
12273
|
|
|
|
|
|
|
12274
|
|
|
|
|
|
/* inversion list of code points this node matches only when the target |
12275
|
|
|
|
|
|
* string is in UTF-8. (Because is under /d) */ |
12276
|
51872
|
|
|
|
|
SV* depends_list = NULL; |
12277
|
|
|
|
|
|
|
12278
|
|
|
|
|
|
/* inversion list of code points this node matches. For much of the |
12279
|
|
|
|
|
|
* function, it includes only those that match regardless of the utf8ness |
12280
|
|
|
|
|
|
* of the target string */ |
12281
|
560
|
|
|
|
|
SV* cp_list = NULL; |
12282
|
|
|
|
|
|
|
12283
|
|
|
|
|
|
#ifdef EBCDIC |
12284
|
|
|
|
|
|
/* In a range, counts how many 0-2 of the ends of it came from literals, |
12285
|
|
|
|
|
|
* not escapes. Thus we can tell if 'A' was input vs \x{C1} */ |
12286
|
|
|
|
|
|
UV literal_endpoint = 0; |
12287
|
|
|
|
|
|
#endif |
12288
|
|
|
|
|
|
bool invert = FALSE; /* Is this class to be complemented */ |
12289
|
|
|
|
|
|
|
12290
|
|
|
|
|
|
/* Is there any thing like \W or [:^digit:] that matches above the legal |
12291
|
|
|
|
|
|
* Unicode range? */ |
12292
|
|
|
|
|
|
bool runtime_posix_matches_above_Unicode = FALSE; |
12293
|
|
|
|
|
|
|
12294
|
52
|
|
|
|
|
regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in |
12295
|
|
|
|
|
|
case we need to change the emitted regop to an EXACT. */ |
12296
|
48
|
|
|
|
|
const char * orig_parse = RExC_parse; |
12297
|
176
|
|
|
|
|
const SSize_t orig_size = RExC_size; |
12298
|
432
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
12299
|
|
|
|
|
|
|
12300
|
562
|
|
|
|
|
PERL_ARGS_ASSERT_REGCLASS; |
12301
|
|
|
|
|
|
#ifndef DEBUGGING |
12302
|
|
|
|
|
|
PERL_UNUSED_ARG(depth); |
12303
|
|
|
|
|
|
#endif |
12304
|
|
|
|
|
|
|
12305
|
48
|
|
|
|
|
DEBUG_PARSE("clas"); |
12306
|
|
|
|
|
|
|
12307
|
|
|
|
|
|
/* Assume we are going to generate an ANYOF node. */ |
12308
|
36
|
|
|
|
|
ret = reganode(pRExC_state, ANYOF, 0); |
12309
|
|
|
|
|
|
|
12310
|
411620
|
|
|
|
|
if (SIZE_ONLY) { |
12311
|
411602
|
|
|
|
|
RExC_size += ANYOF_SKIP; |
12312
|
|
|
|
|
|
listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ |
12313
|
|
|
|
|
|
} |
12314
|
|
|
|
|
|
else { |
12315
|
20
|
|
|
|
|
ANYOF_FLAGS(ret) = 0; |
12316
|
|
|
|
|
|
|
12317
|
411600
|
|
|
|
|
RExC_emit += ANYOF_SKIP; |
12318
|
411412
|
|
|
|
|
if (LOC) { |
12319
|
411376
|
|
|
|
|
ANYOF_FLAGS(ret) |= ANYOF_LOCALE; |
12320
|
|
|
|
|
|
} |
12321
|
411394
|
|
|
|
|
listsv = newSVpvs_flags("# comment\n", SVs_TEMP); |
12322
|
22
|
|
|
|
|
initial_listsv_len = SvCUR(listsv); |
12323
|
433624
|
|
|
|
|
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ |
12324
|
|
|
|
|
|
} |
12325
|
|
|
|
|
|
|
12326
|
22270
|
|
|
|
|
if (skip_white) { |
12327
|
411372
|
|
|
|
|
RExC_parse = regpatws(pRExC_state, RExC_parse, |
12328
|
|
|
|
|
|
FALSE /* means don't recognize comments */); |
12329
|
|
|
|
|
|
} |
12330
|
|
|
|
|
|
|
12331
|
38
|
|
|
|
|
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ |
12332
|
411370
|
|
|
|
|
RExC_parse++; |
12333
|
|
|
|
|
|
invert = TRUE; |
12334
|
|
|
|
|
|
allow_multi_folds = FALSE; |
12335
|
617089
|
|
|
|
|
RExC_naughty++; |
12336
|
34
|
|
|
|
|
if (skip_white) { |
12337
|
206
|
|
|
|
|
RExC_parse = regpatws(pRExC_state, RExC_parse, |
12338
|
|
|
|
|
|
FALSE /* means don't recognize comments */); |
12339
|
|
|
|
|
|
} |
12340
|
|
|
|
|
|
} |
12341
|
|
|
|
|
|
|
12342
|
|
|
|
|
|
/* Check that they didn't say [:posix:] instead of [[:posix:]] */ |
12343
|
411612
|
|
|
|
|
if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) { |
12344
|
205804
|
|
|
|
|
const char *s = RExC_parse; |
12345
|
85732
|
|
|
|
|
const char c = *s++; |
12346
|
|
|
|
|
|
|
12347
|
85732
|
|
|
|
|
while (isWORDCHAR(*s)) |
12348
|
85732
|
|
|
|
|
s++; |
12349
|
137402
|
|
|
|
|
if (*s && c == *s && s[1] == ']') { |
12350
|
8804
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); |
12351
|
8804
|
|
|
|
|
ckWARN3reg(s+2, |
12352
|
|
|
|
|
|
"POSIX syntax [%c %c] belongs inside character classes", |
12353
|
|
|
|
|
|
c, c); |
12354
|
205804
|
|
|
|
|
(void)ReREFCNT_inc(RExC_rx_sv); |
12355
|
|
|
|
|
|
} |
12356
|
|
|
|
|
|
} |
12357
|
|
|
|
|
|
|
12358
|
|
|
|
|
|
/* If the caller wants us to just parse a single element, accomplish this |
12359
|
|
|
|
|
|
* by faking the loop ending condition */ |
12360
|
308742
|
|
|
|
|
if (stop_at_1 && RExC_end > RExC_parse) { |
12361
|
205804
|
|
|
|
|
stop_ptr = RExC_parse + 1; |
12362
|
|
|
|
|
|
} |
12363
|
|
|
|
|
|
|
12364
|
|
|
|
|
|
/* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */ |
12365
|
205840
|
|
|
|
|
if (UCHARAT(RExC_parse) == ']') |
12366
|
|
|
|
|
|
goto charclassloop; |
12367
|
|
|
|
|
|
|
12368
|
|
|
|
|
|
parseit: |
12369
|
|
|
|
|
|
while (1) { |
12370
|
205880
|
|
|
|
|
if (RExC_parse >= stop_ptr) { |
12371
|
|
|
|
|
|
break; |
12372
|
|
|
|
|
|
} |
12373
|
|
|
|
|
|
|
12374
|
92
|
|
|
|
|
if (skip_white) { |
12375
|
205804
|
|
|
|
|
RExC_parse = regpatws(pRExC_state, RExC_parse, |
12376
|
|
|
|
|
|
FALSE /* means don't recognize comments */); |
12377
|
|
|
|
|
|
} |
12378
|
|
|
|
|
|
|
12379
|
205880
|
|
|
|
|
if (UCHARAT(RExC_parse) == ']') { |
12380
|
|
|
|
|
|
break; |
12381
|
|
|
|
|
|
} |
12382
|
|
|
|
|
|
|
12383
|
|
|
|
|
|
charclassloop: |
12384
|
|
|
|
|
|
|
12385
|
|
|
|
|
|
namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ |
12386
|
25172
|
|
|
|
|
save_value = value; |
12387
|
|
|
|
|
|
save_prevvalue = prevvalue; |
12388
|
|
|
|
|
|
|
12389
|
40
|
|
|
|
|
if (!range) { |
12390
|
25172
|
|
|
|
|
rangebegin = RExC_parse; |
12391
|
44
|
|
|
|
|
element_count++; |
12392
|
|
|
|
|
|
} |
12393
|
44
|
|
|
|
|
if (UTF) { |
12394
|
25128
|
|
|
|
|
value = utf8n_to_uvchr((U8*)RExC_parse, |
12395
|
|
|
|
|
|
RExC_end - RExC_parse, |
12396
|
|
|
|
|
|
&numlen, UTF8_ALLOW_DEFAULT); |
12397
|
25128
|
|
|
|
|
RExC_parse += numlen; |
12398
|
|
|
|
|
|
} |
12399
|
|
|
|
|
|
else |
12400
|
25168
|
|
|
|
|
value = UCHARAT(RExC_parse++); |
12401
|
|
|
|
|
|
|
12402
|
180712
|
|
|
|
|
if (value == '[' |
12403
|
180672
|
|
|
|
|
&& RExC_parse < RExC_end |
12404
|
180672
|
|
|
|
|
&& POSIXCC(UCHARAT(RExC_parse))) |
12405
|
|
|
|
|
|
{ |
12406
|
86590
|
|
|
|
|
namedclass = regpposixcc(pRExC_state, value, strict); |
12407
|
|
|
|
|
|
} |
12408
|
86630
|
|
|
|
|
else if (value == '\\') { |
12409
|
94082
|
|
|
|
|
if (UTF) { |
12410
|
205800
|
|
|
|
|
value = utf8n_to_uvchr((U8*)RExC_parse, |
12411
|
|
|
|
|
|
RExC_end - RExC_parse, |
12412
|
|
|
|
|
|
&numlen, UTF8_ALLOW_DEFAULT); |
12413
|
411572
|
|
|
|
|
RExC_parse += numlen; |
12414
|
|
|
|
|
|
} |
12415
|
|
|
|
|
|
else |
12416
|
411572
|
|
|
|
|
value = UCHARAT(RExC_parse++); |
12417
|
|
|
|
|
|
|
12418
|
|
|
|
|
|
/* Some compilers cannot handle switching on 64-bit integer |
12419
|
|
|
|
|
|
* values, therefore value cannot be an UV. Yes, this will |
12420
|
|
|
|
|
|
* be a problem later if we want switch on Unicode. |
12421
|
|
|
|
|
|
* A similar issue a little bit later when switching on |
12422
|
|
|
|
|
|
* namedclass. --jhi */ |
12423
|
|
|
|
|
|
|
12424
|
|
|
|
|
|
/* If the \ is escaping white space when white space is being |
12425
|
|
|
|
|
|
* skipped, it means that that white space is wanted literally, and |
12426
|
|
|
|
|
|
* is already in 'value'. Otherwise, need to translate the escape |
12427
|
|
|
|
|
|
* into what it signifies. */ |
12428
|
411572
|
|
|
|
|
if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) { |
12429
|
|
|
|
|
|
|
12430
|
|
|
|
|
|
case 'w': namedclass = ANYOF_WORDCHAR; break; |
12431
|
65956
|
|
|
|
|
case 'W': namedclass = ANYOF_NWORDCHAR; break; |
12432
|
37916
|
|
|
|
|
case 's': namedclass = ANYOF_SPACE; break; |
12433
|
29748
|
|
|
|
|
case 'S': namedclass = ANYOF_NSPACE; break; |
12434
|
21052
|
|
|
|
|
case 'd': namedclass = ANYOF_DIGIT; break; |
12435
|
21548
|
|
|
|
|
case 'D': namedclass = ANYOF_NDIGIT; break; |
12436
|
10504
|
|
|
|
|
case 'v': namedclass = ANYOF_VERTWS; break; |
12437
|
10376
|
|
|
|
|
case 'V': namedclass = ANYOF_NVERTWS; break; |
12438
|
398
|
|
|
|
|
case 'h': namedclass = ANYOF_HORIZWS; break; |
12439
|
398
|
|
|
|
|
case 'H': namedclass = ANYOF_NHORIZWS; break; |
12440
|
|
|
|
|
|
case 'N': /* Handle \N{NAME} in class */ |
12441
|
|
|
|
|
|
{ |
12442
|
|
|
|
|
|
/* We only pay attention to the first char of |
12443
|
|
|
|
|
|
multichar strings being returned. I kinda wonder |
12444
|
|
|
|
|
|
if this makes sense as it does change the behaviour |
12445
|
|
|
|
|
|
from earlier versions, OTOH that behaviour was broken |
12446
|
|
|
|
|
|
as well. */ |
12447
|
398
|
|
|
|
|
if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, |
12448
|
|
|
|
|
|
TRUE, /* => charclass */ |
12449
|
|
|
|
|
|
strict)) |
12450
|
|
|
|
|
|
{ |
12451
|
14
|
|
|
|
|
if (*flagp & RESTART_UTF8) |
12452
|
384
|
|
|
|
|
FAIL("panic: grok_bslash_N set RESTART_UTF8"); |
12453
|
|
|
|
|
|
goto parseit; |
12454
|
|
|
|
|
|
} |
12455
|
|
|
|
|
|
} |
12456
|
|
|
|
|
|
break; |
12457
|
|
|
|
|
|
case 'p': |
12458
|
|
|
|
|
|
case 'P': |
12459
|
|
|
|
|
|
{ |
12460
|
|
|
|
|
|
char *e; |
12461
|
|
|
|
|
|
|
12462
|
|
|
|
|
|
/* We will handle any undefined properties ourselves */ |
12463
|
4029980
|
|
|
|
|
U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF; |
12464
|
|
|
|
|
|
|
12465
|
4029980
|
|
|
|
|
if (RExC_parse >= RExC_end) |
12466
|
4029980
|
|
|
|
|
vFAIL2("Empty \\%c{}", (U8)value); |
12467
|
10
|
|
|
|
|
if (*RExC_parse == '{') { |
12468
|
4029970
|
|
|
|
|
const U8 c = (U8)value; |
12469
|
143538
|
|
|
|
|
e = strchr(RExC_parse++, '}'); |
12470
|
143538
|
|
|
|
|
if (!e) |
12471
|
153174
|
|
|
|
|
vFAIL2("Missing right brace on \\%c{}", c); |
12472
|
153174
|
|
|
|
|
while (isSPACE(UCHARAT(RExC_parse))) |
12473
|
153174
|
|
|
|
|
RExC_parse++; |
12474
|
153174
|
|
|
|
|
if (e == RExC_parse) |
12475
|
153174
|
|
|
|
|
vFAIL2("Empty \\%c{}", c); |
12476
|
50522
|
|
|
|
|
n = e - RExC_parse; |
12477
|
10
|
|
|
|
|
while (isSPACE(UCHARAT(RExC_parse + n - 1))) |
12478
|
10
|
|
|
|
|
n--; |
12479
|
|
|
|
|
|
} |
12480
|
|
|
|
|
|
else { |
12481
|
50512
|
|
|
|
|
e = RExC_parse; |
12482
|
|
|
|
|
|
n = 1; |
12483
|
|
|
|
|
|
} |
12484
|
25256
|
|
|
|
|
if (!SIZE_ONLY) { |
12485
|
|
|
|
|
|
SV* invlist; |
12486
|
|
|
|
|
|
char* name; |
12487
|
|
|
|
|
|
|
12488
|
25256
|
|
|
|
|
if (UCHARAT(RExC_parse) == '^') { |
12489
|
25256
|
|
|
|
|
RExC_parse++; |
12490
|
16
|
|
|
|
|
n--; |
12491
|
|
|
|
|
|
/* toggle. (The rhs xor gets the single bit that |
12492
|
|
|
|
|
|
* differs between P and p; the other xor inverts just |
12493
|
|
|
|
|
|
* that bit) */ |
12494
|
4
|
|
|
|
|
value ^= 'P' ^ 'p'; |
12495
|
|
|
|
|
|
|
12496
|
4
|
|
|
|
|
while (isSPACE(UCHARAT(RExC_parse))) { |
12497
|
4
|
|
|
|
|
RExC_parse++; |
12498
|
153164
|
|
|
|
|
n--; |
12499
|
|
|
|
|
|
} |
12500
|
|
|
|
|
|
} |
12501
|
|
|
|
|
|
/* Try to get the definition of the property into |
12502
|
|
|
|
|
|
* . If /i is in effect, the effective property |
12503
|
|
|
|
|
|
* will have its name be <__NAME_i>. The design is |
12504
|
|
|
|
|
|
* discussed in commit |
12505
|
|
|
|
|
|
* 2f833f5208e26b208886e51e09e2c072b5eabb46 */ |
12506
|
164
|
|
|
|
|
Newx(name, n + sizeof("_i__\n"), char); |
12507
|
|
|
|
|
|
|
12508
|
164
|
|
|
|
|
sprintf(name, "%s%.*s%s\n", |
12509
|
164
|
|
|
|
|
(FOLD) ? "__" : "", |
12510
|
|
|
|
|
|
(int)n, |
12511
|
|
|
|
|
|
RExC_parse, |
12512
|
164
|
|
|
|
|
(FOLD) ? "_i" : "" |
12513
|
|
|
|
|
|
); |
12514
|
|
|
|
|
|
|
12515
|
|
|
|
|
|
/* Look up the property name, and get its swash and |
12516
|
|
|
|
|
|
* inversion list, if the property is found */ |
12517
|
0
|
|
|
|
|
if (swash) { |
12518
|
0
|
|
|
|
|
SvREFCNT_dec_NN(swash); |
12519
|
|
|
|
|
|
} |
12520
|
0
|
|
|
|
|
swash = _core_swash_init("utf8", name, &PL_sv_undef, |
12521
|
|
|
|
|
|
1, /* binary */ |
12522
|
|
|
|
|
|
0, /* not tr/// */ |
12523
|
|
|
|
|
|
NULL, /* No inversion list */ |
12524
|
|
|
|
|
|
&swash_init_flags |
12525
|
|
|
|
|
|
); |
12526
|
0
|
|
|
|
|
if (! swash || ! (invlist = _get_swash_invlist(swash))) { |
12527
|
2519612
|
|
|
|
|
if (swash) { |
12528
|
424
|
|
|
|
|
SvREFCNT_dec_NN(swash); |
12529
|
|
|
|
|
|
swash = NULL; |
12530
|
|
|
|
|
|
} |
12531
|
|
|
|
|
|
|
12532
|
|
|
|
|
|
/* Here didn't find it. It could be a user-defined |
12533
|
|
|
|
|
|
* property that will be available at run-time. If we |
12534
|
|
|
|
|
|
* accept only compile-time properties, is an error; |
12535
|
|
|
|
|
|
* otherwise add it to the list for run-time look up */ |
12536
|
4
|
|
|
|
|
if (ret_invlist) { |
12537
|
420
|
|
|
|
|
RExC_parse = e + 1; |
12538
|
420
|
|
|
|
|
vFAIL3("Property '%.*s' is unknown", (int) n, name); |
12539
|
|
|
|
|
|
} |
12540
|
416
|
|
|
|
|
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", |
12541
|
28485191
|
|
|
|
|
(value == 'p' ? '+' : '!'), |
12542
|
|
|
|
|
|
name); |
12543
|
|
|
|
|
|
has_user_defined_property = TRUE; |
12544
|
|
|
|
|
|
|
12545
|
|
|
|
|
|
/* We don't know yet, so have to assume that the |
12546
|
|
|
|
|
|
* property could match something in the Latin1 range, |
12547
|
|
|
|
|
|
* hence something that isn't utf8. Note that this |
12548
|
|
|
|
|
|
* would cause things in to match |
12549
|
|
|
|
|
|
* inappropriately, except that any \p{}, including |
12550
|
|
|
|
|
|
* this one forces Unicode semantics, which means there |
12551
|
|
|
|
|
|
* is */ |
12552
|
878488
|
|
|
|
|
ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; |
12553
|
|
|
|
|
|
} |
12554
|
|
|
|
|
|
else { |
12555
|
|
|
|
|
|
|
12556
|
|
|
|
|
|
/* Here, did get the swash and its inversion list. If |
12557
|
|
|
|
|
|
* the swash is from a user-defined property, then this |
12558
|
|
|
|
|
|
* whole character class should be regarded as such */ |
12559
|
877984
|
|
|
|
|
has_user_defined_property = |
12560
|
|
|
|
|
|
(swash_init_flags |
12561
|
877984
|
|
|
|
|
& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY); |
12562
|
|
|
|
|
|
|
12563
|
|
|
|
|
|
/* Invert if asking for the complement */ |
12564
|
1344
|
|
|
|
|
if (value == 'P') { |
12565
|
672
|
|
|
|
|
_invlist_union_complement_2nd(properties, |
12566
|
|
|
|
|
|
invlist, |
12567
|
|
|
|
|
|
&properties); |
12568
|
|
|
|
|
|
|
12569
|
|
|
|
|
|
/* The swash can't be used as-is, because we've |
12570
|
|
|
|
|
|
* inverted things; delay removing it to here after |
12571
|
|
|
|
|
|
* have copied its invlist above */ |
12572
|
672
|
|
|
|
|
SvREFCNT_dec_NN(swash); |
12573
|
|
|
|
|
|
swash = NULL; |
12574
|
|
|
|
|
|
} |
12575
|
|
|
|
|
|
else { |
12576
|
672
|
|
|
|
|
_invlist_union(properties, invlist, &properties); |
12577
|
|
|
|
|
|
} |
12578
|
|
|
|
|
|
} |
12579
|
1344
|
|
|
|
|
Safefree(name); |
12580
|
|
|
|
|
|
} |
12581
|
19146126
|
|
|
|
|
RExC_parse = e + 1; |
12582
|
|
|
|
|
|
namedclass = ANYOF_UNIPROP; /* no official name, but it's |
12583
|
|
|
|
|
|
named */ |
12584
|
|
|
|
|
|
|
12585
|
|
|
|
|
|
/* \p means they want Unicode semantics */ |
12586
|
725880
|
|
|
|
|
RExC_uni_semantics = 1; |
12587
|
|
|
|
|
|
} |
12588
|
376
|
|
|
|
|
break; |
12589
|
282
|
|
|
|
|
case 'n': value = '\n'; break; |
12590
|
188
|
|
|
|
|
case 'r': value = '\r'; break; |
12591
|
188
|
|
|
|
|
case 't': value = '\t'; break; |
12592
|
8
|
|
|
|
|
case 'f': value = '\f'; break; |
12593
|
180
|
|
|
|
|
case 'b': value = '\b'; break; |
12594
|
180
|
|
|
|
|
case 'e': value = ASCII_TO_NATIVE('\033');break; |
12595
|
176
|
|
|
|
|
case 'a': value = '\a'; break; |
12596
|
|
|
|
|
|
case 'o': |
12597
|
176
|
|
|
|
|
RExC_parse--; /* function expects to be pointed at the 'o' */ |
12598
|
|
|
|
|
|
{ |
12599
|
|
|
|
|
|
const char* error_msg; |
12600
|
176
|
|
|
|
|
bool valid = grok_bslash_o(&RExC_parse, |
12601
|
|
|
|
|
|
&value, |
12602
|
|
|
|
|
|
&error_msg, |
12603
|
|
|
|
|
|
SIZE_ONLY, /* warnings in pass |
12604
|
|
|
|
|
|
1 only */ |
12605
|
|
|
|
|
|
strict, |
12606
|
|
|
|
|
|
silence_non_portable, |
12607
|
|
|
|
|
|
UTF); |
12608
|
364
|
|
|
|
|
if (! valid) { |
12609
|
725868
|
|
|
|
|
vFAIL(error_msg); |
12610
|
|
|
|
|
|
} |
12611
|
|
|
|
|
|
} |
12612
|
362946
|
|
|
|
|
if (PL_encoding && value < 0x100) { |
12613
|
|
|
|
|
|
goto recode_encoding; |
12614
|
|
|
|
|
|
} |
12615
|
|
|
|
|
|
break; |
12616
|
|
|
|
|
|
case 'x': |
12617
|
362946
|
|
|
|
|
RExC_parse--; /* function expects to be pointed at the 'x' */ |
12618
|
|
|
|
|
|
{ |
12619
|
|
|
|
|
|
const char* error_msg; |
12620
|
206080
|
|
|
|
|
bool valid = grok_bslash_x(&RExC_parse, |
12621
|
|
|
|
|
|
&value, |
12622
|
|
|
|
|
|
&error_msg, |
12623
|
|
|
|
|
|
TRUE, /* Output warnings */ |
12624
|
|
|
|
|
|
strict, |
12625
|
|
|
|
|
|
silence_non_portable, |
12626
|
|
|
|
|
|
UTF); |
12627
|
282
|
|
|
|
|
if (! valid) { |
12628
|
282
|
|
|
|
|
vFAIL(error_msg); |
12629
|
|
|
|
|
|
} |
12630
|
|
|
|
|
|
} |
12631
|
156866
|
|
|
|
|
if (PL_encoding && value < 0x100) |
12632
|
|
|
|
|
|
goto recode_encoding; |
12633
|
|
|
|
|
|
break; |
12634
|
|
|
|
|
|
case 'c': |
12635
|
351
|
|
|
|
|
value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY); |
12636
|
0
|
|
|
|
|
break; |
12637
|
|
|
|
|
|
case '0': case '1': case '2': case '3': case '4': |
12638
|
|
|
|
|
|
case '5': case '6': case '7': |
12639
|
|
|
|
|
|
{ |
12640
|
|
|
|
|
|
/* Take 1-3 octal digits */ |
12641
|
234
|
|
|
|
|
I32 flags = PERL_SCAN_SILENT_ILLDIGIT; |
12642
|
156632
|
|
|
|
|
numlen = (strict) ? 4 : 3; |
12643
|
156632
|
|
|
|
|
value = grok_oct(--RExC_parse, &numlen, &flags, NULL); |
12644
|
156632
|
|
|
|
|
RExC_parse += numlen; |
12645
|
127100
|
|
|
|
|
if (numlen != 3) { |
12646
|
127100
|
|
|
|
|
if (strict) { |
12647
|
39
|
|
|
|
|
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; |
12648
|
26
|
|
|
|
|
vFAIL("Need exactly 3 octal digits"); |
12649
|
|
|
|
|
|
} |
12650
|
127100
|
|
|
|
|
else if (! SIZE_ONLY /* like \08, \178 */ |
12651
|
124366
|
|
|
|
|
&& numlen < 3 |
12652
|
129171
|
|
|
|
|
&& RExC_parse < RExC_end |
12653
|
85686
|
|
|
|
|
&& isDIGIT(*RExC_parse) |
12654
|
129171
|
|
|
|
|
&& ckWARN(WARN_REGEXP)) |
12655
|
|
|
|
|
|
{ |
12656
|
228
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); |
12657
|
128829
|
|
|
|
|
reg_warn_non_literal_string( |
12658
|
|
|
|
|
|
RExC_parse + 1, |
12659
|
|
|
|
|
|
form_short_octal_warning(RExC_parse, numlen)); |
12660
|
56838
|
|
|
|
|
(void)ReREFCNT_inc(RExC_rx_sv); |
12661
|
|
|
|
|
|
} |
12662
|
|
|
|
|
|
} |
12663
|
456
|
|
|
|
|
if (PL_encoding && value < 0x100) |
12664
|
|
|
|
|
|
goto recode_encoding; |
12665
|
|
|
|
|
|
break; |
12666
|
|
|
|
|
|
} |
12667
|
|
|
|
|
|
recode_encoding: |
12668
|
37556
|
|
|
|
|
if (! RExC_override_recoding) { |
12669
|
56154
|
|
|
|
|
SV* enc = PL_encoding; |
12670
|
140
|
|
|
|
|
value = reg_recode((const char)(U8)value, &enc); |
12671
|
37416
|
|
|
|
|
if (!enc) { |
12672
|
37416
|
|
|
|
|
if (strict) { |
12673
|
37416
|
|
|
|
|
vFAIL("Invalid escape in the specified encoding"); |
12674
|
|
|
|
|
|
} |
12675
|
37416
|
|
|
|
|
else if (SIZE_ONLY) { |
12676
|
0
|
|
|
|
|
ckWARNreg(RExC_parse, |
12677
|
|
|
|
|
|
"Invalid escape in the specified encoding"); |
12678
|
|
|
|
|
|
} |
12679
|
|
|
|
|
|
} |
12680
|
|
|
|
|
|
break; |
12681
|
|
|
|
|
|
} |
12682
|
|
|
|
|
|
default: |
12683
|
|
|
|
|
|
/* Allow \_ to not give an error */ |
12684
|
0
|
|
|
|
|
if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') { |
12685
|
55944
|
|
|
|
|
if (strict) { |
12686
|
36080
|
|
|
|
|
vFAIL2("Unrecognized escape \\%c in character class", |
12687
|
|
|
|
|
|
(int)value); |
12688
|
|
|
|
|
|
} |
12689
|
|
|
|
|
|
else { |
12690
|
2734
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); |
12691
|
142
|
|
|
|
|
ckWARN2reg(RExC_parse, |
12692
|
|
|
|
|
|
"Unrecognized escape \\%c in character class passed through", |
12693
|
|
|
|
|
|
(int)value); |
12694
|
32266
|
|
|
|
|
(void)ReREFCNT_inc(RExC_rx_sv); |
12695
|
|
|
|
|
|
} |
12696
|
|
|
|
|
|
} |
12697
|
|
|
|
|
|
break; |
12698
|
|
|
|
|
|
} /* End of switch on char following backslash */ |
12699
|
|
|
|
|
|
} /* end of handling backslash escape sequences */ |
12700
|
|
|
|
|
|
#ifdef EBCDIC |
12701
|
|
|
|
|
|
else |
12702
|
|
|
|
|
|
literal_endpoint++; |
12703
|
|
|
|
|
|
#endif |
12704
|
|
|
|
|
|
|
12705
|
|
|
|
|
|
/* Here, we have the current token in 'value' */ |
12706
|
|
|
|
|
|
|
12707
|
|
|
|
|
|
/* What matches in a locale is not known until runtime. This includes |
12708
|
|
|
|
|
|
* what the Posix classes (like \w, [:space:]) match. Room must be |
12709
|
|
|
|
|
|
* reserved (one time per class) to store such classes, either if Perl |
12710
|
|
|
|
|
|
* is compiled so that locale nodes always should have this space, or |
12711
|
|
|
|
|
|
* if there is such class info to be stored. The space will contain a |
12712
|
|
|
|
|
|
* bit for each named class that is to be matched against. This isn't |
12713
|
|
|
|
|
|
* needed for \p{} and pseudo-classes, as they are not affected by |
12714
|
|
|
|
|
|
* locale, and hence are dealt with separately */ |
12715
|
36389
|
|
|
|
|
if (LOC |
12716
|
35955
|
|
|
|
|
&& ! need_class |
12717
|
236
|
|
|
|
|
&& (ANYOF_LOCALE == ANYOF_CLASS |
12718
|
236
|
|
|
|
|
|| (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX))) |
12719
|
|
|
|
|
|
{ |
12720
|
|
|
|
|
|
need_class = 1; |
12721
|
236
|
|
|
|
|
if (SIZE_ONLY) { |
12722
|
236
|
|
|
|
|
RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; |
12723
|
|
|
|
|
|
} |
12724
|
|
|
|
|
|
else { |
12725
|
0
|
|
|
|
|
RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP; |
12726
|
0
|
|
|
|
|
ANYOF_CLASS_ZERO(ret); |
12727
|
|
|
|
|
|
} |
12728
|
236
|
|
|
|
|
ANYOF_FLAGS(ret) |= ANYOF_CLASS; |
12729
|
|
|
|
|
|
} |
12730
|
|
|
|
|
|
|
12731
|
11230
|
|
|
|
|
if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ |
12732
|
|
|
|
|
|
|
12733
|
|
|
|
|
|
/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a |
12734
|
|
|
|
|
|
* literal, as is the character that began the false range, i.e. |
12735
|
|
|
|
|
|
* the 'a' in the examples */ |
12736
|
10962
|
|
|
|
|
if (range) { |
12737
|
10962
|
|
|
|
|
if (!SIZE_ONLY) { |
12738
|
5664
|
|
|
|
|
const int w = (RExC_parse >= rangebegin) |
12739
|
152
|
|
|
|
|
? RExC_parse - rangebegin |
12740
|
|
|
|
|
|
: 0; |
12741
|
152
|
|
|
|
|
if (strict) { |
12742
|
152
|
|
|
|
|
vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin); |
12743
|
|
|
|
|
|
} |
12744
|
|
|
|
|
|
else { |
12745
|
152
|
|
|
|
|
SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ |
12746
|
0
|
|
|
|
|
ckWARN4reg(RExC_parse, |
12747
|
|
|
|
|
|
"False [] range \"%*.*s\"", |
12748
|
|
|
|
|
|
w, w, rangebegin); |
12749
|
0
|
|
|
|
|
(void)ReREFCNT_inc(RExC_rx_sv); |
12750
|
152
|
|
|
|
|
cp_list = add_cp_to_invlist(cp_list, '-'); |
12751
|
362946
|
|
|
|
|
cp_list = add_cp_to_invlist(cp_list, prevvalue); |
12752
|
|
|
|
|
|
} |
12753
|
|
|
|
|
|
} |
12754
|
|
|
|
|
|
|
12755
|
|
|
|
|
|
range = 0; /* this was not a true range */ |
12756
|
18783168
|
|
|
|
|
element_count += 2; /* So counts for three values */ |
12757
|
|
|
|
|
|
} |
12758
|
|
|
|
|
|
|
12759
|
10682
|
|
|
|
|
if (! SIZE_ONLY) { |
12760
|
18783168
|
|
|
|
|
U8 classnum = namedclass_to_classnum(namedclass); |
12761
|
1942210
|
|
|
|
|
if (namedclass >= ANYOF_MAX) { /* If a special class */ |
12762
|
30
|
|
|
|
|
if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ |
12763
|
|
|
|
|
|
|
12764
|
|
|
|
|
|
/* Here, should be \h, \H, \v, or \V. Neither /d nor |
12765
|
|
|
|
|
|
* /l make a difference in what these match. There |
12766
|
|
|
|
|
|
* would be problems if these characters had folds |
12767
|
|
|
|
|
|
* other than themselves, as cp_list is subject to |
12768
|
|
|
|
|
|
* folding. */ |
12769
|
30
|
|
|
|
|
if (classnum != _CC_VERTSPACE) { |
12770
|
16840958
|
|
|
|
|
assert( namedclass == ANYOF_HORIZWS |
12771
|
|
|
|
|
|
|| namedclass == ANYOF_NHORIZWS); |
12772
|
|
|
|
|
|
|
12773
|
|
|
|
|
|
/* It turns out that \h is just a synonym for |
12774
|
|
|
|
|
|
* XPosixBlank */ |
12775
|
|
|
|
|
|
classnum = _CC_BLANK; |
12776
|
|
|
|
|
|
} |
12777
|
|
|
|
|
|
|
12778
|
16840958
|
|
|
|
|
_invlist_union_maybe_complement_2nd( |
12779
|
|
|
|
|
|
cp_list, |
12780
|
|
|
|
|
|
PL_XPosix_ptrs[classnum], |
12781
|
|
|
|
|
|
cBOOL(namedclass % 2), /* Complement if odd |
12782
|
|
|
|
|
|
(NHORIZWS, NVERTWS) |
12783
|
|
|
|
|
|
*/ |
12784
|
|
|
|
|
|
&cp_list); |
12785
|
|
|
|
|
|
} |
12786
|
|
|
|
|
|
} |
12787
|
16635728
|
|
|
|
|
else if (classnum == _CC_ASCII) { |
12788
|
|
|
|
|
|
#ifdef HAS_ISASCII |
12789
|
2008838
|
|
|
|
|
if (LOC) { |
12790
|
2008838
|
|
|
|
|
ANYOF_CLASS_SET(ret, namedclass); |
12791
|
|
|
|
|
|
} |
12792
|
|
|
|
|
|
else |
12793
|
|
|
|
|
|
#endif /* Not isascii(); just use the hard-coded definition for it */ |
12794
|
560
|
|
|
|
|
_invlist_union_maybe_complement_2nd( |
12795
|
|
|
|
|
|
posixes, |
12796
|
|
|
|
|
|
PL_ASCII, |
12797
|
|
|
|
|
|
cBOOL(namedclass % 2), /* Complement if odd |
12798
|
|
|
|
|
|
(NASCII) */ |
12799
|
|
|
|
|
|
&posixes); |
12800
|
|
|
|
|
|
} |
12801
|
|
|
|
|
|
else { /* Garden variety class */ |
12802
|
|
|
|
|
|
|
12803
|
|
|
|
|
|
/* The ascii range inversion list */ |
12804
|
560
|
|
|
|
|
SV* ascii_source = PL_Posix_ptrs[classnum]; |
12805
|
|
|
|
|
|
|
12806
|
|
|
|
|
|
/* The full Latin1 range inversion list */ |
12807
|
2008838
|
|
|
|
|
SV* l1_source = PL_L1Posix_ptrs[classnum]; |
12808
|
|
|
|
|
|
|
12809
|
|
|
|
|
|
/* This code is structured into two major clauses. The |
12810
|
|
|
|
|
|
* first is for classes whose complete definitions may not |
12811
|
|
|
|
|
|
* already be known. It not, the Latin1 definition |
12812
|
|
|
|
|
|
* (guaranteed to already known) is used plus code is |
12813
|
|
|
|
|
|
* generated to load the rest at run-time (only if needed). |
12814
|
|
|
|
|
|
* If the complete definition is known, it drops down to |
12815
|
|
|
|
|
|
* the second clause, where the complete definition is |
12816
|
|
|
|
|
|
* known */ |
12817
|
|
|
|
|
|
|
12818
|
1942950
|
|
|
|
|
if (classnum < _FIRST_NON_SWASH_CC) { |
12819
|
|
|
|
|
|
|
12820
|
|
|
|
|
|
/* Here, the class has a swash, which may or not |
12821
|
|
|
|
|
|
* already be loaded */ |
12822
|
|
|
|
|
|
|
12823
|
|
|
|
|
|
/* The name of the property to use to match the full |
12824
|
|
|
|
|
|
* eXtended Unicode range swash for this character |
12825
|
|
|
|
|
|
* class */ |
12826
|
1942950
|
|
|
|
|
const char *Xname = swash_property_names[classnum]; |
12827
|
|
|
|
|
|
|
12828
|
|
|
|
|
|
/* If returning the inversion list, we can't defer |
12829
|
|
|
|
|
|
* getting this until runtime */ |
12830
|
364
|
|
|
|
|
if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) { |
12831
|
42
|
|
|
|
|
PL_utf8_swash_ptrs[classnum] = |
12832
|
28
|
|
|
|
|
_core_swash_init("utf8", Xname, &PL_sv_undef, |
12833
|
|
|
|
|
|
1, /* binary */ |
12834
|
|
|
|
|
|
0, /* not tr/// */ |
12835
|
|
|
|
|
|
NULL, /* No inversion list */ |
12836
|
|
|
|
|
|
NULL /* No flags */ |
12837
|
|
|
|
|
|
); |
12838
|
28
|
|
|
|
|
assert(PL_utf8_swash_ptrs[classnum]); |
12839
|
|
|
|
|
|
} |
12840
|
28
|
|
|
|
|
if ( ! PL_utf8_swash_ptrs[classnum]) { |
12841
|
12
|
|
|
|
|
if (namedclass % 2 == 0) { /* A non-complemented |
12842
|
|
|
|
|
|
class */ |
12843
|
|
|
|
|
|
/* If not /a matching, there are code points we |
12844
|
|
|
|
|
|
* don't know at compile time. Arrange for the |
12845
|
|
|
|
|
|
* unknown matches to be loaded at run-time, if |
12846
|
|
|
|
|
|
* needed */ |
12847
|
16
|
|
|
|
|
if (! AT_LEAST_ASCII_RESTRICTED) { |
12848
|
352
|
|
|
|
|
Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n", |
12849
|
|
|
|
|
|
Xname); |
12850
|
|
|
|
|
|
} |
12851
|
0
|
|
|
|
|
if (LOC) { /* Under locale, set run-time |
12852
|
|
|
|
|
|
lookup */ |
12853
|
352
|
|
|
|
|
ANYOF_CLASS_SET(ret, namedclass); |
12854
|
|
|
|
|
|
} |
12855
|
|
|
|
|
|
else { |
12856
|
|
|
|
|
|
/* Add the current class's code points to |
12857
|
|
|
|
|
|
* the running total */ |
12858
|
1942938
|
|
|
|
|
_invlist_union(posixes, |
12859
|
|
|
|
|
|
(AT_LEAST_ASCII_RESTRICTED) |
12860
|
|
|
|
|
|
? ascii_source |
12861
|
|
|
|
|
|
: l1_source, |
12862
|
|
|
|
|
|
&posixes); |
12863
|
|
|
|
|
|
} |
12864
|
|
|
|
|
|
} |
12865
|
|
|
|
|
|
else { /* A complemented class */ |
12866
|
16840188
|
|
|
|
|
if (AT_LEAST_ASCII_RESTRICTED) { |
12867
|
|
|
|
|
|
/* Under /a should match everything above |
12868
|
|
|
|
|
|
* ASCII, plus the complement of the set's |
12869
|
|
|
|
|
|
* ASCII matches */ |
12870
|
1324676
|
|
|
|
|
_invlist_union_complement_2nd(posixes, |
12871
|
|
|
|
|
|
ascii_source, |
12872
|
|
|
|
|
|
&posixes); |
12873
|
|
|
|
|
|
} |
12874
|
|
|
|
|
|
else { |
12875
|
|
|
|
|
|
/* Arrange for the unknown matches to be |
12876
|
|
|
|
|
|
* loaded at run-time, if needed */ |
12877
|
16840188
|
|
|
|
|
Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n", |
12878
|
|
|
|
|
|
Xname); |
12879
|
|
|
|
|
|
runtime_posix_matches_above_Unicode = TRUE; |
12880
|
3145228
|
|
|
|
|
if (LOC) { |
12881
|
2885488
|
|
|
|
|
ANYOF_CLASS_SET(ret, namedclass); |
12882
|
|
|
|
|
|
} |
12883
|
|
|
|
|
|
else { |
12884
|
|
|
|
|
|
|
12885
|
|
|
|
|
|
/* We want to match everything in |
12886
|
|
|
|
|
|
* Latin1, except those things that |
12887
|
|
|
|
|
|
* l1_source matches */ |
12888
|
1684768
|
|
|
|
|
SV* scratch_list = NULL; |
12889
|
887984
|
|
|
|
|
_invlist_subtract(PL_Latin1, l1_source, |
12890
|
|
|
|
|
|
&scratch_list); |
12891
|
|
|
|
|
|
|
12892
|
|
|
|
|
|
/* Add the list from this class to the |
12893
|
|
|
|
|
|
* running total */ |
12894
|
726704
|
|
|
|
|
if (! posixes) { |
12895
|
523064
|
|
|
|
|
posixes = scratch_list; |
12896
|
|
|
|
|
|
} |
12897
|
|
|
|
|
|
else { |
12898
|
523064
|
|
|
|
|
_invlist_union(posixes, |
12899
|
|
|
|
|
|
scratch_list, |
12900
|
|
|
|
|
|
&posixes); |
12901
|
523064
|
|
|
|
|
SvREFCNT_dec_NN(scratch_list); |
12902
|
|
|
|
|
|
} |
12903
|
523064
|
|
|
|
|
if (DEPENDS_SEMANTICS) { |
12904
|
|
|
|
|
|
ANYOF_FLAGS(ret) |
12905
|
523064
|
|
|
|
|
|= ANYOF_NON_UTF8_LATIN1_ALL; |
12906
|
|
|
|
|
|
} |
12907
|
|
|
|
|
|
} |
12908
|
|
|
|
|
|
} |
12909
|
|
|
|
|
|
} |
12910
|
|
|
|
|
|
goto namedclass_done; |
12911
|
|
|
|
|
|
} |
12912
|
|
|
|
|
|
|
12913
|
|
|
|
|
|
/* Here, there is a swash loaded for the class. If no |
12914
|
|
|
|
|
|
* inversion list for it yet, get it */ |
12915
|
523064
|
|
|
|
|
if (! PL_XPosix_ptrs[classnum]) { |
12916
|
|
|
|
|
|
PL_XPosix_ptrs[classnum] |
12917
|
0
|
|
|
|
|
= _swash_to_invlist(PL_utf8_swash_ptrs[classnum]); |
12918
|
|
|
|
|
|
} |
12919
|
|
|
|
|
|
} |
12920
|
|
|
|
|
|
|
12921
|
|
|
|
|
|
/* Here there is an inversion list already loaded for the |
12922
|
|
|
|
|
|
* entire class */ |
12923
|
|
|
|
|
|
|
12924
|
0
|
|
|
|
|
if (namedclass % 2 == 0) { /* A non-complemented class, |
12925
|
|
|
|
|
|
like ANYOF_PUNCT */ |
12926
|
523064
|
|
|
|
|
if (! LOC) { |
12927
|
|
|
|
|
|
/* For non-locale, just add it to any existing list |
12928
|
|
|
|
|
|
* */ |
12929
|
523064
|
|
|
|
|
_invlist_union(posixes, |
12930
|
|
|
|
|
|
(AT_LEAST_ASCII_RESTRICTED) |
12931
|
|
|
|
|
|
? ascii_source |
12932
|
|
|
|
|
|
: PL_XPosix_ptrs[classnum], |
12933
|
|
|
|
|
|
&posixes); |
12934
|
|
|
|
|
|
} |
12935
|
|
|
|
|
|
else { /* Locale */ |
12936
|
523064
|
|
|
|
|
SV* scratch_list = NULL; |
12937
|
|
|
|
|
|
|
12938
|
|
|
|
|
|
/* For above Latin1 code points, we use the full |
12939
|
|
|
|
|
|
* Unicode range */ |
12940
|
726704
|
|
|
|
|
_invlist_intersection(PL_AboveLatin1, |
12941
|
|
|
|
|
|
PL_XPosix_ptrs[classnum], |
12942
|
|
|
|
|
|
&scratch_list); |
12943
|
|
|
|
|
|
/* And set the output to it, adding instead if |
12944
|
|
|
|
|
|
* there already is an output. Checking if |
12945
|
|
|
|
|
|
* 'posixes' is NULL first saves an extra clone. |
12946
|
|
|
|
|
|
* Its reference count will be decremented at the |
12947
|
|
|
|
|
|
* next union, etc, or if this is the only |
12948
|
|
|
|
|
|
* instance, at the end of the routine */ |
12949
|
726704
|
|
|
|
|
if (! posixes) { |
12950
|
726704
|
|
|
|
|
posixes = scratch_list; |
12951
|
|
|
|
|
|
} |
12952
|
|
|
|
|
|
else { |
12953
|
16113484
|
|
|
|
|
_invlist_union(posixes, scratch_list, &posixes); |
12954
|
7938490
|
|
|
|
|
SvREFCNT_dec_NN(scratch_list); |
12955
|
|
|
|
|
|
} |
12956
|
|
|
|
|
|
|
12957
|
|
|
|
|
|
#ifndef HAS_ISBLANK |
12958
|
|
|
|
|
|
if (namedclass != ANYOF_BLANK) { |
12959
|
|
|
|
|
|
#endif |
12960
|
|
|
|
|
|
/* Set this class in the node for runtime |
12961
|
|
|
|
|
|
* matching */ |
12962
|
8246214
|
|
|
|
|
ANYOF_CLASS_SET(ret, namedclass); |
12963
|
|
|
|
|
|
#ifndef HAS_ISBLANK |
12964
|
|
|
|
|
|
} |
12965
|
|
|
|
|
|
else { |
12966
|
|
|
|
|
|
/* No isblank(), use the hard-coded ASCII-range |
12967
|
|
|
|
|
|
* blanks, adding them to the running total. */ |
12968
|
|
|
|
|
|
|
12969
|
|
|
|
|
|
_invlist_union(posixes, ascii_source, &posixes); |
12970
|
|
|
|
|
|
} |
12971
|
|
|
|
|
|
#endif |
12972
|
|
|
|
|
|
} |
12973
|
|
|
|
|
|
} |
12974
|
|
|
|
|
|
else { /* A complemented class, like ANYOF_NPUNCT */ |
12975
|
523064
|
|
|
|
|
if (! LOC) { |
12976
|
523064
|
|
|
|
|
_invlist_union_complement_2nd( |
12977
|
|
|
|
|
|
posixes, |
12978
|
|
|
|
|
|
(AT_LEAST_ASCII_RESTRICTED) |
12979
|
|
|
|
|
|
? ascii_source |
12980
|
|
|
|
|
|
: PL_XPosix_ptrs[classnum], |
12981
|
|
|
|
|
|
&posixes); |
12982
|
|
|
|
|
|
/* Under /d, everything in the upper half of the |
12983
|
|
|
|
|
|
* Latin1 range matches this complement */ |
12984
|
523064
|
|
|
|
|
if (DEPENDS_SEMANTICS) { |
12985
|
1615508
|
|
|
|
|
ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; |
12986
|
|
|
|
|
|
} |
12987
|
|
|
|
|
|
} |
12988
|
|
|
|
|
|
else { /* Locale */ |
12989
|
1092444
|
|
|
|
|
SV* scratch_list = NULL; |
12990
|
523064
|
|
|
|
|
_invlist_subtract(PL_AboveLatin1, |
12991
|
|
|
|
|
|
PL_XPosix_ptrs[classnum], |
12992
|
|
|
|
|
|
&scratch_list); |
12993
|
1307660
|
|
|
|
|
if (! posixes) { |
12994
|
523064
|
|
|
|
|
posixes = scratch_list; |
12995
|
|
|
|
|
|
} |
12996
|
|
|
|
|
|
else { |
12997
|
0
|
|
|
|
|
_invlist_union(posixes, scratch_list, &posixes); |
12998
|
523064
|
|
|
|
|
SvREFCNT_dec_NN(scratch_list); |
12999
|
|
|
|
|
|
} |
13000
|
|
|
|
|
|
#ifndef HAS_ISBLANK |
13001
|
|
|
|
|
|
if (namedclass != ANYOF_NBLANK) { |
13002
|
|
|
|
|
|
#endif |
13003
|
523064
|
|
|
|
|
ANYOF_CLASS_SET(ret, namedclass); |
13004
|
|
|
|
|
|
#ifndef HAS_ISBLANK |
13005
|
|
|
|
|
|
} |
13006
|
|
|
|
|
|
else { |
13007
|
|
|
|
|
|
/* Get the list of all code points in Latin1 |
13008
|
|
|
|
|
|
* that are not ASCII blanks, and add them to |
13009
|
|
|
|
|
|
* the running total */ |
13010
|
|
|
|
|
|
_invlist_subtract(PL_Latin1, ascii_source, |
13011
|
|
|
|
|
|
&scratch_list); |
13012
|
|
|
|
|
|
_invlist_union(posixes, scratch_list, &posixes); |
13013
|
|
|
|
|
|
SvREFCNT_dec_NN(scratch_list); |
13014
|
|
|
|
|
|
} |
13015
|
|
|
|
|
|
#endif |
13016
|
|
|
|
|
|
} |
13017
|
|
|
|
|
|
} |
13018
|
|
|
|
|
|
} |
13019
|
|
|
|
|
|
namedclass_done: |
13020
|
261722
|
|
|
|
|
continue; /* Go get next character */ |
13021
|
|
|
|
|
|
} |
13022
|
|
|
|
|
|
} /* end of namedclass \blah */ |
13023
|
|
|
|
|
|
|
13024
|
|
|
|
|
|
/* Here, we have a single value. If 'range' is set, it is the ending |
13025
|
|
|
|
|
|
* of a range--check its validity. Later, we will handle each |
13026
|
|
|
|
|
|
* individual code point in the range. If 'range' isn't set, this |
13027
|
|
|
|
|
|
* could be the beginning of a range, so check for that by looking |
13028
|
|
|
|
|
|
* ahead to see if the next real character to be processed is the range |
13029
|
|
|
|
|
|
* indicator--the minus sign */ |
13030
|
|
|
|
|
|
|
13031
|
261762
|
|
|
|
|
if (skip_white) { |
13032
|
261722
|
|
|
|
|
RExC_parse = regpatws(pRExC_state, RExC_parse, |
13033
|
|
|
|
|
|
FALSE /* means don't recognize comments */); |
13034
|
|
|
|
|
|
} |
13035
|
|
|
|
|
|
|
13036
|
523104
|
|
|
|
|
if (range) { |
13037
|
523064
|
|
|
|
|
if (prevvalue > value) /* b-a */ { |
13038
|
523064
|
|
|
|
|
const int w = RExC_parse - rangebegin; |
13039
|
523064
|
|
|
|
|
Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); |
13040
|
|
|
|
|
|
range = 0; /* not a valid range */ |
13041
|
|
|
|
|
|
} |
13042
|
|
|
|
|
|
} |
13043
|
|
|
|
|
|
else { |
13044
|
523104
|
|
|
|
|
prevvalue = value; /* save the beginning of the potential range */ |
13045
|
523104
|
|
|
|
|
if (! stop_at_1 /* Can't be a range if parsing just one thing */ |
13046
|
523104
|
|
|
|
|
&& *RExC_parse == '-') |
13047
|
|
|
|
|
|
{ |
13048
|
523064
|
|
|
|
|
char* next_char_ptr = RExC_parse + 1; |
13049
|
523064
|
|
|
|
|
if (skip_white) { /* Get the next real char after the '-' */ |
13050
|
523064
|
|
|
|
|
next_char_ptr = regpatws(pRExC_state, |
13051
|
523064
|
|
|
|
|
RExC_parse + 1, |
13052
|
|
|
|
|
|
FALSE); /* means don't recognize |
13053
|
|
|
|
|
|
comments */ |
13054
|
|
|
|
|
|
} |
13055
|
|
|
|
|
|
|
13056
|
|
|
|
|
|
/* If the '-' is at the end of the class (just before the ']', |
13057
|
|
|
|
|
|
* it is a literal minus; otherwise it is a range */ |
13058
|
523064
|
|
|
|
|
if (next_char_ptr < RExC_end && *next_char_ptr != ']') { |
13059
|
7723150
|
|
|
|
|
RExC_parse = next_char_ptr; |
13060
|
|
|
|
|
|
|
13061
|
|
|
|
|
|
/* a bad range like \w-, [:word:]- ? */ |
13062
|
3307274
|
|
|
|
|
if (namedclass > OOB_NAMEDCLASS) { |
13063
|
482500
|
|
|
|
|
if (strict || ckWARN(WARN_REGEXP)) { |
13064
|
472
|
|
|
|
|
const int w = |
13065
|
630
|
|
|
|
|
RExC_parse >= rangebegin ? |
13066
|
420
|
|
|
|
|
RExC_parse - rangebegin : 0; |
13067
|
2124
|
|
|
|
|
if (strict) { |
13068
|
0
|
|
|
|
|
vFAIL4("False [] range \"%*.*s\"", |
13069
|
|
|
|
|
|
w, w, rangebegin); |
13070
|
|
|
|
|
|
} |
13071
|
|
|
|
|
|
else { |
13072
|
0
|
|
|
|
|
vWARN4(RExC_parse, |
13073
|
|
|
|
|
|
"False [] range \"%*.*s\"", |
13074
|
|
|
|
|
|
w, w, rangebegin); |
13075
|
|
|
|
|
|
} |
13076
|
|
|
|
|
|
} |
13077
|
0
|
|
|
|
|
if (!SIZE_ONLY) { |
13078
|
142264
|
|
|
|
|
cp_list = add_cp_to_invlist(cp_list, '-'); |
13079
|
|
|
|
|
|
} |
13080
|
71132
|
|
|
|
|
element_count++; |
13081
|
|
|
|
|
|
} else |
13082
|
|
|
|
|
|
range = 1; /* yeah, it's a range! */ |
13083
|
72048
|
|
|
|
|
continue; /* but do it the next time */ |
13084
|
|
|
|
|
|
} |
13085
|
|
|
|
|
|
} |
13086
|
|
|
|
|
|
} |
13087
|
|
|
|
|
|
|
13088
|
|
|
|
|
|
/* Here, is the beginning of the range, if any; or |
13089
|
|
|
|
|
|
* if not */ |
13090
|
|
|
|
|
|
|
13091
|
|
|
|
|
|
/* non-Latin1 code point implies unicode semantics. Must be set in |
13092
|
|
|
|
|
|
* pass1 so is there for the whole of pass 2 */ |
13093
|
50496
|
|
|
|
|
if (value > 255) { |
13094
|
50456
|
|
|
|
|
RExC_uni_semantics = 1; |
13095
|
|
|
|
|
|
} |
13096
|
|
|
|
|
|
|
13097
|
|
|
|
|
|
/* Ready to process either the single value, or the completed range. |
13098
|
|
|
|
|
|
* For single-valued non-inverted ranges, we consider the possibility |
13099
|
|
|
|
|
|
* of multi-char folds. (We made a conscious decision to not do this |
13100
|
|
|
|
|
|
* for the other cases because it can often lead to non-intuitive |
13101
|
|
|
|
|
|
* results. For example, you have the peculiar case that: |
13102
|
|
|
|
|
|
* "s s" =~ /^[^\xDF]+$/i => Y |
13103
|
|
|
|
|
|
* "ss" =~ /^[^\xDF]+$/i => N |
13104
|
|
|
|
|
|
* |
13105
|
|
|
|
|
|
* See [perl #89750] */ |
13106
|
72088
|
|
|
|
|
if (FOLD && allow_multi_folds && value == prevvalue) { |
13107
|
72048
|
|
|
|
|
if (value == LATIN_SMALL_LETTER_SHARP_S |
13108
|
2824774
|
|
|
|
|
|| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, |
13109
|
|
|
|
|
|
value))) |
13110
|
|
|
|
|
|
{ |
13111
|
|
|
|
|
|
/* Here is indeed a multi-char fold. Get what it is */ |
13112
|
|
|
|
|
|
|
13113
|
|
|
|
|
|
U8 foldbuf[UTF8_MAXBYTES_CASE]; |
13114
|
|
|
|
|
|
STRLEN foldlen; |
13115
|
|
|
|
|
|
|
13116
|
1921808
|
|
|
|
|
UV folded = _to_uni_fold_flags( |
13117
|
|
|
|
|
|
value, |
13118
|
|
|
|
|
|
foldbuf, |
13119
|
|
|
|
|
|
&foldlen, |
13120
|
|
|
|
|
|
FOLD_FLAGS_FULL |
13121
|
|
|
|
|
|
| ((LOC) ? FOLD_FLAGS_LOCALE |
13122
|
|
|
|
|
|
: (ASCII_FOLD_RESTRICTED) |
13123
|
|
|
|
|
|
? FOLD_FLAGS_NOMIX_ASCII |
13124
|
|
|
|
|
|
: 0) |
13125
|
|
|
|
|
|
); |
13126
|
|
|
|
|
|
|
13127
|
|
|
|
|
|
/* Here, should be the first character of the |
13128
|
|
|
|
|
|
* multi-char fold of , with containing the |
13129
|
|
|
|
|
|
* whole thing. But, if this fold is not allowed (because of |
13130
|
|
|
|
|
|
* the flags), will be the same as , and should |
13131
|
|
|
|
|
|
* be processed like any other character, so skip the special |
13132
|
|
|
|
|
|
* handling */ |
13133
|
1613808
|
|
|
|
|
if (folded != value) { |
13134
|
|
|
|
|
|
|
13135
|
|
|
|
|
|
/* Skip if we are recursed, currently parsing the class |
13136
|
|
|
|
|
|
* again. Otherwise add this character to the list of |
13137
|
|
|
|
|
|
* multi-char folds. */ |
13138
|
20436
|
|
|
|
|
if (! RExC_in_multi_char_class) { |
13139
|
|
|
|
|
|
AV** this_array_ptr; |
13140
|
|
|
|
|
|
AV* this_array; |
13141
|
20436
|
|
|
|
|
STRLEN cp_count = utf8_length(foldbuf, |
13142
|
|
|
|
|
|
foldbuf + foldlen); |
13143
|
840176
|
|
|
|
|
SV* multi_fold = sv_2mortal(newSVpvn("", 0)); |
13144
|
|
|
|
|
|
|
13145
|
1335729
|
|
|
|
|
Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); |
13146
|
|
|
|
|
|
|
13147
|
|
|
|
|
|
|
13148
|
902966
|
|
|
|
|
if (! multi_char_matches) { |
13149
|
307656
|
|
|
|
|
multi_char_matches = newAV(); |
13150
|
|
|
|
|
|
} |
13151
|
|
|
|
|
|
|
13152
|
|
|
|
|
|
/* is actually an array of arrays. |
13153
|
|
|
|
|
|
* There will be one or two top-level elements: [2], |
13154
|
|
|
|
|
|
* and/or [3]. The [2] element is an array, each |
13155
|
|
|
|
|
|
* element thereof is a character which folds to TWO |
13156
|
|
|
|
|
|
* characters; [3] is for folds to THREE characters. |
13157
|
|
|
|
|
|
* (Unicode guarantees a maximum of 3 characters in any |
13158
|
|
|
|
|
|
* fold.) When we rewrite the character class below, |
13159
|
|
|
|
|
|
* we will do so such that the longest folds are |
13160
|
|
|
|
|
|
* written first, so that it prefers the longest |
13161
|
|
|
|
|
|
* matching strings first. This is done even if it |
13162
|
|
|
|
|
|
* turns out that any quantifier is non-greedy, out of |
13163
|
|
|
|
|
|
* programmer laziness. Tom Christiansen has agreed |
13164
|
|
|
|
|
|
* that this is ok. This makes the test for the |
13165
|
|
|
|
|
|
* ligature 'ffi' come before the test for 'ff' */ |
13166
|
3307274
|
|
|
|
|
if (av_exists(multi_char_matches, cp_count)) { |
13167
|
1013840
|
|
|
|
|
this_array_ptr = (AV**) av_fetch(multi_char_matches, |
13168
|
|
|
|
|
|
cp_count, FALSE); |
13169
|
1013840
|
|
|
|
|
this_array = *this_array_ptr; |
13170
|
|
|
|
|
|
} |
13171
|
|
|
|
|
|
else { |
13172
|
1013840
|
|
|
|
|
this_array = newAV(); |
13173
|
757749
|
|
|
|
|
av_store(multi_char_matches, cp_count, |
13174
|
|
|
|
|
|
(SV*) this_array); |
13175
|
|
|
|
|
|
} |
13176
|
455842
|
|
|
|
|
av_push(this_array, multi_fold); |
13177
|
|
|
|
|
|
} |
13178
|
|
|
|
|
|
|
13179
|
|
|
|
|
|
/* This element should not be processed further in this |
13180
|
|
|
|
|
|
* class */ |
13181
|
507114
|
|
|
|
|
element_count--; |
13182
|
507114
|
|
|
|
|
value = save_value; |
13183
|
|
|
|
|
|
prevvalue = save_prevvalue; |
13184
|
175116
|
|
|
|
|
continue; |
13185
|
|
|
|
|
|
} |
13186
|
|
|
|
|
|
} |
13187
|
|
|
|
|
|
} |
13188
|
|
|
|
|
|
|
13189
|
|
|
|
|
|
/* Deal with this element of the class */ |
13190
|
25336
|
|
|
|
|
if (! SIZE_ONLY) { |
13191
|
|
|
|
|
|
#ifndef EBCDIC |
13192
|
1013860
|
|
|
|
|
cp_list = _add_range_to_invlist(cp_list, prevvalue, value); |
13193
|
|
|
|
|
|
#else |
13194
|
|
|
|
|
|
SV* this_range = _new_invlist(1); |
13195
|
|
|
|
|
|
_append_range_to_invlist(this_range, prevvalue, value); |
13196
|
|
|
|
|
|
|
13197
|
|
|
|
|
|
/* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous. |
13198
|
|
|
|
|
|
* If this range was specified using something like 'i-j', we want |
13199
|
|
|
|
|
|
* to include only the 'i' and the 'j', and not anything in |
13200
|
|
|
|
|
|
* between, so exclude non-ASCII, non-alphabetics from it. |
13201
|
|
|
|
|
|
* However, if the range was specified with something like |
13202
|
|
|
|
|
|
* [\x89-\x91] or [\x89-j], all code points within it should be |
13203
|
|
|
|
|
|
* included. literal_endpoint==2 means both ends of the range used |
13204
|
|
|
|
|
|
* a literal character, not \x{foo} */ |
13205
|
|
|
|
|
|
if (literal_endpoint == 2 |
13206
|
|
|
|
|
|
&& ((prevvalue >= 'a' && value <= 'z') |
13207
|
|
|
|
|
|
|| (prevvalue >= 'A' && value <= 'Z'))) |
13208
|
|
|
|
|
|
{ |
13209
|
|
|
|
|
|
_invlist_intersection(this_range, PL_ASCII, |
13210
|
|
|
|
|
|
&this_range); |
13211
|
|
|
|
|
|
_invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA], |
13212
|
|
|
|
|
|
&this_range); |
13213
|
|
|
|
|
|
} |
13214
|
|
|
|
|
|
_invlist_union(cp_list, this_range, &cp_list); |
13215
|
|
|
|
|
|
literal_endpoint = 0; |
13216
|
|
|
|
|
|
#endif |
13217
|
|
|
|
|
|
} |
13218
|
|
|
|
|
|
|
13219
|
|
|
|
|
|
range = 0; /* this range (if it was one) is done now */ |
13220
|
|
|
|
|
|
} /* End of loop through all the text within the brackets */ |
13221
|
|
|
|
|
|
|
13222
|
|
|
|
|
|
/* If anything in the class expands to more than one character, we have to |
13223
|
|
|
|
|
|
* deal with them by building up a substitute parse string, and recursively |
13224
|
|
|
|
|
|
* calling reg() on it, instead of proceeding */ |
13225
|
1013876
|
|
|
|
|
if (multi_char_matches) { |
13226
|
350232
|
|
|
|
|
SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); |
13227
|
|
|
|
|
|
I32 cp_count; |
13228
|
|
|
|
|
|
STRLEN len; |
13229
|
175116
|
|
|
|
|
char *save_end = RExC_end; |
13230
|
350232
|
|
|
|
|
char *save_parse = RExC_parse; |
13231
|
|
|
|
|
|
bool first_time = TRUE; /* First multi-char occurrence doesn't get |
13232
|
|
|
|
|
|
a "|" */ |
13233
|
|
|
|
|
|
I32 reg_flags; |
13234
|
|
|
|
|
|
|
13235
|
663608
|
|
|
|
|
assert(! invert); |
13236
|
|
|
|
|
|
#if 0 /* Have decided not to deal with multi-char folds in inverted classes, |
13237
|
|
|
|
|
|
because too confusing */ |
13238
|
|
|
|
|
|
if (invert) { |
13239
|
|
|
|
|
|
sv_catpv(substitute_parse, "(?:"); |
13240
|
|
|
|
|
|
} |
13241
|
|
|
|
|
|
#endif |
13242
|
|
|
|
|
|
|
13243
|
|
|
|
|
|
/* Look at the longest folds first */ |
13244
|
643172
|
|
|
|
|
for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { |
13245
|
|
|
|
|
|
|
13246
|
1013840
|
|
|
|
|
if (av_exists(multi_char_matches, cp_count)) { |
13247
|
|
|
|
|
|
AV** this_array_ptr; |
13248
|
|
|
|
|
|
SV* this_sequence; |
13249
|
|
|
|
|
|
|
13250
|
1013840
|
|
|
|
|
this_array_ptr = (AV**) av_fetch(multi_char_matches, |
13251
|
|
|
|
|
|
cp_count, FALSE); |
13252
|
1013840
|
|
|
|
|
while ((this_sequence = av_pop(*this_array_ptr)) != |
13253
|
|
|
|
|
|
&PL_sv_undef) |
13254
|
|
|
|
|
|
{ |
13255
|
1013840
|
|
|
|
|
if (! first_time) { |
13256
|
6709310
|
|
|
|
|
sv_catpv(substitute_parse, "|"); |
13257
|
|
|
|
|
|
} |
13258
|
|
|
|
|
|
first_time = FALSE; |
13259
|
|
|
|
|
|
|
13260
|
3355953
|
|
|
|
|
sv_catpv(substitute_parse, SvPVX(this_sequence)); |
13261
|
|
|
|
|
|
} |
13262
|
|
|
|
|
|
} |
13263
|
|
|
|
|
|
} |
13264
|
|
|
|
|
|
|
13265
|
|
|
|
|
|
/* If the character class contains anything else besides these |
13266
|
|
|
|
|
|
* multi-character folds, have to include it in recursive parsing */ |
13267
|
931452
|
|
|
|
|
if (element_count) { |
13268
|
931452
|
|
|
|
|
sv_catpv(substitute_parse, "|["); |
13269
|
594446
|
|
|
|
|
sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); |
13270
|
337006
|
|
|
|
|
sv_catpv(substitute_parse, "]"); |
13271
|
|
|
|
|
|
} |
13272
|
|
|
|
|
|
|
13273
|
18
|
|
|
|
|
sv_catpv(substitute_parse, ")"); |
13274
|
|
|
|
|
|
#if 0 |
13275
|
|
|
|
|
|
if (invert) { |
13276
|
|
|
|
|
|
/* This is a way to get the parse to skip forward a whole named |
13277
|
|
|
|
|
|
* sequence instead of matching the 2nd character when it fails the |
13278
|
|
|
|
|
|
* first */ |
13279
|
|
|
|
|
|
sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); |
13280
|
|
|
|
|
|
} |
13281
|
|
|
|
|
|
#endif |
13282
|
|
|
|
|
|
|
13283
|
18
|
|
|
|
|
RExC_parse = SvPV(substitute_parse, len); |
13284
|
18
|
|
|
|
|
RExC_end = RExC_parse + len; |
13285
|
337006
|
|
|
|
|
RExC_in_multi_char_class = 1; |
13286
|
36
|
|
|
|
|
RExC_emit = (regnode *)orig_emit; |
13287
|
|
|
|
|
|
|
13288
|
0
|
|
|
|
|
ret = reg(pRExC_state, 1, ®_flags, depth+1); |
13289
|
|
|
|
|
|
|
13290
|
24
|
|
|
|
|
*flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8); |
13291
|
|
|
|
|
|
|
13292
|
18
|
|
|
|
|
RExC_parse = save_parse; |
13293
|
24
|
|
|
|
|
RExC_end = save_end; |
13294
|
24
|
|
|
|
|
RExC_in_multi_char_class = 0; |
13295
|
337006
|
|
|
|
|
SvREFCNT_dec_NN(multi_char_matches); |
13296
|
931452
|
|
|
|
|
return ret; |
13297
|
|
|
|
|
|
} |
13298
|
|
|
|
|
|
|
13299
|
|
|
|
|
|
/* If the character class contains only a single element, it may be |
13300
|
|
|
|
|
|
* optimizable into another node type which is smaller and runs faster. |
13301
|
|
|
|
|
|
* Check if this is the case for this class */ |
13302
|
1785706
|
|
|
|
|
if (element_count == 1 && ! ret_invlist) { |
13303
|
|
|
|
|
|
U8 op = END; |
13304
|
|
|
|
|
|
U8 arg = 0; |
13305
|
|
|
|
|
|
|
13306
|
1280279
|
|
|
|
|
if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or |
13307
|
|
|
|
|
|
[:digit:] or \p{foo} */ |
13308
|
|
|
|
|
|
|
13309
|
|
|
|
|
|
/* All named classes are mapped into POSIXish nodes, with its FLAG |
13310
|
|
|
|
|
|
* argument giving which class it is */ |
13311
|
93600
|
|
|
|
|
switch ((I32)namedclass) { |
13312
|
|
|
|
|
|
case ANYOF_UNIPROP: |
13313
|
|
|
|
|
|
break; |
13314
|
|
|
|
|
|
|
13315
|
|
|
|
|
|
/* These don't depend on the charset modifiers. They always |
13316
|
|
|
|
|
|
* match under /u rules */ |
13317
|
|
|
|
|
|
case ANYOF_NHORIZWS: |
13318
|
|
|
|
|
|
case ANYOF_HORIZWS: |
13319
|
1967138
|
|
|
|
|
namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS; |
13320
|
|
|
|
|
|
/* FALLTHROUGH */ |
13321
|
|
|
|
|
|
|
13322
|
|
|
|
|
|
case ANYOF_NVERTWS: |
13323
|
|
|
|
|
|
case ANYOF_VERTWS: |
13324
|
|
|
|
|
|
op = POSIXU; |
13325
|
|
|
|
|
|
goto join_posix; |
13326
|
|
|
|
|
|
|
13327
|
|
|
|
|
|
/* The actual POSIXish node for all the rest depends on the |
13328
|
|
|
|
|
|
* charset modifier. The ones in the first set depend only on |
13329
|
|
|
|
|
|
* ASCII or, if available on this platform, locale */ |
13330
|
|
|
|
|
|
case ANYOF_ASCII: |
13331
|
|
|
|
|
|
case ANYOF_NASCII: |
13332
|
|
|
|
|
|
#ifdef HAS_ISASCII |
13333
|
1112920
|
|
|
|
|
op = (LOC) ? POSIXL : POSIXA; |
13334
|
|
|
|
|
|
#else |
13335
|
|
|
|
|
|
op = POSIXA; |
13336
|
|
|
|
|
|
#endif |
13337
|
785108
|
|
|
|
|
goto join_posix; |
13338
|
|
|
|
|
|
|
13339
|
|
|
|
|
|
case ANYOF_NCASED: |
13340
|
|
|
|
|
|
case ANYOF_LOWER: |
13341
|
|
|
|
|
|
case ANYOF_NLOWER: |
13342
|
|
|
|
|
|
case ANYOF_UPPER: |
13343
|
|
|
|
|
|
case ANYOF_NUPPER: |
13344
|
|
|
|
|
|
/* under /a could be alpha */ |
13345
|
879713
|
|
|
|
|
if (FOLD) { |
13346
|
745616
|
|
|
|
|
if (ASCII_RESTRICTED) { |
13347
|
745616
|
|
|
|
|
namedclass = ANYOF_ALPHA + (namedclass % 2); |
13348
|
|
|
|
|
|
} |
13349
|
39488
|
|
|
|
|
else if (! LOC) { |
13350
|
|
|
|
|
|
break; |
13351
|
|
|
|
|
|
} |
13352
|
|
|
|
|
|
} |
13353
|
|
|
|
|
|
/* FALLTHROUGH */ |
13354
|
|
|
|
|
|
|
13355
|
|
|
|
|
|
/* The rest have more possibilities depending on the charset. |
13356
|
|
|
|
|
|
* We take advantage of the enum ordering of the charset |
13357
|
|
|
|
|
|
* modifiers to get the exact node type, */ |
13358
|
|
|
|
|
|
default: |
13359
|
39488
|
|
|
|
|
op = POSIXD + get_regex_charset(RExC_flags); |
13360
|
785108
|
|
|
|
|
if (op > POSIXA) { /* /aa is same as /a */ |
13361
|
|
|
|
|
|
op = POSIXA; |
13362
|
|
|
|
|
|
} |
13363
|
|
|
|
|
|
#ifndef HAS_ISBLANK |
13364
|
|
|
|
|
|
if (op == POSIXL |
13365
|
|
|
|
|
|
&& (namedclass == ANYOF_BLANK |
13366
|
|
|
|
|
|
|| namedclass == ANYOF_NBLANK)) |
13367
|
|
|
|
|
|
{ |
13368
|
|
|
|
|
|
op = POSIXA; |
13369
|
|
|
|
|
|
} |
13370
|
|
|
|
|
|
#endif |
13371
|
|
|
|
|
|
|
13372
|
|
|
|
|
|
join_posix: |
13373
|
|
|
|
|
|
/* The odd numbered ones are the complements of the |
13374
|
|
|
|
|
|
* next-lower even number one */ |
13375
|
677123
|
|
|
|
|
if (namedclass % 2 == 1) { |
13376
|
462730
|
|
|
|
|
invert = ! invert; |
13377
|
57268
|
|
|
|
|
namedclass--; |
13378
|
|
|
|
|
|
} |
13379
|
57268
|
|
|
|
|
arg = namedclass_to_classnum(namedclass); |
13380
|
57268
|
|
|
|
|
break; |
13381
|
|
|
|
|
|
} |
13382
|
|
|
|
|
|
} |
13383
|
40240
|
|
|
|
|
else if (value == prevvalue) { |
13384
|
|
|
|
|
|
|
13385
|
|
|
|
|
|
/* Here, the class consists of just a single code point */ |
13386
|
|
|
|
|
|
|
13387
|
40240
|
|
|
|
|
if (invert) { |
13388
|
34566
|
|
|
|
|
if (! LOC && value == '\n') { |
13389
|
|
|
|
|
|
op = REG_ANY; /* Optimize [^\n] */ |
13390
|
34566
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
13391
|
34566
|
|
|
|
|
RExC_naughty++; |
13392
|
|
|
|
|
|
} |
13393
|
|
|
|
|
|
} |
13394
|
77810
|
|
|
|
|
else if (value < 256 || UTF) { |
13395
|
|
|
|
|
|
|
13396
|
|
|
|
|
|
/* Optimize a single value into an EXACTish node, but not if it |
13397
|
|
|
|
|
|
* would require converting the pattern to UTF-8. */ |
13398
|
77810
|
|
|
|
|
op = compute_EXACTish(pRExC_state); |
13399
|
|
|
|
|
|
} |
13400
|
|
|
|
|
|
} /* Otherwise is a range */ |
13401
|
77778
|
|
|
|
|
else if (! LOC) { /* locale could vary these */ |
13402
|
13030
|
|
|
|
|
if (prevvalue == '0') { |
13403
|
13030
|
|
|
|
|
if (value == '9') { |
13404
|
|
|
|
|
|
arg = _CC_DIGIT; |
13405
|
|
|
|
|
|
op = POSIXA; |
13406
|
|
|
|
|
|
} |
13407
|
|
|
|
|
|
} |
13408
|
|
|
|
|
|
} |
13409
|
|
|
|
|
|
|
13410
|
|
|
|
|
|
/* Here, we have changed away from its initial value iff we found |
13411
|
|
|
|
|
|
* an optimization */ |
13412
|
11562
|
|
|
|
|
if (op != END) { |
13413
|
|
|
|
|
|
|
13414
|
|
|
|
|
|
/* Throw away this ANYOF regnode, and emit the calculated one, |
13415
|
|
|
|
|
|
* which should correspond to the beginning, not current, state of |
13416
|
|
|
|
|
|
* the parse */ |
13417
|
11562
|
|
|
|
|
const char * cur_parse = RExC_parse; |
13418
|
32
|
|
|
|
|
RExC_parse = (char *)orig_parse; |
13419
|
32
|
|
|
|
|
if ( SIZE_ONLY) { |
13420
|
785140
|
|
|
|
|
if (! LOC) { |
13421
|
|
|
|
|
|
|
13422
|
|
|
|
|
|
/* To get locale nodes to not use the full ANYOF size would |
13423
|
|
|
|
|
|
* require moving the code above that writes the portions |
13424
|
|
|
|
|
|
* of it that aren't in other nodes to after this point. |
13425
|
|
|
|
|
|
* e.g. ANYOF_CLASS_SET */ |
13426
|
614360
|
|
|
|
|
RExC_size = orig_size; |
13427
|
|
|
|
|
|
} |
13428
|
|
|
|
|
|
} |
13429
|
|
|
|
|
|
else { |
13430
|
327828
|
|
|
|
|
RExC_emit = (regnode *)orig_emit; |
13431
|
214308
|
|
|
|
|
if (PL_regkind[op] == POSIXD) { |
13432
|
813106
|
|
|
|
|
if (invert) { |
13433
|
598814
|
|
|
|
|
op += NPOSIXD - POSIXD; |
13434
|
|
|
|
|
|
} |
13435
|
|
|
|
|
|
} |
13436
|
|
|
|
|
|
} |
13437
|
|
|
|
|
|
|
13438
|
598846
|
|
|
|
|
ret = reg_node(pRExC_state, op); |
13439
|
|
|
|
|
|
|
13440
|
32
|
|
|
|
|
if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { |
13441
|
598814
|
|
|
|
|
if (! SIZE_ONLY) { |
13442
|
898221
|
|
|
|
|
FLAGS(ret) = arg; |
13443
|
|
|
|
|
|
} |
13444
|
53760
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
13445
|
|
|
|
|
|
} |
13446
|
898253
|
|
|
|
|
else if (PL_regkind[op] == EXACT) { |
13447
|
11552
|
|
|
|
|
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); |
13448
|
|
|
|
|
|
} |
13449
|
|
|
|
|
|
|
13450
|
643055
|
|
|
|
|
RExC_parse = (char *) cur_parse; |
13451
|
|
|
|
|
|
|
13452
|
587326
|
|
|
|
|
SvREFCNT_dec(posixes); |
13453
|
32
|
|
|
|
|
SvREFCNT_dec(cp_list); |
13454
|
931484
|
|
|
|
|
return ret; |
13455
|
|
|
|
|
|
} |
13456
|
|
|
|
|
|
} |
13457
|
|
|
|
|
|
|
13458
|
3355957
|
|
|
|
|
if (SIZE_ONLY) |
13459
|
|
|
|
|
|
return ret; |
13460
|
|
|
|
|
|
/****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ |
13461
|
|
|
|
|
|
|
13462
|
|
|
|
|
|
/* If folding, we calculate all characters that could fold to or from the |
13463
|
|
|
|
|
|
* ones already on the list */ |
13464
|
157112
|
|
|
|
|
if (FOLD && cp_list) { |
13465
|
|
|
|
|
|
UV start, end; /* End points of code point ranges */ |
13466
|
|
|
|
|
|
|
13467
|
7694
|
|
|
|
|
SV* fold_intersection = NULL; |
13468
|
|
|
|
|
|
|
13469
|
|
|
|
|
|
/* If the highest code point is within Latin1, we can use the |
13470
|
|
|
|
|
|
* compiled-in Alphas list, and not have to go out to disk. This |
13471
|
|
|
|
|
|
* yields two false positives, the masculine and feminine ordinal |
13472
|
|
|
|
|
|
* indicators, which are weeded out below using the |
13473
|
|
|
|
|
|
* IS_IN_SOME_FOLD_L1() macro */ |
13474
|
6814
|
|
|
|
|
if (invlist_highest(cp_list) < 256) { |
13475
|
6814
|
|
|
|
|
_invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, |
13476
|
|
|
|
|
|
&fold_intersection); |
13477
|
|
|
|
|
|
} |
13478
|
|
|
|
|
|
else { |
13479
|
|
|
|
|
|
|
13480
|
|
|
|
|
|
/* Here, there are non-Latin1 code points, so we will have to go |
13481
|
|
|
|
|
|
* fetch the list of all the characters that participate in folds |
13482
|
|
|
|
|
|
*/ |
13483
|
880
|
|
|
|
|
if (! PL_utf8_foldable) { |
13484
|
98006
|
|
|
|
|
SV* swash = swash_init("utf8", "_Perl_Any_Folds", |
13485
|
|
|
|
|
|
&PL_sv_undef, 1, 0); |
13486
|
98006
|
|
|
|
|
PL_utf8_foldable = _get_swash_invlist(swash); |
13487
|
98006
|
|
|
|
|
SvREFCNT_dec_NN(swash); |
13488
|
|
|
|
|
|
} |
13489
|
|
|
|
|
|
|
13490
|
|
|
|
|
|
/* This is a hash that for a particular fold gives all characters |
13491
|
|
|
|
|
|
* that are involved in it */ |
13492
|
98006
|
|
|
|
|
if (! PL_utf8_foldclosures) { |
13493
|
|
|
|
|
|
|
13494
|
|
|
|
|
|
/* If we were unable to find any folds, then we likely won't be |
13495
|
|
|
|
|
|
* able to find the closures. So just create an empty list. |
13496
|
|
|
|
|
|
* Folding will effectively be restricted to the non-Unicode |
13497
|
|
|
|
|
|
* rules hard-coded into Perl. (This case happens legitimately |
13498
|
|
|
|
|
|
* during compilation of Perl itself before the Unicode tables |
13499
|
|
|
|
|
|
* are generated) */ |
13500
|
98006
|
|
|
|
|
if (_invlist_len(PL_utf8_foldable) == 0) { |
13501
|
83188
|
|
|
|
|
PL_utf8_foldclosures = newHV(); |
13502
|
|
|
|
|
|
} |
13503
|
|
|
|
|
|
else { |
13504
|
|
|
|
|
|
/* If the folds haven't been read in, call a fold function |
13505
|
|
|
|
|
|
* to force that */ |
13506
|
83188
|
|
|
|
|
if (! PL_utf8_tofold) { |
13507
|
|
|
|
|
|
U8 dummy[UTF8_MAXBYTES_CASE+1]; |
13508
|
|
|
|
|
|
|
13509
|
|
|
|
|
|
/* This string is just a short named one above \xff */ |
13510
|
14818
|
|
|
|
|
to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); |
13511
|
98006
|
|
|
|
|
assert(PL_utf8_tofold); /* Verify that worked */ |
13512
|
|
|
|
|
|
} |
13513
|
0
|
|
|
|
|
PL_utf8_foldclosures = |
13514
|
0
|
|
|
|
|
_swash_inversion_hash(PL_utf8_tofold); |
13515
|
|
|
|
|
|
} |
13516
|
|
|
|
|
|
} |
13517
|
|
|
|
|
|
|
13518
|
|
|
|
|
|
/* Only the characters in this class that participate in folds need |
13519
|
|
|
|
|
|
* be checked. Get the intersection of this class and all the |
13520
|
|
|
|
|
|
* possible characters that are foldable. This can quickly narrow |
13521
|
|
|
|
|
|
* down a large class */ |
13522
|
98006
|
|
|
|
|
_invlist_intersection(PL_utf8_foldable, cp_list, |
13523
|
|
|
|
|
|
&fold_intersection); |
13524
|
|
|
|
|
|
} |
13525
|
|
|
|
|
|
|
13526
|
|
|
|
|
|
/* Now look at the foldable characters in this class individually */ |
13527
|
3355953
|
|
|
|
|
invlist_iterinit(fold_intersection); |
13528
|
180652
|
|
|
|
|
while (invlist_iternext(fold_intersection, &start, &end)) { |
13529
|
|
|
|
|
|
UV j; |
13530
|
|
|
|
|
|
|
13531
|
|
|
|
|
|
/* Locale folding for Latin1 characters is deferred until runtime */ |
13532
|
180652
|
|
|
|
|
if (LOC && start < 256) { |
13533
|
506
|
|
|
|
|
start = 256; |
13534
|
|
|
|
|
|
} |
13535
|
|
|
|
|
|
|
13536
|
|
|
|
|
|
/* Look at every character in the range */ |
13537
|
504
|
|
|
|
|
for (j = start; j <= end; j++) { |
13538
|
|
|
|
|
|
|
13539
|
|
|
|
|
|
U8 foldbuf[UTF8_MAXBYTES_CASE+1]; |
13540
|
|
|
|
|
|
STRLEN foldlen; |
13541
|
|
|
|
|
|
SV** listp; |
13542
|
|
|
|
|
|
|
13543
|
504
|
|
|
|
|
if (j < 256) { |
13544
|
|
|
|
|
|
|
13545
|
|
|
|
|
|
/* We have the latin1 folding rules hard-coded here so that |
13546
|
|
|
|
|
|
* an innocent-looking character class, like /[ks]/i won't |
13547
|
|
|
|
|
|
* have to go out to disk to find the possible matches. |
13548
|
|
|
|
|
|
* XXX It would be better to generate these via regen, in |
13549
|
|
|
|
|
|
* case a new version of the Unicode standard adds new |
13550
|
|
|
|
|
|
* mappings, though that is not really likely, and may be |
13551
|
|
|
|
|
|
* caught by the default: case of the switch below. */ |
13552
|
|
|
|
|
|
|
13553
|
504
|
|
|
|
|
if (IS_IN_SOME_FOLD_L1(j)) { |
13554
|
|
|
|
|
|
|
13555
|
|
|
|
|
|
/* ASCII is always matched; non-ASCII is matched only |
13556
|
|
|
|
|
|
* under Unicode rules */ |
13557
|
0
|
|
|
|
|
if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { |
13558
|
0
|
|
|
|
|
cp_list = |
13559
|
504
|
|
|
|
|
add_cp_to_invlist(cp_list, PL_fold_latin1[j]); |
13560
|
|
|
|
|
|
} |
13561
|
|
|
|
|
|
else { |
13562
|
506
|
|
|
|
|
depends_list = |
13563
|
506
|
|
|
|
|
add_cp_to_invlist(depends_list, PL_fold_latin1[j]); |
13564
|
|
|
|
|
|
} |
13565
|
|
|
|
|
|
} |
13566
|
|
|
|
|
|
|
13567
|
180146
|
|
|
|
|
if (HAS_NONLATIN1_FOLD_CLOSURE(j) |
13568
|
180652
|
|
|
|
|
&& (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) |
13569
|
|
|
|
|
|
{ |
13570
|
|
|
|
|
|
/* Certain Latin1 characters have matches outside |
13571
|
|
|
|
|
|
* Latin1. To get here, is one of those |
13572
|
|
|
|
|
|
* characters. None of these matches is valid for |
13573
|
|
|
|
|
|
* ASCII characters under /aa, which is why the 'if' |
13574
|
|
|
|
|
|
* just above excludes those. These matches only |
13575
|
|
|
|
|
|
* happen when the target string is utf8. The code |
13576
|
|
|
|
|
|
* below adds the single fold closures for to the |
13577
|
|
|
|
|
|
* inversion list. */ |
13578
|
180170
|
|
|
|
|
switch (j) { |
13579
|
|
|
|
|
|
case 'k': |
13580
|
|
|
|
|
|
case 'K': |
13581
|
3355953
|
|
|
|
|
cp_list = |
13582
|
1565073
|
|
|
|
|
add_cp_to_invlist(cp_list, KELVIN_SIGN); |
13583
|
949782
|
|
|
|
|
break; |
13584
|
|
|
|
|
|
case 's': |
13585
|
|
|
|
|
|
case 'S': |
13586
|
885746
|
|
|
|
|
cp_list = add_cp_to_invlist(cp_list, |
13587
|
|
|
|
|
|
LATIN_SMALL_LETTER_LONG_S); |
13588
|
885746
|
|
|
|
|
break; |
13589
|
|
|
|
|
|
case MICRO_SIGN: |
13590
|
885746
|
|
|
|
|
cp_list = add_cp_to_invlist(cp_list, |
13591
|
|
|
|
|
|
GREEK_CAPITAL_LETTER_MU); |
13592
|
64
|
|
|
|
|
cp_list = add_cp_to_invlist(cp_list, |
13593
|
|
|
|
|
|
GREEK_SMALL_LETTER_MU); |
13594
|
3355953
|
|
|
|
|
break; |
13595
|
|
|
|
|
|
case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: |
13596
|
|
|
|
|
|
case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: |
13597
|
3120
|
|
|
|
|
cp_list = |
13598
|
3120
|
|
|
|
|
add_cp_to_invlist(cp_list, ANGSTROM_SIGN); |
13599
|
3120
|
|
|
|
|
break; |
13600
|
|
|
|
|
|
case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: |
13601
|
0
|
|
|
|
|
cp_list = add_cp_to_invlist(cp_list, |
13602
|
|
|
|
|
|
LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); |
13603
|
3120
|
|
|
|
|
break; |
13604
|
|
|
|
|
|
case LATIN_SMALL_LETTER_SHARP_S: |
13605
|
3814846
|
|
|
|
|
cp_list = add_cp_to_invlist(cp_list, |
13606
|
|
|
|
|
|
LATIN_CAPITAL_LETTER_SHARP_S); |
13607
|
195360
|
|
|
|
|
break; |
13608
|
|
|
|
|
|
case 'F': case 'f': |
13609
|
|
|
|
|
|
case 'I': case 'i': |
13610
|
|
|
|
|
|
case 'L': case 'l': |
13611
|
|
|
|
|
|
case 'T': case 't': |
13612
|
|
|
|
|
|
case 'A': case 'a': |
13613
|
|
|
|
|
|
case 'H': case 'h': |
13614
|
|
|
|
|
|
case 'J': case 'j': |
13615
|
|
|
|
|
|
case 'N': case 'n': |
13616
|
|
|
|
|
|
case 'W': case 'w': |
13617
|
|
|
|
|
|
case 'Y': case 'y': |
13618
|
|
|
|
|
|
/* These all are targets of multi-character |
13619
|
|
|
|
|
|
* folds from code points that require UTF8 to |
13620
|
|
|
|
|
|
* express, so they can't match unless the |
13621
|
|
|
|
|
|
* target string is in UTF-8, so no action here |
13622
|
|
|
|
|
|
* is necessary, as regexec.c properly handles |
13623
|
|
|
|
|
|
* the general case for UTF-8 matching and |
13624
|
|
|
|
|
|
* multi-char folds */ |
13625
|
|
|
|
|
|
break; |
13626
|
|
|
|
|
|
default: |
13627
|
|
|
|
|
|
/* Use deprecated warning to increase the |
13628
|
|
|
|
|
|
* chances of this being output */ |
13629
|
3352833
|
|
|
|
|
ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); |
13630
|
3327701
|
|
|
|
|
break; |
13631
|
|
|
|
|
|
} |
13632
|
|
|
|
|
|
} |
13633
|
3164785
|
|
|
|
|
continue; |
13634
|
|
|
|
|
|
} |
13635
|
|
|
|
|
|
|
13636
|
|
|
|
|
|
/* Here is an above Latin1 character. We don't have the rules |
13637
|
|
|
|
|
|
* hard-coded for it. First, get its fold. This is the simple |
13638
|
|
|
|
|
|
* fold, as the multi-character folds have been handled earlier |
13639
|
|
|
|
|
|
* and separated out */ |
13640
|
3091451
|
|
|
|
|
_to_uni_fold_flags(j, foldbuf, &foldlen, |
13641
|
|
|
|
|
|
((LOC) |
13642
|
|
|
|
|
|
? FOLD_FLAGS_LOCALE |
13643
|
|
|
|
|
|
: (ASCII_FOLD_RESTRICTED) |
13644
|
|
|
|
|
|
? FOLD_FLAGS_NOMIX_ASCII |
13645
|
|
|
|
|
|
: 0)); |
13646
|
|
|
|
|
|
|
13647
|
|
|
|
|
|
/* Single character fold of above Latin1. Add everything in |
13648
|
|
|
|
|
|
* its fold closure to the list that this node should match. |
13649
|
|
|
|
|
|
* The fold closures data structure is a hash with the keys |
13650
|
|
|
|
|
|
* being the UTF-8 of every character that is folded to, like |
13651
|
|
|
|
|
|
* 'k', and the values each an array of all code points that |
13652
|
|
|
|
|
|
* fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. |
13653
|
|
|
|
|
|
* Multi-character folds are not included */ |
13654
|
3090783
|
|
|
|
|
if ((listp = hv_fetch(PL_utf8_foldclosures, |
13655
|
|
|
|
|
|
(char *) foldbuf, foldlen, FALSE))) |
13656
|
|
|
|
|
|
{ |
13657
|
3088039
|
|
|
|
|
AV* list = (AV*) *listp; |
13658
|
|
|
|
|
|
IV k; |
13659
|
3088039
|
|
|
|
|
for (k = 0; k <= av_len(list); k++) { |
13660
|
3088039
|
|
|
|
|
SV** c_p = av_fetch(list, k, FALSE); |
13661
|
|
|
|
|
|
UV c; |
13662
|
124
|
|
|
|
|
if (c_p == NULL) { |
13663
|
3087915
|
|
|
|
|
Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); |
13664
|
|
|
|
|
|
} |
13665
|
1505627
|
|
|
|
|
c = SvUV(*c_p); |
13666
|
|
|
|
|
|
|
13667
|
|
|
|
|
|
/* /aa doesn't allow folds between ASCII and non-; /l |
13668
|
|
|
|
|
|
* doesn't allow them between above and below 256 */ |
13669
|
54984
|
|
|
|
|
if ((ASCII_FOLD_RESTRICTED |
13670
|
780
|
|
|
|
|
&& (isASCII(c) != isASCII(j))) |
13671
|
780
|
|
|
|
|
|| (LOC && c < 256)) { |
13672
|
0
|
|
|
|
|
continue; |
13673
|
|
|
|
|
|
} |
13674
|
|
|
|
|
|
|
13675
|
|
|
|
|
|
/* Folds involving non-ascii Latin1 characters |
13676
|
|
|
|
|
|
* under /d are added to a separate list */ |
13677
|
0
|
|
|
|
|
if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) |
13678
|
|
|
|
|
|
{ |
13679
|
0
|
|
|
|
|
cp_list = add_cp_to_invlist(cp_list, c); |
13680
|
|
|
|
|
|
} |
13681
|
|
|
|
|
|
else { |
13682
|
0
|
|
|
|
|
depends_list = add_cp_to_invlist(depends_list, c); |
13683
|
|
|
|
|
|
} |
13684
|
|
|
|
|
|
} |
13685
|
|
|
|
|
|
} |
13686
|
|
|
|
|
|
} |
13687
|
|
|
|
|
|
} |
13688
|
0
|
|
|
|
|
SvREFCNT_dec_NN(fold_intersection); |
13689
|
|
|
|
|
|
} |
13690
|
|
|
|
|
|
|
13691
|
|
|
|
|
|
/* And combine the result (if any) with any inversion list from posix |
13692
|
|
|
|
|
|
* classes. The lists are kept separate up to now because we don't want to |
13693
|
|
|
|
|
|
* fold the classes (folding of those is automatically handled by the swash |
13694
|
|
|
|
|
|
* fetching code) */ |
13695
|
2
|
|
|
|
|
if (posixes) { |
13696
|
0
|
|
|
|
|
if (! DEPENDS_SEMANTICS) { |
13697
|
0
|
|
|
|
|
if (cp_list) { |
13698
|
0
|
|
|
|
|
_invlist_union(cp_list, posixes, &cp_list); |
13699
|
1582288
|
|
|
|
|
SvREFCNT_dec_NN(posixes); |
13700
|
|
|
|
|
|
} |
13701
|
|
|
|
|
|
else { |
13702
|
984682
|
|
|
|
|
cp_list = posixes; |
13703
|
|
|
|
|
|
} |
13704
|
|
|
|
|
|
} |
13705
|
|
|
|
|
|
else { |
13706
|
|
|
|
|
|
/* Under /d, we put into a separate list the Latin1 things that |
13707
|
|
|
|
|
|
* match only when the target string is utf8 */ |
13708
|
166
|
|
|
|
|
SV* nonascii_but_latin1_properties = NULL; |
13709
|
166
|
|
|
|
|
_invlist_intersection(posixes, PL_Latin1, |
13710
|
|
|
|
|
|
&nonascii_but_latin1_properties); |
13711
|
984516
|
|
|
|
|
_invlist_subtract(nonascii_but_latin1_properties, PL_ASCII, |
13712
|
|
|
|
|
|
&nonascii_but_latin1_properties); |
13713
|
3280
|
|
|
|
|
_invlist_subtract(posixes, nonascii_but_latin1_properties, |
13714
|
|
|
|
|
|
&posixes); |
13715
|
3250
|
|
|
|
|
if (cp_list) { |
13716
|
272
|
|
|
|
|
_invlist_union(cp_list, posixes, &cp_list); |
13717
|
272
|
|
|
|
|
SvREFCNT_dec_NN(posixes); |
13718
|
|
|
|
|
|
} |
13719
|
|
|
|
|
|
else { |
13720
|
3088039
|
|
|
|
|
cp_list = posixes; |
13721
|
|
|
|
|
|
} |
13722
|
|
|
|
|
|
|
13723
|
3088039
|
|
|
|
|
if (depends_list) { |
13724
|
1342
|
|
|
|
|
_invlist_union(depends_list, nonascii_but_latin1_properties, |
13725
|
|
|
|
|
|
&depends_list); |
13726
|
1342
|
|
|
|
|
SvREFCNT_dec_NN(nonascii_but_latin1_properties); |
13727
|
|
|
|
|
|
} |
13728
|
|
|
|
|
|
else { |
13729
|
1342
|
|
|
|
|
depends_list = nonascii_but_latin1_properties; |
13730
|
|
|
|
|
|
} |
13731
|
|
|
|
|
|
} |
13732
|
|
|
|
|
|
} |
13733
|
|
|
|
|
|
|
13734
|
|
|
|
|
|
/* And combine the result (if any) with any inversion list from properties. |
13735
|
|
|
|
|
|
* The lists are kept separate up to now so that we can distinguish the two |
13736
|
|
|
|
|
|
* in regards to matching above-Unicode. A run-time warning is generated |
13737
|
|
|
|
|
|
* if a Unicode property is matched against a non-Unicode code point. But, |
13738
|
|
|
|
|
|
* we allow user-defined properties to match anything, without any warning, |
13739
|
|
|
|
|
|
* and we also suppress the warning if there is a portion of the character |
13740
|
|
|
|
|
|
* class that isn't a Unicode property, and which matches above Unicode, \W |
13741
|
|
|
|
|
|
* or [\x{110000}] for example. |
13742
|
|
|
|
|
|
* (Note that in this case, unlike the Posix one above, there is no |
13743
|
|
|
|
|
|
* , because having a Unicode property forces Unicode |
13744
|
|
|
|
|
|
* semantics */ |
13745
|
1344
|
|
|
|
|
if (properties) { |
13746
|
1342
|
|
|
|
|
bool warn_super = ! has_user_defined_property; |
13747
|
780
|
|
|
|
|
if (cp_list) { |
13748
|
|
|
|
|
|
|
13749
|
|
|
|
|
|
/* If it matters to the final outcome, see if a non-property |
13750
|
|
|
|
|
|
* component of the class matches above Unicode. If so, the |
13751
|
|
|
|
|
|
* warning gets suppressed. This is true even if just a single |
13752
|
|
|
|
|
|
* such code point is specified, as though not strictly correct if |
13753
|
|
|
|
|
|
* another such code point is matched against, the fact that they |
13754
|
|
|
|
|
|
* are using above-Unicode code points indicates they should know |
13755
|
|
|
|
|
|
* the issues involved */ |
13756
|
1342
|
|
|
|
|
if (warn_super) { |
13757
|
1342
|
|
|
|
|
bool non_prop_matches_above_Unicode = |
13758
|
|
|
|
|
|
runtime_posix_matches_above_Unicode |
13759
|
3351491
|
|
|
|
|
| (invlist_highest(cp_list) > PERL_UNICODE_MAX); |
13760
|
3351491
|
|
|
|
|
if (invert) { |
13761
|
3326359
|
|
|
|
|
non_prop_matches_above_Unicode = |
13762
|
11152661
|
|
|
|
|
! non_prop_matches_above_Unicode; |
13763
|
|
|
|
|
|
} |
13764
|
8632132
|
|
|
|
|
warn_super = ! non_prop_matches_above_Unicode; |
13765
|
|
|
|
|
|
} |
13766
|
|
|
|
|
|
|
13767
|
653708
|
|
|
|
|
_invlist_union(properties, cp_list, &cp_list); |
13768
|
8632132
|
|
|
|
|
SvREFCNT_dec_NN(properties); |
13769
|
|
|
|
|
|
} |
13770
|
|
|
|
|
|
else { |
13771
|
7826302
|
|
|
|
|
cp_list = properties; |
13772
|
|
|
|
|
|
} |
13773
|
|
|
|
|
|
|
13774
|
276181514
|
|
|
|
|
if (warn_super) { |
13775
|
268355212
|
|
|
|
|
OP(ret) = ANYOF_WARN_SUPER; |
13776
|
|
|
|
|
|
} |
13777
|
|
|
|
|
|
} |
13778
|
|
|
|
|
|
|
13779
|
|
|
|
|
|
/* Here, we have calculated what code points should be in the character |
13780
|
|
|
|
|
|
* class. |
13781
|
|
|
|
|
|
* |
13782
|
|
|
|
|
|
* Now we can see about various optimizations. Fold calculation (which we |
13783
|
|
|
|
|
|
* did above) needs to take place before inversion. Otherwise /[^k]/i |
13784
|
|
|
|
|
|
* would invert to include K, which under /i would match k, which it |
13785
|
|
|
|
|
|
* shouldn't. Therefore we can't invert folded locale now, as it won't be |
13786
|
|
|
|
|
|
* folded until runtime */ |
13787
|
|
|
|
|
|
|
13788
|
|
|
|
|
|
/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known |
13789
|
|
|
|
|
|
* at compile time. Besides not inverting folded locale now, we can't |
13790
|
|
|
|
|
|
* invert if there are things such as \w, which aren't known until runtime |
13791
|
|
|
|
|
|
* */ |
13792
|
268355214
|
|
|
|
|
if (invert |
13793
|
3326359
|
|
|
|
|
&& ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) |
13794
|
3326359
|
|
|
|
|
&& ! depends_list |
13795
|
3176295
|
|
|
|
|
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) |
13796
|
|
|
|
|
|
{ |
13797
|
4954799
|
|
|
|
|
_invlist_invert(cp_list); |
13798
|
|
|
|
|
|
|
13799
|
|
|
|
|
|
/* Any swash can't be used as-is, because we've inverted things */ |
13800
|
1865159
|
|
|
|
|
if (swash) { |
13801
|
1865159
|
|
|
|
|
SvREFCNT_dec_NN(swash); |
13802
|
|
|
|
|
|
swash = NULL; |
13803
|
|
|
|
|
|
} |
13804
|
|
|
|
|
|
|
13805
|
|
|
|
|
|
/* Clear the invert flag since have just done it here */ |
13806
|
|
|
|
|
|
invert = FALSE; |
13807
|
|
|
|
|
|
} |
13808
|
|
|
|
|
|
|
13809
|
3351493
|
|
|
|
|
if (ret_invlist) { |
13810
|
162916
|
|
|
|
|
*ret_invlist = cp_list; |
13811
|
3351491
|
|
|
|
|
SvREFCNT_dec(swash); |
13812
|
|
|
|
|
|
|
13813
|
|
|
|
|
|
/* Discard the generated node */ |
13814
|
137370
|
|
|
|
|
if (SIZE_ONLY) { |
13815
|
98250
|
|
|
|
|
RExC_size = orig_size; |
13816
|
|
|
|
|
|
} |
13817
|
|
|
|
|
|
else { |
13818
|
98250
|
|
|
|
|
RExC_emit = orig_emit; |
13819
|
|
|
|
|
|
} |
13820
|
|
|
|
|
|
return orig_emit; |
13821
|
|
|
|
|
|
} |
13822
|
|
|
|
|
|
|
13823
|
|
|
|
|
|
/* If we didn't do folding, it's because some information isn't available |
13824
|
|
|
|
|
|
* until runtime; set the run-time fold flag for these. (We don't have to |
13825
|
|
|
|
|
|
* worry about properties folding, as that is taken care of by the swash |
13826
|
|
|
|
|
|
* fetching) */ |
13827
|
39122
|
|
|
|
|
if (FOLD && LOC) |
13828
|
|
|
|
|
|
{ |
13829
|
3351491
|
|
|
|
|
ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; |
13830
|
|
|
|
|
|
} |
13831
|
|
|
|
|
|
|
13832
|
|
|
|
|
|
/* Some character classes are equivalent to other nodes. Such nodes take |
13833
|
|
|
|
|
|
* up less room and generally fewer operations to execute than ANYOF nodes. |
13834
|
|
|
|
|
|
* Above, we checked for and optimized into some such equivalents for |
13835
|
|
|
|
|
|
* certain common classes that are easy to test. Getting to this point in |
13836
|
|
|
|
|
|
* the code means that the class didn't get optimized there. Since this |
13837
|
|
|
|
|
|
* code is only executed in Pass 2, it is too late to save space--it has |
13838
|
|
|
|
|
|
* been allocated in Pass 1, and currently isn't given back. But turning |
13839
|
|
|
|
|
|
* things into an EXACTish node can allow the optimizer to join it to any |
13840
|
|
|
|
|
|
* adjacent such nodes. And if the class is equivalent to things like /./, |
13841
|
|
|
|
|
|
* expensive run-time swashes can be avoided. Now that we have more |
13842
|
|
|
|
|
|
* complete information, we can find things necessarily missed by the |
13843
|
|
|
|
|
|
* earlier code. I (khw) am not sure how much to look for here. It would |
13844
|
|
|
|
|
|
* be easy, but perhaps too slow, to check any candidates against all the |
13845
|
|
|
|
|
|
* node types they could possibly match using _invlistEQ(). */ |
13846
|
|
|
|
|
|
|
13847
|
278
|
|
|
|
|
if (cp_list |
13848
|
3351493
|
|
|
|
|
&& ! invert |
13849
|
1851173
|
|
|
|
|
&& ! depends_list |
13850
|
1824923
|
|
|
|
|
&& ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) |
13851
|
1526572
|
|
|
|
|
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) |
13852
|
|
|
|
|
|
{ |
13853
|
|
|
|
|
|
UV start, end; |
13854
|
|
|
|
|
|
U8 op = END; /* The optimzation node-type */ |
13855
|
2338313
|
|
|
|
|
const char * cur_parse= RExC_parse; |
13856
|
|
|
|
|
|
|
13857
|
1526572
|
|
|
|
|
invlist_iterinit(cp_list); |
13858
|
90254
|
|
|
|
|
if (! invlist_iternext(cp_list, &start, &end)) { |
13859
|
|
|
|
|
|
|
13860
|
|
|
|
|
|
/* Here, the list is empty. This happens, for example, when a |
13861
|
|
|
|
|
|
* Unicode property is the only thing in the character class, and |
13862
|
|
|
|
|
|
* it doesn't match anything. (perluniprops.pod notes such |
13863
|
|
|
|
|
|
* properties) */ |
13864
|
|
|
|
|
|
op = OPFAIL; |
13865
|
90252
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
13866
|
|
|
|
|
|
} |
13867
|
1436320
|
|
|
|
|
else if (start == end) { /* The range is a single code point */ |
13868
|
1436318
|
|
|
|
|
if (! invlist_iternext(cp_list, &start, &end) |
13869
|
|
|
|
|
|
|
13870
|
|
|
|
|
|
/* Don't do this optimization if it would require changing |
13871
|
|
|
|
|
|
* the pattern to UTF-8 */ |
13872
|
1410068
|
|
|
|
|
&& (start < 256 || UTF)) |
13873
|
|
|
|
|
|
{ |
13874
|
|
|
|
|
|
/* Here, the list contains a single code point. Can optimize |
13875
|
|
|
|
|
|
* into an EXACT node */ |
13876
|
|
|
|
|
|
|
13877
|
1410068
|
|
|
|
|
value = start; |
13878
|
|
|
|
|
|
|
13879
|
1526570
|
|
|
|
|
if (! FOLD) { |
13880
|
|
|
|
|
|
op = EXACT; |
13881
|
|
|
|
|
|
} |
13882
|
1526570
|
|
|
|
|
else if (LOC) { |
13883
|
|
|
|
|
|
|
13884
|
|
|
|
|
|
/* A locale node under folding with one code point can be |
13885
|
|
|
|
|
|
* an EXACTFL, as its fold won't be calculated until |
13886
|
|
|
|
|
|
* runtime */ |
13887
|
|
|
|
|
|
op = EXACTFL; |
13888
|
|
|
|
|
|
} |
13889
|
|
|
|
|
|
else { |
13890
|
|
|
|
|
|
|
13891
|
|
|
|
|
|
/* Here, we are generally folding, but there is only one |
13892
|
|
|
|
|
|
* code point to match. If we have to, we use an EXACT |
13893
|
|
|
|
|
|
* node, but it would be better for joining with adjacent |
13894
|
|
|
|
|
|
* nodes in the optimization pass if we used the same |
13895
|
|
|
|
|
|
* EXACTFish node that any such are likely to be. We can |
13896
|
|
|
|
|
|
* do this iff the code point doesn't participate in any |
13897
|
|
|
|
|
|
* folds. For example, an EXACTF of a colon is the same as |
13898
|
|
|
|
|
|
* an EXACT one, since nothing folds to or from a colon. */ |
13899
|
1526570
|
|
|
|
|
if (value < 256) { |
13900
|
1526570
|
|
|
|
|
if (IS_IN_SOME_FOLD_L1(value)) { |
13901
|
|
|
|
|
|
op = EXACT; |
13902
|
|
|
|
|
|
} |
13903
|
|
|
|
|
|
} |
13904
|
|
|
|
|
|
else { |
13905
|
3351491
|
|
|
|
|
if (! PL_utf8_foldable) { |
13906
|
5838272
|
|
|
|
|
SV* swash = swash_init("utf8", "_Perl_Any_Folds", |
13907
|
|
|
|
|
|
&PL_sv_undef, 1, 0); |
13908
|
11470056
|
|
|
|
|
PL_utf8_foldable = _get_swash_invlist(swash); |
13909
|
12016300
|
|
|
|
|
SvREFCNT_dec_NN(swash); |
13910
|
|
|
|
|
|
} |
13911
|
1073048
|
|
|
|
|
if (_invlist_contains_cp(PL_utf8_foldable, value)) { |
13912
|
|
|
|
|
|
op = EXACT; |
13913
|
|
|
|
|
|
} |
13914
|
|
|
|
|
|
} |
13915
|
|
|
|
|
|
|
13916
|
|
|
|
|
|
/* If we haven't found the node type, above, it means we |
13917
|
|
|
|
|
|
* can use the prevailing one */ |
13918
|
0
|
|
|
|
|
if (op == END) { |
13919
|
165799008
|
|
|
|
|
op = compute_EXACTish(pRExC_state); |
13920
|
|
|
|
|
|
} |
13921
|
|
|
|
|
|
} |
13922
|
|
|
|
|
|
} |
13923
|
|
|
|
|
|
} |
13924
|
196100742
|
|
|
|
|
else if (start == 0) { |
13925
|
225688232
|
|
|
|
|
if (end == UV_MAX) { |
13926
|
|
|
|
|
|
op = SANY; |
13927
|
190061120
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
13928
|
22476186
|
|
|
|
|
RExC_naughty++; |
13929
|
|
|
|
|
|
} |
13930
|
8990800
|
|
|
|
|
else if (end == '\n' - 1 |
13931
|
6808
|
|
|
|
|
&& invlist_iternext(cp_list, &start, &end) |
13932
|
5856
|
|
|
|
|
&& start == '\n' + 1 && end == UV_MAX) |
13933
|
|
|
|
|
|
{ |
13934
|
|
|
|
|
|
op = REG_ANY; |
13935
|
28
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
13936
|
5828
|
|
|
|
|
RExC_naughty++; |
13937
|
|
|
|
|
|
} |
13938
|
|
|
|
|
|
} |
13939
|
954
|
|
|
|
|
invlist_iterfinish(cp_list); |
13940
|
|
|
|
|
|
|
13941
|
954
|
|
|
|
|
if (op != END) { |
13942
|
225687252
|
|
|
|
|
RExC_parse = (char *)orig_parse; |
13943
|
81662828
|
|
|
|
|
RExC_emit = (regnode *)orig_emit; |
13944
|
|
|
|
|
|
|
13945
|
58815224
|
|
|
|
|
ret = reg_node(pRExC_state, op); |
13946
|
|
|
|
|
|
|
13947
|
58815224
|
|
|
|
|
RExC_parse = (char *)cur_parse; |
13948
|
|
|
|
|
|
|
13949
|
22847604
|
|
|
|
|
if (PL_regkind[op] == EXACT) { |
13950
|
1073048
|
|
|
|
|
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); |
13951
|
|
|
|
|
|
} |
13952
|
|
|
|
|
|
|
13953
|
1073048
|
|
|
|
|
SvREFCNT_dec_NN(cp_list); |
13954
|
165798980
|
|
|
|
|
return ret; |
13955
|
|
|
|
|
|
} |
13956
|
|
|
|
|
|
} |
13957
|
|
|
|
|
|
|
13958
|
|
|
|
|
|
/* Here, contains all the code points we can determine at |
13959
|
|
|
|
|
|
* compile time that match under all conditions. Go through it, and |
13960
|
|
|
|
|
|
* for things that belong in the bitmap, put them there, and delete from |
13961
|
|
|
|
|
|
* . While we are at it, see if everything above 255 is in the |
13962
|
|
|
|
|
|
* list, and if so, set a flag to speed up execution */ |
13963
|
85740864
|
|
|
|
|
ANYOF_BITMAP_ZERO(ret); |
13964
|
85740864
|
|
|
|
|
if (cp_list) { |
13965
|
|
|
|
|
|
|
13966
|
|
|
|
|
|
/* This gets set if we actually need to modify things */ |
13967
|
|
|
|
|
|
bool change_invlist = FALSE; |
13968
|
|
|
|
|
|
|
13969
|
|
|
|
|
|
UV start, end; |
13970
|
|
|
|
|
|
|
13971
|
|
|
|
|
|
/* Start looking through */ |
13972
|
85740864
|
|
|
|
|
invlist_iterinit(cp_list); |
13973
|
44009311
|
|
|
|
|
while (invlist_iternext(cp_list, &start, &end)) { |
13974
|
|
|
|
|
|
UV high; |
13975
|
|
|
|
|
|
int i; |
13976
|
|
|
|
|
|
|
13977
|
44009307
|
|
|
|
|
if (end == UV_MAX && start <= 256) { |
13978
|
41731557
|
|
|
|
|
ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; |
13979
|
|
|
|
|
|
} |
13980
|
|
|
|
|
|
|
13981
|
|
|
|
|
|
/* Quit if are above what we should change */ |
13982
|
2
|
|
|
|
|
if (start > 255) { |
13983
|
|
|
|
|
|
break; |
13984
|
|
|
|
|
|
} |
13985
|
|
|
|
|
|
|
13986
|
|
|
|
|
|
change_invlist = TRUE; |
13987
|
|
|
|
|
|
|
13988
|
|
|
|
|
|
/* Set all the bits in the range, up to the max that we are doing */ |
13989
|
41731559
|
|
|
|
|
high = (end < 255) ? end : 255; |
13990
|
41731563
|
|
|
|
|
for (i = start; i <= (int) high; i++) { |
13991
|
41731561
|
|
|
|
|
if (! ANYOF_BITMAP_TEST(ret, i)) { |
13992
|
64131094
|
|
|
|
|
ANYOF_BITMAP_SET(ret, i); |
13993
|
|
|
|
|
|
} |
13994
|
|
|
|
|
|
} |
13995
|
|
|
|
|
|
} |
13996
|
31165056
|
|
|
|
|
invlist_iterfinish(cp_list); |
13997
|
|
|
|
|
|
|
13998
|
|
|
|
|
|
/* Done with loop; remove any code points that are in the bitmap from |
13999
|
|
|
|
|
|
* */ |
14000
|
31165056
|
|
|
|
|
if (change_invlist) { |
14001
|
31165056
|
|
|
|
|
_invlist_subtract(cp_list, PL_Latin1, &cp_list); |
14002
|
|
|
|
|
|
} |
14003
|
|
|
|
|
|
|
14004
|
|
|
|
|
|
/* If have completely emptied it, remove it completely */ |
14005
|
15782902
|
|
|
|
|
if (_invlist_len(cp_list) == 0) { |
14006
|
15782902
|
|
|
|
|
SvREFCNT_dec_NN(cp_list); |
14007
|
15382156
|
|
|
|
|
cp_list = NULL; |
14008
|
|
|
|
|
|
} |
14009
|
|
|
|
|
|
} |
14010
|
|
|
|
|
|
|
14011
|
2
|
|
|
|
|
if (invert) { |
14012
|
15382154
|
|
|
|
|
ANYOF_FLAGS(ret) |= ANYOF_INVERT; |
14013
|
|
|
|
|
|
} |
14014
|
|
|
|
|
|
|
14015
|
|
|
|
|
|
/* Here, the bitmap has been populated with all the Latin1 code points that |
14016
|
|
|
|
|
|
* always match. Can now add to the overall list those that match only |
14017
|
|
|
|
|
|
* when the target string is UTF-8 (). */ |
14018
|
15382156
|
|
|
|
|
if (depends_list) { |
14019
|
15382154
|
|
|
|
|
if (cp_list) { |
14020
|
23385730
|
|
|
|
|
_invlist_union(cp_list, depends_list, &cp_list); |
14021
|
264068
|
|
|
|
|
SvREFCNT_dec_NN(depends_list); |
14022
|
|
|
|
|
|
} |
14023
|
|
|
|
|
|
else { |
14024
|
22860448
|
|
|
|
|
cp_list = depends_list; |
14025
|
|
|
|
|
|
} |
14026
|
|
|
|
|
|
} |
14027
|
|
|
|
|
|
|
14028
|
|
|
|
|
|
/* If there is a swash and more than one element, we can't use the swash in |
14029
|
|
|
|
|
|
* the optimization below. */ |
14030
|
22860450
|
|
|
|
|
if (swash && element_count > 1) { |
14031
|
22860448
|
|
|
|
|
SvREFCNT_dec_NN(swash); |
14032
|
|
|
|
|
|
swash = NULL; |
14033
|
|
|
|
|
|
} |
14034
|
|
|
|
|
|
|
14035
|
22860450
|
|
|
|
|
if (! cp_list |
14036
|
11448702
|
|
|
|
|
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) |
14037
|
|
|
|
|
|
{ |
14038
|
28493730
|
|
|
|
|
ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); |
14039
|
|
|
|
|
|
} |
14040
|
|
|
|
|
|
else { |
14041
|
|
|
|
|
|
/* av[0] stores the character class description in its textual form: |
14042
|
|
|
|
|
|
* used later (regexec.c:Perl_regclass_swash()) to initialize the |
14043
|
|
|
|
|
|
* appropriate swash, and is also useful for dumping the regnode. |
14044
|
|
|
|
|
|
* av[1] if NULL, is a placeholder to later contain the swash computed |
14045
|
|
|
|
|
|
* from av[0]. But if no further computation need be done, the |
14046
|
|
|
|
|
|
* swash is stored there now. |
14047
|
|
|
|
|
|
* av[2] stores the cp_list inversion list for use in addition or |
14048
|
|
|
|
|
|
* instead of av[0]; used only if av[1] is NULL |
14049
|
|
|
|
|
|
* av[3] is set if any component of the class is from a user-defined |
14050
|
|
|
|
|
|
* property; used only if av[1] is NULL */ |
14051
|
11411748
|
|
|
|
|
AV * const av = newAV(); |
14052
|
|
|
|
|
|
SV *rv; |
14053
|
|
|
|
|
|
|
14054
|
11411748
|
|
|
|
|
av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) |
14055
|
|
|
|
|
|
? SvREFCNT_inc(listsv) : &PL_sv_undef); |
14056
|
11411748
|
|
|
|
|
if (swash) { |
14057
|
11411748
|
|
|
|
|
av_store(av, 1, swash); |
14058
|
5826930
|
|
|
|
|
SvREFCNT_dec_NN(cp_list); |
14059
|
|
|
|
|
|
} |
14060
|
|
|
|
|
|
else { |
14061
|
29986
|
|
|
|
|
av_store(av, 1, NULL); |
14062
|
2754
|
|
|
|
|
if (cp_list) { |
14063
|
29986
|
|
|
|
|
av_store(av, 2, cp_list); |
14064
|
2754
|
|
|
|
|
av_store(av, 3, newSVuv(has_user_defined_property)); |
14065
|
|
|
|
|
|
} |
14066
|
|
|
|
|
|
} |
14067
|
|
|
|
|
|
|
14068
|
78024679
|
|
|
|
|
rv = newRV_noinc(MUTABLE_SV(av)); |
14069
|
66612931
|
|
|
|
|
n = add_data(pRExC_state, 1, "s"); |
14070
|
11411748
|
|
|
|
|
RExC_rxi->data->data[n] = (void*)rv; |
14071
|
11411748
|
|
|
|
|
ARG_SET(ret, n); |
14072
|
|
|
|
|
|
} |
14073
|
|
|
|
|
|
|
14074
|
11411750
|
|
|
|
|
*flagp |= HASWIDTH|SIMPLE; |
14075
|
99636294
|
|
|
|
|
return ret; |
14076
|
|
|
|
|
|
} |
14077
|
|
|
|
|
|
#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION |
14078
|
|
|
|
|
|
|
14079
|
|
|
|
|
|
|
14080
|
|
|
|
|
|
/* reg_skipcomment() |
14081
|
|
|
|
|
|
|
14082
|
|
|
|
|
|
Absorbs an /x style # comments from the input stream. |
14083
|
|
|
|
|
|
Returns true if there is more text remaining in the stream. |
14084
|
|
|
|
|
|
Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment |
14085
|
|
|
|
|
|
terminates the pattern without including a newline. |
14086
|
|
|
|
|
|
|
14087
|
|
|
|
|
|
Note its the callers responsibility to ensure that we are |
14088
|
|
|
|
|
|
actually in /x mode |
14089
|
|
|
|
|
|
|
14090
|
|
|
|
|
|
*/ |
14091
|
|
|
|
|
|
|
14092
|
|
|
|
|
|
STATIC bool |
14093
|
99636292
|
|
|
|
|
S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) |
14094
|
|
|
|
|
|
{ |
14095
|
|
|
|
|
|
bool ended = 0; |
14096
|
|
|
|
|
|
|
14097
|
99636292
|
|
|
|
|
PERL_ARGS_ASSERT_REG_SKIPCOMMENT; |
14098
|
|
|
|
|
|
|
14099
|
102950058
|
|
|
|
|
while (RExC_parse < RExC_end) |
14100
|
102950058
|
|
|
|
|
if (*RExC_parse++ == '\n') { |
14101
|
|
|
|
|
|
ended = 1; |
14102
|
|
|
|
|
|
break; |
14103
|
|
|
|
|
|
} |
14104
|
50675954
|
|
|
|
|
if (!ended) { |
14105
|
|
|
|
|
|
/* we ran off the end of the pattern without ending |
14106
|
|
|
|
|
|
the comment, so we have to add an \n when wrapping */ |
14107
|
241953
|
|
|
|
|
RExC_seen |= REG_SEEN_RUN_ON_COMMENT; |
14108
|
50434001
|
|
|
|
|
return 0; |
14109
|
|
|
|
|
|
} else |
14110
|
|
|
|
|
|
return 1; |
14111
|
|
|
|
|
|
} |
14112
|
|
|
|
|
|
|
14113
|
|
|
|
|
|
/* nextchar() |
14114
|
|
|
|
|
|
|
14115
|
|
|
|
|
|
Advances the parse position, and optionally absorbs |
14116
|
|
|
|
|
|
"whitespace" from the inputstream. |
14117
|
|
|
|
|
|
|
14118
|
|
|
|
|
|
Without /x "whitespace" means (?#...) style comments only, |
14119
|
|
|
|
|
|
with /x this means (?#...) and # comments and whitespace proper. |
14120
|
|
|
|
|
|
|
14121
|
|
|
|
|
|
Returns the RExC_parse point from BEFORE the scan occurs. |
14122
|
|
|
|
|
|
|
14123
|
|
|
|
|
|
This is the /x friendly way of saying RExC_parse++. |
14124
|
|
|
|
|
|
*/ |
14125
|
|
|
|
|
|
|
14126
|
|
|
|
|
|
STATIC char* |
14127
|
1628
|
|
|
|
|
S_nextchar(pTHX_ RExC_state_t *pRExC_state) |
14128
|
|
|
|
|
|
{ |
14129
|
1628
|
|
|
|
|
char* const retval = RExC_parse++; |
14130
|
|
|
|
|
|
|
14131
|
1628
|
|
|
|
|
PERL_ARGS_ASSERT_NEXTCHAR; |
14132
|
|
|
|
|
|
|
14133
|
|
|
|
|
|
for (;;) { |
14134
|
1628
|
|
|
|
|
if (RExC_end - RExC_parse >= 3 |
14135
|
2534004
|
|
|
|
|
&& *RExC_parse == '(' |
14136
|
2533104
|
|
|
|
|
&& RExC_parse[1] == '?' |
14137
|
300
|
|
|
|
|
&& RExC_parse[2] == '#') |
14138
|
|
|
|
|
|
{ |
14139
|
0
|
|
|
|
|
while (*RExC_parse != ')') { |
14140
|
0
|
|
|
|
|
if (RExC_parse == RExC_end) |
14141
|
20183530
|
|
|
|
|
FAIL("Sequence (?#... not terminated"); |
14142
|
20183530
|
|
|
|
|
RExC_parse++; |
14143
|
|
|
|
|
|
} |
14144
|
15671000
|
|
|
|
|
RExC_parse++; |
14145
|
6763936
|
|
|
|
|
continue; |
14146
|
|
|
|
|
|
} |
14147
|
4514158
|
|
|
|
|
if (RExC_flags & RXf_PMf_EXTENDED) { |
14148
|
4512530
|
|
|
|
|
if (isSPACE(*RExC_parse)) { |
14149
|
20183530
|
|
|
|
|
RExC_parse++; |
14150
|
20183210
|
|
|
|
|
continue; |
14151
|
|
|
|
|
|
} |
14152
|
20183210
|
|
|
|
|
else if (*RExC_parse == '#') { |
14153
|
20183210
|
|
|
|
|
if ( reg_skipcomment( pRExC_state ) ) |
14154
|
20183210
|
|
|
|
|
continue; |
14155
|
|
|
|
|
|
} |
14156
|
|
|
|
|
|
} |
14157
|
20184838
|
|
|
|
|
return retval; |
14158
|
|
|
|
|
|
} |
14159
|
|
|
|
|
|
} |
14160
|
|
|
|
|
|
|
14161
|
|
|
|
|
|
/* |
14162
|
|
|
|
|
|
- reg_node - emit a node |
14163
|
|
|
|
|
|
*/ |
14164
|
|
|
|
|
|
STATIC regnode * /* Location. */ |
14165
|
34696613
|
|
|
|
|
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) |
14166
|
|
|
|
|
|
{ |
14167
|
|
|
|
|
|
dVAR; |
14168
|
|
|
|
|
|
regnode *ptr; |
14169
|
20184350
|
|
|
|
|
regnode * const ret = RExC_emit; |
14170
|
20184350
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
14171
|
|
|
|
|
|
|
14172
|
20184350
|
|
|
|
|
PERL_ARGS_ASSERT_REG_NODE; |
14173
|
|
|
|
|
|
|
14174
|
20184350
|
|
|
|
|
if (SIZE_ONLY) { |
14175
|
|
|
|
|
|
SIZE_ALIGN(RExC_size); |
14176
|
20183952
|
|
|
|
|
RExC_size += 1; |
14177
|
15723825
|
|
|
|
|
return(ret); |
14178
|
|
|
|
|
|
} |
14179
|
15723801
|
|
|
|
|
if (RExC_emit >= RExC_emit_bound) |
14180
|
15723403
|
|
|
|
|
Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", |
14181
|
|
|
|
|
|
op, RExC_emit, RExC_emit_bound); |
14182
|
|
|
|
|
|
|
14183
|
15723735
|
|
|
|
|
NODE_ALIGN_FILL(ret); |
14184
|
|
|
|
|
|
ptr = ret; |
14185
|
464
|
|
|
|
|
FILL_ADVANCE_NODE(ptr, op); |
14186
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
14187
|
464
|
|
|
|
|
if (RExC_offsets) { /* MJD */ |
14188
|
408
|
|
|
|
|
MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", |
14189
|
|
|
|
|
|
"reg_node", __LINE__, |
14190
|
|
|
|
|
|
PL_reg_name[op], |
14191
|
|
|
|
|
|
(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] |
14192
|
|
|
|
|
|
? "Overwriting end of array!\n" : "OK", |
14193
|
|
|
|
|
|
(UV)(RExC_emit - RExC_emit_start), |
14194
|
|
|
|
|
|
(UV)(RExC_parse - RExC_start), |
14195
|
|
|
|
|
|
(UV)RExC_offsets[0])); |
14196
|
408
|
|
|
|
|
Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); |
14197
|
|
|
|
|
|
} |
14198
|
|
|
|
|
|
#endif |
14199
|
408
|
|
|
|
|
RExC_emit = ptr; |
14200
|
408
|
|
|
|
|
return(ret); |
14201
|
|
|
|
|
|
} |
14202
|
|
|
|
|
|
|
14203
|
|
|
|
|
|
/* |
14204
|
|
|
|
|
|
- reganode - emit a node with an argument |
14205
|
|
|
|
|
|
*/ |
14206
|
|
|
|
|
|
STATIC regnode * /* Location. */ |
14207
|
230
|
|
|
|
|
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) |
14208
|
|
|
|
|
|
{ |
14209
|
|
|
|
|
|
dVAR; |
14210
|
|
|
|
|
|
regnode *ptr; |
14211
|
230
|
|
|
|
|
regnode * const ret = RExC_emit; |
14212
|
15723623
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
14213
|
|
|
|
|
|
|
14214
|
15723623
|
|
|
|
|
PERL_ARGS_ASSERT_REGANODE; |
14215
|
|
|
|
|
|
|
14216
|
15723623
|
|
|
|
|
if (SIZE_ONLY) { |
14217
|
|
|
|
|
|
SIZE_ALIGN(RExC_size); |
14218
|
15723513
|
|
|
|
|
RExC_size += 2; |
14219
|
|
|
|
|
|
/* |
14220
|
|
|
|
|
|
We can't do this: |
14221
|
|
|
|
|
|
|
14222
|
|
|
|
|
|
assert(2==regarglen[op]+1); |
14223
|
|
|
|
|
|
|
14224
|
|
|
|
|
|
Anything larger than this has to allocate the extra amount. |
14225
|
|
|
|
|
|
If we changed this to be: |
14226
|
|
|
|
|
|
|
14227
|
|
|
|
|
|
RExC_size += (1 + regarglen[op]); |
14228
|
|
|
|
|
|
|
14229
|
|
|
|
|
|
then it wouldn't matter. Its not clear what side effect |
14230
|
|
|
|
|
|
might come from that so its not done so far. |
14231
|
|
|
|
|
|
-- dmq |
14232
|
|
|
|
|
|
*/ |
14233
|
15723513
|
|
|
|
|
return(ret); |
14234
|
|
|
|
|
|
} |
14235
|
15723513
|
|
|
|
|
if (RExC_emit >= RExC_emit_bound) |
14236
|
15723403
|
|
|
|
|
Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", |
14237
|
|
|
|
|
|
op, RExC_emit, RExC_emit_bound); |
14238
|
|
|
|
|
|
|
14239
|
15723513
|
|
|
|
|
NODE_ALIGN_FILL(ret); |
14240
|
|
|
|
|
|
ptr = ret; |
14241
|
15723513
|
|
|
|
|
FILL_ADVANCE_NODE_ARG(ptr, op, arg); |
14242
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
14243
|
15723513
|
|
|
|
|
if (RExC_offsets) { /* MJD */ |
14244
|
15723513
|
|
|
|
|
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", |
14245
|
|
|
|
|
|
"reganode", |
14246
|
|
|
|
|
|
__LINE__, |
14247
|
|
|
|
|
|
PL_reg_name[op], |
14248
|
|
|
|
|
|
(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? |
14249
|
|
|
|
|
|
"Overwriting end of array!\n" : "OK", |
14250
|
|
|
|
|
|
(UV)(RExC_emit - RExC_emit_start), |
14251
|
|
|
|
|
|
(UV)(RExC_parse - RExC_start), |
14252
|
|
|
|
|
|
(UV)RExC_offsets[0])); |
14253
|
15723513
|
|
|
|
|
Set_Cur_Node_Offset; |
14254
|
|
|
|
|
|
} |
14255
|
|
|
|
|
|
#endif |
14256
|
15723513
|
|
|
|
|
RExC_emit = ptr; |
14257
|
15723513
|
|
|
|
|
return(ret); |
14258
|
|
|
|
|
|
} |
14259
|
|
|
|
|
|
|
14260
|
|
|
|
|
|
/* |
14261
|
|
|
|
|
|
- reguni - emit (if appropriate) a Unicode character |
14262
|
|
|
|
|
|
*/ |
14263
|
|
|
|
|
|
STATIC STRLEN |
14264
|
15723459
|
|
|
|
|
S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) |
14265
|
|
|
|
|
|
{ |
14266
|
|
|
|
|
|
dVAR; |
14267
|
|
|
|
|
|
|
14268
|
15723459
|
|
|
|
|
PERL_ARGS_ASSERT_REGUNI; |
14269
|
|
|
|
|
|
|
14270
|
23386600
|
|
|
|
|
return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); |
14271
|
|
|
|
|
|
} |
14272
|
|
|
|
|
|
|
14273
|
|
|
|
|
|
/* |
14274
|
|
|
|
|
|
- reginsert - insert an operator in front of already-emitted operand |
14275
|
|
|
|
|
|
* |
14276
|
|
|
|
|
|
* Means relocating the operand. |
14277
|
|
|
|
|
|
*/ |
14278
|
|
|
|
|
|
STATIC void |
14279
|
15723419
|
|
|
|
|
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) |
14280
|
|
|
|
|
|
{ |
14281
|
|
|
|
|
|
dVAR; |
14282
|
|
|
|
|
|
regnode *src; |
14283
|
|
|
|
|
|
regnode *dst; |
14284
|
|
|
|
|
|
regnode *place; |
14285
|
15723419
|
|
|
|
|
const int offset = regarglen[(U8)op]; |
14286
|
4512440
|
|
|
|
|
const int size = NODE_STEP_REGNODE + offset; |
14287
|
4512440
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
14288
|
|
|
|
|
|
|
14289
|
4512440
|
|
|
|
|
PERL_ARGS_ASSERT_REGINSERT; |
14290
|
|
|
|
|
|
PERL_UNUSED_ARG(depth); |
14291
|
|
|
|
|
|
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ |
14292
|
5266
|
|
|
|
|
DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); |
14293
|
3606
|
|
|
|
|
if (SIZE_ONLY) { |
14294
|
3328
|
|
|
|
|
RExC_size += size; |
14295
|
4512448
|
|
|
|
|
return; |
14296
|
|
|
|
|
|
} |
14297
|
|
|
|
|
|
|
14298
|
1293102
|
|
|
|
|
src = RExC_emit; |
14299
|
4207675
|
|
|
|
|
RExC_emit += size; |
14300
|
2268574
|
|
|
|
|
dst = RExC_emit; |
14301
|
1586628
|
|
|
|
|
if (RExC_open_parens) { |
14302
|
|
|
|
|
|
int paren; |
14303
|
|
|
|
|
|
/*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ |
14304
|
1928673
|
|
|
|
|
for ( paren=0 ; paren < RExC_npar ; paren++ ) { |
14305
|
210870
|
|
|
|
|
if ( RExC_open_parens[paren] >= opnd ) { |
14306
|
|
|
|
|
|
/*DEBUG_PARSE_FMT("open"," - %d",size);*/ |
14307
|
210870
|
|
|
|
|
RExC_open_parens[paren] += size; |
14308
|
|
|
|
|
|
} else { |
14309
|
|
|
|
|
|
/*DEBUG_PARSE_FMT("open"," - %s","ok");*/ |
14310
|
|
|
|
|
|
} |
14311
|
12076
|
|
|
|
|
if ( RExC_close_parens[paren] >= opnd ) { |
14312
|
|
|
|
|
|
/*DEBUG_PARSE_FMT("close"," - %d",size);*/ |
14313
|
12076
|
|
|
|
|
RExC_close_parens[paren] += size; |
14314
|
|
|
|
|
|
} else { |
14315
|
|
|
|
|
|
/*DEBUG_PARSE_FMT("close"," - %s","ok");*/ |
14316
|
|
|
|
|
|
} |
14317
|
|
|
|
|
|
} |
14318
|
|
|
|
|
|
} |
14319
|
|
|
|
|
|
|
14320
|
12100
|
|
|
|
|
while (src > opnd) { |
14321
|
12092
|
|
|
|
|
StructCopy(--src, --dst, regnode); |
14322
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
14323
|
12092
|
|
|
|
|
if (RExC_offsets) { /* MJD 20010112 */ |
14324
|
12092
|
|
|
|
|
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", |
14325
|
|
|
|
|
|
"reg_insert", |
14326
|
|
|
|
|
|
__LINE__, |
14327
|
|
|
|
|
|
PL_reg_name[op], |
14328
|
|
|
|
|
|
(UV)(dst - RExC_emit_start) > RExC_offsets[0] |
14329
|
|
|
|
|
|
? "Overwriting end of array!\n" : "OK", |
14330
|
|
|
|
|
|
(UV)(src - RExC_emit_start), |
14331
|
|
|
|
|
|
(UV)(dst - RExC_emit_start), |
14332
|
|
|
|
|
|
(UV)RExC_offsets[0])); |
14333
|
12092
|
|
|
|
|
Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); |
14334
|
355330
|
|
|
|
|
Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); |
14335
|
|
|
|
|
|
} |
14336
|
|
|
|
|
|
#endif |
14337
|
|
|
|
|
|
} |
14338
|
|
|
|
|
|
|
14339
|
|
|
|
|
|
|
14340
|
|
|
|
|
|
place = opnd; /* Op node, where operand used to be. */ |
14341
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
14342
|
355322
|
|
|
|
|
if (RExC_offsets) { /* MJD */ |
14343
|
355322
|
|
|
|
|
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", |
14344
|
|
|
|
|
|
"reginsert", |
14345
|
|
|
|
|
|
__LINE__, |
14346
|
|
|
|
|
|
PL_reg_name[op], |
14347
|
|
|
|
|
|
(UV)(place - RExC_emit_start) > RExC_offsets[0] |
14348
|
|
|
|
|
|
? "Overwriting end of array!\n" : "OK", |
14349
|
|
|
|
|
|
(UV)(place - RExC_emit_start), |
14350
|
|
|
|
|
|
(UV)(RExC_parse - RExC_start), |
14351
|
|
|
|
|
|
(UV)RExC_offsets[0])); |
14352
|
355322
|
|
|
|
|
Set_Node_Offset(place, RExC_parse); |
14353
|
355322
|
|
|
|
|
Set_Node_Length(place, 1); |
14354
|
|
|
|
|
|
} |
14355
|
|
|
|
|
|
#endif |
14356
|
355322
|
|
|
|
|
src = NEXTOPER(place); |
14357
|
355322
|
|
|
|
|
FILL_ADVANCE_NODE(place, op); |
14358
|
248418
|
|
|
|
|
Zero(src, offset, regnode); |
14359
|
|
|
|
|
|
} |
14360
|
|
|
|
|
|
|
14361
|
|
|
|
|
|
/* |
14362
|
|
|
|
|
|
- regtail - set the next-pointer at the end of a node chain of p to val. |
14363
|
|
|
|
|
|
- SEE ALSO: regtail_study |
14364
|
|
|
|
|
|
*/ |
14365
|
|
|
|
|
|
/* TODO: All three parms should be const */ |
14366
|
|
|
|
|
|
STATIC void |
14367
|
355934
|
|
|
|
|
S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) |
14368
|
|
|
|
|
|
{ |
14369
|
|
|
|
|
|
dVAR; |
14370
|
|
|
|
|
|
regnode *scan; |
14371
|
127682
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
14372
|
|
|
|
|
|
|
14373
|
355934
|
|
|
|
|
PERL_ARGS_ASSERT_REGTAIL; |
14374
|
|
|
|
|
|
#ifndef DEBUGGING |
14375
|
|
|
|
|
|
PERL_UNUSED_ARG(depth); |
14376
|
|
|
|
|
|
#endif |
14377
|
|
|
|
|
|
|
14378
|
355934
|
|
|
|
|
if (SIZE_ONLY) |
14379
|
620
|
|
|
|
|
return; |
14380
|
|
|
|
|
|
|
14381
|
|
|
|
|
|
/* Find last node. */ |
14382
|
|
|
|
|
|
scan = p; |
14383
|
|
|
|
|
|
for (;;) { |
14384
|
1293620
|
|
|
|
|
regnode * const temp = regnext(scan); |
14385
|
1293620
|
|
|
|
|
DEBUG_PARSE_r({ |
14386
|
|
|
|
|
|
SV * const mysv=sv_newmortal(); |
14387
|
|
|
|
|
|
DEBUG_PARSE_MSG((scan==p ? "tail" : "")); |
14388
|
|
|
|
|
|
regprop(RExC_rx, mysv, scan); |
14389
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", |
14390
|
|
|
|
|
|
SvPV_nolen_const(mysv), REG_NODE_NUM(scan), |
14391
|
|
|
|
|
|
(temp == NULL ? "->" : ""), |
14392
|
|
|
|
|
|
(temp == NULL ? PL_reg_name[OP(val)] : "") |
14393
|
|
|
|
|
|
); |
14394
|
|
|
|
|
|
}); |
14395
|
4512950
|
|
|
|
|
if (temp == NULL) |
14396
|
|
|
|
|
|
break; |
14397
|
|
|
|
|
|
scan = temp; |
14398
|
|
|
|
|
|
} |
14399
|
|
|
|
|
|
|
14400
|
4512734
|
|
|
|
|
if (reg_off_by_arg[OP(scan)]) { |
14401
|
264077073
|
|
|
|
|
ARG_SET(scan, val - scan); |
14402
|
|
|
|
|
|
} |
14403
|
|
|
|
|
|
else { |
14404
|
264077383
|
|
|
|
|
NEXT_OFF(scan) = val - scan; |
14405
|
|
|
|
|
|
} |
14406
|
|
|
|
|
|
} |
14407
|
|
|
|
|
|
|
14408
|
|
|
|
|
|
#ifdef DEBUGGING |
14409
|
|
|
|
|
|
/* |
14410
|
|
|
|
|
|
- regtail_study - set the next-pointer at the end of a node chain of p to val. |
14411
|
|
|
|
|
|
- Look for optimizable sequences at the same time. |
14412
|
|
|
|
|
|
- currently only looks for EXACT chains. |
14413
|
|
|
|
|
|
|
14414
|
|
|
|
|
|
This is experimental code. The idea is to use this routine to perform |
14415
|
|
|
|
|
|
in place optimizations on branches and groups as they are constructed, |
14416
|
|
|
|
|
|
with the long term intention of removing optimization from study_chunk so |
14417
|
|
|
|
|
|
that it is purely analytical. |
14418
|
|
|
|
|
|
|
14419
|
|
|
|
|
|
Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used |
14420
|
|
|
|
|
|
to control which is which. |
14421
|
|
|
|
|
|
|
14422
|
|
|
|
|
|
*/ |
14423
|
|
|
|
|
|
/* TODO: All four parms should be const */ |
14424
|
|
|
|
|
|
|
14425
|
|
|
|
|
|
STATIC U8 |
14426
|
264077129
|
|
|
|
|
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) |
14427
|
|
|
|
|
|
{ |
14428
|
|
|
|
|
|
dVAR; |
14429
|
|
|
|
|
|
regnode *scan; |
14430
|
|
|
|
|
|
U8 exact = PSEUDO; |
14431
|
|
|
|
|
|
#ifdef EXPERIMENTAL_INPLACESCAN |
14432
|
|
|
|
|
|
I32 min = 0; |
14433
|
|
|
|
|
|
#endif |
14434
|
56
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
14435
|
|
|
|
|
|
|
14436
|
264077129
|
|
|
|
|
PERL_ARGS_ASSERT_REGTAIL_STUDY; |
14437
|
|
|
|
|
|
|
14438
|
|
|
|
|
|
|
14439
|
264077129
|
|
|
|
|
if (SIZE_ONLY) |
14440
|
|
|
|
|
|
return exact; |
14441
|
|
|
|
|
|
|
14442
|
|
|
|
|
|
/* Find last node. */ |
14443
|
|
|
|
|
|
|
14444
|
|
|
|
|
|
scan = p; |
14445
|
|
|
|
|
|
for (;;) { |
14446
|
233538108
|
|
|
|
|
regnode * const temp = regnext(scan); |
14447
|
|
|
|
|
|
#ifdef EXPERIMENTAL_INPLACESCAN |
14448
|
|
|
|
|
|
if (PL_regkind[OP(scan)] == EXACT) { |
14449
|
|
|
|
|
|
bool has_exactf_sharp_s; /* Unexamined in this routine */ |
14450
|
|
|
|
|
|
if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) |
14451
|
|
|
|
|
|
return EXACT; |
14452
|
|
|
|
|
|
} |
14453
|
|
|
|
|
|
#endif |
14454
|
298
|
|
|
|
|
if ( exact ) { |
14455
|
298
|
|
|
|
|
switch (OP(scan)) { |
14456
|
|
|
|
|
|
case EXACT: |
14457
|
|
|
|
|
|
case EXACTF: |
14458
|
|
|
|
|
|
case EXACTFA_NO_TRIE: |
14459
|
|
|
|
|
|
case EXACTFA: |
14460
|
|
|
|
|
|
case EXACTFU: |
14461
|
|
|
|
|
|
case EXACTFU_SS: |
14462
|
|
|
|
|
|
case EXACTFL: |
14463
|
296
|
|
|
|
|
if( exact == PSEUDO ) |
14464
|
294
|
|
|
|
|
exact= OP(scan); |
14465
|
2
|
|
|
|
|
else if ( exact != OP(scan) ) |
14466
|
|
|
|
|
|
exact= 0; |
14467
|
|
|
|
|
|
case NOTHING: |
14468
|
|
|
|
|
|
break; |
14469
|
|
|
|
|
|
default: |
14470
|
|
|
|
|
|
exact= 0; |
14471
|
|
|
|
|
|
} |
14472
|
|
|
|
|
|
} |
14473
|
298
|
|
|
|
|
DEBUG_PARSE_r({ |
14474
|
|
|
|
|
|
SV * const mysv=sv_newmortal(); |
14475
|
|
|
|
|
|
DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); |
14476
|
|
|
|
|
|
regprop(RExC_rx, mysv, scan); |
14477
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", |
14478
|
|
|
|
|
|
SvPV_nolen_const(mysv), |
14479
|
|
|
|
|
|
REG_NODE_NUM(scan), |
14480
|
|
|
|
|
|
PL_reg_name[exact]); |
14481
|
|
|
|
|
|
}); |
14482
|
58
|
|
|
|
|
if (temp == NULL) |
14483
|
|
|
|
|
|
break; |
14484
|
|
|
|
|
|
scan = temp; |
14485
|
|
|
|
|
|
} |
14486
|
296
|
|
|
|
|
DEBUG_PARSE_r({ |
14487
|
|
|
|
|
|
SV * const mysv_val=sv_newmortal(); |
14488
|
|
|
|
|
|
DEBUG_PARSE_MSG(""); |
14489
|
|
|
|
|
|
regprop(RExC_rx, mysv_val, val); |
14490
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", |
14491
|
|
|
|
|
|
SvPV_nolen_const(mysv_val), |
14492
|
|
|
|
|
|
(IV)REG_NODE_NUM(val), |
14493
|
|
|
|
|
|
(IV)(val - scan) |
14494
|
|
|
|
|
|
); |
14495
|
|
|
|
|
|
}); |
14496
|
296
|
|
|
|
|
if (reg_off_by_arg[OP(scan)]) { |
14497
|
240
|
|
|
|
|
ARG_SET(scan, val - scan); |
14498
|
|
|
|
|
|
} |
14499
|
|
|
|
|
|
else { |
14500
|
296
|
|
|
|
|
NEXT_OFF(scan) = val - scan; |
14501
|
|
|
|
|
|
} |
14502
|
|
|
|
|
|
|
14503
|
|
|
|
|
|
return exact; |
14504
|
|
|
|
|
|
} |
14505
|
|
|
|
|
|
#endif |
14506
|
|
|
|
|
|
|
14507
|
|
|
|
|
|
/* |
14508
|
|
|
|
|
|
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form |
14509
|
|
|
|
|
|
*/ |
14510
|
|
|
|
|
|
#ifdef DEBUGGING |
14511
|
|
|
|
|
|
|
14512
|
|
|
|
|
|
static void |
14513
|
240
|
|
|
|
|
S_regdump_intflags(pTHX_ const char *lead, const U32 flags) |
14514
|
|
|
|
|
|
{ |
14515
|
|
|
|
|
|
int bit; |
14516
|
|
|
|
|
|
int set=0; |
14517
|
|
|
|
|
|
|
14518
|
240
|
|
|
|
|
for (bit=0; bit<32; bit++) { |
14519
|
240
|
|
|
|
|
if (flags & (1<
|
14520
|
240
|
|
|
|
|
if (!set++ && lead) |
14521
|
240
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s",lead); |
14522
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]); |
14523
|
|
|
|
|
|
} |
14524
|
|
|
|
|
|
} |
14525
|
240
|
|
|
|
|
if (lead) { |
14526
|
240
|
|
|
|
|
if (set) |
14527
|
240
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "\n"); |
14528
|
|
|
|
|
|
else |
14529
|
529658
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead); |
14530
|
|
|
|
|
|
} |
14531
|
529658
|
|
|
|
|
} |
14532
|
|
|
|
|
|
|
14533
|
|
|
|
|
|
static void |
14534
|
226366
|
|
|
|
|
S_regdump_extflags(pTHX_ const char *lead, const U32 flags) |
14535
|
|
|
|
|
|
{ |
14536
|
|
|
|
|
|
int bit; |
14537
|
|
|
|
|
|
int set=0; |
14538
|
|
|
|
|
|
regex_charset cs; |
14539
|
|
|
|
|
|
|
14540
|
226366
|
|
|
|
|
for (bit=0; bit<32; bit++) { |
14541
|
447208
|
|
|
|
|
if (flags & (1<
|
14542
|
220842
|
|
|
|
|
if ((1<
|
14543
|
147228
|
|
|
|
|
continue; |
14544
|
|
|
|
|
|
} |
14545
|
147228
|
|
|
|
|
if (!set++ && lead) |
14546
|
147228
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s",lead); |
14547
|
146950
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]); |
14548
|
|
|
|
|
|
} |
14549
|
|
|
|
|
|
} |
14550
|
146950
|
|
|
|
|
if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { |
14551
|
146950
|
|
|
|
|
if (!set++ && lead) { |
14552
|
123588305
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s",lead); |
14553
|
|
|
|
|
|
} |
14554
|
0
|
|
|
|
|
switch (cs) { |
14555
|
|
|
|
|
|
case REGEX_UNICODE_CHARSET: |
14556
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "UNICODE"); |
14557
|
0
|
|
|
|
|
break; |
14558
|
|
|
|
|
|
case REGEX_LOCALE_CHARSET: |
14559
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "LOCALE"); |
14560
|
0
|
|
|
|
|
break; |
14561
|
|
|
|
|
|
case REGEX_ASCII_RESTRICTED_CHARSET: |
14562
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED"); |
14563
|
0
|
|
|
|
|
break; |
14564
|
|
|
|
|
|
case REGEX_ASCII_MORE_RESTRICTED_CHARSET: |
14565
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED"); |
14566
|
0
|
|
|
|
|
break; |
14567
|
|
|
|
|
|
default: |
14568
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET"); |
14569
|
0
|
|
|
|
|
break; |
14570
|
|
|
|
|
|
} |
14571
|
|
|
|
|
|
} |
14572
|
0
|
|
|
|
|
if (lead) { |
14573
|
0
|
|
|
|
|
if (set) |
14574
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "\n"); |
14575
|
|
|
|
|
|
else |
14576
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead); |
14577
|
|
|
|
|
|
} |
14578
|
0
|
|
|
|
|
} |
14579
|
|
|
|
|
|
#endif |
14580
|
|
|
|
|
|
|
14581
|
|
|
|
|
|
void |
14582
|
132
|
|
|
|
|
Perl_regdump(pTHX_ const regexp *r) |
14583
|
|
|
|
|
|
{ |
14584
|
|
|
|
|
|
#ifdef DEBUGGING |
14585
|
|
|
|
|
|
dVAR; |
14586
|
132
|
|
|
|
|
SV * const sv = sv_newmortal(); |
14587
|
132
|
|
|
|
|
SV *dsv= sv_newmortal(); |
14588
|
132
|
|
|
|
|
RXi_GET_DECL(r,ri); |
14589
|
132
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
14590
|
|
|
|
|
|
|
14591
|
132
|
|
|
|
|
PERL_ARGS_ASSERT_REGDUMP; |
14592
|
|
|
|
|
|
|
14593
|
132
|
|
|
|
|
(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); |
14594
|
|
|
|
|
|
|
14595
|
|
|
|
|
|
/* Header fields of interest. */ |
14596
|
132
|
|
|
|
|
if (r->anchored_substr) { |
14597
|
96
|
|
|
|
|
RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), |
14598
|
|
|
|
|
|
RE_SV_DUMPLEN(r->anchored_substr), 30); |
14599
|
192
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
14600
|
|
|
|
|
|
"anchored %s%s at %"IVdf" ", |
14601
|
96
|
|
|
|
|
s, RE_SV_TAIL(r->anchored_substr), |
14602
|
96
|
|
|
|
|
(IV)r->anchored_offset); |
14603
|
36
|
|
|
|
|
} else if (r->anchored_utf8) { |
14604
|
20
|
|
|
|
|
RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), |
14605
|
|
|
|
|
|
RE_SV_DUMPLEN(r->anchored_utf8), 30); |
14606
|
40
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
14607
|
|
|
|
|
|
"anchored utf8 %s%s at %"IVdf" ", |
14608
|
20
|
|
|
|
|
s, RE_SV_TAIL(r->anchored_utf8), |
14609
|
20
|
|
|
|
|
(IV)r->anchored_offset); |
14610
|
|
|
|
|
|
} |
14611
|
132
|
|
|
|
|
if (r->float_substr) { |
14612
|
4
|
|
|
|
|
RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), |
14613
|
|
|
|
|
|
RE_SV_DUMPLEN(r->float_substr), 30); |
14614
|
12
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
14615
|
|
|
|
|
|
"floating %s%s at %"IVdf"..%"UVuf" ", |
14616
|
4
|
|
|
|
|
s, RE_SV_TAIL(r->float_substr), |
14617
|
8
|
|
|
|
|
(IV)r->float_min_offset, (UV)r->float_max_offset); |
14618
|
128
|
|
|
|
|
} else if (r->float_utf8) { |
14619
|
0
|
|
|
|
|
RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), |
14620
|
|
|
|
|
|
RE_SV_DUMPLEN(r->float_utf8), 30); |
14621
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
14622
|
|
|
|
|
|
"floating utf8 %s%s at %"IVdf"..%"UVuf" ", |
14623
|
0
|
|
|
|
|
s, RE_SV_TAIL(r->float_utf8), |
14624
|
0
|
|
|
|
|
(IV)r->float_min_offset, (UV)r->float_max_offset); |
14625
|
|
|
|
|
|
} |
14626
|
132
|
|
|
|
|
if (r->check_substr || r->check_utf8) |
14627
|
236
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
14628
|
|
|
|
|
|
(const char *) |
14629
|
118
|
|
|
|
|
(r->check_substr == r->float_substr |
14630
|
24
|
|
|
|
|
&& r->check_utf8 == r->float_utf8 |
14631
|
|
|
|
|
|
? "(checking floating" : "(checking anchored")); |
14632
|
132
|
|
|
|
|
if (r->extflags & RXf_NOSCAN) |
14633
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, " noscan"); |
14634
|
132
|
|
|
|
|
if (r->extflags & RXf_CHECK_ALL) |
14635
|
48
|
|
|
|
|
PerlIO_printf(Perl_debug_log, " isall"); |
14636
|
132
|
|
|
|
|
if (r->check_substr || r->check_utf8) |
14637
|
118
|
|
|
|
|
PerlIO_printf(Perl_debug_log, ") "); |
14638
|
|
|
|
|
|
|
14639
|
132
|
|
|
|
|
if (ri->regstclass) { |
14640
|
6
|
|
|
|
|
regprop(r, sv, ri->regstclass); |
14641
|
6
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); |
14642
|
|
|
|
|
|
} |
14643
|
132
|
|
|
|
|
if (r->extflags & RXf_ANCH) { |
14644
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "anchored"); |
14645
|
0
|
|
|
|
|
if (r->extflags & RXf_ANCH_BOL) |
14646
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "(BOL)"); |
14647
|
0
|
|
|
|
|
if (r->extflags & RXf_ANCH_MBOL) |
14648
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "(MBOL)"); |
14649
|
0
|
|
|
|
|
if (r->extflags & RXf_ANCH_SBOL) |
14650
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "(SBOL)"); |
14651
|
0
|
|
|
|
|
if (r->extflags & RXf_ANCH_GPOS) |
14652
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "(GPOS)"); |
14653
|
0
|
|
|
|
|
PerlIO_putc(Perl_debug_log, ' '); |
14654
|
|
|
|
|
|
} |
14655
|
132
|
|
|
|
|
if (r->extflags & RXf_GPOS_SEEN) |
14656
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); |
14657
|
132
|
|
|
|
|
if (r->intflags & PREGf_SKIP) |
14658
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "plus "); |
14659
|
132
|
|
|
|
|
if (r->intflags & PREGf_IMPLICIT) |
14660
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "implicit "); |
14661
|
132
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); |
14662
|
132
|
|
|
|
|
if (r->extflags & RXf_EVAL_SEEN) |
14663
|
68
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "with eval "); |
14664
|
132
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "\n"); |
14665
|
132
|
|
|
|
|
DEBUG_FLAGS_r({ |
14666
|
|
|
|
|
|
regdump_extflags("r->extflags: ",r->extflags); |
14667
|
|
|
|
|
|
regdump_intflags("r->intflags: ",r->intflags); |
14668
|
|
|
|
|
|
}); |
14669
|
|
|
|
|
|
#else |
14670
|
|
|
|
|
|
PERL_ARGS_ASSERT_REGDUMP; |
14671
|
|
|
|
|
|
PERL_UNUSED_CONTEXT; |
14672
|
|
|
|
|
|
PERL_UNUSED_ARG(r); |
14673
|
|
|
|
|
|
#endif /* DEBUGGING */ |
14674
|
132
|
|
|
|
|
} |
14675
|
|
|
|
|
|
|
14676
|
|
|
|
|
|
/* |
14677
|
|
|
|
|
|
- regprop - printable representation of opcode |
14678
|
|
|
|
|
|
*/ |
14679
|
|
|
|
|
|
#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ |
14680
|
|
|
|
|
|
STMT_START { \ |
14681
|
|
|
|
|
|
if (do_sep) { \ |
14682
|
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ |
14683
|
|
|
|
|
|
if (flags & ANYOF_INVERT) \ |
14684
|
|
|
|
|
|
/*make sure the invert info is in each */ \ |
14685
|
|
|
|
|
|
sv_catpvs(sv, "^"); \ |
14686
|
|
|
|
|
|
do_sep = 0; \ |
14687
|
|
|
|
|
|
} \ |
14688
|
|
|
|
|
|
} STMT_END |
14689
|
|
|
|
|
|
|
14690
|
|
|
|
|
|
void |
14691
|
684
|
|
|
|
|
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) |
14692
|
|
|
|
|
|
{ |
14693
|
|
|
|
|
|
#ifdef DEBUGGING |
14694
|
|
|
|
|
|
dVAR; |
14695
|
|
|
|
|
|
int k; |
14696
|
|
|
|
|
|
|
14697
|
|
|
|
|
|
/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ |
14698
|
|
|
|
|
|
static const char * const anyofs[] = { |
14699
|
|
|
|
|
|
#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ |
14700
|
|
|
|
|
|
|| _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ |
14701
|
|
|
|
|
|
|| _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \ |
14702
|
|
|
|
|
|
|| _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \ |
14703
|
|
|
|
|
|
|| _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \ |
14704
|
|
|
|
|
|
|| _CC_VERTSPACE != 16 |
14705
|
|
|
|
|
|
#error Need to adjust order of anyofs[] |
14706
|
|
|
|
|
|
#endif |
14707
|
|
|
|
|
|
"[\\w]", |
14708
|
|
|
|
|
|
"[\\W]", |
14709
|
|
|
|
|
|
"[\\d]", |
14710
|
|
|
|
|
|
"[\\D]", |
14711
|
|
|
|
|
|
"[:alpha:]", |
14712
|
|
|
|
|
|
"[:^alpha:]", |
14713
|
|
|
|
|
|
"[:lower:]", |
14714
|
|
|
|
|
|
"[:^lower:]", |
14715
|
|
|
|
|
|
"[:upper:]", |
14716
|
|
|
|
|
|
"[:^upper:]", |
14717
|
|
|
|
|
|
"[:punct:]", |
14718
|
|
|
|
|
|
"[:^punct:]", |
14719
|
|
|
|
|
|
"[:print:]", |
14720
|
|
|
|
|
|
"[:^print:]", |
14721
|
|
|
|
|
|
"[:alnum:]", |
14722
|
|
|
|
|
|
"[:^alnum:]", |
14723
|
|
|
|
|
|
"[:graph:]", |
14724
|
|
|
|
|
|
"[:^graph:]", |
14725
|
|
|
|
|
|
"[:cased:]", |
14726
|
|
|
|
|
|
"[:^cased:]", |
14727
|
|
|
|
|
|
"[\\s]", |
14728
|
|
|
|
|
|
"[\\S]", |
14729
|
|
|
|
|
|
"[:blank:]", |
14730
|
|
|
|
|
|
"[:^blank:]", |
14731
|
|
|
|
|
|
"[:xdigit:]", |
14732
|
|
|
|
|
|
"[:^xdigit:]", |
14733
|
|
|
|
|
|
"[:space:]", |
14734
|
|
|
|
|
|
"[:^space:]", |
14735
|
|
|
|
|
|
"[:cntrl:]", |
14736
|
|
|
|
|
|
"[:^cntrl:]", |
14737
|
|
|
|
|
|
"[:ascii:]", |
14738
|
|
|
|
|
|
"[:^ascii:]", |
14739
|
|
|
|
|
|
"[\\v]", |
14740
|
|
|
|
|
|
"[\\V]" |
14741
|
|
|
|
|
|
}; |
14742
|
684
|
|
|
|
|
RXi_GET_DECL(prog,progi); |
14743
|
684
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
14744
|
|
|
|
|
|
|
14745
|
684
|
|
|
|
|
PERL_ARGS_ASSERT_REGPROP; |
14746
|
|
|
|
|
|
|
14747
|
684
|
|
|
|
|
sv_setpvs(sv, ""); |
14748
|
|
|
|
|
|
|
14749
|
684
|
|
|
|
|
if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ |
14750
|
|
|
|
|
|
/* It would be nice to FAIL() here, but this may be called from |
14751
|
|
|
|
|
|
regexec.c, and it would be hard to supply pRExC_state. */ |
14752
|
0
|
|
|
|
|
Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); |
14753
|
684
|
|
|
|
|
sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ |
14754
|
|
|
|
|
|
|
14755
|
684
|
|
|
|
|
k = PL_regkind[OP(o)]; |
14756
|
|
|
|
|
|
|
14757
|
684
|
|
|
|
|
if (k == EXACT) { |
14758
|
216
|
|
|
|
|
sv_catpvs(sv, " "); |
14759
|
|
|
|
|
|
/* Using is_utf8_string() (via PERL_PV_UNI_DETECT) |
14760
|
|
|
|
|
|
* is a crude hack but it may be the best for now since |
14761
|
|
|
|
|
|
* we have no flag "this EXACTish node was UTF-8" |
14762
|
|
|
|
|
|
* --jhi */ |
14763
|
216
|
|
|
|
|
pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], |
14764
|
|
|
|
|
|
PERL_PV_ESCAPE_UNI_DETECT | |
14765
|
|
|
|
|
|
PERL_PV_ESCAPE_NONASCII | |
14766
|
|
|
|
|
|
PERL_PV_PRETTY_ELLIPSES | |
14767
|
|
|
|
|
|
PERL_PV_PRETTY_LTGT | |
14768
|
|
|
|
|
|
PERL_PV_PRETTY_NOCLEAR |
14769
|
|
|
|
|
|
); |
14770
|
468
|
|
|
|
|
} else if (k == TRIE) { |
14771
|
|
|
|
|
|
/* print the details of the trie in dumpuntil instead, as |
14772
|
|
|
|
|
|
* progi->data isn't available here */ |
14773
|
22
|
|
|
|
|
const char op = OP(o); |
14774
|
22
|
|
|
|
|
const U32 n = ARG(o); |
14775
|
|
|
|
|
|
const reg_ac_data * const ac = IS_TRIE_AC(op) ? |
14776
|
22
|
|
|
|
|
(reg_ac_data *)progi->data->data[n] : |
14777
|
|
|
|
|
|
NULL; |
14778
|
22
|
|
|
|
|
const reg_trie_data * const trie |
14779
|
22
|
|
|
|
|
= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; |
14780
|
|
|
|
|
|
|
14781
|
22
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); |
14782
|
22
|
|
|
|
|
DEBUG_TRIE_COMPILE_r( |
14783
|
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, |
14784
|
|
|
|
|
|
"", |
14785
|
|
|
|
|
|
(UV)trie->startstate, |
14786
|
|
|
|
|
|
(IV)trie->statecount-1, /* -1 because of the unused 0 element */ |
14787
|
|
|
|
|
|
(UV)trie->wordcount, |
14788
|
|
|
|
|
|
(UV)trie->minlen, |
14789
|
|
|
|
|
|
(UV)trie->maxlen, |
14790
|
|
|
|
|
|
(UV)TRIE_CHARCOUNT(trie), |
14791
|
|
|
|
|
|
(UV)trie->uniquecharcount |
14792
|
|
|
|
|
|
) |
14793
|
|
|
|
|
|
); |
14794
|
22
|
|
|
|
|
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { |
14795
|
14
|
|
|
|
|
sv_catpvs(sv, "["); |
14796
|
14
|
|
|
|
|
(void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) |
14797
|
|
|
|
|
|
? ANYOF_BITMAP(o) |
14798
|
|
|
|
|
|
: TRIE_BITMAP(trie)); |
14799
|
14
|
|
|
|
|
sv_catpvs(sv, "]"); |
14800
|
|
|
|
|
|
} |
14801
|
|
|
|
|
|
|
14802
|
446
|
|
|
|
|
} else if (k == CURLY) { |
14803
|
0
|
|
|
|
|
if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) |
14804
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ |
14805
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); |
14806
|
|
|
|
|
|
} |
14807
|
446
|
|
|
|
|
else if (k == WHILEM && o->flags) /* Ordinal/of */ |
14808
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); |
14809
|
446
|
|
|
|
|
else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { |
14810
|
66
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ |
14811
|
66
|
|
|
|
|
if ( RXp_PAREN_NAMES(prog) ) { |
14812
|
0
|
|
|
|
|
if ( k != REF || (OP(o) < NREF)) { |
14813
|
0
|
|
|
|
|
AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); |
14814
|
0
|
|
|
|
|
SV **name= av_fetch(list, ARG(o), 0 ); |
14815
|
0
|
|
|
|
|
if (name) |
14816
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); |
14817
|
|
|
|
|
|
} |
14818
|
|
|
|
|
|
else { |
14819
|
0
|
|
|
|
|
AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); |
14820
|
0
|
|
|
|
|
SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); |
14821
|
0
|
|
|
|
|
I32 *nums=(I32*)SvPVX(sv_dat); |
14822
|
0
|
|
|
|
|
SV **name= av_fetch(list, nums[0], 0 ); |
14823
|
|
|
|
|
|
I32 n; |
14824
|
0
|
|
|
|
|
if (name) { |
14825
|
0
|
|
|
|
|
for ( n=0; n
|
14826
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf, |
14827
|
0
|
|
|
|
|
(n ? "," : ""), (IV)nums[n]); |
14828
|
|
|
|
|
|
} |
14829
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); |
14830
|
|
|
|
|
|
} |
14831
|
|
|
|
|
|
} |
14832
|
|
|
|
|
|
} |
14833
|
380
|
|
|
|
|
} else if (k == GOSUB) |
14834
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ |
14835
|
380
|
|
|
|
|
else if (k == VERB) { |
14836
|
0
|
|
|
|
|
if (!o->flags) |
14837
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, ":%"SVf, |
14838
|
0
|
|
|
|
|
SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); |
14839
|
380
|
|
|
|
|
} else if (k == LOGICAL) |
14840
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ |
14841
|
380
|
|
|
|
|
else if (k == ANYOF) { |
14842
|
2
|
|
|
|
|
const U8 flags = ANYOF_FLAGS(o); |
14843
|
|
|
|
|
|
int do_sep = 0; |
14844
|
|
|
|
|
|
|
14845
|
|
|
|
|
|
|
14846
|
2
|
|
|
|
|
if (flags & ANYOF_LOCALE) |
14847
|
0
|
|
|
|
|
sv_catpvs(sv, "{loc}"); |
14848
|
2
|
|
|
|
|
if (flags & ANYOF_LOC_FOLD) |
14849
|
0
|
|
|
|
|
sv_catpvs(sv, "{i}"); |
14850
|
2
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); |
14851
|
2
|
|
|
|
|
if (flags & ANYOF_INVERT) |
14852
|
0
|
|
|
|
|
sv_catpvs(sv, "^"); |
14853
|
|
|
|
|
|
|
14854
|
|
|
|
|
|
/* output what the standard cp 0-255 bitmap matches */ |
14855
|
2
|
|
|
|
|
do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); |
14856
|
|
|
|
|
|
|
14857
|
2
|
|
|
|
|
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); |
14858
|
|
|
|
|
|
/* output any special charclass tests (used entirely under use locale) */ |
14859
|
2
|
|
|
|
|
if (ANYOF_CLASS_TEST_ANY_SET(o)) { |
14860
|
|
|
|
|
|
int i; |
14861
|
0
|
|
|
|
|
for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) { |
14862
|
0
|
|
|
|
|
if (ANYOF_CLASS_TEST(o,i)) { |
14863
|
0
|
|
|
|
|
sv_catpv(sv, anyofs[i]); |
14864
|
|
|
|
|
|
do_sep = 1; |
14865
|
|
|
|
|
|
} |
14866
|
|
|
|
|
|
} |
14867
|
|
|
|
|
|
} |
14868
|
|
|
|
|
|
|
14869
|
2
|
|
|
|
|
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); |
14870
|
|
|
|
|
|
|
14871
|
2
|
|
|
|
|
if (flags & ANYOF_NON_UTF8_LATIN1_ALL) { |
14872
|
0
|
|
|
|
|
sv_catpvs(sv, "{non-utf8-latin1-all}"); |
14873
|
|
|
|
|
|
} |
14874
|
|
|
|
|
|
|
14875
|
|
|
|
|
|
/* output information about the unicode matching */ |
14876
|
2
|
|
|
|
|
if (flags & ANYOF_UNICODE_ALL) |
14877
|
0
|
|
|
|
|
sv_catpvs(sv, "{unicode_all}"); |
14878
|
2
|
|
|
|
|
else if (ANYOF_NONBITMAP(o)) { |
14879
|
|
|
|
|
|
SV *lv; /* Set if there is something outside the bit map. */ |
14880
|
|
|
|
|
|
bool byte_output = FALSE; /* If something in the bitmap has been |
14881
|
|
|
|
|
|
output */ |
14882
|
|
|
|
|
|
|
14883
|
0
|
|
|
|
|
if (flags & ANYOF_NONBITMAP_NON_UTF8) { |
14884
|
0
|
|
|
|
|
sv_catpvs(sv, "{outside bitmap}"); |
14885
|
|
|
|
|
|
} |
14886
|
|
|
|
|
|
else { |
14887
|
0
|
|
|
|
|
sv_catpvs(sv, "{utf8}"); |
14888
|
|
|
|
|
|
} |
14889
|
|
|
|
|
|
|
14890
|
|
|
|
|
|
/* Get the stuff that wasn't in the bitmap */ |
14891
|
0
|
|
|
|
|
(void) regclass_swash(prog, o, FALSE, &lv, NULL); |
14892
|
0
|
|
|
|
|
if (lv && lv != &PL_sv_undef) { |
14893
|
0
|
|
|
|
|
char *s = savesvpv(lv); |
14894
|
|
|
|
|
|
char * const origs = s; |
14895
|
|
|
|
|
|
|
14896
|
0
|
|
|
|
|
while (*s && *s != '\n') |
14897
|
0
|
|
|
|
|
s++; |
14898
|
|
|
|
|
|
|
14899
|
0
|
|
|
|
|
if (*s == '\n') { |
14900
|
0
|
|
|
|
|
const char * const t = ++s; |
14901
|
|
|
|
|
|
|
14902
|
|
|
|
|
|
if (byte_output) { |
14903
|
|
|
|
|
|
sv_catpvs(sv, " "); |
14904
|
|
|
|
|
|
} |
14905
|
|
|
|
|
|
|
14906
|
0
|
|
|
|
|
while (*s) { |
14907
|
0
|
|
|
|
|
if (*s == '\n') { |
14908
|
|
|
|
|
|
|
14909
|
|
|
|
|
|
/* Truncate very long output */ |
14910
|
0
|
|
|
|
|
if (s - origs > 256) { |
14911
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, |
14912
|
|
|
|
|
|
"%.*s...", |
14913
|
0
|
|
|
|
|
(int) (s - origs - 1), |
14914
|
|
|
|
|
|
t); |
14915
|
0
|
|
|
|
|
goto out_dump; |
14916
|
|
|
|
|
|
} |
14917
|
0
|
|
|
|
|
*s = ' '; |
14918
|
|
|
|
|
|
} |
14919
|
0
|
|
|
|
|
else if (*s == '\t') { |
14920
|
0
|
|
|
|
|
*s = '-'; |
14921
|
|
|
|
|
|
} |
14922
|
0
|
|
|
|
|
s++; |
14923
|
|
|
|
|
|
} |
14924
|
0
|
|
|
|
|
if (s[-1] == ' ') |
14925
|
0
|
|
|
|
|
s[-1] = 0; |
14926
|
|
|
|
|
|
|
14927
|
0
|
|
|
|
|
sv_catpv(sv, t); |
14928
|
|
|
|
|
|
} |
14929
|
|
|
|
|
|
|
14930
|
|
|
|
|
|
out_dump: |
14931
|
|
|
|
|
|
|
14932
|
0
|
|
|
|
|
Safefree(origs); |
14933
|
0
|
|
|
|
|
SvREFCNT_dec_NN(lv); |
14934
|
|
|
|
|
|
} |
14935
|
|
|
|
|
|
} |
14936
|
|
|
|
|
|
|
14937
|
2
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); |
14938
|
|
|
|
|
|
} |
14939
|
378
|
|
|
|
|
else if (k == POSIXD || k == NPOSIXD) { |
14940
|
0
|
|
|
|
|
U8 index = FLAGS(o) * 2; |
14941
|
0
|
|
|
|
|
if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { |
14942
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); |
14943
|
|
|
|
|
|
} |
14944
|
|
|
|
|
|
else { |
14945
|
0
|
|
|
|
|
sv_catpv(sv, anyofs[index]); |
14946
|
|
|
|
|
|
} |
14947
|
|
|
|
|
|
} |
14948
|
378
|
|
|
|
|
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) |
14949
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); |
14950
|
|
|
|
|
|
#else |
14951
|
|
|
|
|
|
PERL_UNUSED_CONTEXT; |
14952
|
|
|
|
|
|
PERL_UNUSED_ARG(sv); |
14953
|
|
|
|
|
|
PERL_UNUSED_ARG(o); |
14954
|
|
|
|
|
|
PERL_UNUSED_ARG(prog); |
14955
|
|
|
|
|
|
#endif /* DEBUGGING */ |
14956
|
684
|
|
|
|
|
} |
14957
|
|
|
|
|
|
|
14958
|
|
|
|
|
|
SV * |
14959
|
0
|
|
|
|
|
Perl_re_intuit_string(pTHX_ REGEXP * const r) |
14960
|
|
|
|
|
|
{ /* Assume that RE_INTUIT is set */ |
14961
|
|
|
|
|
|
dVAR; |
14962
|
0
|
|
|
|
|
struct regexp *const prog = ReANY(r); |
14963
|
0
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
14964
|
|
|
|
|
|
|
14965
|
0
|
|
|
|
|
PERL_ARGS_ASSERT_RE_INTUIT_STRING; |
14966
|
|
|
|
|
|
PERL_UNUSED_CONTEXT; |
14967
|
|
|
|
|
|
|
14968
|
0
|
|
|
|
|
DEBUG_COMPILE_r( |
14969
|
|
|
|
|
|
{ |
14970
|
|
|
|
|
|
const char * const s = SvPV_nolen_const(prog->check_substr |
14971
|
|
|
|
|
|
? prog->check_substr : prog->check_utf8); |
14972
|
|
|
|
|
|
|
14973
|
|
|
|
|
|
if (!PL_colorset) reginitcolors(); |
14974
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, |
14975
|
|
|
|
|
|
"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", |
14976
|
|
|
|
|
|
PL_colors[4], |
14977
|
|
|
|
|
|
prog->check_substr ? "" : "utf8 ", |
14978
|
|
|
|
|
|
PL_colors[5],PL_colors[0], |
14979
|
|
|
|
|
|
s, |
14980
|
|
|
|
|
|
PL_colors[1], |
14981
|
|
|
|
|
|
(strlen(s) > 60 ? "..." : "")); |
14982
|
|
|
|
|
|
} ); |
14983
|
|
|
|
|
|
|
14984
|
0
|
|
|
|
|
return prog->check_substr ? prog->check_substr : prog->check_utf8; |
14985
|
|
|
|
|
|
} |
14986
|
|
|
|
|
|
|
14987
|
|
|
|
|
|
/* |
14988
|
|
|
|
|
|
pregfree() |
14989
|
|
|
|
|
|
|
14990
|
|
|
|
|
|
handles refcounting and freeing the perl core regexp structure. When |
14991
|
|
|
|
|
|
it is necessary to actually free the structure the first thing it |
14992
|
|
|
|
|
|
does is call the 'free' method of the regexp_engine associated to |
14993
|
|
|
|
|
|
the regexp, allowing the handling of the void *pprivate; member |
14994
|
|
|
|
|
|
first. (This routine is not overridable by extensions, which is why |
14995
|
|
|
|
|
|
the extensions free is called first.) |
14996
|
|
|
|
|
|
|
14997
|
|
|
|
|
|
See regdupe and regdupe_internal if you change anything here. |
14998
|
|
|
|
|
|
*/ |
14999
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
15000
|
|
|
|
|
|
void |
15001
|
|
|
|
|
|
Perl_pregfree(pTHX_ REGEXP *r) |
15002
|
|
|
|
|
|
{ |
15003
|
|
|
|
|
|
SvREFCNT_dec(r); |
15004
|
|
|
|
|
|
} |
15005
|
|
|
|
|
|
|
15006
|
|
|
|
|
|
void |
15007
|
|
|
|
|
|
Perl_pregfree2(pTHX_ REGEXP *rx) |
15008
|
|
|
|
|
|
{ |
15009
|
|
|
|
|
|
dVAR; |
15010
|
|
|
|
|
|
struct regexp *const r = ReANY(rx); |
15011
|
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
15012
|
|
|
|
|
|
|
15013
|
|
|
|
|
|
PERL_ARGS_ASSERT_PREGFREE2; |
15014
|
|
|
|
|
|
|
15015
|
|
|
|
|
|
if (r->mother_re) { |
15016
|
|
|
|
|
|
ReREFCNT_dec(r->mother_re); |
15017
|
|
|
|
|
|
} else { |
15018
|
|
|
|
|
|
CALLREGFREE_PVT(rx); /* free the private data */ |
15019
|
|
|
|
|
|
SvREFCNT_dec(RXp_PAREN_NAMES(r)); |
15020
|
|
|
|
|
|
Safefree(r->xpv_len_u.xpvlenu_pv); |
15021
|
|
|
|
|
|
} |
15022
|
|
|
|
|
|
if (r->substrs) { |
15023
|
|
|
|
|
|
SvREFCNT_dec(r->anchored_substr); |
15024
|
|
|
|
|
|
SvREFCNT_dec(r->anchored_utf8); |
15025
|
|
|
|
|
|
SvREFCNT_dec(r->float_substr); |
15026
|
|
|
|
|
|
SvREFCNT_dec(r->float_utf8); |
15027
|
|
|
|
|
|
Safefree(r->substrs); |
15028
|
|
|
|
|
|
} |
15029
|
|
|
|
|
|
RX_MATCH_COPY_FREE(rx); |
15030
|
|
|
|
|
|
#ifdef PERL_ANY_COW |
15031
|
|
|
|
|
|
SvREFCNT_dec(r->saved_copy); |
15032
|
|
|
|
|
|
#endif |
15033
|
|
|
|
|
|
Safefree(r->offs); |
15034
|
|
|
|
|
|
SvREFCNT_dec(r->qr_anoncv); |
15035
|
|
|
|
|
|
rx->sv_u.svu_rx = 0; |
15036
|
|
|
|
|
|
} |
15037
|
|
|
|
|
|
|
15038
|
|
|
|
|
|
/* reg_temp_copy() |
15039
|
|
|
|
|
|
|
15040
|
|
|
|
|
|
This is a hacky workaround to the structural issue of match results |
15041
|
|
|
|
|
|
being stored in the regexp structure which is in turn stored in |
15042
|
|
|
|
|
|
PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern |
15043
|
|
|
|
|
|
could be PL_curpm in multiple contexts, and could require multiple |
15044
|
|
|
|
|
|
result sets being associated with the pattern simultaneously, such |
15045
|
|
|
|
|
|
as when doing a recursive match with (??{$qr}) |
15046
|
|
|
|
|
|
|
15047
|
|
|
|
|
|
The solution is to make a lightweight copy of the regexp structure |
15048
|
|
|
|
|
|
when a qr// is returned from the code executed by (??{$qr}) this |
15049
|
|
|
|
|
|
lightweight copy doesn't actually own any of its data except for |
15050
|
|
|
|
|
|
the starp/end and the actual regexp structure itself. |
15051
|
|
|
|
|
|
|
15052
|
|
|
|
|
|
*/ |
15053
|
|
|
|
|
|
|
15054
|
|
|
|
|
|
|
15055
|
|
|
|
|
|
REGEXP * |
15056
|
|
|
|
|
|
Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) |
15057
|
|
|
|
|
|
{ |
15058
|
|
|
|
|
|
struct regexp *ret; |
15059
|
|
|
|
|
|
struct regexp *const r = ReANY(rx); |
15060
|
|
|
|
|
|
const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV; |
15061
|
|
|
|
|
|
|
15062
|
|
|
|
|
|
PERL_ARGS_ASSERT_REG_TEMP_COPY; |
15063
|
|
|
|
|
|
|
15064
|
|
|
|
|
|
if (!ret_x) |
15065
|
|
|
|
|
|
ret_x = (REGEXP*) newSV_type(SVt_REGEXP); |
15066
|
|
|
|
|
|
else { |
15067
|
|
|
|
|
|
SvOK_off((SV *)ret_x); |
15068
|
|
|
|
|
|
if (islv) { |
15069
|
|
|
|
|
|
/* For PVLVs, SvANY points to the xpvlv body while sv_u points |
15070
|
|
|
|
|
|
to the regexp. (For SVt_REGEXPs, sv_upgrade has already |
15071
|
|
|
|
|
|
made both spots point to the same regexp body.) */ |
15072
|
|
|
|
|
|
REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); |
15073
|
|
|
|
|
|
assert(!SvPVX(ret_x)); |
15074
|
|
|
|
|
|
ret_x->sv_u.svu_rx = temp->sv_any; |
15075
|
|
|
|
|
|
temp->sv_any = NULL; |
15076
|
|
|
|
|
|
SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; |
15077
|
|
|
|
|
|
SvREFCNT_dec_NN(temp); |
15078
|
|
|
|
|
|
/* SvCUR still resides in the xpvlv struct, so the regexp copy- |
15079
|
|
|
|
|
|
ing below will not set it. */ |
15080
|
|
|
|
|
|
SvCUR_set(ret_x, SvCUR(rx)); |
15081
|
|
|
|
|
|
} |
15082
|
|
|
|
|
|
} |
15083
|
|
|
|
|
|
/* This ensures that SvTHINKFIRST(sv) is true, and hence that |
15084
|
|
|
|
|
|
sv_force_normal(sv) is called. */ |
15085
|
|
|
|
|
|
SvFAKE_on(ret_x); |
15086
|
|
|
|
|
|
ret = ReANY(ret_x); |
15087
|
|
|
|
|
|
|
15088
|
|
|
|
|
|
SvFLAGS(ret_x) |= SvUTF8(rx); |
15089
|
|
|
|
|
|
/* We share the same string buffer as the original regexp, on which we |
15090
|
|
|
|
|
|
hold a reference count, incremented when mother_re is set below. |
15091
|
|
|
|
|
|
The string pointer is copied here, being part of the regexp struct. |
15092
|
|
|
|
|
|
*/ |
15093
|
|
|
|
|
|
memcpy(&(ret->xpv_cur), &(r->xpv_cur), |
15094
|
|
|
|
|
|
sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); |
15095
|
|
|
|
|
|
if (r->offs) { |
15096
|
|
|
|
|
|
const I32 npar = r->nparens+1; |
15097
|
|
|
|
|
|
Newx(ret->offs, npar, regexp_paren_pair); |
15098
|
|
|
|
|
|
Copy(r->offs, ret->offs, npar, regexp_paren_pair); |
15099
|
|
|
|
|
|
} |
15100
|
|
|
|
|
|
if (r->substrs) { |
15101
|
|
|
|
|
|
Newx(ret->substrs, 1, struct reg_substr_data); |
15102
|
|
|
|
|
|
StructCopy(r->substrs, ret->substrs, struct reg_substr_data); |
15103
|
|
|
|
|
|
|
15104
|
|
|
|
|
|
SvREFCNT_inc_void(ret->anchored_substr); |
15105
|
|
|
|
|
|
SvREFCNT_inc_void(ret->anchored_utf8); |
15106
|
|
|
|
|
|
SvREFCNT_inc_void(ret->float_substr); |
15107
|
|
|
|
|
|
SvREFCNT_inc_void(ret->float_utf8); |
15108
|
|
|
|
|
|
|
15109
|
|
|
|
|
|
/* check_substr and check_utf8, if non-NULL, point to either their |
15110
|
|
|
|
|
|
anchored or float namesakes, and don't hold a second reference. */ |
15111
|
|
|
|
|
|
} |
15112
|
|
|
|
|
|
RX_MATCH_COPIED_off(ret_x); |
15113
|
|
|
|
|
|
#ifdef PERL_ANY_COW |
15114
|
|
|
|
|
|
ret->saved_copy = NULL; |
15115
|
|
|
|
|
|
#endif |
15116
|
|
|
|
|
|
ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); |
15117
|
|
|
|
|
|
SvREFCNT_inc_void(ret->qr_anoncv); |
15118
|
|
|
|
|
|
|
15119
|
|
|
|
|
|
return ret_x; |
15120
|
|
|
|
|
|
} |
15121
|
|
|
|
|
|
#endif |
15122
|
|
|
|
|
|
|
15123
|
|
|
|
|
|
/* regfree_internal() |
15124
|
|
|
|
|
|
|
15125
|
|
|
|
|
|
Free the private data in a regexp. This is overloadable by |
15126
|
|
|
|
|
|
extensions. Perl takes care of the regexp structure in pregfree(), |
15127
|
|
|
|
|
|
this covers the *pprivate pointer which technically perl doesn't |
15128
|
|
|
|
|
|
know about, however of course we have to handle the |
15129
|
|
|
|
|
|
regexp_internal structure when no extension is in use. |
15130
|
|
|
|
|
|
|
15131
|
|
|
|
|
|
Note this is called before freeing anything in the regexp |
15132
|
|
|
|
|
|
structure. |
15133
|
|
|
|
|
|
*/ |
15134
|
|
|
|
|
|
|
15135
|
|
|
|
|
|
void |
15136
|
106
|
|
|
|
|
Perl_regfree_internal(pTHX_ REGEXP * const rx) |
15137
|
|
|
|
|
|
{ |
15138
|
|
|
|
|
|
dVAR; |
15139
|
106
|
|
|
|
|
struct regexp *const r = ReANY(rx); |
15140
|
106
|
|
|
|
|
RXi_GET_DECL(r,ri); |
15141
|
106
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
15142
|
|
|
|
|
|
|
15143
|
106
|
|
|
|
|
PERL_ARGS_ASSERT_REGFREE_INTERNAL; |
15144
|
|
|
|
|
|
|
15145
|
418
|
|
|
|
|
DEBUG_COMPILE_r({ |
15146
|
|
|
|
|
|
if (!PL_colorset) |
15147
|
|
|
|
|
|
reginitcolors(); |
15148
|
|
|
|
|
|
{ |
15149
|
|
|
|
|
|
SV *dsv= sv_newmortal(); |
15150
|
|
|
|
|
|
RE_PV_QUOTED_DECL(s, RX_UTF8(rx), |
15151
|
|
|
|
|
|
dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); |
15152
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", |
15153
|
|
|
|
|
|
PL_colors[4],PL_colors[5],s); |
15154
|
|
|
|
|
|
} |
15155
|
|
|
|
|
|
}); |
15156
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
15157
|
106
|
|
|
|
|
if (ri->u.offsets) |
15158
|
106
|
|
|
|
|
Safefree(ri->u.offsets); /* 20010421 MJD */ |
15159
|
|
|
|
|
|
#endif |
15160
|
106
|
|
|
|
|
if (ri->code_blocks) { |
15161
|
|
|
|
|
|
int n; |
15162
|
32
|
|
|
|
|
for (n = 0; n < ri->num_code_blocks; n++) |
15163
|
32
|
|
|
|
|
SvREFCNT_dec(ri->code_blocks[n].src_regex); |
15164
|
28
|
|
|
|
|
Safefree(ri->code_blocks); |
15165
|
|
|
|
|
|
} |
15166
|
|
|
|
|
|
|
15167
|
106
|
|
|
|
|
if (ri->data) { |
15168
|
48
|
|
|
|
|
int n = ri->data->count; |
15169
|
|
|
|
|
|
|
15170
|
212
|
|
|
|
|
while (--n >= 0) { |
15171
|
|
|
|
|
|
/* If you add a ->what type here, update the comment in regcomp.h */ |
15172
|
116
|
|
|
|
|
switch (ri->data->what[n]) { |
15173
|
|
|
|
|
|
case 'a': |
15174
|
|
|
|
|
|
case 'r': |
15175
|
|
|
|
|
|
case 's': |
15176
|
|
|
|
|
|
case 'S': |
15177
|
|
|
|
|
|
case 'u': |
15178
|
56
|
|
|
|
|
SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); |
15179
|
56
|
|
|
|
|
break; |
15180
|
|
|
|
|
|
case 'f': |
15181
|
0
|
|
|
|
|
Safefree(ri->data->data[n]); |
15182
|
0
|
|
|
|
|
break; |
15183
|
|
|
|
|
|
case 'l': |
15184
|
|
|
|
|
|
case 'L': |
15185
|
|
|
|
|
|
break; |
15186
|
|
|
|
|
|
case 'T': |
15187
|
|
|
|
|
|
{ /* Aho Corasick add-on structure for a trie node. |
15188
|
|
|
|
|
|
Used in stclass optimization only */ |
15189
|
|
|
|
|
|
U32 refcount; |
15190
|
4
|
|
|
|
|
reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; |
15191
|
|
|
|
|
|
OP_REFCNT_LOCK; |
15192
|
4
|
|
|
|
|
refcount = --aho->refcount; |
15193
|
|
|
|
|
|
OP_REFCNT_UNLOCK; |
15194
|
4
|
|
|
|
|
if ( !refcount ) { |
15195
|
4
|
|
|
|
|
PerlMemShared_free(aho->states); |
15196
|
4
|
|
|
|
|
PerlMemShared_free(aho->fail); |
15197
|
|
|
|
|
|
/* do this last!!!! */ |
15198
|
4
|
|
|
|
|
PerlMemShared_free(ri->data->data[n]); |
15199
|
4
|
|
|
|
|
PerlMemShared_free(ri->regstclass); |
15200
|
|
|
|
|
|
} |
15201
|
|
|
|
|
|
} |
15202
|
|
|
|
|
|
break; |
15203
|
|
|
|
|
|
case 't': |
15204
|
|
|
|
|
|
{ |
15205
|
|
|
|
|
|
/* trie structure. */ |
15206
|
|
|
|
|
|
U32 refcount; |
15207
|
8
|
|
|
|
|
reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; |
15208
|
|
|
|
|
|
OP_REFCNT_LOCK; |
15209
|
8
|
|
|
|
|
refcount = --trie->refcount; |
15210
|
|
|
|
|
|
OP_REFCNT_UNLOCK; |
15211
|
8
|
|
|
|
|
if ( !refcount ) { |
15212
|
8
|
|
|
|
|
PerlMemShared_free(trie->charmap); |
15213
|
8
|
|
|
|
|
PerlMemShared_free(trie->states); |
15214
|
8
|
|
|
|
|
PerlMemShared_free(trie->trans); |
15215
|
8
|
|
|
|
|
if (trie->bitmap) |
15216
|
4
|
|
|
|
|
PerlMemShared_free(trie->bitmap); |
15217
|
8
|
|
|
|
|
if (trie->jump) |
15218
|
0
|
|
|
|
|
PerlMemShared_free(trie->jump); |
15219
|
8
|
|
|
|
|
PerlMemShared_free(trie->wordinfo); |
15220
|
|
|
|
|
|
/* do this last!!!! */ |
15221
|
8
|
|
|
|
|
PerlMemShared_free(ri->data->data[n]); |
15222
|
|
|
|
|
|
} |
15223
|
|
|
|
|
|
} |
15224
|
|
|
|
|
|
break; |
15225
|
|
|
|
|
|
default: |
15226
|
0
|
|
|
|
|
Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); |
15227
|
|
|
|
|
|
} |
15228
|
|
|
|
|
|
} |
15229
|
48
|
|
|
|
|
Safefree(ri->data->what); |
15230
|
48
|
|
|
|
|
Safefree(ri->data); |
15231
|
|
|
|
|
|
} |
15232
|
|
|
|
|
|
|
15233
|
106
|
|
|
|
|
Safefree(ri); |
15234
|
106
|
|
|
|
|
} |
15235
|
|
|
|
|
|
|
15236
|
|
|
|
|
|
#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t)) |
15237
|
|
|
|
|
|
#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) |
15238
|
|
|
|
|
|
#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) |
15239
|
|
|
|
|
|
|
15240
|
|
|
|
|
|
/* |
15241
|
|
|
|
|
|
re_dup - duplicate a regexp. |
15242
|
|
|
|
|
|
|
15243
|
|
|
|
|
|
This routine is expected to clone a given regexp structure. It is only |
15244
|
|
|
|
|
|
compiled under USE_ITHREADS. |
15245
|
|
|
|
|
|
|
15246
|
|
|
|
|
|
After all of the core data stored in struct regexp is duplicated |
15247
|
|
|
|
|
|
the regexp_engine.dupe method is used to copy any private data |
15248
|
|
|
|
|
|
stored in the *pprivate pointer. This allows extensions to handle |
15249
|
|
|
|
|
|
any duplication it needs to do. |
15250
|
|
|
|
|
|
|
15251
|
|
|
|
|
|
See pregfree() and regfree_internal() if you change anything here. |
15252
|
|
|
|
|
|
*/ |
15253
|
|
|
|
|
|
#if defined(USE_ITHREADS) |
15254
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
15255
|
|
|
|
|
|
void |
15256
|
|
|
|
|
|
Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) |
15257
|
|
|
|
|
|
{ |
15258
|
|
|
|
|
|
dVAR; |
15259
|
|
|
|
|
|
I32 npar; |
15260
|
|
|
|
|
|
const struct regexp *r = ReANY(sstr); |
15261
|
|
|
|
|
|
struct regexp *ret = ReANY(dstr); |
15262
|
|
|
|
|
|
|
15263
|
|
|
|
|
|
PERL_ARGS_ASSERT_RE_DUP_GUTS; |
15264
|
|
|
|
|
|
|
15265
|
|
|
|
|
|
npar = r->nparens+1; |
15266
|
|
|
|
|
|
Newx(ret->offs, npar, regexp_paren_pair); |
15267
|
|
|
|
|
|
Copy(r->offs, ret->offs, npar, regexp_paren_pair); |
15268
|
|
|
|
|
|
|
15269
|
|
|
|
|
|
if (ret->substrs) { |
15270
|
|
|
|
|
|
/* Do it this way to avoid reading from *r after the StructCopy(). |
15271
|
|
|
|
|
|
That way, if any of the sv_dup_inc()s dislodge *r from the L1 |
15272
|
|
|
|
|
|
cache, it doesn't matter. */ |
15273
|
|
|
|
|
|
const bool anchored = r->check_substr |
15274
|
|
|
|
|
|
? r->check_substr == r->anchored_substr |
15275
|
|
|
|
|
|
: r->check_utf8 == r->anchored_utf8; |
15276
|
|
|
|
|
|
Newx(ret->substrs, 1, struct reg_substr_data); |
15277
|
|
|
|
|
|
StructCopy(r->substrs, ret->substrs, struct reg_substr_data); |
15278
|
|
|
|
|
|
|
15279
|
|
|
|
|
|
ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); |
15280
|
|
|
|
|
|
ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); |
15281
|
|
|
|
|
|
ret->float_substr = sv_dup_inc(ret->float_substr, param); |
15282
|
|
|
|
|
|
ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); |
15283
|
|
|
|
|
|
|
15284
|
|
|
|
|
|
/* check_substr and check_utf8, if non-NULL, point to either their |
15285
|
|
|
|
|
|
anchored or float namesakes, and don't hold a second reference. */ |
15286
|
|
|
|
|
|
|
15287
|
|
|
|
|
|
if (ret->check_substr) { |
15288
|
|
|
|
|
|
if (anchored) { |
15289
|
|
|
|
|
|
assert(r->check_utf8 == r->anchored_utf8); |
15290
|
|
|
|
|
|
ret->check_substr = ret->anchored_substr; |
15291
|
|
|
|
|
|
ret->check_utf8 = ret->anchored_utf8; |
15292
|
|
|
|
|
|
} else { |
15293
|
|
|
|
|
|
assert(r->check_substr == r->float_substr); |
15294
|
|
|
|
|
|
assert(r->check_utf8 == r->float_utf8); |
15295
|
|
|
|
|
|
ret->check_substr = ret->float_substr; |
15296
|
|
|
|
|
|
ret->check_utf8 = ret->float_utf8; |
15297
|
|
|
|
|
|
} |
15298
|
|
|
|
|
|
} else if (ret->check_utf8) { |
15299
|
|
|
|
|
|
if (anchored) { |
15300
|
|
|
|
|
|
ret->check_utf8 = ret->anchored_utf8; |
15301
|
|
|
|
|
|
} else { |
15302
|
|
|
|
|
|
ret->check_utf8 = ret->float_utf8; |
15303
|
|
|
|
|
|
} |
15304
|
|
|
|
|
|
} |
15305
|
|
|
|
|
|
} |
15306
|
|
|
|
|
|
|
15307
|
|
|
|
|
|
RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); |
15308
|
|
|
|
|
|
ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); |
15309
|
|
|
|
|
|
|
15310
|
|
|
|
|
|
if (ret->pprivate) |
15311
|
|
|
|
|
|
RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); |
15312
|
|
|
|
|
|
|
15313
|
|
|
|
|
|
if (RX_MATCH_COPIED(dstr)) |
15314
|
|
|
|
|
|
ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); |
15315
|
|
|
|
|
|
else |
15316
|
|
|
|
|
|
ret->subbeg = NULL; |
15317
|
|
|
|
|
|
#ifdef PERL_ANY_COW |
15318
|
|
|
|
|
|
ret->saved_copy = NULL; |
15319
|
|
|
|
|
|
#endif |
15320
|
|
|
|
|
|
|
15321
|
|
|
|
|
|
/* Whether mother_re be set or no, we need to copy the string. We |
15322
|
|
|
|
|
|
cannot refrain from copying it when the storage points directly to |
15323
|
|
|
|
|
|
our mother regexp, because that's |
15324
|
|
|
|
|
|
1: a buffer in a different thread |
15325
|
|
|
|
|
|
2: something we no longer hold a reference on |
15326
|
|
|
|
|
|
so we need to copy it locally. */ |
15327
|
|
|
|
|
|
RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); |
15328
|
|
|
|
|
|
ret->mother_re = NULL; |
15329
|
|
|
|
|
|
} |
15330
|
|
|
|
|
|
#endif /* PERL_IN_XSUB_RE */ |
15331
|
|
|
|
|
|
|
15332
|
|
|
|
|
|
/* |
15333
|
|
|
|
|
|
regdupe_internal() |
15334
|
|
|
|
|
|
|
15335
|
|
|
|
|
|
This is the internal complement to regdupe() which is used to copy |
15336
|
|
|
|
|
|
the structure pointed to by the *pprivate pointer in the regexp. |
15337
|
|
|
|
|
|
This is the core version of the extension overridable cloning hook. |
15338
|
|
|
|
|
|
The regexp structure being duplicated will be copied by perl prior |
15339
|
|
|
|
|
|
to this and will be provided as the regexp *r argument, however |
15340
|
|
|
|
|
|
with the /old/ structures pprivate pointer value. Thus this routine |
15341
|
|
|
|
|
|
may override any copying normally done by perl. |
15342
|
|
|
|
|
|
|
15343
|
|
|
|
|
|
It returns a pointer to the new regexp_internal structure. |
15344
|
|
|
|
|
|
*/ |
15345
|
|
|
|
|
|
|
15346
|
|
|
|
|
|
void * |
15347
|
|
|
|
|
|
Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) |
15348
|
|
|
|
|
|
{ |
15349
|
|
|
|
|
|
dVAR; |
15350
|
|
|
|
|
|
struct regexp *const r = ReANY(rx); |
15351
|
|
|
|
|
|
regexp_internal *reti; |
15352
|
|
|
|
|
|
int len; |
15353
|
|
|
|
|
|
RXi_GET_DECL(r,ri); |
15354
|
|
|
|
|
|
|
15355
|
|
|
|
|
|
PERL_ARGS_ASSERT_REGDUPE_INTERNAL; |
15356
|
|
|
|
|
|
|
15357
|
|
|
|
|
|
len = ProgLen(ri); |
15358
|
|
|
|
|
|
|
15359
|
|
|
|
|
|
Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); |
15360
|
|
|
|
|
|
Copy(ri->program, reti->program, len+1, regnode); |
15361
|
|
|
|
|
|
|
15362
|
|
|
|
|
|
reti->num_code_blocks = ri->num_code_blocks; |
15363
|
|
|
|
|
|
if (ri->code_blocks) { |
15364
|
|
|
|
|
|
int n; |
15365
|
|
|
|
|
|
Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, |
15366
|
|
|
|
|
|
struct reg_code_block); |
15367
|
|
|
|
|
|
Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, |
15368
|
|
|
|
|
|
struct reg_code_block); |
15369
|
|
|
|
|
|
for (n = 0; n < ri->num_code_blocks; n++) |
15370
|
|
|
|
|
|
reti->code_blocks[n].src_regex = (REGEXP*) |
15371
|
|
|
|
|
|
sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); |
15372
|
|
|
|
|
|
} |
15373
|
|
|
|
|
|
else |
15374
|
|
|
|
|
|
reti->code_blocks = NULL; |
15375
|
|
|
|
|
|
|
15376
|
|
|
|
|
|
reti->regstclass = NULL; |
15377
|
|
|
|
|
|
|
15378
|
|
|
|
|
|
if (ri->data) { |
15379
|
|
|
|
|
|
struct reg_data *d; |
15380
|
|
|
|
|
|
const int count = ri->data->count; |
15381
|
|
|
|
|
|
int i; |
15382
|
|
|
|
|
|
|
15383
|
|
|
|
|
|
Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), |
15384
|
|
|
|
|
|
char, struct reg_data); |
15385
|
|
|
|
|
|
Newx(d->what, count, U8); |
15386
|
|
|
|
|
|
|
15387
|
|
|
|
|
|
d->count = count; |
15388
|
|
|
|
|
|
for (i = 0; i < count; i++) { |
15389
|
|
|
|
|
|
d->what[i] = ri->data->what[i]; |
15390
|
|
|
|
|
|
switch (d->what[i]) { |
15391
|
|
|
|
|
|
/* see also regcomp.h and regfree_internal() */ |
15392
|
|
|
|
|
|
case 'a': /* actually an AV, but the dup function is identical. */ |
15393
|
|
|
|
|
|
case 'r': |
15394
|
|
|
|
|
|
case 's': |
15395
|
|
|
|
|
|
case 'S': |
15396
|
|
|
|
|
|
case 'u': /* actually an HV, but the dup function is identical. */ |
15397
|
|
|
|
|
|
d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); |
15398
|
|
|
|
|
|
break; |
15399
|
|
|
|
|
|
case 'f': |
15400
|
|
|
|
|
|
/* This is cheating. */ |
15401
|
|
|
|
|
|
Newx(d->data[i], 1, struct regnode_charclass_class); |
15402
|
|
|
|
|
|
StructCopy(ri->data->data[i], d->data[i], |
15403
|
|
|
|
|
|
struct regnode_charclass_class); |
15404
|
|
|
|
|
|
reti->regstclass = (regnode*)d->data[i]; |
15405
|
|
|
|
|
|
break; |
15406
|
|
|
|
|
|
case 'T': |
15407
|
|
|
|
|
|
/* Trie stclasses are readonly and can thus be shared |
15408
|
|
|
|
|
|
* without duplication. We free the stclass in pregfree |
15409
|
|
|
|
|
|
* when the corresponding reg_ac_data struct is freed. |
15410
|
|
|
|
|
|
*/ |
15411
|
|
|
|
|
|
reti->regstclass= ri->regstclass; |
15412
|
|
|
|
|
|
/* Fall through */ |
15413
|
|
|
|
|
|
case 't': |
15414
|
|
|
|
|
|
OP_REFCNT_LOCK; |
15415
|
|
|
|
|
|
((reg_trie_data*)ri->data->data[i])->refcount++; |
15416
|
|
|
|
|
|
OP_REFCNT_UNLOCK; |
15417
|
|
|
|
|
|
/* Fall through */ |
15418
|
|
|
|
|
|
case 'l': |
15419
|
|
|
|
|
|
case 'L': |
15420
|
|
|
|
|
|
d->data[i] = ri->data->data[i]; |
15421
|
|
|
|
|
|
break; |
15422
|
|
|
|
|
|
default: |
15423
|
|
|
|
|
|
Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); |
15424
|
|
|
|
|
|
} |
15425
|
|
|
|
|
|
} |
15426
|
|
|
|
|
|
|
15427
|
|
|
|
|
|
reti->data = d; |
15428
|
|
|
|
|
|
} |
15429
|
|
|
|
|
|
else |
15430
|
|
|
|
|
|
reti->data = NULL; |
15431
|
|
|
|
|
|
|
15432
|
|
|
|
|
|
reti->name_list_idx = ri->name_list_idx; |
15433
|
|
|
|
|
|
|
15434
|
|
|
|
|
|
#ifdef RE_TRACK_PATTERN_OFFSETS |
15435
|
|
|
|
|
|
if (ri->u.offsets) { |
15436
|
|
|
|
|
|
Newx(reti->u.offsets, 2*len+1, U32); |
15437
|
|
|
|
|
|
Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); |
15438
|
|
|
|
|
|
} |
15439
|
|
|
|
|
|
#else |
15440
|
|
|
|
|
|
SetProgLen(reti,len); |
15441
|
|
|
|
|
|
#endif |
15442
|
|
|
|
|
|
|
15443
|
|
|
|
|
|
return (void*)reti; |
15444
|
|
|
|
|
|
} |
15445
|
|
|
|
|
|
|
15446
|
|
|
|
|
|
#endif /* USE_ITHREADS */ |
15447
|
|
|
|
|
|
|
15448
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
15449
|
|
|
|
|
|
|
15450
|
|
|
|
|
|
/* |
15451
|
|
|
|
|
|
- regnext - dig the "next" pointer out of a node |
15452
|
|
|
|
|
|
*/ |
15453
|
|
|
|
|
|
regnode * |
15454
|
|
|
|
|
|
Perl_regnext(pTHX_ regnode *p) |
15455
|
|
|
|
|
|
{ |
15456
|
|
|
|
|
|
dVAR; |
15457
|
|
|
|
|
|
I32 offset; |
15458
|
|
|
|
|
|
|
15459
|
|
|
|
|
|
if (!p) |
15460
|
|
|
|
|
|
return(NULL); |
15461
|
|
|
|
|
|
|
15462
|
|
|
|
|
|
if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ |
15463
|
|
|
|
|
|
Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX); |
15464
|
|
|
|
|
|
} |
15465
|
|
|
|
|
|
|
15466
|
|
|
|
|
|
offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); |
15467
|
|
|
|
|
|
if (offset == 0) |
15468
|
|
|
|
|
|
return(NULL); |
15469
|
|
|
|
|
|
|
15470
|
|
|
|
|
|
return(p+offset); |
15471
|
|
|
|
|
|
} |
15472
|
|
|
|
|
|
#endif |
15473
|
|
|
|
|
|
|
15474
|
|
|
|
|
|
STATIC void |
15475
|
0
|
|
|
|
|
S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) |
15476
|
|
|
|
|
|
{ |
15477
|
|
|
|
|
|
va_list args; |
15478
|
0
|
|
|
|
|
STRLEN l1 = strlen(pat1); |
15479
|
0
|
|
|
|
|
STRLEN l2 = strlen(pat2); |
15480
|
|
|
|
|
|
char buf[512]; |
15481
|
|
|
|
|
|
SV *msv; |
15482
|
|
|
|
|
|
const char *message; |
15483
|
|
|
|
|
|
|
15484
|
0
|
|
|
|
|
PERL_ARGS_ASSERT_RE_CROAK2; |
15485
|
|
|
|
|
|
|
15486
|
0
|
|
|
|
|
if (l1 > 510) |
15487
|
0
|
|
|
|
|
l1 = 510; |
15488
|
0
|
|
|
|
|
if (l1 + l2 > 510) |
15489
|
0
|
|
|
|
|
l2 = 510 - l1; |
15490
|
0
|
|
|
|
|
Copy(pat1, buf, l1 , char); |
15491
|
0
|
|
|
|
|
Copy(pat2, buf + l1, l2 , char); |
15492
|
0
|
|
|
|
|
buf[l1 + l2] = '\n'; |
15493
|
0
|
|
|
|
|
buf[l1 + l2 + 1] = '\0'; |
15494
|
|
|
|
|
|
#ifdef I_STDARG |
15495
|
|
|
|
|
|
/* ANSI variant takes additional second argument */ |
15496
|
0
|
|
|
|
|
va_start(args, pat2); |
15497
|
|
|
|
|
|
#else |
15498
|
|
|
|
|
|
va_start(args); |
15499
|
|
|
|
|
|
#endif |
15500
|
0
|
|
|
|
|
msv = vmess(buf, &args); |
15501
|
0
|
|
|
|
|
va_end(args); |
15502
|
0
|
|
|
|
|
message = SvPV_const(msv,l1); |
15503
|
0
|
|
|
|
|
if (l1 > 512) |
15504
|
0
|
|
|
|
|
l1 = 512; |
15505
|
0
|
|
|
|
|
Copy(message, buf, l1 , char); |
15506
|
0
|
|
|
|
|
buf[l1-1] = '\0'; /* Overwrite \n */ |
15507
|
0
|
|
|
|
|
Perl_croak(aTHX_ "%s", buf); |
15508
|
|
|
|
|
|
} |
15509
|
|
|
|
|
|
|
15510
|
|
|
|
|
|
/* XXX Here's a total kludge. But we need to re-enter for swash routines. */ |
15511
|
|
|
|
|
|
|
15512
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE |
15513
|
|
|
|
|
|
void |
15514
|
|
|
|
|
|
Perl_save_re_context(pTHX) |
15515
|
|
|
|
|
|
{ |
15516
|
|
|
|
|
|
dVAR; |
15517
|
|
|
|
|
|
|
15518
|
|
|
|
|
|
/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ |
15519
|
|
|
|
|
|
if (PL_curpm) { |
15520
|
|
|
|
|
|
const REGEXP * const rx = PM_GETRE(PL_curpm); |
15521
|
|
|
|
|
|
if (rx) { |
15522
|
|
|
|
|
|
U32 i; |
15523
|
|
|
|
|
|
for (i = 1; i <= RX_NPARENS(rx); i++) { |
15524
|
|
|
|
|
|
char digits[TYPE_CHARS(long)]; |
15525
|
|
|
|
|
|
const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); |
15526
|
|
|
|
|
|
GV *const *const gvp |
15527
|
|
|
|
|
|
= (GV**)hv_fetch(PL_defstash, digits, len, 0); |
15528
|
|
|
|
|
|
|
15529
|
|
|
|
|
|
if (gvp) { |
15530
|
|
|
|
|
|
GV * const gv = *gvp; |
15531
|
|
|
|
|
|
if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) |
15532
|
|
|
|
|
|
save_scalar(gv); |
15533
|
|
|
|
|
|
} |
15534
|
|
|
|
|
|
} |
15535
|
|
|
|
|
|
} |
15536
|
|
|
|
|
|
} |
15537
|
|
|
|
|
|
} |
15538
|
|
|
|
|
|
#endif |
15539
|
|
|
|
|
|
|
15540
|
|
|
|
|
|
#ifdef DEBUGGING |
15541
|
|
|
|
|
|
|
15542
|
|
|
|
|
|
STATIC void |
15543
|
40
|
|
|
|
|
S_put_byte(pTHX_ SV *sv, int c) |
15544
|
|
|
|
|
|
{ |
15545
|
40
|
|
|
|
|
PERL_ARGS_ASSERT_PUT_BYTE; |
15546
|
|
|
|
|
|
|
15547
|
|
|
|
|
|
/* Our definition of isPRINT() ignores locales, so only bytes that are |
15548
|
|
|
|
|
|
not part of UTF-8 are considered printable. I assume that the same |
15549
|
|
|
|
|
|
holds for UTF-EBCDIC. |
15550
|
|
|
|
|
|
Also, code point 255 is not printable in either (it's E0 in EBCDIC, |
15551
|
|
|
|
|
|
which Wikipedia says: |
15552
|
|
|
|
|
|
|
15553
|
|
|
|
|
|
EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all |
15554
|
|
|
|
|
|
ones (binary 1111 1111, hexadecimal FF). It is similar, but not |
15555
|
|
|
|
|
|
identical, to the ASCII delete (DEL) or rubout control character. ... |
15556
|
|
|
|
|
|
it is typically mapped to hexadecimal code 9F, in order to provide a |
15557
|
|
|
|
|
|
unique character mapping in both directions) |
15558
|
|
|
|
|
|
|
15559
|
|
|
|
|
|
So the old condition can be simplified to !isPRINT(c) */ |
15560
|
40
|
|
|
|
|
if (!isPRINT(c)) { |
15561
|
0
|
|
|
|
|
switch (c) { |
15562
|
0
|
|
|
|
|
case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; |
15563
|
0
|
|
|
|
|
case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; |
15564
|
0
|
|
|
|
|
case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; |
15565
|
0
|
|
|
|
|
case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; |
15566
|
0
|
|
|
|
|
case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; |
15567
|
|
|
|
|
|
|
15568
|
|
|
|
|
|
default: |
15569
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); |
15570
|
0
|
|
|
|
|
break; |
15571
|
|
|
|
|
|
} |
15572
|
|
|
|
|
|
} |
15573
|
|
|
|
|
|
else { |
15574
|
40
|
|
|
|
|
const char string = c; |
15575
|
40
|
|
|
|
|
if (c == '-' || c == ']' || c == '\\' || c == '^') |
15576
|
0
|
|
|
|
|
sv_catpvs(sv, "\\"); |
15577
|
40
|
|
|
|
|
sv_catpvn(sv, &string, 1); |
15578
|
|
|
|
|
|
} |
15579
|
40
|
|
|
|
|
} |
15580
|
|
|
|
|
|
|
15581
|
|
|
|
|
|
STATIC bool |
15582
|
16
|
|
|
|
|
S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) |
15583
|
|
|
|
|
|
{ |
15584
|
|
|
|
|
|
/* Appends to 'sv' a displayable version of the innards of the bracketed |
15585
|
|
|
|
|
|
* character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually |
15586
|
|
|
|
|
|
* output anything */ |
15587
|
|
|
|
|
|
|
15588
|
|
|
|
|
|
int i; |
15589
|
|
|
|
|
|
int rangestart = -1; |
15590
|
|
|
|
|
|
bool has_output_anything = FALSE; |
15591
|
|
|
|
|
|
|
15592
|
16
|
|
|
|
|
PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; |
15593
|
|
|
|
|
|
|
15594
|
4112
|
|
|
|
|
for (i = 0; i <= 256; i++) { |
15595
|
4112
|
|
|
|
|
if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { |
15596
|
60
|
|
|
|
|
if (rangestart == -1) |
15597
|
|
|
|
|
|
rangestart = i; |
15598
|
4052
|
|
|
|
|
} else if (rangestart != -1) { |
15599
|
30
|
|
|
|
|
int j = i - 1; |
15600
|
30
|
|
|
|
|
if (i <= rangestart + 3) { /* Individual chars in short ranges */ |
15601
|
24
|
|
|
|
|
for (; rangestart < i; rangestart++) |
15602
|
24
|
|
|
|
|
put_byte(sv, rangestart); |
15603
|
|
|
|
|
|
} |
15604
|
8
|
|
|
|
|
else if ( j > 255 |
15605
|
8
|
|
|
|
|
|| ! isALPHANUMERIC(rangestart) |
15606
|
8
|
|
|
|
|
|| ! isALPHANUMERIC(j) |
15607
|
8
|
|
|
|
|
|| isDIGIT(rangestart) != isDIGIT(j) |
15608
|
8
|
|
|
|
|
|| isUPPER(rangestart) != isUPPER(j) |
15609
|
8
|
|
|
|
|
|| isLOWER(rangestart) != isLOWER(j) |
15610
|
|
|
|
|
|
|
15611
|
|
|
|
|
|
/* This final test should get optimized out except |
15612
|
|
|
|
|
|
* on EBCDIC platforms, where it causes ranges that |
15613
|
|
|
|
|
|
* cross discontinuities like i/j to be shown as hex |
15614
|
|
|
|
|
|
* instead of the misleading, e.g. H-K (since that |
15615
|
|
|
|
|
|
* range includes more than H, I, J, K). */ |
15616
|
8
|
|
|
|
|
|| (j - rangestart) |
15617
|
|
|
|
|
|
!= NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart)) |
15618
|
|
|
|
|
|
{ |
15619
|
0
|
|
|
|
|
Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}", |
15620
|
|
|
|
|
|
rangestart, |
15621
|
|
|
|
|
|
(j < 256) ? j : 255); |
15622
|
|
|
|
|
|
} |
15623
|
|
|
|
|
|
else { /* Here, the ends of the range are both digits, or both |
15624
|
|
|
|
|
|
uppercase, or both lowercase; and there's no |
15625
|
|
|
|
|
|
discontinuity in the range (which could happen on EBCDIC |
15626
|
|
|
|
|
|
platforms) */ |
15627
|
8
|
|
|
|
|
put_byte(sv, rangestart); |
15628
|
8
|
|
|
|
|
sv_catpvs(sv, "-"); |
15629
|
8
|
|
|
|
|
put_byte(sv, j); |
15630
|
|
|
|
|
|
} |
15631
|
|
|
|
|
|
rangestart = -1; |
15632
|
|
|
|
|
|
has_output_anything = TRUE; |
15633
|
|
|
|
|
|
} |
15634
|
|
|
|
|
|
} |
15635
|
|
|
|
|
|
|
15636
|
16
|
|
|
|
|
return has_output_anything; |
15637
|
|
|
|
|
|
} |
15638
|
|
|
|
|
|
|
15639
|
|
|
|
|
|
#define CLEAR_OPTSTART \ |
15640
|
|
|
|
|
|
if (optstart) STMT_START { \ |
15641
|
|
|
|
|
|
DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ |
15642
|
|
|
|
|
|
optstart=NULL; \ |
15643
|
|
|
|
|
|
} STMT_END |
15644
|
|
|
|
|
|
|
15645
|
|
|
|
|
|
#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); |
15646
|
|
|
|
|
|
|
15647
|
|
|
|
|
|
STATIC const regnode * |
15648
|
132
|
|
|
|
|
S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, |
15649
|
|
|
|
|
|
const regnode *last, const regnode *plast, |
15650
|
|
|
|
|
|
SV* sv, I32 indent, U32 depth) |
15651
|
|
|
|
|
|
{ |
15652
|
|
|
|
|
|
dVAR; |
15653
|
|
|
|
|
|
U8 op = PSEUDO; /* Arbitrary non-END op. */ |
15654
|
|
|
|
|
|
const regnode *next; |
15655
|
|
|
|
|
|
const regnode *optstart= NULL; |
15656
|
|
|
|
|
|
|
15657
|
132
|
|
|
|
|
RXi_GET_DECL(r,ri); |
15658
|
132
|
|
|
|
|
GET_RE_DEBUG_FLAGS_DECL; |
15659
|
|
|
|
|
|
|
15660
|
132
|
|
|
|
|
PERL_ARGS_ASSERT_DUMPUNTIL; |
15661
|
|
|
|
|
|
|
15662
|
|
|
|
|
|
#ifdef DEBUG_DUMPUNTIL |
15663
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, |
15664
|
|
|
|
|
|
last ? last-start : 0,plast ? plast-start : 0); |
15665
|
|
|
|
|
|
#endif |
15666
|
|
|
|
|
|
|
15667
|
132
|
|
|
|
|
if (plast && plast < last) |
15668
|
|
|
|
|
|
last= plast; |
15669
|
|
|
|
|
|
|
15670
|
546
|
|
|
|
|
while (PL_regkind[op] != END && (!last || node < last)) { |
15671
|
|
|
|
|
|
/* While that wasn't END last time... */ |
15672
|
|
|
|
|
|
NODE_ALIGN(node); |
15673
|
414
|
|
|
|
|
op = OP(node); |
15674
|
414
|
|
|
|
|
if (op == CLOSE || op == WHILEM) |
15675
|
6
|
|
|
|
|
indent--; |
15676
|
414
|
|
|
|
|
next = regnext((regnode *)node); |
15677
|
|
|
|
|
|
|
15678
|
|
|
|
|
|
/* Where, what. */ |
15679
|
414
|
|
|
|
|
if (OP(node) == OPTIMIZED) { |
15680
|
50
|
|
|
|
|
if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) |
15681
|
|
|
|
|
|
optstart = node; |
15682
|
|
|
|
|
|
else |
15683
|
|
|
|
|
|
goto after_print; |
15684
|
|
|
|
|
|
} else |
15685
|
364
|
|
|
|
|
CLEAR_OPTSTART; |
15686
|
|
|
|
|
|
|
15687
|
364
|
|
|
|
|
regprop(r, sv, node); |
15688
|
364
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), |
15689
|
364
|
|
|
|
|
(int)(2*indent + 1), "", SvPVX_const(sv)); |
15690
|
|
|
|
|
|
|
15691
|
364
|
|
|
|
|
if (OP(node) != OPTIMIZED) { |
15692
|
364
|
|
|
|
|
if (next == NULL) /* Next ptr. */ |
15693
|
132
|
|
|
|
|
PerlIO_printf(Perl_debug_log, " (0)"); |
15694
|
232
|
|
|
|
|
else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) |
15695
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, " (FAIL)"); |
15696
|
|
|
|
|
|
else |
15697
|
232
|
|
|
|
|
PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); |
15698
|
364
|
|
|
|
|
(void)PerlIO_putc(Perl_debug_log, '\n'); |
15699
|
|
|
|
|
|
} |
15700
|
|
|
|
|
|
|
15701
|
|
|
|
|
|
after_print: |
15702
|
414
|
|
|
|
|
if (PL_regkind[(U8)op] == BRANCHJ) { |
15703
|
0
|
|
|
|
|
assert(next); |
15704
|
|
|
|
|
|
{ |
15705
|
0
|
|
|
|
|
const regnode *nnode = (OP(next) == LONGJMP |
15706
|
|
|
|
|
|
? regnext((regnode *)next) |
15707
|
0
|
|
|
|
|
: next); |
15708
|
0
|
|
|
|
|
if (last && nnode > last) |
15709
|
|
|
|
|
|
nnode = last; |
15710
|
0
|
|
|
|
|
DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); |
15711
|
|
|
|
|
|
} |
15712
|
|
|
|
|
|
} |
15713
|
414
|
|
|
|
|
else if (PL_regkind[(U8)op] == BRANCH) { |
15714
|
0
|
|
|
|
|
assert(next); |
15715
|
0
|
|
|
|
|
DUMPUNTIL(NEXTOPER(node), next); |
15716
|
|
|
|
|
|
} |
15717
|
414
|
|
|
|
|
else if ( PL_regkind[(U8)op] == TRIE ) { |
15718
|
|
|
|
|
|
const regnode *this_trie = node; |
15719
|
8
|
|
|
|
|
const char op = OP(node); |
15720
|
8
|
|
|
|
|
const U32 n = ARG(node); |
15721
|
|
|
|
|
|
const reg_ac_data * const ac = op>=AHOCORASICK ? |
15722
|
8
|
|
|
|
|
(reg_ac_data *)ri->data->data[n] : |
15723
|
|
|
|
|
|
NULL; |
15724
|
8
|
|
|
|
|
const reg_trie_data * const trie = |
15725
|
8
|
|
|
|
|
(reg_trie_data*)ri->data->data[optrie]; |
15726
|
|
|
|
|
|
#ifdef DEBUGGING |
15727
|
8
|
|
|
|
|
AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); |
15728
|
|
|
|
|
|
#endif |
15729
|
|
|
|
|
|
const regnode *nextbranch= NULL; |
15730
|
|
|
|
|
|
I32 word_idx; |
15731
|
8
|
|
|
|
|
sv_setpvs(sv, ""); |
15732
|
64
|
|
|
|
|
for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { |
15733
|
56
|
|
|
|
|
SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); |
15734
|
|
|
|
|
|
|
15735
|
112
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "%*s%s ", |
15736
|
|
|
|
|
|
(int)(2*(indent+3)), "", |
15737
|
56
|
|
|
|
|
elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, |
15738
|
|
|
|
|
|
PL_colors[0], PL_colors[1], |
15739
|
|
|
|
|
|
(SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | |
15740
|
|
|
|
|
|
PERL_PV_PRETTY_ELLIPSES | |
15741
|
|
|
|
|
|
PERL_PV_PRETTY_LTGT |
15742
|
|
|
|
|
|
) |
15743
|
|
|
|
|
|
: "???" |
15744
|
|
|
|
|
|
); |
15745
|
56
|
|
|
|
|
if (trie->jump) { |
15746
|
0
|
|
|
|
|
U16 dist= trie->jump[word_idx+1]; |
15747
|
0
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", |
15748
|
0
|
|
|
|
|
(UV)((dist ? this_trie + dist : next) - start)); |
15749
|
0
|
|
|
|
|
if (dist) { |
15750
|
0
|
|
|
|
|
if (!nextbranch) |
15751
|
0
|
|
|
|
|
nextbranch= this_trie + trie->jump[0]; |
15752
|
0
|
|
|
|
|
DUMPUNTIL(this_trie + dist, nextbranch); |
15753
|
|
|
|
|
|
} |
15754
|
0
|
|
|
|
|
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) |
15755
|
0
|
|
|
|
|
nextbranch= regnext((regnode *)nextbranch); |
15756
|
|
|
|
|
|
} else { |
15757
|
56
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "\n"); |
15758
|
|
|
|
|
|
} |
15759
|
|
|
|
|
|
} |
15760
|
8
|
|
|
|
|
if (last && next > last) |
15761
|
|
|
|
|
|
node= last; |
15762
|
|
|
|
|
|
else |
15763
|
|
|
|
|
|
node= next; |
15764
|
|
|
|
|
|
} |
15765
|
406
|
|
|
|
|
else if ( op == CURLY ) { /* "next" might be very big: optimizer */ |
15766
|
0
|
|
|
|
|
DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, |
15767
|
|
|
|
|
|
NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); |
15768
|
|
|
|
|
|
} |
15769
|
406
|
|
|
|
|
else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { |
15770
|
0
|
|
|
|
|
assert(next); |
15771
|
0
|
|
|
|
|
DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); |
15772
|
|
|
|
|
|
} |
15773
|
406
|
|
|
|
|
else if ( op == PLUS || op == STAR) { |
15774
|
0
|
|
|
|
|
DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); |
15775
|
|
|
|
|
|
} |
15776
|
406
|
|
|
|
|
else if (PL_regkind[(U8)op] == ANYOF) { |
15777
|
|
|
|
|
|
/* arglen 1 + class block */ |
15778
|
2
|
|
|
|
|
node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS) |
15779
|
|
|
|
|
|
? ANYOF_CLASS_SKIP : ANYOF_SKIP); |
15780
|
2
|
|
|
|
|
node = NEXTOPER(node); |
15781
|
|
|
|
|
|
} |
15782
|
404
|
|
|
|
|
else if (PL_regkind[(U8)op] == EXACT) { |
15783
|
|
|
|
|
|
/* Literal string, where present. */ |
15784
|
124
|
|
|
|
|
node += NODE_SZ_STR(node) - 1; |
15785
|
124
|
|
|
|
|
node = NEXTOPER(node); |
15786
|
|
|
|
|
|
} |
15787
|
|
|
|
|
|
else { |
15788
|
280
|
|
|
|
|
node = NEXTOPER(node); |
15789
|
280
|
|
|
|
|
node += regarglen[(U8)op]; |
15790
|
|
|
|
|
|
} |
15791
|
414
|
|
|
|
|
if (op == CURLYX || op == OPEN) |
15792
|
6
|
|
|
|
|
indent++; |
15793
|
|
|
|
|
|
} |
15794
|
132
|
|
|
|
|
CLEAR_OPTSTART; |
15795
|
|
|
|
|
|
#ifdef DEBUG_DUMPUNTIL |
15796
|
|
|
|
|
|
PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); |
15797
|
|
|
|
|
|
#endif |
15798
|
132
|
|
|
|
|
return node; |
15799
|
1402
|
|
|
|
|
} |
15800
|
|
|
|
|
|
|
15801
|
|
|
|
|
|
#endif /* DEBUGGING */ |
15802
|
|
|
|
|
|
|
15803
|
|
|
|
|
|
/* |
15804
|
|
|
|
|
|
* Local variables: |
15805
|
|
|
|
|
|
* c-indentation-style: bsd |
15806
|
|
|
|
|
|
* c-basic-offset: 4 |
15807
|
|
|
|
|
|
* indent-tabs-mode: nil |
15808
|
|
|
|
|
|
* End: |
15809
|
|
|
|
|
|
* |
15810
|
|
|
|
|
|
* ex: set ts=8 sts=4 sw=4 et: |
15811
|
|
|
|
|
|
*/ |