line |
stmt |
bran |
cond |
sub |
time |
code |
1
|
|
|
|
|
|
/* regexp.h |
2
|
|
|
|
|
|
* |
3
|
|
|
|
|
|
* Copyright (C) 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2003, |
4
|
|
|
|
|
|
* 2005, 2006, 2007, 2008 by Larry Wall and others |
5
|
|
|
|
|
|
* |
6
|
|
|
|
|
|
* You may distribute under the terms of either the GNU General Public |
7
|
|
|
|
|
|
* License or the Artistic License, as specified in the README file. |
8
|
|
|
|
|
|
* |
9
|
|
|
|
|
|
*/ |
10
|
|
|
|
|
|
|
11
|
|
|
|
|
|
/* |
12
|
|
|
|
|
|
* Definitions etc. for regexp(3) routines. |
13
|
|
|
|
|
|
* |
14
|
|
|
|
|
|
* Caveat: this is V8 regexp(3) [actually, a reimplementation thereof], |
15
|
|
|
|
|
|
* not the System V one. |
16
|
|
|
|
|
|
*/ |
17
|
|
|
|
|
|
#ifndef PLUGGABLE_RE_EXTENSION |
18
|
|
|
|
|
|
/* we don't want to include this stuff if we are inside of |
19
|
|
|
|
|
|
an external regex engine based on the core one - like re 'debug'*/ |
20
|
|
|
|
|
|
|
21
|
|
|
|
|
|
#include "utf8.h" |
22
|
|
|
|
|
|
|
23
|
|
|
|
|
|
struct regnode { |
24
|
|
|
|
|
|
U8 flags; |
25
|
|
|
|
|
|
U8 type; |
26
|
|
|
|
|
|
U16 next_off; |
27
|
|
|
|
|
|
}; |
28
|
|
|
|
|
|
|
29
|
|
|
|
|
|
typedef struct regnode regnode; |
30
|
|
|
|
|
|
|
31
|
|
|
|
|
|
struct reg_substr_data; |
32
|
|
|
|
|
|
|
33
|
|
|
|
|
|
struct reg_data; |
34
|
|
|
|
|
|
|
35
|
|
|
|
|
|
struct regexp_engine; |
36
|
|
|
|
|
|
struct regexp; |
37
|
|
|
|
|
|
|
38
|
|
|
|
|
|
struct reg_substr_datum { |
39
|
|
|
|
|
|
SSize_t min_offset; |
40
|
|
|
|
|
|
SSize_t max_offset; |
41
|
|
|
|
|
|
SV *substr; /* non-utf8 variant */ |
42
|
|
|
|
|
|
SV *utf8_substr; /* utf8 variant */ |
43
|
|
|
|
|
|
SSize_t end_shift; |
44
|
|
|
|
|
|
}; |
45
|
|
|
|
|
|
struct reg_substr_data { |
46
|
|
|
|
|
|
struct reg_substr_datum data[3]; /* Actual array */ |
47
|
|
|
|
|
|
}; |
48
|
|
|
|
|
|
|
49
|
|
|
|
|
|
/* SV_SAVED_COPY expands to an extra struct member declaration (including
 * its terminating ';') when PERL_ANY_COW is defined, and to nothing
 * otherwise, so it can be dropped into a member list unconditionally. */
#ifdef PERL_ANY_COW
#define SV_SAVED_COPY   SV *saved_copy; /* If non-NULL, SV which is COW from original */
#else
#define SV_SAVED_COPY
#endif
54
|
|
|
|
|
|
|
55
|
|
|
|
|
|
/* offsets within a string of a particular /(.)/ capture */ |
56
|
|
|
|
|
|
|
57
|
|
|
|
|
|
typedef struct regexp_paren_pair { |
58
|
|
|
|
|
|
SSize_t start; |
59
|
|
|
|
|
|
SSize_t end; |
60
|
|
|
|
|
|
/* 'start_tmp' records a new opening position before the matching end |
61
|
|
|
|
|
|
* has been found, so that the old start and end values are still |
62
|
|
|
|
|
|
* valid, e.g. |
63
|
|
|
|
|
|
* "abc" =~ /(.(?{print "[$1]"}))+/ |
64
|
|
|
|
|
|
*outputs [][a][b] |
65
|
|
|
|
|
|
* This field is not part of the API. */ |
66
|
|
|
|
|
|
SSize_t start_tmp; |
67
|
|
|
|
|
|
} regexp_paren_pair; |
68
|
|
|
|
|
|
|
69
|
|
|
|
|
|
/* Inversion-list set operations, only visible to regcomp.c and utf8.c.
 * Each is a thin wrapper that fixes the 'complement 2nd operand' argument
 * of the corresponding *_maybe_complement_2nd routine. */
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_UTF8_C)
#define _invlist_union(a, b, output) \
    _invlist_union_maybe_complement_2nd(a, b, FALSE, output)
#define _invlist_intersection(a, b, output) \
    _invlist_intersection_maybe_complement_2nd(a, b, FALSE, output)

/* Subtracting b from a leaves in a everything that was there that isn't in b,
 * that is the intersection of a with b's complement */
#define _invlist_subtract(a, b, output) \
    _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
#endif
77
|
|
|
|
|
|
|
78
|
|
|
|
|
|
/* record the position of a (?{...}) within a pattern */ |
79
|
|
|
|
|
|
|
80
|
|
|
|
|
|
struct reg_code_block { |
81
|
|
|
|
|
|
STRLEN start; |
82
|
|
|
|
|
|
STRLEN end; |
83
|
|
|
|
|
|
OP *block; |
84
|
|
|
|
|
|
REGEXP *src_regex; |
85
|
|
|
|
|
|
}; |
86
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
88
|
|
|
|
|
|
/* |
89
|
|
|
|
|
|
The regexp/REGEXP struct, see L<perlreapi> for further documentation |
90
|
|
|
|
|
|
on the individual fields. The struct is ordered so that the most |
91
|
|
|
|
|
|
commonly used fields are placed at the start. |
92
|
|
|
|
|
|
|
93
|
|
|
|
|
|
Any patch that adds items to this struct will need to include |
94
|
|
|
|
|
|
changes to F<sv.c> (C<Perl_re_dup()>) and F<regcomp.c> |
95
|
|
|
|
|
|
(C<pregfree()>). This involves freeing or cloning items in the |
96
|
|
|
|
|
|
regexp's data array based on the data item's type. |
97
|
|
|
|
|
|
*/ |
98
|
|
|
|
|
|
|
99
|
|
|
|
|
|
/* Member list shared by regexp bodies.  It expands to a run of member
 * declarations; the last one (qr_anoncv) deliberately has no ';' so that
 * the user of the macro writes "_REGEXP_COMMON;". */
#define _REGEXP_COMMON                                                       \
        /* what engine created this regexp? */                               \
        const struct regexp_engine* engine;                                  \
        REGEXP *mother_re; /* what re is this a lightweight copy of? */      \
        HV *paren_names;   /* Optional hash of paren names */                \
        /* Information about the match that the perl core uses to */         \
        /* manage things */                                                  \
        U32 extflags;   /* Flags used both externally and internally */      \
        SSize_t minlen; /* minimum possible number of chars in string to match */ \
        SSize_t minlenret; /* minimum possible number of chars in $& */      \
        STRLEN gofs;    /* chars left of pos that we search from */          \
        /* substring data about strings that must appear in the */           \
        /* final match, used for optimisations */                            \
        struct reg_substr_data *substrs;                                     \
        U32 nparens;    /* number of capture buffers */                      \
        /* private engine specific data */                                   \
        U32 intflags;   /* Engine Specific Internal flags */                 \
        void *pprivate; /* Data private to the regex engine which */         \
                        /* created this object. */                           \
        /* Data about the last/current match. These are modified */         \
        /* during matching */                                                \
        U32 lastparen;          /* last open paren matched */                \
        U32 lastcloseparen;     /* last close paren matched */               \
        /* Array of offsets for (@-) and (@+) */                             \
        regexp_paren_pair *offs;                                             \
        /* saved or original string so \digit works forever. */              \
        char *subbeg;                                                        \
        SV_SAVED_COPY   /* If non-NULL, SV which is COW from original */     \
        SSize_t sublen;     /* Length of string pointed by subbeg */         \
        SSize_t suboffset;  /* byte offset of subbeg from logical start of str */ \
        SSize_t subcoffset; /* suboffset equiv, but in chars (for @-/@+) */  \
        /* Information about the match that isn't often used */              \
        /* offset from wrapped to the start of precomp */                    \
        PERL_BITFIELD32 pre_prefix:4;                                        \
        /* original flags used to compile the pattern, may differ */         \
        /* from extflags in various ways */                                  \
        PERL_BITFIELD32 compflags:9;                                         \
        CV *qr_anoncv   /* the anon sub wrapped round qr/(?{..})/ */
137
|
|
|
|
|
|
|
138
|
|
|
|
|
|
typedef struct regexp { |
139
|
|
|
|
|
|
_XPV_HEAD; |
140
|
|
|
|
|
|
_REGEXP_COMMON; |
141
|
|
|
|
|
|
} regexp; |
142
|
|
|
|
|
|
|
143
|
|
|
|
|
|
/* Accessor for the 'paren_names' member of the structure 'rx' points to. */
#define RXp_PAREN_NAMES(rx)     ((rx)->paren_names)
144
|
|
|
|
|
|
|
145
|
|
|
|
|
|
/* used for high speed searches */ |
146
|
|
|
|
|
|
typedef struct re_scream_pos_data_s |
147
|
|
|
|
|
|
{ |
148
|
|
|
|
|
|
char **scream_olds; /* match pos */ |
149
|
|
|
|
|
|
SSize_t *scream_pos; /* Internal iterator of scream. */ |
150
|
|
|
|
|
|
} re_scream_pos_data; |
151
|
|
|
|
|
|
|
152
|
|
|
|
|
|
/* regexp_engine structure. This is the dispatch table for regexes. |
153
|
|
|
|
|
|
* Any regex engine implementation must be able to build one of these. |
154
|
|
|
|
|
|
*/ |
155
|
|
|
|
|
|
typedef struct regexp_engine { |
156
|
|
|
|
|
|
REGEXP* (*comp) (pTHX_ SV * const pattern, U32 flags); |
157
|
|
|
|
|
|
I32 (*exec) (pTHX_ REGEXP * const rx, char* stringarg, char* strend, |
158
|
|
|
|
|
|
char* strbeg, SSize_t minend, SV* sv, |
159
|
|
|
|
|
|
void* data, U32 flags); |
160
|
|
|
|
|
|
char* (*intuit) (pTHX_ |
161
|
|
|
|
|
|
REGEXP * const rx, |
162
|
|
|
|
|
|
SV *sv, |
163
|
|
|
|
|
|
const char * const strbeg, |
164
|
|
|
|
|
|
char *strpos, |
165
|
|
|
|
|
|
char *strend, |
166
|
|
|
|
|
|
const U32 flags, |
167
|
|
|
|
|
|
re_scream_pos_data *data); |
168
|
|
|
|
|
|
SV* (*checkstr) (pTHX_ REGEXP * const rx); |
169
|
|
|
|
|
|
void (*free) (pTHX_ REGEXP * const rx); |
170
|
|
|
|
|
|
void (*numbered_buff_FETCH) (pTHX_ REGEXP * const rx, const I32 paren, |
171
|
|
|
|
|
|
SV * const sv); |
172
|
|
|
|
|
|
void (*numbered_buff_STORE) (pTHX_ REGEXP * const rx, const I32 paren, |
173
|
|
|
|
|
|
SV const * const value); |
174
|
|
|
|
|
|
I32 (*numbered_buff_LENGTH) (pTHX_ REGEXP * const rx, const SV * const sv, |
175
|
|
|
|
|
|
const I32 paren); |
176
|
|
|
|
|
|
SV* (*named_buff) (pTHX_ REGEXP * const rx, SV * const key, |
177
|
|
|
|
|
|
SV * const value, const U32 flags); |
178
|
|
|
|
|
|
SV* (*named_buff_iter) (pTHX_ REGEXP * const rx, const SV * const lastkey, |
179
|
|
|
|
|
|
const U32 flags); |
180
|
|
|
|
|
|
SV* (*qr_package)(pTHX_ REGEXP * const rx); |
181
|
|
|
|
|
|
#ifdef USE_ITHREADS |
182
|
|
|
|
|
|
void* (*dupe) (pTHX_ REGEXP * const rx, CLONE_PARAMS *param); |
183
|
|
|
|
|
|
#endif |
184
|
|
|
|
|
|
REGEXP* (*op_comp) (pTHX_ SV ** const patternp, int pat_count, |
185
|
|
|
|
|
|
OP *expr, const struct regexp_engine* eng, |
186
|
|
|
|
|
|
REGEXP *VOL old_re, |
187
|
|
|
|
|
|
bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags); |
188
|
|
|
|
|
|
} regexp_engine; |
189
|
|
|
|
|
|
|
190
|
|
|
|
|
|
/* |
191
|
|
|
|
|
|
These are passed to the numbered capture variable callbacks as the |
192
|
|
|
|
|
|
paren name. >= 1 is reserved for actual numbered captures, i.e. $1, |
193
|
|
|
|
|
|
$2 etc. |
194
|
|
|
|
|
|
*/ |
195
|
|
|
|
|
|
/* Special 'paren' values for the numbered capture callbacks.  Values <= 0
 * name the whole-match / pre-match / post-match variables; >= 1 are the
 * ordinary numbered captures. */
#define RX_BUFF_IDX_CARET_PREMATCH  -5 /* ${^PREMATCH}  */
#define RX_BUFF_IDX_CARET_POSTMATCH -4 /* ${^POSTMATCH} */
#define RX_BUFF_IDX_CARET_FULLMATCH -3 /* ${^MATCH}     */
#define RX_BUFF_IDX_PREMATCH        -2 /* $` */
#define RX_BUFF_IDX_POSTMATCH       -1 /* $' */
#define RX_BUFF_IDX_FULLMATCH        0 /* $& */
201
|
|
|
|
|
|
|
202
|
|
|
|
|
|
/* |
203
|
|
|
|
|
|
Flags that are passed to the named_buff and named_buff_iter |
204
|
|
|
|
|
|
callbacks above. Those routines are called from universal.c via the |
205
|
|
|
|
|
|
Tie::Hash::NamedCapture interface for %+ and %- and the re:: |
206
|
|
|
|
|
|
functions in the same file. |
207
|
|
|
|
|
|
*/ |
208
|
|
|
|
|
|
|
209
|
|
|
|
|
|
/* The Tie::Hash::NamedCapture operation this is part of, if any */ |
210
|
|
|
|
|
|
/* Flag bits for the named_buff / named_buff_iter callbacks.  Every value
 * is a distinct single bit so they can be OR-ed together. */

/* The Tie::Hash::NamedCapture operation this is part of, if any */
#define RXapif_FETCH     0x0001
#define RXapif_STORE     0x0002
#define RXapif_DELETE    0x0004
#define RXapif_CLEAR     0x0008
#define RXapif_EXISTS    0x0010
#define RXapif_SCALAR    0x0020
#define RXapif_FIRSTKEY  0x0040
#define RXapif_NEXTKEY   0x0080

/* Whether %+ or %- is being operated on */
#define RXapif_ONE       0x0100 /* %+ */
#define RXapif_ALL       0x0200 /* %- */

/* Whether this is being called from a re:: function */
#define RXapif_REGNAME         0x0400
#define RXapif_REGNAMES        0x0800
#define RXapif_REGNAMES_COUNT  0x1000
227
|
|
|
|
|
|
|
228
|
|
|
|
|
|
/* |
229
|
|
|
|
|
|
=head1 REGEXP Functions |
230
|
|
|
|
|
|
|
231
|
|
|
|
|
|
=for apidoc Am|REGEXP *|SvRX|SV *sv |
232
|
|
|
|
|
|
|
233
|
|
|
|
|
|
Convenience macro to get the REGEXP from a SV. This is approximately |
234
|
|
|
|
|
|
equivalent to the following snippet: |
235
|
|
|
|
|
|
|
236
|
|
|
|
|
|
if (SvMAGICAL(sv)) |
237
|
|
|
|
|
|
mg_get(sv); |
238
|
|
|
|
|
|
if (SvROK(sv)) |
239
|
|
|
|
|
|
sv = MUTABLE_SV(SvRV(sv)); |
240
|
|
|
|
|
|
if (SvTYPE(sv) == SVt_REGEXP) |
241
|
|
|
|
|
|
return (REGEXP*) sv; |
242
|
|
|
|
|
|
|
243
|
|
|
|
|
|
NULL will be returned if a REGEXP* is not found. |
244
|
|
|
|
|
|
|
245
|
|
|
|
|
|
=for apidoc Am|bool|SvRXOK|SV* sv |
246
|
|
|
|
|
|
|
247
|
|
|
|
|
|
Returns a boolean indicating whether the SV (or the one it references) |
248
|
|
|
|
|
|
is a REGEXP. |
249
|
|
|
|
|
|
|
250
|
|
|
|
|
|
If you want to do something with the REGEXP* later use SvRX instead |
251
|
|
|
|
|
|
and check for NULL. |
252
|
|
|
|
|
|
|
253
|
|
|
|
|
|
=cut |
254
|
|
|
|
|
|
*/ |
255
|
|
|
|
|
|
|
256
|
|
|
|
|
|
/* SvRX yields whatever Perl_get_re_arg() returns for 'sv'; SvRXOK reduces
 * that result to TRUE/FALSE.  Both call Perl_get_re_arg() once. */
#define SvRX(sv)   (Perl_get_re_arg(aTHX_ sv))
#define SvRXOK(sv) (Perl_get_re_arg(aTHX_ sv) ? TRUE : FALSE)
258
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
260
|
|
|
|
|
|
/* Flags stored in regexp->extflags |
261
|
|
|
|
|
|
* These are used by code external to the regexp engine |
262
|
|
|
|
|
|
* |
263
|
|
|
|
|
|
* Note that the flags whose names start with RXf_PMf_ are defined in |
264
|
|
|
|
|
|
* op_reg_common.h, being copied from the parallel flags of op_pmflags |
265
|
|
|
|
|
|
* |
266
|
|
|
|
|
|
* NOTE: if you modify any RXf flags you should run regen.pl or |
267
|
|
|
|
|
|
* regen/regcomp.pl so that regnodes.h is updated with the changes. |
268
|
|
|
|
|
|
* |
269
|
|
|
|
|
|
*/ |
270
|
|
|
|
|
|
|
271
|
|
|
|
|
|
#include "op_reg_common.h" |
272
|
|
|
|
|
|
|
273
|
|
|
|
|
|
/* Mask of the four standard pattern-modifier flag bits (/m /s /i /x). */
#define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED)

/* A run of 'case' labels for use inside a switch over a modifier
 * character: each sets the matching bit in *(pmfl).  The final 'break'
 * has no ';' so the caller supplies it. */
#define CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl)                        \
    case IGNORE_PAT_MOD:    *(pmfl) |= RXf_PMf_FOLD;       break;   \
    case MULTILINE_PAT_MOD: *(pmfl) |= RXf_PMf_MULTILINE;  break;   \
    case SINGLE_PAT_MOD:    *(pmfl) |= RXf_PMf_SINGLELINE; break;   \
    case XTENDED_PAT_MOD:   *(pmfl) |= RXf_PMf_EXTENDED;   break

/* Note, includes charset ones, assumes 0 is the default for them */
#define STD_PMMOD_FLAGS_CLEAR(pmfl)                        \
    *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET)
284
|
|
|
|
|
|
|
285
|
|
|
|
|
|
/* chars and strings used as regex pattern modifiers |
286
|
|
|
|
|
|
* Singular is a 'c'har, plural is a "string" |
287
|
|
|
|
|
|
* |
288
|
|
|
|
|
|
* NOTE, KEEPCOPY was originally 'k', but was changed to 'p' for preserve |
289
|
|
|
|
|
|
* for compatibility reasons with Regexp::Common which hijacked (?k:...) |
290
|
|
|
|
|
|
* for its own uses. So 'k' is out as well. |
291
|
|
|
|
|
|
*/ |
292
|
|
|
|
|
|
/* Pattern modifier characters (singular names) ... */
#define DEFAULT_PAT_MOD      '^'    /* Short for all the default modifiers */
#define EXEC_PAT_MOD         'e'
#define KEEPCOPY_PAT_MOD     'p'
#define ONCE_PAT_MOD         'o'
#define GLOBAL_PAT_MOD       'g'
#define CONTINUE_PAT_MOD     'c'
#define MULTILINE_PAT_MOD    'm'
#define SINGLE_PAT_MOD       's'
#define IGNORE_PAT_MOD       'i'
#define XTENDED_PAT_MOD      'x'
#define NONDESTRUCT_PAT_MOD  'r'
#define LOCALE_PAT_MOD       'l'
#define UNICODE_PAT_MOD      'u'
#define DEPENDS_PAT_MOD      'd'
#define ASCII_RESTRICT_PAT_MOD 'a'

/* ... and the same modifiers as string literals (plural names), so that
 * they can be combined by adjacent-literal concatenation below. */
#define ONCE_PAT_MODS        "o"
#define KEEPCOPY_PAT_MODS    "p"
#define EXEC_PAT_MODS        "e"
#define LOOP_PAT_MODS        "gc"
#define NONDESTRUCT_PAT_MODS "r"
#define LOCALE_PAT_MODS      "l"
#define UNICODE_PAT_MODS     "u"
#define DEPENDS_PAT_MODS     "d"
#define ASCII_RESTRICT_PAT_MODS "a"
#define ASCII_MORE_RESTRICT_PAT_MODS "aa"

/* This string is expected by regcomp.c to be ordered so that the first
 * character is the flag in bit RXf_PMf_STD_PMMOD_SHIFT of extflags; the next
 * character is bit +1, etc. */
#define STD_PAT_MODS        "msix"

#define CHARSET_PAT_MODS    ASCII_RESTRICT_PAT_MODS DEPENDS_PAT_MODS LOCALE_PAT_MODS UNICODE_PAT_MODS

/* This string is expected by XS_re_regexp_pattern() in universal.c to be ordered
 * so that the first character is the flag in bit RXf_PMf_STD_PMMOD_SHIFT of
 * extflags; the next character is in bit +1, etc. */
#define INT_PAT_MODS    STD_PAT_MODS    KEEPCOPY_PAT_MODS

/* Composite modifier sets, each built from the previous one. */
#define EXT_PAT_MODS    ONCE_PAT_MODS   KEEPCOPY_PAT_MODS
#define QR_PAT_MODS     STD_PAT_MODS    EXT_PAT_MODS    CHARSET_PAT_MODS
#define M_PAT_MODS      QR_PAT_MODS     LOOP_PAT_MODS
#define S_PAT_MODS      M_PAT_MODS      EXEC_PAT_MODS   NONDESTRUCT_PAT_MODS
335
|
|
|
|
|
|
|
336
|
|
|
|
|
|
/* |
337
|
|
|
|
|
|
* NOTE: if you modify any RXf flags you should run regen.pl or |
338
|
|
|
|
|
|
* regen/regcomp.pl so that regnodes.h is updated with the changes. |
339
|
|
|
|
|
|
* |
340
|
|
|
|
|
|
*/ |
341
|
|
|
|
|
|
|
342
|
|
|
|
|
|
/* Leave some space, so future bit allocations can go either in the shared or |
343
|
|
|
|
|
|
* unshared area without affecting binary compatibility */ |
344
|
|
|
|
|
|
/* First bit position available to the RXf_* flags defined in this file;
 * taken from _RXf_PMf_SHIFT_NEXT, which this file does not define. */
#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT)

/*
  Set in Perl_pmruntime if op_flags & OPf_SPECIAL, i.e. split. Will
  be used by regex engines to check whether they should set
  RXf_SKIPWHITE
*/
#define RXf_SPLIT                (1<<(RXf_BASE_SHIFT-1))
/* Compile-time guard: the bit just below RXf_BASE_SHIFT must be the one
 * named RXf_PMf_SPLIT. */
#if RXf_SPLIT != RXf_PMf_SPLIT
#   error "RXf_SPLIT does not match RXf_PMf_SPLIT"
#endif
355
|
|
|
|
|
|
|
356
|
|
|
|
|
|
/* Manually decorate this function with gcc-style attributes just to |
357
|
|
|
|
|
|
* avoid having to restructure the header files and their called order, |
358
|
|
|
|
|
|
* as proto.h would have to be included before this file, and isn't */ |
359
|
|
|
|
|
|
|
360
|
|
|
|
|
|
PERL_STATIC_INLINE const char * |
361
|
|
|
|
|
|
get_regex_charset_name(const U32 flags, STRLEN* const lenp) |
362
|
|
|
|
|
|
__attribute__warn_unused_result__; |
363
|
|
|
|
|
|
|
364
|
|
|
|
|
|
#define MAX_CHARSET_NAME_LENGTH 2 |
365
|
|
|
|
|
|
|
366
|
|
|
|
|
|
PERL_STATIC_INLINE const char * |
367
|
|
|
|
|
|
get_regex_charset_name(const U32 flags, STRLEN* const lenp) |
368
|
|
|
|
|
|
{ |
369
|
|
|
|
|
|
/* Returns a string that corresponds to the name of the regex character set |
370
|
|
|
|
|
|
* given by 'flags', and *lenp is set the length of that string, which |
371
|
|
|
|
|
|
* cannot exceed MAX_CHARSET_NAME_LENGTH characters */ |
372
|
|
|
|
|
|
|
373
|
|
|
|
|
|
*lenp = 1; |
374
|
20
|
|
|
|
|
switch (get_regex_charset(flags)) { |
375
|
|
|
|
|
|
case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS; |
376
|
|
|
|
|
|
case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS; |
377
|
|
|
|
|
|
case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS; |
378
|
|
|
|
|
|
case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS; |
379
|
|
|
|
|
|
case REGEX_ASCII_MORE_RESTRICTED_CHARSET: |
380
|
|
|
|
|
|
*lenp = 2; |
381
|
|
|
|
|
|
return ASCII_MORE_RESTRICT_PAT_MODS; |
382
|
|
|
|
|
|
default: |
383
|
|
|
|
|
|
return "?"; /* Unknown */ |
384
|
|
|
|
|
|
} |
385
|
|
|
|
|
|
} |
386
|
|
|
|
|
|
|
387
|
|
|
|
|
|
/* Anchor and GPOS related stuff */ |
388
|
|
|
|
|
|
/* Bits stored in regexp->extflags, allocated upwards from RXf_BASE_SHIFT.
 * The #if at the end rejects a layout whose highest bit would not fit in
 * 32 bits. */

/* Anchor and GPOS related stuff */
#define RXf_ANCH_BOL            (1<<(RXf_BASE_SHIFT+0))
#define RXf_ANCH_MBOL           (1<<(RXf_BASE_SHIFT+1))
#define RXf_ANCH_SBOL           (1<<(RXf_BASE_SHIFT+2))
#define RXf_ANCH_GPOS           (1<<(RXf_BASE_SHIFT+3))
#define RXf_GPOS_SEEN           (1<<(RXf_BASE_SHIFT+4))
#define RXf_GPOS_FLOAT          (1<<(RXf_BASE_SHIFT+5))
/* two bits here */
#define RXf_ANCH                (RXf_ANCH_BOL|RXf_ANCH_MBOL|RXf_ANCH_GPOS|RXf_ANCH_SBOL)
#define RXf_GPOS_CHECK          (RXf_GPOS_SEEN|RXf_ANCH_GPOS)
#define RXf_ANCH_SINGLE         (RXf_ANCH_SBOL|RXf_ANCH_GPOS)

/* What we have seen */
#define RXf_NO_INPLACE_SUBST    (1<<(RXf_BASE_SHIFT+6))
#define RXf_EVAL_SEEN           (1<<(RXf_BASE_SHIFT+7))
#define RXf_CANY_SEEN           (1<<(RXf_BASE_SHIFT+8))

/* Special */
#define RXf_NOSCAN              (1<<(RXf_BASE_SHIFT+9))
#define RXf_CHECK_ALL           (1<<(RXf_BASE_SHIFT+10))

/* UTF8 related */
#define RXf_MATCH_UTF8          (1<<(RXf_BASE_SHIFT+11)) /* $1 etc are utf8 */

/* Intuit related */
#define RXf_USE_INTUIT_NOML     (1<<(RXf_BASE_SHIFT+12))
#define RXf_USE_INTUIT_ML       (1<<(RXf_BASE_SHIFT+13))
#define RXf_INTUIT_TAIL         (1<<(RXf_BASE_SHIFT+14))
#define RXf_USE_INTUIT          (RXf_USE_INTUIT_NOML|RXf_USE_INTUIT_ML)

/* Copy and tainted info */
#define RXf_COPY_DONE           (1<<(RXf_BASE_SHIFT+16))

/* during execution: pattern temporarily tainted by executing locale ops;
 * post-execution: $1 et al are tainted */
#define RXf_TAINTED_SEEN        (1<<(RXf_BASE_SHIFT+17))
/* this pattern was tainted during compilation */
#define RXf_TAINTED             (1<<(RXf_BASE_SHIFT+18))

/* Flags indicating special patterns */
#define RXf_START_ONLY          (1<<(RXf_BASE_SHIFT+19)) /* Pattern is /^/ */
#define RXf_SKIPWHITE           (1<<(RXf_BASE_SHIFT+20)) /* Pattern is for a split " " */
#define RXf_WHITE               (1<<(RXf_BASE_SHIFT+21)) /* Pattern is /\s+/ */
#define RXf_NULL                (1U<<(RXf_BASE_SHIFT+22)) /* Pattern is // */
#if RXf_BASE_SHIFT+22 > 31
#   error Too many RXf_PMf bits used.  See regnodes.h for any spare in middle
#endif
434
|
|
|
|
|
|
|
435
|
|
|
|
|
|
/* |
436
|
|
|
|
|
|
* NOTE: if you modify any RXf flags you should run regen.pl or |
437
|
|
|
|
|
|
* regen/regcomp.pl so that regnodes.h is updated with the changes. |
438
|
|
|
|
|
|
* |
439
|
|
|
|
|
|
*/ |
440
|
|
|
|
|
|
|
441
|
|
|
|
|
|
/* Taint bookkeeping on a regexp's extflags.  When NO_TAINT_SUPPORT is
 * true the tests are constant 0 and the setters expand to NOOP; otherwise
 * they read or update the RXf_TAINTED / RXf_TAINTED_SEEN bits. */
#if NO_TAINT_SUPPORT
#   define RX_ISTAINTED(prog)           0
#   define RX_TAINT_on(prog)            NOOP
#   define RXp_MATCH_TAINTED(prog)      0
#   define RX_MATCH_TAINTED(prog)       0
#   define RXp_MATCH_TAINTED_on(prog)   NOOP
#   define RX_MATCH_TAINTED_on(prog)    NOOP
#   define RX_MATCH_TAINTED_off(prog)   NOOP
#else
#   define RX_ISTAINTED(prog)           (RX_EXTFLAGS(prog) & RXf_TAINTED)
#   define RX_TAINT_on(prog)            (RX_EXTFLAGS(prog) |= RXf_TAINTED)
#   define RXp_MATCH_TAINTED(prog)      (RXp_EXTFLAGS(prog) & RXf_TAINTED_SEEN)
#   define RX_MATCH_TAINTED(prog)       (RX_EXTFLAGS(prog)  & RXf_TAINTED_SEEN)
#   define RXp_MATCH_TAINTED_on(prog)   (RXp_EXTFLAGS(prog) |= RXf_TAINTED_SEEN)
#   define RX_MATCH_TAINTED_on(prog)    (RX_EXTFLAGS(prog)  |= RXf_TAINTED_SEEN)
#   define RX_MATCH_TAINTED_off(prog)   (RX_EXTFLAGS(prog)  &= ~RXf_TAINTED_SEEN)
#endif

/* Tests the PREGf_CUTGROUP_SEEN bit of prog->intflags. */
#define RX_HAS_CUTGROUP(prog) ((prog)->intflags & PREGf_CUTGROUP_SEEN)
/* Turns the "match tainted" state on or off according to 't'. */
#define RX_MATCH_TAINTED_set(prog, t)   ((t) \
                                         ? RX_MATCH_TAINTED_on(prog) \
                                         : RX_MATCH_TAINTED_off(prog))

/* Test, set and clear the RXf_COPY_DONE bit.  The RXp_ forms take a
 * pointer that has an 'extflags' member; the RX_ forms go through
 * RX_EXTFLAGS(). */
#define RXp_MATCH_COPIED(prog)          (RXp_EXTFLAGS(prog) & RXf_COPY_DONE)
#define RX_MATCH_COPIED(prog)           (RX_EXTFLAGS(prog)  & RXf_COPY_DONE)
#define RXp_MATCH_COPIED_on(prog)       (RXp_EXTFLAGS(prog) |= RXf_COPY_DONE)
#define RX_MATCH_COPIED_on(prog)        (RX_EXTFLAGS(prog)  |= RXf_COPY_DONE)
#define RXp_MATCH_COPIED_off(prog)      (RXp_EXTFLAGS(prog) &= ~RXf_COPY_DONE)
#define RX_MATCH_COPIED_off(prog)       (RX_EXTFLAGS(prog)  &= ~RXf_COPY_DONE)
#define RX_MATCH_COPIED_set(prog,t)     ((t) \
                                         ? RX_MATCH_COPIED_on(prog) \
                                         : RX_MATCH_COPIED_off(prog))
473
|
|
|
|
|
|
|
474
|
|
|
|
|
|
/* Member accessors.  The RXp_ forms dereference their argument directly;
 * the RX_ forms first pass it through ReANY(). */
#define RXp_EXTFLAGS(rx)        ((rx)->extflags)
#define RXp_COMPFLAGS(rx)        ((rx)->compflags)

/* For source compatibility. We used to store these explicitly.  */
#define RX_PRECOMP(prog)        (RX_WRAPPED(prog) + ReANY(prog)->pre_prefix)
#define RX_PRECOMP_const(prog)  (RX_WRAPPED_const(prog) + ReANY(prog)->pre_prefix)
/* FIXME? Are we hardcoding too much here and constraining plugin extension
   writers? Specifically, the value 1 assumes that the wrapped version always
   has exactly one character at the end, a ')'. Will that always be true?  */
#define RX_PRELEN(prog)         (RX_WRAPLEN(prog) - ReANY(prog)->pre_prefix - 1)
#define RX_WRAPPED(prog)        ReANY(prog)->xpv_len_u.xpvlenu_pv
#define RX_WRAPPED_const(prog)  ((const char *)RX_WRAPPED(prog))
#define RX_WRAPLEN(prog)        SvCUR(prog)
#define RX_CHECK_SUBSTR(prog)   (ReANY(prog)->check_substr)
#define RX_REFCNT(prog)         SvREFCNT(prog)
#define RX_EXTFLAGS(prog)       RXp_EXTFLAGS(ReANY(prog))
#define RX_COMPFLAGS(prog)      RXp_COMPFLAGS(ReANY(prog))
#define RX_ENGINE(prog)         (ReANY(prog)->engine)
#define RX_SUBBEG(prog)         (ReANY(prog)->subbeg)
#define RX_SUBOFFSET(prog)      (ReANY(prog)->suboffset)
#define RX_SUBCOFFSET(prog)     (ReANY(prog)->subcoffset)
#define RX_OFFS(prog)           (ReANY(prog)->offs)
#define RX_NPARENS(prog)        (ReANY(prog)->nparens)
#define RX_SUBLEN(prog)         (ReANY(prog)->sublen)
#define RX_MINLEN(prog)         (ReANY(prog)->minlen)
#define RX_MINLENRET(prog)      (ReANY(prog)->minlenret)
#define RX_GOFS(prog)           (ReANY(prog)->gofs)
#define RX_LASTPAREN(prog)      (ReANY(prog)->lastparen)
#define RX_LASTCLOSEPAREN(prog) (ReANY(prog)->lastcloseparen)
#define RX_SAVED_COPY(prog)     (ReANY(prog)->saved_copy)
/* last match was zero-length */
#define RX_ZERO_LEN(prog) \
        (RX_OFFS(prog)[0].start + (SSize_t)RX_GOFS(prog) \
          == RX_OFFS(prog)[0].end)
508
|
|
|
|
|
|
|
509
|
|
|
|
|
|
#endif /* PLUGGABLE_RE_EXTENSION */ |
510
|
|
|
|
|
|
|
511
|
|
|
|
|
|
/* Stuff that needs to be included in the pluggable extension goes below here */ |
512
|
|
|
|
|
|
|
513
|
|
|
|
|
|
/* Releases the string copy held by 'rx': if the "copied" flag is set,
 * RX_SUBBEG(rx) is freed with Safefree() and the flag is cleared.  In
 * PERL_ANY_COW builds a non-NULL saved copy is first passed to
 * SV_CHECK_THINKFIRST_COW_DROP(). */
#ifdef PERL_ANY_COW
#define RX_MATCH_COPY_FREE(rx) \
        STMT_START {if (RX_SAVED_COPY(rx)) { \
            SV_CHECK_THINKFIRST_COW_DROP(RX_SAVED_COPY(rx)); \
        } \
        if (RX_MATCH_COPIED(rx)) { \
            Safefree(RX_SUBBEG(rx)); \
            RX_MATCH_COPIED_off(rx); \
        }} STMT_END
#else
#define RX_MATCH_COPY_FREE(rx) \
        STMT_START {if (RX_MATCH_COPIED(rx)) { \
            Safefree(RX_SUBBEG(rx)); \
            RX_MATCH_COPIED_off(rx); \
        }} STMT_END
#endif
529
|
|
|
|
|
|
|
530
|
|
|
|
|
|
/* Test, set and clear the RXf_MATCH_UTF8 bit of a regexp's extflags. */
#define RXp_MATCH_UTF8(prog)            (RXp_EXTFLAGS(prog) & RXf_MATCH_UTF8)
#define RX_MATCH_UTF8(prog)             (RX_EXTFLAGS(prog) & RXf_MATCH_UTF8)
#define RX_MATCH_UTF8_on(prog)          (RX_EXTFLAGS(prog) |= RXf_MATCH_UTF8)
#define RX_MATCH_UTF8_off(prog)         (RX_EXTFLAGS(prog) &= ~RXf_MATCH_UTF8)
/* Sets or clears the bit according to 't'. */
#define RX_MATCH_UTF8_set(prog, t)      ((t) \
                        ? RX_MATCH_UTF8_on(prog) \
                        : RX_MATCH_UTF8_off(prog))

/* Whether the pattern stored at RX_WRAPPED is in UTF-8  */
#define RX_UTF8(prog)                   SvUTF8(prog)
540
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
542
|
|
|
|
|
|
/* bits in flags arg of Perl_regexec_flags() */ |
543
|
|
|
|
|
|
|
544
|
|
|
|
|
|
/* Bits for the 'flags' argument of Perl_regexec_flags(); each is a
 * distinct single bit. */
#define REXEC_COPY_STR  0x01    /* Need to copy the string for captures. */
#define REXEC_CHECKED   0x02    /* re_intuit_start() already called. */
#define REXEC_SCREAM    0x04    /* currently unused. */
#define REXEC_IGNOREPOS 0x08    /* use stringarg, not pos(), for \G match */
#define REXEC_NOT_FIRST 0x10    /* This is another iteration of //g:
                                   no need to copy string again */

                                     /* under REXEC_COPY_STR, it's ok for the
                                        engine (modulo PL_sawamperand etc)
                                        to skip copying: ... */
#define REXEC_COPY_SKIP_PRE  0x20    /* ...the $` part of the string, or */
#define REXEC_COPY_SKIP_POST 0x40    /* ...the $' part of the string */
#define REXEC_FAIL_ON_UNDERFLOW 0x80 /* fail the match if $& would start before
                                        the start pos (so s/.\G// would fail
                                        on second iteration */
559
|
|
|
|
|
|
|
560
|
|
|
|
|
|
/* Reference-count helpers for REGEXP pointers.  With GCC brace groups
 * available, the argument is first assigned to a REGEXP* temporary so a
 * wrongly-typed argument produces a warning; otherwise they are plain
 * wrappers around SvREFCNT_inc / SvREFCNT_dec. */
#if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
#  define ReREFCNT_inc(re)                                              \
    ({                                                                  \
        /* This is here to generate a casting warning if incorrect.  */ \
        REGEXP *const _rerefcnt_inc = (re);                             \
        assert(SvTYPE(_rerefcnt_inc) == SVt_REGEXP);                    \
        SvREFCNT_inc(_rerefcnt_inc);                                    \
        _rerefcnt_inc;                                                  \
    })
#  define ReREFCNT_dec(re)                                              \
    ({                                                                  \
        /* This is here to generate a casting warning if incorrect.  */ \
        REGEXP *const _rerefcnt_dec = (re);                             \
        SvREFCNT_dec(_rerefcnt_dec);                                    \
    })
#else
#  define ReREFCNT_dec(re)      SvREFCNT_dec(re)
#  define ReREFCNT_inc(re)      ((REGEXP *) SvREFCNT_inc(re))
#endif
/* Passes 're', cast to const REGEXP *, to S_ReANY(). */
#define ReANY(re)               S_ReANY((const REGEXP *)(re))
580
|
|
|
|
|
|
|
581
|
|
|
|
|
|
/* FIXME for plugins. */ |
582
|
|
|
|
|
|
|
583
|
|
|
|
|
|
/* FBMcf_TAIL_* are single-bit flags; FBMcf_TAIL is the mask of all four. */
#define FBMcf_TAIL_DOLLAR       1
#define FBMcf_TAIL_DOLLARM      2
#define FBMcf_TAIL_Z            4
#define FBMcf_TAIL_z            8
#define FBMcf_TAIL              (FBMcf_TAIL_DOLLAR|FBMcf_TAIL_DOLLARM|FBMcf_TAIL_Z|FBMcf_TAIL_z)

#define FBMrf_MULTILINE 1
590
|
|
|
|
|
|
|
591
|
|
|
|
|
|
struct regmatch_state; |
592
|
|
|
|
|
|
struct regmatch_slab; |
593
|
|
|
|
|
|
|
594
|
|
|
|
|
|
/* like regmatch_info_aux, but contains extra fields only needed if the |
595
|
|
|
|
|
|
* pattern contains (?{}). If used, is snuck into the second slot in the |
596
|
|
|
|
|
|
* regmatch_state stack at the start of execution */ |
597
|
|
|
|
|
|
|
598
|
|
|
|
|
|
typedef struct { |
599
|
|
|
|
|
|
regexp *rex; |
600
|
|
|
|
|
|
PMOP *curpm; /* saved PL_curpm */ |
601
|
|
|
|
|
|
#ifdef PERL_ANY_COW |
602
|
|
|
|
|
|
SV *saved_copy; /* saved saved_copy field from rex */ |
603
|
|
|
|
|
|
#endif |
604
|
|
|
|
|
|
char *subbeg; /* saved subbeg field from rex */ |
605
|
|
|
|
|
|
STRLEN sublen; /* saved sublen field from rex */ |
606
|
|
|
|
|
|
STRLEN suboffset; /* saved suboffset field from rex */ |
607
|
|
|
|
|
|
STRLEN subcoffset; /* saved subcoffset field from rex */ |
608
|
|
|
|
|
|
MAGIC *pos_magic; /* pos() magic attached to $_ */ |
609
|
|
|
|
|
|
SSize_t pos; /* the original value of pos() in pos_magic */ |
610
|
|
|
|
|
|
U8 pos_flags; /* flags to be restored; currently only MGf_BYTES*/ |
611
|
|
|
|
|
|
} regmatch_info_aux_eval; |
612
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
614
|
|
|
|
|
|
/* fields that logically live in regmatch_info, but which need cleaning |
615
|
|
|
|
|
|
* up on croak(), and so are instead snuck into the first slot in |
616
|
|
|
|
|
|
* the regmatch_state stack at the start of execution */ |
617
|
|
|
|
|
|
|
618
|
|
|
|
|
|
typedef struct { |
619
|
|
|
|
|
|
regmatch_info_aux_eval *info_aux_eval; |
620
|
|
|
|
|
|
struct regmatch_state *old_regmatch_state; /* saved PL_regmatch_state */ |
621
|
|
|
|
|
|
struct regmatch_slab *old_regmatch_slab; /* saved PL_regmatch_slab */ |
622
|
|
|
|
|
|
char *poscache; /* S-L cache of fail positions of WHILEMs */ |
623
|
|
|
|
|
|
} regmatch_info_aux; |
624
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
626
|
|
|
|
|
|
/* some basic information about the current match that is created by |
627
|
|
|
|
|
|
* Perl_regexec_flags and then passed to regtry(), regmatch() etc. |
628
|
|
|
|
|
|
* It is allocated as a local var on the stack, so nothing should be |
629
|
|
|
|
|
|
* stored in it that needs preserving or clearing up on croak(). |
630
|
|
|
|
|
|
* For that, see the info_aux and info_aux_eval members of the |
631
|
|
|
|
|
|
* regmatch_state union. */ |
632
|
|
|
|
|
|
|
633
|
|
|
|
|
|
typedef struct { |
634
|
|
|
|
|
|
REGEXP *prog; /* the regex being executed */ |
635
|
|
|
|
|
|
const char * strbeg; /* real start of string */ |
636
|
|
|
|
|
|
char *strend; /* one byte beyond last char of match string */ |
637
|
|
|
|
|
|
char *till; /* matches shorter than this fail (see minlen arg) */ |
638
|
|
|
|
|
|
SV *sv; /* the SV string currently being matched */ |
639
|
|
|
|
|
|
char *ganch; /* position of \G anchor */ |
640
|
|
|
|
|
|
char *cutpoint; /* (*COMMIT) position (if any) */ |
641
|
|
|
|
|
|
regmatch_info_aux *info_aux; /* extra fields that need cleanup */ |
642
|
|
|
|
|
|
regmatch_info_aux_eval *info_aux_eval; /* extra saved state for (?{}) */ |
643
|
|
|
|
|
|
I32 poscache_maxiter; /* how many whilems todo before S-L cache kicks in */ |
644
|
|
|
|
|
|
I32 poscache_iter; /* current countdown from _maxiter to zero */ |
645
|
|
|
|
|
|
STRLEN poscache_size; /* size of regmatch_info_aux.poscache */ |
646
|
|
|
|
|
|
bool intuit; /* re_intuit_start() is the top-level caller */ |
647
|
|
|
|
|
|
bool is_utf8_pat; /* regex is utf8 */ |
648
|
|
|
|
|
|
bool is_utf8_target; /* string being matched is utf8 */ |
649
|
|
|
|
|
|
bool warned; /* we have issued a recursion warning; no need for more */ |
650
|
|
|
|
|
|
} regmatch_info; |
651
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
653
|
|
|
|
|
|
/* structures for holding and saving the state maintained by regmatch() */ |
654
|
|
|
|
|
|
|
655
|
|
|
|
|
|
#ifndef MAX_RECURSE_EVAL_NOCHANGE_DEPTH |
656
|
|
|
|
|
|
#define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 1000 |
657
|
|
|
|
|
|
#endif |
658
|
|
|
|
|
|
|
659
|
|
|
|
|
|
typedef I32 CHECKPOINT; |
660
|
|
|
|
|
|
|
661
|
|
|
|
|
|
typedef struct regmatch_state { |
662
|
|
|
|
|
|
int resume_state; /* where to jump to on return */ |
663
|
|
|
|
|
|
char *locinput; /* where to backtrack in string on failure */ |
664
|
|
|
|
|
|
|
665
|
|
|
|
|
|
union { |
666
|
|
|
|
|
|
|
667
|
|
|
|
|
|
/* the 'info_aux' and 'info_aux_eval' union members are cuckoos in |
668
|
|
|
|
|
|
* the nest. They aren't saved backtrack state; rather they |
669
|
|
|
|
|
|
* represent one or two extra chunks of data that need allocating |
670
|
|
|
|
|
|
* at the start of a match. These fields would logically live in |
671
|
|
|
|
|
|
* the regmatch_info struct, except that is allocated on the |
672
|
|
|
|
|
|
* C stack, and these fields are all things that require cleanup |
673
|
|
|
|
|
|
* after a croak(), when the stack is lost. |
674
|
|
|
|
|
|
* As a convenience, we just use the first 1 or 2 regmatch_state |
675
|
|
|
|
|
|
* slots to store this info, as we will be allocating a slab of |
676
|
|
|
|
|
|
* these anyway. Otherwise we'd have to malloc and then free them, |
677
|
|
|
|
|
|
* or allocate them on the save stack (where they will get |
678
|
|
|
|
|
|
* realloced if the save stack grows). |
679
|
|
|
|
|
|
* info_aux contains the extra fields that are always needed; |
680
|
|
|
|
|
|
* info_aux_eval contains extra fields that are only needed if |
681
|
|
|
|
|
|
* the pattern contains code blocks |
682
|
|
|
|
|
|
* We split them into two separate structs to avoid increasing |
683
|
|
|
|
|
|
* the size of the union. |
684
|
|
|
|
|
|
*/ |
685
|
|
|
|
|
|
|
686
|
|
|
|
|
|
regmatch_info_aux info_aux; |
687
|
|
|
|
|
|
|
688
|
|
|
|
|
|
regmatch_info_aux_eval info_aux_eval; |
689
|
|
|
|
|
|
|
690
|
|
|
|
|
|
/* this is a fake union member that matches the first element |
691
|
|
|
|
|
|
* of each member that needs to store positive backtrack |
692
|
|
|
|
|
|
* information */ |
693
|
|
|
|
|
|
struct { |
694
|
|
|
|
|
|
struct regmatch_state *prev_yes_state; |
695
|
|
|
|
|
|
} yes; |
696
|
|
|
|
|
|
|
697
|
|
|
|
|
|
/* branchlike members */ |
698
|
|
|
|
|
|
/* this is a fake union member that matches the first elements |
699
|
|
|
|
|
|
* of each member that needs to behave like a branch */ |
700
|
|
|
|
|
|
struct { |
701
|
|
|
|
|
|
/* this first element must match u.yes */ |
702
|
|
|
|
|
|
struct regmatch_state *prev_yes_state; |
703
|
|
|
|
|
|
U32 lastparen; |
704
|
|
|
|
|
|
U32 lastcloseparen; |
705
|
|
|
|
|
|
CHECKPOINT cp; |
706
|
|
|
|
|
|
|
707
|
|
|
|
|
|
} branchlike; |
708
|
|
|
|
|
|
|
709
|
|
|
|
|
|
struct { |
710
|
|
|
|
|
|
/* the first elements must match u.branchlike */ |
711
|
|
|
|
|
|
struct regmatch_state *prev_yes_state; |
712
|
|
|
|
|
|
U32 lastparen; |
713
|
|
|
|
|
|
U32 lastcloseparen; |
714
|
|
|
|
|
|
CHECKPOINT cp; |
715
|
|
|
|
|
|
|
716
|
|
|
|
|
|
regnode *next_branch; /* next branch node */ |
717
|
|
|
|
|
|
} branch; |
718
|
|
|
|
|
|
|
719
|
|
|
|
|
|
struct { |
720
|
|
|
|
|
|
/* the first elements must match u.branchlike */ |
721
|
|
|
|
|
|
struct regmatch_state *prev_yes_state; |
722
|
|
|
|
|
|
U32 lastparen; |
723
|
|
|
|
|
|
U32 lastcloseparen; |
724
|
|
|
|
|
|
CHECKPOINT cp; |
725
|
|
|
|
|
|
|
726
|
|
|
|
|
|
U32 accepted; /* how many accepting states left */ |
727
|
|
|
|
|
|
bool longfold;/* saw a fold with a 1->n char mapping */ |
728
|
|
|
|
|
|
U16 *jump; /* positive offsets from me */ |
729
|
|
|
|
|
|
regnode *me; /* Which node am I - needed for jump tries*/ |
730
|
|
|
|
|
|
U8 *firstpos;/* pos in string of first trie match */ |
731
|
|
|
|
|
|
U32 firstchars;/* len in chars of firstpos from start */ |
732
|
|
|
|
|
|
U16 nextword;/* next word to try */ |
733
|
|
|
|
|
|
U16 topword; /* longest accepted word */ |
734
|
|
|
|
|
|
} trie; |
735
|
|
|
|
|
|
|
736
|
|
|
|
|
|
/* special types - these members are used to store state for special |
737
|
|
|
|
|
|
regops like eval, if/then, lookaround and the markpoint state */ |
738
|
|
|
|
|
|
struct { |
739
|
|
|
|
|
|
/* this first element must match u.yes */ |
740
|
|
|
|
|
|
struct regmatch_state *prev_yes_state; |
741
|
|
|
|
|
|
struct regmatch_state *prev_eval; |
742
|
|
|
|
|
|
struct regmatch_state *prev_curlyx; |
743
|
|
|
|
|
|
REGEXP *prev_rex; |
744
|
|
|
|
|
|
CHECKPOINT cp; /* remember current savestack indexes */ |
745
|
|
|
|
|
|
CHECKPOINT lastcp; |
746
|
|
|
|
|
|
U32 close_paren; /* which close bracket is our end */ |
747
|
|
|
|
|
|
regnode *B; /* the node following us */ |
748
|
|
|
|
|
|
} eval; |
749
|
|
|
|
|
|
|
750
|
|
|
|
|
|
struct { |
751
|
|
|
|
|
|
/* this first element must match u.yes */ |
752
|
|
|
|
|
|
struct regmatch_state *prev_yes_state; |
753
|
|
|
|
|
|
I32 wanted; |
754
|
|
|
|
|
|
I32 logical; /* saved copy of 'logical' var */ |
755
|
|
|
|
|
|
regnode *me; /* the IFMATCH/SUSPEND/UNLESSM node */ |
756
|
|
|
|
|
|
} ifmatch; /* and SUSPEND/UNLESSM */ |
757
|
|
|
|
|
|
|
758
|
|
|
|
|
|
struct { |
759
|
|
|
|
|
|
/* this first element must match u.yes */ |
760
|
|
|
|
|
|
struct regmatch_state *prev_yes_state; |
761
|
|
|
|
|
|
struct regmatch_state *prev_mark; |
762
|
|
|
|
|
|
SV* mark_name; |
763
|
|
|
|
|
|
char *mark_loc; |
764
|
|
|
|
|
|
} mark; |
765
|
|
|
|
|
|
|
766
|
|
|
|
|
|
struct { |
767
|
|
|
|
|
|
int val; |
768
|
|
|
|
|
|
} keeper; |
769
|
|
|
|
|
|
|
770
|
|
|
|
|
|
/* quantifiers - these members are used for storing state for |
771
|
|
|
|
|
|
the regops used to implement quantifiers */ |
772
|
|
|
|
|
|
struct { |
773
|
|
|
|
|
|
/* this first element must match u.yes */ |
774
|
|
|
|
|
|
struct regmatch_state *prev_yes_state; |
775
|
|
|
|
|
|
struct regmatch_state *prev_curlyx; /* previous cur_curlyx */ |
776
|
|
|
|
|
|
regnode *me; /* the CURLYX node */ |
777
|
|
|
|
|
|
regnode *B; /* the B node in /A*B/ */ |
778
|
|
|
|
|
|
CHECKPOINT cp; /* remember current savestack index */ |
779
|
|
|
|
|
|
bool minmod; |
780
|
|
|
|
|
|
int parenfloor;/* how far back to strip paren data */ |
781
|
|
|
|
|
|
|
782
|
|
|
|
|
|
/* these two are modified by WHILEM */ |
783
|
|
|
|
|
|
int count; /* how many instances of A we've matched */ |
784
|
|
|
|
|
|
char *lastloc;/* where previous A matched (0-len detect) */ |
785
|
|
|
|
|
|
} curlyx; |
786
|
|
|
|
|
|
|
787
|
|
|
|
|
|
struct { |
788
|
|
|
|
|
|
/* this first element must match u.yes */ |
789
|
|
|
|
|
|
struct regmatch_state *prev_yes_state; |
790
|
|
|
|
|
|
struct regmatch_state *save_curlyx; |
791
|
|
|
|
|
|
CHECKPOINT cp; /* remember current savestack indexes */ |
792
|
|
|
|
|
|
CHECKPOINT lastcp; |
793
|
|
|
|
|
|
char *save_lastloc; /* previous curlyx.lastloc */ |
794
|
|
|
|
|
|
I32 cache_offset; |
795
|
|
|
|
|
|
I32 cache_mask; |
796
|
|
|
|
|
|
} whilem; |
797
|
|
|
|
|
|
|
798
|
|
|
|
|
|
struct { |
799
|
|
|
|
|
|
/* this first element must match u.yes */ |
800
|
|
|
|
|
|
struct regmatch_state *prev_yes_state; |
801
|
|
|
|
|
|
int c1, c2; /* case fold search */ |
802
|
|
|
|
|
|
CHECKPOINT cp; |
803
|
|
|
|
|
|
U32 lastparen; |
804
|
|
|
|
|
|
U32 lastcloseparen; |
805
|
|
|
|
|
|
I32 alen; /* length of first-matched A string */ |
806
|
|
|
|
|
|
I32 count; |
807
|
|
|
|
|
|
bool minmod; |
808
|
|
|
|
|
|
regnode *A, *B; /* the nodes corresponding to /A*B/ */ |
809
|
|
|
|
|
|
regnode *me; /* the curlym node */ |
810
|
|
|
|
|
|
U8 c1_utf8[UTF8_MAXBYTES+1]; /* */ |
811
|
|
|
|
|
|
U8 c2_utf8[UTF8_MAXBYTES+1]; |
812
|
|
|
|
|
|
} curlym; |
813
|
|
|
|
|
|
|
814
|
|
|
|
|
|
struct { |
815
|
|
|
|
|
|
U32 paren; |
816
|
|
|
|
|
|
CHECKPOINT cp; |
817
|
|
|
|
|
|
U32 lastparen; |
818
|
|
|
|
|
|
U32 lastcloseparen; |
819
|
|
|
|
|
|
int c1, c2; /* case fold search */ |
820
|
|
|
|
|
|
char *maxpos; /* highest possible point in string to match */ |
821
|
|
|
|
|
|
char *oldloc; /* the previous locinput */ |
822
|
|
|
|
|
|
int count; |
823
|
|
|
|
|
|
int min, max; /* {m,n} */ |
824
|
|
|
|
|
|
regnode *A, *B; /* the nodes corresponding to /A*B/ */ |
825
|
|
|
|
|
|
U8 c1_utf8[UTF8_MAXBYTES+1]; /* */ |
826
|
|
|
|
|
|
U8 c2_utf8[UTF8_MAXBYTES+1]; |
827
|
|
|
|
|
|
} curly; /* and CURLYN/PLUS/STAR */ |
828
|
|
|
|
|
|
|
829
|
|
|
|
|
|
} u; |
830
|
|
|
|
|
|
} regmatch_state; |
831
|
|
|
|
|
|
|
832
|
|
|
|
|
|
/* how many regmatch_state structs to allocate as a single slab. |
833
|
|
|
|
|
|
* We do it in 4K blocks for efficiency. The "3" is 2 for the next/prev |
834
|
|
|
|
|
|
* pointers, plus 1 for any mythical malloc overhead. */ |
835
|
|
|
|
|
|
|
836
|
|
|
|
|
|
#define PERL_REGMATCH_SLAB_SLOTS \ |
837
|
|
|
|
|
|
((4096 - 3 * sizeof (void*)) / sizeof(regmatch_state)) |
838
|
|
|
|
|
|
|
839
|
|
|
|
|
|
typedef struct regmatch_slab { |
840
|
|
|
|
|
|
regmatch_state states[PERL_REGMATCH_SLAB_SLOTS]; |
841
|
|
|
|
|
|
struct regmatch_slab *prev, *next; |
842
|
|
|
|
|
|
} regmatch_slab; |
843
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
846
|
|
|
|
|
|
/* |
847
|
|
|
|
|
|
* Local variables: |
848
|
|
|
|
|
|
* c-indentation-style: bsd |
849
|
|
|
|
|
|
* c-basic-offset: 4 |
850
|
|
|
|
|
|
* indent-tabs-mode: nil |
851
|
|
|
|
|
|
* End: |
852
|
|
|
|
|
|
* |
853
|
|
|
|
|
|
* ex: set ts=8 sts=4 sw=4 et: |
854
|
|
|
|
|
|
*/ |