| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* While the compiled Perl regular expression itself will have a character-class (set) |
|
2
|
|
|
|
|
|
|
* implementation that could be used directly, its API is private and changes across |
|
3
|
|
|
|
|
|
|
* perl versions. I gave up on interfacing directly with that, and took this approach of |
|
4
|
|
|
|
|
|
|
* building my own bitmaps. |
|
5
|
|
|
|
|
|
|
* |
|
6
|
|
|
|
|
|
|
* The bitmaps only cache the result of testing the perl character class against bytes 0-0xFF |
|
7
|
|
|
|
|
|
|
* in a non-unicode context. In a unicode context, it uses the cache for codepoints 0-0x7F |
|
8
|
|
|
|
|
|
|
* and falls back to invoking the regex engine on each character with a higher codepoint value. |
|
9
|
|
|
|
|
|
|
* This is inefficient, but I expect 7-bit ascii or non-unicode context is what gets used the |
|
10
|
|
|
|
|
|
|
* most anyway. |
|
11
|
|
|
|
|
|
|
* |
|
12
|
|
|
|
|
|
|
* This file gets sourced directly into SecretBuffer.xs, so its static functions are available |
|
13
|
|
|
|
|
|
|
* in other source files as well. |
|
14
|
|
|
|
|
|
|
*/ |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
struct secret_buffer_charset { |
|
17
|
|
|
|
|
|
|
uint64_t bitmap[4]; // covers 0..255 codepoints |
|
18
|
|
|
|
|
|
|
REGEXP *rx; // refers to Regexp object this was derived from |
|
19
|
|
|
|
|
|
|
#define SECRET_BUFFER_CHARSET_NOUNI 0 |
|
20
|
|
|
|
|
|
|
#define SECRET_BUFFER_CHARSET_ALLUNI 1 |
|
21
|
|
|
|
|
|
|
#define SECRET_BUFFER_CHARSET_TESTUNI 2 |
|
22
|
|
|
|
|
|
|
int unicode_above_7F; // controls action when matching against unicode |
|
23
|
|
|
|
|
|
|
bool match_multi; // stores whether regex ended with '+' |
|
24
|
|
|
|
|
|
|
}; |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
/* MAGIC vtable for cached charset */ |
|
27
|
197
|
|
|
|
|
|
/* 'free' hook of the charset magic vtable.
 * Releases the secret_buffer_charset stored in mg->mg_ptr (if any) and clears
 * the pointer so that it cannot be freed twice.  Always returns 0.
 */
static int secret_buffer_charset_magic_free(pTHX_ SV *sv, MAGIC *mg) {
   secret_buffer_charset *cached = (secret_buffer_charset*)mg->mg_ptr;
   if (cached != NULL) {
      Safefree(cached);
      mg->mg_ptr = NULL;
   }
   return 0;
}
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
#ifdef USE_ITHREADS
/* 'dup' hook of the charset magic vtable, used when the interpreter is cloned.
 * Gives the MAGIC its own private copy of the charset so that the two copies
 * are not sharing (and later double-freeing) one allocation.
 * Always returns 0.
 */
static int secret_buffer_charset_magic_dup(pTHX_ MAGIC *mg, CLONE_PARAMS *param) {
   secret_buffer_charset *source = (secret_buffer_charset*)mg->mg_ptr;
   secret_buffer_charset *clone;

   Newx(clone, 1, secret_buffer_charset);
   *clone = *source;

   /* The REGEXP pointer is not carried over; it is assigned again by
    * secret_buffer_charset_from_regexpref on the next lookup. */
   clone->rx = NULL;

   mg->mg_ptr = (char*)clone;
   return 0;
}
#else
/* No ithreads: the vtable's dup slot is simply left empty */
#define secret_buffer_charset_magic_dup 0
#endif
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
static MGVTBL secret_buffer_charset_magic_vtbl = { |
|
54
|
|
|
|
|
|
|
NULL, /* get */ |
|
55
|
|
|
|
|
|
|
NULL, /* set */ |
|
56
|
|
|
|
|
|
|
NULL, /* len */ |
|
57
|
|
|
|
|
|
|
NULL, /* clear */ |
|
58
|
|
|
|
|
|
|
secret_buffer_charset_magic_free, /* free */ |
|
59
|
|
|
|
|
|
|
NULL, /* copy */ |
|
60
|
|
|
|
|
|
|
secret_buffer_charset_magic_dup, /* dup */ |
|
61
|
|
|
|
|
|
|
NULL /* local */ |
|
62
|
|
|
|
|
|
|
}; |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
/* Set a bit in the bitmap */ |
|
65
|
2067
|
|
|
|
|
|
static inline void sbc_bitmap_set(uint64_t *bitmap, U8 c) { |
|
66
|
2067
|
|
|
|
|
|
bitmap[c >> 6] |= (1ULL << (c & 63)); |
|
67
|
2067
|
|
|
|
|
|
} |
|
68
|
|
|
|
|
|
|
/* Test for byte in bitmap */ |
|
69
|
1123
|
|
|
|
|
|
static inline bool sbc_bitmap_test(const uint64_t *bitmap, U8 c) { |
|
70
|
1123
|
|
|
|
|
|
return (bitmap[c >> 6] >> (c & 63)) & 1; |
|
71
|
|
|
|
|
|
|
} |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
/* Helper to test if a unicode codepoint matches the charset */ |
|
74
|
36
|
|
|
|
|
|
static bool sbc_test_codepoint(pTHX_ const secret_buffer_charset *cset, U32 cp) { |
|
75
|
|
|
|
|
|
|
/* Codepoints 0..7F are cached. Could cache up to 0xFF but locale might mess things up */ |
|
76
|
36
|
100
|
|
|
|
|
if (cp <= 0x80) |
|
77
|
26
|
|
|
|
|
|
return sbc_bitmap_test(cset->bitmap, (U8) cp); |
|
78
|
|
|
|
|
|
|
/* High codepoint handling */ |
|
79
|
10
|
100
|
|
|
|
|
if (cset->unicode_above_7F == SECRET_BUFFER_CHARSET_TESTUNI) { |
|
80
|
|
|
|
|
|
|
/* Must test with regex engine */ |
|
81
|
4
|
50
|
|
|
|
|
if (!cset->rx) return false; |
|
82
|
4
|
|
|
|
|
|
SV *test_sv= sv_2mortal(newSV(8)); |
|
83
|
4
|
|
|
|
|
|
char *utf8_buf= SvPVX(test_sv); |
|
84
|
4
|
|
|
|
|
|
char *end = (char*) uvchr_to_utf8((U8*) utf8_buf, cp); |
|
85
|
4
|
|
|
|
|
|
*end= '\0'; |
|
86
|
4
|
|
|
|
|
|
SvPOK_on(test_sv); |
|
87
|
4
|
|
|
|
|
|
SvCUR_set(test_sv, (end - utf8_buf)); |
|
88
|
4
|
|
|
|
|
|
SvUTF8_on(test_sv); |
|
89
|
4
|
|
|
|
|
|
I32 result = pregexec(cset->rx, utf8_buf, end, utf8_buf, 0, test_sv, 1); |
|
90
|
4
|
|
|
|
|
|
return result > 0; |
|
91
|
|
|
|
|
|
|
} |
|
92
|
|
|
|
|
|
|
else |
|
93
|
6
|
|
|
|
|
|
return cset->unicode_above_7F == SECRET_BUFFER_CHARSET_ALLUNI; |
|
94
|
|
|
|
|
|
|
} |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
/* implement extern functions for public API */ |
|
97
|
0
|
|
|
|
|
|
/* Public API: return true if byte 'b' is a member of the charset, according to
 * the cached bitmap.
 */
bool secret_buffer_charset_test_byte(const secret_buffer_charset *cset, U8 b) {
   const uint64_t *bits = cset->bitmap;
   return sbc_bitmap_test(bits, b);
}
|
100
|
0
|
|
|
|
|
|
/* Public API: return true if unicode codepoint 'cp' is a member of the charset.
 * Fetches the interpreter context itself and defers to sbc_test_codepoint.
 */
bool secret_buffer_charset_test_codepoint(const secret_buffer_charset *cset, U32 cp) {
   dTHX;
   bool is_member = sbc_test_codepoint(aTHX_ cset, cp);
   return is_member;
}
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
/* Parse a simple character class into bitmap. Returns true if it is confident |
|
106
|
|
|
|
|
|
|
* it fully handled the spec. Returns false if anything might be a problem, |
|
107
|
|
|
|
|
|
|
* in which case caller should use build_bitmap_via_regex. |
|
108
|
|
|
|
|
|
|
*/ |
|
109
|
|
|
|
|
|
|
#define HEXCHAR_TO_INT(c) (((c) >= '0' && (c) <= '9')? ((c) - '0') \ |
|
110
|
|
|
|
|
|
|
: ((c) >= 'A' && (c) <= 'F')? ((c) - 'A' + 10) \ |
|
111
|
|
|
|
|
|
|
: ((c) >= 'a' && (c) <= 'f')? ((c) - 'a' + 10) \ |
|
112
|
|
|
|
|
|
|
: -1) |
|
113
|
199
|
|
|
|
|
|
static bool parse_simple_charclass(pTHX_ secret_buffer_charset *cset, SV *qr_ref) { |
|
114
|
199
|
|
|
|
|
|
uint64_t *bitmap= cset->bitmap; |
|
115
|
199
|
|
|
|
|
|
I32 range_start= -1; |
|
116
|
199
|
|
|
|
|
|
bool negated = false; |
|
117
|
|
|
|
|
|
|
/* before 5.10 the flags are hidden somewhere and ->extflgs doesn't exist */ |
|
118
|
|
|
|
|
|
|
#ifdef RX_EXTFLAGS |
|
119
|
199
|
|
|
|
|
|
U32 rx_flags = RX_EXTFLAGS(cset->rx); |
|
120
|
199
|
|
|
|
|
|
bool flag_i= !!(rx_flags & RXf_PMf_FOLD); |
|
121
|
|
|
|
|
|
|
/* the /xx flag was added in 5.26 */ |
|
122
|
|
|
|
|
|
|
#ifdef RXf_PMf_EXTENDED_MORE |
|
123
|
199
|
|
|
|
|
|
bool flag_xx= !!(rx_flags & RXf_PMf_EXTENDED_MORE); |
|
124
|
|
|
|
|
|
|
#endif |
|
125
|
199
|
|
|
|
|
|
const char *pos = RX_PRECOMP(cset->rx); |
|
126
|
199
|
|
|
|
|
|
const char *lim = pos + RX_PRELEN(cset->rx); |
|
127
|
|
|
|
|
|
|
#else |
|
128
|
|
|
|
|
|
|
/* collect the flags by parsing the stringified representation. */ |
|
129
|
|
|
|
|
|
|
bool flag_i= false; |
|
130
|
|
|
|
|
|
|
STRLEN len; |
|
131
|
|
|
|
|
|
|
const char *pos= SvPV(qr_ref, len); |
|
132
|
|
|
|
|
|
|
const char *lim= pos + len; |
|
133
|
|
|
|
|
|
|
if (len < 3 || pos[0] != '(' || pos[1] != '?' || lim[-1] != ')') |
|
134
|
|
|
|
|
|
|
return false; |
|
135
|
|
|
|
|
|
|
bool ignore= false; |
|
136
|
|
|
|
|
|
|
for (pos += 2, lim--; *pos != ':'; ++pos) { |
|
137
|
|
|
|
|
|
|
if (pos >= lim) // we can read *lim because we bcked it up one char |
|
138
|
|
|
|
|
|
|
return false; |
|
139
|
|
|
|
|
|
|
if (*pos == 'i' && !ignore) |
|
140
|
|
|
|
|
|
|
flag_i= true; |
|
141
|
|
|
|
|
|
|
else if (*pos == '-') |
|
142
|
|
|
|
|
|
|
ignore= true; |
|
143
|
|
|
|
|
|
|
} |
|
144
|
|
|
|
|
|
|
pos++; /* cross ':' char */ |
|
145
|
|
|
|
|
|
|
#endif |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
//warn("Attempting to parse '%.*s' %d %c %c\n", (int)(lim-pos), pos, (int)RX_PRELEN(rx), *pos, lim[-1]); |
|
148
|
199
|
50
|
|
|
|
|
if (pos < lim && lim[-1] == '+') { |
|
|
|
100
|
|
|
|
|
|
|
149
|
107
|
|
|
|
|
|
cset->match_multi= true; |
|
150
|
107
|
|
|
|
|
|
lim--; |
|
151
|
|
|
|
|
|
|
} |
|
152
|
199
|
50
|
|
|
|
|
if (pos >= lim || *pos != '[' || lim[-1] != ']') |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
153
|
0
|
|
|
|
|
|
return false; |
|
154
|
199
|
|
|
|
|
|
pos++; /* Skip [ */ |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
/* Check for negation */ |
|
157
|
199
|
50
|
|
|
|
|
if (pos < lim && *pos == '^') { |
|
|
|
100
|
|
|
|
|
|
|
158
|
58
|
|
|
|
|
|
negated = true; |
|
159
|
58
|
|
|
|
|
|
pos++; |
|
160
|
|
|
|
|
|
|
} |
|
161
|
|
|
|
|
|
|
/* first character may be ] without ending charset */ |
|
162
|
199
|
50
|
|
|
|
|
if (pos < lim && *pos == ']') { |
|
|
|
100
|
|
|
|
|
|
|
163
|
12
|
|
|
|
|
|
sbc_bitmap_set(bitmap, ']'); |
|
164
|
12
|
|
|
|
|
|
pos++; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
/* Parse characters and ranges */ |
|
167
|
546
|
50
|
|
|
|
|
while (pos < lim && *pos != ']') { |
|
|
|
100
|
|
|
|
|
|
|
168
|
363
|
|
|
|
|
|
I32 c= (I32)(unsigned char) *pos++; |
|
169
|
|
|
|
|
|
|
int high, low; |
|
170
|
|
|
|
|
|
|
// in case of a literal char over 0x7F, things get confusing because I |
|
171
|
|
|
|
|
|
|
// can't tell whether the pattern itself is latin-1 or unicode. |
|
172
|
363
|
100
|
|
|
|
|
if (c >= 0x80) |
|
173
|
1
|
|
|
|
|
|
return false; |
|
174
|
|
|
|
|
|
|
// but if ascii notation describes a codepoint over 0x80, that's OK. |
|
175
|
362
|
100
|
|
|
|
|
else if (c == '\\') { |
|
176
|
123
|
50
|
|
|
|
|
if (pos >= lim) return false; |
|
177
|
123
|
|
|
|
|
|
switch (*pos++) { |
|
178
|
|
|
|
|
|
|
/* is it escaping something we can use literally below? */ |
|
179
|
0
|
|
|
|
|
|
case '\\': case ']': case ' ': |
|
180
|
0
|
|
|
|
|
|
c= (unsigned char) pos[-1]; |
|
181
|
0
|
|
|
|
|
|
break; |
|
182
|
|
|
|
|
|
|
/* is it a special constant? */ |
|
183
|
0
|
|
|
|
|
|
case 'a': c= '\a'; break; |
|
184
|
0
|
|
|
|
|
|
case 'b': c= '\b'; break; |
|
185
|
1
|
|
|
|
|
|
case 'e': c= '\e'; break; |
|
186
|
0
|
|
|
|
|
|
case 'f': c= '\f'; break; |
|
187
|
56
|
|
|
|
|
|
case 'n': c= '\n'; break; |
|
188
|
1
|
|
|
|
|
|
case 'r': c= '\r'; break; |
|
189
|
40
|
|
|
|
|
|
case 't': c= '\t'; break; |
|
190
|
0
|
|
|
|
|
|
case 'o': // octal |
|
191
|
0
|
0
|
|
|
|
|
if (pos + 1 >= lim || !(*pos >= '0' && *pos <= '7')) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
192
|
0
|
|
|
|
|
|
return false; |
|
193
|
0
|
|
|
|
|
|
++pos; |
|
194
|
6
|
|
|
|
|
|
case '0': case '1': case '2': case '3': |
|
195
|
|
|
|
|
|
|
case '4': case '5': case '6': case '7': |
|
196
|
6
|
|
|
|
|
|
c= pos[-1] - '0'; |
|
197
|
6
|
50
|
|
|
|
|
if (pos < lim && *pos >= '0' && *pos <= '7') |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
198
|
2
|
|
|
|
|
|
c= (c << 3) | (*pos++ - '0'); |
|
199
|
6
|
50
|
|
|
|
|
if (pos < lim && *pos >= '0' && *pos <= '7') |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
200
|
1
|
|
|
|
|
|
c= (c << 3) | (*pos++ - '0'); |
|
201
|
6
|
100
|
|
|
|
|
if (c > 0xFF) |
|
202
|
1
|
|
|
|
|
|
cset->unicode_above_7F= SECRET_BUFFER_CHARSET_TESTUNI; |
|
203
|
6
|
|
|
|
|
|
break; |
|
204
|
11
|
|
|
|
|
|
case 'x': |
|
205
|
11
|
50
|
|
|
|
|
if (pos+1 >= lim) return false; |
|
206
|
11
|
50
|
|
|
|
|
high= HEXCHAR_TO_INT(pos[0]); |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
207
|
11
|
50
|
|
|
|
|
low= HEXCHAR_TO_INT(pos[1]); |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
208
|
11
|
100
|
|
|
|
|
if (high < 0 || low < 0) return false; |
|
|
|
50
|
|
|
|
|
|
|
209
|
7
|
|
|
|
|
|
c= (high << 4) | low; |
|
210
|
7
|
|
|
|
|
|
pos += 2; |
|
211
|
7
|
|
|
|
|
|
break; |
|
212
|
8
|
|
|
|
|
|
default: |
|
213
|
|
|
|
|
|
|
/* too complicated, give up and fall back to exhaustive test*/ |
|
214
|
8
|
|
|
|
|
|
return false; |
|
215
|
|
|
|
|
|
|
} |
|
216
|
|
|
|
|
|
|
} |
|
217
|
|
|
|
|
|
|
// abort on [:class:] notation |
|
218
|
239
|
100
|
|
|
|
|
else if (c == '[' && pos < lim && *pos == ':') |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
219
|
3
|
|
|
|
|
|
return false; |
|
220
|
|
|
|
|
|
|
// the /xx flag was added in 5.26 |
|
221
|
|
|
|
|
|
|
#ifdef RXf_PMf_EXTENDED_MORE |
|
222
|
236
|
100
|
|
|
|
|
else if ((c == ' ' || c == '\t') && flag_xx) |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
223
|
2
|
|
|
|
|
|
continue; |
|
224
|
|
|
|
|
|
|
#endif |
|
225
|
345
|
100
|
|
|
|
|
if (range_start >= 0) { |
|
226
|
48
|
50
|
|
|
|
|
if (c < range_start) /* Invalid range */ |
|
227
|
0
|
|
|
|
|
|
return false; |
|
228
|
48
|
50
|
|
|
|
|
if (c > 0xFF) |
|
229
|
0
|
|
|
|
|
|
c= 0xFF; |
|
230
|
966
|
100
|
|
|
|
|
while (range_start <= c) |
|
231
|
918
|
|
|
|
|
|
sbc_bitmap_set(bitmap, (unsigned char) range_start++); |
|
232
|
48
|
|
|
|
|
|
range_start= -1; |
|
233
|
|
|
|
|
|
|
} |
|
234
|
297
|
100
|
|
|
|
|
else if (pos + 1 < lim && *pos == '-' && pos[1] != ']') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
235
|
48
|
|
|
|
|
|
range_start= c; |
|
236
|
48
|
|
|
|
|
|
++pos; // skip '-' char |
|
237
|
|
|
|
|
|
|
} |
|
238
|
249
|
100
|
|
|
|
|
else if (c < 0xFF) { |
|
239
|
248
|
|
|
|
|
|
sbc_bitmap_set(bitmap, (U8) c); |
|
240
|
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
} |
|
242
|
183
|
50
|
|
|
|
|
if (pos+1 != lim) // regex did not end at ']', give up |
|
243
|
0
|
|
|
|
|
|
return false; |
|
244
|
|
|
|
|
|
|
//warn("bitmaps: %08llX %08llX %08llX %08llX\n", bitmap[0], bitmap[1], bitmap[2], bitmap[3]); |
|
245
|
183
|
100
|
|
|
|
|
if (flag_i) { |
|
246
|
|
|
|
|
|
|
// Latin1 case folding will be a mess best handled by the regex engine |
|
247
|
2
|
50
|
|
|
|
|
if (bitmap[2] | bitmap[3]) |
|
248
|
0
|
|
|
|
|
|
return false; |
|
249
|
|
|
|
|
|
|
// Bits in range 0x41-0x5A need ORed into 0x61-0x7A and vice-versa |
|
250
|
2
|
|
|
|
|
|
bitmap[1] |= ((bitmap[1]>>32) & 0x7FFFFFE); |
|
251
|
2
|
|
|
|
|
|
bitmap[1] |= (bitmap[1] & 0x7FFFFFE) << 32; |
|
252
|
|
|
|
|
|
|
} |
|
253
|
|
|
|
|
|
|
// If any char 0x80-0xFF is set, a unicode context should use the regex engine. |
|
254
|
|
|
|
|
|
|
// Otherwise, the charset doesn't contain any upper chars at all. |
|
255
|
183
|
50
|
|
|
|
|
if (bitmap[2] || bitmap[3]) |
|
|
|
100
|
|
|
|
|
|
|
256
|
2
|
|
|
|
|
|
cset->unicode_above_7F= SECRET_BUFFER_CHARSET_TESTUNI; |
|
257
|
|
|
|
|
|
|
// Apply negation |
|
258
|
183
|
100
|
|
|
|
|
if (negated) { |
|
259
|
|
|
|
|
|
|
int i; |
|
260
|
285
|
100
|
|
|
|
|
for (i = 0; i < 4; i++) |
|
261
|
228
|
|
|
|
|
|
bitmap[i] = ~bitmap[i]; |
|
262
|
57
|
50
|
|
|
|
|
if (cset->unicode_above_7F == SECRET_BUFFER_CHARSET_NOUNI) |
|
263
|
57
|
|
|
|
|
|
cset->unicode_above_7F= SECRET_BUFFER_CHARSET_ALLUNI; |
|
264
|
|
|
|
|
|
|
} |
|
265
|
183
|
|
|
|
|
|
return true; |
|
266
|
|
|
|
|
|
|
} |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
/* Build bitmap by testing each byte through regex engine */ |
|
269
|
16
|
|
|
|
|
|
static void build_charset_via_regex_engine(pTHX_ secret_buffer_charset *cset) { |
|
270
|
16
|
|
|
|
|
|
SV *test_sv= sv_2mortal(newSV(2)); |
|
271
|
|
|
|
|
|
|
int c; |
|
272
|
16
|
|
|
|
|
|
SvPOK_on(test_sv); |
|
273
|
16
|
|
|
|
|
|
SvCUR_set(test_sv, 1); |
|
274
|
16
|
|
|
|
|
|
char *buf= SvPVX(test_sv); |
|
275
|
|
|
|
|
|
|
//warn("Run regex test on chars 0x00-0xFF\n"); |
|
276
|
4112
|
100
|
|
|
|
|
for (c= 0; c < 256; c++) { |
|
277
|
4096
|
|
|
|
|
|
buf[0]= (char) c; |
|
278
|
|
|
|
|
|
|
/* find the next match */ |
|
279
|
4096
|
|
|
|
|
|
I32 result = pregexec(cset->rx, buf, buf+1, buf, 0, test_sv, 1); |
|
280
|
4096
|
100
|
|
|
|
|
if (result > 0) |
|
281
|
889
|
|
|
|
|
|
sbc_bitmap_set(cset->bitmap, (unsigned char) c); |
|
282
|
|
|
|
|
|
|
} |
|
283
|
16
|
|
|
|
|
|
} |
|
284
|
|
|
|
|
|
|
|
|
285
|
199
|
|
|
|
|
|
/* Cheap textual check that the pattern looks like one bracketed character
 * class, optionally followed by '+':  "[...]" or "[...]+".
 * Only the first and the last one or two characters of the pattern source are
 * inspected; the contents of the brackets are not validated here.
 * (Checking the compiled regexp's nparens/minlen instead was tried and noted
 * as not reliable.)
 */
static bool regex_is_single_charclass(REGEXP *rx) {
   const char *src = RX_PRECOMP(rx);
   STRLEN src_len = RX_PRELEN(rx);

   if (src_len < 3)
      return false;
   if (src[0] != '[')
      return false;
   if (src[src_len-1] == ']')
      return true;
   if (src[src_len-1] != '+')
      return false;
   return src[src_len-2] == ']';
}
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
/* Main function: Get or create cached charset from regexp */ |
|
305
|
360
|
|
|
|
|
|
secret_buffer_charset *secret_buffer_charset_from_regexpref(SV *qr_ref) { |
|
306
|
|
|
|
|
|
|
MAGIC *mg; |
|
307
|
|
|
|
|
|
|
REGEXP *rx; |
|
308
|
|
|
|
|
|
|
secret_buffer_charset *cset; |
|
309
|
|
|
|
|
|
|
dTHX; |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
/* Validate input */ |
|
312
|
360
|
50
|
|
|
|
|
if (!qr_ref || !(rx= (REGEXP*)SvRX(qr_ref))) |
|
|
|
50
|
|
|
|
|
|
|
313
|
0
|
|
|
|
|
|
croak("Expected Regexp ref"); |
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
/* Check for existing cached charset */ |
|
316
|
360
|
100
|
|
|
|
|
if (SvMAGICAL(qr_ref)) { |
|
317
|
161
|
|
|
|
|
|
mg = mg_findext(qr_ref, PERL_MAGIC_ext, &secret_buffer_charset_magic_vtbl); |
|
318
|
161
|
50
|
|
|
|
|
if (mg && mg->mg_ptr) { |
|
|
|
50
|
|
|
|
|
|
|
319
|
161
|
|
|
|
|
|
cset= (secret_buffer_charset*)mg->mg_ptr; |
|
320
|
161
|
|
|
|
|
|
cset->rx= rx; // in case threading cloned us |
|
321
|
161
|
|
|
|
|
|
return cset; |
|
322
|
|
|
|
|
|
|
} |
|
323
|
|
|
|
|
|
|
} |
|
324
|
|
|
|
|
|
|
|
|
325
|
199
|
50
|
|
|
|
|
if (!regex_is_single_charclass(rx)) |
|
326
|
0
|
|
|
|
|
|
croak("Regex must contain a single character class and nothing else"); |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
/* Need to create new charset */ |
|
329
|
199
|
|
|
|
|
|
Newxz(cset, 1, secret_buffer_charset); |
|
330
|
199
|
|
|
|
|
|
cset->rx = rx; |
|
331
|
|
|
|
|
|
|
|
|
332
|
199
|
100
|
|
|
|
|
if (!parse_simple_charclass(aTHX_ cset, qr_ref)) { |
|
333
|
|
|
|
|
|
|
int i; |
|
334
|
|
|
|
|
|
|
// reset bitmap |
|
335
|
80
|
100
|
|
|
|
|
for (i= 0; i < sizeof(cset->bitmap)/sizeof(cset->bitmap[0]); i++) |
|
336
|
64
|
|
|
|
|
|
cset->bitmap[i]= 0; |
|
337
|
|
|
|
|
|
|
// Need to use regex engine and cache results of first 256 codepoints. |
|
338
|
16
|
|
|
|
|
|
build_charset_via_regex_engine(aTHX_ cset); |
|
339
|
|
|
|
|
|
|
// If pattern has PMf_UNICODE or similar, it might match unicode |
|
340
|
|
|
|
|
|
|
//if (rx_flags & (RXf_PMf_LOCALE | RXf_PMf_UNICODE)) { |
|
341
|
|
|
|
|
|
|
// ...actually, if 'parse simple' couldn't handle it, need engine regardless |
|
342
|
16
|
|
|
|
|
|
cset->unicode_above_7F= SECRET_BUFFER_CHARSET_TESTUNI; |
|
343
|
|
|
|
|
|
|
} |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
/* Attach magic to cache the charset */ |
|
346
|
199
|
|
|
|
|
|
sv_magicext(qr_ref, NULL, PERL_MAGIC_ext, |
|
347
|
|
|
|
|
|
|
&secret_buffer_charset_magic_vtbl, (char*)cset, 0); |
|
348
|
|
|
|
|
|
|
|
|
349
|
199
|
|
|
|
|
|
return cset; |
|
350
|
|
|
|
|
|
|
} |