File Coverage

secret_buffer_charset.c
Criterion Covered Total %
statement 153 173 88.4
branch 135 194 69.5
condition n/a
subroutine n/a
pod n/a
total 288 367 78.4


line stmt bran cond sub pod time code
1             /* While the compiled Perl regular expression itself will have a character-class (set)
2             * implementation that could be used directly, its API is private and changes across
3             * perl versions. I gave up on interfacing directly with that, and took this approach of
4             * building my own bitmaps.
5             *
6             * The bitmaps only cache the result of testing the perl character class against bytes 0-0xFF
7             * in a non-unicode context. In a unicode context, it uses the cache for codepoints 0-0x7F
8             * and falls back to invoking the regex engine on each character with a higher codepoint value.
9             * This is inefficient, but I expect 7-bit ascii or non-unicode context is what gets used the
10             * most anyway.
11             *
12             * This file gets sourced directly into SecretBuffer.xs, so its static functions are availabe
13             * in other source files as well.
14             */
15              
16             struct secret_buffer_charset {
17             uint64_t bitmap[4]; // covers 0..255 codepoints
18             REGEXP *rx; // refers to Regexp object this was derived from
19             #define SECRET_BUFFER_CHARSET_NOUNI 0
20             #define SECRET_BUFFER_CHARSET_ALLUNI 1
21             #define SECRET_BUFFER_CHARSET_TESTUNI 2
22             int unicode_above_7F; // controls action when matching against unicode
23             bool match_multi; // stores whether regex ended with '+'
24             };
25              
26             /* MAGIC vtable for cached charset */
27 276           static int secret_buffer_charset_magic_free(pTHX_ SV *sv, MAGIC *mg) {
28 276 50         if (mg->mg_ptr) {
29 276           secret_buffer_charset *cset = (secret_buffer_charset*)mg->mg_ptr;
30 276           Safefree(cset);
31 276           mg->mg_ptr = NULL;
32             }
33 276           return 0;
34             }
35              
36             #ifdef USE_ITHREADS
37             static int secret_buffer_charset_magic_dup(pTHX_ MAGIC *mg, CLONE_PARAMS *param) {
38             secret_buffer_charset *old_cset = (secret_buffer_charset*)mg->mg_ptr;
39             secret_buffer_charset *new_cset;
40              
41             Newx(new_cset, 1, secret_buffer_charset);
42             Copy(old_cset, new_cset, 1, secret_buffer_charset);
43              
44             new_cset->rx = NULL; // filled again later during charset_from_regexp_ref
45              
46             mg->mg_ptr = (char*)new_cset;
47             return 0;
48             }
49             #else
50             #define secret_buffer_charset_magic_dup 0
51             #endif
52              
53             static MGVTBL secret_buffer_charset_magic_vtbl = {
54             NULL, /* get */
55             NULL, /* set */
56             NULL, /* len */
57             NULL, /* clear */
58             secret_buffer_charset_magic_free, /* free */
59             NULL, /* copy */
60             secret_buffer_charset_magic_dup, /* dup */
61             NULL /* local */
62             };
63              
64             /* Set a bit in the bitmap */
65 5327           static inline void sbc_bitmap_set(uint64_t *bitmap, U8 c) {
66 5327           bitmap[c >> 6] |= (1ULL << (c & 63));
67 5327           }
68             /* Test for byte in bitmap */
69 1368           static inline bool sbc_bitmap_test(const uint64_t *bitmap, U8 c) {
70 1368           return (bitmap[c >> 6] >> (c & 63)) & 1;
71             }
72              
73             /* Helper to test if a unicode codepoint matches the charset */
74 224           static bool sbc_test_codepoint(pTHX_ const secret_buffer_charset *cset, U32 cp) {
75             /* Codepoints 0..7F are cached. Could cache up to 0xFF but locale might mess things up */
76 224 100         if (cp <= 0x80)
77 202           return sbc_bitmap_test(cset->bitmap, (U8) cp);
78             /* High codepoint handling */
79 22 100         if (cset->unicode_above_7F == SECRET_BUFFER_CHARSET_TESTUNI) {
80             /* Must test with regex engine */
81 16 50         if (!cset->rx) return false;
82 16           SV *test_sv= sv_2mortal(newSV(8));
83 16           char *utf8_buf= SvPVX(test_sv);
84 16           char *end = (char*) uvchr_to_utf8((U8*) utf8_buf, cp);
85 16           *end= '\0';
86 16           SvPOK_on(test_sv);
87 16           SvCUR_set(test_sv, (end - utf8_buf));
88 16           SvUTF8_on(test_sv);
89 16           I32 result = pregexec(cset->rx, utf8_buf, end, utf8_buf, 0, test_sv, 1);
90 16           return result > 0;
91             }
92             else
93 6           return cset->unicode_above_7F == SECRET_BUFFER_CHARSET_ALLUNI;
94             }
95              
96             /* implement extern functions for public API */
97 0           bool secret_buffer_charset_test_byte(const secret_buffer_charset *cset, U8 b) {
98 0           return sbc_bitmap_test(cset->bitmap, b);
99             }
100 0           bool secret_buffer_charset_test_codepoint(const secret_buffer_charset *cset, U32 cp) {
101             dTHX;
102 0           return sbc_test_codepoint(aTHX_ cset, cp);
103             }
104              
105             /* Parse a simple character class into bitmap. Returns true if it is confident
106             * it fully handled the spec. Returns false if anything might be a problem,
107             * in which case caller should use build_bitmap_via_regex.
108             */
109             #define HEXCHAR_TO_INT(c) (((c) >= '0' && (c) <= '9')? ((c) - '0') \
110             : ((c) >= 'A' && (c) <= 'F')? ((c) - 'A' + 10) \
111             : ((c) >= 'a' && (c) <= 'f')? ((c) - 'a' + 10) \
112             : -1)
113 278           static bool parse_simple_charclass(pTHX_ secret_buffer_charset *cset, SV *qr_ref) {
114 278           uint64_t *bitmap= cset->bitmap;
115 278           I32 range_start= -1;
116 278           bool negated = false;
117             /* before 5.10 the flags are hidden somewhere and ->extflgs doesn't exist */
118             #ifdef RX_EXTFLAGS
119 278           U32 rx_flags = RX_EXTFLAGS(cset->rx);
120 278           bool flag_i= !!(rx_flags & RXf_PMf_FOLD);
121 278           bool flag_s= !!(rx_flags & RXf_PMf_SINGLELINE);
122             /* the /xx flag was added in 5.26 */
123             #ifdef RXf_PMf_EXTENDED_MORE
124 278           bool flag_xx= !!(rx_flags & RXf_PMf_EXTENDED_MORE);
125             #endif
126 278           const char *pos = RX_PRECOMP(cset->rx);
127 278           const char *lim = pos + RX_PRELEN(cset->rx);
128             #else
129             /* collect the flags by parsing the stringified representation. */
130             bool flag_i= false, flag_s= false;
131             STRLEN len;
132             const char *pos= SvPV(qr_ref, len);
133             const char *lim= pos + len;
134             if (len < 3 || pos[0] != '(' || pos[1] != '?' || lim[-1] != ')')
135             return false;
136             bool ignore= false;
137             for (pos += 2, lim--; *pos != ':'; ++pos) {
138             if (pos >= lim) // we can read *lim because we backed it up one char
139             return false;
140             if (*pos == 'i' && !ignore)
141             flag_i= true;
142             else if (*pos == 's' && !ignore)
143             flag_s= true;
144             else if (*pos == '-')
145             ignore= true;
146             }
147             pos++; /* cross ':' char */
148             #endif
149              
150             //warn("Attempting to parse '%.*s' %d %c %c\n", (int)(lim-pos), pos, (int)RX_PRELEN(rx), *pos, lim[-1]);
151 278 50         if (pos < lim && lim[-1] == '+') {
    100          
152 107           cset->match_multi= true;
153 107           lim--;
154             }
155 278 50         if (pos >= lim || *pos != '[' || lim[-1] != ']') {
    100          
    50          
156             /* special case qr/./ which is easy to process */
157 8 100         if (pos + 1 == lim && *pos == '.') {
    50          
158 2           cset->unicode_above_7F= SECRET_BUFFER_CHARSET_ALLUNI;
159 2           memset(cset->bitmap, 0xFF, sizeof(cset->bitmap));
160 2 100         if (!flag_s) /* all but newline */
161 1           cset->bitmap[0] &= ~(1ULL << 0x0A);
162 2           return true;
163             }
164 6           return false;
165             }
166 270           pos++; /* Skip [ */
167              
168             /* Check for negation */
169 270 50         if (pos < lim && *pos == '^') {
    100          
170 58           negated = true;
171 58           pos++;
172             }
173             /* first character may be ] without ending charset */
174 270 50         if (pos < lim && *pos == ']') {
    100          
175 12           sbc_bitmap_set(bitmap, ']');
176 12           pos++;
177             }
178             /* Parse characters and ranges */
179 811 50         while (pos < lim && *pos != ']') {
    100          
180 559           I32 c= (I32)(unsigned char) *pos++;
181             int high, low;
182             // in case of a literal char over 0x7F, things get confusing because I
183             // can't tell whether the pattern itself is latin-1 or unicode.
184 559 100         if (c >= 0x80)
185 1           return false;
186             // but if ascii notation describes a codepoint over 0x80, that's OK.
187 558 100         else if (c == '\\') {
188 319 50         if (pos >= lim) return false;
189 319           switch (*pos++) {
190             /* is it escaping something we can use literally below? */
191 0           case '\\': case ']': case ' ':
192 0           c= (unsigned char) pos[-1];
193 0           break;
194             /* is it a special constant? */
195 0           case 'a': c= '\a'; break;
196 0           case 'b': c= '\b'; break;
197 1           case 'e': c= '\e'; break;
198 0           case 'f': c= '\f'; break;
199 59           case 'n': c= '\n'; break;
200 4           case 'r': c= '\r'; break;
201 40           case 't': c= '\t'; break;
202 0           case 'o': // octal
203 0 0         if (pos + 1 >= lim || !(*pos >= '0' && *pos <= '7'))
    0          
    0          
204 0           return false;
205 0           ++pos;
206 62           case '0': case '1': case '2': case '3':
207             case '4': case '5': case '6': case '7':
208 62           c= pos[-1] - '0';
209 62 50         if (pos < lim && *pos >= '0' && *pos <= '7')
    100          
    100          
210 2           c= (c << 3) | (*pos++ - '0');
211 62 50         if (pos < lim && *pos >= '0' && *pos <= '7')
    100          
    100          
212 1           c= (c << 3) | (*pos++ - '0');
213 62 100         if (c > 0xFF)
214 1           cset->unicode_above_7F= SECRET_BUFFER_CHARSET_TESTUNI;
215 62           break;
216 143           case 'x':
217 143 50         if (pos+1 >= lim) return false;
218 143 50         high= HEXCHAR_TO_INT(pos[0]);
    100          
    50          
    100          
    50          
    50          
219 143 50         low= HEXCHAR_TO_INT(pos[1]);
    100          
    50          
    50          
    0          
    0          
220 143 100         if (high < 0 || low < 0) return false;
    50          
221 139           c= (high << 4) | low;
222 139           pos += 2;
223 139           break;
224 10           default:
225             /* too complicated, give up and fall back to exhaustive test*/
226 10           return false;
227             }
228             }
229             // abort on [:class:] notation
230 239 100         else if (c == '[' && pos < lim && *pos == ':')
    50          
    50          
231 3           return false;
232             // the /xx flag was added in 5.26
233             #ifdef RXf_PMf_EXTENDED_MORE
234 236 100         else if ((c == ' ' || c == '\t') && flag_xx)
    50          
    100          
235 2           continue;
236             #endif
237 539 100         if (range_start >= 0) {
238 114 50         if (c < range_start) /* Invalid range */
239 0           return false;
240 114 50         if (c > 0xFF)
241 0           c= 0xFF;
242 3080 100         while (range_start <= c)
243 2966           sbc_bitmap_set(bitmap, (unsigned char) range_start++);
244 114           range_start= -1;
245             }
246 425 100         else if (pos + 1 < lim && *pos == '-' && pos[1] != ']') {
    100          
    50          
247 114           range_start= c;
248 114           ++pos; // skip '-' char
249             }
250 311 100         else if (c < 0xFF) {
251 310           sbc_bitmap_set(bitmap, (U8) c);
252             }
253             }
254 252 50         if (pos+1 != lim) // regex did not end at ']', give up
255 0           return false;
256             //warn("bitmaps: %08llX %08llX %08llX %08llX\n", bitmap[0], bitmap[1], bitmap[2], bitmap[3]);
257 252 100         if (flag_i) {
258             // Latin1 case folding will be a mess best handled by the regex engine
259 2 50         if (bitmap[2] | bitmap[3])
260 0           return false;
261             // Bits in range 0x41-0x5A need ORed into 0x61-0x7A and vice-versa
262 2           bitmap[1] |= ((bitmap[1]>>32) & 0x7FFFFFE);
263 2           bitmap[1] |= (bitmap[1] & 0x7FFFFFE) << 32;
264             }
265             // If any char 0x80-0xFF is set, a unicode context should use the regex engine.
266             // Otherwise, the charset doesn't contain any upper chars at all.
267 252 50         if (bitmap[2] || bitmap[3])
    100          
268 12           cset->unicode_above_7F= SECRET_BUFFER_CHARSET_TESTUNI;
269             // Apply negation
270 252 100         if (negated) {
271             int i;
272 285 100         for (i = 0; i < 4; i++)
273 228           bitmap[i] = ~bitmap[i];
274 57 50         if (cset->unicode_above_7F == SECRET_BUFFER_CHARSET_NOUNI)
275 57           cset->unicode_above_7F= SECRET_BUFFER_CHARSET_ALLUNI;
276             }
277 252           return true;
278             }
279              
280             /* Build bitmap by testing each byte through regex engine */
281 24           static void build_charset_via_regex_engine(pTHX_ secret_buffer_charset *cset) {
282 24           SV *test_sv= sv_2mortal(newSV(2));
283             int c;
284 24           SvPOK_on(test_sv);
285 24           SvCUR_set(test_sv, 1);
286 24           char *buf= SvPVX(test_sv);
287             //warn("Run regex test on chars 0x00-0xFF\n");
288 6168 100         for (c= 0; c < 256; c++) {
289 6144           buf[0]= (char) c;
290             /* find the next match */
291 6144           I32 result = pregexec(cset->rx, buf, buf+1, buf, 0, test_sv, 1);
292 6144 100         if (result > 0)
293 2039           sbc_bitmap_set(cset->bitmap, (unsigned char) c);
294             }
295 24           }
296              
297             /* Return true if the regex appears to meet our limitation of
298             * "only a single character class with optional '+' "
299             * This will return true for some things that parse_simple_charclass cannot parse
300             * since it can fall back to build_charset_via_regex_engine.
301             */
302 278           static bool regex_is_single_charclass(REGEXP *rx) {
303             /* Get the pattern string */
304 278           STRLEN pat_len = RX_PRELEN(rx);
305 278           const char *pattern = RX_PRECOMP(rx);
306 270 50         return pat_len >= 3 && pattern[0] == '['
307 270 100         && (pattern[pat_len-1] == ']' || (pattern[pat_len-1] == '+' && pattern[pat_len-2] == ']'))
    50          
    50          
308             /* character class '.' */
309 8 100         || (pat_len == 1 || (pat_len == 2 && pattern[1] == '+'))
    50          
    50          
310 2 50         && pattern[0] == '.'
311             /* the character classes \w \W \s \S \d \D */
312 562 100         || (pat_len == 2 || (pat_len == 3 && pattern[2] == '+'))
    50          
    0          
    0          
313 6 50         && pattern[0] == '\\' && (
314 6 100         pattern[1] == 'w' || pattern[1] == 'W'
    100          
315 4 100         || pattern[1] == 's' || pattern[1] == 'S'
    100          
316 2 100         || pattern[1] == 'd' || pattern[1] == 'D'
    50          
317             );
318             }
319              
320             /* Main function: Get or create cached charset from regexp */
321 631           secret_buffer_charset *secret_buffer_charset_from_regexpref(SV *qr_ref) {
322             MAGIC *mg;
323             REGEXP *rx;
324             secret_buffer_charset *cset;
325             dTHX;
326              
327             /* Validate input */
328 631 50         if (!qr_ref || !(rx= (REGEXP*)SvRX(qr_ref)))
    50          
329 0           croak("Expected Regexp ref");
330              
331             /* Check for existing cached charset */
332 631 100         if (SvMAGICAL(qr_ref)) {
333 353           mg = mg_findext(qr_ref, PERL_MAGIC_ext, &secret_buffer_charset_magic_vtbl);
334 353 50         if (mg && mg->mg_ptr) {
    50          
335 353           cset= (secret_buffer_charset*)mg->mg_ptr;
336 353           cset->rx= rx; // in case threading cloned us
337 353           return cset;
338             }
339             }
340              
341 278 50         if (!regex_is_single_charclass(rx))
342 0           croak("Regex must contain a single character class and nothing else");
343              
344             /* Need to create new charset */
345 278           Newxz(cset, 1, secret_buffer_charset);
346 278           cset->rx = rx;
347              
348 278 100         if (!parse_simple_charclass(aTHX_ cset, qr_ref)) {
349             int i;
350             // reset bitmap
351 120 100         for (i= 0; i < sizeof(cset->bitmap)/sizeof(cset->bitmap[0]); i++)
352 96           cset->bitmap[i]= 0;
353             // Need to use regex engine and cache results of first 256 codepoints.
354 24           build_charset_via_regex_engine(aTHX_ cset);
355             // If pattern has PMf_UNICODE or similar, it might match unicode
356             //if (rx_flags & (RXf_PMf_LOCALE | RXf_PMf_UNICODE)) {
357             // ...actually, if 'parse simple' couldn't handle it, need engine regardless
358 24           cset->unicode_above_7F= SECRET_BUFFER_CHARSET_TESTUNI;
359             }
360              
361             /* Attach magic to cache the charset */
362 278           sv_magicext(qr_ref, NULL, PERL_MAGIC_ext,
363             &secret_buffer_charset_magic_vtbl, (char*)cset, 0);
364              
365 278           return cset;
366             }