File Coverage

secret_buffer_charset.c
Criterion Covered Total %
statement 139 160 86.8
branch 110 160 68.7
condition n/a
subroutine n/a
pod n/a
total 249 320 77.8


line stmt bran cond sub pod time code
1             /* While the compiled Perl regular expression itself will have a character-class (set)
2             * implementation that could be used directly, its API is private and changes across
3             * perl versions. I gave up on interfacing directly with that, and took this approach of
4             * building my own bitmaps.
5             *
6             * The bitmaps only cache the result of testing the perl character class against bytes 0-0xFF
7             * in a non-unicode context. In a unicode context, it uses the cache for codepoints 0-0x7F
8             * and falls back to invoking the regex engine on each character with a higher codepoint value.
9             * This is inefficient, but I expect 7-bit ascii or non-unicode context is what gets used the
10             * most anyway.
11             *
12             * This file gets sourced directly into SecretBuffer.xs, so its static functions are available
13             * in other source files as well.
14             */
15              
16             struct secret_buffer_charset {
17             uint64_t bitmap[4]; // one bit per byte value 0..255: word (c >> 6), bit (c & 63)
18             REGEXP *rx; // refers to Regexp object this was derived from
19             #define SECRET_BUFFER_CHARSET_NOUNI 0 /* no codepoint beyond the cached range matches */
20             #define SECRET_BUFFER_CHARSET_ALLUNI 1 /* every codepoint beyond the cached range matches */
21             #define SECRET_BUFFER_CHARSET_TESTUNI 2 /* such codepoints are tested with the regex engine */
22             int unicode_above_7F; // one of the SECRET_BUFFER_CHARSET_* values; controls action when matching against unicode
23             bool match_multi; // stores whether regex ended with '+'
24             };
25              
26             /* MAGIC vtable for cached charset */
27 197           static int secret_buffer_charset_magic_free(pTHX_ SV *sv, MAGIC *mg) { /* 'free' slot of the vtable: releases the charset kept in mg->mg_ptr */
28 197 50         if (mg->mg_ptr) {
29 197           secret_buffer_charset *cset = (secret_buffer_charset*)mg->mg_ptr;
30 197           Safefree(cset);
31 197           mg->mg_ptr = NULL; /* do not leave a dangling pointer in the MAGIC */
32             }
33 197           return 0; /* always returns 0 */
34             }
35              
36             #ifdef USE_ITHREADS
37             static int secret_buffer_charset_magic_dup(pTHX_ MAGIC *mg, CLONE_PARAMS *param) { /* 'dup' slot of the vtable: gives this MAGIC its own copy of the charset */
38             secret_buffer_charset *old_cset = (secret_buffer_charset*)mg->mg_ptr;
39             secret_buffer_charset *new_cset;
40              
41             Newx(new_cset, 1, secret_buffer_charset);
42             Copy(old_cset, new_cset, 1, secret_buffer_charset); /* bitmap and flags are copied verbatim */
43              
44             new_cset->rx = NULL; // filled again later by secret_buffer_charset_from_regexpref
45              
46             mg->mg_ptr = (char*)new_cset; /* the MAGIC now points at the private copy */
47             return 0;
48             }
49             #else
50             #define secret_buffer_charset_magic_dup 0 /* without USE_ITHREADS the vtable's dup slot is 0 */
51             #endif
52              
53             static MGVTBL secret_buffer_charset_magic_vtbl = { /* identifies the charset magic; passed to sv_magicext and mg_findext */
54             NULL, /* get */
55             NULL, /* set */
56             NULL, /* len */
57             NULL, /* clear */
58             secret_buffer_charset_magic_free, /* free: Safefree()s the charset held in mg_ptr */
59             NULL, /* copy */
60             secret_buffer_charset_magic_dup, /* dup: copies the charset under USE_ITHREADS, otherwise 0 */
61             NULL /* local */
62             };
63              
64             /* Set the bit for byte c in the bitmap: word (c >> 6), bit (c & 63) within that word */
65 2067           static inline void sbc_bitmap_set(uint64_t *bitmap, U8 c) {
66 2067           bitmap[c >> 6] |= (1ULL << (c & 63));
67 2067           }
68             /* Test for byte in bitmap: returns true when the bit for byte c is set */
69 1123           static inline bool sbc_bitmap_test(const uint64_t *bitmap, U8 c) {
70 1123           return (bitmap[c >> 6] >> (c & 63)) & 1;
71             }
72              
73             /* Helper to test if a unicode codepoint matches the charset; returns true on a match */
74 36           static bool sbc_test_codepoint(pTHX_ const secret_buffer_charset *cset, U32 cp) {
75             /* Only codepoints 0..7F use the cache; 0x80 and up follow unicode_above_7F. Could cache up to 0xFF but locale might mess things up */
76 36 100         if (cp < 0x80)
77 26           return sbc_bitmap_test(cset->bitmap, (U8) cp);
78             /* High codepoint handling */
79 10 100         if (cset->unicode_above_7F == SECRET_BUFFER_CHARSET_TESTUNI) {
80             /* Must test with regex engine */
81 4 50         if (!cset->rx) return false; /* no regex attached: report no match */
82 4           SV *test_sv= sv_2mortal(newSV(8)); /* mortal scratch SV holding the single encoded character */
83 4           char *utf8_buf= SvPVX(test_sv);
84 4           char *end = (char*) uvchr_to_utf8((U8*) utf8_buf, cp);
85 4           *end= '\0';
86 4           SvPOK_on(test_sv);
87 4           SvCUR_set(test_sv, (end - utf8_buf));
88 4           SvUTF8_on(test_sv);
89 4           I32 result = pregexec(cset->rx, utf8_buf, end, utf8_buf, 0, test_sv, 1);
90 4           return result > 0;
91             }
92             else
93 6           return cset->unicode_above_7F == SECRET_BUFFER_CHARSET_ALLUNI; /* ALLUNI matches everything up here, NOUNI nothing */
94             }
95              
96             /* implement extern functions for public API */
97 0           bool secret_buffer_charset_test_byte(const secret_buffer_charset *cset, U8 b) { /* public wrapper: looks byte b up in the cached bitmap */
98 0           return sbc_bitmap_test(cset->bitmap, b);
99             }
100 0           bool secret_buffer_charset_test_codepoint(const secret_buffer_charset *cset, U32 cp) { /* public wrapper around sbc_test_codepoint */
101             dTHX; /* NOTE(review): presumably sets up the interpreter context that aTHX_ passes on below -- confirm */
102 0           return sbc_test_codepoint(aTHX_ cset, cp);
103             }
104              
105             /* Parse a simple character class into bitmap. Returns true if it is confident
106             * it fully handled the spec. Returns false if anything might be a problem,
107             * in which case caller should use build_charset_via_regex_engine.
108             */
109             #define HEXCHAR_TO_INT(c) (((c) >= '0' && (c) <= '9')? ((c) - '0') \
110             : ((c) >= 'A' && (c) <= 'F')? ((c) - 'A' + 10) \
111             : ((c) >= 'a' && (c) <= 'f')? ((c) - 'a' + 10) \
112             : -1) /* value 0..15 of hex digit c, or -1 if c is not a hex digit; c is evaluated more than once */
113 199           static bool parse_simple_charclass(pTHX_ secret_buffer_charset *cset, SV *qr_ref) { /* fills cset->bitmap, match_multi and unicode_above_7F from the pattern text of cset->rx; returns true only when the whole class was understood */
114 199           uint64_t *bitmap= cset->bitmap;
115 199           I32 range_start= -1; /* -1 = no range open; otherwise the first codepoint of a pending "a-" range */
116 199           bool negated = false;
117             /* before 5.10 the flags are hidden somewhere and ->extflgs doesn't exist */
118             #ifdef RX_EXTFLAGS
119 199           U32 rx_flags = RX_EXTFLAGS(cset->rx);
120 199           bool flag_i= !!(rx_flags & RXf_PMf_FOLD);
121             /* the /xx flag was added in 5.26 */
122             #ifdef RXf_PMf_EXTENDED_MORE
123 199           bool flag_xx= !!(rx_flags & RXf_PMf_EXTENDED_MORE);
124             #endif
125 199           const char *pos = RX_PRECOMP(cset->rx);
126 199           const char *lim = pos + RX_PRELEN(cset->rx);
127             #else
128             /* collect the flags by parsing the stringified representation. */
129             bool flag_i= false;
130             STRLEN len;
131             const char *pos= SvPV(qr_ref, len);
132             const char *lim= pos + len;
133             if (len < 3 || pos[0] != '(' || pos[1] != '?' || lim[-1] != ')')
134             return false;
135             bool ignore= false;
136             for (pos += 2, lim--; *pos != ':'; ++pos) {
137             if (pos >= lim) // we can read *lim because we backed it up one char
138             return false;
139             if (*pos == 'i' && !ignore)
140             flag_i= true;
141             else if (*pos == '-')
142             ignore= true;
143             }
144             pos++; /* cross ':' char */
145             #endif
146              
147             //warn("Attempting to parse '%.*s' %d %c %c\n", (int)(lim-pos), pos, (int)RX_PRELEN(rx), *pos, lim[-1]);
148 199 50         if (pos < lim && lim[-1] == '+') {
    100          
149 107           cset->match_multi= true;
150 107           lim--; /* the trailing '+' is not part of the class itself */
151             }
152 199 50         if (pos >= lim || *pos != '[' || lim[-1] != ']')
    50          
    50          
153 0           return false;
154 199           pos++; /* Skip [ */
155              
156             /* Check for negation */
157 199 50         if (pos < lim && *pos == '^') {
    100          
158 58           negated = true;
159 58           pos++;
160             }
161             /* first character may be ] without ending charset */
162 199 50         if (pos < lim && *pos == ']') {
    100          
163 12           sbc_bitmap_set(bitmap, ']');
164 12           pos++;
165             }
166             /* Parse characters and ranges */
167 546 50         while (pos < lim && *pos != ']') {
    100          
168 363           I32 c= (I32)(unsigned char) *pos++;
169             int high, low;
170             // in case of a literal char over 0x7F, things get confusing because I
171             // can't tell whether the pattern itself is latin-1 or unicode.
172 363 100         if (c >= 0x80)
173 1           return false;
174             // but if ascii notation describes a codepoint over 0x80, that's OK.
175 362 100         else if (c == '\\') {
176 123 50         if (pos >= lim) return false;
177 123           switch (*pos++) {
178             /* is it escaping something we can use literally below? */
179 0           case '\\': case ']': case ' ':
180 0           c= (unsigned char) pos[-1];
181 0           break;
182             /* is it a special constant? */
183 0           case 'a': c= '\a'; break;
184 0           case 'b': c= '\b'; break;
185 1           case 'e': c= '\033'; break; /* ESC; '\e' is a compiler extension, not standard C */
186 0           case 'f': c= '\f'; break;
187 56           case 'n': c= '\n'; break;
188 1           case 'r': c= '\r'; break;
189 40           case 't': c= '\t'; break;
190 0           case 'o': // octal
191 0 0         if (pos + 1 >= lim || !(*pos >= '0' && *pos <= '7'))
    0          
    0          
192 0           return false;
193 0           ++pos; /* fallthrough: the digit just skipped is read as pos[-1] below */
194 6           case '0': case '1': case '2': case '3':
195             case '4': case '5': case '6': case '7':
196 6           c= pos[-1] - '0';
197 6 50         if (pos < lim && *pos >= '0' && *pos <= '7')
    100          
    100          
198 2           c= (c << 3) | (*pos++ - '0');
199 6 50         if (pos < lim && *pos >= '0' && *pos <= '7')
    100          
    100          
200 1           c= (c << 3) | (*pos++ - '0');
201 6 100         if (c > 0xFF) /* three octal digits can exceed one byte */
202 1           cset->unicode_above_7F= SECRET_BUFFER_CHARSET_TESTUNI;
203 6           break;
204 11           case 'x':
205 11 50         if (pos+1 >= lim) return false;
206 11 50         high= HEXCHAR_TO_INT(pos[0]);
    100          
    50          
    100          
    50          
    50          
207 11 50         low= HEXCHAR_TO_INT(pos[1]);
    100          
    50          
    50          
    0          
    0          
208 11 100         if (high < 0 || low < 0) return false; /* anything but exactly two hex digits is left to the regex engine */
    50          
209 7           c= (high << 4) | low;
210 7           pos += 2;
211 7           break;
212 8           default:
213             /* too complicated, give up and fall back to exhaustive test*/
214 8           return false;
215             }
216             }
217             // abort on [:class:] notation
218 239 100         else if (c == '[' && pos < lim && *pos == ':')
    50          
    50          
219 3           return false;
220             // the /xx flag was added in 5.26
221             #ifdef RXf_PMf_EXTENDED_MORE
222 236 100         else if ((c == ' ' || c == '\t') && flag_xx)
    50          
    100          
223 2           continue;
224             #endif
225 345 100         if (range_start >= 0) {
226 48 50         if (c < range_start) /* Invalid range */
227 0           return false;
228 48 50         if (c > 0xFF) /* the bitmap only holds 0..0xFF */
229 0           c= 0xFF;
230 966 100         while (range_start <= c)
231 918           sbc_bitmap_set(bitmap, (unsigned char) range_start++);
232 48           range_start= -1;
233             }
234 297 100         else if (pos + 1 < lim && *pos == '-' && pos[1] != ']') {
    100          
    50          
235 48           range_start= c;
236 48           ++pos; // skip '-' char
237             }
238 249 100         else if (c <= 0xFF) { /* 0xFF itself is a valid bitmap member; only larger codepoints are skipped */
239 248           sbc_bitmap_set(bitmap, (U8) c);
240             }
241             }
242 183 50         if (pos+1 != lim) // regex did not end at ']', give up
243 0           return false;
244             //warn("bitmaps: %08llX %08llX %08llX %08llX\n", bitmap[0], bitmap[1], bitmap[2], bitmap[3]);
245 183 100         if (flag_i) {
246             // Latin1 case folding will be a mess best handled by the regex engine
247 2 50         if (bitmap[2] | bitmap[3])
248 0           return false;
249             // Bits in range 0x41-0x5A need ORed into 0x61-0x7A and vice-versa
250 2           bitmap[1] |= ((bitmap[1]>>32) & 0x7FFFFFE);
251 2           bitmap[1] |= (bitmap[1] & 0x7FFFFFE) << 32;
252             }
253             // If any char 0x80-0xFF is set, a unicode context should use the regex engine.
254             // Otherwise, the charset doesn't contain any upper chars at all.
255 183 50         if (bitmap[2] || bitmap[3])
    100          
256 2           cset->unicode_above_7F= SECRET_BUFFER_CHARSET_TESTUNI;
257             // Apply negation
258 183 100         if (negated) {
259             int i;
260 285 100         for (i = 0; i < 4; i++)
261 228           bitmap[i] = ~bitmap[i];
262 57 50         if (cset->unicode_above_7F == SECRET_BUFFER_CHARSET_NOUNI) /* a negated class with no high members matches every high codepoint */
263 57           cset->unicode_above_7F= SECRET_BUFFER_CHARSET_ALLUNI;
264             }
265 183           return true;
266             }
267              
268             /* Build bitmap by testing each byte 0x00..0xFF through the regex engine; bits are only ever set here, never cleared */
269 16           static void build_charset_via_regex_engine(pTHX_ secret_buffer_charset *cset) {
270 16           SV *test_sv= sv_2mortal(newSV(2)); /* mortal one-byte scratch string reused for every probe */
271             int c;
272 16           SvPOK_on(test_sv);
273 16           SvCUR_set(test_sv, 1);
274 16           char *buf= SvPVX(test_sv); /* NOTE(review): buf[1] is never written here, unlike the explicit '\0' in sbc_test_codepoint -- confirm pregexec does not need it */
275             //warn("Run regex test on chars 0x00-0xFF\n");
276 4112 100         for (c= 0; c < 256; c++) {
277 4096           buf[0]= (char) c;
278             /* run the regex against just this one byte */
279 4096           I32 result = pregexec(cset->rx, buf, buf+1, buf, 0, test_sv, 1);
280 4096 100         if (result > 0)
281 889           sbc_bitmap_set(cset->bitmap, (unsigned char) c);
282             }
283 16           }
284              
285 199           static bool regex_is_single_charclass(REGEXP *rx) { /* cheap textual check: pattern starts with '[' and ends with ']' or "]+" */
286             /* Get the pattern string */
287 199           STRLEN pat_len = RX_PRELEN(rx);
288 199           const char *pattern = RX_PRECOMP(rx);
289             // struct regexp *re=
290             //#ifndef SVt_REGEXP
291             // (struct regexp*) rx; // before 5.12 REGEXP was struct regexp
292             //#else
293             // (struct regexp*) SvANY(rx); // after 5.12 REGEXP is a type of SV
294             //#endif
295             /* Try to validate that this regex is a single char class, with optional '+'. Only the first and last characters are inspected here. */
296             //warn("pattern = '%.*s' re->nparens = %d re->minlen = %d", pat_len, pattern, re->nparens, re->minlen);
297 398 50         return pat_len >= 3 && pattern[0] == '[' && (
    50          
298 199 100         pattern[pat_len-1] == ']'
299 107 50         || (pattern[pat_len-1] == '+' && pattern[pat_len-2] == ']')
    50          
300             );
301             // && re->nparens == 0 && re->minlen == 1; <-- this doesn't seem to be reliable
302             }
303              
304             /* Main function: Get or create cached charset from regexp */
305 360           secret_buffer_charset *secret_buffer_charset_from_regexpref(SV *qr_ref) { /* returns the charset cached on qr_ref, building and attaching it on first use; croaks on invalid input */
306             MAGIC *mg;
307             REGEXP *rx;
308             secret_buffer_charset *cset;
309             dTHX;
310              
311             /* Validate input */
312 360 50         if (!qr_ref || !(rx= (REGEXP*)SvRX(qr_ref)))
    50          
313 0           croak("Expected Regexp ref");
314              
315             /* Check for existing cached charset */
316 360 100         if (SvMAGICAL(qr_ref)) {
317 161           mg = mg_findext(qr_ref, PERL_MAGIC_ext, &secret_buffer_charset_magic_vtbl);
318 161 50         if (mg && mg->mg_ptr) {
    50          
319 161           cset= (secret_buffer_charset*)mg->mg_ptr;
320 161           cset->rx= rx; // in case threading cloned us (the dup hook leaves rx NULL)
321 161           return cset;
322             }
323             }
324              
325 199 50         if (!regex_is_single_charclass(rx))
326 0           croak("Regex must contain a single character class and nothing else");
327              
328             /* Need to create new charset; Newxz zero-fills it, so the mode starts as NOUNI and match_multi as false */
329 199           Newxz(cset, 1, secret_buffer_charset);
330 199           cset->rx = rx;
331              
332 199 100         if (!parse_simple_charclass(aTHX_ cset, qr_ref)) {
333             int i;
334             // reset bitmap, the failed parse may have set some bits already
335 80 100         for (i= 0; i < sizeof(cset->bitmap)/sizeof(cset->bitmap[0]); i++)
336 64           cset->bitmap[i]= 0;
337             // Need to use regex engine and cache results of first 256 codepoints.
338 16           build_charset_via_regex_engine(aTHX_ cset);
339             // If pattern has PMf_UNICODE or similar, it might match unicode
340             //if (rx_flags & (RXf_PMf_LOCALE | RXf_PMf_UNICODE)) {
341             // ...actually, if 'parse simple' couldn't handle it, need engine regardless
342 16           cset->unicode_above_7F= SECRET_BUFFER_CHARSET_TESTUNI;
343             }
344              
345             /* Attach magic to cache the charset; mg_ptr is stored as-is (length 0), so the magic owns cset */
346 199           mg = sv_magicext(qr_ref, NULL, PERL_MAGIC_ext,
347             &secret_buffer_charset_magic_vtbl, (char*)cset, 0);
348             mg->mg_flags |= MGf_DUP; /* without this flag perl does not call the vtable's dup hook on clone, and two interpreters would share and free the same cset */
349 199           return cset;
350             }