File Coverage

secret_buffer_parse.c
Criterion Covered Total %
statement 540 692 78.0
branch 391 592 66.0
condition n/a
subroutine n/a
pod n/a
total 931 1284 72.5


line stmt bran cond sub pod time code
1              
2             /* These local parse functions are independent of the SecretBuffer instance,
3             * needing only the 'data' pointer to which the parse_state refers.
4             * The pos/lim of the parse state must already be checked against the length
5             * of the data before calling these.
6             */
7             static int sizeof_codepoint_encoding(int codepoint, int encoding);
8             static int sb_parse_prev_codepoint(secret_buffer_parse *parse);
9             static int sb_parse_next_codepoint(secret_buffer_parse *parse);
10             static bool sb_parse_encode_codepoint(secret_buffer_parse *parse, int codepoint);
11             static bool sb_parse_match_charset_bytes(secret_buffer_parse *parse, const secret_buffer_charset *cset, int flags);
12             static bool sb_parse_match_charset_codepoints(secret_buffer_parse *parse, const secret_buffer_charset *cset, int flags);
13             static bool sb_parse_match_str_U8(secret_buffer_parse *parse, const U8 *pattern, size_t pattern_len, int flags);
14             static bool sb_parse_match_str_I32(secret_buffer_parse *parse, const I32 *pattern, size_t pattern_len, int flags);
15              
16 66           static bool parse_encoding(pTHX_ SV *sv, int *out) {
17             int enc;
18 66 50         if (looks_like_number(sv)) {
19 0           IV i= SvIV(sv);
20 0 0         if (i < 0 || i > SECRET_BUFFER_ENCODING_MAX)
    0          
21 0           return false;
22 0           enc= (int) i;
23             } else {
24             STRLEN len;
25 66           const char *str= SvPV(sv, len);
26 66           switch (len) {
27 6 50         case 3: if (0 == strcmp(str, "HEX")) { enc= SECRET_BUFFER_ENCODING_HEX; break; }
28 1 50         case 4: if (0 == strcmp(str, "UTF8")) { enc= SECRET_BUFFER_ENCODING_UTF8; break; }
29 13 100         case 5: if (0 == strcmp(str, "ASCII")) { enc= SECRET_BUFFER_ENCODING_ASCII; break; }
30 12 50         if (0 == strcmp(str, "UTF-8")) { enc= SECRET_BUFFER_ENCODING_UTF8; break; }
31 26 50         case 6: if (0 == strcmp(str, "BASE64")) { enc= SECRET_BUFFER_ENCODING_BASE64; break; }
32 1 50         case 7: if (0 == strcmp(str, "UTF16LE")) { enc= SECRET_BUFFER_ENCODING_UTF16LE; break; }
33 0 0         if (0 == strcmp(str, "UTF16BE")) { enc= SECRET_BUFFER_ENCODING_UTF16BE; break; }
34 6 100         case 8: if (0 == strcmp(str, "UTF-16LE")) { enc= SECRET_BUFFER_ENCODING_UTF16LE; break; }
35 3 50         if (0 == strcmp(str, "UTF-16BE")) { enc= SECRET_BUFFER_ENCODING_UTF16BE; break; }
36 0 0         case 9: if (0 == strcmp(str, "ISO8859_1")) { enc= SECRET_BUFFER_ENCODING_ISO8859_1; break; }
37 13 50         case 10: if (0 == strcmp(str, "ISO-8859-1")) { enc= SECRET_BUFFER_ENCODING_ISO8859_1; break; }
38             default:
39 0           return false;
40             }
41             }
42 66 50         if (out) *out= enc;
43 66           return true;
44             }
45              
46             /* Public API --------------------------------------------------------------*/
47              
48             /* initialize a parse struct, but only if it is a valid span of the buffer */
49 2001           bool secret_buffer_parse_init(secret_buffer_parse *parse,
50             secret_buffer *buf, size_t pos, size_t lim, int encoding
51             ) {
52 2001           Zero(parse, 1, secret_buffer_parse);
53             // Sanity check this parse state vs. the buffer
54 2001 100         if (lim > buf->len || pos > lim) {
    50          
55 1 50         parse->error= pos > lim? "span starts beyond buffer" : "span ends beyond buffer";
56 1           return false;
57             }
58 2000           parse->pos= ((U8*) buf->data) + pos;
59 2000           parse->lim= ((U8*) buf->data) + lim;
60 2000           parse->encoding= encoding;
61 2000           parse->sbuf= buf;
62 2000           return true;
63             }
64              
65             /* Initialize a parse struct, either from a Span, or a SecretBuffer, or a plain Scalar.
66             */
67 1737           bool secret_buffer_parse_init_from_sv(secret_buffer_parse *parse, SV *sv) {
68             dTHX;
69             secret_buffer *sb;
70             secret_buffer_span *span;
71             /* Is the sv a Span object? */
72 1737 100         if ((span= secret_buffer_span_from_magic(sv, 0)) && SvTYPE(SvRV(sv)) == SVt_PVHV) {
    50          
73 1284           SV **sb_sv= hv_fetchs((HV*)SvRV(sv), "buf", 1);
74 1284           sb= secret_buffer_from_magic(*sb_sv, SECRET_BUFFER_MAGIC_OR_DIE);
75 1284           return secret_buffer_parse_init(parse, sb, span->pos, span->lim, span->encoding);
76             }
77             /* Is the sv a SecretBuffer? */
78 453 100         else if ((sb= secret_buffer_from_magic(sv, 0))) {
79 2           return secret_buffer_parse_init(parse, sb, 0, sb->len, SECRET_BUFFER_ENCODING_ISO8859_1);
80             }
81             /* It needs to at least be defined */
82 451 50         else if (SvOK(sv)) {
83             STRLEN len;
84 451           char *buf= SvPV(sv, len);
85 451           Zero(parse, 1, secret_buffer_parse);
86 451           parse->pos= (U8*) buf;
87 451           parse->lim= (U8*) buf + len;
88 451           parse->encoding= SvUTF8(sv)? SECRET_BUFFER_ENCODING_UTF8 : SECRET_BUFFER_ENCODING_ISO8859_1;
89 451           return true;
90             }
91             else {
92 0           Zero(parse, 1, secret_buffer_parse);
93 0           parse->error= "Not a Span, SecretBuffer, or defined scalar";
94 0           return false;
95             }
96             }
97              
98             /* Scan for a pattern which may be a regex or literal string.
99             * Regexes are currently limited to a single charclass.
100             */
101 785           bool secret_buffer_match(secret_buffer_parse *parse, SV *pattern, int flags) {
102             dTHX;
103 785           REGEXP *rx= (REGEXP*)SvRX(pattern);
104             secret_buffer_parse pat_parse;
105              
106             /* Is the pattern a regexp-ref? */
107 785 100         if (rx) {
108 346           secret_buffer_charset *cset= secret_buffer_charset_from_regexpref(pattern);
109 346           return secret_buffer_match_charset(parse, cset, flags);
110             }
111              
112             /* load up a parse struct with the pos, lim, and encoding */
113 439 50         if (!secret_buffer_parse_init_from_sv(&pat_parse, pattern))
114 0           croak("%s", pat_parse.error);
115              
116             /* Remove edge case of zero-length pattern (always matches) */
117 439 100         if (pat_parse.pos >= pat_parse.lim) {
118 2 50         if ((flags & SECRET_BUFFER_MATCH_REVERSE))
119 0           parse->pos= parse->lim;
120             else
121 2           parse->lim= parse->pos;
122 2           return !(flags & SECRET_BUFFER_MATCH_NEGATE);
123             }
124             /* Remove edge case of zero-length subject (never matches) */
125 437 100         if (parse->pos >= parse->lim) {
126 4           return (flags & SECRET_BUFFER_MATCH_NEGATE);
127             }
128              
129             /* Since unicode iteration of the pattern is a hassle and might happen lots of times,
130             * convert it to either plain bytes or an array of I32 codepoints.
131             */
132 433 100         if (pat_parse.encoding != SECRET_BUFFER_ENCODING_ISO8859_1) {
133 17           int dst_enc=
134             /* these can be transcoded to bytes */
135 17           (pat_parse.encoding == SECRET_BUFFER_ENCODING_ASCII
136 17 100         || pat_parse.encoding == SECRET_BUFFER_ENCODING_HEX
137 16 50         || pat_parse.encoding == SECRET_BUFFER_ENCODING_BASE64)
138             ? SECRET_BUFFER_ENCODING_ISO8859_1
139 34 50         : SECRET_BUFFER_ENCODING_I32;
140 17           SSize_t dst_len= secret_buffer_sizeof_transcode(&pat_parse, dst_enc);
141 17 50         if (dst_len < 0)
142 0           croak("transcode of pattern failed: %s", pat_parse.error);
143             /* No need to transcode SECRET_BUFFER_ENCODING_ASCII, but the above size check
144             * verified it is clean 7-bit, which is the whole point of that encoding.
145             */
146 17 50         if (pat_parse.encoding == SECRET_BUFFER_ENCODING_ASCII
147             /* Likewise, if SECRET_BUFFER_ENCODING_UTF8's I32 len is exactly 4x the number of
148             * original bytes, that means every byte became a character, which means every
149             * character could fit in a byte. */
150 17 100         || (pat_parse.encoding == SECRET_BUFFER_ENCODING_UTF8
151 16 100         && dst_len == (pat_parse.lim - pat_parse.pos) * 4)
152             ) {
153 9           pat_parse.encoding= SECRET_BUFFER_ENCODING_ISO8859_1;
154             } else {
155             /* create a temporary secret buffer to hold the transcode */
156 8           secret_buffer *tmp= secret_buffer_new(0, NULL);
157 8           secret_buffer_parse pat_orig= pat_parse;
158 8           secret_buffer_set_len(tmp, dst_len);
159 8 50         if (!secret_buffer_parse_init(&pat_parse, tmp, 0, dst_len, dst_enc))
160 0           croak("transcode of pattern failed: %s", pat_parse.error);
161             /* Transcode the pattern */
162 8 50         if (!secret_buffer_transcode(&pat_orig, &pat_parse))
163 0 0         croak("transcode of pattern failed: %s", pat_orig.error? pat_orig.error : pat_parse.error);
164             }
165             }
166             /* In some cases it would also be nice to transcode the subject first, but the
167             * final state of the parse struct carries information back to the caller and
168             * needs to refer to original positions of characters. */
169              
170             /* Now dispatch to sb_parse_match_str_X */
171 433 100         if (pat_parse.encoding == SECRET_BUFFER_ENCODING_ISO8859_1) {
172 426           size_t pat_len= pat_parse.lim - pat_parse.pos;
173 426           return sb_parse_match_str_U8(parse, pat_parse.pos, pat_len, flags);
174             } else { /* must be _I32 encoding, from above */
175 7           size_t pat_len= (pat_parse.lim - pat_parse.pos) >> 2;
176 7           return sb_parse_match_str_I32(parse, (I32*) pat_parse.pos, pat_len, flags);
177             }
178             }
179              
180             /* Scan for a pattern which is a set of characters */
181 346           bool secret_buffer_match_charset(secret_buffer_parse *parse, secret_buffer_charset *cset, int flags) {
182 346 100         if (parse->pos >= parse->lim) // empty range
183 48           return false;
184              
185             // byte matching gets to use a more efficient algorithm
186 298           return parse->encoding == SECRET_BUFFER_ENCODING_ISO8859_1
187 289           ? sb_parse_match_charset_bytes(parse, cset, flags)
188 587 100         : sb_parse_match_charset_codepoints(parse, cset, flags);
189             }
190              
191             /* Scan for a pattern which is a literal string of bytes.
192             */
193 0           bool secret_buffer_match_bytestr(secret_buffer_parse *parse, char *data, size_t datalen, int flags) {
194 0           return sb_parse_match_str_U8(parse, (U8*) data, datalen, flags);
195             }
196              
197             /* Count number of bytes required to transcode the source.
198             * If the source contains an invalid character for its encoding, or that codepoint
199             * can't be encoded as the dst_encoding, this returns -1 and sets src->error
200             * and also sets src->pos pointing at the character that could not be converted.
201             */
202 118           SSize_t secret_buffer_sizeof_transcode(secret_buffer_parse *src, int dst_encoding) {
203             // If the source and destination encodings are both bytes, return the length
204 118 100         if (dst_encoding == src->encoding && src->encoding == 0)
    100          
205 17           return src->lim - src->pos;
206             // Else need to iterate characters (to validate) and re-encode them
207             else {
208 101           size_t dst_size_needed= 0;
209             secret_buffer_parse tmp;
210 101           Zero(&tmp, 1, secret_buffer_parse);
211 101           tmp.pos= src->pos;
212 101           tmp.lim= src->lim;
213 101           tmp.encoding= src->encoding;
214 859 100         while (tmp.pos < tmp.lim) {
215 758           int cp= sb_parse_next_codepoint(&tmp);
216 758 50         if (cp < 0) return -1;
217 758           int ch_size= sizeof_codepoint_encoding(cp, dst_encoding);
218 758 50         if (ch_size < 0) return -1;
219 758           dst_size_needed += ch_size;
220             }
221             // If dest is base64, need special calculation
222 101 100         if (dst_encoding == SECRET_BUFFER_ENCODING_BASE64) {
223 10           dst_size_needed= ((dst_size_needed + 2) / 3) * 4;
224             }
225 101           return dst_size_needed;
226             }
227             }
228              
229             static const char base64_alphabet[64]=
230             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
231             "abcdefghijklmnopqrstuvwxyz"
232             "0123456789+/";
233              
234             /*
235             perl -E 'my @tbl= (-1)x256;
236             $tbl[ord]= -ord(A)+ord for A..Z;
237             $tbl[ord]= 26-ord(a)+ord for a..z;
238             $tbl[ord]= 52-ord(0)+ord for 0..9;
239             $tbl[ord "+"]= 62;
240             $tbl[ord "/"]= 63;
241             $tbl[ord "="]= 64;
242             say join ",\n", map join(",", @tbl[$_*16 .. $_*16+15]), 0..0xF'
243             */
244             static const int8_t base64_decode_table[256]= {
245             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
246             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
247             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,
248             52,53,54,55,56,57,58,59,60,61,-1,-1,-1,64,-1,-1,
249             -1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
250             15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
251             -1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
252             41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
253             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
254             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
255             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
256             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
257             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
258             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
259             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
260             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
261             };
262              
263             /* Transcode characters from one parse state into another.
264             * This works sort of like
265             * $data= decode($src_enc, substr($src, $src_pos, $src_len));
266             * substr($dst, $dst_pos, $dst_lim, encode($dst_enc, $data));
267             * processing only a range of the source, and replacing only a range of the dest,
268             * adjusting the size of dst as needed. Both src->pos and dst->pos
269             * are updated.
270             */
271 109           bool secret_buffer_transcode(secret_buffer_parse *src, secret_buffer_parse *dst) {
272 109           src->error= NULL;
273 109           dst->error= NULL;
274             // If the source and destination encodings are both bytes, use memcpy
275 109 100         if (dst->encoding == src->encoding && src->encoding == 0) {
    100          
276 17           size_t cnt= dst->lim - dst->pos;
277 17 50         if (src->lim - src->pos != cnt) {
278 0           dst->error= "miscalculated buffer length";
279 0           return false;
280             }
281 17           memcpy(dst->pos, src->pos, cnt);
282 17           dst->pos += cnt;
283 17           src->pos += cnt;
284             }
285             // Else need to iterate characters and re-encode them
286             // base64 encoding doesn't work with sb_parse_encode_codepoint, so it gets
287             // special treatment.
288 92 100         else if (dst->encoding == SECRET_BUFFER_ENCODING_BASE64) {
289             // Read 3, write 4
290 10           int accum= 0;
291 10           int shift= 16, cp;
292 88 100         while (src->pos < src->lim) {
293 78           cp= sb_parse_next_codepoint(src);
294 78 50         if (cp > 0xFF) {
295 0           dst->error= "byte out of range";
296 0           return false;
297             }
298 78 100         if (!shift) {
299 24 50         if (dst->pos + 4 > dst->lim) {
300 0           dst->error= "miscalculated buffer length";
301 0           return false;
302             }
303 24           accum |= cp;
304 24           *dst->pos++ = base64_alphabet[0x3F & (accum >> 18)];
305 24           *dst->pos++ = base64_alphabet[0x3F & (accum >> 12)];
306 24           *dst->pos++ = base64_alphabet[0x3F & (accum >> 6)];
307 24           *dst->pos++ = base64_alphabet[0x3F & accum];
308 24           accum= 0;
309 24           shift= 16;
310             }
311             else {
312 54           accum |= (cp << shift);
313 54           shift -= 8;
314             }
315             }
316 10 100         if (dst->pos + (shift < 16? 4 : 0) != dst->lim) {
    50          
317 0           dst->error= "miscalculated buffer length";
318 0           return false;
319             }
320             // write leftover accumulated bits
321 10 100         if (shift < 16) {
322 5           *dst->pos++ = base64_alphabet[0x3F & (accum >> 18)];
323 5           *dst->pos++ = base64_alphabet[0x3F & (accum >> 12)];
324 5 100         *dst->pos++ = shift? '=' : base64_alphabet[0x3F & (accum >> 6)];
325 5           *dst->pos++ = '=';
326             }
327             }
328             else {
329 526 100         while (src->pos < src->lim) {
330 444           int cp= sb_parse_next_codepoint(src);
331 444 50         if (cp < 0)
332 0           return false; // error is already set
333 444           int len= sb_parse_encode_codepoint(dst, cp);
334 444 50         if (len < 0)
335 0           return false; // error is already set
336             }
337 82 50         if (dst->pos != dst->lim) {
338 0           dst->error= "miscalculated buffer length";
339 0           return false;
340             }
341             }
342 109           return true;
343             }
344              
345             bool
346 101           secret_buffer_copy_to(secret_buffer_parse *src, SV *dst_sv, int encoding, bool append) {
347             dTHX;
348             secret_buffer_parse dst;
349 101           secret_buffer *dst_sbuf= NULL;
350             SSize_t need_bytes;
351 101           bool dst_wide= false;
352              
353 101           Zero(&dst, 1, secret_buffer_parse);
354             // Encoding may be -1 to indicate the user didn't specify, in which case we use the
355             // same encoding as the source, unless the destination is a perl scalar (handled below)
356 101 100         dst.encoding= encoding >= 0? encoding : src->encoding;
357 101 100         if (sv_isobject(dst_sv)) {
358             // if object, must be a SecretBuffer
359 27           dst_sbuf= secret_buffer_from_magic(dst_sv, SECRET_BUFFER_MAGIC_OR_DIE);
360             }
361             else {
362             // Going to overwrite the scalar, or if it's a scalar-ref, overwrite that.
363 74 50         if (SvROK(dst_sv) && !sv_isobject(dst_sv) && SvTYPE(SvRV(dst_sv)) <= SVt_PVMG)
    0          
    0          
364 0           dst_sv= SvRV(dst_sv);
365             // Refuse to overwrite any other kind of ref
366 74 50         if (SvTYPE(dst_sv) > SVt_PVMG || SvROK(dst_sv)) {
    50          
367 0           src->error= "Can only copy_to scalars or scalar-refs";
368 0           return false;
369             }
370             // If the source encoding is a type of unicode, and the destination encoding is not
371             // specified, then write wide characters (utf-8) to the perl scalar and flag it as utf8
372 74 100         if (encoding < 0 && SECRET_BUFFER_ENCODING_IS_UNICODE(src->encoding)) {
    100          
    100          
    100          
    50          
373 66           dst.encoding= SECRET_BUFFER_ENCODING_UTF8;
374 66           dst_wide= true;
375             }
376             }
377             // Determine how many bytes we need
378 101           need_bytes= secret_buffer_sizeof_transcode(src, dst.encoding);
379 101 50         if (need_bytes < 0)
380 0           return false;
381             // Prepare the buffers for that many bytes
382 101 100         if (dst_sbuf) {
383             // For destination SecretBuffer, set length to 0 unless appending, then
384             // ensure enough allocated space for need_bytes, then transcode and update
385             // the length in the block below.
386 27 100         if (!append)
387 20           secret_buffer_set_len(dst_sbuf, 0); /* clears secrets */
388 27           secret_buffer_alloc_at_least(dst_sbuf, dst_sbuf->len + need_bytes);
389 27           dst.pos= (U8*) dst_sbuf->data + dst_sbuf->len;
390 27           dst.lim= dst.pos + need_bytes;
391             }
392             else {
393             // For destination SV, set length to 0 unless appending, then force it to
394             // be bytes or utf-8, then grow it to ensure room for additional `need_bytes`.
395             STRLEN len;
396             // If overwriting, set the length to 0 before forcing to bytes or utf8
397 74 100         if (!append)
398 72           sv_setpvn(dst_sv, "", 0);
399             // force it to the type required
400 74 100         if (dst_wide) SvPVutf8(dst_sv, len);
401 8           else SvPVbyte(dst_sv, len);
402             // grow it to the required length, for writing
403 74 100         sv_grow(dst_sv, (append? len : 0) + need_bytes + 1);
404 74           dst.pos= (U8*) SvPVX_mutable(dst_sv) + len;
405 74           dst.lim= dst.pos + need_bytes;
406             // don't forget the NUL terminator
407 74           *dst.lim= '\0';
408             }
409 101 50         if (!secret_buffer_transcode(src, &dst)) {
410 0 0         if (!src->error) src->error= dst.error;
411 0           return false;
412             }
413             /* update the lengths */
414 101 100         if (dst_sbuf) {
415 27           dst_sbuf->len += need_bytes;
416             }
417             else {
418 74           SvCUR_set(dst_sv, SvCUR(dst_sv) + need_bytes);
419 74 50         SvSETMAGIC(dst_sv);
420             }
421 101           return true;
422             }
423              
424             /* Append DER length octets (ASN.1 Length field, definite form only).
425             *
426             * DER rules:
427             * - If len <= 127: single byte [0x00..0x7F]
428             * - Else: first byte is 0x80 | N, where N is # of following length bytes (big-endian),
429             * and the length must be encoded in the minimal number of bytes (no leading 0x00).
430             *
431             * This function encodes ONLY the length field (not tag/value).
432             */
433             void
434 384           secret_buffer_append_uv_asn1_der_length(secret_buffer *buf, UV val) {
435             dTHX;
436 384           int enc_len = 1;
437             U8 *pos;
438 384 100         if (val > 127) {
439             /* Determine minimal number of bytes needed to represent len in base-256. */
440 339           UV tmp = val;
441 2001 100         while (tmp) {
442 1662           enc_len++;
443 1662           tmp >>= 8;
444             }
445             }
446             /* In BER/DER, the long-form initial octet has 7 bits of length-of-length.
447             * 0x80 is indefinite length (forbidden in DER), 0xFF would mean 127 length bytes.
448             * With 64-bit UV enc_len will never exceed 9.
449             */
450 384 50         ASSUME(enc_len < 127);
451 384           secret_buffer_set_len(buf, buf->len + enc_len);
452 384           pos= (U8*) buf->data + buf->len - 1;
453 384 100         if (val <= 127) {
454 45           *pos = (U8) val;
455             } else {
456 339           UV tmp = val;
457             /* Write the length big-endian into enc[1..n]. */
458 2001 100         while (tmp) {
459 1662           *pos-- = (U8)(tmp & 0xFF);
460 1662           tmp >>= 8;
461             }
462 339           *pos= (U8) (0x80 | (U8)(enc_len-1));
463             }
464 384           }
465              
466             /* Parse ASN.1 DER Length (definite form only) */
467             bool
468 384           secret_buffer_parse_uv_asn1_der_length(secret_buffer_parse *parse, UV *out) {
469             /* Work on a local cursor so we can roll back on failure */
470 384           U8 *pos = parse->pos;
471 384           U8 *lim = parse->lim;
472             UV result;
473              
474 384 50         if (pos >= lim) {
475 0           parse->error = "unexpected end of buffer";
476 0           return false;
477             }
478              
479 384           result = *pos++;
480              
481             /* If 0..127, the byte is the length value itself, otherwise it is the number of octets
482             * to read following that byte. */
483 384 100         if ((result & 0x80)) {
484 339           int n = result & 0x7F;
485             /* 0x80 means indefinite length (BER/CER), forbidden in DER */
486 339 50         if (n == 0) {
487 0           parse->error = "ASN.1 DER indefinite length not allowed";
488 0           return false;
489             }
490             /* Number of octets should be the smallest possible encoding, so if it is larger than
491             * sizeof(UV), don't even bother trying to decode it.
492             */
493 339 50         if (n > sizeof(UV)) {
494 0           parse->error = "ASN.1 DER length too large for perl UV";
495 0           return false;
496             }
497             /* ensure we have that many bytes */
498 339 50         if ((size_t)(lim - pos) < (size_t)n) {
499 0           parse->error = "unexpected end of buffer";
500 0           return false;
501             }
502             /* DER minimal encoding rules:
503             * - no leading 0x00 in the length octets
504             * - long form must not be used for lengths <= 127
505             */
506 339           lim= pos + n;
507 339           result= *pos++;
508 339 50         if (!result) {
509 0           parse->error = "ASN.1 DER length has leading zero (non-minimal)";
510 0           return false;
511             }
512             /* Parse remaining bytes of big-endian unsigned integer */
513 1662 100         while (pos < lim)
514 1323           result= (result << 8) | *pos++;
515             /* DER should not use 1-byte encoding if it would have fit in the initial byte */
516 339 50         if (result < 0x80) {
517 0           parse->error = "ASN.1 DER length should use short form (non-minimal)";
518 0           return false;
519             }
520             }
521 384 50         if (out) *out = result;
522 384           parse->pos = pos;
523 384           parse->error = NULL;
524 384           return true;
525             }
526              
527             /* Append canonical unsigned Base128, Little-Endian
528             *
529             * Rules:
530             * - 7 data bits per byte, little-endian (least significant group first)
531             * - High bit 0x80 set on all bytes except the final byte
532             * - Canonical/minimal: stop as soon as remaining value is 0
533             */
534             void
535 384           secret_buffer_append_uv_base128le(secret_buffer *buf, UV val) {
536             dTHX;
537             U8 *pos;
538 384           int enc_len= 1;
539 384           UV tmp= val >> 7;
540 1923 100         while (tmp) {
541 1539           enc_len++;
542 1539           tmp >>= 7;
543             }
544 384           secret_buffer_set_len(buf, buf->len + enc_len);
545 384           pos= (U8*) buf->data + buf->len - enc_len;
546             /* Encode */
547 384           tmp= val;
548             do {
549 1923           U8 byte = (U8)(tmp & 0x7F);
550 1923           tmp >>= 7;
551 1923 100         if (tmp)
552 1539           byte |= 0x80;
553 1923           *pos++ = byte;
554 1923 100         } while (tmp);
555 384 50         ASSUME(pos == (U8*)(buf->data + buf->len));
556 384           }
557              
558             /* Parse Unsigned LittleEndian Base128 (also requiring canonical / minimal encoding) */
559             bool
560 384           secret_buffer_parse_uv_base128le(secret_buffer_parse *parse, UV *out) {
561 384           U8 *pos = parse->pos;
562 384           U8 *lim = parse->lim;
563 384           UV result= 0, payload;
564 384           int shift= 7;
565              
566 384 50         if (pos >= lim) {
567 0           parse->error = "unexpected end of buffer";
568 0           return false;
569             }
570 384           result= payload= *pos & 0x7F;
571             /* Scan forward looking for the first byte without the continuation flag */
572 1923 100         while (*pos++ & 0x80) {
573 1539 50         if (pos >= lim) {
574 0           parse->error = "unexpected end of buffer";
575 0           return false;
576             }
577 1539           payload= *pos & 0x7F;
578 1539 100         if (shift > sizeof(UV)*8 - 7) {
579             /* Do any of the bits overflow? Is the continuation flag set? */
580 3 50         if (shift >= sizeof(UV)*8 || (payload >> (sizeof(UV)*8 - shift))) {
    50          
581 0           parse->error = "Base128-LE value overflows perl UV";
582 0           return false;
583             }
584             }
585 1539           result |= payload << shift;
586 1539           shift += 7;
587             }
588             /* check if the high bits were all zero, meaning an unnecessary byte was encoded */
589 384 100         if (!payload && result != 0) {
    50          
590 0           parse->error = "Over-long encoding of Base128-LE";
591 0           return false;
592             }
593 384 50         if (out) *out = result;
594 384           parse->pos = pos;
595 384           parse->error = NULL;
596 384           return true;
597             }
598              
599             /* Append canonical unsigned Base128, Big-Endian
600             *
601             * Rules:
602             * - 7 data bits per byte, big-endian (most significant group first)
603             * - High bit 0x80 set on all bytes except the final byte
604             * - Canonical/minimal: stop as soon as remaining value is 0
605             */
606             void
607 387           secret_buffer_append_uv_base128be(secret_buffer *buf, UV val) {
608             dTHX;
609             U8 *pos;
610 387           int enc_len= 1, shift;
611 387           UV tmp= val >> 7;
612 1926 100         while (tmp) {
613 1539           enc_len++;
614 1539           tmp >>= 7;
615             }
616 387           secret_buffer_set_len(buf, buf->len + enc_len);
617 387           pos= (U8*) buf->data + buf->len - enc_len;
618             /* Encode */
619 2313 100         for (shift= (enc_len-1) * 7; shift >= 0; shift -= 7) {
620 1926           U8 byte = (U8)((val >> shift) & 0x7F);
621 1926 100         if (shift)
622 1539           byte |= 0x80;
623 1926           *pos++ = byte;
624             }
625 387 50         ASSUME(pos == (U8*)(buf->data + buf->len));
626 387           }
627              
628             /* Parse Unsigned BigEndian Base128 (also requiring canonical / minimal encoding) */
629             bool
630 395           secret_buffer_parse_uv_base128be(secret_buffer_parse *parse, UV *out) {
631 395           U8 *pos = parse->pos;
632 395           U8 *lim = parse->lim;
633 395           UV result= 0;
634              
635 395 50         if (pos >= lim) {
636 0           parse->error = "unexpected end of buffer";
637 0           return false;
638             }
639             /* high-bit payload == 0 with continue bit set is an error. */
640 395 50         if (*pos == 0x80) {
641 0           parse->error = "Over-long encoding of Base128-BE";
642 0           return false;
643             }
644 395           result= *pos & 0x7F;
645 1934 100         while (*pos++ & 0x80) {
646             /* Will existing bits overflow UV when shifted? */
647 1539 50         if (result >> (sizeof(UV)*8 - 7)) {
648 0           parse->error = "Base128-BE value overflows perl UV";
649 0           return false;
650             }
651 1539 50         if (pos >= lim) {
652 0           parse->error = "unexpected end of buffer";
653 0           return false;
654             }
655 1539           result= (result << 7) | (*pos & 0x7F);
656             }
657 395 50         if (out) *out = result;
658 395           parse->pos = pos;
659 395           parse->error = NULL;
660 395           return true;
661             }
662              
663             /* Private API -------------------------------------------------------------*/
664              
665             /* Scan raw bytes using only the bitmap */
666 289           static bool sb_parse_match_charset_bytes(
667             secret_buffer_parse *parse,
668             const secret_buffer_charset *cset,
669             int flags
670             ) {
671 289           bool negate= 0 != (flags & SECRET_BUFFER_MATCH_NEGATE);
672 289           bool reverse= 0 != (flags & SECRET_BUFFER_MATCH_REVERSE);
673 289 100         bool multi= 0 != (flags & SECRET_BUFFER_MATCH_MULTI) || cset->match_multi;
    100          
674 289           bool anchored= 0 != (flags & SECRET_BUFFER_MATCH_ANCHORED);
675 289           bool consttime=0 != (flags & SECRET_BUFFER_MATCH_CONST_TIME);
676 289 100         int step= reverse? -1 : 1;
677 289 100         U8 *pos= reverse? parse->lim-1 : parse->pos,
678 289 100         *lim= reverse? parse->pos-1 : parse->lim,
679 289           *span_start= NULL;
680             //warn("scan_charset_bytes pos=%p lim=%p len=%d", parse->pos, parse->lim, (int)(parse->lim - parse->pos));
681              
682 1102 100         while (pos != lim) {
683 1097 100         if (sbc_bitmap_test(cset->bitmap, *pos) != negate) {
684             // Found. Now are we looking for a span?
685 238 100         if (span_start)
686 105           break;
687 133           span_start= pos;
688 133 100         if (!multi) {
689 27           pos += step;
690 27           break;
691             }
692 106           negate= !negate;
693 859 100         } else if (anchored && !span_start)
    100          
694 152           break;
695 813           pos += step;
696             }
697             /* If constant time operation is requested, we need to perform one sbc_bitmap_test
698             * for every character in the span, and make sure the compiler doesn't eliminate it.
699             */
700 289 50         if (consttime) {
701 0           volatile bool sink= false;
702 0 0         while (pos != lim) {
703 0           sink ^= sbc_bitmap_test(cset->bitmap, *pos);
704 0           pos += step;
705             }
706 0           (void) sink;
707             }
708             // reached end of defined range, and implicitly ends span
709 289 100         if (reverse) {
710 86           parse->pos= pos + 1;
711 86 100         parse->lim= span_start? span_start + 1 : parse->pos;
712             } else {
713 203           parse->lim= pos;
714 203 100         parse->pos= span_start? span_start : parse->lim;
715             }
716 289           return span_start != NULL;
717             }
718              
719 9           static bool sb_parse_match_charset_codepoints(
720             secret_buffer_parse *parse,
721             const secret_buffer_charset *cset,
722             int flags
723             ) {
724             dTHX;
725 9           bool negate= 0 != (flags & SECRET_BUFFER_MATCH_NEGATE);
726 9           bool reverse= 0 != (flags & SECRET_BUFFER_MATCH_REVERSE);
727 9 50         bool multi= 0 != (flags & SECRET_BUFFER_MATCH_MULTI) || cset->match_multi;
    100          
728 9           bool anchored= 0 != (flags & SECRET_BUFFER_MATCH_ANCHORED);
729 9           bool consttime=0 != (flags & SECRET_BUFFER_MATCH_CONST_TIME);
730 9           bool span_started= false;
731 9           bool encoding_error= false;
732 9 100         U8 *span_mark= NULL, *prev_mark= reverse? parse->lim : parse->pos;
733              
734 37 50         while (parse->pos < parse->lim) {
735 19           int codepoint= reverse? sb_parse_prev_codepoint(parse)
736 37 100         : sb_parse_next_codepoint(parse);
737             // warn("parse.pos=%p parse.lim=%p parse.enc=%d cp=%d parse.err=%s", parse->pos, parse->lim, parse->encoding, codepoint, parse->error);
738 37 100         if (codepoint < 0) {// encoding error
739 1           encoding_error= true;
740 1           break;
741             }
742 36 100         if (sbc_test_codepoint(aTHX_ cset, codepoint) != negate) {
743             // Found. Mark boundaries of char.
744             // Now are we looking for a span?
745 10 100         if (span_started)
746 2           break;
747 8           span_started= true;
748 8           span_mark= prev_mark;
749 8           negate= !negate;
750 8 100         if (!multi) {
751 6 100         prev_mark= reverse? parse->lim : parse->pos;
752 6           break;
753             }
754 26 50         } else if (anchored && !span_started)
    0          
755 0           break;
756 28 100         prev_mark= reverse? parse->lim : parse->pos;
757             }
758             /* If constant time operation is requested, we need to perform one sbc_bitmap_test
759             * for every character in the span, and make sure the compiler doesn't eliminate it.
760             */
761 9 50         if (consttime) {
762 0           volatile bool sink= false;
763 0 0         while (parse->pos < parse->lim) {
764 0           int codepoint= reverse? sb_parse_prev_codepoint(parse)
765 0 0         : sb_parse_next_codepoint(parse);
766             // warn("parse.pos=%p parse.lim=%p parse.enc=%d cp=%d parse.err=%s", parse->pos, parse->lim, parse->encoding, codepoint, parse->error);
767 0 0         if (codepoint < 0) { // encoding error
768 0           encoding_error= true;
769 0           sink ^= sbc_test_codepoint(aTHX_ cset, 0);
770             }
771             else
772 0           sink ^= sbc_test_codepoint(aTHX_ cset, codepoint);
773             }
774 0           (void) sink;
775             }
776 9 100         if (encoding_error)
777 1           return false;
778             // reached end of defined range
779 8 50         if (span_started) { // and implicitly ends span
780 8 100         if (reverse) {
781 5           parse->pos= prev_mark;
782 5           parse->lim= span_mark;
783             }
784             else {
785 3           parse->pos= span_mark;
786 3           parse->lim= prev_mark;
787             }
788 8           return true;
789             }
790 0           return false;
791             }
792              
793 18           int sb_parse_codepointcmp(secret_buffer_parse *lhs, secret_buffer_parse *rhs) {
794             I32 lhs_cp, rhs_cp;
795 18           volatile int ret= 0;
796             /* constant-time iteration per the shorter of the two strings */
797 87 100         while (lhs->pos < lhs->lim && rhs->pos < rhs->lim) {
    50          
798 69           lhs_cp= sb_parse_next_codepoint(lhs);
799 69 50         if (lhs_cp < 0)
800 0           croak("Encoding error in left-hand buffer");
801 69           rhs_cp= sb_parse_next_codepoint(rhs);
802 69 50         if (rhs_cp < 0)
803 0           croak("Encoding error in right-hand buffer");
804 69 100         if (lhs_cp != rhs_cp && !ret)
    50          
805 2 50         ret= lhs_cp < rhs_cp? -1 : 1;
806             }
807 18           return ret? ret
808 34 100         : (lhs->pos < lhs->lim)? 1 /* right string shorter than left */
809 32 50         : (rhs->pos < rhs->lim)? -1 /* left string shorter than right */
810 16 50         : 0;
811             }
812              
813             /* UTF-8 decoding helper */
814 3369           static int sb_parse_next_codepoint(secret_buffer_parse *parse) {
815 3369           U8 *pos= parse->pos, *lim= parse->lim;
816 3369           int cp, encoding= parse->encoding;
817             #define SB_RETURN_ERROR(msg) { parse->error= msg; return -1; }
818              
819 3369 100         if (encoding == SECRET_BUFFER_ENCODING_ASCII
820 3368 100         || encoding == SECRET_BUFFER_ENCODING_ISO8859_1
821 577 100         || encoding == SECRET_BUFFER_ENCODING_UTF8
822             ) {
823 3127 50         if (lim - pos < 1)
824 0           SB_RETURN_ERROR("end of span")
825 3127           cp= *pos++;
826 3127 100         if (cp >= 0x80 && encoding == SECRET_BUFFER_ENCODING_ASCII)
    100          
827 1           SB_RETURN_ERROR("not 7-bit ASCII")
828 3126 100         else if (cp >= 0x80 && encoding == SECRET_BUFFER_ENCODING_UTF8) {
    100          
829 47           int min_cp= 0;
830 47           switch ((cp >> 3) & 0xF) {
831 13           case 14: // 0b1[1110]yyy
832 13 50         { if (lim - pos < 3) goto incomplete;
833 13           min_cp= 0x10000;
834 13           cp &= 0x07;
835             }
836 13 50         if ((*pos & 0xC0) != 0x80) goto invalid;
837 13           cp= (cp << 6) | (*pos++ & 0x3F);
838             if (0)
839             case 12: case 13: // 0b1[110x]yyy
840 14 50         { if (lim - pos < 2) goto incomplete;
841 14           min_cp= 0x800;
842 14           cp &= 0x0F;
843             }
844 27 50         if ((*pos & 0xC0) != 0x80) goto invalid;
845 27           cp= (cp << 6) | (*pos++ & 0x3F);
846             if (0)
847             case 8: case 9: case 10: case 11: // 0b1[10xx]yyy
848 20 50         { if (lim - pos < 1) goto incomplete;
849 20           min_cp= 0x80;
850 20           cp &= 0x1F;
851             }
852 47 50         if ((*pos & 0xC0) != 0x80) goto invalid;
853 47           cp= (cp << 6) | (*pos++ & 0x3F);
854 47           break;
855             default:
856 0           invalid: SB_RETURN_ERROR("invalid UTF8 character")
857 0           incomplete: SB_RETURN_ERROR("incomplete UTF8 character")
858             }
859 47 50         if (cp < min_cp)
860 0           SB_RETURN_ERROR("overlong encoding of UTF8 character")
861 47 50         else if (cp > 0x10FFFF)
862 0           SB_RETURN_ERROR("UTF8 character exceeds max")
863             }
864             // else all ISO-8859-1 bytes are valid codepoints
865             }
866 242 100         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
867 221 100         || encoding == SECRET_BUFFER_ENCODING_UTF16BE
868 36           ) {
869 36           int low= encoding == SECRET_BUFFER_ENCODING_UTF16LE? 0 : 1;
870 36 50         if (lim - pos < 2)
871 0           SB_RETURN_ERROR("end of span")
872 36           cp= pos[low] | ((int)pos[low^1] << 8);
873 36           pos += 2;
874 36 100         if (cp >= 0xD800 && cp <= 0xDFFF) {
    50          
875 10 50         if (lim - pos < 2)
876 0           SB_RETURN_ERROR("incomplete UTF16 character")
877 10           int w2= pos[low] | ((int)pos[low^1] << 8);
878 10           pos += 2;
879 10 50         if (w2 < 0xDC00 || w2 > 0xDFFF)
    50          
880 0           SB_RETURN_ERROR("invalid UTF16 low surrogate")
881 10           cp = 0x10000 + (((cp & 0x3FF) << 10) | (w2 & 0x3FF));
882             }
883             }
884 206 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
885             // Skip over whitespace
886 38 50         while (pos < lim && isspace(*pos))
    50          
887 0           pos++;
888 38 50         if (lim - pos < 2)
889 0           SB_RETURN_ERROR("end of span")
890 38           int high= *pos++ - '0';
891 38           int low= *pos++ - '0';
892 38 50         if (low >= ('a'-'0')) low -= ('a'-'0'-10);
893 38 100         else if (low >= ('A'-'0')) low -= ('A'-'0'-10);
894 38 50         if (high >= ('a'-'0')) high -= ('a'-'0'-10);
895 38 100         else if (high >= ('A'-'0')) high -= ('A'-'0'-10);
896 38 50         if ((low >> 4) | (high >> 4))
897 0           SB_RETURN_ERROR("not a pair of hex digits")
898 38           cp= (high << 4) | low;
899             // skip over whitespace if it takes us to the end of buffer so that caller
900             // knows it's EOF before trying another decode.
901 38 100         while (pos < lim && isspace(*pos))
    50          
902 0           pos++;
903             }
904 168 50         else if (encoding == SECRET_BUFFER_ENCODING_BASE64) {
905             // Skip over whitespace and control chars
906 168 50         while (pos < lim && *pos <= ' ')
    50          
907 0           pos++;
908             // There need to be at least 2 base64 characters left
909 168 50         if (pos < lim) {
910 168 50         if (base64_decode_table[*pos] < 0)
911 0           SB_RETURN_ERROR("invalid base64 character");
912             // ->pos_bit > 0 means pointer is pointing at a sub-bit of the base64
913             // character at *pos (and possible values are 0, 2, or 4)
914 168           cp= (((int)base64_decode_table[*pos++]) << (2 + parse->pos_bit)) & 0xFF;
915 168 50         while (pos < lim && *pos <= ' ')
    50          
916 0           pos++;
917             }
918 168 50         if (pos >= lim) {
919 0           parse->pos_bit= 0;
920 0           SB_RETURN_ERROR("end of span")
921             }
922 168 50         if (base64_decode_table[*pos] < 0)
923 0           SB_RETURN_ERROR("invalid base64 character");
924 168           cp |= base64_decode_table[*pos] >> (4-parse->pos_bit);
925 168           parse->pos_bit += 2;
926             // If pos_bit == 6 we've completed a set of 4 b64 chars and fully consumed them.
927 168 100         if (parse->pos_bit >= 6) {
928 51           pos++;
929 51           parse->pos_bit= 0;
930             // consume trailing whitespace
931 55 100         while (pos < lim && *pos <= ' ')
    100          
932 4           pos++;
933             }
934             else {
935             // if next char is '=', terminate the decoding
936 117           U8 *next= pos+1;
937 117 50         while (next < lim && *next <= ' ')
    50          
938 0           next++;
939 117 50         if (next < lim && *next == '=') {
    100          
940 13           pos= lim; // indicate parsing complete
941 13           parse->pos_bit= 0;
942             }
943             }
944             }
945 0 0         else if (encoding == SECRET_BUFFER_ENCODING_I32) {
946 0 0         if (lim - pos < 4)
947 0           SB_RETURN_ERROR("end of span");
948 0           cp= *(I32*)pos;
949 0           pos+= 4;
950             }
951 0           else SB_RETURN_ERROR("unsupported encoding")
952 3368           parse->pos= pos;
953 3368           return cp;
954             #undef SB_RETURN_ERROR
955             }
956              
957 850           static int sb_parse_prev_codepoint(secret_buffer_parse *parse) {
958 850           U8 *pos= parse->pos, *lim= parse->lim;
959 850           int encoding= parse->encoding;
960             int cp;
961             #define SB_RETURN_ERROR(msg) { parse->error= msg; return -1; }
962              
963 850 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII
964 850 100         || encoding == SECRET_BUFFER_ENCODING_ISO8859_1
965 25 100         || encoding == SECRET_BUFFER_ENCODING_UTF8
966             ) {
967 842 50         if (lim <= pos)
968 0           SB_RETURN_ERROR("end of span")
969 842           cp= *--lim;
970             // handle the simple case first
971 842 100         if (cp >= 0x80 && encoding != SECRET_BUFFER_ENCODING_ISO8859_1) {
    50          
972             // Strict ASCII can't encode above 0x7F
973 4 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII)
974 0           SB_RETURN_ERROR("not 7-bit ASCII")
975             // else need to backtrack and then call next_codepoint
976 4           U8 *start= lim;
977 12 50         while (start >= pos && (*start & 0xC0) == 0x80)
    100          
978 8           --start;
979 4           parse->pos= start;
980 4           cp= sb_parse_next_codepoint(parse);
981 4 50         if (parse->pos != parse->lim) {// consumed all characters we gave it?
982 0           parse->pos= pos; // restore original pos
983 0 0         if (cp >= 0) // had a valid char, but extra 0x80 bytes
984 0           parse->error= "invalid UTF8 character";
985             // else use the error message from next_codepoint
986 0           return -1;
987             }
988 4           parse->pos= pos; // restore original pos
989 4           lim= start; // new lim is where we started the parse from
990             }
991             }
992 8 50         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
993 8 100         || encoding == SECRET_BUFFER_ENCODING_UTF16BE
994 1           ) {
995 1 50         if (lim - pos < 2)
996 0           SB_RETURN_ERROR("end of span");
997             // handle the simple case first
998 1           lim -= 2;
999 1           int low= (encoding == SECRET_BUFFER_ENCODING_UTF16LE)? 0 : 1;
1000 1           cp= lim[low] | ((int)lim[low^1] << 8);
1001 1 50         if (cp >= 0xD800 && cp <= 0xDFFF) {
    50          
1002 1 50         if (lim - pos < 4)
1003 0           SB_RETURN_ERROR("end of span");
1004 1           lim -= 2;
1005 1           int w1= lim[low] | ((int)lim[low^1] << 8);
1006 1 50         if (w1 < 0xD800 || w1 > 0xDFFF || cp < 0xDC00)
    50          
    50          
1007 0           SB_RETURN_ERROR("invalid UTF16 surrogate");
1008 1           cp = 0x10000 + (((w1 & 0x3FF) << 10) | (cp & 0x3FF));
1009             }
1010             }
1011 7 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
1012             // Skip over whitespace
1013 1 50         while (pos < lim && isspace(lim[-1]))
    50          
1014 0           lim--;
1015 1 50         if (lim - pos < 2)
1016 0 0         SB_RETURN_ERROR((pos == lim? "end of span" : "incomplete hex pair at end of span"))
1017 1           int low= *--lim - '0';
1018 1           int high= *--lim - '0';
1019 1 50         if (low >= ('a'-'0')) low -= ('a'-'0'-10);
1020 0 0         else if (low >= ('A'-'0')) low -= ('A'-'0'-10);
1021 1 50         if (high >= ('a'-'0')) high -= ('a'-'0'-10);
1022 0 0         else if (high >= ('A'-'0')) high -= ('A'-'0'-10);
1023 1 50         if ((low >> 4) | (high >> 4))
1024 0           SB_RETURN_ERROR("not a pair of hex digits")
1025 1           cp= (high << 4) | low;
1026             }
1027 6 50         else if (encoding == SECRET_BUFFER_ENCODING_BASE64) {
1028             bool again;
1029             do {
1030 9           again= false;
1031             // Skip over non-base64 chars
1032 12 50         while (pos < lim && base64_decode_table[lim[-1]] < 0)
    100          
1033 3           lim--;
1034 9 50         if (pos < lim) {
1035             //warn("lim-pos=%d, lim[-1]=%c, lim_bit=%d", (int)(lim-pos), lim[-1], parse->lim_bit);
1036 9 50         if (base64_decode_table[lim[-1]] < 0)
1037 0           SB_RETURN_ERROR("invalid base64 character");
1038             // ->lim_bit > 0 means the character lim[-1] is partially consumed.
1039             // (sequence is 0, 2, 4, 0)
1040 9           cp= ((int)base64_decode_table[lim[-1]]) >> parse->lim_bit;
1041             // parsing an equal sign means 'cp' is bogus and need to go again
1042 9 100         if (lim[-1] == '=')
1043 3           again= true;
1044 9           --lim;
1045             // find next base64 char
1046 9 50         while (pos < lim && base64_decode_table[lim[-1]] < 0)
    50          
1047 0           lim--;
1048             }
1049 9 50         if (pos >= lim) {
1050 0           parse->lim_bit= 0;
1051 0           SB_RETURN_ERROR("end of span")
1052             }
1053 9 50         if (base64_decode_table[lim[-1]] < 0)
1054 0           SB_RETURN_ERROR("invalid base64 character");
1055             //warn(" lim-pos=%d, lim[-1]=%c, lim_bit=%d", (int)(lim-pos), lim[-1], parse->lim_bit);
1056 9           cp |= (((int)base64_decode_table[lim[-1]]) << (6 - parse->lim_bit)) & 0xFF;
1057 9           parse->lim_bit += 2;
1058 9 100         if (parse->lim_bit >= 6) {
1059 3           parse->lim_bit= 0;
1060             // If completed a set of 4 b64 chars, lim[-1] is consumed, and need to
1061             // walk backward to find next base64 char
1062 3           --lim;
1063 3 50         while (pos < lim && base64_decode_table[lim[-1]] < 0)
    0          
1064 0           lim--;
1065             }
1066             //warn(" cp=%d, lim-pos=%d, lim_bit=%d", cp, (int)(lim-pos), parse->lim_bit);
1067 9 100         } while (again);
1068             }
1069 0 0         else if (encoding == SECRET_BUFFER_ENCODING_I32) {
1070 0 0         if (lim - pos < 4)
1071 0           SB_RETURN_ERROR("end of span");
1072 0           lim -= 4;
1073 0           cp= *(I32*)lim;
1074             }
1075 0           else SB_RETURN_ERROR("unsupported encoding")
1076 850           parse->lim= lim;
1077 850           return cp;
1078             #undef SB_RETURN_ERROR
1079             }
1080              
1081 1202           static int sizeof_codepoint_encoding(int codepoint, int encoding) {
1082 1202 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII)
1083 0 0         return codepoint < 0x80? 1 : -1;
1084 1202 100         if (encoding == SECRET_BUFFER_ENCODING_ISO8859_1)
1085 110 50         return codepoint < 0x100? 1 : -1;
1086 1092 100         else if (encoding == SECRET_BUFFER_ENCODING_UTF8)
1087 736 100         return codepoint < 0x80? 1 : codepoint < 0x800? 2 : codepoint < 0x10000? 3 : 4;
    100          
    100          
1088 356 50         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1089 356 50         || encoding == SECRET_BUFFER_ENCODING_UTF16BE)
1090 0 0         return codepoint >= 0xD800 && codepoint < 0xE000? -1
1091 0 0         : codepoint < 0x10000? 2 : 4;
    0          
1092 356 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX)
1093 6 50         return codepoint < 0x100? 2 : -1;
1094             /* Base64 would need to track an accumulator, so just return 1 and fix it in the caller */
1095 350 100         else if (encoding == SECRET_BUFFER_ENCODING_BASE64)
1096 78 50         return codepoint < 0x100? 1 : -1;
1097 272 50         else if (encoding == SECRET_BUFFER_ENCODING_I32)
1098 272           return 4;
1099             else
1100 0           return -1;
1101             }
1102              
1103 444           static bool sb_parse_encode_codepoint(secret_buffer_parse *dst, int codepoint) {
1104             #define SB_RETURN_ERROR(msg) { dst->error= msg; return false; }
1105 444           int encoding= dst->encoding, n;
1106 444           U8 *dst_pos= dst->pos;
1107             // codepoints above 0x10FFFF are illegal
1108 444 50         if (codepoint >= 0x110000)
1109 0           SB_RETURN_ERROR("invalid codepoint");
1110             // not quite as efficient as checking during the code below, but saves a bunch of redundancy
1111 444           n= sizeof_codepoint_encoding(codepoint, encoding);
1112 444 50         if (n < 0)
1113 0           SB_RETURN_ERROR("character too wide for encoding")
1114 444 50         if (dst->lim - dst_pos < n)
1115 0           SB_RETURN_ERROR("buffer too small")
1116 444           dst->pos += n;
1117              
1118 444 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII
1119 444 100         || encoding == SECRET_BUFFER_ENCODING_ISO8859_1
1120 389 100         || encoding == SECRET_BUFFER_ENCODING_UTF8
1121             ) {
1122 423           switch ((n-1)&0x3) { // help the compiler understand there are only 4 possible values
1123 401           case 0: *dst_pos++ = (U8) codepoint;
1124 401           break;
1125 10           case 1: *dst_pos++ = (U8)(0xC0 | (codepoint >> 6));
1126 10           *dst_pos++ = (U8)(0x80 | (codepoint & 0x3F));
1127 10           break;
1128 4           case 2: *dst_pos++ = (U8)(0xE0 | (codepoint >> 12));
1129 4           *dst_pos++ = (U8)(0x80 | ((codepoint >> 6) & 0x3F));
1130 4           *dst_pos++ = (U8)(0x80 | (codepoint & 0x3F));
1131 4           break;
1132 8           case 3: *dst_pos++ = (U8)(0xF0 | (codepoint >> 18));
1133 8           *dst_pos++ = (U8)(0x80 | ((codepoint >> 12) & 0x3F));
1134 8           *dst_pos++ = (U8)(0x80 | ((codepoint >> 6) & 0x3F));
1135 8           *dst_pos++ = (U8)(0x80 | (codepoint & 0x3F));
1136 8           break;
1137             }
1138             }
1139 21 50         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1140 21 50         || encoding == SECRET_BUFFER_ENCODING_UTF16BE
1141 0           ) {
1142 0           int low= (encoding == SECRET_BUFFER_ENCODING_UTF16LE)? 0 : 1;
1143 0 0         if (n == 2) {
1144 0           dst_pos[low] = (U8)(codepoint & 0xFF);
1145 0           dst_pos[low^1] = (U8)(codepoint >> 8);
1146             }
1147             else {
1148 0           int adjusted = codepoint - 0x10000;
1149 0           int w0 = 0xD800 | (adjusted >> 10);
1150 0           int w1 = 0xDC00 | (adjusted & 0x3FF);
1151 0           dst_pos[low] = (U8)(w0 & 0xFF);
1152 0           dst_pos[1^low] = (U8)(w0 >> 8);
1153 0           dst_pos[2^low] = (U8)(w1 & 0xFF);
1154 0           dst_pos[3^low] = (U8)(w1 >> 8);
1155             }
1156             }
1157 21 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
1158 3           dst_pos[0] = "0123456789ABCDEF"[(codepoint >> 4) & 0xF];
1159 3           dst_pos[1] = "0123456789ABCDEF"[codepoint & 0xF];
1160             }
1161 18 50         else if (encoding == SECRET_BUFFER_ENCODING_I32) {
1162 18           *(I32*)dst_pos = codepoint;
1163             }
1164             /* BASE64 is not handled here because the '=' padding can only be generated in
1165             * a context that knows when we are ending on a non-multiple-of-4. */
1166 0           else SB_RETURN_ERROR("unsupported encoding");
1167 444           return true;
1168             #undef SB_RETURN_ERROR
1169             }
1170              
1171             #define SB_PARSE_MATCH_STR_FN sb_parse_match_str_U8
1172             #define SB_PATTERN_EL_TYPE const U8
1173             #include "secret_buffer_parse_match_str.c"
1174             #undef SB_PARSE_MATCH_STR_FN
1175             #undef SB_PATTERN_EL_TYPE
1176              
1177             #define SB_PARSE_MATCH_STR_FN sb_parse_match_str_I32
1178             #define SB_PATTERN_EL_TYPE const I32
1179             #include "secret_buffer_parse_match_str.c"
1180             #undef SB_PARSE_MATCH_STR_FN
1181             #undef SB_PATTERN_EL_TYPE