File Coverage

secret_buffer_parse.c
Criterion Covered Total %
statement 546 700 78.0
branch 392 594 65.9
condition n/a
subroutine n/a
pod n/a
total 938 1294 72.4


line stmt bran cond sub pod time code
1              
2             /* These local parse functions are independent of the SecretBuffer instance,
3             * needing only the 'data' pointer to which the parse_state refers.
4             * The pos/lim of the parse state must already be checked against the length
5             * of the data before calling these.
6             */
7             static int sizeof_codepoint_encoding(int codepoint, int encoding);
8             static int sb_parse_prev_codepoint(secret_buffer_parse *parse);
9             static int sb_parse_next_codepoint(secret_buffer_parse *parse);
10             static bool sb_parse_encode_codepoint(secret_buffer_parse *parse, int codepoint);
11             static bool sb_parse_match_charset_bytes(secret_buffer_parse *parse, const secret_buffer_charset *cset, int flags);
12             static bool sb_parse_match_charset_codepoints(secret_buffer_parse *parse, const secret_buffer_charset *cset, int flags);
13             static bool sb_parse_match_str_U8(secret_buffer_parse *parse, const U8 *pattern, size_t pattern_len, int flags);
14             static bool sb_parse_match_str_I32(secret_buffer_parse *parse, const I32 *pattern, size_t pattern_len, int flags);
15              
16 66           static bool parse_encoding(pTHX_ SV *sv, int *out) {
17             int enc;
18 66 50         if (looks_like_number(sv)) {
19 0           IV i= SvIV(sv);
20 0 0         if (i < 0 || i > SECRET_BUFFER_ENCODING_MAX)
    0          
21 0           return false;
22 0           enc= (int) i;
23             } else {
24             STRLEN len;
25 66           const char *str= SvPV(sv, len);
26 66           switch (len) {
27 6 50         case 3: if (0 == strcmp(str, "HEX")) { enc= SECRET_BUFFER_ENCODING_HEX; break; }
28 1 50         case 4: if (0 == strcmp(str, "UTF8")) { enc= SECRET_BUFFER_ENCODING_UTF8; break; }
29 13 100         case 5: if (0 == strcmp(str, "ASCII")) { enc= SECRET_BUFFER_ENCODING_ASCII; break; }
30 12 50         if (0 == strcmp(str, "UTF-8")) { enc= SECRET_BUFFER_ENCODING_UTF8; break; }
31 26 50         case 6: if (0 == strcmp(str, "BASE64")) { enc= SECRET_BUFFER_ENCODING_BASE64; break; }
32 1 50         case 7: if (0 == strcmp(str, "UTF16LE")) { enc= SECRET_BUFFER_ENCODING_UTF16LE; break; }
33 0 0         if (0 == strcmp(str, "UTF16BE")) { enc= SECRET_BUFFER_ENCODING_UTF16BE; break; }
34 6 100         case 8: if (0 == strcmp(str, "UTF-16LE")) { enc= SECRET_BUFFER_ENCODING_UTF16LE; break; }
35 3 50         if (0 == strcmp(str, "UTF-16BE")) { enc= SECRET_BUFFER_ENCODING_UTF16BE; break; }
36 0 0         case 9: if (0 == strcmp(str, "ISO8859_1")) { enc= SECRET_BUFFER_ENCODING_ISO8859_1; break; }
37 13 50         case 10: if (0 == strcmp(str, "ISO-8859-1")) { enc= SECRET_BUFFER_ENCODING_ISO8859_1; break; }
38             default:
39 0           return false;
40             }
41             }
42 66 50         if (out) *out= enc;
43 66           return true;
44             }
45              
46             /* Public API --------------------------------------------------------------*/
47              
48             /* initialize a parse struct, but only if it is a valid span of the buffer */
49 2001           bool secret_buffer_parse_init(secret_buffer_parse *parse,
50             secret_buffer *buf, size_t pos, size_t lim, int encoding
51             ) {
52 2001           Zero(parse, 1, secret_buffer_parse);
53             // Sanity check this parse state vs. the buffer
54 2001 100         if (lim > buf->len || pos > lim) {
    50          
55 1 50         parse->error= pos > lim? "span starts beyond buffer" : "span ends beyond buffer";
56 1           return false;
57             }
58 2000           parse->pos= ((U8*) buf->data) + pos;
59 2000           parse->lim= ((U8*) buf->data) + lim;
60 2000           parse->encoding= encoding;
61 2000           parse->sbuf= buf;
62 2000           return true;
63             }
64              
65             /* Initialize a parse struct, either from a Span, or a SecretBuffer, or a plain Scalar.
66             */
67 1737           bool secret_buffer_parse_init_from_sv(secret_buffer_parse *parse, SV *sv) {
68             dTHX;
69             secret_buffer *sb;
70             secret_buffer_span *span;
71             /* Is the sv a Span object? */
72 1737 100         if ((span= secret_buffer_span_from_magic(sv, 0)) && SvTYPE(SvRV(sv)) == SVt_PVHV) {
    50          
73 1284           SV **sb_sv= hv_fetchs((HV*)SvRV(sv), "buf", 1);
74 1284           sb= secret_buffer_from_magic(*sb_sv, SECRET_BUFFER_MAGIC_OR_DIE);
75 1284           return secret_buffer_parse_init(parse, sb, span->pos, span->lim, span->encoding);
76             }
77             /* Is the sv a SecretBuffer? */
78 453 100         else if ((sb= secret_buffer_from_magic(sv, 0))) {
79 2           return secret_buffer_parse_init(parse, sb, 0, sb->len, SECRET_BUFFER_ENCODING_ISO8859_1);
80             }
81             /* It needs to at least be defined */
82 451 50         else if (SvOK(sv)) {
83             STRLEN len;
84 451           char *buf= SvPV(sv, len);
85 451           Zero(parse, 1, secret_buffer_parse);
86 451           parse->pos= (U8*) buf;
87 451           parse->lim= (U8*) buf + len;
88 451           parse->encoding= SvUTF8(sv)? SECRET_BUFFER_ENCODING_UTF8 : SECRET_BUFFER_ENCODING_ISO8859_1;
89 451           return true;
90             }
91             else {
92 0           Zero(parse, 1, secret_buffer_parse);
93 0           parse->error= "Not a Span, SecretBuffer, or defined scalar";
94 0           return false;
95             }
96             }
97              
98             /* Scan for a pattern which may be a regex or literal string.
99             * Regexes are currently limited to a single charclass.
100             */
101 785           bool secret_buffer_match(secret_buffer_parse *parse, SV *pattern, int flags) {
102             dTHX;
103 785           REGEXP *rx= (REGEXP*)SvRX(pattern);
104             secret_buffer_parse pat_parse;
105              
106             /* Is the pattern a regexp-ref? */
107 785 100         if (rx) {
108 346           secret_buffer_charset *cset= secret_buffer_charset_from_regexpref(pattern);
109 346           return secret_buffer_match_charset(parse, cset, flags);
110             }
111              
112             /* load up a parse struct with the pos, lim, and encoding */
113 439 50         if (!secret_buffer_parse_init_from_sv(&pat_parse, pattern))
114 0           croak("%s", pat_parse.error);
115              
116             /* Remove edge case of zero-length pattern (always matches) */
117 439 100         if (pat_parse.pos >= pat_parse.lim) {
118 2 50         if ((flags & SECRET_BUFFER_MATCH_REVERSE))
119 0           parse->pos= parse->lim;
120             else
121 2           parse->lim= parse->pos;
122 2           return !(flags & SECRET_BUFFER_MATCH_NEGATE);
123             }
124             /* Remove edge case of zero-length subject (never matches) */
125 437 100         if (parse->pos >= parse->lim) {
126 4           return (flags & SECRET_BUFFER_MATCH_NEGATE);
127             }
128              
129             /* Since unicode iteration of the pattern is a hassle and might happen lots of times,
130             * convert it to either plain bytes or array of U32 codepoints.
131             */
132 433 100         if (pat_parse.encoding != SECRET_BUFFER_ENCODING_ISO8859_1) {
133 17           int dst_enc=
134             /* these can be transcoded to bytes */
135 17           (pat_parse.encoding == SECRET_BUFFER_ENCODING_ASCII
136 17 100         || pat_parse.encoding == SECRET_BUFFER_ENCODING_HEX
137 16 50         || pat_parse.encoding == SECRET_BUFFER_ENCODING_BASE64)
138             ? SECRET_BUFFER_ENCODING_ISO8859_1
139 34 50         : SECRET_BUFFER_ENCODING_I32;
140 17           SSize_t dst_len= secret_buffer_sizeof_transcode(&pat_parse, dst_enc);
141 17 50         if (dst_len < 0)
142 0           croak("transcode of pattern failed: %s", pat_parse.error);
143             /* No need to transcode SECRET_BUFFER_ENCODING_ASCII, but the above size check
144             * verified it is clean 7-bit, which is the whole point of that encoding.
145             */
146 17 50         if (pat_parse.encoding == SECRET_BUFFER_ENCODING_ASCII
147             /* Likewise, if SECRET_BUFFER_ENCODING_UTF8's I32 len is exactly 4x the number of
148             * original bytes, that means every byte became a character, which means every
149             * character could fit in a byte. */
150 17 100         || (pat_parse.encoding == SECRET_BUFFER_ENCODING_UTF8
151 16 100         && dst_len == (pat_parse.lim - pat_parse.pos) * 4)
152             ) {
153 9           pat_parse.encoding= SECRET_BUFFER_ENCODING_ISO8859_1;
154             } else {
155             /* create a temporary secret buffer to hold the transcode */
156 8           secret_buffer *tmp= secret_buffer_new(0, NULL);
157 8           secret_buffer_parse pat_orig= pat_parse;
158 8           secret_buffer_set_len(tmp, dst_len);
159 8 50         if (!secret_buffer_parse_init(&pat_parse, tmp, 0, dst_len, dst_enc))
160 0           croak("transcode of pattern failed: %s", pat_parse.error);
161             /* Transcode the pattern */
162 8 50         if (!secret_buffer_transcode(&pat_orig, &pat_parse))
163 0 0         croak("transcode of pattern failed: %s", pat_orig.error? pat_orig.error : pat_parse.error);
164             }
165             }
166             /* In some cases it would also be nice to transcode the subject first, but the
167             * final state of the parse struct carries information back to the caller and
168             * needs to refer to original positions of characters. */
169              
170             /* Now dispatch to sb_parse_match_str_X */
171 433 100         if (pat_parse.encoding == SECRET_BUFFER_ENCODING_ISO8859_1) {
172 426           size_t pat_len= pat_parse.lim - pat_parse.pos;
173 426           return sb_parse_match_str_U8(parse, pat_parse.pos, pat_len, flags);
174             } else { /* must be _I32 encoding, from above */
175 7           size_t pat_len= (pat_parse.lim - pat_parse.pos) >> 2;
176 7           return sb_parse_match_str_I32(parse, (I32*) pat_parse.pos, pat_len, flags);
177             }
178             }
179              
180             /* Scan for a pattern which is a set of characters */
181 346           bool secret_buffer_match_charset(secret_buffer_parse *parse, secret_buffer_charset *cset, int flags) {
182 346 100         if (parse->pos >= parse->lim) // empty range
183 48           return false;
184              
185             // byte matching gets to use a more efficient algorithm
186 298           return parse->encoding == SECRET_BUFFER_ENCODING_ISO8859_1
187 289           ? sb_parse_match_charset_bytes(parse, cset, flags)
188 587 100         : sb_parse_match_charset_codepoints(parse, cset, flags);
189             }
190              
191             /* Scan for a pattern which is a literal string of bytes.
192             */
193 0           bool secret_buffer_match_bytestr(secret_buffer_parse *parse, char *data, size_t datalen, int flags) {
194 0           return sb_parse_match_str_U8(parse, (U8*) data, datalen, flags);
195             }
196              
197             /* Count number of bytes required to transcode the source.
198             * If the source contains an invalid character for its encoding, or that codepoint
199             * can't be encoded as the dst_encoding, this returns -1 and sets src->error
200             * and also sets src->pos pointing at the character that could not be converted.
201             */
202 118           SSize_t secret_buffer_sizeof_transcode(secret_buffer_parse *src, int dst_encoding) {
203             // If the source and destination encodings are both bytes, return the length
204 118 100         if (dst_encoding == src->encoding && src->encoding == 0)
    100          
205 17           return src->lim - src->pos;
206             // Else need to iterate characters (to validate) and re-encode them
207             else {
208 101           size_t dst_size_needed= 0;
209             secret_buffer_parse tmp;
210 101           Zero(&tmp, 1, secret_buffer_parse);
211 101           tmp.pos= src->pos;
212 101           tmp.lim= src->lim;
213 101           tmp.encoding= src->encoding;
214 859 100         while (tmp.pos < tmp.lim) {
215 758           int cp= sb_parse_next_codepoint(&tmp);
216 758 50         if (cp < 0) return -1;
217 758           int ch_size= sizeof_codepoint_encoding(cp, dst_encoding);
218 758 50         if (ch_size < 0) return -1;
219 758           dst_size_needed += ch_size;
220             }
221             // If dest is base64, need special calculation
222 101 100         if (dst_encoding == SECRET_BUFFER_ENCODING_BASE64) {
223 10           dst_size_needed= ((dst_size_needed + 2) / 3) * 4;
224             }
225 101           return dst_size_needed;
226             }
227             }
228              
229             static const char base64_alphabet[64]=
230             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
231             "abcdefghijklmnopqrstuvwxyz"
232             "0123456789+/";
233              
234             /*
235             perl -E 'my @tbl= (-1)x256;
236             $tbl[ord]= -ord(A)+ord for A..Z;
237             $tbl[ord]= 26-ord(a)+ord for a..z;
238             $tbl[ord]= 52-ord(0)+ord for 0..9;
239             $tbl[ord "+"]= 62;
240             $tbl[ord "/"]= 63;
241             $tbl[ord "="]= 64;
242             say join ",\n", map join(",", @tbl[$_*16 .. $_*16+15]), 0..0xF'
243             */
244             static const int8_t base64_decode_table[256]= {
245             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
246             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
247             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,
248             52,53,54,55,56,57,58,59,60,61,-1,-1,-1,64,-1,-1,
249             -1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
250             15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
251             -1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
252             41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
253             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
254             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
255             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
256             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
257             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
258             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
259             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
260             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
261             };
262              
263             /* Transcode characters from one parse state into another.
264             * This works sort of like
265             * $data= decode($src_enc, substr($src, $src_pos, $src_len));
266             * substr($dst, $dst_pos, $dst_lim, encode($dst_enc, $data));
267             * processing only a range of the source, and replacing only a range of the dest,
268             * adjusting the size of dst as needed. Both src->pos and dst->pos
269             * are updated.
270             */
271 109           bool secret_buffer_transcode(secret_buffer_parse *src, secret_buffer_parse *dst) {
272 109           src->error= NULL;
273 109           dst->error= NULL;
274             // If the source and destination encodings are both bytes, use memcpy
275 109 100         if (dst->encoding == src->encoding && src->encoding == 0) {
    100          
276 17           size_t cnt= dst->lim - dst->pos;
277 17 50         if (src->lim - src->pos != cnt) {
278 0           dst->error= "miscalculated buffer length";
279 0           return false;
280             }
281 17           memcpy((U8*)dst->pos, src->pos, cnt);
282 17           dst->pos += cnt;
283 17           src->pos += cnt;
284             }
285             // Else need to iterate characters and re-encode them
286             // base64 encoding doesn't work with sb_parse_encode_codepoint, so it gets
287             // special treatment.
288 92 100         else if (dst->encoding == SECRET_BUFFER_ENCODING_BASE64) {
289             // Read 3, write 4
290 10           int accum= 0;
291 10           int shift= 16, cp;
292 88 100         while (src->pos < src->lim) {
293 78           cp= sb_parse_next_codepoint(src);
294 78 50         if (cp > 0xFF) {
295 0           dst->error= "byte out of range";
296 0           return false;
297             }
298 78 100         if (!shift) {
299 24           U8 *writable= (U8*) dst->pos;
300 24 50         if (dst->pos + 4 > dst->lim) {
301 0           dst->error= "miscalculated buffer length";
302 0           return false;
303             }
304 24           dst->pos += 4;
305 24           accum |= cp;
306 24           writable[0] = base64_alphabet[0x3F & (accum >> 18)];
307 24           writable[1] = base64_alphabet[0x3F & (accum >> 12)];
308 24           writable[2] = base64_alphabet[0x3F & (accum >> 6)];
309 24           writable[3] = base64_alphabet[0x3F & accum];
310 24           accum= 0;
311 24           shift= 16;
312             }
313             else {
314 54           accum |= (cp << shift);
315 54           shift -= 8;
316             }
317             }
318 10 100         if (dst->pos + (shift < 16? 4 : 0) != dst->lim) {
    50          
319 0           dst->error= "miscalculated buffer length";
320 0           return false;
321             }
322             // write leftover accumulated bits
323 10 100         if (shift < 16) {
324 5           U8 *writable= (U8*) dst->pos;
325 5 50         if (dst->pos + 4 > dst->lim) {
326 0           dst->error= "miscalculated buffer length";
327 0           return false;
328             }
329 5           dst->pos += 4;
330 5           writable[0] = base64_alphabet[0x3F & (accum >> 18)];
331 5           writable[1] = base64_alphabet[0x3F & (accum >> 12)];
332 5 100         writable[2] = shift? '=' : base64_alphabet[0x3F & (accum >> 6)];
333 5           writable[3] = '=';
334             }
335             }
336             else {
337 526 100         while (src->pos < src->lim) {
338 444           int cp= sb_parse_next_codepoint(src);
339 444 50         if (cp < 0)
340 0           return false; // error is already set
341 444           int len= sb_parse_encode_codepoint(dst, cp);
342 444 50         if (len < 0)
343 0           return false; // error is already set
344             }
345 82 50         if (dst->pos != dst->lim) {
346 0           dst->error= "miscalculated buffer length";
347 0           return false;
348             }
349             }
350 109           return true;
351             }
352              
353             bool
354 101           secret_buffer_copy_to(secret_buffer_parse *src, SV *dst_sv, int encoding, bool append) {
355             dTHX;
356             secret_buffer_parse dst;
357 101           secret_buffer *dst_sbuf= NULL;
358             SSize_t need_bytes;
359 101           bool dst_wide= false;
360              
361 101           Zero(&dst, 1, secret_buffer_parse);
362             // Encoding may be -1 to indicate the user didn't specify, in which case we use the
363             // same encoding as the source, unless the destination is a perl scalar (handled below)
364 101 100         dst.encoding= encoding >= 0? encoding : src->encoding;
365 101 100         if (sv_isobject(dst_sv)) {
366             // if object, must be a SecretBuffer
367 27           dst_sbuf= secret_buffer_from_magic(dst_sv, SECRET_BUFFER_MAGIC_OR_DIE);
368             }
369             else {
370             // Going to overwrite the scalar, or if its a scalar-ref, overwrite that.
371 74 50         if (SvROK(dst_sv) && !sv_isobject(dst_sv) && SvTYPE(SvRV(dst_sv)) <= SVt_PVMG)
    0          
    0          
372 0           dst_sv= SvRV(dst_sv);
373             // Refuse to overwrite any other kind of ref
374 74 50         if (SvTYPE(dst_sv) > SVt_PVMG || SvROK(dst_sv)) {
    50          
375 0           src->error= "Can only copy_to scalars or scalar-refs";
376 0           return false;
377             }
378             // If the source encoding is a type of unicode, and the destination encoding is not
379             // specified, then write wide characters (utf-8) to the perl scalar and flag it as utf8
380 74 100         if (encoding < 0 && SECRET_BUFFER_ENCODING_IS_UNICODE(src->encoding)) {
    100          
    100          
    100          
    50          
381 66           dst.encoding= SECRET_BUFFER_ENCODING_UTF8;
382 66           dst_wide= true;
383             }
384             }
385             // Determine how many bytes we need
386 101           need_bytes= secret_buffer_sizeof_transcode(src, dst.encoding);
387 101 50         if (need_bytes < 0)
388 0           return false;
389             // Prepare the buffers for that many bytes
390 101 100         if (dst_sbuf) {
391             // For destination SecretBuffer, set length to 0 unless appending, then
392             // ensure enough allocated space for need_bytes, then transcode and update
393             // the length in the block below.
394 27 100         if (!append)
395 20           secret_buffer_set_len(dst_sbuf, 0); /* clears secrets */
396 27           secret_buffer_alloc_at_least(dst_sbuf, dst_sbuf->len + need_bytes);
397 27           dst.pos= (U8*) dst_sbuf->data + dst_sbuf->len;
398 27           dst.lim= dst.pos + need_bytes;
399             }
400             else {
401             // For destination SV, set length to 0 unless appending, then force it to
402             // be bytes or utf-8, then grow it to ensure room for additional `need_bytes`.
403             U8* ptr;
404             STRLEN len;
405             // If overwriting, set the length to 0 before forcing to bytes or utf8
406 74 100         if (!append)
407 72           sv_setpvn(dst_sv, "", 0);
408             // force it to the type required
409 74 100         if (dst_wide) SvPVutf8(dst_sv, len);
410 8           else SvPVbyte(dst_sv, len);
411             // grow it to the required length, for writing
412 74 100         sv_grow(dst_sv, (append? len : 0) + need_bytes + 1);
413 74           ptr= (U8*) SvPVX_mutable(dst_sv) + len;
414             // don't forget the NUL terminator
415 74           ptr[need_bytes]= '\0';
416 74           dst.pos= ptr;
417 74           dst.lim= dst.pos + need_bytes;
418             }
419 101 50         if (!secret_buffer_transcode(src, &dst)) {
420 0 0         if (!src->error) src->error= dst.error;
421 0           return false;
422             }
423             /* update the lengths */
424 101 100         if (dst_sbuf) {
425 27           dst_sbuf->len += need_bytes;
426             }
427             else {
428 74           SvCUR_set(dst_sv, SvCUR(dst_sv) + need_bytes);
429 74 50         SvSETMAGIC(dst_sv);
430             }
431 101           return true;
432             }
433              
434             /* Append DER length octets (ASN.1 Length field, definite form only).
435             *
436             * DER rules:
437             * - If len <= 127: single byte [0x00..0x7F]
438             * - Else: first byte is 0x80 | N, where N is # of following length bytes (big-endian),
439             * and the length must be encoded in the minimal number of bytes (no leading 0x00).
440             *
441             * This function encodes ONLY the length field (not tag/value).
442             */
443             void
444 384           secret_buffer_append_uv_asn1_der_length(secret_buffer *buf, UV val) {
445             dTHX;
446 384           int enc_len = 1;
447             U8 *pos;
448 384 100         if (val > 127) {
449             /* Determine minimal number of bytes needed to represent len in base-256. */
450 339           UV tmp = val;
451 2001 100         while (tmp) {
452 1662           enc_len++;
453 1662           tmp >>= 8;
454             }
455             }
456             /* In BER/DER, the long-form initial octet has 7 bits of length-of-length.
457             * 0x80 is indefinite length (forbidden in DER), 0xFF would mean 127 length bytes.
458             * With 64-bit UV enc_len will never exceed 9.
459             */
460 384 50         ASSUME(enc_len < 127);
461 384           secret_buffer_set_len(buf, buf->len + enc_len);
462 384           pos= (U8*) buf->data + buf->len - 1;
463 384 100         if (val <= 127) {
464 45           *pos = (U8) val;
465             } else {
466 339           UV tmp = val;
467             /* Write the length big-endian into enc[1..n]. */
468 2001 100         while (tmp) {
469 1662           *pos-- = (U8)(tmp & 0xFF);
470 1662           tmp >>= 8;
471             }
472 339           *pos= (U8) (0x80 | (U8)(enc_len-1));
473             }
474 384           }
475              
476             /* Parse ASN.1 DER Length (definite form only) */
477             bool
478 384           secret_buffer_parse_uv_asn1_der_length(secret_buffer_parse *parse, UV *out) {
479             /* Work on a local cursor so we can roll back on failure */
480 384           const U8 *pos = parse->pos;
481 384           const U8 *lim = parse->lim;
482             UV result;
483              
484 384 50         if (pos >= lim) {
485 0           parse->error = "unexpected end of buffer";
486 0           return false;
487             }
488              
489 384           result = *pos++;
490              
491             /* If 0..127, the byte is the length value itself, otherwise it is the number of octets
492             * to read following that byte. */
493 384 100         if ((result & 0x80)) {
494 339           int n = result & 0x7F;
495             /* 0x80 means indefinite length (BER/CER), forbidden in DER */
496 339 50         if (n == 0) {
497 0           parse->error = "ASN.1 DER indefinite length not allowed";
498 0           return false;
499             }
500             /* Number of octets should be smallest possible encoding, so if it is larger than size_t
501             * don't even bother trying to decode it.
502             */
503 339 50         if (n > sizeof(UV)) {
504 0           parse->error = "ASN.1 DER length too large for perl UV";
505 0           return false;
506             }
507             /* ensure we have that many bytes */
508 339 50         if ((size_t)(lim - pos) < (size_t)n) {
509 0           parse->error = "unexpected end of buffer";
510 0           return false;
511             }
512             /* DER minimal encoding rules:
513             * - no leading 0x00 in the length octets
514             * - long form must not be used for lengths <= 127
515             */
516 339           lim= pos + n;
517 339           result= *pos++;
518 339 50         if (!result) {
519 0           parse->error = "ASN.1 DER length has leading zero (non-minimal)";
520 0           return false;
521             }
522             /* Parse remaining bytes of big-endian unsigned integer */
523 1662 100         while (pos < lim)
524 1323           result= (result << 8) | *pos++;
525             /* DER should not use 1-byte encoding if it would have fit in the initial byte */
526 339 50         if (result < 0x80) {
527 0           parse->error = "ASN.1 DER length should use short form (non-minimal)";
528 0           return false;
529             }
530             }
531 384 50         if (out) *out = result;
532 384           parse->pos = pos;
533 384           parse->error = NULL;
534 384           return true;
535             }
536              
537             /* Append canonical unsigned Base128, Little-Endian
538             *
539             * Rules:
540             * - 7 data bits per byte, little-endian (least significant group first)
541             * - High bit 0x80 set on all bytes except the final byte
542             * - Canonical/minimal: stop as soon as remaining value is 0
543             */
544             void
545 384           secret_buffer_append_uv_base128le(secret_buffer *buf, UV val) {
546             dTHX;
547             U8 *pos;
548 384           int enc_len= 1;
549 384           UV tmp= val >> 7;
550 1923 100         while (tmp) {
551 1539           enc_len++;
552 1539           tmp >>= 7;
553             }
554 384           secret_buffer_set_len(buf, buf->len + enc_len);
555 384           pos= (U8*) buf->data + buf->len - enc_len;
556             /* Encode */
557 384           tmp= val;
558             do {
559 1923           U8 byte = (U8)(tmp & 0x7F);
560 1923           tmp >>= 7;
561 1923 100         if (tmp)
562 1539           byte |= 0x80;
563 1923           *pos++ = byte;
564 1923 100         } while (tmp);
565 384 50         ASSUME(pos == (U8*)(buf->data + buf->len));
566 384           }
567              
568             /* Parse Unsigned LittleEndian Base128 (also requiring canonical / minimal encoding) */
569             bool
570 384           secret_buffer_parse_uv_base128le(secret_buffer_parse *parse, UV *out) {
571 384           const U8 *pos = parse->pos;
572 384           const U8 *lim = parse->lim;
573 384           UV result= 0, payload;
574 384           int shift= 7;
575              
576 384 50         if (pos >= lim) {
577 0           parse->error = "unexpected end of buffer";
578 0           return false;
579             }
580 384           result= payload= *pos & 0x7F;
581             /* Scan forward looking for the first byte without the continuation flag */
582 1923 100         while (*pos++ & 0x80) {
583 1539 50         if (pos >= lim) {
584 0           parse->error = "unexpected end of buffer";
585 0           return false;
586             }
587 1539           payload= *pos & 0x7F;
588 1539 100         if (shift > sizeof(UV)*8 - 7) {
589             /* Do any of the bits overflow? Is the continuation flag set? */
590 3 50         if (shift >= sizeof(UV)*8 || (payload >> (sizeof(UV)*8 - shift))) {
    50          
591 0           parse->error = "Base128-LE value overflows perl UV";
592 0           return false;
593             }
594             }
595 1539           result |= payload << shift;
596 1539           shift += 7;
597             }
598             /* check if the high bits were all zero, meaning an unnecessary byte was encoded */
599 384 100         if (!payload && result != 0) {
    50          
600 0           parse->error = "Over-long encoding of Base128-LE";
601 0           return false;
602             }
603 384 50         if (out) *out = result;
604 384           parse->pos = pos;
605 384           parse->error = NULL;
606 384           return true;
607             }
608              
609             /* Append canonical unsigned Base128, Big-Endian
610             *
611             * Rules:
612             * - 7 data bits per byte, big-endian (most significant group first)
613             * - High bit 0x80 set on all bytes except the final byte
614             * - Canonical/minimal: stop as soon as remaining value is 0
615             */
616             void
617 387           secret_buffer_append_uv_base128be(secret_buffer *buf, UV val) {
618             dTHX;
619             U8 *pos;
620 387           int enc_len= 1, shift;
621 387           UV tmp= val >> 7;
622 1926 100         while (tmp) {
623 1539           enc_len++;
624 1539           tmp >>= 7;
625             }
626 387           secret_buffer_set_len(buf, buf->len + enc_len);
627 387           pos= (U8*) buf->data + buf->len - enc_len;
628             /* Encode */
629 2313 100         for (shift= (enc_len-1) * 7; shift >= 0; shift -= 7) {
630 1926           U8 byte = (U8)((val >> shift) & 0x7F);
631 1926 100         if (shift)
632 1539           byte |= 0x80;
633 1926           *pos++ = byte;
634             }
635 387 50         ASSUME(pos == (U8*)(buf->data + buf->len));
636 387           }
637              
638             /* Parse Unsigned BigEndian Base128 (also requiring canonical / minimal encoding)
                *
                * Reads bytes starting at parse->pos and never at or beyond parse->lim.
                * Success: stores the value in *out (when out is not NULL), moves
                *   parse->pos just past the last byte consumed, sets parse->error to
                *   NULL and returns true.
                * Failure: sets parse->error to one of
                *     "unexpected end of buffer"
                *     "Over-long encoding of Base128-BE"
                *     "Base128-BE value overflows perl UV"
                *   and returns false; parse->pos is left unchanged because only the
                *   local copy is advanced.
                */
639             bool
640 395           secret_buffer_parse_uv_base128be(secret_buffer_parse *parse, UV *out) {
641 395           const U8 *pos = parse->pos;
642 395           const U8 *lim = parse->lim;
643 395           UV result= 0;
644              
645 395 50         if (pos >= lim) {
646 0           parse->error = "unexpected end of buffer";
647 0           return false;
648             }
649             /* high-bit payload == 0 with continue bit set is an error. */
                /* (a first byte of 0x80 would be a leading all-zero group) */
650 395 50         if (*pos == 0x80) {
651 0           parse->error = "Over-long encoding of Base128-BE";
652 0           return false;
653             }
654 395           result= *pos & 0x7F;
                /* The condition tests the continuation bit of the current byte and
                 * steps pos to the next one in the same expression, so on loop exit
                 * pos is already one past the final byte. */
655 1934 100         while (*pos++ & 0x80) {
656             /* Will existing bits overflow UV when shifted? */
                /* i.e. are any of the top 7 bits of result already set */
657 1539 50         if (result >> (sizeof(UV)*8 - 7)) {
658 0           parse->error = "Base128-BE value overflows perl UV";
659 0           return false;
660             }
                /* continuation bit promised another byte; make sure it exists */
661 1539 50         if (pos >= lim) {
662 0           parse->error = "unexpected end of buffer";
663 0           return false;
664             }
665 1539           result= (result << 7) | (*pos & 0x7F);
666             }
667 395 50         if (out) *out = result;
668 395           parse->pos = pos;
669 395           parse->error = NULL;
670 395           return true;
671             }
672              
673             /* Private API -------------------------------------------------------------*/
674              
675             /* Scan raw bytes using only the bitmap
                *
                * Searches the span parse->pos .. parse->lim for a byte whose
                * sbc_bitmap_test() result against cset->bitmap differs from the NEGATE
                * flag, treating every byte as one character.
                *
                * flags:
                *   SECRET_BUFFER_MATCH_NEGATE     - invert the sense of the bitmap test
                *   SECRET_BUFFER_MATCH_REVERSE    - scan from lim-1 backward toward pos
                *   SECRET_BUFFER_MATCH_MULTI      - (or cset->match_multi) extend the match
                *                                    over a run of consecutive matching bytes
                *   SECRET_BUFFER_MATCH_ANCHORED   - stop at the first byte examined if it
                *                                    does not match
                *   SECRET_BUFFER_MATCH_CONST_TIME - keep testing the remaining bytes after
                *                                    the outcome is known
                *
                * On return parse->pos / parse->lim delimit the matched byte(s); when
                * nothing matched they describe an empty span at the place the scan
                * stopped.  Returns true if a match was found.
                */
676 289           static bool sb_parse_match_charset_bytes(
677             secret_buffer_parse *parse,
678             const secret_buffer_charset *cset,
679             int flags
680             ) {
681 289           bool negate= 0 != (flags & SECRET_BUFFER_MATCH_NEGATE);
682 289           bool reverse= 0 != (flags & SECRET_BUFFER_MATCH_REVERSE);
683 289 100         bool multi= 0 != (flags & SECRET_BUFFER_MATCH_MULTI) || cset->match_multi;
    100          
684 289           bool anchored= 0 != (flags & SECRET_BUFFER_MATCH_ANCHORED);
685 289           bool consttime=0 != (flags & SECRET_BUFFER_MATCH_CONST_TIME);
                /* pos is the first byte to examine and lim is the first position NOT to
                 * examine, in scan direction.  In reverse mode lim is one byte before
                 * parse->pos; it is only compared, never dereferenced. */
686 289 100         int step= reverse? -1 : 1;
687 289 100         const U8 *pos= reverse? parse->lim-1 : parse->pos,
688 289 100         *lim= reverse? parse->pos-1 : parse->lim,
689 289           *span_start= NULL;
690             //warn("scan_charset_bytes pos=%p lim=%p len=%d", parse->pos, parse->lim, (int)(parse->lim - parse->pos));
691              
692 1102 100         while (pos != lim) {
693 1097 100         if (sbc_bitmap_test(cset->bitmap, *pos) != negate) {
694             // Found. Now are we looking for a span?
                // (if span_start is already set, this byte is the one that ends the span)
695 238 100         if (span_start)
696 105           break;
697 133           span_start= pos;
698 133 100         if (!multi) {
                // single-character match: step past it so pos marks the far edge
699 27           pos += step;
700 27           break;
701             }
                // from here on, search for the first byte that does NOT belong to the span
702 106           negate= !negate;
703 859 100         } else if (anchored && !span_start)
    100          
704 152           break;
705 813           pos += step;
706             }
707             /* If constant time operation is requested, we need to perform one sbc_bitmap_test
708             * for every remaining byte of the range, and make sure the compiler doesn't eliminate it.
                *
                * NOTE(review): this loop advances pos all the way to lim, and pos is used
                * below as the far edge of the result.  With CONST_TIME set the reported
                * span therefore appears to always extend to the end of the scanned range
                * (forward: parse->lim keeps its original value; reverse: parse->pos keeps
                * its original value).  The counters in this listing show the branch was
                * never executed.  Likely the edge should be saved before this loop --
                * confirm.
709             */
710 289 50         if (consttime) {
711 0           volatile bool sink= false;
712 0 0         while (pos != lim) {
713 0           sink ^= sbc_bitmap_test(cset->bitmap, *pos);
714 0           pos += step;
715             }
716 0           (void) sink;
717             }
718             // reached end of defined range, and implicitly ends span
                // Convert (span_start, pos) in scan direction back to a pos/lim pair.
                // In reverse mode span_start is the highest-addressed byte of the span
                // and pos is the byte just below it, hence the +1 adjustments.
719 289 100         if (reverse) {
720 86           parse->pos= pos + 1;
721 86 100         parse->lim= span_start? span_start + 1 : parse->pos;
722             } else {
723 203           parse->lim= pos;
724 203 100         parse->pos= span_start? span_start : parse->lim;
725             }
726 289           return span_start != NULL;
727             }
728              
                /* Scan decoded codepoints against a charset
                *
                * Same search as sb_parse_match_charset_bytes, except that each character
                * is decoded with sb_parse_next_codepoint / sb_parse_prev_codepoint
                * (which use parse->encoding) and tested with sbc_test_codepoint().
                * The NEGATE, REVERSE, MULTI (or cset->match_multi), ANCHORED and
                * CONST_TIME flags are read the same way as in the byte scanner.
                *
                * Returns true and sets parse->pos / parse->lim to the boundaries of the
                * match.  Returns false when nothing matched or when a character failed
                * to decode; in those cases parse->pos / parse->lim are left wherever the
                * decode helpers moved them (they are not restored here).
                */
729 9           static bool sb_parse_match_charset_codepoints(
730             secret_buffer_parse *parse,
731             const secret_buffer_charset *cset,
732             int flags
733             ) {
734             dTHX;
735 9           bool negate= 0 != (flags & SECRET_BUFFER_MATCH_NEGATE);
736 9           bool reverse= 0 != (flags & SECRET_BUFFER_MATCH_REVERSE);
737 9 50         bool multi= 0 != (flags & SECRET_BUFFER_MATCH_MULTI) || cset->match_multi;
    100          
738 9           bool anchored= 0 != (flags & SECRET_BUFFER_MATCH_ANCHORED);
739 9           bool consttime=0 != (flags & SECRET_BUFFER_MATCH_CONST_TIME);
740 9           bool span_started= false;
741 9           bool encoding_error= false;
                /* prev_mark: boundary (in scan direction) just before the character about
                 *            to be decoded, i.e. just after the previous one
                 * span_mark: value of prev_mark at the first matching character */
742 9 100         const U8 *span_mark= NULL, *prev_mark= reverse? parse->lim : parse->pos;
743              
                /* The decode helpers shrink the span themselves: next_codepoint advances
                 * parse->pos, prev_codepoint pulls back parse->lim. */
744 37 50         while (parse->pos < parse->lim) {
745 19           int codepoint= reverse? sb_parse_prev_codepoint(parse)
746 37 100         : sb_parse_next_codepoint(parse);
747             // warn("parse.pos=%p parse.lim=%p parse.enc=%d cp=%d parse.err=%s", parse->pos, parse->lim, parse->encoding, codepoint, parse->error);
748 37 100         if (codepoint < 0) {// encoding error
749 1           encoding_error= true;
750 1           break;
751             }
752 36 100         if (sbc_test_codepoint(aTHX_ cset, codepoint) != negate) {
753             // Found. Mark boundaries of char.
754             // Now are we looking for a span?
                // (if a span is already open, this character terminates it and
                //  prev_mark still points at the end of the last span character)
755 10 100         if (span_started)
756 2           break;
757 8           span_started= true;
758 8           span_mark= prev_mark;
759 8           negate= !negate;
760 8 100         if (!multi) {
                // single-character match: record the far edge of this character
761 6 100         prev_mark= reverse? parse->lim : parse->pos;
762 6           break;
763             }
764 26 50         } else if (anchored && !span_started)
    0          
765 0           break;
766 28 100         prev_mark= reverse? parse->lim : parse->pos;
767             }
768             /* If constant time operation is requested, we need to perform one sbc_test_codepoint
769             * for every remaining character of the range, and make sure the compiler doesn't eliminate it.
                *
                * NOTE(review): when a decode fails, the helpers return -1 without moving
                * parse->pos / parse->lim, and this loop has no break on that path, so the
                * loop condition never changes -- this looks like an endless loop on any
                * encoding error (including one that ended the main loop above).  The
                * counters in this listing show the branch was never executed.  Confirm.
770             */
771 9 50         if (consttime) {
772 0           volatile bool sink= false;
773 0 0         while (parse->pos < parse->lim) {
774 0           int codepoint= reverse? sb_parse_prev_codepoint(parse)
775 0 0         : sb_parse_next_codepoint(parse);
776             // warn("parse.pos=%p parse.lim=%p parse.enc=%d cp=%d parse.err=%s", parse->pos, parse->lim, parse->encoding, codepoint, parse->error);
777 0 0         if (codepoint < 0) { // encoding error
778 0           encoding_error= true;
                // still perform a lookup so the amount of work per iteration is similar
779 0           sink ^= sbc_test_codepoint(aTHX_ cset, 0);
780             }
781             else
782 0           sink ^= sbc_test_codepoint(aTHX_ cset, codepoint);
783             }
784 0           (void) sink;
785             }
786 9 100         if (encoding_error)
787 1           return false;
788             // reached end of defined range
789 8 50         if (span_started) { // and implicitly ends span
                // span_mark is the near edge and prev_mark the far edge in scan
                // direction; swap their roles for a reverse scan
790 8 100         if (reverse) {
791 5           parse->pos= prev_mark;
792 5           parse->lim= span_mark;
793             }
794             else {
795 3           parse->pos= span_mark;
796 3           parse->lim= prev_mark;
797             }
798 8           return true;
799             }
800 0           return false;
801             }
802              
                /* Compare two spans codepoint by codepoint
                *
                * Decodes lhs and rhs in step with sb_parse_next_codepoint (each according
                * to its own parse->encoding) and returns -1, 0 or 1.  The first differing
                * codepoint decides the result; if one span runs out first with no
                * difference seen, the longer span compares greater.
                *
                * Both parse states are consumed: their pos is advanced as characters are
                * decoded.  Calls croak() if either side fails to decode.
                */
803 18           int sb_parse_codepointcmp(secret_buffer_parse *lhs, secret_buffer_parse *rhs) {
804             I32 lhs_cp, rhs_cp;
805 18           volatile int ret= 0;
806             /* constant-time iteration per the shorter of the two strings */
                /* (the loop does not exit early on a difference; only the first
                 *  difference is recorded in ret) */
807 87 100         while (lhs->pos < lhs->lim && rhs->pos < rhs->lim) {
    50          
808 69           lhs_cp= sb_parse_next_codepoint(lhs);
809 69 50         if (lhs_cp < 0)
810 0           croak("Encoding error in left-hand buffer");
811 69           rhs_cp= sb_parse_next_codepoint(rhs);
812 69 50         if (rhs_cp < 0)
813 0           croak("Encoding error in right-hand buffer");
814 69 100         if (lhs_cp != rhs_cp && !ret)
    50          
815 2 50         ret= lhs_cp < rhs_cp? -1 : 1;
816             }
                /* No difference within the common length: whichever side still has
                 * data left is the greater one */
817 18           return ret? ret
818 34 100         : (lhs->pos < lhs->lim)? 1 /* right string shorter than left */
819 32 50         : (rhs->pos < rhs->lim)? -1 /* left string shorter than right */
820 16 50         : 0;
821             }
822              
823             /* Decode one character from the front of the span, per parse->encoding
                *
                * Handles ASCII, ISO-8859-1, UTF8, UTF16LE/BE, HEX (two hex digits give one
                * byte value), BASE64 (one decoded byte per call, tracked with
                * parse->pos_bit) and I32.
                *
                * Success: returns the decoded value and stores the advanced position in
                *   parse->pos.
                * Failure: sets parse->error and returns -1; parse->pos is not updated
                *   (the BASE64 "end of span" path also resets parse->pos_bit to 0).
                */
824 3365           static int sb_parse_next_codepoint(secret_buffer_parse *parse) {
825 3365           const U8 *pos= parse->pos, *lim= parse->lim;
826 3365           int cp, encoding= parse->encoding;
827             #define SB_RETURN_ERROR(msg) { parse->error= msg; return -1; }
828              
829 3365 100         if (encoding == SECRET_BUFFER_ENCODING_ASCII
830 3364 100         || encoding == SECRET_BUFFER_ENCODING_ISO8859_1
831 577 100         || encoding == SECRET_BUFFER_ENCODING_UTF8
832             ) {
833 3123 50         if (lim - pos < 1)
834 0           SB_RETURN_ERROR("end of span")
835 3123           cp= *pos++;
836 3123 100         if (cp >= 0x80 && encoding == SECRET_BUFFER_ENCODING_ASCII)
    100          
837 1           SB_RETURN_ERROR("not 7-bit ASCII")
838 3122 100         else if (cp >= 0x80 && encoding == SECRET_BUFFER_ENCODING_UTF8) {
    100          
                /* Multi-byte UTF-8.  Bits 3..6 of the lead byte select the sequence
                 * length.  Each case block checks the remaining length, records the
                 * smallest codepoint that legitimately needs that length, and masks
                 * the lead byte's payload bits.  The "if (0)" in front of the later
                 * case labels skips their setup block when control arrives from a
                 * longer sequence, so the shared continuation-byte steps run once
                 * per continuation byte. */
839 47           int min_cp= 0;
840 47           switch ((cp >> 3) & 0xF) {
841 13           case 14: // 0b1[1110]yyy
                // lead byte 0xF0..0xF7: four-byte sequence
842 13 50         { if (lim - pos < 3) goto incomplete;
843 13           min_cp= 0x10000;
844 13           cp &= 0x07;
845             }
846 13 50         if ((*pos & 0xC0) != 0x80) goto invalid;
847 13           cp= (cp << 6) | (*pos++ & 0x3F);
848             if (0)
849             case 12: case 13: // 0b1[110x]yyy
                // lead byte 0xE0..0xEF: three-byte sequence
850 14 50         { if (lim - pos < 2) goto incomplete;
851 14           min_cp= 0x800;
852 14           cp &= 0x0F;
853             }
854 27 50         if ((*pos & 0xC0) != 0x80) goto invalid;
855 27           cp= (cp << 6) | (*pos++ & 0x3F);
856             if (0)
857             case 8: case 9: case 10: case 11: // 0b1[10xx]yyy
                // lead byte 0xC0..0xDF: two-byte sequence
858 20 50         { if (lim - pos < 1) goto incomplete;
859 20           min_cp= 0x80;
860 20           cp &= 0x1F;
861             }
862 47 50         if ((*pos & 0xC0) != 0x80) goto invalid;
863 47           cp= (cp << 6) | (*pos++ & 0x3F);
864 47           break;
865             default:
                // lead byte 0x80..0xBF (stray continuation byte) or 0xF8..0xFF
866 0           invalid: SB_RETURN_ERROR("invalid UTF8 character")
867 0           incomplete: SB_RETURN_ERROR("incomplete UTF8 character")
868             }
                /* NOTE(review): only over-long forms and values above 0x10FFFF are
                 * rejected here; surrogate codepoints 0xD800..0xDFFF are not.
                 * Confirm whether that is intended. */
869 47 50         if (cp < min_cp)
870 0           SB_RETURN_ERROR("overlong encoding of UTF8 character")
871 47 50         else if (cp > 0x10FFFF)
872 0           SB_RETURN_ERROR("UTF8 character exceeds max")
873             }
874             // else all ISO-8859-1 bytes are valid codepoints
875             }
876 242 100         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
877 221 100         || encoding == SECRET_BUFFER_ENCODING_UTF16BE
878 36           ) {
                /* low = index of the low-order byte within each 2-byte unit */
879 36           int low= encoding == SECRET_BUFFER_ENCODING_UTF16LE? 0 : 1;
880 36 50         if (lim - pos < 2)
881 0           SB_RETURN_ERROR("end of span")
882 36           cp= pos[low] | ((int)pos[low^1] << 8);
883 36           pos += 2;
                /* Surrogate: combine with the following unit.
                 * NOTE(review): any surrogate value, including a low surrogate
                 * (0xDC00..0xDFFF), is accepted as the first unit here; only the
                 * second unit is range-checked.  Confirm. */
884 36 100         if (cp >= 0xD800 && cp <= 0xDFFF) {
    50          
885 10 50         if (lim - pos < 2)
886 0           SB_RETURN_ERROR("incomplete UTF16 character")
887 10           int w2= pos[low] | ((int)pos[low^1] << 8);
888 10           pos += 2;
889 10 50         if (w2 < 0xDC00 || w2 > 0xDFFF)
    50          
890 0           SB_RETURN_ERROR("invalid UTF16 low surrogate")
891 10           cp = 0x10000 + (((cp & 0x3FF) << 10) | (w2 & 0x3FF));
892             }
893             }
894 206 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
895             // Skip over whitespace
896 38 50         while (pos < lim && isspace(*pos))
    50          
897 0           pos++;
898 38 50         if (lim - pos < 2)
899 0           SB_RETURN_ERROR("end of span")
                /* Convert each digit by subtracting '0' and then folding the 'a'.. and
                 * 'A'.. ranges down to 10..; anything outside 0..15 trips the >>4 test.
                 * NOTE(review): in ASCII the characters ':' through '?' come out as
                 * 10..15 after subtracting '0' and are not adjusted or rejected, so
                 * they appear to be accepted as hex digits.  Rejecting characters
                 * below '0' relies on >> of a negative int being non-zero.  Confirm. */
900 38           int high= *pos++ - '0';
901 38           int low= *pos++ - '0';
902 38 50         if (low >= ('a'-'0')) low -= ('a'-'0'-10);
903 38 100         else if (low >= ('A'-'0')) low -= ('A'-'0'-10);
904 38 50         if (high >= ('a'-'0')) high -= ('a'-'0'-10);
905 38 100         else if (high >= ('A'-'0')) high -= ('A'-'0'-10);
906 38 50         if ((low >> 4) | (high >> 4))
907 0           SB_RETURN_ERROR("not a pair of hex digits")
908 38           cp= (high << 4) | low;
909             // skip over whitespace if it takes us to the end of buffer so that caller
910             // knows it's EOF before trying another decode.
911 38 100         while (pos < lim && isspace(*pos))
    50          
912 0           pos++;
913             }
914 168 50         else if (encoding == SECRET_BUFFER_ENCODING_BASE64) {
                /* Each call yields one decoded byte assembled from two adjacent base64
                 * characters.  base64_decode_table is defined elsewhere; a negative
                 * entry is treated here as "not a base64 character". */
915             // Skip over whitespace and control chars
916 168 50         while (pos < lim && *pos <= ' ')
    50          
917 0           pos++;
918             // There need to be at least 2 base64 characters left
919 168 50         if (pos < lim) {
920 168 50         if (base64_decode_table[*pos] < 0)
921 0           SB_RETURN_ERROR("invalid base64 character");
922             // ->pos_bit > 0 means pointer is pointing at a sub-bit of the base64
923             // character at *pos (and possible values are 0, 2, or 4)
                // high part of the byte: the not-yet-consumed low bits of this character
924 168           cp= (((int)base64_decode_table[*pos++]) << (2 + parse->pos_bit)) & 0xFF;
925 168 50         while (pos < lim && *pos <= ' ')
    50          
926 0           pos++;
927             }
                // (if the span was already empty, cp was never assigned and we leave here)
928 168 50         if (pos >= lim) {
929 0           parse->pos_bit= 0;
930 0           SB_RETURN_ERROR("end of span")
931             }
932 168 50         if (base64_decode_table[*pos] < 0)
933 0           SB_RETURN_ERROR("invalid base64 character");
                // low part of the byte: the top bits of the following character, which
                // stays current (pos is not advanced) unless it is now fully consumed
934 168           cp |= base64_decode_table[*pos] >> (4-parse->pos_bit);
935 168           parse->pos_bit += 2;
936             // If pos_bit == 6 we've completed a set of 4 b64 chars and fully consumed them.
937 168 100         if (parse->pos_bit >= 6) {
938 51           pos++;
939 51           parse->pos_bit= 0;
940             // consume trailing whitespace
941 55 100         while (pos < lim && *pos <= ' ')
    100          
942 4           pos++;
943             }
944             else {
945             // if next char is '=', terminate the decoding
946 117           const U8 *next= pos+1;
947 117 50         while (next < lim && *next <= ' ')
    50          
948 0           next++;
949 117 50         if (next < lim && *next == '=') {
    100          
950 13           pos= lim; // indicate parsing complete
951 13           parse->pos_bit= 0;
952             }
953             }
954             }
955 0 0         else if (encoding == SECRET_BUFFER_ENCODING_I32) {
                /* Four bytes are read as one I32 in the host's byte order.
                 * NOTE(review): the pointer cast presumes pos is suitably aligned for
                 * I32, and a stored negative value would be indistinguishable from the
                 * -1 error return for callers that test for < 0.  Confirm. */
956 0 0         if (lim - pos < 4)
957 0           SB_RETURN_ERROR("end of span");
958 0           cp= *(I32*)pos;
959 0           pos+= 4;
960             }
961 0           else SB_RETURN_ERROR("unsupported encoding")
                /* success: commit the advanced position */
962 3364           parse->pos= pos;
963 3364           return cp;
964             #undef SB_RETURN_ERROR
965             }
966              
                /* Decode one character from the END of the span, per parse->encoding
                *
                * Mirror image of sb_parse_next_codepoint: the character that ends at
                * parse->lim is decoded and parse->lim is moved back to where that
                * character begins.  BASE64 tracks partial consumption of the final
                * character in parse->lim_bit.
                *
                * Success: returns the decoded value and stores the new limit in parse->lim.
                * Failure: sets parse->error and returns -1; parse->lim is not updated
                *   (the BASE64 "end of span" path also resets parse->lim_bit to 0).
                */
967 850           static int sb_parse_prev_codepoint(secret_buffer_parse *parse) {
968 850           const U8 *pos= parse->pos, *lim= parse->lim;
969 850           int encoding= parse->encoding;
970             int cp;
971             #define SB_RETURN_ERROR(msg) { parse->error= msg; return -1; }
972              
973 850 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII
974 850 100         || encoding == SECRET_BUFFER_ENCODING_ISO8859_1
975 25 100         || encoding == SECRET_BUFFER_ENCODING_UTF8
976             ) {
977 842 50         if (lim <= pos)
978 0           SB_RETURN_ERROR("end of span")
979 842           cp= *--lim;
980             // handle the simple case first
                // (a byte below 0x80, or any byte in ISO-8859-1, is a complete character)
981 842 100         if (cp >= 0x80 && encoding != SECRET_BUFFER_ENCODING_ISO8859_1) {
    50          
982             // Strict ASCII can't encode above 0x7F
983 4 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII)
984 0           SB_RETURN_ERROR("not 7-bit ASCII")
985             // else need to backtrack and then call next_codepoint
                // Walk back over continuation bytes (10xxxxxx) to the lead byte.
                // NOTE(review): if every byte back to pos is a continuation byte, the
                // loop leaves start at pos-1, and parse->pos is then set to that address,
                // so next_codepoint would read one byte before the span.  Confirm.
986 4           const U8 *start= lim;
987 12 50         while (start >= pos && (*start & 0xC0) == 0x80)
    100          
988 8           --start;
                // Temporarily point parse->pos at the lead byte and decode forward;
                // parse->lim still holds the original limit at this point.
989 4           parse->pos= start;
990 4           cp= sb_parse_next_codepoint(parse);
991 4 50         if (parse->pos != parse->lim) {// consumed all characters we gave it?
992 0           parse->pos= pos; // restore original pos
993 0 0         if (cp >= 0) // had a valid char, but extra 0x80 bytes
994 0           parse->error= "invalid UTF8 character";
995             // else use the error message from next_codepoint
996 0           return -1;
997             }
998 4           parse->pos= pos; // restore original pos
999 4           lim= start; // new lim is where we started the parse from
1000             }
1001             }
1002 8 50         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1003 8 100         || encoding == SECRET_BUFFER_ENCODING_UTF16BE
1004 1           ) {
1005 1 50         if (lim - pos < 2)
1006 0           SB_RETURN_ERROR("end of span");
1007             // handle the simple case first
1008 1           lim -= 2;
                 // low = index of the low-order byte within each 2-byte unit
1009 1           int low= (encoding == SECRET_BUFFER_ENCODING_UTF16LE)? 0 : 1;
1010 1           cp= lim[low] | ((int)lim[low^1] << 8);
                 // Surrogate: the unit before it must be the other half of the pair.
                 // NOTE(review): lim has already been moved back by 2, and only the 2
                 // bytes at lim-2 are read below, yet this check demands 4 more bytes.
                 // A span holding exactly one surrogate pair would report "end of span".
                 // Also, w1 is accepted anywhere in 0xD800..0xDFFF rather than only the
                 // high-surrogate range.  Confirm both.
1011 1 50         if (cp >= 0xD800 && cp <= 0xDFFF) {
    50          
1012 1 50         if (lim - pos < 4)
1013 0           SB_RETURN_ERROR("end of span");
1014 1           lim -= 2;
1015 1           int w1= lim[low] | ((int)lim[low^1] << 8);
1016 1 50         if (w1 < 0xD800 || w1 > 0xDFFF || cp < 0xDC00)
    50          
    50          
1017 0           SB_RETURN_ERROR("invalid UTF16 surrogate");
1018 1           cp = 0x10000 + (((w1 & 0x3FF) << 10) | (cp & 0x3FF));
1019             }
1020             }
1021 7 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
1022             // Skip over whitespace
1023 1 50         while (pos < lim && isspace(lim[-1]))
    50          
1024 0           lim--;
1025 1 50         if (lim - pos < 2)
1026 0 0         SB_RETURN_ERROR((pos == lim? "end of span" : "incomplete hex pair at end of span"))
                 // Same digit conversion as the forward decoder, reading low digit first.
                 // NOTE(review): as there, ':' through '?' appear to pass as digits 10..15.
1027 1           int low= *--lim - '0';
1028 1           int high= *--lim - '0';
1029 1 50         if (low >= ('a'-'0')) low -= ('a'-'0'-10);
1030 0 0         else if (low >= ('A'-'0')) low -= ('A'-'0'-10);
1031 1 50         if (high >= ('a'-'0')) high -= ('a'-'0'-10);
1032 0 0         else if (high >= ('A'-'0')) high -= ('A'-'0'-10);
1033 1 50         if ((low >> 4) | (high >> 4))
1034 0           SB_RETURN_ERROR("not a pair of hex digits")
1035 1           cp= (high << 4) | low;
1036             }
1037 6 50         else if (encoding == SECRET_BUFFER_ENCODING_BASE64) {
                 // One decoded byte per pass, built from the last two base64 characters.
                 // A pass that started on an '=' is repeated (again == true) because the
                 // value it produced is padding rather than data.
1038             bool again;
1039             do {
1040 9           again= false;
1041             // Skip over non-base64 chars
1042 12 50         while (pos < lim && base64_decode_table[lim[-1]] < 0)
    100          
1043 3           lim--;
1044 9 50         if (pos < lim) {
1045             //warn("lim-pos=%d, lim[-1]=%c, lim_bit=%d", (int)(lim-pos), lim[-1], parse->lim_bit);
1046 9 50         if (base64_decode_table[lim[-1]] < 0)
1047 0           SB_RETURN_ERROR("invalid base64 character");
1048             // ->lim_bit > 0 means the character lim[-1] is partially consumed.
1049             // (sequence is 0, 2, 4, 0)
                 // low part of the byte: the not-yet-consumed high bits of this character
1050 9           cp= ((int)base64_decode_table[lim[-1]]) >> parse->lim_bit;
1051             // parsing an equal sign means 'cp' is bogus and need to go again
1052 9 100         if (lim[-1] == '=')
1053 3           again= true;
1054 9           --lim;
1055             // find next base64 char
1056 9 50         while (pos < lim && base64_decode_table[lim[-1]] < 0)
    50          
1057 0           lim--;
1058             }
1059 9 50         if (pos >= lim) {
1060 0           parse->lim_bit= 0;
1061 0           SB_RETURN_ERROR("end of span")
1062             }
1063 9 50         if (base64_decode_table[lim[-1]] < 0)
1064 0           SB_RETURN_ERROR("invalid base64 character");
1065             //warn(" lim-pos=%d, lim[-1]=%c, lim_bit=%d", (int)(lim-pos), lim[-1], parse->lim_bit);
                 // high part of the byte: low bits of the preceding character, which stays
                 // current (lim is not moved past it) unless it is now fully consumed
1066 9           cp |= (((int)base64_decode_table[lim[-1]]) << (6 - parse->lim_bit)) & 0xFF;
1067 9           parse->lim_bit += 2;
1068 9 100         if (parse->lim_bit >= 6) {
1069 3           parse->lim_bit= 0;
1070             // If completed a set of 4 b64 chars, lim[-1] is consumed, and need to
1071             // walk backward to find next base64 char
1072 3           --lim;
1073 3 50         while (pos < lim && base64_decode_table[lim[-1]] < 0)
    0          
1074 0           lim--;
1075             }
1076             //warn(" cp=%d, lim-pos=%d, lim_bit=%d", cp, (int)(lim-pos), parse->lim_bit);
1077 9 100         } while (again);
1078             }
1079 0 0         else if (encoding == SECRET_BUFFER_ENCODING_I32) {
                 // Four bytes read as one I32 in the host's byte order.
                 // NOTE(review): the cast presumes lim-4 is suitably aligned -- confirm.
1080 0 0         if (lim - pos < 4)
1081 0           SB_RETURN_ERROR("end of span");
1082 0           lim -= 4;
1083 0           cp= *(I32*)lim;
1084             }
1085 0           else SB_RETURN_ERROR("unsupported encoding")
                 // success: commit the new limit
1086 850           parse->lim= lim;
1087 850           return cp;
1088             #undef SB_RETURN_ERROR
1089             }
1090              
                 /* Number of bytes needed to write 'codepoint' in 'encoding'
                 *
                 * Returns the byte count, or -1 when the codepoint cannot be represented
                 * in that encoding or the encoding is not one of those listed below.
                 *
                 * NOTE(review): there is no lower-bound check, so a negative codepoint
                 * satisfies the "< 0x80" / "< 0x100" tests and yields a positive size.
                 * The UTF8 branch also does not reject surrogates or values above
                 * 0x10FFFF.  Confirm callers filter these.
                 */
1091 1202           static int sizeof_codepoint_encoding(int codepoint, int encoding) {
1092 1202 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII)
1093 0 0         return codepoint < 0x80? 1 : -1;
1094 1202 100         if (encoding == SECRET_BUFFER_ENCODING_ISO8859_1)
1095 110 50         return codepoint < 0x100? 1 : -1;
1096 1092 100         else if (encoding == SECRET_BUFFER_ENCODING_UTF8)
1097 736 100         return codepoint < 0x80? 1 : codepoint < 0x800? 2 : codepoint < 0x10000? 3 : 4;
    100          
    100          
                 /* UTF-16: surrogate values cannot be encoded as characters; everything
                  * else is one 2-byte unit or a 4-byte surrogate pair */
1098 356 50         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1099 356 50         || encoding == SECRET_BUFFER_ENCODING_UTF16BE)
1100 0 0         return codepoint >= 0xD800 && codepoint < 0xE000? -1
1101 0 0         : codepoint < 0x10000? 2 : 4;
    0          
                 /* HEX: one byte value is written as two hex digits */
1102 356 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX)
1103 6 50         return codepoint < 0x100? 2 : -1;
1104             /* Base64 would need to track an accumulator, so just return 1 and fix it in the caller */
1105 350 100         else if (encoding == SECRET_BUFFER_ENCODING_BASE64)
1106 78 50         return codepoint < 0x100? 1 : -1;
1107 272 50         else if (encoding == SECRET_BUFFER_ENCODING_I32)
1108 272           return 4;
1109             else
1110 0           return -1;
1111             }
1112              
                 /* Write one codepoint at dst->pos using dst->encoding
                 *
                 * Success: the encoded bytes are stored starting at the old dst->pos,
                 *   dst->pos is advanced by the encoded size, and true is returned.
                 * Failure: dst->error is set to one of
                 *     "invalid codepoint"                (codepoint >= 0x110000)
                 *     "character too wide for encoding"  (sizeof_codepoint_encoding < 0)
                 *     "buffer too small"                 (fewer than n bytes before dst->lim)
                 *     "unsupported encoding"
                 *   and false is returned.
                 *
                 * NOTE(review): dst->pos is advanced before the per-encoding branch, so
                 * the "unsupported encoding" failure -- reachable for BASE64, for which
                 * sizeof_codepoint_encoding returns 1 -- returns false with dst->pos
                 * already moved.  Negative codepoints are not rejected here either.
                 * Confirm.
                 */
1113 444           static bool sb_parse_encode_codepoint(secret_buffer_parse *dst, int codepoint) {
1114             #define SB_RETURN_ERROR(msg) { dst->error= msg; return false; }
1115 444           int encoding= dst->encoding, n;
                 /* dst->pos is cast to a writable pointer for the stores below */
1116 444           U8 *dst_pos= (U8*) dst->pos;
1117             // codepoints above 0x10FFFF are illegal
1118 444 50         if (codepoint >= 0x110000)
1119 0           SB_RETURN_ERROR("invalid codepoint");
1120             // not quite as efficient as checking during the code below, but saves a bunch of redundancy
1121 444           n= sizeof_codepoint_encoding(codepoint, encoding);
1122 444 50         if (n < 0)
1123 0           SB_RETURN_ERROR("character too wide for encoding")
1124 444 50         if (dst->lim - dst_pos < n)
1125 0           SB_RETURN_ERROR("buffer too small")
                 /* commit the advance now; dst_pos keeps the address to write at */
1126 444           dst->pos += n;
1127              
1128 444 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII
1129 444 100         || encoding == SECRET_BUFFER_ENCODING_ISO8859_1
1130 389 100         || encoding == SECRET_BUFFER_ENCODING_UTF8
1131             ) {
                 /* n is 1 for ASCII / ISO-8859-1 and 1..4 for UTF-8; each case writes
                  * the lead byte followed by n-1 continuation bytes of 6 bits each */
1132 423           switch ((n-1)&0x3) { // help the compiler understand there are only 4 possible values
1133 401           case 0: *dst_pos++ = (U8) codepoint;
1134 401           break;
1135 10           case 1: *dst_pos++ = (U8)(0xC0 | (codepoint >> 6));
1136 10           *dst_pos++ = (U8)(0x80 | (codepoint & 0x3F));
1137 10           break;
1138 4           case 2: *dst_pos++ = (U8)(0xE0 | (codepoint >> 12));
1139 4           *dst_pos++ = (U8)(0x80 | ((codepoint >> 6) & 0x3F));
1140 4           *dst_pos++ = (U8)(0x80 | (codepoint & 0x3F));
1141 4           break;
1142 8           case 3: *dst_pos++ = (U8)(0xF0 | (codepoint >> 18));
1143 8           *dst_pos++ = (U8)(0x80 | ((codepoint >> 12) & 0x3F));
1144 8           *dst_pos++ = (U8)(0x80 | ((codepoint >> 6) & 0x3F));
1145 8           *dst_pos++ = (U8)(0x80 | (codepoint & 0x3F));
1146 8           break;
1147             }
1148             }
1149 21 50         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1150 21 50         || encoding == SECRET_BUFFER_ENCODING_UTF16BE
1151 0           ) {
                 /* low = index of the low-order byte within each 2-byte unit; XOR-ing
                  * an index with low therefore selects the right byte for either
                  * byte order */
1152 0           int low= (encoding == SECRET_BUFFER_ENCODING_UTF16LE)? 0 : 1;
1153 0 0         if (n == 2) {
1154 0           dst_pos[low] = (U8)(codepoint & 0xFF);
1155 0           dst_pos[low^1] = (U8)(codepoint >> 8);
1156             }
1157             else {
                 /* surrogate pair: w0 carries the top 10 bits, w1 the bottom 10 bits
                  * of (codepoint - 0x10000) */
1158 0           int adjusted = codepoint - 0x10000;
1159 0           int w0 = 0xD800 | (adjusted >> 10);
1160 0           int w1 = 0xDC00 | (adjusted & 0x3FF);
1161 0           dst_pos[low] = (U8)(w0 & 0xFF);
1162 0           dst_pos[1^low] = (U8)(w0 >> 8);
1163 0           dst_pos[2^low] = (U8)(w1 & 0xFF);
1164 0           dst_pos[3^low] = (U8)(w1 >> 8);
1165             }
1166             }
1167 21 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
                 /* two upper-case hex digits, high nibble first */
1168 3           dst_pos[0] = "0123456789ABCDEF"[(codepoint >> 4) & 0xF];
1169 3           dst_pos[1] = "0123456789ABCDEF"[codepoint & 0xF];
1170             }
1171 18 50         else if (encoding == SECRET_BUFFER_ENCODING_I32) {
                 /* stored in the host's byte order.
                  * NOTE(review): the cast presumes dst_pos is aligned for I32 -- confirm. */
1172 18           *(I32*)dst_pos = codepoint;
1173             }
1174             /* BASE64 is not handled here because the '=' padding can only be generated in
1175             * a context that knows when we are ending on a non-multiple-of-4. */
1176 0           else SB_RETURN_ERROR("unsupported encoding");
1177 444           return true;
1178             #undef SB_RETURN_ERROR
1179             }
1180              
                 /* The same implementation file is included twice, each time with a
                  * different function name and pattern element type selected through the
                  * two macros below.  Presumably the included file expands them to
                  * define sb_parse_match_str_U8 (pattern of const U8) and
                  * sb_parse_match_str_I32 (pattern of const I32) -- confirm against
                  * secret_buffer_parse_match_str.c.  The macros are undefined again
                  * after each include so the two expansions cannot interfere. */
1181             #define SB_PARSE_MATCH_STR_FN sb_parse_match_str_U8
1182             #define SB_PATTERN_EL_TYPE const U8
1183             #include "secret_buffer_parse_match_str.c"
1184             #undef SB_PARSE_MATCH_STR_FN
1185             #undef SB_PATTERN_EL_TYPE
1186              
1187             #define SB_PARSE_MATCH_STR_FN sb_parse_match_str_I32
1188             #define SB_PATTERN_EL_TYPE const I32
1189             #include "secret_buffer_parse_match_str.c"
1190             #undef SB_PARSE_MATCH_STR_FN
1191             #undef SB_PATTERN_EL_TYPE