File Coverage

secret_buffer_parse.c
Criterion Covered Total %
statement 550 700 78.5
branch 397 594 66.8
condition n/a
subroutine n/a
pod n/a
total 947 1294 73.1


line stmt bran cond sub pod time code
1              
2             /* These local parse functions are independent of the SecretBuffer instance,
3             * needing only the 'data' pointer to which the parse_state refers.
4             * The pos/lim of the parse state must already be checked against the length
5             * of the data before calling these.
6             */
7              
8             /* compute number of bytes needed for one character */
9             static int sizeof_codepoint_encoding(int codepoint, int encoding);
10             /* parse codepoint from end of parse and decrement ->lim */
11             static int sb_parse_prev_codepoint(secret_buffer_parse *parse);
12             /* parse codepoint from start of parse and increment ->pos */
13             static int sb_parse_next_codepoint(secret_buffer_parse *parse);
14             /* encode codepoint into buffer range described by 'parse' */
15             static bool sb_parse_encode_codepoint(secret_buffer_parse_rw *parse, int codepoint);
16              
17             static bool sb_parse_match_charset_bytes(secret_buffer_parse *parse, const secret_buffer_charset *cset, int flags);
18             static bool sb_parse_match_charset_codepoints(secret_buffer_parse *parse, const secret_buffer_charset *cset, int flags);
19             static bool sb_parse_match_str_U8(secret_buffer_parse *parse, const U8 *pattern, size_t pattern_len, int flags);
20             static bool sb_parse_match_str_I32(secret_buffer_parse *parse, const I32 *pattern, size_t pattern_len, int flags);
21              
22 102           static bool parse_encoding(pTHX_ SV *sv, int *out) {
23             int enc;
24 102 100         if (looks_like_number(sv)) {
25 6           IV i= SvIV(sv);
26 6 50         if (i < 0 || i > SECRET_BUFFER_ENCODING_MAX)
    50          
27 0           return false;
28 6           enc= (int) i;
29             } else {
30             STRLEN len;
31 96           const char *str= SvPV(sv, len);
32 96           switch (len) {
33 6 50         case 3: if (0 == strcmp(str, "HEX")) { enc= SECRET_BUFFER_ENCODING_HEX; break; }
34 1 50         case 4: if (0 == strcmp(str, "UTF8")) { enc= SECRET_BUFFER_ENCODING_UTF8; break; }
35 43 100         case 5: if (0 == strcmp(str, "ASCII")) { enc= SECRET_BUFFER_ENCODING_ASCII; break; }
36 42 50         if (0 == strcmp(str, "UTF-8")) { enc= SECRET_BUFFER_ENCODING_UTF8; break; }
37 26 50         case 6: if (0 == strcmp(str, "BASE64")) { enc= SECRET_BUFFER_ENCODING_BASE64; break; }
38 1 50         case 7: if (0 == strcmp(str, "UTF16LE")) { enc= SECRET_BUFFER_ENCODING_UTF16LE; break; }
39 0 0         if (0 == strcmp(str, "UTF16BE")) { enc= SECRET_BUFFER_ENCODING_UTF16BE; break; }
40 6 100         case 8: if (0 == strcmp(str, "UTF-16LE")) { enc= SECRET_BUFFER_ENCODING_UTF16LE; break; }
41 3 50         if (0 == strcmp(str, "UTF-16BE")) { enc= SECRET_BUFFER_ENCODING_UTF16BE; break; }
42 0 0         case 9: if (0 == strcmp(str, "ISO8859_1")) { enc= SECRET_BUFFER_ENCODING_ISO8859_1; break; }
43 13 50         case 10: if (0 == strcmp(str, "ISO-8859-1")) { enc= SECRET_BUFFER_ENCODING_ISO8859_1; break; }
44             default:
45 0           return false;
46             }
47             }
48 102 50         if (out) *out= enc;
49 102           return true;
50             }
51              
52             /* Public API --------------------------------------------------------------*/
53              
54             /* initialize a parse struct, but only if it is valid span of the buffer */
55 2266           bool secret_buffer_parse_init(secret_buffer_parse *parse,
56             secret_buffer *buf, size_t pos, size_t lim, int encoding
57             ) {
58 2266           Zero(parse, 1, secret_buffer_parse);
59             // Sanity check this parse state vs. the buffer
60 2266 100         if (lim > buf->len || pos > lim) {
    50          
61 1 50         parse->error= pos > lim? "span starts beyond buffer" : "span ends beyond buffer";
62 1           return false;
63             }
64 2265           parse->pos= ((U8*) buf->data) + pos;
65 2265           parse->lim= ((U8*) buf->data) + lim;
66 2265           parse->encoding= encoding;
67 2265           parse->sbuf= buf;
68 2265           return true;
69             }
70              
71             /* Initialize a parse struct, either from a Span, or a SecretBuffer, or a plain Scalar.
72             */
73 1741           bool secret_buffer_parse_init_from_sv(secret_buffer_parse *parse, SV *sv) {
74             dTHX;
75             secret_buffer *sb;
76             secret_buffer_span *span;
77             /* Is the sv a Span object? */
78 1741 100         if ((span= secret_buffer_span_from_magic(sv, 0)) && SvTYPE(SvRV(sv)) == SVt_PVHV) {
    50          
79 1286           SV **sb_sv= hv_fetchs((HV*)SvRV(sv), "buf", 1);
80 1286           sb= secret_buffer_from_magic(*sb_sv, SECRET_BUFFER_MAGIC_OR_DIE);
81 1286           return secret_buffer_parse_init(parse, sb, span->pos, span->lim, span->encoding);
82             }
83             /* Is the sv a SecretBuffer? */
84 455 100         else if ((sb= secret_buffer_from_magic(sv, 0))) {
85 2           return secret_buffer_parse_init(parse, sb, 0, sb->len, SECRET_BUFFER_ENCODING_ISO8859_1);
86             }
87             /* It needs to at least be defined */
88 453 50         else if (SvOK(sv)) {
89             STRLEN len;
90 453           char *buf= SvPV(sv, len);
91 453           Zero(parse, 1, secret_buffer_parse);
92 453           parse->pos= (U8*) buf;
93 453           parse->lim= (U8*) buf + len;
94 453           parse->encoding= SvUTF8(sv)? SECRET_BUFFER_ENCODING_UTF8 : SECRET_BUFFER_ENCODING_ISO8859_1;
95 453           return true;
96             }
97             else {
98 0           Zero(parse, 1, secret_buffer_parse);
99 0           parse->error= "Not a Span, SecretBuffer, or defined scalar";
100 0           return false;
101             }
102             }
103              
104             /* Scan for a pattern which may be a regex or literal string.
105             * Regexes are currently limited to a single charclass.
106             */
107 1048           bool secret_buffer_match(secret_buffer_parse *parse, SV *pattern, int flags) {
108             dTHX;
109 1048           REGEXP *rx= (REGEXP*)SvRX(pattern);
110             secret_buffer_parse pat_parse;
111              
112             /* Is the pattern a regexp-ref? */
113 1048 100         if (rx) {
114 609           secret_buffer_charset *cset= secret_buffer_charset_from_regexpref(pattern);
115 609           return secret_buffer_match_charset(parse, cset, flags);
116             }
117              
118             /* load up a parse struct with the pos, lim, and encoding */
119 439 50         if (!secret_buffer_parse_init_from_sv(&pat_parse, pattern))
120 0           croak("%s", pat_parse.error);
121              
122             /* Remove edge case of zero-length pattern (always matches) */
123 439 100         if (pat_parse.pos >= pat_parse.lim) {
124 2 50         if ((flags & SECRET_BUFFER_MATCH_REVERSE))
125 0           parse->pos= parse->lim;
126             else
127 2           parse->lim= parse->pos;
128 2           return !(flags & SECRET_BUFFER_MATCH_NEGATE);
129             }
130             /* Remove edge case of zero-length subject (never matches) */
131 437 100         if (parse->pos >= parse->lim) {
132 4           return (flags & SECRET_BUFFER_MATCH_NEGATE);
133             }
134              
135             /* Since unicode iteration of the pattern is a hassle and might happen lots of times,
136             * convert it to either plain bytes or array of U32 codepoints.
137             */
138 433 100         if (pat_parse.encoding != SECRET_BUFFER_ENCODING_ISO8859_1) {
139 17           int dst_enc=
140             /* these can be transcoded to bytes */
141 17           (pat_parse.encoding == SECRET_BUFFER_ENCODING_ASCII
142 17 100         || pat_parse.encoding == SECRET_BUFFER_ENCODING_HEX
143 16 50         || pat_parse.encoding == SECRET_BUFFER_ENCODING_BASE64)
144             ? SECRET_BUFFER_ENCODING_ISO8859_1
145 34 50         : SECRET_BUFFER_ENCODING_I32;
146 17           SSize_t dst_len= secret_buffer_sizeof_transcode(&pat_parse, dst_enc);
147 17 50         if (dst_len < 0)
148 0           croak("transcode of pattern failed: %s", pat_parse.error);
149             /* No need to transcode SECRET_BUFFER_ENCODING_ASCII, but the above size check
150             * verified it is clean 7-bit, which is the whole point of that encoding.
151             */
152 17 50         if (pat_parse.encoding == SECRET_BUFFER_ENCODING_ASCII
153             /* Likewise, if SECRET_BUFFER_ENCODING_UTF8's I32 len is exactly 4x the number of
154             * original bytes, that means every byte became a character, which means every
155             * character could fit in a byte. */
156 17 100         || (pat_parse.encoding == SECRET_BUFFER_ENCODING_UTF8
157 16 100         && dst_len == (pat_parse.lim - pat_parse.pos) * 4)
158             ) {
159 9           pat_parse.encoding= SECRET_BUFFER_ENCODING_ISO8859_1;
160             } else {
161             /* create a temporary secret buffer to hold the transcode */
162 8           secret_buffer *tmp= secret_buffer_new(0, NULL);
163 8           secret_buffer_parse pat_orig= pat_parse;
164 8           secret_buffer_set_len(tmp, dst_len);
165 8 50         if (!secret_buffer_parse_init(&pat_parse, tmp, 0, dst_len, dst_enc))
166 0           croak("transcode of pattern failed: %s", pat_parse.error);
167             /* Transcode the pattern */
168 8 50         if (!secret_buffer_transcode(&pat_orig, (secret_buffer_parse_rw*) &pat_parse))
169 0 0         croak("transcode of pattern failed: %s", pat_orig.error? pat_orig.error : pat_parse.error);
170             }
171             }
172             /* In some cases it would also be nice to transcode the subject first, but the
173             * final state of the parse struct carries information back to the caller and
174             * needs to refer to original positions of characters. */
175              
176             /* Now dipatch to sb_parse_match_str_X */
177 433 100         if (pat_parse.encoding == SECRET_BUFFER_ENCODING_ISO8859_1) {
178 426           size_t pat_len= pat_parse.lim - pat_parse.pos;
179 426           return sb_parse_match_str_U8(parse, pat_parse.pos, pat_len, flags);
180             } else { /* must be _I32 encoding, from above */
181 7           size_t pat_len= (pat_parse.lim - pat_parse.pos) >> 2;
182 7           return sb_parse_match_str_I32(parse, (I32*) pat_parse.pos, pat_len, flags);
183             }
184             }
185              
186             /* Scan for a pattern which is a set of characters */
187 609           bool secret_buffer_match_charset(secret_buffer_parse *parse, secret_buffer_charset *cset, int flags) {
188 609 100         if (parse->pos >= parse->lim) // empty range
189 48           return false;
190              
191             // byte matching gets to use a more efficient algorithm
192 561           return parse->encoding == SECRET_BUFFER_ENCODING_ISO8859_1
193 358           ? sb_parse_match_charset_bytes(parse, cset, flags)
194 919 100         : sb_parse_match_charset_codepoints(parse, cset, flags);
195             }
196              
197             /* Scan for a pattern which is a literal string of bytes.
198             */
199 0           bool secret_buffer_match_bytestr(secret_buffer_parse *parse, char *data, size_t datalen, int flags) {
200 0           return sb_parse_match_str_U8(parse, (U8*) data, datalen, flags);
201             }
202              
203             /* Count number of bytes required to transcode the source.
204             * If the source contains an invalid character for its encoding, or that codepoint
205             * can't be encoded as the dst_encoding, this returns -1 and sets src->error
206             * and also sets src->pos pointing at the character that could not be converted.
207             */
208 118           SSize_t secret_buffer_sizeof_transcode(secret_buffer_parse *src, int dst_encoding) {
209             // If the source and destination encodings are both bytes, return the length
210 118 100         if (dst_encoding == src->encoding && src->encoding == 0)
    100          
211 17           return src->lim - src->pos;
212             // Else need to iterate characters (to validate) and re-encode them
213             else {
214 101           size_t dst_size_needed= 0;
215             secret_buffer_parse tmp;
216 101           Zero(&tmp, 1, secret_buffer_parse);
217 101           tmp.pos= src->pos;
218 101           tmp.lim= src->lim;
219 101           tmp.encoding= src->encoding;
220 859 100         while (tmp.pos < tmp.lim) {
221 758           int cp= sb_parse_next_codepoint(&tmp);
222 758 50         if (cp < 0) return -1;
223 758           int ch_size= sizeof_codepoint_encoding(cp, dst_encoding);
224 758 50         if (ch_size < 0) return -1;
225 758           dst_size_needed += ch_size;
226             }
227             // If dest is base64, need special calculation
228 101 100         if (dst_encoding == SECRET_BUFFER_ENCODING_BASE64) {
229 10           dst_size_needed= ((dst_size_needed + 2) / 3) * 4;
230             }
231 101           return dst_size_needed;
232             }
233             }
234              
/* Base64 encoding alphabet: maps a 6-bit value (0..63) to its output character.
 * The array is exactly 64 bytes, so the string literal's NUL terminator is not
 * stored; it must be indexed, never treated as a C string.
 */
static const char base64_alphabet[64]=
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
239              
/* Base64 decoding table, indexed by input byte value.
 * Entries are the 6-bit value (0..63) of an alphabet character, 64 for the
 * padding character '=', and -1 for every byte that is not part of base64.
 * Generated with:
perl -E 'my @tbl= (-1)x256;
$tbl[ord]= -ord(A)+ord for A..Z;
$tbl[ord]= 26-ord(a)+ord for a..z;
$tbl[ord]= 52-ord(0)+ord for 0..9;
$tbl[ord "+"]= 62;
$tbl[ord "/"]= 63;
$tbl[ord "="]= 64;
say join ",\n", map join(",", @tbl[$_*16 .. $_*16+15]), 0..0xF'
*/
static const int8_t base64_decode_table[256]= {
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,
52,53,54,55,56,57,58,59,60,61,-1,-1,-1,64,-1,-1,
-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
};
268              
269             /* Transcode characters from one parse state into another.
270             * This works sort of like
271             * $data= decode($src_enc, substr($src, $src_pos, $src_len));
272             * substr($dst, $dst_pos, $dst_lim, encode($dst_enc, $data));
273             * processing only a range of the source, and replacing only a range of the dest,
274             * adjusting the size of dst as needed. Both src->pos and dst->pos
275             * are updated.
276             */
277 109           bool secret_buffer_transcode(secret_buffer_parse *src, secret_buffer_parse_rw *dst) {
278 109           src->error= NULL;
279 109           dst->error= NULL;
280             // If the source and destination encodings are both bytes, use memcpy
281 109 100         if (dst->encoding == src->encoding && src->encoding == 0) {
    100          
282 17           size_t cnt= dst->lim - dst->pos;
283 17 50         if (src->lim - src->pos != cnt) {
284 0           dst->error= "miscalculated buffer length";
285 0           return false;
286             }
287 17           memcpy((U8*)dst->pos, src->pos, cnt);
288 17           dst->pos += cnt;
289 17           src->pos += cnt;
290             }
291             // Else need to iterate characters and re-encode them
292             // base64 encoding doesn't work with sb_parse_encode_codepoint, so it gets
293             // special treatment.
294 92 100         else if (dst->encoding == SECRET_BUFFER_ENCODING_BASE64) {
295             // Read 3, write 4
296 10           int accum= 0;
297 10           int shift= 16, cp;
298 88 100         while (src->pos < src->lim) {
299 78           cp= sb_parse_next_codepoint(src);
300 78 50         if (cp > 0xFF) {
301 0           dst->error= "byte out of range";
302 0           return false;
303             }
304 78 100         if (!shift) {
305 24           U8 *writable= (U8*) dst->pos;
306 24 50         if (dst->pos + 4 > dst->lim) {
307 0           dst->error= "miscalculated buffer length";
308 0           return false;
309             }
310 24           dst->pos += 4;
311 24           accum |= cp;
312 24           writable[0] = base64_alphabet[0x3F & (accum >> 18)];
313 24           writable[1] = base64_alphabet[0x3F & (accum >> 12)];
314 24           writable[2] = base64_alphabet[0x3F & (accum >> 6)];
315 24           writable[3] = base64_alphabet[0x3F & accum];
316 24           accum= 0;
317 24           shift= 16;
318             }
319             else {
320 54           accum |= (cp << shift);
321 54           shift -= 8;
322             }
323             }
324 10 100         if (dst->pos + (shift < 16? 4 : 0) != dst->lim) {
    50          
325 0           dst->error= "miscalculated buffer length";
326 0           return false;
327             }
328             // write leftover accumulated bits
329 10 100         if (shift < 16) {
330 5           U8 *writable= (U8*) dst->pos;
331 5 50         if (dst->pos + 4 > dst->lim) {
332 0           dst->error= "miscalculated buffer length";
333 0           return false;
334             }
335 5           dst->pos += 4;
336 5           writable[0] = base64_alphabet[0x3F & (accum >> 18)];
337 5           writable[1] = base64_alphabet[0x3F & (accum >> 12)];
338 5 100         writable[2] = shift? '=' : base64_alphabet[0x3F & (accum >> 6)];
339 5           writable[3] = '=';
340             }
341             }
342             else {
343 526 100         while (src->pos < src->lim) {
344 444           int len, cp= sb_parse_next_codepoint(src);
345 444 50         if (cp < 0)
346 0           return false; // error is already set
347 444           len= sb_parse_encode_codepoint(dst, cp);
348 444 50         if (len < 0)
349 0           return false; // error is already set
350             }
351 82 50         if (dst->pos != dst->lim) {
352 0           dst->error= "miscalculated buffer length";
353 0           return false;
354             }
355             }
356 109           return true;
357             }
358              
359             bool
360 101           secret_buffer_copy_to(secret_buffer_parse *src, SV *dst_sv, int encoding, bool append) {
361             dTHX;
362             secret_buffer_parse_rw dst;
363 101           secret_buffer *dst_sbuf= NULL;
364             SSize_t need_bytes;
365 101           bool dst_wide= false;
366              
367 101           Zero(&dst, 1, secret_buffer_parse_rw);
368             // Encoding may be -1 to indicate the user didn't specify, in which case we use the
369             // same encoding as the source, unless the destination is a perl scalar (handled below)
370 101 100         dst.encoding= encoding >= 0? encoding : src->encoding;
371 101 100         if (sv_isobject(dst_sv)) {
372             // if object, must be a SecretBuffer
373 27           dst_sbuf= secret_buffer_from_magic(dst_sv, SECRET_BUFFER_MAGIC_OR_DIE);
374             }
375             else {
376             // Going to overwrite the scalar, or if its a scalar-ref, overwrite that.
377 74 50         if (SvROK(dst_sv) && !sv_isobject(dst_sv) && SvTYPE(SvRV(dst_sv)) <= SVt_PVMG)
    0          
    0          
378 0           dst_sv= SvRV(dst_sv);
379             // Refuse to overwrite any other kind of ref
380 74 50         if (SvTYPE(dst_sv) > SVt_PVMG || SvROK(dst_sv)) {
    50          
381 0           src->error= "Can only copy_to scalars or scalar-refs";
382 0           return false;
383             }
384             // If the source encoding is a type of unicode, and the destination encoding is not
385             // specified, then write wide characters (utf-8) to the perl scalar and flag it as utf8
386 74 100         if (encoding < 0 && SECRET_BUFFER_ENCODING_IS_UNICODE(src->encoding)) {
    100          
    100          
    100          
    50          
387 66           dst.encoding= SECRET_BUFFER_ENCODING_UTF8;
388 66           dst_wide= true;
389             }
390             }
391             // Determine how many bytes we need
392 101           need_bytes= secret_buffer_sizeof_transcode(src, dst.encoding);
393 101 50         if (need_bytes < 0)
394 0           return false;
395             // Prepare the buffers for that many bytes
396 101 100         if (dst_sbuf) {
397             // For destination SecretBuffer, set length to 0 unless appending, then
398             // ensure enough allocated space for need_bytes, then transcode and update
399             // the length in the block below.
400 27 100         if (!append)
401 20           secret_buffer_set_len(dst_sbuf, 0); /* clears secrets */
402 27           secret_buffer_alloc_at_least(dst_sbuf, dst_sbuf->len + need_bytes);
403 27           dst.pos= (U8*) dst_sbuf->data + dst_sbuf->len;
404 27           dst.lim= dst.pos + need_bytes;
405             }
406             else {
407             // For destination SV, set length to 0 unless appending, then force it to
408             // be bytes or utf-8, then grow it to ensure room for additional `need_bytes`.
409             U8* ptr;
410             STRLEN len;
411             // If overwriting, set the length to 0 before forcing to bytes or utf8
412 74 100         if (!append)
413 72           sv_setpvn(dst_sv, "", 0);
414             // force it to the type required
415 74 100         if (dst_wide) SvPVutf8(dst_sv, len);
416 8           else SvPVbyte(dst_sv, len);
417             // grow it to the required length, for writing
418 74 100         sv_grow(dst_sv, (append? len : 0) + need_bytes + 1);
419 74           ptr= (U8*) SvPVX_mutable(dst_sv) + len;
420             // don't forget the NUL terminator
421 74           ptr[need_bytes]= '\0';
422 74           dst.pos= ptr;
423 74           dst.lim= dst.pos + need_bytes;
424             }
425 101 50         if (!secret_buffer_transcode(src, &dst)) {
426 0 0         if (!src->error) src->error= dst.error;
427 0           return false;
428             }
429             /* update the lengths */
430 101 100         if (dst_sbuf) {
431 27           dst_sbuf->len += need_bytes;
432             }
433             else {
434 74           SvCUR_set(dst_sv, SvCUR(dst_sv) + need_bytes);
435 74 50         SvSETMAGIC(dst_sv);
436             }
437 101           return true;
438             }
439              
440             /* Append DER length octets (ASN.1 Length field, definite form only).
441             *
442             * DER rules:
443             * - If len <= 127: single byte [0x00..0x7F]
444             * - Else: first byte is 0x80 | N, where N is # of following length bytes (big-endian),
445             * and the length must be encoded in the minimal number of bytes (no leading 0x00).
446             *
447             * This function encodes ONLY the length field (not tag/value).
448             */
449             void
450 384           secret_buffer_append_uv_asn1_der_length(secret_buffer *buf, UV val) {
451             dTHX;
452 384           int enc_len = 1;
453             U8 *pos;
454 384 100         if (val > 127) {
455             /* Determine minimal number of bytes needed to represent len in base-256. */
456 339           UV tmp = val;
457 2001 100         while (tmp) {
458 1662           enc_len++;
459 1662           tmp >>= 8;
460             }
461             }
462             /* In BER/DER, the long-form initial octet has 7 bits of length-of-length.
463             * 0x80 is indefinite length (forbidden in DER), 0xFF would mean 127 length bytes.
464             * With 64-bit UV enc_len will never exceed 9.
465             */
466 384 50         ASSUME(enc_len < 127);
467 384           secret_buffer_set_len(buf, buf->len + enc_len);
468 384           pos= (U8*) buf->data + buf->len - 1;
469 384 100         if (val <= 127) {
470 45           *pos = (U8) val;
471             } else {
472 339           UV tmp = val;
473             /* Write the length big-endian into enc[1..n]. */
474 2001 100         while (tmp) {
475 1662           *pos-- = (U8)(tmp & 0xFF);
476 1662           tmp >>= 8;
477             }
478 339           *pos= (U8) (0x80 | (U8)(enc_len-1));
479             }
480 384           }
481              
482             /* Parse ASN.1 DER Length (definite form only) */
483             bool
484 384           secret_buffer_parse_uv_asn1_der_length(secret_buffer_parse *parse, UV *out) {
485             /* Work on a local cursor so we can roll back on failure */
486 384           const U8 *pos = parse->pos;
487 384           const U8 *lim = parse->lim;
488             UV result;
489              
490 384 50         if (pos >= lim) {
491 0           parse->error = "unexpected end of buffer";
492 0           return false;
493             }
494              
495 384           result = *pos++;
496              
497             /* If 0..127, the byte is the length value itself, otherwise it is the number of octets
498             * to read following that byte. */
499 384 100         if ((result & 0x80)) {
500 339           int n = result & 0x7F;
501             /* 0x80 means indefinite length (BER/CER), forbidden in DER */
502 339 50         if (n == 0) {
503 0           parse->error = "ASN.1 DER indefinite length not allowed";
504 0           return false;
505             }
506             /* Number of octets should be smallest possible encoding, so if it is larger than size_t
507             * don't even bother trying to decode it.
508             */
509 339 50         if (n > sizeof(UV)) {
510 0           parse->error = "ASN.1 DER length too large for perl UV";
511 0           return false;
512             }
513             /* ensure we have that many bytes */
514 339 50         if ((size_t)(lim - pos) < (size_t)n) {
515 0           parse->error = "unexpected end of buffer";
516 0           return false;
517             }
518             /* DER minimal encoding rules:
519             * - no leading 0x00 in the length octets
520             * - long form must not be used for lengths <= 127
521             */
522 339           lim= pos + n;
523 339           result= *pos++;
524 339 50         if (!result) {
525 0           parse->error = "ASN.1 DER length has leading zero (non-minimal)";
526 0           return false;
527             }
528             /* Parse remaining bytes of big-endian unsigned integer */
529 1662 100         while (pos < lim)
530 1323           result= (result << 8) | *pos++;
531             /* DER should not use 1-byte encoding if it would have fit in the initial byte */
532 339 50         if (result < 0x80) {
533 0           parse->error = "ASN.1 DER length should use short form (non-minimal)";
534 0           return false;
535             }
536             }
537 384 50         if (out) *out = result;
538 384           parse->pos = pos;
539 384           parse->error = NULL;
540 384           return true;
541             }
542              
543             /* Append canonical unsigned Base128, Little-Endian
544             *
545             * Rules:
546             * - 7 data bits per byte, little-endian (least significant group first)
547             * - High bit 0x80 set on all bytes except the final byte
548             * - Canonical/minimal: stop as soon as remaining value is 0
549             */
550             void
551 384           secret_buffer_append_uv_base128le(secret_buffer *buf, UV val) {
552             dTHX;
553             U8 *pos;
554 384           int enc_len= 1;
555 384           UV tmp= val >> 7;
556 1923 100         while (tmp) {
557 1539           enc_len++;
558 1539           tmp >>= 7;
559             }
560 384           secret_buffer_set_len(buf, buf->len + enc_len);
561 384           pos= (U8*) buf->data + buf->len - enc_len;
562             /* Encode */
563 384           tmp= val;
564             do {
565 1923           U8 byte = (U8)(tmp & 0x7F);
566 1923           tmp >>= 7;
567 1923 100         if (tmp)
568 1539           byte |= 0x80;
569 1923           *pos++ = byte;
570 1923 100         } while (tmp);
571 384 50         ASSUME(pos == (U8*)(buf->data + buf->len));
572 384           }
573              
574             /* Parse Unsigned LittleEndian Base128 (also requiring canonical / minimal encoding) */
575             bool
576 384           secret_buffer_parse_uv_base128le(secret_buffer_parse *parse, UV *out) {
577 384           const U8 *pos = parse->pos;
578 384           const U8 *lim = parse->lim;
579 384           UV result= 0, payload;
580 384           int shift= 7;
581              
582 384 50         if (pos >= lim) {
583 0           parse->error = "unexpected end of buffer";
584 0           return false;
585             }
586 384           result= payload= *pos & 0x7F;
587             /* Scan forward looking for the first byte without the continuation flag */
588 1923 100         while (*pos++ & 0x80) {
589 1539 50         if (pos >= lim) {
590 0           parse->error = "unexpected end of buffer";
591 0           return false;
592             }
593 1539           payload= *pos & 0x7F;
594 1539 100         if (shift > sizeof(UV)*8 - 7) {
595             /* Do any of the bits overflow? Is the continuation flag set? */
596 3 50         if (shift >= sizeof(UV)*8 || (payload >> (sizeof(UV)*8 - shift))) {
    50          
597 0           parse->error = "Base128-LE value overflows perl UV";
598 0           return false;
599             }
600             }
601 1539           result |= payload << shift;
602 1539           shift += 7;
603             }
604             /* check if the high bits were all zero, meaning an unnecessary byte was encoded */
605 384 100         if (!payload && result != 0) {
    50          
606 0           parse->error = "Over-long encoding of Base128-LE";
607 0           return false;
608             }
609 384 50         if (out) *out = result;
610 384           parse->pos = pos;
611 384           parse->error = NULL;
612 384           return true;
613             }
614              
615             /* Append canonical unsigned Base128, Big-Endian
616             *
617             * Rules:
618             * - 7 data bits per byte, big-endian (most significant group first)
619             * - High bit 0x80 set on all bytes except the final byte
620             * - Canonical/minimal: stop as soon as remaining value is 0
621             */
622             void
623 387           secret_buffer_append_uv_base128be(secret_buffer *buf, UV val) {
624             dTHX;
625             U8 *pos;
626 387           int enc_len= 1, shift;
627 387           UV tmp= val >> 7;
628 1926 100         while (tmp) {
629 1539           enc_len++;
630 1539           tmp >>= 7;
631             }
632 387           secret_buffer_set_len(buf, buf->len + enc_len);
633 387           pos= (U8*) buf->data + buf->len - enc_len;
634             /* Encode */
635 2313 100         for (shift= (enc_len-1) * 7; shift >= 0; shift -= 7) {
636 1926           U8 byte = (U8)((val >> shift) & 0x7F);
637 1926 100         if (shift)
638 1539           byte |= 0x80;
639 1926           *pos++ = byte;
640             }
641 387 50         ASSUME(pos == (U8*)(buf->data + buf->len));
642 387           }
643              
644             /* Parse Unsigned BigEndian Base128 (also requiring canonical / minimal encoding) */
645             bool
646 395           secret_buffer_parse_uv_base128be(secret_buffer_parse *parse, UV *out) {
647 395           const U8 *pos = parse->pos;
648 395           const U8 *lim = parse->lim;
649 395           UV result= 0;
650              
651 395 50         if (pos >= lim) {
652 0           parse->error = "unexpected end of buffer";
653 0           return false;
654             }
655             /* high-bit payload == 0 with continue bit set is an error. */
656 395 50         if (*pos == 0x80) {
657 0           parse->error = "Over-long encoding of Base128-BE";
658 0           return false;
659             }
660 395           result= *pos & 0x7F;
661 1934 100         while (*pos++ & 0x80) {
662             /* Will existing bits overflow UV when shifted? */
663 1539 50         if (result >> (sizeof(UV)*8 - 7)) {
664 0           parse->error = "Base128-BE value overflows perl UV";
665 0           return false;
666             }
667 1539 50         if (pos >= lim) {
668 0           parse->error = "unexpected end of buffer";
669 0           return false;
670             }
671 1539           result= (result << 7) | (*pos & 0x7F);
672             }
673 395 50         if (out) *out = result;
674 395           parse->pos = pos;
675 395           parse->error = NULL;
676 395           return true;
677             }
678              
679             /* Private API -------------------------------------------------------------*/
680              
681             /* Scan raw bytes using only the bitmap */
682 358           static bool sb_parse_match_charset_bytes(
683             secret_buffer_parse *parse,
684             const secret_buffer_charset *cset,
685             int flags
686             ) {
687 358           bool negate= 0 != (flags & SECRET_BUFFER_MATCH_NEGATE);
688 358           bool reverse= 0 != (flags & SECRET_BUFFER_MATCH_REVERSE);
689 358 100         bool multi= 0 != (flags & SECRET_BUFFER_MATCH_MULTI) || cset->match_multi;
    100          
690 358           bool anchored= 0 != (flags & SECRET_BUFFER_MATCH_ANCHORED);
691 358           bool consttime=0 != (flags & SECRET_BUFFER_MATCH_CONST_TIME);
692 358 100         int step= reverse? -1 : 1;
693 358 100         const U8 *pos= reverse? parse->lim-1 : parse->pos,
694 358 100         *lim= reverse? parse->pos-1 : parse->lim,
695 358           *span_start= NULL;
696             //warn("scan_charset_bytes pos=%p lim=%p len=%d", parse->pos, parse->lim, (int)(parse->lim - parse->pos));
697              
698 1224 100         while (pos != lim) {
699 1166 100         if (sbc_bitmap_test(cset->bitmap, *pos) != negate) {
700             // Found. Now are we looking for a span?
701 250 100         if (span_start)
702 105           break;
703 145           span_start= pos;
704 145 100         if (!multi) {
705 39           pos += step;
706 39           break;
707             }
708 106           negate= !negate;
709 916 100         } else if (anchored && !span_start)
    100          
710 156           break;
711 866           pos += step;
712             }
713             /* If constant time operation is requested, we need to perform one sbc_bitmap_test
714             * for every character in the span, and make sure the compiler doesn't eliminate it.
715             */
716 358 50         if (consttime) {
717 0           volatile bool sink= false;
718 0 0         while (pos != lim) {
719 0           sink ^= sbc_bitmap_test(cset->bitmap, *pos);
720 0           pos += step;
721             }
722 0           (void) sink;
723             }
724             // reached end of defined range, and implicitly ends span
725 358 100         if (reverse) {
726 86           parse->pos= pos + 1;
727 86 100         parse->lim= span_start? span_start + 1 : parse->pos;
728             } else {
729 272           parse->lim= pos;
730 272 100         parse->pos= span_start? span_start : parse->lim;
731             }
732 358           return span_start != NULL;
733             }
734              
735 203           static bool sb_parse_match_charset_codepoints(
736             secret_buffer_parse *parse,
737             const secret_buffer_charset *cset,
738             int flags
739             ) {
740             dTHX;
741 203           bool negate= 0 != (flags & SECRET_BUFFER_MATCH_NEGATE);
742 203           bool reverse= 0 != (flags & SECRET_BUFFER_MATCH_REVERSE);
743 203 50         bool multi= 0 != (flags & SECRET_BUFFER_MATCH_MULTI) || cset->match_multi;
    100          
744 203           bool anchored= 0 != (flags & SECRET_BUFFER_MATCH_ANCHORED);
745 203           bool consttime=0 != (flags & SECRET_BUFFER_MATCH_CONST_TIME);
746 203           bool span_started= false;
747 203           bool encoding_error= false;
748 203 100         const U8 *span_mark= NULL, *prev_mark= reverse? parse->lim : parse->pos;
749              
750 231 50         while (parse->pos < parse->lim) {
751 19           int codepoint= reverse? sb_parse_prev_codepoint(parse)
752 231 100         : sb_parse_next_codepoint(parse);
753             // warn("parse.pos=%p parse.lim=%p parse.enc=%d cp=%d parse.err=%s", parse->pos, parse->lim, parse->encoding, codepoint, parse->error);
754 231 100         if (codepoint < 0) {// encoding error
755 7           encoding_error= true;
756 7           break;
757             }
758 224 100         if (sbc_test_codepoint(aTHX_ cset, codepoint) != negate) {
759             // Found. Mark boundaries of char.
760             // Now are we looking for a span?
761 198 100         if (span_started)
762 2           break;
763 196           span_started= true;
764 196           span_mark= prev_mark;
765 196           negate= !negate;
766 196 100         if (!multi) {
767 194 100         prev_mark= reverse? parse->lim : parse->pos;
768 194           break;
769             }
770 26 50         } else if (anchored && !span_started)
    0          
771 0           break;
772 28 100         prev_mark= reverse? parse->lim : parse->pos;
773             }
774             /* If constant time operation is requested, we need to perform one sbc_bitmap_test
775             * for every character in the span, and make sure the compiler doesn't eliminate it.
776             */
777 203 50         if (consttime) {
778 0           volatile bool sink= false;
779 0 0         while (parse->pos < parse->lim) {
780 0           int codepoint= reverse? sb_parse_prev_codepoint(parse)
781 0 0         : sb_parse_next_codepoint(parse);
782             // warn("parse.pos=%p parse.lim=%p parse.enc=%d cp=%d parse.err=%s", parse->pos, parse->lim, parse->encoding, codepoint, parse->error);
783 0 0         if (codepoint < 0) { // encoding error
784 0           encoding_error= true;
785 0           sink ^= sbc_test_codepoint(aTHX_ cset, 0);
786             }
787             else
788 0           sink ^= sbc_test_codepoint(aTHX_ cset, codepoint);
789             }
790 0           (void) sink;
791             }
792 203 100         if (encoding_error)
793 7           return false;
794             // reached end of defined range
795 196 50         if (span_started) { // and implicitly ends span
796 196 100         if (reverse) {
797 5           parse->pos= prev_mark;
798 5           parse->lim= span_mark;
799             }
800             else {
801 191           parse->pos= span_mark;
802 191           parse->lim= prev_mark;
803             }
804 196           return true;
805             }
806 0           return false;
807             }
808              
809 20           int sb_parse_codepointcmp(secret_buffer_parse *lhs, secret_buffer_parse *rhs) {
810             I32 lhs_cp, rhs_cp;
811 20           volatile int ret= 0;
812             /* constant-time iteration per the shorter of the two strings */
813 111 100         while (lhs->pos < lhs->lim && rhs->pos < rhs->lim) {
    50          
814 91           lhs_cp= sb_parse_next_codepoint(lhs);
815 91 50         if (lhs_cp < 0)
816 0           croak("Encoding error in left-hand buffer");
817 91           rhs_cp= sb_parse_next_codepoint(rhs);
818 91 50         if (rhs_cp < 0)
819 0           croak("Encoding error in right-hand buffer");
820 91 100         if (lhs_cp != rhs_cp && !ret)
    50          
821 2 50         ret= lhs_cp < rhs_cp? -1 : 1;
822             }
823 20           return ret? ret
824 38 100         : (lhs->pos < lhs->lim)? 1 /* right string shorter than left */
825 36 50         : (rhs->pos < rhs->lim)? -1 /* left string shorter than right */
826 18 50         : 0;
827             }
828              
829             /* UTF-8 decoding helper */
830 3607           static int sb_parse_next_codepoint(secret_buffer_parse *parse) {
831 3607           const U8 *pos= parse->pos, *lim= parse->lim;
832 3607           int cp, encoding= parse->encoding;
833             #define SB_RETURN_ERROR(msg) { parse->error= msg; return -1; }
834              
835 3607 100         if (encoding == SECRET_BUFFER_ENCODING_ASCII
836 3606 100         || encoding == SECRET_BUFFER_ENCODING_ISO8859_1
837 815 100         || encoding == SECRET_BUFFER_ENCODING_UTF8
838             ) {
839 3365 50         if (lim - pos < 1)
840 0           SB_RETURN_ERROR("end of span")
841 3365           cp= *pos++;
842 3365 100         if (cp >= 0x80 && encoding == SECRET_BUFFER_ENCODING_ASCII)
    100          
843 1           SB_RETURN_ERROR("not 7-bit ASCII")
844 3364 100         else if (cp >= 0x80 && encoding == SECRET_BUFFER_ENCODING_UTF8) {
    100          
845 73           int min_cp= 0;
846 73           switch ((cp >> 3) & 0xF) {
847 13           case 14: // 0b1[1110]yyy
848 13 50         { if (lim - pos < 3) goto incomplete;
849 13           min_cp= 0x10000;
850 13           cp &= 0x07;
851             }
852 13 50         if ((*pos & 0xC0) != 0x80) goto invalid;
853 13           cp= (cp << 6) | (*pos++ & 0x3F);
854             if (0)
855             case 12: case 13: // 0b1[110x]yyy
856 24 100         { if (lim - pos < 2) goto incomplete;
857 20           min_cp= 0x800;
858 20           cp &= 0x0F;
859             }
860 33 50         if ((*pos & 0xC0) != 0x80) goto invalid;
861 33           cp= (cp << 6) | (*pos++ & 0x3F);
862             if (0)
863             case 8: case 9: case 10: case 11: // 0b1[10xx]yyy
864 36 100         { if (lim - pos < 1) goto incomplete;
865 34           min_cp= 0x80;
866 34           cp &= 0x1F;
867             }
868 67 50         if ((*pos & 0xC0) != 0x80) goto invalid;
869 67           cp= (cp << 6) | (*pos++ & 0x3F);
870 67           break;
871             default:
872 0           invalid: SB_RETURN_ERROR("invalid UTF8 encoding")
873 6           incomplete: SB_RETURN_ERROR("incomplete UTF8 encoding")
874             }
875 67 50         if (cp < min_cp)
876 0           SB_RETURN_ERROR("overlong encoding of UTF8 character")
877 67 50         else if (cp > 0x10FFFF)
878 0           SB_RETURN_ERROR("UTF8 character exceeds max")
879             }
880             // else all ISO-8859-1 bytes are valid codepoints
881             }
882 242 100         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
883 221 100         || encoding == SECRET_BUFFER_ENCODING_UTF16BE
884 36           ) {
885 36           int low= encoding == SECRET_BUFFER_ENCODING_UTF16LE? 0 : 1;
886 36 50         if (lim - pos < 2)
887 0           SB_RETURN_ERROR("end of span")
888 36           cp= pos[low] | ((int)pos[low^1] << 8);
889 36           pos += 2;
890 36 100         if (cp >= 0xD800 && cp <= 0xDFFF) {
    50          
891 10 50         if (lim - pos < 2)
892 0           SB_RETURN_ERROR("incomplete UTF16 character")
893 10           int w2= pos[low] | ((int)pos[low^1] << 8);
894 10           pos += 2;
895 10 50         if (w2 < 0xDC00 || w2 > 0xDFFF)
    50          
896 0           SB_RETURN_ERROR("invalid UTF16 low surrogate")
897 10           cp = 0x10000 + (((cp & 0x3FF) << 10) | (w2 & 0x3FF));
898             }
899             }
900 206 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
901             // Skip over whitespace
902 38 50         while (pos < lim && isspace(*pos))
    50          
903 0           pos++;
904 38 50         if (lim - pos < 2)
905 0           SB_RETURN_ERROR("end of span")
906 38           int high= *pos++ - '0';
907 38           int low= *pos++ - '0';
908 38 50         if (low >= ('a'-'0')) low -= ('a'-'0'-10);
909 38 100         else if (low >= ('A'-'0')) low -= ('A'-'0'-10);
910 38 50         if (high >= ('a'-'0')) high -= ('a'-'0'-10);
911 38 100         else if (high >= ('A'-'0')) high -= ('A'-'0'-10);
912 38 50         if ((low >> 4) | (high >> 4))
913 0           SB_RETURN_ERROR("not a pair of hex digits")
914 38           cp= (high << 4) | low;
915             // skip over whitespace if it takes us to the end of buffer so that caller
916             // knows it's EOF before trying another decode.
917 38 100         while (pos < lim && isspace(*pos))
    50          
918 0           pos++;
919             }
920 168 50         else if (encoding == SECRET_BUFFER_ENCODING_BASE64) {
921             // Skip over whitespace and control chars
922 168 50         while (pos < lim && *pos <= ' ')
    50          
923 0           pos++;
924             // There need to be at least 2 base64 characters left
925 168 50         if (pos < lim) {
926 168 50         if (base64_decode_table[*pos] < 0)
927 0           SB_RETURN_ERROR("invalid base64 character");
928             // ->pos_bit > 0 means pointer is pointing at a sub-bit of the base64
929             // character at *pos (and possible values are 0, 2, or 4)
930 168           cp= (((int)base64_decode_table[*pos++]) << (2 + parse->pos_bit)) & 0xFF;
931 168 50         while (pos < lim && *pos <= ' ')
    50          
932 0           pos++;
933             }
934 168 50         if (pos >= lim) {
935 0           parse->pos_bit= 0;
936 0           SB_RETURN_ERROR("end of span")
937             }
938 168 50         if (base64_decode_table[*pos] < 0)
939 0           SB_RETURN_ERROR("invalid base64 character");
940 168           cp |= base64_decode_table[*pos] >> (4-parse->pos_bit);
941 168           parse->pos_bit += 2;
942             // If pos_bit == 6 we've completed a set of 4 b64 chars and fully consumed them.
943 168 100         if (parse->pos_bit >= 6) {
944 51           pos++;
945 51           parse->pos_bit= 0;
946             // consume trailing whitespace
947 55 100         while (pos < lim && *pos <= ' ')
    100          
948 4           pos++;
949             }
950             else {
951             // if next char is '=', terminate the decoding
952 117           const U8 *next= pos+1;
953 117 50         while (next < lim && *next <= ' ')
    50          
954 0           next++;
955 117 50         if (next < lim && *next == '=') {
    100          
956 13           pos= lim; // indicate parsing complete
957 13           parse->pos_bit= 0;
958             }
959             }
960             }
961 0 0         else if (encoding == SECRET_BUFFER_ENCODING_I32) {
962 0 0         if (lim - pos < 4)
963 0           SB_RETURN_ERROR("end of span");
964 0           cp= *(I32*)pos;
965 0           pos+= 4;
966             }
967 0           else SB_RETURN_ERROR("unsupported encoding")
968 3600           parse->pos= pos;
969 3600           return cp;
970             #undef SB_RETURN_ERROR
971             }
972              
973 850           static int sb_parse_prev_codepoint(secret_buffer_parse *parse) {
974 850           const U8 *pos= parse->pos, *lim= parse->lim;
975 850           int encoding= parse->encoding;
976             int cp;
977             #define SB_RETURN_ERROR(msg) { parse->error= msg; return -1; }
978              
979 850 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII
980 850 100         || encoding == SECRET_BUFFER_ENCODING_ISO8859_1
981 25 100         || encoding == SECRET_BUFFER_ENCODING_UTF8
982             ) {
983 842 50         if (lim <= pos)
984 0           SB_RETURN_ERROR("end of span")
985 842           cp= *--lim;
986             // handle the simple case first
987 842 100         if (cp >= 0x80 && encoding != SECRET_BUFFER_ENCODING_ISO8859_1) {
    50          
988             // Strict ASCII can't encode above 0x7F
989 4 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII)
990 0           SB_RETURN_ERROR("not 7-bit ASCII")
991             // else need to backtrack and then call next_codepoint
992 4           const U8 *start= lim;
993 12 50         while (start >= pos && (*start & 0xC0) == 0x80)
    100          
994 8           --start;
995 4           parse->pos= start;
996 4           cp= sb_parse_next_codepoint(parse);
997 4 50         if (parse->pos != parse->lim) {// consumed all characters we gave it?
998 0           parse->pos= pos; // restore original pos
999 0 0         if (cp >= 0) // had a valid char, but extra 0x80 bytes
1000 0           parse->error= "invalid UTF8 character";
1001             // else use the error message from next_codepoint
1002 0           return -1;
1003             }
1004 4           parse->pos= pos; // restore original pos
1005 4           lim= start; // new lim is where we started the parse from
1006             }
1007             }
1008 8 50         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1009 8 100         || encoding == SECRET_BUFFER_ENCODING_UTF16BE
1010 1           ) {
1011 1 50         if (lim - pos < 2)
1012 0           SB_RETURN_ERROR("end of span");
1013             // handle the simple case first
1014 1           lim -= 2;
1015 1           int low= (encoding == SECRET_BUFFER_ENCODING_UTF16LE)? 0 : 1;
1016 1           cp= lim[low] | ((int)lim[low^1] << 8);
1017 1 50         if (cp >= 0xD800 && cp <= 0xDFFF) {
    50          
1018 1 50         if (lim - pos < 4)
1019 0           SB_RETURN_ERROR("end of span");
1020 1           lim -= 2;
1021 1           int w1= lim[low] | ((int)lim[low^1] << 8);
1022 1 50         if (w1 < 0xD800 || w1 > 0xDFFF || cp < 0xDC00)
    50          
    50          
1023 0           SB_RETURN_ERROR("invalid UTF16 surrogate");
1024 1           cp = 0x10000 + (((w1 & 0x3FF) << 10) | (cp & 0x3FF));
1025             }
1026             }
1027 7 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
1028             // Skip over whitespace
1029 1 50         while (pos < lim && isspace(lim[-1]))
    50          
1030 0           lim--;
1031 1 50         if (lim - pos < 2)
1032 0 0         SB_RETURN_ERROR((pos == lim? "end of span" : "incomplete hex pair at end of span"))
1033 1           int low= *--lim - '0';
1034 1           int high= *--lim - '0';
1035 1 50         if (low >= ('a'-'0')) low -= ('a'-'0'-10);
1036 0 0         else if (low >= ('A'-'0')) low -= ('A'-'0'-10);
1037 1 50         if (high >= ('a'-'0')) high -= ('a'-'0'-10);
1038 0 0         else if (high >= ('A'-'0')) high -= ('A'-'0'-10);
1039 1 50         if ((low >> 4) | (high >> 4))
1040 0           SB_RETURN_ERROR("not a pair of hex digits")
1041 1           cp= (high << 4) | low;
1042             }
1043 6 50         else if (encoding == SECRET_BUFFER_ENCODING_BASE64) {
1044             bool again;
1045             do {
1046 9           again= false;
1047             // Skip over non-base64 chars
1048 12 50         while (pos < lim && base64_decode_table[lim[-1]] < 0)
    100          
1049 3           lim--;
1050 9 50         if (pos < lim) {
1051             //warn("lim-pos=%d, lim[-1]=%c, lim_bit=%d", (int)(lim-pos), lim[-1], parse->lim_bit);
1052 9 50         if (base64_decode_table[lim[-1]] < 0)
1053 0           SB_RETURN_ERROR("invalid base64 character");
1054             // ->lim_bit > 0 means the character lim[-1] is partially consumed.
1055             // (sequence is 0, 2, 4, 0)
1056 9           cp= ((int)base64_decode_table[lim[-1]]) >> parse->lim_bit;
1057             // parsing an equal sign means 'cp' is bogus and need to go again
1058 9 100         if (lim[-1] == '=')
1059 3           again= true;
1060 9           --lim;
1061             // find next base64 char
1062 9 50         while (pos < lim && base64_decode_table[lim[-1]] < 0)
    50          
1063 0           lim--;
1064             }
1065 9 50         if (pos >= lim) {
1066 0           parse->lim_bit= 0;
1067 0           SB_RETURN_ERROR("end of span")
1068             }
1069 9 50         if (base64_decode_table[lim[-1]] < 0)
1070 0           SB_RETURN_ERROR("invalid base64 character");
1071             //warn(" lim-pos=%d, lim[-1]=%c, lim_bit=%d", (int)(lim-pos), lim[-1], parse->lim_bit);
1072 9           cp |= (((int)base64_decode_table[lim[-1]]) << (6 - parse->lim_bit)) & 0xFF;
1073 9           parse->lim_bit += 2;
1074 9 100         if (parse->lim_bit >= 6) {
1075 3           parse->lim_bit= 0;
1076             // If completed a set of 4 b64 chars, lim[-1] is consumed, and need to
1077             // walk backward to find next base64 char
1078 3           --lim;
1079 3 50         while (pos < lim && base64_decode_table[lim[-1]] < 0)
    0          
1080 0           lim--;
1081             }
1082             //warn(" cp=%d, lim-pos=%d, lim_bit=%d", cp, (int)(lim-pos), parse->lim_bit);
1083 9 100         } while (again);
1084             }
1085 0 0         else if (encoding == SECRET_BUFFER_ENCODING_I32) {
1086 0 0         if (lim - pos < 4)
1087 0           SB_RETURN_ERROR("end of span");
1088 0           lim -= 4;
1089 0           cp= *(I32*)lim;
1090             }
1091 0           else SB_RETURN_ERROR("unsupported encoding")
1092 850           parse->lim= lim;
1093 850           return cp;
1094             #undef SB_RETURN_ERROR
1095             }
1096              
1097 1202           static int sizeof_codepoint_encoding(int codepoint, int encoding) {
1098 1202 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII)
1099 0 0         return codepoint < 0x80? 1 : -1;
1100 1202 100         if (encoding == SECRET_BUFFER_ENCODING_ISO8859_1)
1101 110 50         return codepoint < 0x100? 1 : -1;
1102 1092 100         else if (encoding == SECRET_BUFFER_ENCODING_UTF8)
1103 736 100         return codepoint < 0x80? 1 : codepoint < 0x800? 2 : codepoint < 0x10000? 3 : 4;
    100          
    100          
1104 356 50         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1105 356 50         || encoding == SECRET_BUFFER_ENCODING_UTF16BE)
1106 0 0         return codepoint >= 0xD800 && codepoint < 0xE000? -1
1107 0 0         : codepoint < 0x10000? 2 : 4;
    0          
1108 356 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX)
1109 6 50         return codepoint < 0x100? 2 : -1;
1110             /* Base64 would need to track an accumulator, so just return 1 and fix it in the caller */
1111 350 100         else if (encoding == SECRET_BUFFER_ENCODING_BASE64)
1112 78 50         return codepoint < 0x100? 1 : -1;
1113 272 50         else if (encoding == SECRET_BUFFER_ENCODING_I32)
1114 272           return 4;
1115             else
1116 0           return -1;
1117             }
1118              
1119 444           static bool sb_parse_encode_codepoint(secret_buffer_parse_rw *dst, int codepoint) {
1120             #define SB_RETURN_ERROR(msg) { dst->error= msg; return false; }
1121 444           int encoding= dst->encoding, n;
1122 444           U8 *dst_pos= dst->pos;
1123             // codepoints above 0x10FFFF are illegal
1124 444 50         if (codepoint >= 0x110000)
1125 0           SB_RETURN_ERROR("invalid codepoint");
1126             // not quite as efficient as checking during the code below, but saves a bunch of redundancy
1127 444           n= sizeof_codepoint_encoding(codepoint, encoding);
1128 444 50         if (n < 0)
1129 0           SB_RETURN_ERROR("character too wide for encoding")
1130 444 50         if (dst->lim - dst_pos < n)
1131 0           SB_RETURN_ERROR("buffer too small")
1132 444           dst->pos += n;
1133              
1134 444 50         if (encoding == SECRET_BUFFER_ENCODING_ASCII
1135 444 100         || encoding == SECRET_BUFFER_ENCODING_ISO8859_1
1136 389 100         || encoding == SECRET_BUFFER_ENCODING_UTF8
1137             ) {
1138 423           switch ((n-1)&0x3) { // help the compiler understand there are only 4 possible values
1139 401           case 0: *dst_pos++ = (U8) codepoint;
1140 401           break;
1141 10           case 1: *dst_pos++ = (U8)(0xC0 | (codepoint >> 6));
1142 10           *dst_pos++ = (U8)(0x80 | (codepoint & 0x3F));
1143 10           break;
1144 4           case 2: *dst_pos++ = (U8)(0xE0 | (codepoint >> 12));
1145 4           *dst_pos++ = (U8)(0x80 | ((codepoint >> 6) & 0x3F));
1146 4           *dst_pos++ = (U8)(0x80 | (codepoint & 0x3F));
1147 4           break;
1148 8           case 3: *dst_pos++ = (U8)(0xF0 | (codepoint >> 18));
1149 8           *dst_pos++ = (U8)(0x80 | ((codepoint >> 12) & 0x3F));
1150 8           *dst_pos++ = (U8)(0x80 | ((codepoint >> 6) & 0x3F));
1151 8           *dst_pos++ = (U8)(0x80 | (codepoint & 0x3F));
1152 8           break;
1153             }
1154             }
1155 21 50         else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1156 21 50         || encoding == SECRET_BUFFER_ENCODING_UTF16BE
1157 0           ) {
1158 0           int low= (encoding == SECRET_BUFFER_ENCODING_UTF16LE)? 0 : 1;
1159 0 0         if (n == 2) {
1160 0           dst_pos[low] = (U8)(codepoint & 0xFF);
1161 0           dst_pos[low^1] = (U8)(codepoint >> 8);
1162             }
1163             else {
1164 0           int adjusted = codepoint - 0x10000;
1165 0           int w0 = 0xD800 | (adjusted >> 10);
1166 0           int w1 = 0xDC00 | (adjusted & 0x3FF);
1167 0           dst_pos[low] = (U8)(w0 & 0xFF);
1168 0           dst_pos[1^low] = (U8)(w0 >> 8);
1169 0           dst_pos[2^low] = (U8)(w1 & 0xFF);
1170 0           dst_pos[3^low] = (U8)(w1 >> 8);
1171             }
1172             }
1173 21 100         else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
1174 3           dst_pos[0] = "0123456789ABCDEF"[(codepoint >> 4) & 0xF];
1175 3           dst_pos[1] = "0123456789ABCDEF"[codepoint & 0xF];
1176             }
1177 18 50         else if (encoding == SECRET_BUFFER_ENCODING_I32) {
1178 18           *(I32*)dst_pos = codepoint;
1179             }
1180             /* BASE64 is not handled here because the '=' padding can only be generated in
1181             * a context that knows when we are ending on a non-multiple-of-4. */
1182 0           else SB_RETURN_ERROR("unsupported encoding");
1183 444           return true;
1184             #undef SB_RETURN_ERROR
1185             }
1186              
1187             #define SB_PARSE_MATCH_STR_FN sb_parse_match_str_U8
1188             #define SB_PATTERN_EL_TYPE const U8
1189             #include "secret_buffer_parse_match_str.c"
1190             #undef SB_PARSE_MATCH_STR_FN
1191             #undef SB_PATTERN_EL_TYPE
1192              
1193             #define SB_PARSE_MATCH_STR_FN sb_parse_match_str_I32
1194             #define SB_PATTERN_EL_TYPE const I32
1195             #include "secret_buffer_parse_match_str.c"
1196             #undef SB_PARSE_MATCH_STR_FN
1197             #undef SB_PATTERN_EL_TYPE