File Coverage

secret_buffer_parse.c

Criterion	Covered	Total	%
statement	546	700	78.0
branch	392	594	65.9
condition			n/a
subroutine			n/a
pod			n/a
total	938	1294	72.4

line	stmt	bran	code
1
2			/* These local parse functions are independent of the SecretBuffer instance,
3			* needing only the 'data' pointer to which the parse_state refers.
4			* The pos/lim of the parse state must already be checked against the length
5			* of the data before calling these.
6			*/
7			static int sizeof_codepoint_encoding(int codepoint, int encoding);
8			static int sb_parse_prev_codepoint(secret_buffer_parse *parse);
9			static int sb_parse_next_codepoint(secret_buffer_parse *parse);
10			static bool sb_parse_encode_codepoint(secret_buffer_parse *parse, int codepoint);
11			static bool sb_parse_match_charset_bytes(secret_buffer_parse *parse, const secret_buffer_charset *cset, int flags);
12			static bool sb_parse_match_charset_codepoints(secret_buffer_parse *parse, const secret_buffer_charset *cset, int flags);
13			static bool sb_parse_match_str_U8(secret_buffer_parse *parse, const U8 *pattern, size_t pattern_len, int flags);
14			static bool sb_parse_match_str_I32(secret_buffer_parse *parse, const I32 *pattern, size_t pattern_len, int flags);
15
16	66		static bool parse_encoding(pTHX_ SV *sv, int *out) {
17			int enc;
18	66	50	if (looks_like_number(sv)) {
19	0		IV i= SvIV(sv);
20	0	0	if (i < 0 \|\| i > SECRET_BUFFER_ENCODING_MAX)
		0
21	0		return false;
22	0		enc= (int) i;
23			} else {
24			STRLEN len;
25	66		const char *str= SvPV(sv, len);
26	66		switch (len) {
27	6	50	case 3: if (0 == strcmp(str, "HEX")) { enc= SECRET_BUFFER_ENCODING_HEX; break; }
28	1	50	case 4: if (0 == strcmp(str, "UTF8")) { enc= SECRET_BUFFER_ENCODING_UTF8; break; }
29	13	100	case 5: if (0 == strcmp(str, "ASCII")) { enc= SECRET_BUFFER_ENCODING_ASCII; break; }
30	12	50	if (0 == strcmp(str, "UTF-8")) { enc= SECRET_BUFFER_ENCODING_UTF8; break; }
31	26	50	case 6: if (0 == strcmp(str, "BASE64")) { enc= SECRET_BUFFER_ENCODING_BASE64; break; }
32	1	50	case 7: if (0 == strcmp(str, "UTF16LE")) { enc= SECRET_BUFFER_ENCODING_UTF16LE; break; }
33	0	0	if (0 == strcmp(str, "UTF16BE")) { enc= SECRET_BUFFER_ENCODING_UTF16BE; break; }
34	6	100	case 8: if (0 == strcmp(str, "UTF-16LE")) { enc= SECRET_BUFFER_ENCODING_UTF16LE; break; }
35	3	50	if (0 == strcmp(str, "UTF-16BE")) { enc= SECRET_BUFFER_ENCODING_UTF16BE; break; }
36	0	0	case 9: if (0 == strcmp(str, "ISO8859_1")) { enc= SECRET_BUFFER_ENCODING_ISO8859_1; break; }
37	13	50	case 10: if (0 == strcmp(str, "ISO-8859-1")) { enc= SECRET_BUFFER_ENCODING_ISO8859_1; break; }
38			default:
39	0		return false;
40			}
41			}
42	66	50	if (out) *out= enc;
43	66		return true;
44			}
45
46			/* Public API --------------------------------------------------------------*/
47
48			/* initialize a parse struct, but only if it is a valid span of the buffer */
49	2001		bool secret_buffer_parse_init(secret_buffer_parse *parse,
50			secret_buffer *buf, size_t pos, size_t lim, int encoding
51			) {
52	2001		Zero(parse, 1, secret_buffer_parse);
53			// Sanity check this parse state vs. the buffer
54	2001	100	if (lim > buf->len \|\| pos > lim) {
		50
55	1	50	parse->error= pos > lim? "span starts beyond buffer" : "span ends beyond buffer";
56	1		return false;
57			}
58	2000		parse->pos= ((U8*) buf->data) + pos;
59	2000		parse->lim= ((U8*) buf->data) + lim;
60	2000		parse->encoding= encoding;
61	2000		parse->sbuf= buf;
62	2000		return true;
63			}
64
65			/* Initialize a parse struct, either from a Span, or a SecretBuffer, or a plain Scalar.
66			*/
67	1737		bool secret_buffer_parse_init_from_sv(secret_buffer_parse *parse, SV *sv) {
68			dTHX;
69			secret_buffer *sb;
70			secret_buffer_span *span;
71			/* Is the sv a Span object? */
72	1737	100	if ((span= secret_buffer_span_from_magic(sv, 0)) && SvTYPE(SvRV(sv)) == SVt_PVHV) {
		50
73	1284		SV **sb_sv= hv_fetchs((HV*)SvRV(sv), "buf", 1);
74	1284		sb= secret_buffer_from_magic(*sb_sv, SECRET_BUFFER_MAGIC_OR_DIE);
75	1284		return secret_buffer_parse_init(parse, sb, span->pos, span->lim, span->encoding);
76			}
77			/* Is the sv a SecretBuffer? */
78	453	100	else if ((sb= secret_buffer_from_magic(sv, 0))) {
79	2		return secret_buffer_parse_init(parse, sb, 0, sb->len, SECRET_BUFFER_ENCODING_ISO8859_1);
80			}
81			/* It needs to at least be defined */
82	451	50	else if (SvOK(sv)) {
83			STRLEN len;
84	451		char *buf= SvPV(sv, len);
85	451		Zero(parse, 1, secret_buffer_parse);
86	451		parse->pos= (U8*) buf;
87	451		parse->lim= (U8*) buf + len;
88	451		parse->encoding= SvUTF8(sv)? SECRET_BUFFER_ENCODING_UTF8 : SECRET_BUFFER_ENCODING_ISO8859_1;
89	451		return true;
90			}
91			else {
92	0		Zero(parse, 1, secret_buffer_parse);
93	0		parse->error= "Not a Span, SecretBuffer, or defined scalar";
94	0		return false;
95			}
96			}
97
98			/* Scan for a pattern which may be a regex or literal string.
99			* Regexes are currently limited to a single charclass.
100			*/
101	785		bool secret_buffer_match(secret_buffer_parse *parse, SV *pattern, int flags) {
102			dTHX;
103	785		REGEXP *rx= (REGEXP*)SvRX(pattern);
104			secret_buffer_parse pat_parse;
105
106			/* Is the pattern a regexp-ref? */
107	785	100	if (rx) {
108	346		secret_buffer_charset *cset= secret_buffer_charset_from_regexpref(pattern);
109	346		return secret_buffer_match_charset(parse, cset, flags);
110			}
111
112			/* load up a parse struct with the pos, lim, and encoding */
113	439	50	if (!secret_buffer_parse_init_from_sv(&pat_parse, pattern))
114	0		croak("%s", pat_parse.error);
115
116			/* Remove edge case of zero-length pattern (always matches) */
117	439	100	if (pat_parse.pos >= pat_parse.lim) {
118	2	50	if ((flags & SECRET_BUFFER_MATCH_REVERSE))
119	0		parse->pos= parse->lim;
120			else
121	2		parse->lim= parse->pos;
122	2		return !(flags & SECRET_BUFFER_MATCH_NEGATE);
123			}
124			/* Remove edge case of zero-length subject (never matches) */
125	437	100	if (parse->pos >= parse->lim) {
126	4		return (flags & SECRET_BUFFER_MATCH_NEGATE);
127			}
128
129			/* Since unicode iteration of the pattern is a hassle and might happen lots of times,
130			* convert it to either plain bytes or array of U32 codepoints.
131			*/
132	433	100	if (pat_parse.encoding != SECRET_BUFFER_ENCODING_ISO8859_1) {
133	17		int dst_enc=
134			/* these can be transcoded to bytes */
135	17		(pat_parse.encoding == SECRET_BUFFER_ENCODING_ASCII
136	17	100	\|\| pat_parse.encoding == SECRET_BUFFER_ENCODING_HEX
137	16	50	\|\| pat_parse.encoding == SECRET_BUFFER_ENCODING_BASE64)
138			? SECRET_BUFFER_ENCODING_ISO8859_1
139	34	50	: SECRET_BUFFER_ENCODING_I32;
140	17		SSize_t dst_len= secret_buffer_sizeof_transcode(&pat_parse, dst_enc);
141	17	50	if (dst_len < 0)
142	0		croak("transcode of pattern failed: %s", pat_parse.error);
143			/* No need to transcode SECRET_BUFFER_ENCODING_ASCII, but the above size check
144			* verified it is clean 7-bit, which is the whole point of that encoding.
145			*/
146	17	50	if (pat_parse.encoding == SECRET_BUFFER_ENCODING_ASCII
147			/* Likewise, if SECRET_BUFFER_ENCODING_UTF8's I32 len is exactly 4x the number of
148			* original bytes, that means every byte became a character, which means every
149			* character could fit in a byte. */
150	17	100	\|\| (pat_parse.encoding == SECRET_BUFFER_ENCODING_UTF8
151	16	100	&& dst_len == (pat_parse.lim - pat_parse.pos) * 4)
152			) {
153	9		pat_parse.encoding= SECRET_BUFFER_ENCODING_ISO8859_1;
154			} else {
155			/* create a temporary secret buffer to hold the transcode */
156	8		secret_buffer *tmp= secret_buffer_new(0, NULL);
157	8		secret_buffer_parse pat_orig= pat_parse;
158	8		secret_buffer_set_len(tmp, dst_len);
159	8	50	if (!secret_buffer_parse_init(&pat_parse, tmp, 0, dst_len, dst_enc))
160	0		croak("transcode of pattern failed: %s", pat_parse.error);
161			/* Transcode the pattern */
162	8	50	if (!secret_buffer_transcode(&pat_orig, &pat_parse))
163	0	0	croak("transcode of pattern failed: %s", pat_orig.error? pat_orig.error : pat_parse.error);
164			}
165			}
166			/* In some cases it would also be nice to transcode the subject first, but the
167			* final state of the parse struct carries information back to the caller and
168			* needs to refer to original positions of characters. */
169
170			/* Now dispatch to sb_parse_match_str_X */
171	433	100	if (pat_parse.encoding == SECRET_BUFFER_ENCODING_ISO8859_1) {
172	426		size_t pat_len= pat_parse.lim - pat_parse.pos;
173	426		return sb_parse_match_str_U8(parse, pat_parse.pos, pat_len, flags);
174			} else { /* must be _I32 encoding, from above */
175	7		size_t pat_len= (pat_parse.lim - pat_parse.pos) >> 2;
176	7		return sb_parse_match_str_I32(parse, (I32*) pat_parse.pos, pat_len, flags);
177			}
178			}
179
180			/* Scan for a pattern which is a set of characters */
181	346		bool secret_buffer_match_charset(secret_buffer_parse *parse, secret_buffer_charset *cset, int flags) {
182	346	100	if (parse->pos >= parse->lim) // empty range
183	48		return false;
184
185			// byte matching gets to use a more efficient algorithm
186	298		return parse->encoding == SECRET_BUFFER_ENCODING_ISO8859_1
187	289		? sb_parse_match_charset_bytes(parse, cset, flags)
188	587	100	: sb_parse_match_charset_codepoints(parse, cset, flags);
189			}
190
191			/* Scan for a pattern which is a literal string of bytes.
192			*/
193	0		bool secret_buffer_match_bytestr(secret_buffer_parse *parse, char *data, size_t datalen, int flags) {
194	0		return sb_parse_match_str_U8(parse, (U8*) data, datalen, flags);
195			}
196
197			/* Count number of bytes required to transcode the source.
198			* If the source contains an invalid character for its encoding, or that codepoint
199			* can't be encoded as the dst_encoding, this returns -1 and sets src->error
200			* and also sets src->pos pointing at the character that could not be converted.
201			*/
202	118		SSize_t secret_buffer_sizeof_transcode(secret_buffer_parse *src, int dst_encoding) {
203			// If the source and destination encodings are both bytes, return the length
204	118	100	if (dst_encoding == src->encoding && src->encoding == 0)
		100
205	17		return src->lim - src->pos;
206			// Else need to iterate characters (to validate) and re-encode them
207			else {
208	101		size_t dst_size_needed= 0;
209			secret_buffer_parse tmp;
210	101		Zero(&tmp, 1, secret_buffer_parse);
211	101		tmp.pos= src->pos;
212	101		tmp.lim= src->lim;
213	101		tmp.encoding= src->encoding;
214	859	100	while (tmp.pos < tmp.lim) {
215	758		int cp= sb_parse_next_codepoint(&tmp);
216	758	50	if (cp < 0) return -1;
217	758		int ch_size= sizeof_codepoint_encoding(cp, dst_encoding);
218	758	50	if (ch_size < 0) return -1;
219	758		dst_size_needed += ch_size;
220			}
221			// If dest is base64, need special calculation
222	101	100	if (dst_encoding == SECRET_BUFFER_ENCODING_BASE64) {
223	10		dst_size_needed= ((dst_size_needed + 2) / 3) * 4;
224			}
225	101		return dst_size_needed;
226			}
227			}
228
229			static const char base64_alphabet[64]=
230			"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
231			"abcdefghijklmnopqrstuvwxyz"
232			"0123456789+/";
233
234			/*
235			perl -E 'my @tbl= (-1)x256;
236			$tbl[ord]= -ord(A)+ord for A..Z;
237			$tbl[ord]= 26-ord(a)+ord for a..z;
238			$tbl[ord]= 52-ord(0)+ord for 0..9;
239			$tbl[ord "+"]= 62;
240			$tbl[ord "/"]= 63;
241			$tbl[ord "="]= 64;
242			say join ",\n", map join(",", @tbl[$_*16 .. $_*16+15]), 0..0xF'
243			*/
244			static const int8_t base64_decode_table[256]= {
245			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
246			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
247			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,
248			52,53,54,55,56,57,58,59,60,61,-1,-1,-1,64,-1,-1,
249			-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
250			15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
251			-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
252			41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
253			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
254			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
255			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
256			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
257			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
258			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
259			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
260			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
261			};
262
263			/* Transcode characters from one parse state into another.
264			* This works sort of like
265			* $data= decode($src_enc, substr($src, $src_pos, $src_len));
266			* substr($dst, $dst_pos, $dst_lim, encode($dst_enc, $data));
267			* processing only a range of the source, and replacing only a range of the dest,
268			* adjusting the size of dst as needed. Both src->pos and dst->pos
269			* are updated.
270			*/
271	109		bool secret_buffer_transcode(secret_buffer_parse *src, secret_buffer_parse *dst) {
272	109		src->error= NULL;
273	109		dst->error= NULL;
274			// If the source and destination encodings are both bytes, use memcpy
275	109	100	if (dst->encoding == src->encoding && src->encoding == 0) {
		100
276	17		size_t cnt= dst->lim - dst->pos;
277	17	50	if (src->lim - src->pos != cnt) {
278	0		dst->error= "miscalculated buffer length";
279	0		return false;
280			}
281	17		memcpy((U8*)dst->pos, src->pos, cnt);
282	17		dst->pos += cnt;
283	17		src->pos += cnt;
284			}
285			// Else need to iterate characters and re-encode them
286			// base64 encoding doesn't work with sb_parse_encode_codepoint, so it gets
287			// special treatment.
288	92	100	else if (dst->encoding == SECRET_BUFFER_ENCODING_BASE64) {
289			// Read 3, write 4
290	10		int accum= 0;
291	10		int shift= 16, cp;
292	88	100	while (src->pos < src->lim) {
293	78		cp= sb_parse_next_codepoint(src);
294	78	50	if (cp > 0xFF) {
295	0		dst->error= "byte out of range";
296	0		return false;
297			}
298	78	100	if (!shift) {
299	24		U8 *writable= (U8*) dst->pos;
300	24	50	if (dst->pos + 4 > dst->lim) {
301	0		dst->error= "miscalculated buffer length";
302	0		return false;
303			}
304	24		dst->pos += 4;
305	24		accum \|= cp;
306	24		writable[0] = base64_alphabet[0x3F & (accum >> 18)];
307	24		writable[1] = base64_alphabet[0x3F & (accum >> 12)];
308	24		writable[2] = base64_alphabet[0x3F & (accum >> 6)];
309	24		writable[3] = base64_alphabet[0x3F & accum];
310	24		accum= 0;
311	24		shift= 16;
312			}
313			else {
314	54		accum \|= (cp << shift);
315	54		shift -= 8;
316			}
317			}
318	10	100	if (dst->pos + (shift < 16? 4 : 0) != dst->lim) {
		50
319	0		dst->error= "miscalculated buffer length";
320	0		return false;
321			}
322			// write leftover accumulated bits
323	10	100	if (shift < 16) {
324	5		U8 *writable= (U8*) dst->pos;
325	5	50	if (dst->pos + 4 > dst->lim) {
326	0		dst->error= "miscalculated buffer length";
327	0		return false;
328			}
329	5		dst->pos += 4;
330	5		writable[0] = base64_alphabet[0x3F & (accum >> 18)];
331	5		writable[1] = base64_alphabet[0x3F & (accum >> 12)];
332	5	100	writable[2] = shift? '=' : base64_alphabet[0x3F & (accum >> 6)];
333	5		writable[3] = '=';
334			}
335			}
336			else {
337	526	100	while (src->pos < src->lim) {
338	444		int cp= sb_parse_next_codepoint(src);
339	444	50	if (cp < 0)
340	0		return false; // error is already set
341	444		int len= sb_parse_encode_codepoint(dst, cp);
342	444	50	if (len < 0)
343	0		return false; // error is already set
344			}
345	82	50	if (dst->pos != dst->lim) {
346	0		dst->error= "miscalculated buffer length";
347	0		return false;
348			}
349			}
350	109		return true;
351			}
352
353			bool
354	101		secret_buffer_copy_to(secret_buffer_parse *src, SV *dst_sv, int encoding, bool append) {
355			dTHX;
356			secret_buffer_parse dst;
357	101		secret_buffer *dst_sbuf= NULL;
358			SSize_t need_bytes;
359	101		bool dst_wide= false;
360
361	101		Zero(&dst, 1, secret_buffer_parse);
362			// Encoding may be -1 to indicate the user didn't specify, in which case we use the
363			// same encoding as the source, unless the destination is a perl scalar (handled below)
364	101	100	dst.encoding= encoding >= 0? encoding : src->encoding;
365	101	100	if (sv_isobject(dst_sv)) {
366			// if object, must be a SecretBuffer
367	27		dst_sbuf= secret_buffer_from_magic(dst_sv, SECRET_BUFFER_MAGIC_OR_DIE);
368			}
369			else {
370			// Going to overwrite the scalar, or if its a scalar-ref, overwrite that.
371	74	50	if (SvROK(dst_sv) && !sv_isobject(dst_sv) && SvTYPE(SvRV(dst_sv)) <= SVt_PVMG)
		0
		0
372	0		dst_sv= SvRV(dst_sv);
373			// Refuse to overwrite any other kind of ref
374	74	50	if (SvTYPE(dst_sv) > SVt_PVMG \|\| SvROK(dst_sv)) {
		50
375	0		src->error= "Can only copy_to scalars or scalar-refs";
376	0		return false;
377			}
378			// If the source encoding is a type of unicode, and the destination encoding is not
379			// specified, then write wide characters (utf-8) to the perl scalar and flag it as utf8
380	74	100	if (encoding < 0 && SECRET_BUFFER_ENCODING_IS_UNICODE(src->encoding)) {
		100
		100
		100
		50
381	66		dst.encoding= SECRET_BUFFER_ENCODING_UTF8;
382	66		dst_wide= true;
383			}
384			}
385			// Determine how many bytes we need
386	101		need_bytes= secret_buffer_sizeof_transcode(src, dst.encoding);
387	101	50	if (need_bytes < 0)
388	0		return false;
389			// Prepare the buffers for that many bytes
390	101	100	if (dst_sbuf) {
391			// For destination SecretBuffer, set length to 0 unless appending, then
392			// ensure enough allocated space for need_bytes, then transcode and update
393			// the length in the block below.
394	27	100	if (!append)
395	20		secret_buffer_set_len(dst_sbuf, 0); /* clears secrets */
396	27		secret_buffer_alloc_at_least(dst_sbuf, dst_sbuf->len + need_bytes);
397	27		dst.pos= (U8*) dst_sbuf->data + dst_sbuf->len;
398	27		dst.lim= dst.pos + need_bytes;
399			}
400			else {
401			// For destination SV, set length to 0 unless appending, then force it to
402			// be bytes or utf-8, then grow it to ensure room for additional `need_bytes`.
403			U8* ptr;
404			STRLEN len;
405			// If overwriting, set the length to 0 before forcing to bytes or utf8
406	74	100	if (!append)
407	72		sv_setpvn(dst_sv, "", 0);
408			// force it to the type required
409	74	100	if (dst_wide) SvPVutf8(dst_sv, len);
410	8		else SvPVbyte(dst_sv, len);
411			// grow it to the required length, for writing
412	74	100	sv_grow(dst_sv, (append? len : 0) + need_bytes + 1);
413	74		ptr= (U8*) SvPVX_mutable(dst_sv) + len;
414			// don't forget the NUL terminator
415	74		ptr[need_bytes]= '\0';
416	74		dst.pos= ptr;
417	74		dst.lim= dst.pos + need_bytes;
418			}
419	101	50	if (!secret_buffer_transcode(src, &dst)) {
420	0	0	if (!src->error) src->error= dst.error;
421	0		return false;
422			}
423			/* update the lengths */
424	101	100	if (dst_sbuf) {
425	27		dst_sbuf->len += need_bytes;
426			}
427			else {
428	74		SvCUR_set(dst_sv, SvCUR(dst_sv) + need_bytes);
429	74	50	SvSETMAGIC(dst_sv);
430			}
431	101		return true;
432			}
433
434			/* Append DER length octets (ASN.1 Length field, definite form only).
435			*
436			* DER rules:
437			* - If len <= 127: single byte [0x00..0x7F]
438			* - Else: first byte is 0x80 \| N, where N is # of following length bytes (big-endian),
439			* and the length must be encoded in the minimal number of bytes (no leading 0x00).
440			*
441			* This function encodes ONLY the length field (not tag/value).
442			*/
443			void
444	384		secret_buffer_append_uv_asn1_der_length(secret_buffer *buf, UV val) {
445			dTHX;
446	384		int enc_len = 1;
447			U8 *pos;
448	384	100	if (val > 127) {
449			/* Determine minimal number of bytes needed to represent len in base-256. */
450	339		UV tmp = val;
451	2001	100	while (tmp) {
452	1662		enc_len++;
453	1662		tmp >>= 8;
454			}
455			}
456			/* In BER/DER, the long-form initial octet has 7 bits of length-of-length.
457			* 0x80 is indefinite length (forbidden in DER), 0xFF would mean 127 length bytes.
458			* With 64-bit UV enc_len will never exceed 9.
459			*/
460	384	50	ASSUME(enc_len < 127);
461	384		secret_buffer_set_len(buf, buf->len + enc_len);
462	384		pos= (U8*) buf->data + buf->len - 1;
463	384	100	if (val <= 127) {
464	45		*pos = (U8) val;
465			} else {
466	339		UV tmp = val;
467			/* Write the length big-endian into enc[1..n]. */
468	2001	100	while (tmp) {
469	1662		*pos-- = (U8)(tmp & 0xFF);
470	1662		tmp >>= 8;
471			}
472	339		*pos= (U8) (0x80 \| (U8)(enc_len-1));
473			}
474	384		}
475
476			/* Parse ASN.1 DER Length (definite form only) */
477			bool
478	384		secret_buffer_parse_uv_asn1_der_length(secret_buffer_parse *parse, UV *out) {
479			/* Work on a local cursor so we can roll back on failure */
480	384		const U8 *pos = parse->pos;
481	384		const U8 *lim = parse->lim;
482			UV result;
483
484	384	50	if (pos >= lim) {
485	0		parse->error = "unexpected end of buffer";
486	0		return false;
487			}
488
489	384		result = *pos++;
490
491			/* If 0..127, the byte is the length value itself, otherwise it is the number of octets
492			* to read following that byte. */
493	384	100	if ((result & 0x80)) {
494	339		int n = result & 0x7F;
495			/* 0x80 means indefinite length (BER/CER), forbidden in DER */
496	339	50	if (n == 0) {
497	0		parse->error = "ASN.1 DER indefinite length not allowed";
498	0		return false;
499			}
500			/* Number of octets should be smallest possible encoding, so if it is larger than a perl UV
501			* don't even bother trying to decode it.
502			*/
503	339	50	if (n > sizeof(UV)) {
504	0		parse->error = "ASN.1 DER length too large for perl UV";
505	0		return false;
506			}
507			/* ensure we have that many bytes */
508	339	50	if ((size_t)(lim - pos) < (size_t)n) {
509	0		parse->error = "unexpected end of buffer";
510	0		return false;
511			}
512			/* DER minimal encoding rules:
513			* - no leading 0x00 in the length octets
514			* - long form must not be used for lengths <= 127
515			*/
516	339		lim= pos + n;
517	339		result= *pos++;
518	339	50	if (!result) {
519	0		parse->error = "ASN.1 DER length has leading zero (non-minimal)";
520	0		return false;
521			}
522			/* Parse remaining bytes of big-endian unsigned integer */
523	1662	100	while (pos < lim)
524	1323		result= (result << 8) \| *pos++;
525			/* DER should not use 1-byte encoding if it would have fit in the initial byte */
526	339	50	if (result < 0x80) {
527	0		parse->error = "ASN.1 DER length should use short form (non-minimal)";
528	0		return false;
529			}
530			}
531	384	50	if (out) *out = result;
532	384		parse->pos = pos;
533	384		parse->error = NULL;
534	384		return true;
535			}
536
537			/* Append canonical unsigned Base128, Little-Endian
538			*
539			* Rules:
540			* - 7 data bits per byte, little-endian (least significant group first)
541			* - High bit 0x80 set on all bytes except the final byte
542			* - Canonical/minimal: stop as soon as remaining value is 0
543			*/
544			void
545	384		secret_buffer_append_uv_base128le(secret_buffer *buf, UV val) {
546			dTHX;
547			U8 *pos;
548	384		int enc_len= 1;
549	384		UV tmp= val >> 7;
550	1923	100	while (tmp) {
551	1539		enc_len++;
552	1539		tmp >>= 7;
553			}
554	384		secret_buffer_set_len(buf, buf->len + enc_len);
555	384		pos= (U8*) buf->data + buf->len - enc_len;
556			/* Encode */
557	384		tmp= val;
558			do {
559	1923		U8 byte = (U8)(tmp & 0x7F);
560	1923		tmp >>= 7;
561	1923	100	if (tmp)
562	1539		byte \|= 0x80;
563	1923		*pos++ = byte;
564	1923	100	} while (tmp);
565	384	50	ASSUME(pos == (U8*)(buf->data + buf->len));
566	384		}
567
568			/* Parse Unsigned LittleEndian Base128 (also requiring canonical / minimal encoding) */
569			bool
570	384		secret_buffer_parse_uv_base128le(secret_buffer_parse *parse, UV *out) {
571	384		const U8 *pos = parse->pos;
572	384		const U8 *lim = parse->lim;
573	384		UV result= 0, payload;
574	384		int shift= 7;
575
576	384	50	if (pos >= lim) {
577	0		parse->error = "unexpected end of buffer";
578	0		return false;
579			}
580	384		result= payload= *pos & 0x7F;
581			/* Scan forward looking for the first byte without the continuation flag */
582	1923	100	while (*pos++ & 0x80) {
583	1539	50	if (pos >= lim) {
584	0		parse->error = "unexpected end of buffer";
585	0		return false;
586			}
587	1539		payload= *pos & 0x7F;
588	1539	100	if (shift > sizeof(UV)*8 - 7) {
589			/* Do any of the bits overflow? Is the continuation flag set? */
590	3	50	if (shift >= sizeof(UV)*8 \|\| (payload >> (sizeof(UV)*8 - shift))) {
		50
591	0		parse->error = "Base128-LE value overflows perl UV";
592	0		return false;
593			}
594			}
595	1539		result \|= payload << shift;
596	1539		shift += 7;
597			}
598			/* check if the high bits were all zero, meaning an unnecessary byte was encoded */
599	384	100	if (!payload && result != 0) {
		50
600	0		parse->error = "Over-long encoding of Base128-LE";
601	0		return false;
602			}
603	384	50	if (out) *out = result;
604	384		parse->pos = pos;
605	384		parse->error = NULL;
606	384		return true;
607			}
608
609			/* Append canonical unsigned Base128, Big-Endian
610			*
611			* Rules:
612			* - 7 data bits per byte, big-endian (most significant group first)
613			* - High bit 0x80 set on all bytes except the final byte
614			* - Canonical/minimal: stop as soon as remaining value is 0
615			*/
616			void
617	387		secret_buffer_append_uv_base128be(secret_buffer *buf, UV val) {
618			dTHX;
619			U8 *pos;
620	387		int enc_len= 1, shift;
621	387		UV tmp= val >> 7;
622	1926	100	while (tmp) {
623	1539		enc_len++;
624	1539		tmp >>= 7;
625			}
626	387		secret_buffer_set_len(buf, buf->len + enc_len);
627	387		pos= (U8*) buf->data + buf->len - enc_len;
628			/* Encode */
629	2313	100	for (shift= (enc_len-1) * 7; shift >= 0; shift -= 7) {
630	1926		U8 byte = (U8)((val >> shift) & 0x7F);
631	1926	100	if (shift)
632	1539		byte \|= 0x80;
633	1926		*pos++ = byte;
634			}
635	387	50	ASSUME(pos == (U8*)(buf->data + buf->len));
636	387		}
637
638			/* Parse Unsigned BigEndian Base128 (also requiring canonical / minimal encoding) */
639			bool
640	395		secret_buffer_parse_uv_base128be(secret_buffer_parse *parse, UV *out) {
641	395		const U8 *pos = parse->pos;
642	395		const U8 *lim = parse->lim;
643	395		UV result= 0;
644
645	395	50	if (pos >= lim) {
646	0		parse->error = "unexpected end of buffer";
647	0		return false;
648			}
649			/* high-bit payload == 0 with continue bit set is an error. */
650	395	50	if (*pos == 0x80) {
651	0		parse->error = "Over-long encoding of Base128-BE";
652	0		return false;
653			}
654	395		result= *pos & 0x7F;
655	1934	100	while (*pos++ & 0x80) {
656			/* Will existing bits overflow UV when shifted? */
657	1539	50	if (result >> (sizeof(UV)*8 - 7)) {
658	0		parse->error = "Base128-BE value overflows perl UV";
659	0		return false;
660			}
661	1539	50	if (pos >= lim) {
662	0		parse->error = "unexpected end of buffer";
663	0		return false;
664			}
665	1539		result= (result << 7) \| (*pos & 0x7F);
666			}
667	395	50	if (out) *out = result;
668	395		parse->pos = pos;
669	395		parse->error = NULL;
670	395		return true;
671			}
672
673			/* Private API -------------------------------------------------------------*/
674
675			/* Scan raw bytes using only the bitmap */
676	289		static bool sb_parse_match_charset_bytes(
677			secret_buffer_parse *parse,
678			const secret_buffer_charset *cset,
679			int flags
680			) {
681	289		bool negate= 0 != (flags & SECRET_BUFFER_MATCH_NEGATE);
682	289		bool reverse= 0 != (flags & SECRET_BUFFER_MATCH_REVERSE);
683	289	100	bool multi= 0 != (flags & SECRET_BUFFER_MATCH_MULTI) \|\| cset->match_multi;
		100
684	289		bool anchored= 0 != (flags & SECRET_BUFFER_MATCH_ANCHORED);
685	289		bool consttime=0 != (flags & SECRET_BUFFER_MATCH_CONST_TIME);
686	289	100	int step= reverse? -1 : 1;
687	289	100	const U8 *pos= reverse? parse->lim-1 : parse->pos,
688	289	100	*lim= reverse? parse->pos-1 : parse->lim,
689	289		*span_start= NULL;
690			//warn("scan_charset_bytes pos=%p lim=%p len=%d", parse->pos, parse->lim, (int)(parse->lim - parse->pos));
691
692	1102	100	while (pos != lim) {
693	1097	100	if (sbc_bitmap_test(cset->bitmap, *pos) != negate) {
694			// Found. Now are we looking for a span?
695	238	100	if (span_start)
696	105		break;
697	133		span_start= pos;
698	133	100	if (!multi) {
699	27		pos += step;
700	27		break;
701			}
702	106		negate= !negate;
703	859	100	} else if (anchored && !span_start)
		100
704	152		break;
705	813		pos += step;
706			}
707			/* If constant time operation is requested, we need to perform one sbc_bitmap_test
708			* for every character in the span, and make sure the compiler doesn't eliminate it.
709			*/
710	289	50	if (consttime) {
711	0		volatile bool sink= false;
712	0	0	while (pos != lim) {
713	0		sink ^= sbc_bitmap_test(cset->bitmap, *pos);
714	0		pos += step;
715			}
716	0		(void) sink;
717			}
718			// reached end of defined range, and implicitly ends span
719	289	100	if (reverse) {
720	86		parse->pos= pos + 1;
721	86	100	parse->lim= span_start? span_start + 1 : parse->pos;
722			} else {
723	203		parse->lim= pos;
724	203	100	parse->pos= span_start? span_start : parse->lim;
725			}
726	289		return span_start != NULL;
727			}
728
729	9		static bool sb_parse_match_charset_codepoints(
730			secret_buffer_parse *parse,
731			const secret_buffer_charset *cset,
732			int flags
733			) {
734			dTHX;
735	9		bool negate= 0 != (flags & SECRET_BUFFER_MATCH_NEGATE);
736	9		bool reverse= 0 != (flags & SECRET_BUFFER_MATCH_REVERSE);
737	9	50	bool multi= 0 != (flags & SECRET_BUFFER_MATCH_MULTI) \|\| cset->match_multi;
		100
738	9		bool anchored= 0 != (flags & SECRET_BUFFER_MATCH_ANCHORED);
739	9		bool consttime=0 != (flags & SECRET_BUFFER_MATCH_CONST_TIME);
740	9		bool span_started= false;
741	9		bool encoding_error= false;
742	9	100	const U8 *span_mark= NULL, *prev_mark= reverse? parse->lim : parse->pos;
743
744	37	50	while (parse->pos < parse->lim) {
745	19		int codepoint= reverse? sb_parse_prev_codepoint(parse)
746	37	100	: sb_parse_next_codepoint(parse);
747			// warn("parse.pos=%p parse.lim=%p parse.enc=%d cp=%d parse.err=%s", parse->pos, parse->lim, parse->encoding, codepoint, parse->error);
748	37	100	if (codepoint < 0) {// encoding error
749	1		encoding_error= true;
750	1		break;
751			}
752	36	100	if (sbc_test_codepoint(aTHX_ cset, codepoint) != negate) {
753			// Found. Mark boundaries of char.
754			// Now are we looking for a span?
755	10	100	if (span_started)
756	2		break;
757	8		span_started= true;
758	8		span_mark= prev_mark;
759	8		negate= !negate;
760	8	100	if (!multi) {
761	6	100	prev_mark= reverse? parse->lim : parse->pos;
762	6		break;
763			}
764	26	50	} else if (anchored && !span_started)
		0
765	0		break;
766	28	100	prev_mark= reverse? parse->lim : parse->pos;
767			}
768			/* If constant time operation is requested, we need to perform one sbc_test_codepoint
769			* for every character in the span, and make sure the compiler doesn't eliminate it.
770			*/
771	9	50	if (consttime) {
772	0		volatile bool sink= false;
773	0	0	while (parse->pos < parse->lim) {
774	0		int codepoint= reverse? sb_parse_prev_codepoint(parse)
775	0	0	: sb_parse_next_codepoint(parse);
776			// warn("parse.pos=%p parse.lim=%p parse.enc=%d cp=%d parse.err=%s", parse->pos, parse->lim, parse->encoding, codepoint, parse->error);
777	0	0	if (codepoint < 0) { // encoding error
778	0		encoding_error= true;
779	0		sink ^= sbc_test_codepoint(aTHX_ cset, 0);
780			}
781			else
782	0		sink ^= sbc_test_codepoint(aTHX_ cset, codepoint);
783			}
784	0		(void) sink;
785			}
786	9	100	if (encoding_error)
787	1		return false;
788			// reached end of defined range
789	8	50	if (span_started) { // and implicitly ends span
790	8	100	if (reverse) {
791	5		parse->pos= prev_mark;
792	5		parse->lim= span_mark;
793			}
794			else {
795	3		parse->pos= span_mark;
796	3		parse->lim= prev_mark;
797			}
798	8		return true;
799			}
800	0		return false;
801			}
802
803	18		int sb_parse_codepointcmp(secret_buffer_parse *lhs, secret_buffer_parse *rhs) {
804			I32 lhs_cp, rhs_cp;
805	18		volatile int ret= 0;
806			/* constant-time iteration per the shorter of the two strings */
807	87	100	while (lhs->pos < lhs->lim && rhs->pos < rhs->lim) {
		50
808	69		lhs_cp= sb_parse_next_codepoint(lhs);
809	69	50	if (lhs_cp < 0)
810	0		croak("Encoding error in left-hand buffer");
811	69		rhs_cp= sb_parse_next_codepoint(rhs);
812	69	50	if (rhs_cp < 0)
813	0		croak("Encoding error in right-hand buffer");
814	69	100	if (lhs_cp != rhs_cp && !ret)
		50
815	2	50	ret= lhs_cp < rhs_cp? -1 : 1;
816			}
817	18		return ret? ret
818	34	100	: (lhs->pos < lhs->lim)? 1 /* right string shorter than left */
819	32	50	: (rhs->pos < rhs->lim)? -1 /* left string shorter than right */
820	16	50	: 0;
821			}
822
823			/* UTF-8 decoding helper */
824	3365		static int sb_parse_next_codepoint(secret_buffer_parse *parse) {
825	3365		const U8 pos= parse->pos, lim= parse->lim;
826	3365		int cp, encoding= parse->encoding;
827			#define SB_RETURN_ERROR(msg) { parse->error= msg; return -1; }
828
829	3365	100	if (encoding == SECRET_BUFFER_ENCODING_ASCII
830	3364	100	\|\| encoding == SECRET_BUFFER_ENCODING_ISO8859_1
831	577	100	\|\| encoding == SECRET_BUFFER_ENCODING_UTF8
832			) {
833	3123	50	if (lim - pos < 1)
834	0		SB_RETURN_ERROR("end of span")
835	3123		cp= *pos++;
836	3123	100	if (cp >= 0x80 && encoding == SECRET_BUFFER_ENCODING_ASCII)
		100
837	1		SB_RETURN_ERROR("not 7-bit ASCII")
838	3122	100	else if (cp >= 0x80 && encoding == SECRET_BUFFER_ENCODING_UTF8) {
		100
839	47		int min_cp= 0;
840	47		switch ((cp >> 3) & 0xF) {
841	13		case 14: // 0b1[1110]yyy
842	13	50	{ if (lim - pos < 3) goto incomplete;
843	13		min_cp= 0x10000;
844	13		cp &= 0x07;
845			}
846	13	50	if ((*pos & 0xC0) != 0x80) goto invalid;
847	13		cp= (cp << 6) \| (*pos++ & 0x3F);
848			if (0)
849			case 12: case 13: // 0b1[110x]yyy
850	14	50	{ if (lim - pos < 2) goto incomplete;
851	14		min_cp= 0x800;
852	14		cp &= 0x0F;
853			}
854	27	50	if ((*pos & 0xC0) != 0x80) goto invalid;
855	27		cp= (cp << 6) \| (*pos++ & 0x3F);
856			if (0)
857			case 8: case 9: case 10: case 11: // 0b1[10xx]yyy
858	20	50	{ if (lim - pos < 1) goto incomplete;
859	20		min_cp= 0x80;
860	20		cp &= 0x1F;
861			}
862	47	50	if ((*pos & 0xC0) != 0x80) goto invalid;
863	47		cp= (cp << 6) \| (*pos++ & 0x3F);
864	47		break;
865			default:
866	0		invalid: SB_RETURN_ERROR("invalid UTF8 character")
867	0		incomplete: SB_RETURN_ERROR("incomplete UTF8 character")
868			}
869	47	50	if (cp < min_cp)
870	0		SB_RETURN_ERROR("overlong encoding of UTF8 character")
871	47	50	else if (cp > 0x10FFFF)
872	0		SB_RETURN_ERROR("UTF8 character exceeds max")
873			}
874			// else all ISO-8859-1 bytes are valid codepoints
875			}
876	242	100	else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
877	221	100	\|\| encoding == SECRET_BUFFER_ENCODING_UTF16BE
878	36		) {
879	36		int low= encoding == SECRET_BUFFER_ENCODING_UTF16LE? 0 : 1;
880	36	50	if (lim - pos < 2)
881	0		SB_RETURN_ERROR("end of span")
882	36		cp= pos[low] \| ((int)pos[low^1] << 8);
883	36		pos += 2;
884	36	100	if (cp >= 0xD800 && cp <= 0xDFFF) {
		50
885	10	50	if (lim - pos < 2)
886	0		SB_RETURN_ERROR("incomplete UTF16 character")
887	10		int w2= pos[low] \| ((int)pos[low^1] << 8);
888	10		pos += 2;
889	10	50	if (w2 < 0xDC00 \|\| w2 > 0xDFFF)
		50
890	0		SB_RETURN_ERROR("invalid UTF16 low surrogate")
891	10		cp = 0x10000 + (((cp & 0x3FF) << 10) \| (w2 & 0x3FF));
892			}
893			}
894	206	100	else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
895			// Skip over whitespace
896	38	50	while (pos < lim && isspace(*pos))
		50
897	0		pos++;
898	38	50	if (lim - pos < 2)
899	0		SB_RETURN_ERROR("end of span")
900	38		int high= *pos++ - '0';
901	38		int low= *pos++ - '0';
902	38	50	if (low >= ('a'-'0')) low -= ('a'-'0'-10);
903	38	100	else if (low >= ('A'-'0')) low -= ('A'-'0'-10);
904	38	50	if (high >= ('a'-'0')) high -= ('a'-'0'-10);
905	38	100	else if (high >= ('A'-'0')) high -= ('A'-'0'-10);
906	38	50	if ((low >> 4) \| (high >> 4))
907	0		SB_RETURN_ERROR("not a pair of hex digits")
908	38		cp= (high << 4) \| low;
909			// skip over whitespace if it takes us to the end of buffer so that caller
910			// knows it's EOF before trying another decode.
911	38	100	while (pos < lim && isspace(*pos))
		50
912	0		pos++;
913			}
914	168	50	else if (encoding == SECRET_BUFFER_ENCODING_BASE64) {
915			// Skip over whitespace and control chars
916	168	50	while (pos < lim && *pos <= ' ')
		50
917	0		pos++;
918			// There need to be at least 2 base64 characters left
919	168	50	if (pos < lim) {
920	168	50	if (base64_decode_table[*pos] < 0)
921	0		SB_RETURN_ERROR("invalid base64 character");
922			// ->pos_bit > 0 means pointer is pointing at a sub-bit of the base64
923			// character at *pos (and possible values are 0, 2, or 4)
924	168		cp= (((int)base64_decode_table[*pos++]) << (2 + parse->pos_bit)) & 0xFF;
925	168	50	while (pos < lim && *pos <= ' ')
		50
926	0		pos++;
927			}
928	168	50	if (pos >= lim) {
929	0		parse->pos_bit= 0;
930	0		SB_RETURN_ERROR("end of span")
931			}
932	168	50	if (base64_decode_table[*pos] < 0)
933	0		SB_RETURN_ERROR("invalid base64 character");
934	168		cp \|= base64_decode_table[*pos] >> (4-parse->pos_bit);
935	168		parse->pos_bit += 2;
936			// If pos_bit == 6 we've completed a set of 4 b64 chars and fully consumed them.
937	168	100	if (parse->pos_bit >= 6) {
938	51		pos++;
939	51		parse->pos_bit= 0;
940			// consume trailing whitespace
941	55	100	while (pos < lim && *pos <= ' ')
		100
942	4		pos++;
943			}
944			else {
945			// if next char is '=', terminate the decoding
946	117		const U8 *next= pos+1;
947	117	50	while (next < lim && *next <= ' ')
		50
948	0		next++;
949	117	50	if (next < lim && *next == '=') {
		100
950	13		pos= lim; // indicate parsing complete
951	13		parse->pos_bit= 0;
952			}
953			}
954			}
955	0	0	else if (encoding == SECRET_BUFFER_ENCODING_I32) {
956	0	0	if (lim - pos < 4)
957	0		SB_RETURN_ERROR("end of span");
958	0		cp= (I32)pos;
959	0		pos+= 4;
960			}
961	0		else SB_RETURN_ERROR("unsupported encoding")
962	3364		parse->pos= pos;
963	3364		return cp;
964			#undef SB_RETURN_ERROR
965			}
966
967	850		static int sb_parse_prev_codepoint(secret_buffer_parse *parse) {
968	850		const U8 pos= parse->pos, lim= parse->lim;
969	850		int encoding= parse->encoding;
970			int cp;
971			#define SB_RETURN_ERROR(msg) { parse->error= msg; return -1; }
972
973	850	50	if (encoding == SECRET_BUFFER_ENCODING_ASCII
974	850	100	\|\| encoding == SECRET_BUFFER_ENCODING_ISO8859_1
975	25	100	\|\| encoding == SECRET_BUFFER_ENCODING_UTF8
976			) {
977	842	50	if (lim <= pos)
978	0		SB_RETURN_ERROR("end of span")
979	842		cp= *--lim;
980			// handle the simple case first
981	842	100	if (cp >= 0x80 && encoding != SECRET_BUFFER_ENCODING_ISO8859_1) {
		50
982			// Strict ASCII can't encode above 0x7F
983	4	50	if (encoding == SECRET_BUFFER_ENCODING_ASCII)
984	0		SB_RETURN_ERROR("not 7-bit ASCII")
985			// else need to backtrack and then call next_codepoint
986	4		const U8 *start= lim;
987	12	50	while (start >= pos && (*start & 0xC0) == 0x80)
		100
988	8		--start;
989	4		parse->pos= start;
990	4		cp= sb_parse_next_codepoint(parse);
991	4	50	if (parse->pos != parse->lim) {// consumed all characters we gave it?
992	0		parse->pos= pos; // restore original pos
993	0	0	if (cp >= 0) // had a valid char, but extra 0x80 bytes
994	0		parse->error= "invalid UTF8 character";
995			// else use the error message from next_codepoint
996	0		return -1;
997			}
998	4		parse->pos= pos; // restore original pos
999	4		lim= start; // new lim is where we started the parse from
1000			}
1001			}
1002	8	50	else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1003	8	100	\|\| encoding == SECRET_BUFFER_ENCODING_UTF16BE
1004	1		) {
1005	1	50	if (lim - pos < 2)
1006	0		SB_RETURN_ERROR("end of span");
1007			// handle the simple case first
1008	1		lim -= 2;
1009	1		int low= (encoding == SECRET_BUFFER_ENCODING_UTF16LE)? 0 : 1;
1010	1		cp= lim[low] \| ((int)lim[low^1] << 8);
1011	1	50	if (cp >= 0xD800 && cp <= 0xDFFF) {
		50
1012	1	50	if (lim - pos < 4)
1013	0		SB_RETURN_ERROR("end of span");
1014	1		lim -= 2;
1015	1		int w1= lim[low] \| ((int)lim[low^1] << 8);
1016	1	50	if (w1 < 0xD800 \|\| w1 > 0xDFFF \|\| cp < 0xDC00)
		50
		50
1017	0		SB_RETURN_ERROR("invalid UTF16 surrogate");
1018	1		cp = 0x10000 + (((w1 & 0x3FF) << 10) \| (cp & 0x3FF));
1019			}
1020			}
1021	7	100	else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
1022			// Skip over whitespace
1023	1	50	while (pos < lim && isspace(lim[-1]))
		50
1024	0		lim--;
1025	1	50	if (lim - pos < 2)
1026	0	0	SB_RETURN_ERROR((pos == lim? "end of span" : "incomplete hex pair at end of span"))
1027	1		int low= *--lim - '0';
1028	1		int high= *--lim - '0';
1029	1	50	if (low >= ('a'-'0')) low -= ('a'-'0'-10);
1030	0	0	else if (low >= ('A'-'0')) low -= ('A'-'0'-10);
1031	1	50	if (high >= ('a'-'0')) high -= ('a'-'0'-10);
1032	0	0	else if (high >= ('A'-'0')) high -= ('A'-'0'-10);
1033	1	50	if ((low >> 4) \| (high >> 4))
1034	0		SB_RETURN_ERROR("not a pair of hex digits")
1035	1		cp= (high << 4) \| low;
1036			}
1037	6	50	else if (encoding == SECRET_BUFFER_ENCODING_BASE64) {
1038			bool again;
1039			do {
1040	9		again= false;
1041			// Skip over non-base64 chars
1042	12	50	while (pos < lim && base64_decode_table[lim[-1]] < 0)
		100
1043	3		lim--;
1044	9	50	if (pos < lim) {
1045			//warn("lim-pos=%d, lim[-1]=%c, lim_bit=%d", (int)(lim-pos), lim[-1], parse->lim_bit);
1046	9	50	if (base64_decode_table[lim[-1]] < 0)
1047	0		SB_RETURN_ERROR("invalid base64 character");
1048			// ->lim_bit > 0 means the character lim[-1] is partially consumed.
1049			// (sequence is 0, 2, 4, 0)
1050	9		cp= ((int)base64_decode_table[lim[-1]]) >> parse->lim_bit;
1051			// parsing an equal sign means 'cp' is bogus and need to go again
1052	9	100	if (lim[-1] == '=')
1053	3		again= true;
1054	9		--lim;
1055			// find next base64 char
1056	9	50	while (pos < lim && base64_decode_table[lim[-1]] < 0)
		50
1057	0		lim--;
1058			}
1059	9	50	if (pos >= lim) {
1060	0		parse->lim_bit= 0;
1061	0		SB_RETURN_ERROR("end of span")
1062			}
1063	9	50	if (base64_decode_table[lim[-1]] < 0)
1064	0		SB_RETURN_ERROR("invalid base64 character");
1065			//warn(" lim-pos=%d, lim[-1]=%c, lim_bit=%d", (int)(lim-pos), lim[-1], parse->lim_bit);
1066	9		cp \|= (((int)base64_decode_table[lim[-1]]) << (6 - parse->lim_bit)) & 0xFF;
1067	9		parse->lim_bit += 2;
1068	9	100	if (parse->lim_bit >= 6) {
1069	3		parse->lim_bit= 0;
1070			// If completed a set of 4 b64 chars, lim[-1] is consumed, and need to
1071			// walk backward to find next base64 char
1072	3		--lim;
1073	3	50	while (pos < lim && base64_decode_table[lim[-1]] < 0)
		0
1074	0		lim--;
1075			}
1076			//warn(" cp=%d, lim-pos=%d, lim_bit=%d", cp, (int)(lim-pos), parse->lim_bit);
1077	9	100	} while (again);
1078			}
1079	0	0	else if (encoding == SECRET_BUFFER_ENCODING_I32) {
1080	0	0	if (lim - pos < 4)
1081	0		SB_RETURN_ERROR("end of span");
1082	0		lim -= 4;
1083	0		cp= (I32)lim;
1084			}
1085	0		else SB_RETURN_ERROR("unsupported encoding")
1086	850		parse->lim= lim;
1087	850		return cp;
1088			#undef SB_RETURN_ERROR
1089			}
1090
1091	1202		static int sizeof_codepoint_encoding(int codepoint, int encoding) {
1092	1202	50	if (encoding == SECRET_BUFFER_ENCODING_ASCII)
1093	0	0	return codepoint < 0x80? 1 : -1;
1094	1202	100	if (encoding == SECRET_BUFFER_ENCODING_ISO8859_1)
1095	110	50	return codepoint < 0x100? 1 : -1;
1096	1092	100	else if (encoding == SECRET_BUFFER_ENCODING_UTF8)
1097	736	100	return codepoint < 0x80? 1 : codepoint < 0x800? 2 : codepoint < 0x10000? 3 : 4;
		100
		100
1098	356	50	else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1099	356	50	\|\| encoding == SECRET_BUFFER_ENCODING_UTF16BE)
1100	0	0	return codepoint >= 0xD800 && codepoint < 0xE000? -1
1101	0	0	: codepoint < 0x10000? 2 : 4;
		0
1102	356	100	else if (encoding == SECRET_BUFFER_ENCODING_HEX)
1103	6	50	return codepoint < 0x100? 2 : -1;
1104			/* Base64 would need to track an accumulator, so just return 1 and fix it in the caller */
1105	350	100	else if (encoding == SECRET_BUFFER_ENCODING_BASE64)
1106	78	50	return codepoint < 0x100? 1 : -1;
1107	272	50	else if (encoding == SECRET_BUFFER_ENCODING_I32)
1108	272		return 4;
1109			else
1110	0		return -1;
1111			}
1112
1113	444		static bool sb_parse_encode_codepoint(secret_buffer_parse *dst, int codepoint) {
1114			#define SB_RETURN_ERROR(msg) { dst->error= msg; return false; }
1115	444		int encoding= dst->encoding, n;
1116	444		U8 dst_pos= (U8) dst->pos;
1117			// codepoints above 0x10FFFF are illegal
1118	444	50	if (codepoint >= 0x110000)
1119	0		SB_RETURN_ERROR("invalid codepoint");
1120			// not quite as efficient as checking during the code below, but saves a bunch of redundancy
1121	444		n= sizeof_codepoint_encoding(codepoint, encoding);
1122	444	50	if (n < 0)
1123	0		SB_RETURN_ERROR("character too wide for encoding")
1124	444	50	if (dst->lim - dst_pos < n)
1125	0		SB_RETURN_ERROR("buffer too small")
1126	444		dst->pos += n;
1127
1128	444	50	if (encoding == SECRET_BUFFER_ENCODING_ASCII
1129	444	100	\|\| encoding == SECRET_BUFFER_ENCODING_ISO8859_1
1130	389	100	\|\| encoding == SECRET_BUFFER_ENCODING_UTF8
1131			) {
1132	423		switch ((n-1)&0x3) { // help the compiler understand there are only 4 possible values
1133	401		case 0: *dst_pos++ = (U8) codepoint;
1134	401		break;
1135	10		case 1: *dst_pos++ = (U8)(0xC0 \| (codepoint >> 6));
1136	10		*dst_pos++ = (U8)(0x80 \| (codepoint & 0x3F));
1137	10		break;
1138	4		case 2: *dst_pos++ = (U8)(0xE0 \| (codepoint >> 12));
1139	4		*dst_pos++ = (U8)(0x80 \| ((codepoint >> 6) & 0x3F));
1140	4		*dst_pos++ = (U8)(0x80 \| (codepoint & 0x3F));
1141	4		break;
1142	8		case 3: *dst_pos++ = (U8)(0xF0 \| (codepoint >> 18));
1143	8		*dst_pos++ = (U8)(0x80 \| ((codepoint >> 12) & 0x3F));
1144	8		*dst_pos++ = (U8)(0x80 \| ((codepoint >> 6) & 0x3F));
1145	8		*dst_pos++ = (U8)(0x80 \| (codepoint & 0x3F));
1146	8		break;
1147			}
1148			}
1149	21	50	else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1150	21	50	\|\| encoding == SECRET_BUFFER_ENCODING_UTF16BE
1151	0		) {
1152	0		int low= (encoding == SECRET_BUFFER_ENCODING_UTF16LE)? 0 : 1;
1153	0	0	if (n == 2) {
1154	0		dst_pos[low] = (U8)(codepoint & 0xFF);
1155	0		dst_pos[low^1] = (U8)(codepoint >> 8);
1156			}
1157			else {
1158	0		int adjusted = codepoint - 0x10000;
1159	0		int w0 = 0xD800 \| (adjusted >> 10);
1160	0		int w1 = 0xDC00 \| (adjusted & 0x3FF);
1161	0		dst_pos[low] = (U8)(w0 & 0xFF);
1162	0		dst_pos[1^low] = (U8)(w0 >> 8);
1163	0		dst_pos[2^low] = (U8)(w1 & 0xFF);
1164	0		dst_pos[3^low] = (U8)(w1 >> 8);
1165			}
1166			}
1167	21	100	else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
1168	3		dst_pos[0] = "0123456789ABCDEF"[(codepoint >> 4) & 0xF];
1169	3		dst_pos[1] = "0123456789ABCDEF"[codepoint & 0xF];
1170			}
1171	18	50	else if (encoding == SECRET_BUFFER_ENCODING_I32) {
1172	18		(I32)dst_pos = codepoint;
1173			}
1174			/* BASE64 is not handled here because the '=' padding can only be generated in
1175			* a context that knows when we are ending on a non-multiple-of-4. */
1176	0		else SB_RETURN_ERROR("unsupported encoding");
1177	444		return true;
1178			#undef SB_RETURN_ERROR
1179			}
1180
1181			#define SB_PARSE_MATCH_STR_FN sb_parse_match_str_U8
1182			#define SB_PATTERN_EL_TYPE const U8
1183			#include "secret_buffer_parse_match_str.c"
1184			#undef SB_PARSE_MATCH_STR_FN
1185			#undef SB_PATTERN_EL_TYPE
1186
1187			#define SB_PARSE_MATCH_STR_FN sb_parse_match_str_I32
1188			#define SB_PATTERN_EL_TYPE const I32
1189			#include "secret_buffer_parse_match_str.c"
1190			#undef SB_PARSE_MATCH_STR_FN
1191			#undef SB_PATTERN_EL_TYPE