File Coverage

secret_buffer_parse.c

Criterion	Covered	Total	%
statement	550	700	78.5
branch	397	594	66.8
condition			n/a
subroutine			n/a
pod			n/a
total	947	1294	73.1

line	stmt	bran	code
1
2			/* These local parse functions are independent of the SecretBuffer instance,
3			* needing only the 'data' pointer to which the parse_state refers.
4			* The pos/lim of the parse state must already be checked against the length
5			* of the data before calling these.
6			*/
7
8			/* compute number of bytes needed for one character */
9			static int sizeof_codepoint_encoding(int codepoint, int encoding);
10			/* parse codepoint from end of parse and decrement ->lim */
11			static int sb_parse_prev_codepoint(secret_buffer_parse *parse);
12			/* parse codepoint from start of parse and increment ->pos */
13			static int sb_parse_next_codepoint(secret_buffer_parse *parse);
14			/* encode codepoint into buffer range described by 'parse' */
15			static bool sb_parse_encode_codepoint(secret_buffer_parse_rw *parse, int codepoint);
16
17			static bool sb_parse_match_charset_bytes(secret_buffer_parse parse, const secret_buffer_charset cset, int flags);
18			static bool sb_parse_match_charset_codepoints(secret_buffer_parse parse, const secret_buffer_charset cset, int flags);
19			static bool sb_parse_match_str_U8(secret_buffer_parse parse, const U8 pattern, size_t pattern_len, int flags);
20			static bool sb_parse_match_str_I32(secret_buffer_parse parse, const I32 pattern, size_t pattern_len, int flags);
21
22	102		static bool parse_encoding(pTHX_ SV sv, int out) {
23			int enc;
24	102	100	if (looks_like_number(sv)) {
25	6		IV i= SvIV(sv);
26	6	50	if (i < 0 \|\| i > SECRET_BUFFER_ENCODING_MAX)
		50
27	0		return false;
28	6		enc= (int) i;
29			} else {
30			STRLEN len;
31	96		const char *str= SvPV(sv, len);
32	96		switch (len) {
33	6	50	case 3: if (0 == strcmp(str, "HEX")) { enc= SECRET_BUFFER_ENCODING_HEX; break; }
34	1	50	case 4: if (0 == strcmp(str, "UTF8")) { enc= SECRET_BUFFER_ENCODING_UTF8; break; }
35	43	100	case 5: if (0 == strcmp(str, "ASCII")) { enc= SECRET_BUFFER_ENCODING_ASCII; break; }
36	42	50	if (0 == strcmp(str, "UTF-8")) { enc= SECRET_BUFFER_ENCODING_UTF8; break; }
37	26	50	case 6: if (0 == strcmp(str, "BASE64")) { enc= SECRET_BUFFER_ENCODING_BASE64; break; }
38	1	50	case 7: if (0 == strcmp(str, "UTF16LE")) { enc= SECRET_BUFFER_ENCODING_UTF16LE; break; }
39	0	0	if (0 == strcmp(str, "UTF16BE")) { enc= SECRET_BUFFER_ENCODING_UTF16BE; break; }
40	6	100	case 8: if (0 == strcmp(str, "UTF-16LE")) { enc= SECRET_BUFFER_ENCODING_UTF16LE; break; }
41	3	50	if (0 == strcmp(str, "UTF-16BE")) { enc= SECRET_BUFFER_ENCODING_UTF16BE; break; }
42	0	0	case 9: if (0 == strcmp(str, "ISO8859_1")) { enc= SECRET_BUFFER_ENCODING_ISO8859_1; break; }
43	13	50	case 10: if (0 == strcmp(str, "ISO-8859-1")) { enc= SECRET_BUFFER_ENCODING_ISO8859_1; break; }
44			default:
45	0		return false;
46			}
47			}
48	102	50	if (out) *out= enc;
49	102		return true;
50			}
51
52			/* Public API --------------------------------------------------------------*/
53
54			/* initialize a parse struct, but only if it is a valid span of the buffer */
55	2266		bool secret_buffer_parse_init(secret_buffer_parse *parse,
56			secret_buffer *buf, size_t pos, size_t lim, int encoding
57			) {
58	2266		Zero(parse, 1, secret_buffer_parse);
59			// Sanity check this parse state vs. the buffer
60	2266	100	if (lim > buf->len \|\| pos > lim) {
		50
61	1	50	parse->error= pos > lim? "span starts beyond buffer" : "span ends beyond buffer";
62	1		return false;
63			}
64	2265		parse->pos= ((U8*) buf->data) + pos;
65	2265		parse->lim= ((U8*) buf->data) + lim;
66	2265		parse->encoding= encoding;
67	2265		parse->sbuf= buf;
68	2265		return true;
69			}
70
71			/* Initialize a parse struct, either from a Span, or a SecretBuffer, or a plain Scalar.
72			*/
73	1741		bool secret_buffer_parse_init_from_sv(secret_buffer_parse parse, SV sv) {
74			dTHX;
75			secret_buffer *sb;
76			secret_buffer_span *span;
77			/* Is the sv a Span object? */
78	1741	100	if ((span= secret_buffer_span_from_magic(sv, 0)) && SvTYPE(SvRV(sv)) == SVt_PVHV) {
		50
79	1286		SV *sb_sv= hv_fetchs((HV)SvRV(sv), "buf", 1);
80	1286		sb= secret_buffer_from_magic(*sb_sv, SECRET_BUFFER_MAGIC_OR_DIE);
81	1286		return secret_buffer_parse_init(parse, sb, span->pos, span->lim, span->encoding);
82			}
83			/* Is the sv a SecretBuffer? */
84	455	100	else if ((sb= secret_buffer_from_magic(sv, 0))) {
85	2		return secret_buffer_parse_init(parse, sb, 0, sb->len, SECRET_BUFFER_ENCODING_ISO8859_1);
86			}
87			/* It needs to at least be defined */
88	453	50	else if (SvOK(sv)) {
89			STRLEN len;
90	453		char *buf= SvPV(sv, len);
91	453		Zero(parse, 1, secret_buffer_parse);
92	453		parse->pos= (U8*) buf;
93	453		parse->lim= (U8*) buf + len;
94	453		parse->encoding= SvUTF8(sv)? SECRET_BUFFER_ENCODING_UTF8 : SECRET_BUFFER_ENCODING_ISO8859_1;
95	453		return true;
96			}
97			else {
98	0		Zero(parse, 1, secret_buffer_parse);
99	0		parse->error= "Not a Span, SecretBuffer, or defined scalar";
100	0		return false;
101			}
102			}
103
104			/* Scan for a pattern which may be a regex or literal string.
105			* Regexes are currently limited to a single charclass.
106			*/
107	1048		bool secret_buffer_match(secret_buffer_parse parse, SV pattern, int flags) {
108			dTHX;
109	1048		REGEXP rx= (REGEXP)SvRX(pattern);
110			secret_buffer_parse pat_parse;
111
112			/* Is the pattern a regexp-ref? */
113	1048	100	if (rx) {
114	609		secret_buffer_charset *cset= secret_buffer_charset_from_regexpref(pattern);
115	609		return secret_buffer_match_charset(parse, cset, flags);
116			}
117
118			/* load up a parse struct with the pos, lim, and encoding */
119	439	50	if (!secret_buffer_parse_init_from_sv(&pat_parse, pattern))
120	0		croak("%s", pat_parse.error);
121
122			/* Remove edge case of zero-length pattern (always matches) */
123	439	100	if (pat_parse.pos >= pat_parse.lim) {
124	2	50	if ((flags & SECRET_BUFFER_MATCH_REVERSE))
125	0		parse->pos= parse->lim;
126			else
127	2		parse->lim= parse->pos;
128	2		return !(flags & SECRET_BUFFER_MATCH_NEGATE);
129			}
130			/* Remove edge case of zero-length subject (never matches) */
131	437	100	if (parse->pos >= parse->lim) {
132	4		return (flags & SECRET_BUFFER_MATCH_NEGATE);
133			}
134
135			/* Since unicode iteration of the pattern is a hassle and might happen lots of times,
136			* convert it to either plain bytes or array of U32 codepoints.
137			*/
138	433	100	if (pat_parse.encoding != SECRET_BUFFER_ENCODING_ISO8859_1) {
139	17		int dst_enc=
140			/* these can be transcoded to bytes */
141	17		(pat_parse.encoding == SECRET_BUFFER_ENCODING_ASCII
142	17	100	\|\| pat_parse.encoding == SECRET_BUFFER_ENCODING_HEX
143	16	50	\|\| pat_parse.encoding == SECRET_BUFFER_ENCODING_BASE64)
144			? SECRET_BUFFER_ENCODING_ISO8859_1
145	34	50	: SECRET_BUFFER_ENCODING_I32;
146	17		SSize_t dst_len= secret_buffer_sizeof_transcode(&pat_parse, dst_enc);
147	17	50	if (dst_len < 0)
148	0		croak("transcode of pattern failed: %s", pat_parse.error);
149			/* No need to transcode SECRET_BUFFER_ENCODING_ASCII, but the above size check
150			* verified it is clean 7-bit, which is the whole point of that encoding.
151			*/
152	17	50	if (pat_parse.encoding == SECRET_BUFFER_ENCODING_ASCII
153			/* Likewise, if SECRET_BUFFER_ENCODING_UTF8's I32 len is exactly 4x the number of
154			* original bytes, that means every byte became a character, which means every
155			* character could fit in a byte. */
156	17	100	\|\| (pat_parse.encoding == SECRET_BUFFER_ENCODING_UTF8
157	16	100	&& dst_len == (pat_parse.lim - pat_parse.pos) * 4)
158			) {
159	9		pat_parse.encoding= SECRET_BUFFER_ENCODING_ISO8859_1;
160			} else {
161			/* create a temporary secret buffer to hold the transcode */
162	8		secret_buffer *tmp= secret_buffer_new(0, NULL);
163	8		secret_buffer_parse pat_orig= pat_parse;
164	8		secret_buffer_set_len(tmp, dst_len);
165	8	50	if (!secret_buffer_parse_init(&pat_parse, tmp, 0, dst_len, dst_enc))
166	0		croak("transcode of pattern failed: %s", pat_parse.error);
167			/* Transcode the pattern */
168	8	50	if (!secret_buffer_transcode(&pat_orig, (secret_buffer_parse_rw*) &pat_parse))
169	0	0	croak("transcode of pattern failed: %s", pat_orig.error? pat_orig.error : pat_parse.error);
170			}
171			}
172			/* In some cases it would also be nice to transcode the subject first, but the
173			* final state of the parse struct carries information back to the caller and
174			* needs to refer to original positions of characters. */
175
176			/* Now dispatch to sb_parse_match_str_X */
177	433	100	if (pat_parse.encoding == SECRET_BUFFER_ENCODING_ISO8859_1) {
178	426		size_t pat_len= pat_parse.lim - pat_parse.pos;
179	426		return sb_parse_match_str_U8(parse, pat_parse.pos, pat_len, flags);
180			} else { /* must be _I32 encoding, from above */
181	7		size_t pat_len= (pat_parse.lim - pat_parse.pos) >> 2;
182	7		return sb_parse_match_str_I32(parse, (I32*) pat_parse.pos, pat_len, flags);
183			}
184			}
185
186			/* Scan for a pattern which is a set of characters */
187	609		bool secret_buffer_match_charset(secret_buffer_parse parse, secret_buffer_charset cset, int flags) {
188	609	100	if (parse->pos >= parse->lim) // empty range
189	48		return false;
190
191			// byte matching gets to use a more efficient algorithm
192	561		return parse->encoding == SECRET_BUFFER_ENCODING_ISO8859_1
193	358		? sb_parse_match_charset_bytes(parse, cset, flags)
194	919	100	: sb_parse_match_charset_codepoints(parse, cset, flags);
195			}
196
197			/* Scan for a pattern which is a literal string of bytes.
198			*/
199	0		bool secret_buffer_match_bytestr(secret_buffer_parse parse, char data, size_t datalen, int flags) {
200	0		return sb_parse_match_str_U8(parse, (U8*) data, datalen, flags);
201			}
202
203			/* Count number of bytes required to transcode the source.
204			* If the source contains an invalid character for its encoding, or that codepoint
205			* can't be encoded as the dst_encoding, this returns -1 and sets src->error
206			* and also sets src->pos pointing at the character that could not be converted.
207			*/
208	118		SSize_t secret_buffer_sizeof_transcode(secret_buffer_parse *src, int dst_encoding) {
209			// If the source and destination encodings are both bytes, return the length
210	118	100	if (dst_encoding == src->encoding && src->encoding == 0)
		100
211	17		return src->lim - src->pos;
212			// Else need to iterate characters (to validate) and re-encode them
213			else {
214	101		size_t dst_size_needed= 0;
215			secret_buffer_parse tmp;
216	101		Zero(&tmp, 1, secret_buffer_parse);
217	101		tmp.pos= src->pos;
218	101		tmp.lim= src->lim;
219	101		tmp.encoding= src->encoding;
220	859	100	while (tmp.pos < tmp.lim) {
221	758		int cp= sb_parse_next_codepoint(&tmp);
222	758	50	if (cp < 0) return -1;
223	758		int ch_size= sizeof_codepoint_encoding(cp, dst_encoding);
224	758	50	if (ch_size < 0) return -1;
225	758		dst_size_needed += ch_size;
226			}
227			// If dest is base64, need special calculation
228	101	100	if (dst_encoding == SECRET_BUFFER_ENCODING_BASE64) {
229	10		dst_size_needed= ((dst_size_needed + 2) / 3) * 4;
230			}
231	101		return dst_size_needed;
232			}
233			}
234
235			static const char base64_alphabet[64]=
236			"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
237			"abcdefghijklmnopqrstuvwxyz"
238			"0123456789+/";
239
240			/*
241			perl -E 'my @tbl= (-1)x256;
242			$tbl[ord]= -ord(A)+ord for A..Z;
243			$tbl[ord]= 26-ord(a)+ord for a..z;
244			$tbl[ord]= 52-ord(0)+ord for 0..9;
245			$tbl[ord "+"]= 62;
246			$tbl[ord "/"]= 63;
247			$tbl[ord "="]= 64;
248			say join ",\n", map join(",", @tbl[$_*16 .. $_*16+15]), 0..0xF'
249			*/
250			static const int8_t base64_decode_table[256]= {
251			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
252			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
253			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,
254			52,53,54,55,56,57,58,59,60,61,-1,-1,-1,64,-1,-1,
255			-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
256			15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
257			-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
258			41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
259			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
260			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
261			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
262			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
263			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
264			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
265			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
266			-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
267			};
268
269			/* Transcode characters from one parse state into another.
270			* This works sort of like
271			* $data= decode($src_enc, substr($src, $src_pos, $src_len));
272			* substr($dst, $dst_pos, $dst_lim, encode($dst_enc, $data));
273			* processing only a range of the source, and replacing only a range of the dest,
274			* adjusting the size of dst as needed. Both src->pos and dst->pos
275			* are updated.
276			*/
277	109		bool secret_buffer_transcode(secret_buffer_parse src, secret_buffer_parse_rw dst) {
278	109		src->error= NULL;
279	109		dst->error= NULL;
280			// If the source and destination encodings are both bytes, use memcpy
281	109	100	if (dst->encoding == src->encoding && src->encoding == 0) {
		100
282	17		size_t cnt= dst->lim - dst->pos;
283	17	50	if (src->lim - src->pos != cnt) {
284	0		dst->error= "miscalculated buffer length";
285	0		return false;
286			}
287	17		memcpy((U8*)dst->pos, src->pos, cnt);
288	17		dst->pos += cnt;
289	17		src->pos += cnt;
290			}
291			// Else need to iterate characters and re-encode them
292			// base64 encoding doesn't work with sb_parse_encode_codepoint, so it gets
293			// special treatment.
294	92	100	else if (dst->encoding == SECRET_BUFFER_ENCODING_BASE64) {
295			// Read 3, write 4
296	10		int accum= 0;
297	10		int shift= 16, cp;
298	88	100	while (src->pos < src->lim) {
299	78		cp= sb_parse_next_codepoint(src);
300	78	50	if (cp > 0xFF) {
301	0		dst->error= "byte out of range";
302	0		return false;
303			}
304	78	100	if (!shift) {
305	24		U8 writable= (U8) dst->pos;
306	24	50	if (dst->pos + 4 > dst->lim) {
307	0		dst->error= "miscalculated buffer length";
308	0		return false;
309			}
310	24		dst->pos += 4;
311	24		accum \|= cp;
312	24		writable[0] = base64_alphabet[0x3F & (accum >> 18)];
313	24		writable[1] = base64_alphabet[0x3F & (accum >> 12)];
314	24		writable[2] = base64_alphabet[0x3F & (accum >> 6)];
315	24		writable[3] = base64_alphabet[0x3F & accum];
316	24		accum= 0;
317	24		shift= 16;
318			}
319			else {
320	54		accum \|= (cp << shift);
321	54		shift -= 8;
322			}
323			}
324	10	100	if (dst->pos + (shift < 16? 4 : 0) != dst->lim) {
		50
325	0		dst->error= "miscalculated buffer length";
326	0		return false;
327			}
328			// write leftover accumulated bits
329	10	100	if (shift < 16) {
330	5		U8 writable= (U8) dst->pos;
331	5	50	if (dst->pos + 4 > dst->lim) {
332	0		dst->error= "miscalculated buffer length";
333	0		return false;
334			}
335	5		dst->pos += 4;
336	5		writable[0] = base64_alphabet[0x3F & (accum >> 18)];
337	5		writable[1] = base64_alphabet[0x3F & (accum >> 12)];
338	5	100	writable[2] = shift? '=' : base64_alphabet[0x3F & (accum >> 6)];
339	5		writable[3] = '=';
340			}
341			}
342			else {
343	526	100	while (src->pos < src->lim) {
344	444		int len, cp= sb_parse_next_codepoint(src);
345	444	50	if (cp < 0)
346	0		return false; // error is already set
347	444		len= sb_parse_encode_codepoint(dst, cp);
348	444	50	if (len < 0)
349	0		return false; // error is already set
350			}
351	82	50	if (dst->pos != dst->lim) {
352	0		dst->error= "miscalculated buffer length";
353	0		return false;
354			}
355			}
356	109		return true;
357			}
358
359			bool
360	101		secret_buffer_copy_to(secret_buffer_parse src, SV dst_sv, int encoding, bool append) {
361			dTHX;
362			secret_buffer_parse_rw dst;
363	101		secret_buffer *dst_sbuf= NULL;
364			SSize_t need_bytes;
365	101		bool dst_wide= false;
366
367	101		Zero(&dst, 1, secret_buffer_parse_rw);
368			// Encoding may be -1 to indicate the user didn't specify, in which case we use the
369			// same encoding as the source, unless the destination is a perl scalar (handled below)
370	101	100	dst.encoding= encoding >= 0? encoding : src->encoding;
371	101	100	if (sv_isobject(dst_sv)) {
372			// if object, must be a SecretBuffer
373	27		dst_sbuf= secret_buffer_from_magic(dst_sv, SECRET_BUFFER_MAGIC_OR_DIE);
374			}
375			else {
376			// Going to overwrite the scalar, or if it's a scalar-ref, overwrite that.
377	74	50	if (SvROK(dst_sv) && !sv_isobject(dst_sv) && SvTYPE(SvRV(dst_sv)) <= SVt_PVMG)
		0
		0
378	0		dst_sv= SvRV(dst_sv);
379			// Refuse to overwrite any other kind of ref
380	74	50	if (SvTYPE(dst_sv) > SVt_PVMG \|\| SvROK(dst_sv)) {
		50
381	0		src->error= "Can only copy_to scalars or scalar-refs";
382	0		return false;
383			}
384			// If the source encoding is a type of unicode, and the destination encoding is not
385			// specified, then write wide characters (utf-8) to the perl scalar and flag it as utf8
386	74	100	if (encoding < 0 && SECRET_BUFFER_ENCODING_IS_UNICODE(src->encoding)) {
		100
		100
		100
		50
387	66		dst.encoding= SECRET_BUFFER_ENCODING_UTF8;
388	66		dst_wide= true;
389			}
390			}
391			// Determine how many bytes we need
392	101		need_bytes= secret_buffer_sizeof_transcode(src, dst.encoding);
393	101	50	if (need_bytes < 0)
394	0		return false;
395			// Prepare the buffers for that many bytes
396	101	100	if (dst_sbuf) {
397			// For destination SecretBuffer, set length to 0 unless appending, then
398			// ensure enough allocated space for need_bytes, then transcode and update
399			// the length in the block below.
400	27	100	if (!append)
401	20		secret_buffer_set_len(dst_sbuf, 0); /* clears secrets */
402	27		secret_buffer_alloc_at_least(dst_sbuf, dst_sbuf->len + need_bytes);
403	27		dst.pos= (U8*) dst_sbuf->data + dst_sbuf->len;
404	27		dst.lim= dst.pos + need_bytes;
405			}
406			else {
407			// For destination SV, set length to 0 unless appending, then force it to
408			// be bytes or utf-8, then grow it to ensure room for additional `need_bytes`.
409			U8* ptr;
410			STRLEN len;
411			// If overwriting, set the length to 0 before forcing to bytes or utf8
412	74	100	if (!append)
413	72		sv_setpvn(dst_sv, "", 0);
414			// force it to the type required
415	74	100	if (dst_wide) SvPVutf8(dst_sv, len);
416	8		else SvPVbyte(dst_sv, len);
417			// grow it to the required length, for writing
418	74	100	sv_grow(dst_sv, (append? len : 0) + need_bytes + 1);
419	74		ptr= (U8*) SvPVX_mutable(dst_sv) + len;
420			// don't forget the NUL terminator
421	74		ptr[need_bytes]= '\0';
422	74		dst.pos= ptr;
423	74		dst.lim= dst.pos + need_bytes;
424			}
425	101	50	if (!secret_buffer_transcode(src, &dst)) {
426	0	0	if (!src->error) src->error= dst.error;
427	0		return false;
428			}
429			/* update the lengths */
430	101	100	if (dst_sbuf) {
431	27		dst_sbuf->len += need_bytes;
432			}
433			else {
434	74		SvCUR_set(dst_sv, SvCUR(dst_sv) + need_bytes);
435	74	50	SvSETMAGIC(dst_sv);
436			}
437	101		return true;
438			}
439
440			/* Append DER length octets (ASN.1 Length field, definite form only).
441			*
442			* DER rules:
443			* - If len <= 127: single byte [0x00..0x7F]
444			* - Else: first byte is 0x80 \| N, where N is # of following length bytes (big-endian),
445			* and the length must be encoded in the minimal number of bytes (no leading 0x00).
446			*
447			* This function encodes ONLY the length field (not tag/value).
448			*/
449			void
450	384		secret_buffer_append_uv_asn1_der_length(secret_buffer *buf, UV val) {
451			dTHX;
452	384		int enc_len = 1;
453			U8 *pos;
454	384	100	if (val > 127) {
455			/* Determine minimal number of bytes needed to represent len in base-256. */
456	339		UV tmp = val;
457	2001	100	while (tmp) {
458	1662		enc_len++;
459	1662		tmp >>= 8;
460			}
461			}
462			/* In BER/DER, the long-form initial octet has 7 bits of length-of-length.
463			* 0x80 is indefinite length (forbidden in DER), 0xFF would mean 127 length bytes.
464			* With 64-bit UV enc_len will never exceed 9.
465			*/
466	384	50	ASSUME(enc_len < 127);
467	384		secret_buffer_set_len(buf, buf->len + enc_len);
468	384		pos= (U8*) buf->data + buf->len - 1;
469	384	100	if (val <= 127) {
470	45		*pos = (U8) val;
471			} else {
472	339		UV tmp = val;
473			/* Write the length big-endian into enc[1..n]. */
474	2001	100	while (tmp) {
475	1662		*pos-- = (U8)(tmp & 0xFF);
476	1662		tmp >>= 8;
477			}
478	339		*pos= (U8) (0x80 \| (U8)(enc_len-1));
479			}
480	384		}
481
482			/* Parse ASN.1 DER Length (definite form only) */
483			bool
484	384		secret_buffer_parse_uv_asn1_der_length(secret_buffer_parse parse, UV out) {
485			/* Work on a local cursor so we can roll back on failure */
486	384		const U8 *pos = parse->pos;
487	384		const U8 *lim = parse->lim;
488			UV result;
489
490	384	50	if (pos >= lim) {
491	0		parse->error = "unexpected end of buffer";
492	0		return false;
493			}
494
495	384		result = *pos++;
496
497			/* If 0..127, the byte is the length value itself, otherwise it is the number of octets
498			* to read following that byte. */
499	384	100	if ((result & 0x80)) {
500	339		int n = result & 0x7F;
501			/* 0x80 means indefinite length (BER/CER), forbidden in DER */
502	339	50	if (n == 0) {
503	0		parse->error = "ASN.1 DER indefinite length not allowed";
504	0		return false;
505			}
506			/* Number of octets should be smallest possible encoding, so if it is larger than sizeof(UV)
507			* don't even bother trying to decode it.
508			*/
509	339	50	if (n > sizeof(UV)) {
510	0		parse->error = "ASN.1 DER length too large for perl UV";
511	0		return false;
512			}
513			/* ensure we have that many bytes */
514	339	50	if ((size_t)(lim - pos) < (size_t)n) {
515	0		parse->error = "unexpected end of buffer";
516	0		return false;
517			}
518			/* DER minimal encoding rules:
519			* - no leading 0x00 in the length octets
520			* - long form must not be used for lengths <= 127
521			*/
522	339		lim= pos + n;
523	339		result= *pos++;
524	339	50	if (!result) {
525	0		parse->error = "ASN.1 DER length has leading zero (non-minimal)";
526	0		return false;
527			}
528			/* Parse remaining bytes of big-endian unsigned integer */
529	1662	100	while (pos < lim)
530	1323		result= (result << 8) \| *pos++;
531			/* DER should not use 1-byte encoding if it would have fit in the initial byte */
532	339	50	if (result < 0x80) {
533	0		parse->error = "ASN.1 DER length should use short form (non-minimal)";
534	0		return false;
535			}
536			}
537	384	50	if (out) *out = result;
538	384		parse->pos = pos;
539	384		parse->error = NULL;
540	384		return true;
541			}
542
543			/* Append canonical unsigned Base128, Little-Endian
544			*
545			* Rules:
546			* - 7 data bits per byte, little-endian (least significant group first)
547			* - High bit 0x80 set on all bytes except the final byte
548			* - Canonical/minimal: stop as soon as remaining value is 0
549			*/
550			void
551	384		secret_buffer_append_uv_base128le(secret_buffer *buf, UV val) {
552			dTHX;
553			U8 *pos;
554	384		int enc_len= 1;
555	384		UV tmp= val >> 7;
556	1923	100	while (tmp) {
557	1539		enc_len++;
558	1539		tmp >>= 7;
559			}
560	384		secret_buffer_set_len(buf, buf->len + enc_len);
561	384		pos= (U8*) buf->data + buf->len - enc_len;
562			/* Encode */
563	384		tmp= val;
564			do {
565	1923		U8 byte = (U8)(tmp & 0x7F);
566	1923		tmp >>= 7;
567	1923	100	if (tmp)
568	1539		byte \|= 0x80;
569	1923		*pos++ = byte;
570	1923	100	} while (tmp);
571	384	50	ASSUME(pos == (U8*)(buf->data + buf->len));
572	384		}
573
574			/* Parse Unsigned LittleEndian Base128 (also requiring canonical / minimal encoding) */
575			bool
576	384		secret_buffer_parse_uv_base128le(secret_buffer_parse parse, UV out) {
577	384		const U8 *pos = parse->pos;
578	384		const U8 *lim = parse->lim;
579	384		UV result= 0, payload;
580	384		int shift= 7;
581
582	384	50	if (pos >= lim) {
583	0		parse->error = "unexpected end of buffer";
584	0		return false;
585			}
586	384		result= payload= *pos & 0x7F;
587			/* Scan forward looking for the first byte without the continuation flag */
588	1923	100	while (*pos++ & 0x80) {
589	1539	50	if (pos >= lim) {
590	0		parse->error = "unexpected end of buffer";
591	0		return false;
592			}
593	1539		payload= *pos & 0x7F;
594	1539	100	if (shift > sizeof(UV)*8 - 7) {
595			/* Do any of the bits overflow? Is the continuation flag set? */
596	3	50	if (shift >= sizeof(UV)8 \|\| (payload >> (sizeof(UV)8 - shift))) {
		50
597	0		parse->error = "Base128-LE value overflows perl UV";
598	0		return false;
599			}
600			}
601	1539		result \|= payload << shift;
602	1539		shift += 7;
603			}
604			/* check if the high bits were all zero, meaning an unnecessary byte was encoded */
605	384	100	if (!payload && result != 0) {
		50
606	0		parse->error = "Over-long encoding of Base128-LE";
607	0		return false;
608			}
609	384	50	if (out) *out = result;
610	384		parse->pos = pos;
611	384		parse->error = NULL;
612	384		return true;
613			}
614
615			/* Append canonical unsigned Base128, Big-Endian
616			*
617			* Rules:
618			* - 7 data bits per byte, big-endian (most significant group first)
619			* - High bit 0x80 set on all bytes except the final byte
620			* - Canonical/minimal: stop as soon as remaining value is 0
621			*/
622			void
623	387		secret_buffer_append_uv_base128be(secret_buffer *buf, UV val) {
624			dTHX;
625			U8 *pos;
626	387		int enc_len= 1, shift;
627	387		UV tmp= val >> 7;
628	1926	100	while (tmp) {
629	1539		enc_len++;
630	1539		tmp >>= 7;
631			}
632	387		secret_buffer_set_len(buf, buf->len + enc_len);
633	387		pos= (U8*) buf->data + buf->len - enc_len;
634			/* Encode */
635	2313	100	for (shift= (enc_len-1) * 7; shift >= 0; shift -= 7) {
636	1926		U8 byte = (U8)((val >> shift) & 0x7F);
637	1926	100	if (shift)
638	1539		byte \|= 0x80;
639	1926		*pos++ = byte;
640			}
641	387	50	ASSUME(pos == (U8*)(buf->data + buf->len));
642	387		}
643
644			/* Parse Unsigned BigEndian Base128 (also requiring canonical / minimal encoding) */
645			bool
646	395		secret_buffer_parse_uv_base128be(secret_buffer_parse parse, UV out) {
647	395		const U8 *pos = parse->pos;
648	395		const U8 *lim = parse->lim;
649	395		UV result= 0;
650
651	395	50	if (pos >= lim) {
652	0		parse->error = "unexpected end of buffer";
653	0		return false;
654			}
655			/* high-bit payload == 0 with continue bit set is an error. */
656	395	50	if (*pos == 0x80) {
657	0		parse->error = "Over-long encoding of Base128-BE";
658	0		return false;
659			}
660	395		result= *pos & 0x7F;
661	1934	100	while (*pos++ & 0x80) {
662			/* Will existing bits overflow UV when shifted? */
663	1539	50	if (result >> (sizeof(UV)*8 - 7)) {
664	0		parse->error = "Base128-BE value overflows perl UV";
665	0		return false;
666			}
667	1539	50	if (pos >= lim) {
668	0		parse->error = "unexpected end of buffer";
669	0		return false;
670			}
671	1539		result= (result << 7) \| (*pos & 0x7F);
672			}
673	395	50	if (out) *out = result;
674	395		parse->pos = pos;
675	395		parse->error = NULL;
676	395		return true;
677			}
678
679			/* Private API -------------------------------------------------------------*/
680
681			/* Scan raw bytes using only the bitmap */
682	358		static bool sb_parse_match_charset_bytes(
683			secret_buffer_parse *parse,
684			const secret_buffer_charset *cset,
685			int flags
686			) {
687	358		bool negate= 0 != (flags & SECRET_BUFFER_MATCH_NEGATE);
688	358		bool reverse= 0 != (flags & SECRET_BUFFER_MATCH_REVERSE);
689	358	100	bool multi= 0 != (flags & SECRET_BUFFER_MATCH_MULTI) \|\| cset->match_multi;
		100
690	358		bool anchored= 0 != (flags & SECRET_BUFFER_MATCH_ANCHORED);
691	358		bool consttime=0 != (flags & SECRET_BUFFER_MATCH_CONST_TIME);
692	358	100	int step= reverse? -1 : 1;
693	358	100	const U8 *pos= reverse? parse->lim-1 : parse->pos,
694	358	100	*lim= reverse? parse->pos-1 : parse->lim,
695	358		*span_start= NULL;
696			//warn("scan_charset_bytes pos=%p lim=%p len=%d", parse->pos, parse->lim, (int)(parse->lim - parse->pos));
697
698	1224	100	while (pos != lim) {
699	1166	100	if (sbc_bitmap_test(cset->bitmap, *pos) != negate) {
700			// Found. Now are we looking for a span?
701	250	100	if (span_start)
702	105		break;
703	145		span_start= pos;
704	145	100	if (!multi) {
705	39		pos += step;
706	39		break;
707			}
708	106		negate= !negate;
709	916	100	} else if (anchored && !span_start)
		100
710	156		break;
711	866		pos += step;
712			}
713			/* If constant time operation is requested, we need to perform one sbc_bitmap_test
714			* for every character in the span, and make sure the compiler doesn't eliminate it.
715			*/
716	358	50	if (consttime) {
717	0		volatile bool sink= false;
718	0	0	while (pos != lim) {
719	0		sink ^= sbc_bitmap_test(cset->bitmap, *pos);
720	0		pos += step;
721			}
722	0		(void) sink;
723			}
724			// reached end of defined range, and implicitly ends span
725	358	100	if (reverse) {
726	86		parse->pos= pos + 1;
727	86	100	parse->lim= span_start? span_start + 1 : parse->pos;
728			} else {
729	272		parse->lim= pos;
730	272	100	parse->pos= span_start? span_start : parse->lim;
731			}
732	358		return span_start != NULL;
733			}
734
735	203		static bool sb_parse_match_charset_codepoints(
736			secret_buffer_parse *parse,
737			const secret_buffer_charset *cset,
738			int flags
739			) {
740			dTHX;
741	203		bool negate= 0 != (flags & SECRET_BUFFER_MATCH_NEGATE);
742	203		bool reverse= 0 != (flags & SECRET_BUFFER_MATCH_REVERSE);
743	203	50	bool multi= 0 != (flags & SECRET_BUFFER_MATCH_MULTI) \|\| cset->match_multi;
		100
744	203		bool anchored= 0 != (flags & SECRET_BUFFER_MATCH_ANCHORED);
745	203		bool consttime=0 != (flags & SECRET_BUFFER_MATCH_CONST_TIME);
746	203		bool span_started= false;
747	203		bool encoding_error= false;
748	203	100	const U8 span_mark= NULL, prev_mark= reverse? parse->lim : parse->pos;
749
750	231	50	while (parse->pos < parse->lim) {
751	19		int codepoint= reverse? sb_parse_prev_codepoint(parse)
752	231	100	: sb_parse_next_codepoint(parse);
753			// warn("parse.pos=%p parse.lim=%p parse.enc=%d cp=%d parse.err=%s", parse->pos, parse->lim, parse->encoding, codepoint, parse->error);
754	231	100	if (codepoint < 0) {// encoding error
755	7		encoding_error= true;
756	7		break;
757			}
758	224	100	if (sbc_test_codepoint(aTHX_ cset, codepoint) != negate) {
759			// Found. Mark boundaries of char.
760			// Now are we looking for a span?
761	198	100	if (span_started)
762	2		break;
763	196		span_started= true;
764	196		span_mark= prev_mark;
765	196		negate= !negate;
766	196	100	if (!multi) {
767	194	100	prev_mark= reverse? parse->lim : parse->pos;
768	194		break;
769			}
770	26	50	} else if (anchored && !span_started)
		0
771	0		break;
772	28	100	prev_mark= reverse? parse->lim : parse->pos;
773			}
774			/* If constant time operation is requested, we need to perform one sbc_test_codepoint
775			* for every character in the span, and make sure the compiler doesn't eliminate it.
776			*/
777	203	50	if (consttime) {
778	0		volatile bool sink= false;
779	0	0	while (parse->pos < parse->lim) {
780	0		int codepoint= reverse? sb_parse_prev_codepoint(parse)
781	0	0	: sb_parse_next_codepoint(parse);
782			// warn("parse.pos=%p parse.lim=%p parse.enc=%d cp=%d parse.err=%s", parse->pos, parse->lim, parse->encoding, codepoint, parse->error);
783	0	0	if (codepoint < 0) { // encoding error
784	0		encoding_error= true;
785	0		sink ^= sbc_test_codepoint(aTHX_ cset, 0);
786			}
787			else
788	0		sink ^= sbc_test_codepoint(aTHX_ cset, codepoint);
789			}
790	0		(void) sink;
791			}
792	203	100	if (encoding_error)
793	7		return false;
794			// reached end of defined range
795	196	50	if (span_started) { // and implicitly ends span
796	196	100	if (reverse) {
797	5		parse->pos= prev_mark;
798	5		parse->lim= span_mark;
799			}
800			else {
801	191		parse->pos= span_mark;
802	191		parse->lim= prev_mark;
803			}
804	196		return true;
805			}
806	0		return false;
807			}
808
809	20		int sb_parse_codepointcmp(secret_buffer_parse lhs, secret_buffer_parse rhs) {
810			I32 lhs_cp, rhs_cp;
811	20		volatile int ret= 0;
812			/* constant-time iteration per the shorter of the two strings */
813	111	100	while (lhs->pos < lhs->lim && rhs->pos < rhs->lim) {
		50
814	91		lhs_cp= sb_parse_next_codepoint(lhs);
815	91	50	if (lhs_cp < 0)
816	0		croak("Encoding error in left-hand buffer");
817	91		rhs_cp= sb_parse_next_codepoint(rhs);
818	91	50	if (rhs_cp < 0)
819	0		croak("Encoding error in right-hand buffer");
820	91	100	if (lhs_cp != rhs_cp && !ret)
		50
821	2	50	ret= lhs_cp < rhs_cp? -1 : 1;
822			}
823	20		return ret? ret
824	38	100	: (lhs->pos < lhs->lim)? 1 /* right string shorter than left */
825	36	50	: (rhs->pos < rhs->lim)? -1 /* left string shorter than right */
826	18	50	: 0;
827			}
828
829			/* UTF-8 decoding helper */
830	3607		static int sb_parse_next_codepoint(secret_buffer_parse *parse) {
831	3607		const U8 pos= parse->pos, lim= parse->lim;
832	3607		int cp, encoding= parse->encoding;
833			#define SB_RETURN_ERROR(msg) { parse->error= msg; return -1; }
834
835	3607	100	if (encoding == SECRET_BUFFER_ENCODING_ASCII
836	3606	100	\|\| encoding == SECRET_BUFFER_ENCODING_ISO8859_1
837	815	100	\|\| encoding == SECRET_BUFFER_ENCODING_UTF8
838			) {
839	3365	50	if (lim - pos < 1)
840	0		SB_RETURN_ERROR("end of span")
841	3365		cp= *pos++;
842	3365	100	if (cp >= 0x80 && encoding == SECRET_BUFFER_ENCODING_ASCII)
		100
843	1		SB_RETURN_ERROR("not 7-bit ASCII")
844	3364	100	else if (cp >= 0x80 && encoding == SECRET_BUFFER_ENCODING_UTF8) {
		100
845	73		int min_cp= 0;
846	73		switch ((cp >> 3) & 0xF) {
847	13		case 14: // 0b1[1110]yyy
848	13	50	{ if (lim - pos < 3) goto incomplete;
849	13		min_cp= 0x10000;
850	13		cp &= 0x07;
851			}
852	13	50	if ((*pos & 0xC0) != 0x80) goto invalid;
853	13		cp= (cp << 6) \| (*pos++ & 0x3F);
854			if (0)
855			case 12: case 13: // 0b1[110x]yyy
856	24	100	{ if (lim - pos < 2) goto incomplete;
857	20		min_cp= 0x800;
858	20		cp &= 0x0F;
859			}
860	33	50	if ((*pos & 0xC0) != 0x80) goto invalid;
861	33		cp= (cp << 6) \| (*pos++ & 0x3F);
862			if (0)
863			case 8: case 9: case 10: case 11: // 0b1[10xx]yyy
864	36	100	{ if (lim - pos < 1) goto incomplete;
865	34		min_cp= 0x80;
866	34		cp &= 0x1F;
867			}
868	67	50	if ((*pos & 0xC0) != 0x80) goto invalid;
869	67		cp= (cp << 6) \| (*pos++ & 0x3F);
870	67		break;
871			default:
872	0		invalid: SB_RETURN_ERROR("invalid UTF8 encoding")
873	6		incomplete: SB_RETURN_ERROR("incomplete UTF8 encoding")
874			}
875	67	50	if (cp < min_cp)
876	0		SB_RETURN_ERROR("overlong encoding of UTF8 character")
877	67	50	else if (cp > 0x10FFFF)
878	0		SB_RETURN_ERROR("UTF8 character exceeds max")
879			}
880			// else all ISO-8859-1 bytes are valid codepoints
881			}
882	242	100	else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
883	221	100	\|\| encoding == SECRET_BUFFER_ENCODING_UTF16BE
884	36		) {
885	36		int low= encoding == SECRET_BUFFER_ENCODING_UTF16LE? 0 : 1;
886	36	50	if (lim - pos < 2)
887	0		SB_RETURN_ERROR("end of span")
888	36		cp= pos[low] \| ((int)pos[low^1] << 8);
889	36		pos += 2;
890	36	100	if (cp >= 0xD800 && cp <= 0xDFFF) {
		50
891	10	50	if (lim - pos < 2)
892	0		SB_RETURN_ERROR("incomplete UTF16 character")
893	10		int w2= pos[low] \| ((int)pos[low^1] << 8);
894	10		pos += 2;
895	10	50	if (w2 < 0xDC00 \|\| w2 > 0xDFFF)
		50
896	0		SB_RETURN_ERROR("invalid UTF16 low surrogate")
897	10		cp = 0x10000 + (((cp & 0x3FF) << 10) \| (w2 & 0x3FF));
898			}
899			}
900	206	100	else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
901			// Skip over whitespace
902	38	50	while (pos < lim && isspace(*pos))
		50
903	0		pos++;
904	38	50	if (lim - pos < 2)
905	0		SB_RETURN_ERROR("end of span")
906	38		int high= *pos++ - '0';
907	38		int low= *pos++ - '0';
908	38	50	if (low >= ('a'-'0')) low -= ('a'-'0'-10);
909	38	100	else if (low >= ('A'-'0')) low -= ('A'-'0'-10);
910	38	50	if (high >= ('a'-'0')) high -= ('a'-'0'-10);
911	38	100	else if (high >= ('A'-'0')) high -= ('A'-'0'-10);
912	38	50	if ((low >> 4) \| (high >> 4))
913	0		SB_RETURN_ERROR("not a pair of hex digits")
914	38		cp= (high << 4) \| low;
915			// skip over whitespace if it takes us to the end of buffer so that caller
916			// knows it's EOF before trying another decode.
917	38	100	while (pos < lim && isspace(*pos))
		50
918	0		pos++;
919			}
920	168	50	else if (encoding == SECRET_BUFFER_ENCODING_BASE64) {
921			// Skip over whitespace and control chars
922	168	50	while (pos < lim && *pos <= ' ')
		50
923	0		pos++;
924			// There need to be at least 2 base64 characters left
925	168	50	if (pos < lim) {
926	168	50	if (base64_decode_table[*pos] < 0)
927	0		SB_RETURN_ERROR("invalid base64 character");
928			// ->pos_bit > 0 means pointer is pointing at a sub-bit of the base64
929			// character at *pos (and possible values are 0, 2, or 4)
930	168		cp= (((int)base64_decode_table[*pos++]) << (2 + parse->pos_bit)) & 0xFF;
931	168	50	while (pos < lim && *pos <= ' ')
		50
932	0		pos++;
933			}
934	168	50	if (pos >= lim) {
935	0		parse->pos_bit= 0;
936	0		SB_RETURN_ERROR("end of span")
937			}
938	168	50	if (base64_decode_table[*pos] < 0)
939	0		SB_RETURN_ERROR("invalid base64 character");
940	168		cp \|= base64_decode_table[*pos] >> (4-parse->pos_bit);
941	168		parse->pos_bit += 2;
942			// If pos_bit == 6 we've completed a set of 4 b64 chars and fully consumed them.
943	168	100	if (parse->pos_bit >= 6) {
944	51		pos++;
945	51		parse->pos_bit= 0;
946			// consume trailing whitespace
947	55	100	while (pos < lim && *pos <= ' ')
		100
948	4		pos++;
949			}
950			else {
951			// if next char is '=', terminate the decoding
952	117		const U8 *next= pos+1;
953	117	50	while (next < lim && *next <= ' ')
		50
954	0		next++;
955	117	50	if (next < lim && *next == '=') {
		100
956	13		pos= lim; // indicate parsing complete
957	13		parse->pos_bit= 0;
958			}
959			}
960			}
961	0	0	else if (encoding == SECRET_BUFFER_ENCODING_I32) {
962	0	0	if (lim - pos < 4)
963	0		SB_RETURN_ERROR("end of span");
964	0		cp= (I32)pos;
965	0		pos+= 4;
966			}
967	0		else SB_RETURN_ERROR("unsupported encoding")
968	3600		parse->pos= pos;
969	3600		return cp;
970			#undef SB_RETURN_ERROR
971			}
972
973	850		static int sb_parse_prev_codepoint(secret_buffer_parse *parse) {
974	850		const U8 pos= parse->pos, lim= parse->lim;
975	850		int encoding= parse->encoding;
976			int cp;
977			#define SB_RETURN_ERROR(msg) { parse->error= msg; return -1; }
978
979	850	50	if (encoding == SECRET_BUFFER_ENCODING_ASCII
980	850	100	\|\| encoding == SECRET_BUFFER_ENCODING_ISO8859_1
981	25	100	\|\| encoding == SECRET_BUFFER_ENCODING_UTF8
982			) {
983	842	50	if (lim <= pos)
984	0		SB_RETURN_ERROR("end of span")
985	842		cp= *--lim;
986			// handle the simple case first
987	842	100	if (cp >= 0x80 && encoding != SECRET_BUFFER_ENCODING_ISO8859_1) {
		50
988			// Strict ASCII can't encode above 0x7F
989	4	50	if (encoding == SECRET_BUFFER_ENCODING_ASCII)
990	0		SB_RETURN_ERROR("not 7-bit ASCII")
991			// else need to backtrack and then call next_codepoint
992	4		const U8 *start= lim;
993	12	50	while (start >= pos && (*start & 0xC0) == 0x80)
		100
994	8		--start;
995	4		parse->pos= start;
996	4		cp= sb_parse_next_codepoint(parse);
997	4	50	if (parse->pos != parse->lim) {// consumed all characters we gave it?
998	0		parse->pos= pos; // restore original pos
999	0	0	if (cp >= 0) // had a valid char, but extra 0x80 bytes
1000	0		parse->error= "invalid UTF8 character";
1001			// else use the error message from next_codepoint
1002	0		return -1;
1003			}
1004	4		parse->pos= pos; // restore original pos
1005	4		lim= start; // new lim is where we started the parse from
1006			}
1007			}
1008	8	50	else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1009	8	100	\|\| encoding == SECRET_BUFFER_ENCODING_UTF16BE
1010	1		) {
1011	1	50	if (lim - pos < 2)
1012	0		SB_RETURN_ERROR("end of span");
1013			// handle the simple case first
1014	1		lim -= 2;
1015	1		int low= (encoding == SECRET_BUFFER_ENCODING_UTF16LE)? 0 : 1;
1016	1		cp= lim[low] \| ((int)lim[low^1] << 8);
1017	1	50	if (cp >= 0xD800 && cp <= 0xDFFF) {
		50
1018	1	50	if (lim - pos < 4)
1019	0		SB_RETURN_ERROR("end of span");
1020	1		lim -= 2;
1021	1		int w1= lim[low] \| ((int)lim[low^1] << 8);
1022	1	50	if (w1 < 0xD800 \|\| w1 > 0xDFFF \|\| cp < 0xDC00)
		50
		50
1023	0		SB_RETURN_ERROR("invalid UTF16 surrogate");
1024	1		cp = 0x10000 + (((w1 & 0x3FF) << 10) \| (cp & 0x3FF));
1025			}
1026			}
1027	7	100	else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
1028			// Skip over whitespace
1029	1	50	while (pos < lim && isspace(lim[-1]))
		50
1030	0		lim--;
1031	1	50	if (lim - pos < 2)
1032	0	0	SB_RETURN_ERROR((pos == lim? "end of span" : "incomplete hex pair at end of span"))
1033	1		int low= *--lim - '0';
1034	1		int high= *--lim - '0';
1035	1	50	if (low >= ('a'-'0')) low -= ('a'-'0'-10);
1036	0	0	else if (low >= ('A'-'0')) low -= ('A'-'0'-10);
1037	1	50	if (high >= ('a'-'0')) high -= ('a'-'0'-10);
1038	0	0	else if (high >= ('A'-'0')) high -= ('A'-'0'-10);
1039	1	50	if ((low >> 4) \| (high >> 4))
1040	0		SB_RETURN_ERROR("not a pair of hex digits")
1041	1		cp= (high << 4) \| low;
1042			}
1043	6	50	else if (encoding == SECRET_BUFFER_ENCODING_BASE64) {
1044			bool again;
1045			do {
1046	9		again= false;
1047			// Skip over non-base64 chars
1048	12	50	while (pos < lim && base64_decode_table[lim[-1]] < 0)
		100
1049	3		lim--;
1050	9	50	if (pos < lim) {
1051			//warn("lim-pos=%d, lim[-1]=%c, lim_bit=%d", (int)(lim-pos), lim[-1], parse->lim_bit);
1052	9	50	if (base64_decode_table[lim[-1]] < 0)
1053	0		SB_RETURN_ERROR("invalid base64 character");
1054			// ->lim_bit > 0 means the character lim[-1] is partially consumed.
1055			// (sequence is 0, 2, 4, 0)
1056	9		cp= ((int)base64_decode_table[lim[-1]]) >> parse->lim_bit;
1057			// parsing an equal sign means 'cp' is bogus and need to go again
1058	9	100	if (lim[-1] == '=')
1059	3		again= true;
1060	9		--lim;
1061			// find next base64 char
1062	9	50	while (pos < lim && base64_decode_table[lim[-1]] < 0)
		50
1063	0		lim--;
1064			}
1065	9	50	if (pos >= lim) {
1066	0		parse->lim_bit= 0;
1067	0		SB_RETURN_ERROR("end of span")
1068			}
1069	9	50	if (base64_decode_table[lim[-1]] < 0)
1070	0		SB_RETURN_ERROR("invalid base64 character");
1071			//warn(" lim-pos=%d, lim[-1]=%c, lim_bit=%d", (int)(lim-pos), lim[-1], parse->lim_bit);
1072	9		cp \|= (((int)base64_decode_table[lim[-1]]) << (6 - parse->lim_bit)) & 0xFF;
1073	9		parse->lim_bit += 2;
1074	9	100	if (parse->lim_bit >= 6) {
1075	3		parse->lim_bit= 0;
1076			// If completed a set of 4 b64 chars, lim[-1] is consumed, and need to
1077			// walk backward to find next base64 char
1078	3		--lim;
1079	3	50	while (pos < lim && base64_decode_table[lim[-1]] < 0)
		0
1080	0		lim--;
1081			}
1082			//warn(" cp=%d, lim-pos=%d, lim_bit=%d", cp, (int)(lim-pos), parse->lim_bit);
1083	9	100	} while (again);
1084			}
1085	0	0	else if (encoding == SECRET_BUFFER_ENCODING_I32) {
1086	0	0	if (lim - pos < 4)
1087	0		SB_RETURN_ERROR("end of span");
1088	0		lim -= 4;
1089	0		cp= (I32)lim;
1090			}
1091	0		else SB_RETURN_ERROR("unsupported encoding")
1092	850		parse->lim= lim;
1093	850		return cp;
1094			#undef SB_RETURN_ERROR
1095			}
1096
1097	1202		static int sizeof_codepoint_encoding(int codepoint, int encoding) {
1098	1202	50	if (encoding == SECRET_BUFFER_ENCODING_ASCII)
1099	0	0	return codepoint < 0x80? 1 : -1;
1100	1202	100	if (encoding == SECRET_BUFFER_ENCODING_ISO8859_1)
1101	110	50	return codepoint < 0x100? 1 : -1;
1102	1092	100	else if (encoding == SECRET_BUFFER_ENCODING_UTF8)
1103	736	100	return codepoint < 0x80? 1 : codepoint < 0x800? 2 : codepoint < 0x10000? 3 : 4;
		100
		100
1104	356	50	else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1105	356	50	\|\| encoding == SECRET_BUFFER_ENCODING_UTF16BE)
1106	0	0	return codepoint >= 0xD800 && codepoint < 0xE000? -1
1107	0	0	: codepoint < 0x10000? 2 : 4;
		0
1108	356	100	else if (encoding == SECRET_BUFFER_ENCODING_HEX)
1109	6	50	return codepoint < 0x100? 2 : -1;
1110			/* Base64 would need to track an accumulator, so just return 1 and fix it in the caller */
1111	350	100	else if (encoding == SECRET_BUFFER_ENCODING_BASE64)
1112	78	50	return codepoint < 0x100? 1 : -1;
1113	272	50	else if (encoding == SECRET_BUFFER_ENCODING_I32)
1114	272		return 4;
1115			else
1116	0		return -1;
1117			}
1118
1119	444		static bool sb_parse_encode_codepoint(secret_buffer_parse_rw *dst, int codepoint) {
1120			#define SB_RETURN_ERROR(msg) { dst->error= msg; return false; }
1121	444		int encoding= dst->encoding, n;
1122	444		U8 *dst_pos= dst->pos;
1123			// codepoints above 0x10FFFF are illegal
1124	444	50	if (codepoint >= 0x110000)
1125	0		SB_RETURN_ERROR("invalid codepoint");
1126			// not quite as efficient as checking during the code below, but saves a bunch of redundancy
1127	444		n= sizeof_codepoint_encoding(codepoint, encoding);
1128	444	50	if (n < 0)
1129	0		SB_RETURN_ERROR("character too wide for encoding")
1130	444	50	if (dst->lim - dst_pos < n)
1131	0		SB_RETURN_ERROR("buffer too small")
1132	444		dst->pos += n;
1133
1134	444	50	if (encoding == SECRET_BUFFER_ENCODING_ASCII
1135	444	100	\|\| encoding == SECRET_BUFFER_ENCODING_ISO8859_1
1136	389	100	\|\| encoding == SECRET_BUFFER_ENCODING_UTF8
1137			) {
1138	423		switch ((n-1)&0x3) { // help the compiler understand there are only 4 possible values
1139	401		case 0: *dst_pos++ = (U8) codepoint;
1140	401		break;
1141	10		case 1: *dst_pos++ = (U8)(0xC0 \| (codepoint >> 6));
1142	10		*dst_pos++ = (U8)(0x80 \| (codepoint & 0x3F));
1143	10		break;
1144	4		case 2: *dst_pos++ = (U8)(0xE0 \| (codepoint >> 12));
1145	4		*dst_pos++ = (U8)(0x80 \| ((codepoint >> 6) & 0x3F));
1146	4		*dst_pos++ = (U8)(0x80 \| (codepoint & 0x3F));
1147	4		break;
1148	8		case 3: *dst_pos++ = (U8)(0xF0 \| (codepoint >> 18));
1149	8		*dst_pos++ = (U8)(0x80 \| ((codepoint >> 12) & 0x3F));
1150	8		*dst_pos++ = (U8)(0x80 \| ((codepoint >> 6) & 0x3F));
1151	8		*dst_pos++ = (U8)(0x80 \| (codepoint & 0x3F));
1152	8		break;
1153			}
1154			}
1155	21	50	else if (encoding == SECRET_BUFFER_ENCODING_UTF16LE
1156	21	50	\|\| encoding == SECRET_BUFFER_ENCODING_UTF16BE
1157	0		) {
1158	0		int low= (encoding == SECRET_BUFFER_ENCODING_UTF16LE)? 0 : 1;
1159	0	0	if (n == 2) {
1160	0		dst_pos[low] = (U8)(codepoint & 0xFF);
1161	0		dst_pos[low^1] = (U8)(codepoint >> 8);
1162			}
1163			else {
1164	0		int adjusted = codepoint - 0x10000;
1165	0		int w0 = 0xD800 \| (adjusted >> 10);
1166	0		int w1 = 0xDC00 \| (adjusted & 0x3FF);
1167	0		dst_pos[low] = (U8)(w0 & 0xFF);
1168	0		dst_pos[1^low] = (U8)(w0 >> 8);
1169	0		dst_pos[2^low] = (U8)(w1 & 0xFF);
1170	0		dst_pos[3^low] = (U8)(w1 >> 8);
1171			}
1172			}
1173	21	100	else if (encoding == SECRET_BUFFER_ENCODING_HEX) {
1174	3		dst_pos[0] = "0123456789ABCDEF"[(codepoint >> 4) & 0xF];
1175	3		dst_pos[1] = "0123456789ABCDEF"[codepoint & 0xF];
1176			}
1177	18	50	else if (encoding == SECRET_BUFFER_ENCODING_I32) {
1178	18		(I32)dst_pos = codepoint;
1179			}
1180			/* BASE64 is not handled here because the '=' padding can only be generated in
1181			* a context that knows when we are ending on a non-multiple-of-4. */
1182	0		else SB_RETURN_ERROR("unsupported encoding");
1183	444		return true;
1184			#undef SB_RETURN_ERROR
1185			}
1186
1187			#define SB_PARSE_MATCH_STR_FN sb_parse_match_str_U8
1188			#define SB_PATTERN_EL_TYPE const U8
1189			#include "secret_buffer_parse_match_str.c"
1190			#undef SB_PARSE_MATCH_STR_FN
1191			#undef SB_PATTERN_EL_TYPE
1192
1193			#define SB_PARSE_MATCH_STR_FN sb_parse_match_str_I32
1194			#define SB_PATTERN_EL_TYPE const I32
1195			#include "secret_buffer_parse_match_str.c"
1196			#undef SB_PARSE_MATCH_STR_FN
1197			#undef SB_PATTERN_EL_TYPE