File Coverage

src/inner.h

Criterion	Covered	Total	%
statement	0	20	0.0
branch			n/a
condition			n/a
subroutine			n/a
pod			n/a
total	0	20	0.0

line	stmt	code
1		/*
2		* Copyright (c) 2016 Thomas Pornin
3		*
4		* Permission is hereby granted, free of charge, to any person obtaining
5		* a copy of this software and associated documentation files (the
6		* "Software"), to deal in the Software without restriction, including
7		* without limitation the rights to use, copy, modify, merge, publish,
8		* distribute, sublicense, and/or sell copies of the Software, and to
9		* permit persons to whom the Software is furnished to do so, subject to
10		* the following conditions:
11		*
12		* The above copyright notice and this permission notice shall be
13		* included in all copies or substantial portions of the Software.
14		*
15		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16		* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17		* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18		* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19		* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20		* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21		* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22		* SOFTWARE.
23		*/
24
25		#ifndef INNER_H__
26		#define INNER_H__
27
28		#include <string.h>
29		#include <limits.h>
30
31		#include "config.h"
32		#include "bearssl.h"
33
34		/*
35		* On MSVC, disable the warning about applying unary minus on an
36		* unsigned type: it is standard, we do it all the time, and for
37		* good reasons.
38		*/
39		#if _MSC_VER
40		#pragma warning( disable : 4146 )
41		#endif
42
43		/*
44		* Maximum size for a RSA modulus (in bits). Allocated stack buffers
45		* depend on that size, so this value should be kept small. Currently,
46		* 2048-bit RSA keys offer adequate security, and should still do so for
47		* the next few decades; however, a number of widespread PKI have
48		* already set their root keys to RSA-4096, so we should be able to
49		* process such keys.
50		*
51		* This value MUST be a multiple of 64. This value MUST NOT exceed 47666
52		* (some computations in RSA key generation rely on the factor size being
53		* no more than 23833 bits). RSA key sizes beyond 3072 bits don't make a
54		* lot of sense anyway.
55		*/
56		#define BR_MAX_RSA_SIZE 4096
57
58		/*
59		* Minimum size for a RSA modulus (in bits); this value is used only to
60		* filter out invalid parameters for key pair generation. Normally,
61		* applications should not use RSA keys smaller than 2048 bits; but some
62		* specific cases might need shorter keys, for legacy or research
63		* purposes.
64		*/
65		#define BR_MIN_RSA_SIZE 512
66
67		/*
68		* Maximum size for a RSA factor (in bits). This is for RSA private-key
69		* operations. Default is to support factors up to a bit more than half
70		* the maximum modulus size.
71		*
72		* This value MUST be a multiple of 32.
73		*/
74		#define BR_MAX_RSA_FACTOR ((BR_MAX_RSA_SIZE + 64) >> 1)
75
76		/*
77		* Maximum size for an EC curve (modulus or order), in bits. Size of
78		* stack buffers depends on that parameter. This size MUST be a multiple
79		* of 8 (so that decoding an integer with that many bytes does not
80		* overflow).
81		*/
82		#define BR_MAX_EC_SIZE 528
83
84		/*
85		* Some macros to recognize the current architecture. Right now, we are
86		* interested in automatically recognizing architectures with efficient
87		* 64-bit types so that we may automatically use implementations that
88		* use 64-bit registers in that case. Future versions may detect, e.g.,
89		* availability of SSE2 intrinsics.
90		*
91		* If 'unsigned long' is a 64-bit type, then we assume that 64-bit types
92		* are efficient. Otherwise, we rely on macros that depend on compiler,
93		* OS and architecture. In any case, failure to detect the architecture
94		* as 64-bit means that the 32-bit code will be used, and that code
95		* works also on 64-bit architectures (the 64-bit code may simply be
96		* more efficient).
97		*
98		* The test on 'unsigned long' should already catch most cases, the one
99		* notable exception being Windows code where 'unsigned long' is kept to
100		* 32-bit for compatibility with all the legacy code that liberally uses
101		* the 'DWORD' type for 32-bit values.
102		*
103		* Macro names are taken from: http://nadeausoftware.com/articles/2012/02/c_c_tip_how_detect_processor_type_using_compiler_predefined_macros
104		*/
105		#ifndef BR_64
106		#if ((ULONG_MAX >> 31) >> 31) == 3
107		#define BR_64 1
108		#elif defined(__ia64) || defined(__itanium__) || defined(_M_IA64)
109		#define BR_64 1
110		#elif defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \
111		|| defined(__64BIT__) || defined(_LP64) || defined(__LP64__)
112		#define BR_64 1
113		#elif defined(__sparc64__)
114		#define BR_64 1
115		#elif defined(__x86_64__) || defined(_M_X64)
116		#define BR_64 1
117		#elif defined(__aarch64__) || defined(_M_ARM64)
118		#define BR_64 1
119		#elif defined(__mips64)
120		#define BR_64 1
121		#endif
122		#endif
123
124		/*
125		* Set BR_LOMUL on platforms where it makes sense.
126		*/
127		#ifndef BR_LOMUL
128		#if BR_ARMEL_CORTEXM_GCC
129		#define BR_LOMUL 1
130		#endif
131		#endif
132
133		/*
134		* Architecture detection.
135		*/
136		#ifndef BR_i386
137		#if __i386__ || _M_IX86
138		#define BR_i386 1
139		#endif
140		#endif
141
142		#ifndef BR_amd64
143		#if __x86_64__ || _M_X64
144		#define BR_amd64 1
145		#endif
146		#endif
147
148		/*
149		* Compiler brand and version.
150		*
151		* Implementations that use intrinsics need to detect the compiler type
152		* and version because some specific actions may be needed to activate
153		* the corresponding opcodes, both for header inclusion, and when using
154		* them in a function.
155		*
156		* BR_GCC, BR_CLANG and BR_MSC will be set to 1 for, respectively, GCC,
157		* Clang and MS Visual C. For each of them, sub-macros will be defined
158		* for versions; each sub-macro is set whenever the compiler version is
159		* at least as recent as the one corresponding to the macro.
160		*/
161
162		/*
163		* GCC thresholds are on versions 4.4 to 4.9 and 5.0.
164		*/
165		#ifndef BR_GCC
166		#if __GNUC__ && !__clang__
167		#define BR_GCC 1
168
169		#if __GNUC__ > 4
170		#define BR_GCC_5_0 1
171		#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
172		#define BR_GCC_4_9 1
173		#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 8
174		#define BR_GCC_4_8 1
175		#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 7
176		#define BR_GCC_4_7 1
177		#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 6
178		#define BR_GCC_4_6 1
179		#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 5
180		#define BR_GCC_4_5 1
181		#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 4
182		#define BR_GCC_4_4 1
183		#endif
184
185		#if BR_GCC_5_0
186		#define BR_GCC_4_9 1
187		#endif
188		#if BR_GCC_4_9
189		#define BR_GCC_4_8 1
190		#endif
191		#if BR_GCC_4_8
192		#define BR_GCC_4_7 1
193		#endif
194		#if BR_GCC_4_7
195		#define BR_GCC_4_6 1
196		#endif
197		#if BR_GCC_4_6
198		#define BR_GCC_4_5 1
199		#endif
200		#if BR_GCC_4_5
201		#define BR_GCC_4_4 1
202		#endif
203
204		#endif
205		#endif
206
207		/*
208		* Clang thresholds are on versions 3.7.0 and 3.8.0.
209		*/
210		#ifndef BR_CLANG
211		#if __clang__
212		#define BR_CLANG 1
213
214		#if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8)
215		#define BR_CLANG_3_8 1
216		#elif __clang_major__ == 3 && __clang_minor__ >= 7
217		#define BR_CLANG_3_7 1
218		#endif
219
220		#if BR_CLANG_3_8
221		#define BR_CLANG_3_7 1
222		#endif
223
224		#endif
225		#endif
226
227		/*
228		* MS Visual C thresholds are on Visual Studio 2005 to 2015.
229		*/
230		#ifndef BR_MSC
231		#if _MSC_VER
232		#define BR_MSC 1
233
234		#if _MSC_VER >= 1900
235		#define BR_MSC_2015 1
236		#elif _MSC_VER >= 1800
237		#define BR_MSC_2013 1
238		#elif _MSC_VER >= 1700
239		#define BR_MSC_2012 1
240		#elif _MSC_VER >= 1600
241		#define BR_MSC_2010 1
242		#elif _MSC_VER >= 1500
243		#define BR_MSC_2008 1
244		#elif _MSC_VER >= 1400
245		#define BR_MSC_2005 1
246		#endif
247
248		#if BR_MSC_2015
249		#define BR_MSC_2013 1
250		#endif
251		#if BR_MSC_2013
252		#define BR_MSC_2012 1
253		#endif
254		#if BR_MSC_2012
255		#define BR_MSC_2010 1
256		#endif
257		#if BR_MSC_2010
258		#define BR_MSC_2008 1
259		#endif
260		#if BR_MSC_2008
261		#define BR_MSC_2005 1
262		#endif
263
264		#endif
265		#endif
266
267		/*
268		* GCC 4.4+ and Clang 3.7+ allow tagging specific functions with a
269		* 'target' attribute that activates support for specific opcodes.
270		*/
271		#if BR_GCC_4_4 || BR_CLANG_3_7
272		#define BR_TARGET(x) __attribute__((target(x)))
273		#else
274		#define BR_TARGET(x)
275		#endif
276
277		/*
278		* AES-NI intrinsics are available on x86 (32-bit and 64-bit) with
279		* GCC 4.8+, Clang 3.7+ and MSC 2012+.
280		*/
281		#ifndef BR_AES_X86NI
282		#if (BR_i386 || BR_amd64) && (BR_GCC_4_8 || BR_CLANG_3_7 || BR_MSC_2012)
283		#define BR_AES_X86NI 1
284		#endif
285		#endif
286
287		/*
288		* SSE2 intrinsics are available on x86 (32-bit and 64-bit) with
289		* GCC 4.4+, Clang 3.7+ and MSC 2005+.
290		*/
291		#ifndef BR_SSE2
292		#if (BR_i386 || BR_amd64) && (BR_GCC_4_4 || BR_CLANG_3_7 || BR_MSC_2005)
293		#define BR_SSE2 1
294		#endif
295		#endif
296
297		/*
298		* RDRAND intrinsics are available on x86 (32-bit and 64-bit) with
299		* GCC 4.6+, Clang 3.7+ and MSC 2012+.
300		*/
301		#ifndef BR_RDRAND
302		#if (BR_i386 || BR_amd64) && (BR_GCC_4_6 || BR_CLANG_3_7 || BR_MSC_2012)
303		#define BR_RDRAND 1
304		#endif
305		#endif
306
307		/*
308		* Determine type of OS for random number generation. Macro names and
309		* values are documented on:
310		* https://sourceforge.net/p/predef/wiki/OperatingSystems/
311		*
312		* Win32's CryptGenRandom() should be available on Windows systems.
313		*
314		* /dev/urandom should work on all Unix-like systems (including macOS X).
315		*
316		* getentropy() is present on Linux (Glibc 2.25+), FreeBSD (12.0+) and
317		* OpenBSD (5.6+). For OpenBSD, there does not seem to be easy to use
318		* macros to test the minimum version, so we just assume that it is
319		* recent enough (last version without getentropy() has gone out of
320		* support in May 2015).
321		*
322		* Ideally we should use getentropy() on macOS (10.12+) too, but I don't
323		* know how to test the exact OS version with preprocessor macros.
324		*
325		* TODO: enrich the list of detected system.
326		*/
327
328		#ifndef BR_USE_URANDOM
329		#if defined _AIX \
330		|| defined __ANDROID__ \
331		|| defined __FreeBSD__ \
332		|| defined __NetBSD__ \
333		|| defined __OpenBSD__ \
334		|| defined __DragonFly__ \
335		|| defined __linux__ \
336		|| (defined __sun && (defined __SVR4 || defined __svr4__)) \
337		|| (defined __APPLE__ && defined __MACH__)
338		#define BR_USE_URANDOM 1
339		#endif
340		#endif
341
342		#ifndef BR_USE_GETENTROPY
343		#if (defined __linux__ \
344		&& (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \
345		|| (defined __FreeBSD__ && __FreeBSD__ >= 12) \
346		|| defined __OpenBSD__
347		#define BR_USE_GETENTROPY 1
348		#endif
349		#endif
350
351		#ifndef BR_USE_WIN32_RAND
352		#if defined _WIN32 || defined _WIN64
353		#define BR_USE_WIN32_RAND 1
354		#endif
355		#endif
356
357		/*
358		* POWER8 crypto support. We rely on compiler macros for the
359		* architecture, since we do not have a reliable, simple way to detect
360		* the required support at runtime (we could try running an opcode, and
361		* trapping the exception or signal on illegal instruction, but this
362		* induces some non-trivial OS dependencies that we would prefer to
363		* avoid if possible).
364		*/
365		#ifndef BR_POWER8
366		#if __GNUC__ && ((_ARCH_PWR8 || _ARCH_PPC) && __CRYPTO__)
367		#define BR_POWER8 1
368		#endif
369		#endif
370
371		/*
372		* Detect endianness on POWER8.
373		*/
374		#if BR_POWER8
375		#if defined BR_POWER8_LE
376		#undef BR_POWER8_BE
377		#if BR_POWER8_LE
378		#define BR_POWER8_BE 0
379		#else
380		#define BR_POWER8_BE 1
381		#endif
382		#elif defined BR_POWER8_BE
383		#undef BR_POWER8_LE
384		#if BR_POWER8_BE
385		#define BR_POWER8_LE 0
386		#else
387		#define BR_POWER8_LE 1
388		#endif
389		#else
390		#if __LITTLE_ENDIAN__
391		#define BR_POWER8_LE 1
392		#define BR_POWER8_BE 0
393		#else
394		#define BR_POWER8_LE 0
395		#define BR_POWER8_BE 1
396		#endif
397		#endif
398		#endif
399
400		/*
401		* Detect support for 128-bit integers.
402		*/
403		#if !defined BR_INT128 && !defined BR_UMUL128
404		#ifdef __SIZEOF_INT128__
405		#define BR_INT128 1
406		#elif _M_X64
407		#define BR_UMUL128 1
408		#endif
409		#endif
410
411		/*
412		* Detect support for unaligned accesses with known endianness.
413		*
414		* x86 (both 32-bit and 64-bit) is little-endian and allows unaligned
415		* accesses.
416		*
417		* POWER/PowerPC allows unaligned accesses when big-endian. POWER8 and
418		* later also allow unaligned accesses when little-endian.
419		*/
420		#if !defined BR_LE_UNALIGNED && !defined BR_BE_UNALIGNED
421
422		#if __i386 || __i386__ || __x86_64__ || _M_IX86 || _M_X64
423		#define BR_LE_UNALIGNED 1
424		#elif BR_POWER8_BE
425		#define BR_BE_UNALIGNED 1
426		#elif BR_POWER8_LE
427		#define BR_LE_UNALIGNED 1
428		#elif (__powerpc__ || __powerpc64__ || _M_PPC || _ARCH_PPC || _ARCH_PPC64) \
429		&& __BIG_ENDIAN__
430		#define BR_BE_UNALIGNED 1
431		#endif
432
433		#endif
434
435		/*
436		* Detect support for an OS-provided time source.
437		*/
438
439		#ifndef BR_USE_UNIX_TIME
440		#if defined __unix__ || defined __linux__ \
441		|| defined _POSIX_SOURCE || defined _POSIX_C_SOURCE \
442		|| (defined __APPLE__ && defined __MACH__)
443		#define BR_USE_UNIX_TIME 1
444		#endif
445		#endif
446
447		#ifndef BR_USE_WIN32_TIME
448		#if defined _WIN32 || defined _WIN64
449		#define BR_USE_WIN32_TIME 1
450		#endif
451		#endif
452
453		/* ==================================================================== */
454		/*
455		* Encoding/decoding functions.
456		*
457		* 32-bit and 64-bit decoding, both little-endian and big-endian, is
458		* implemented with the inline functions below.
459		*
460		* When allowed by some compile-time options (autodetected or provided),
461		* optimised code is used, to perform direct memory access when the
462		* underlying architecture supports it, both for endianness and
463		* alignment. This, however, may trigger strict aliasing issues; the
464		* code below uses unions to perform (supposedly) safe type punning.
465		* Since the C aliasing rules are relatively complex and were amended,
466		* or at least re-explained with different phrasing, in all successive
467		* versions of the C standard, it is always a bit risky to bet that any
468		* specific version of a C compiler got it right, for some notion of
469		* "right".
470		*/
471
472		typedef union {
473		uint16_t u;
474		unsigned char b[sizeof(uint16_t)];
475		} br_union_u16;
476
477		typedef union {
478		uint32_t u;
479		unsigned char b[sizeof(uint32_t)];
480		} br_union_u32;
481
482		typedef union {
483		uint64_t u;
484		unsigned char b[sizeof(uint64_t)];
485		} br_union_u64;
486
487		static inline void
488	0	br_enc16le(void *dst, unsigned x)
489		{
490		#if BR_LE_UNALIGNED
491	0	((br_union_u16 *)dst)->u = x;
492		#else
493		unsigned char *buf;
494
495		buf = dst;
496		buf[0] = (unsigned char)x;
497		buf[1] = (unsigned char)(x >> 8);
498		#endif
499	0	}
500
501		static inline void
502		br_enc16be(void *dst, unsigned x)
503		{
504		#if BR_BE_UNALIGNED
505		((br_union_u16 *)dst)->u = x;
506		#else
507		unsigned char *buf;
508
509		buf = dst;
510		buf[0] = (unsigned char)(x >> 8);
511		buf[1] = (unsigned char)x;
512		#endif
513		}
514
515		static inline unsigned
516	0	br_dec16le(const void *src)
517		{
518		#if BR_LE_UNALIGNED
519	0	return ((const br_union_u16 *)src)->u;
520		#else
521		const unsigned char *buf;
522
523		buf = src;
524		return (unsigned)buf[0] | ((unsigned)buf[1] << 8);
525		#endif
526		}
527
528		static inline unsigned
529		br_dec16be(const void *src)
530		{
531		#if BR_BE_UNALIGNED
532		return ((const br_union_u16 *)src)->u;
533		#else
534		const unsigned char *buf;
535
536		buf = src;
537		return ((unsigned)buf[0] << 8) | (unsigned)buf[1];
538		#endif
539		}
540
541		static inline void
542		br_enc32le(void *dst, uint32_t x)
543		{
544		#if BR_LE_UNALIGNED
545		((br_union_u32 *)dst)->u = x;
546		#else
547		unsigned char *buf;
548
549		buf = dst;
550		buf[0] = (unsigned char)x;
551		buf[1] = (unsigned char)(x >> 8);
552		buf[2] = (unsigned char)(x >> 16);
553		buf[3] = (unsigned char)(x >> 24);
554		#endif
555		}
556
557		static inline void
558		br_enc32be(void *dst, uint32_t x)
559		{
560		#if BR_BE_UNALIGNED
561		((br_union_u32 *)dst)->u = x;
562		#else
563		unsigned char *buf;
564
565		buf = dst;
566		buf[0] = (unsigned char)(x >> 24);
567		buf[1] = (unsigned char)(x >> 16);
568		buf[2] = (unsigned char)(x >> 8);
569		buf[3] = (unsigned char)x;
570		#endif
571		}
572
573		static inline uint32_t
574	0	br_dec32le(const void *src)
575		{
576		#if BR_LE_UNALIGNED
577	0	return ((const br_union_u32 *)src)->u;
578		#else
579		const unsigned char *buf;
580
581		buf = src;
582		return (uint32_t)buf[0]
583		| ((uint32_t)buf[1] << 8)
584		| ((uint32_t)buf[2] << 16)
585		| ((uint32_t)buf[3] << 24);
586		#endif
587		}
588
589		static inline uint32_t
590		br_dec32be(const void *src)
591		{
592		#if BR_BE_UNALIGNED
593		return ((const br_union_u32 *)src)->u;
594		#else
595		const unsigned char *buf;
596
597		buf = src;
598		return ((uint32_t)buf[0] << 24)
599		| ((uint32_t)buf[1] << 16)
600		| ((uint32_t)buf[2] << 8)
601		| (uint32_t)buf[3];
602		#endif
603		}
604
605		static inline void
606	0	br_enc64le(void *dst, uint64_t x)
607		{
608		#if BR_LE_UNALIGNED
609	0	((br_union_u64 *)dst)->u = x;
610		#else
611		unsigned char *buf;
612
613		buf = dst;
614		br_enc32le(buf, (uint32_t)x);
615		br_enc32le(buf + 4, (uint32_t)(x >> 32));
616		#endif
617	0	}
618
619		static inline void
620		br_enc64be(void *dst, uint64_t x)
621		{
622		#if BR_BE_UNALIGNED
623		((br_union_u64 *)dst)->u = x;
624		#else
625		unsigned char *buf;
626
627		buf = dst;
628		br_enc32be(buf, (uint32_t)(x >> 32));
629		br_enc32be(buf + 4, (uint32_t)x);
630		#endif
631		}
632
633		static inline uint64_t
634		br_dec64le(const void *src)
635		{
636		#if BR_LE_UNALIGNED
637		return ((const br_union_u64 *)src)->u;
638		#else
639		const unsigned char *buf;
640
641		buf = src;
642		return (uint64_t)br_dec32le(buf)
643		| ((uint64_t)br_dec32le(buf + 4) << 32);
644		#endif
645		}
646
647		static inline uint64_t
648		br_dec64be(const void *src)
649		{
650		#if BR_BE_UNALIGNED
651		return ((const br_union_u64 *)src)->u;
652		#else
653		const unsigned char *buf;
654
655		buf = src;
656		return ((uint64_t)br_dec32be(buf) << 32)
657		| (uint64_t)br_dec32be(buf + 4);
658		#endif
659		}
660
661		/*
662		* Range decoding and encoding (for several successive values).
663		*/
664		void br_range_dec16le(uint16_t *v, size_t num, const void *src);
665		void br_range_dec16be(uint16_t *v, size_t num, const void *src);
666		void br_range_enc16le(void *dst, const uint16_t *v, size_t num);
667		void br_range_enc16be(void *dst, const uint16_t *v, size_t num);
668
669		void br_range_dec32le(uint32_t *v, size_t num, const void *src);
670		void br_range_dec32be(uint32_t *v, size_t num, const void *src);
671		void br_range_enc32le(void *dst, const uint32_t *v, size_t num);
672		void br_range_enc32be(void *dst, const uint32_t *v, size_t num);
673
674		void br_range_dec64le(uint64_t *v, size_t num, const void *src);
675		void br_range_dec64be(uint64_t *v, size_t num, const void *src);
676		void br_range_enc64le(void *dst, const uint64_t *v, size_t num);
677		void br_range_enc64be(void *dst, const uint64_t *v, size_t num);
678
679		/*
680		* Byte-swap a 32-bit integer.
681		*/
682		static inline uint32_t
683		br_swap32(uint32_t x)
684		{
685		x = ((x & (uint32_t)0x00FF00FF) << 8)
686		| ((x >> 8) & (uint32_t)0x00FF00FF);
687		return (x << 16) | (x >> 16);
688		}
689
690		/* ==================================================================== */
691		/*
692		* Support code for hash functions.
693		*/
694
695		/*
696		* IV for MD5, SHA-1, SHA-224 and SHA-256.
697		*/
698		extern const uint32_t br_md5_IV[];
699		extern const uint32_t br_sha1_IV[];
700		extern const uint32_t br_sha224_IV[];
701		extern const uint32_t br_sha256_IV[];
702
703		/*
704		* Round functions for MD5, SHA-1, SHA-224 and SHA-256 (SHA-224 and
705		* SHA-256 use the same round function).
706		*/
707		void br_md5_round(const unsigned char *buf, uint32_t *val);
708		void br_sha1_round(const unsigned char *buf, uint32_t *val);
709		void br_sha2small_round(const unsigned char *buf, uint32_t *val);
710
711		/*
712		* The core function for the TLS PRF. It computes
713		* P_hash(secret, label + seed), and XORs the result into the dst buffer.
714		*/
715		void br_tls_phash(void *dst, size_t len,
716		const br_hash_class *dig,
717		const void *secret, size_t secret_len, const char *label,
718		size_t seed_num, const br_tls_prf_seed_chunk *seed);
719
720		/*
721		* Copy all configured hash implementations from a multihash context
722		* to another.
723		*/
724		static inline void
725		br_multihash_copyimpl(br_multihash_context *dst,
726		const br_multihash_context *src)
727		{
728		memcpy((void *)dst->impl, src->impl, sizeof src->impl);
729		}
730
731		/* ==================================================================== */
732		/*
733		* Constant-time primitives. These functions manipulate 32-bit values in
734		* order to provide constant-time comparisons and multiplexers.
735		*
736		* Boolean values (the "ctl" bits) MUST have value 0 or 1.
737		*
738		* Implementation notes:
739		* =====================
740		*
741		* The uintN_t types are unsigned and with width exactly N bits; the C
742		* standard guarantees that computations are performed modulo 2^N, and
743		* there can be no overflow. Negation (unary '-') works on unsigned types
744		* as well.
745		*
746		* The intN_t types are guaranteed to have width exactly N bits, with no
747		* padding bit, and using two's complement representation. Casting
748		* intN_t to uintN_t really is conversion modulo 2^N. Beware that intN_t
749		* types, being signed, trigger implementation-defined behaviour on
750		* overflow (including raising some signal): with GCC, while modular
751		* arithmetics are usually applied, the optimizer may assume that
752		* overflows don't occur (unless the -fwrapv command-line option is
753		* added); Clang has the additional -ftrapv option to explicitly trap on
754		* integer overflow or underflow.
755		*/
756
757		/*
758		* Negate a boolean.
759		*/
760		static inline uint32_t
761	0	NOT(uint32_t ctl)
762		{
763	0	return ctl ^ 1;
764		}
765
766		/*
767		* Multiplexer: returns x if ctl == 1, y if ctl == 0.
768		*/
769		static inline uint32_t
770	0	MUX(uint32_t ctl, uint32_t x, uint32_t y)
771		{
772	0	return y ^ (-ctl & (x ^ y));
773		}
774
775		/*
776		* Equality check: returns 1 if x == y, 0 otherwise.
777		*/
778		static inline uint32_t
779	0	EQ(uint32_t x, uint32_t y)
780		{
781		uint32_t q;
782
783	0	q = x ^ y;
784	0	return NOT((q | -q) >> 31);
785		}
786
787		/*
788		* Inequality check: returns 1 if x != y, 0 otherwise.
789		*/
790		static inline uint32_t
791		NEQ(uint32_t x, uint32_t y)
792		{
793		uint32_t q;
794
795		q = x ^ y;
796		return (q | -q) >> 31;
797		}
798
799		/*
800		* Comparison: returns 1 if x > y, 0 otherwise.
801		*/
802		static inline uint32_t
803	0	GT(uint32_t x, uint32_t y)
804		{
805		/*
806		* If both x < 2^31 and y < 2^31, then y-x will have its high
807		* bit set if x > y, cleared otherwise.
808		*
809		* If either x >= 2^31 or y >= 2^31 (but not both), then the
810		* result is the high bit of x.
811		*
812		* If both x >= 2^31 and y >= 2^31, then we can virtually
813		* subtract 2^31 from both, and we are back to the first case.
814		* Since (y-2^31)-(x-2^31) = y-x, the subtraction is already
815		* fine.
816		*/
817		uint32_t z;
818
819	0	z = y - x;
820	0	return (z ^ ((x ^ y) & (x ^ z))) >> 31;
821		}
822
823		/*
824		* Other comparisons (greater-or-equal, lower-than, lower-or-equal).
825		*/
826		#define GE(x, y) NOT(GT(y, x))
827		#define LT(x, y) GT(y, x)
828		#define LE(x, y) NOT(GT(x, y))
829
830		/*
831		* General comparison: returned value is -1, 0 or 1, depending on
832		* whether x is lower than, equal to, or greater than y.
833		*/
834		static inline int32_t
835		CMP(uint32_t x, uint32_t y)
836		{
837		return (int32_t)GT(x, y) | -(int32_t)GT(y, x);
838		}
839
840		/*
841		* Returns 1 if x == 0, 0 otherwise. Take care that the operand is signed.
842		*/
843		static inline uint32_t
844		EQ0(int32_t x)
845		{
846		uint32_t q;
847
848		q = (uint32_t)x;
849		return ~(q | -q) >> 31;
850		}
851
852		/*
853		* Returns 1 if x > 0, 0 otherwise. Take care that the operand is signed.
854		*/
855		static inline uint32_t
856		GT0(int32_t x)
857		{
858		/*
859		* High bit of -x is 0 if x == 0, but 1 if x > 0.
860		*/
861		uint32_t q;
862
863		q = (uint32_t)x;
864		return (~q & -q) >> 31;
865		}
866
867		/*
868		* Returns 1 if x >= 0, 0 otherwise. Take care that the operand is signed.
869		*/
870		static inline uint32_t
871		GE0(int32_t x)
872		{
873		return ~(uint32_t)x >> 31;
874		}
875
876		/*
877		* Returns 1 if x < 0, 0 otherwise. Take care that the operand is signed.
878		*/
879		static inline uint32_t
880		LT0(int32_t x)
881		{
882		return (uint32_t)x >> 31;
883		}
884
885		/*
886		* Returns 1 if x <= 0, 0 otherwise. Take care that the operand is signed.
887		*/
888		static inline uint32_t
889		LE0(int32_t x)
890		{
891		uint32_t q;
892
893		/*
894		* ~-x has its high bit set if and only if -x is nonnegative (as
895		* a signed int), i.e. x is in the -(2^31-1) to 0 range. We must
896		* do an OR with x itself to account for x = -2^31.
897		*/
898		q = (uint32_t)x;
899		return (q | ~-q) >> 31;
900		}
901
902		/*
903		* Conditional copy: src[] is copied into dst[] if and only if ctl is 1.
904		* dst[] and src[] may overlap completely (but not partially).
905		*/
906		void br_ccopy(uint32_t ctl, void *dst, const void *src, size_t len);
907
908		#define CCOPY br_ccopy
909
910		/*
911		* Compute the bit length of a 32-bit integer. Returned value is between 0
912		* and 32 (inclusive).
913		*/
914		static inline uint32_t
915		BIT_LENGTH(uint32_t x)
916		{
917		uint32_t k, c;
918
919		k = NEQ(x, 0);
920		c = GT(x, 0xFFFF); x = MUX(c, x >> 16, x); k += c << 4;
921		c = GT(x, 0x00FF); x = MUX(c, x >> 8, x); k += c << 3;
922		c = GT(x, 0x000F); x = MUX(c, x >> 4, x); k += c << 2;
923		c = GT(x, 0x0003); x = MUX(c, x >> 2, x); k += c << 1;
924		k += GT(x, 0x0001);
925		return k;
926		}
927
928		/*
929		* Compute the minimum of x and y.
930		*/
931		static inline uint32_t
932		MIN(uint32_t x, uint32_t y)
933		{
934		return MUX(GT(x, y), y, x);
935		}
936
937		/*
938		* Compute the maximum of x and y.
939		*/
940		static inline uint32_t
941		MAX(uint32_t x, uint32_t y)
942		{
943		return MUX(GT(x, y), x, y);
944		}
945
946		/*
947		* Multiply two 32-bit integers, with a 64-bit result. This default
948		* implementation assumes that the basic multiplication operator
949		* yields constant-time code.
950		*/
951		#define MUL(x, y) ((uint64_t)(x) * (uint64_t)(y))
952
953		#if BR_CT_MUL31
954
955		/*
956		* Alternate implementation of MUL31, that will be constant-time on some
957		* (old) platforms where the default MUL31 is not. Unfortunately, it is
958		* also substantially slower, and yields larger code, on more modern
959		* platforms, which is why it is deactivated by default.
960		*
961		* MUL31_lo() must do some extra work because on some platforms, the
962		* _signed_ multiplication may return early if the top bits are 1.
963		* Simply truncating (casting) the output of MUL31() would not be
964		* sufficient, because the compiler may notice that we keep only the low
965		* word, and then replace automatically the unsigned multiplication with
966		* a signed multiplication opcode.
967		*/
968		#define MUL31(x, y) ((uint64_t)((x) | (uint32_t)0x80000000) \
969		* (uint64_t)((y) | (uint32_t)0x80000000) \
970		- ((uint64_t)(x) << 31) - ((uint64_t)(y) << 31) \
971		- ((uint64_t)1 << 62))
972		static inline uint32_t
973		MUL31_lo(uint32_t x, uint32_t y)
974		{
975		uint32_t xl, xh;
976		uint32_t yl, yh;
977
978		xl = (x & 0xFFFF) | (uint32_t)0x80000000;
979		xh = (x >> 16) | (uint32_t)0x80000000;
980		yl = (y & 0xFFFF) | (uint32_t)0x80000000;
981		yh = (y >> 16) | (uint32_t)0x80000000;
982		return (xl * yl + ((xl * yh + xh * yl) << 16)) & (uint32_t)0x7FFFFFFF;
983		}
984
985		#else
986
987		/*
988		* Multiply two 31-bit integers, with a 62-bit result. This default
989		* implementation assumes that the basic multiplication operator
990		* yields constant-time code.
991		* The MUL31_lo() macro returns only the low 31 bits of the product.
992		*/
993		#define MUL31(x, y) ((uint64_t)(x) * (uint64_t)(y))
994		#define MUL31_lo(x, y) (((uint32_t)(x) * (uint32_t)(y)) & (uint32_t)0x7FFFFFFF)
995
996		#endif
997
998		/*
999		* Multiply two words together; the sum of the lengths of the two
1000		* operands must not exceed 31 (for instance, one operand may use 16
1001		* bits if the other fits on 15). If BR_CT_MUL15 is non-zero, then the
1002		* macro will contain some extra operations that help in making the
1003		* operation constant-time on some platforms, where the basic 32-bit
1004		* multiplication is not constant-time.
1005		*/
1006		#if BR_CT_MUL15
1007		#define MUL15(x, y) (((uint32_t)(x) | (uint32_t)0x80000000) \
1008		* ((uint32_t)(y) | (uint32_t)0x80000000) \
1009		& (uint32_t)0x7FFFFFFF)
1010		#else
1011		#define MUL15(x, y) ((uint32_t)(x) * (uint32_t)(y))
1012		#endif
1013
1014		/*
1015		* Arithmetic right shift (sign bit is copied). What happens when
1016		* right-shifting a negative value is _implementation-defined_, so it
1017		* does not trigger undefined behaviour, but it is still up to each
1018		* compiler to define (and document) what it does. Most/all compilers
1019		* will do an arithmetic shift, the sign bit being used to fill the
1020		* holes; this is a native operation on the underlying CPU, and it would
1021		* make little sense for the compiler to do otherwise. GCC explicitly
1022		* documents that it follows that convention.
1023		*
1024		* Still, if BR_NO_ARITH_SHIFT is defined (and non-zero), then an
1025		* alternate version will be used, that does not rely on such
1026		* implementation-defined behaviour. Unfortunately, it is also slower
1027		* and yields bigger code, which is why it is deactivated by default.
1028		*/
1029		#if BR_NO_ARITH_SHIFT
1030		#define ARSH(x, n) (((uint32_t)(x) >> (n)) \
1031		| ((-((uint32_t)(x) >> 31)) << (32 - (n))))
1032		#else
1033		#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
1034		#endif
1035
1036		/*
1037		* Constant-time division. The dividend hi:lo is divided by the
1038		* divisor d; the quotient is returned and the remainder is written
1039		* in *r. If hi == d, then the quotient does not fit on 32 bits;
1040		* returned value is thus truncated. If hi > d, returned values are
1041		* indeterminate.
1042		*/
1043		uint32_t br_divrem(uint32_t hi, uint32_t lo, uint32_t d, uint32_t *r);
1044
1045		/*
1046		* Wrapper for br_divrem(); the remainder is returned, and the quotient
1047		* is discarded.
1048		*/
1049		static inline uint32_t
1050		br_rem(uint32_t hi, uint32_t lo, uint32_t d)
1051		{
1052		uint32_t r;
1053
1054		br_divrem(hi, lo, d, &r);
1055		return r;
1056		}
1057
1058		/*
1059		* Wrapper for br_divrem(); the quotient is returned, and the remainder
1060		* is discarded.
1061		*/
1062		static inline uint32_t
1063		br_div(uint32_t hi, uint32_t lo, uint32_t d)
1064		{
1065		uint32_t r;
1066
1067		return br_divrem(hi, lo, d, &r);
1068		}
1069
1070		/* ==================================================================== */
1071
1072		/*
1073		* Integers 'i32'
1074		* --------------
1075		*
1076		* The 'i32' functions implement computations on big integers using
1077		* an internal representation as an array of 32-bit integers. For
1078		* an array x[]:
1079		* -- x[0] contains the "announced bit length" of the integer
1080		* -- x[1], x[2]... contain the value in little-endian order (x[1]
1081		* contains the least significant 32 bits)
1082		*
1083		* Multiplications rely on the elementary 32x32->64 multiplication.
1084		*
1085		* The announced bit length specifies the number of bits that are
1086		* significant in the subsequent 32-bit words. Unused bits in the
1087		* last (most significant) word are set to 0; subsequent words are
1088		* uninitialized and need not exist at all.
1089		*
1090		* The execution time and memory access patterns of all computations
1091		* depend on the announced bit length, but not on the actual word
1092		* values. For modular integers, the announced bit length of any integer
1093		* modulo n is equal to the actual bit length of n; thus, computations
1094		* on modular integers are "constant-time" (only the modulus length may
1095		* leak).
1096		*/
1097
1098		/*
1099		* Compute the actual bit length of an integer. The argument x should
1100		* point to the first (least significant) value word of the integer.
1101		* The len 'xlen' contains the number of 32-bit words to access.
1102		*
1103		* CT: value or length of x does not leak.
1104		*/
1105		uint32_t br_i32_bit_length(uint32_t *x, size_t xlen);
1106
1107		/*
1108		* Decode an integer from its big-endian unsigned representation. The
1109		* "true" bit length of the integer is computed, but all words of x[]
1110		* corresponding to the full 'len' bytes of the source are set.
1111		*
1112		* CT: value or length of x does not leak.
1113		*/
1114		void br_i32_decode(uint32_t *x, const void *src, size_t len);
1115
1116		/*
1117		* Decode an integer from its big-endian unsigned representation. The
1118		* integer MUST be lower than m[]; the announced bit length written in
1119		* x[] will be equal to that of m[]. All 'len' bytes from the source are
1120		* read.
1121		*
1122		* Returned value is 1 if the decoded value fits within the modulus, 0
1123		* otherwise. In the latter case, the x[] buffer will be set to 0 (but
1124		* still with the announced bit length of m[]).
1125		*
1126		* CT: value or length of x does not leak. Memory access pattern depends
1127		* only on 'len' and the announced bit length of m. Whether x fits or
1128		* not does not leak either.
1129		*/
1130		uint32_t br_i32_decode_mod(uint32_t *x,
1131		const void *src, size_t len, const uint32_t *m);
1132
1133		/*
1134		* Reduce an integer (a[]) modulo another (m[]). The result is written
1135		* in x[] and its announced bit length is set to be equal to that of m[].
1136		*
1137		* x[] MUST be distinct from a[] and m[].
1138		*
1139		* CT: only announced bit lengths leak, not values of x, a or m.
1140		*/
1141		void br_i32_reduce(uint32_t *x, const uint32_t *a, const uint32_t *m);
1142
1143		/*
1144		* Decode an integer from its big-endian unsigned representation, and
1145		* reduce it modulo the provided modulus m[]. The announced bit length
1146		* of the result is set to be equal to that of the modulus.
1147		*
1148		* x[] MUST be distinct from m[].
1149		*/
1150		void br_i32_decode_reduce(uint32_t *x,
1151		const void *src, size_t len, const uint32_t *m);
1152
1153		/*
1154		* Encode an integer into its big-endian unsigned representation. The
1155		* output length in bytes is provided (parameter 'len'); if the length
1156		* is too short then the integer is appropriately truncated; if it is
1157		* too long then the extra bytes are set to 0.
1158		*/
1159		void br_i32_encode(void *dst, size_t len, const uint32_t *x);
1160
1161		/*
1162		* Multiply x[] by 2^32 and then add integer z, modulo m[]. This
1163		* function assumes that x[] and m[] have the same announced bit
1164		* length, and the announced bit length of m[] matches its true
1165		* bit length.
1166		*
1167		* x[] and m[] MUST be distinct arrays.
1168		*
1169		* CT: only the common announced bit length of x and m leaks, not
1170		* the values of x, z or m.
1171		*/
1172		void br_i32_muladd_small(uint32_t *x, uint32_t z, const uint32_t *m);
1173
1174		/*
1175		* Extract one word from an integer. The offset is counted in bits.
1176		* The word MUST entirely fit within the word elements corresponding
1177		* to the announced bit length of a[].
1178		*/
1179		static inline uint32_t
1180		br_i32_word(const uint32_t *a, uint32_t off)
1181		{
1182		size_t u;
1183		unsigned j;
1184
1185		u = (size_t)(off >> 5) + 1;
1186		j = (unsigned)off & 31;
1187		if (j == 0) {
1188		return a[u];
1189		} else {
1190		return (a[u] >> j) | (a[u + 1] << (32 - j));
1191		}
1192		}
1193
1194		/*
1195		* Test whether an integer is zero.
1196		*/
1197		uint32_t br_i32_iszero(const uint32_t *x);
1198
1199		/*
1200		* Add b[] to a[] and return the carry (0 or 1). If ctl is 0, then a[]
1201		* is unmodified, but the carry is still computed and returned. The
1202		* arrays a[] and b[] MUST have the same announced bit length.
1203		*
1204		* a[] and b[] MAY be the same array, but partial overlap is not allowed.
1205		*/
1206		uint32_t br_i32_add(uint32_t *a, const uint32_t *b, uint32_t ctl);
1207
1208		/*
1209		* Subtract b[] from a[] and return the carry (0 or 1). If ctl is 0,
1210		* then a[] is unmodified, but the carry is still computed and returned.
1211		* The arrays a[] and b[] MUST have the same announced bit length.
1212		*
1213		* a[] and b[] MAY be the same array, but partial overlap is not allowed.
1214		*/
1215		uint32_t br_i32_sub(uint32_t *a, const uint32_t *b, uint32_t ctl);
1216
1217		/*
1218		* Compute d+a*b, result in d. The initial announced bit length of d[]
1219		* MUST match that of a[]. The d[] array MUST be large enough to
1220		* accommodate the full result, plus (possibly) an extra word. The
1221		* resulting announced bit length of d[] will be the sum of the announced
1222		* bit lengths of a[] and b[] (therefore, it may be larger than the actual
1223		* bit length of the numerical result).
1224		*
1225		* a[] and b[] may be the same array. d[] must be disjoint from both a[]
1226		* and b[].
1227		*/
1228		void br_i32_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b);
1229
1230		/*
1231		* Zeroize an integer. The announced bit length is set to the provided
1232		* value, and the corresponding words are set to 0.
1233		*/
1234		static inline void
1235		br_i32_zero(uint32_t *x, uint32_t bit_len)
1236		{
1237		*x ++ = bit_len;
1238		memset(x, 0, ((bit_len + 31) >> 5) * sizeof *x);
1239		}
1240
1241		/*
1242		* Compute -(1/x) mod 2^32. If x is even, then this function returns 0.
1243		*/
1244		uint32_t br_i32_ninv32(uint32_t x);
1245
1246		/*
1247		* Convert a modular integer to Montgomery representation. The integer x[]
1248		* MUST be lower than m[], but with the same announced bit length.
1249		*/
1250		void br_i32_to_monty(uint32_t *x, const uint32_t *m);
1251
1252		/*
1253		* Convert a modular integer back from Montgomery representation. The
1254		* integer x[] MUST be lower than m[], but with the same announced bit
1255		* length. The "m0i" parameter is equal to -(1/m0) mod 2^32, where m0 is
1256		* the least significant value word of m[] (this works only if m[] is
1257		* an odd integer).
1258		*/
1259		void br_i32_from_monty(uint32_t *x, const uint32_t *m, uint32_t m0i);
1260
1261		/*
1262		* Compute a modular Montgomery multiplication. d[] is filled with the
1263		* value of x*y/R modulo m[] (where R is the Montgomery factor). The
1264		* array d[] MUST be distinct from x[], y[] and m[]. x[] and y[] MUST be
1265		* numerically lower than m[]. x[] and y[] MAY be the same array. The
1266		* "m0i" parameter is equal to -(1/m0) mod 2^32, where m0 is the least
1267		* significant value word of m[] (this works only if m[] is an odd
1268		* integer).
1269		*/
1270		void br_i32_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
1271		const uint32_t *m, uint32_t m0i);
1272
1273		/*
1274		* Compute a modular exponentiation. x[] MUST be an integer modulo m[]
1275		* (same announced bit length, lower value). m[] MUST be odd. The
1276		* exponent is in big-endian unsigned notation, over 'elen' bytes. The
1277		* "m0i" parameter is equal to -(1/m0) mod 2^32, where m0 is the least
1278		* significant value word of m[] (this works only if m[] is an odd
1279		* integer). The t1[] and t2[] parameters must be temporary arrays,
1280		* each large enough to accommodate an integer with the same size as m[].
1281		*/
1282		void br_i32_modpow(uint32_t *x, const unsigned char *e, size_t elen,
1283		const uint32_t *m, uint32_t m0i, uint32_t *t1, uint32_t *t2);
1284
1285		/* ==================================================================== */
1286
1287		/*
1288		* Integers 'i31'
1289		* --------------
1290		*
1291		* The 'i31' functions implement computations on big integers using
1292		* an internal representation as an array of 32-bit integers. For
1293		* an array x[]:
1294		* -- x[0] encodes the array length and the "announced bit length"
1295		* of the integer: namely, if the announced bit length is k,
1296		* then x[0] = ((k / 31) << 5) + (k % 31).
1297		* -- x[1], x[2]... contain the value in little-endian order, 31
1298		* bits per word (x[1] contains the least significant 31 bits).
1299		* The upper bit of each word is 0.
1300		*
1301		* Multiplications rely on the elementary 32x32->64 multiplication.
1302		*
1303		* The announced bit length specifies the number of bits that are
1304		* significant in the subsequent 32-bit words. Unused bits in the
1305		* last (most significant) word are set to 0; subsequent words are
1306		* uninitialized and need not exist at all.
1307		*
1308		* The execution time and memory access patterns of all computations
1309		* depend on the announced bit length, but not on the actual word
1310		* values. For modular integers, the announced bit length of any integer
1311		* modulo n is equal to the actual bit length of n; thus, computations
1312		* on modular integers are "constant-time" (only the modulus length may
1313		* leak).
1314		*/
1315
1316		/*
1317		* Test whether an integer is zero.
1318		*/
1319		uint32_t br_i31_iszero(const uint32_t *x);
1320
1321		/*
1322		* Add b[] to a[] and return the carry (0 or 1). If ctl is 0, then a[]
1323		* is unmodified, but the carry is still computed and returned. The
1324		* arrays a[] and b[] MUST have the same announced bit length.
1325		*
1326		* a[] and b[] MAY be the same array, but partial overlap is not allowed.
1327		*/
1328		uint32_t br_i31_add(uint32_t *a, const uint32_t *b, uint32_t ctl);
1329
1330		/*
1331		* Subtract b[] from a[] and return the carry (0 or 1). If ctl is 0,
1332		* then a[] is unmodified, but the carry is still computed and returned.
1333		* The arrays a[] and b[] MUST have the same announced bit length.
1334		*
1335		* a[] and b[] MAY be the same array, but partial overlap is not allowed.
1336		*/
1337		uint32_t br_i31_sub(uint32_t *a, const uint32_t *b, uint32_t ctl);
1338
1339		/*
1340		* Compute the ENCODED actual bit length of an integer. The argument x
1341		* should point to the first (least significant) value word of the
1342		* integer. The len 'xlen' contains the number of 32-bit words to
1343		* access. The upper bit of each value word MUST be 0.
1344		* Returned value is ((k / 31) << 5) + (k % 31) if the bit length is k.
1345		*
1346		* CT: value or length of x does not leak.
1347		*/
1348		uint32_t br_i31_bit_length(uint32_t *x, size_t xlen);
1349
1350		/*
1351		* Decode an integer from its big-endian unsigned representation. The
1352		* "true" bit length of the integer is computed and set in the encoded
1353		* announced bit length (x[0]), but all words of x[] corresponding to
1354		* the full 'len' bytes of the source are set.
1355		*
1356		* CT: value or length of x does not leak.
1357		*/
1358		void br_i31_decode(uint32_t *x, const void *src, size_t len);
1359
1360		/*
1361		* Decode an integer from its big-endian unsigned representation. The
1362		* integer MUST be lower than m[]; the (encoded) announced bit length
1363		* written in x[] will be equal to that of m[]. All 'len' bytes from the
1364		* source are read.
1365		*
1366		* Returned value is 1 if the decoded value fits within the modulus, 0
1367		* otherwise. In the latter case, the x[] buffer will be set to 0 (but
1368		* still with the announced bit length of m[]).
1369		*
1370		* CT: value or length of x does not leak. Memory access pattern depends
1371		* only on 'len' and the announced bit length of m. Whether x fits or
1372		* not does not leak either.
1373		*/
1374		uint32_t br_i31_decode_mod(uint32_t *x,
1375		const void *src, size_t len, const uint32_t *m);
1376
1377		/*
1378		* Zeroize an integer. The announced bit length is set to the provided
1379		* value, and the corresponding words are set to 0. The ENCODED bit length
1380		* is expected here.
1381		*/
1382		static inline void
1383		br_i31_zero(uint32_t *x, uint32_t bit_len)
1384		{
1385		*x ++ = bit_len;
1386		memset(x, 0, ((bit_len + 31) >> 5) * sizeof *x);
1387		}
1388
1389		/*
1390		* Right-shift an integer. The shift amount must be lower than 31
1391		* bits.
1392		*/
1393		void br_i31_rshift(uint32_t *x, int count);
1394
1395		/*
1396		* Reduce an integer (a[]) modulo another (m[]). The result is written
1397		* in x[] and its announced bit length is set to be equal to that of m[].
1398		*
1399		* x[] MUST be distinct from a[] and m[].
1400		*
1401		* CT: only announced bit lengths leak, not values of x, a or m.
1402		*/
1403		void br_i31_reduce(uint32_t *x, const uint32_t *a, const uint32_t *m);
1404
1405		/*
1406		* Decode an integer from its big-endian unsigned representation, and
1407		* reduce it modulo the provided modulus m[]. The announced bit length
1408		* of the result is set to be equal to that of the modulus.
1409		*
1410		* x[] MUST be distinct from m[].
1411		*/
1412		void br_i31_decode_reduce(uint32_t *x,
1413		const void *src, size_t len, const uint32_t *m);
1414
1415		/*
1416		* Multiply x[] by 2^31 and then add integer z, modulo m[]. This
1417		* function assumes that x[] and m[] have the same announced bit
1418		* length, and the announced bit length of m[] matches its true
1419		* bit length.
1420		*
1421		* x[] and m[] MUST be distinct arrays. z MUST fit in 31 bits (upper
1422		* bit set to 0).
1423		*
1424		* CT: only the common announced bit length of x and m leaks, not
1425		* the values of x, z or m.
1426		*/
1427		void br_i31_muladd_small(uint32_t *x, uint32_t z, const uint32_t *m);
1428
1429		/*
1430		* Encode an integer into its big-endian unsigned representation. The
1431		* output length in bytes is provided (parameter 'len'); if the length
1432		* is too short then the integer is appropriately truncated; if it is
1433		* too long then the extra bytes are set to 0.
1434		*/
1435		void br_i31_encode(void *dst, size_t len, const uint32_t *x);
1436
1437		/*
1438		* Compute -(1/x) mod 2^31. If x is even, then this function returns 0.
1439		*/
1440		uint32_t br_i31_ninv31(uint32_t x);
1441
1442		/*
1443		* Compute a modular Montgomery multiplication. d[] is filled with the
1444		* value of x*y/R modulo m[] (where R is the Montgomery factor). The
1445		* array d[] MUST be distinct from x[], y[] and m[]. x[] and y[] MUST be
1446		* numerically lower than m[]. x[] and y[] MAY be the same array. The
1447		* "m0i" parameter is equal to -(1/m0) mod 2^31, where m0 is the least
1448		* significant value word of m[] (this works only if m[] is an odd
1449		* integer).
1450		*/
1451		void br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
1452		const uint32_t *m, uint32_t m0i);
1453
1454		/*
1455		* Convert a modular integer to Montgomery representation. The integer x[]
1456		* MUST be lower than m[], but with the same announced bit length.
1457		*/
1458		void br_i31_to_monty(uint32_t *x, const uint32_t *m);
1459
1460		/*
1461		* Convert a modular integer back from Montgomery representation. The
1462		* integer x[] MUST be lower than m[], but with the same announced bit
1463		* length. The "m0i" parameter is equal to -(1/m0) mod 2^31, where m0 is
1464		* the least significant value word of m[] (this works only if m[] is
1465		* an odd integer).
1466		*/
1467		void br_i31_from_monty(uint32_t *x, const uint32_t *m, uint32_t m0i);
1468
1469		/*
1470		* Compute a modular exponentiation. x[] MUST be an integer modulo m[]
1471		* (same announced bit length, lower value). m[] MUST be odd. The
1472		* exponent is in big-endian unsigned notation, over 'elen' bytes. The
1473		* "m0i" parameter is equal to -(1/m0) mod 2^31, where m0 is the least
1474		* significant value word of m[] (this works only if m[] is an odd
1475		* integer). The t1[] and t2[] parameters must be temporary arrays,
1476		* each large enough to accommodate an integer with the same size as m[].
1477		*/
1478		void br_i31_modpow(uint32_t *x, const unsigned char *e, size_t elen,
1479		const uint32_t *m, uint32_t m0i, uint32_t *t1, uint32_t *t2);
1480
1481		/*
1482		* Compute a modular exponentiation. x[] MUST be an integer modulo m[]
1483		* (same announced bit length, lower value). m[] MUST be odd. The
1484		* exponent is in big-endian unsigned notation, over 'elen' bytes. The
1485		* "m0i" parameter is equal to -(1/m0) mod 2^31, where m0 is the least
1486		* significant value word of m[] (this works only if m[] is an odd
1487		* integer). The tmp[] array is used for temporaries, and has size
1488		* 'twlen' words; it must be large enough to accommodate at least two
1489		* temporary values with the same size as m[] (including the leading
1490		* "bit length" word). If there is room for more temporaries, then this
1491		* function may use the extra room for window-based optimisation,
1492		* resulting in faster computations.
1493		*
1494		* Returned value is 1 on success, 0 on error. An error is reported if
1495		* the provided tmp[] array is too short.
1496		*/
1497		uint32_t br_i31_modpow_opt(uint32_t *x, const unsigned char *e, size_t elen,
1498		const uint32_t *m, uint32_t m0i, uint32_t *tmp, size_t twlen);
1499
1500		/*
1501		* Compute d+a*b, result in d. The initial announced bit length of d[]
1502		* MUST match that of a[]. The d[] array MUST be large enough to
1503		* accommodate the full result, plus (possibly) an extra word. The
1504		* resulting announced bit length of d[] will be the sum of the announced
1505		* bit lengths of a[] and b[] (therefore, it may be larger than the actual
1506		* bit length of the numerical result).
1507		*
1508		* a[] and b[] may be the same array. d[] must be disjoint from both a[]
1509		* and b[].
1510		*/
1511		void br_i31_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b);
1512
1513		/*
1514		* Compute x/y mod m, result in x. Values x and y must be between 0 and
1515		* m-1, and have the same announced bit length as m. Modulus m must be
1516		* odd. The "m0i" parameter is equal to -1/m mod 2^31. The array 't'
1517		* must point to a temporary area that can hold at least three integers
1518		* of the size of m.
1519		*
1520		* m may not overlap x and y. x and y may overlap each other (this can
1521		* be useful to test whether a value is invertible modulo m). t must be
1522		* disjoint from all other arrays.
1523		*
1524		* Returned value is 1 on success, 0 otherwise. Success is attained if
1525		* y is invertible modulo m.
1526		*/
1527		uint32_t br_i31_moddiv(uint32_t *x, const uint32_t *y,
1528		const uint32_t *m, uint32_t m0i, uint32_t *t);
1529
1530		/* ==================================================================== */
1531
1532		/*
1533		* FIXME: document "i15" functions.
1534		*/
1535
1536		static inline void
1537		br_i15_zero(uint16_t *x, uint16_t bit_len)
1538		{
1539		*x ++ = bit_len;
1540		memset(x, 0, ((bit_len + 15) >> 4) * sizeof *x);
1541		}
1542
1543		uint32_t br_i15_iszero(const uint16_t *x);
1544
1545		uint16_t br_i15_ninv15(uint16_t x);
1546
1547		uint32_t br_i15_add(uint16_t *a, const uint16_t *b, uint32_t ctl);
1548
1549		uint32_t br_i15_sub(uint16_t *a, const uint16_t *b, uint32_t ctl);
1550
1551		void br_i15_muladd_small(uint16_t *x, uint16_t z, const uint16_t *m);
1552
1553		void br_i15_montymul(uint16_t *d, const uint16_t *x, const uint16_t *y,
1554		const uint16_t *m, uint16_t m0i);
1555
1556		void br_i15_to_monty(uint16_t *x, const uint16_t *m);
1557
1558		void br_i15_modpow(uint16_t *x, const unsigned char *e, size_t elen,
1559		const uint16_t *m, uint16_t m0i, uint16_t *t1, uint16_t *t2);
1560
1561		uint32_t br_i15_modpow_opt(uint16_t *x, const unsigned char *e, size_t elen,
1562		const uint16_t *m, uint16_t m0i, uint16_t *tmp, size_t twlen);
1563
1564		void br_i15_encode(void *dst, size_t len, const uint16_t *x);
1565
1566		uint32_t br_i15_decode_mod(uint16_t *x,
1567		const void *src, size_t len, const uint16_t *m);
1568
1569		void br_i15_rshift(uint16_t *x, int count);
1570
1571		uint32_t br_i15_bit_length(uint16_t *x, size_t xlen);
1572
1573		void br_i15_decode(uint16_t *x, const void *src, size_t len);
1574
1575		void br_i15_from_monty(uint16_t *x, const uint16_t *m, uint16_t m0i);
1576
1577		void br_i15_decode_reduce(uint16_t *x,
1578		const void *src, size_t len, const uint16_t *m);
1579
1580		void br_i15_reduce(uint16_t *x, const uint16_t *a, const uint16_t *m);
1581
1582		void br_i15_mulacc(uint16_t *d, const uint16_t *a, const uint16_t *b);
1583
1584		uint32_t br_i15_moddiv(uint16_t *x, const uint16_t *y,
1585		const uint16_t *m, uint16_t m0i, uint16_t *t);
1586
1587		/*
1588		* Variant of br_i31_modpow_opt() that internally uses 64x64->128
1589		* multiplications. It expects the same parameters as br_i31_modpow_opt(),
1590		* except that the temporaries should be 64-bit integers, not 32-bit
1591		* integers.
1592		*/
1593		uint32_t br_i62_modpow_opt(uint32_t *x31, const unsigned char *e, size_t elen,
1594		const uint32_t *m31, uint32_t m0i31, uint64_t *tmp, size_t twlen);
1595
1596		/*
1597		* Type for a function with the same API as br_i31_modpow_opt() (some
1598		* implementations of this type may have stricter alignment requirements
1599		* on the temporaries).
1600		*/
1601		typedef uint32_t (*br_i31_modpow_opt_type)(uint32_t *x,
1602		const unsigned char *e, size_t elen,
1603		const uint32_t *m, uint32_t m0i, uint32_t *tmp, size_t twlen);
1604
1605		/*
1606		* Wrapper for br_i62_modpow_opt() that uses the same type as
1607		* br_i31_modpow_opt(); however, it requires its 'tmp' argument to be
1608		* 64-bit aligned.
1609		*/
1610		uint32_t br_i62_modpow_opt_as_i31(uint32_t *x,
1611		const unsigned char *e, size_t elen,
1612		const uint32_t *m, uint32_t m0i, uint32_t *tmp, size_t twlen);
1613
1614		/* ==================================================================== */
1615
1616		static inline size_t
1617		br_digest_size(const br_hash_class *digest_class)
1618		{
1619		return (size_t)(digest_class->desc >> BR_HASHDESC_OUT_OFF)
1620		& BR_HASHDESC_OUT_MASK;
1621		}
1622
1623		/*
1624		* Get the output size (in bytes) of a hash function.
1625		*/
1626		size_t br_digest_size_by_ID(int digest_id);
1627
1628		/*
1629		* Get the OID (encoded OBJECT IDENTIFIER value, without tag and length)
1630		* for a hash function. If digest_id is not a supported digest identifier
1631		* (in particular if it is equal to 0, i.e. br_md5sha1_ID), then NULL is
1632		* returned and *len is set to 0.
1633		*/
1634		const unsigned char *br_digest_OID(int digest_id, size_t *len);
1635
1636		/* ==================================================================== */
1637		/*
1638		* DES support functions.
1639		*/
1640
1641		/*
1642		* Apply DES Initial Permutation.
1643		*/
1644		void br_des_do_IP(uint32_t *xl, uint32_t *xr);
1645
1646		/*
1647		* Apply DES Final Permutation (inverse of IP).
1648		*/
1649		void br_des_do_invIP(uint32_t *xl, uint32_t *xr);
1650
1651		/*
1652		* Key schedule unit: for a DES key (8 bytes), compute 16 subkeys. Each
1653		* subkey is two 28-bit words represented as two 32-bit words; the PC-2
1654		* bit extraction is NOT applied.
1655		*/
1656		void br_des_keysched_unit(uint32_t *skey, const void *key);
1657
1658		/*
1659		* Reversal of 16 DES sub-keys (for decryption).
1660		*/
1661		void br_des_rev_skey(uint32_t *skey);
1662
1663		/*
1664		* DES/3DES key schedule for 'des_tab' (encryption direction). Returned
1665		* value is the number of rounds.
1666		*/
1667		unsigned br_des_tab_keysched(uint32_t *skey, const void *key, size_t key_len);
1668
1669		/*
1670		* DES/3DES key schedule for 'des_ct' (encryption direction). Returned
1671		* value is the number of rounds.
1672		*/
1673		unsigned br_des_ct_keysched(uint32_t *skey, const void *key, size_t key_len);
1674
1675		/*
1676		* DES/3DES subkey decompression (from the compressed bitsliced subkeys).
1677		*/
1678		void br_des_ct_skey_expand(uint32_t *sk_exp,
1679		unsigned num_rounds, const uint32_t *skey);
1680
1681		/*
1682		* DES/3DES block encryption/decryption ('des_tab').
1683		*/
1684		void br_des_tab_process_block(unsigned num_rounds,
1685		const uint32_t *skey, void *block);
1686
1687		/*
1688		* DES/3DES block encryption/decryption ('des_ct').
1689		*/
1690		void br_des_ct_process_block(unsigned num_rounds,
1691		const uint32_t *skey, void *block);
1692
1693		/* ==================================================================== */
1694		/*
1695		* AES support functions.
1696		*/
1697
1698		/*
1699		* The AES S-box (256-byte table).
1700		*/
1701		extern const unsigned char br_aes_S[];
1702
1703		/*
1704		* AES key schedule. skey[] is filled with n+1 128-bit subkeys, where n
1705		* is the number of rounds (10 to 14, depending on key size). The number
1706		* of rounds is returned. If the key size is invalid (not 16, 24 or 32),
1707		* then 0 is returned.
1708		*
1709		* This implementation uses a 256-byte table and is NOT constant-time.
1710		*/
1711		unsigned br_aes_keysched(uint32_t *skey, const void *key, size_t key_len);
1712
1713		/*
1714		* AES key schedule for decryption ('aes_big' implementation).
1715		*/
1716		unsigned br_aes_big_keysched_inv(uint32_t *skey,
1717		const void *key, size_t key_len);
1718
1719		/*
1720		* AES block encryption with the 'aes_big' implementation (fast, but
1721		* not constant-time). This function encrypts a single block "in place".
1722		*/
1723		void br_aes_big_encrypt(unsigned num_rounds, const uint32_t *skey, void *data);
1724
1725		/*
1726		* AES block decryption with the 'aes_big' implementation (fast, but
1727		* not constant-time). This function decrypts a single block "in place".
1728		*/
1729		void br_aes_big_decrypt(unsigned num_rounds, const uint32_t *skey, void *data);
1730
1731		/*
1732		* AES block encryption with the 'aes_small' implementation (small, but
1733		* slow and not constant-time). This function encrypts a single block
1734		* "in place".
1735		*/
1736		void br_aes_small_encrypt(unsigned num_rounds,
1737		const uint32_t *skey, void *data);
1738
1739		/*
1740		* AES block decryption with the 'aes_small' implementation (small, but
1741		* slow and not constant-time). This function decrypts a single block
1742		* "in place".
1743		*/
1744		void br_aes_small_decrypt(unsigned num_rounds,
1745		const uint32_t *skey, void *data);
1746
1747		/*
1748		* The constant-time implementation is "bitsliced": the 128-bit state is
1749		* split over eight 32-bit words q* in the following way:
1750		*
1751		* -- Input block consists in 16 bytes:
1752		* a00 a10 a20 a30 a01 a11 a21 a31 a02 a12 a22 a32 a03 a13 a23 a33
1753		* In the terminology of FIPS 197, this is a 4x4 matrix which is read
1754		* column by column.
1755		*
1756		* -- Each byte is split into eight bits which are distributed over the
1757		* eight words, at the same rank. Thus, for a byte x at rank k, bit 0
1758		* (least significant) of x will be at rank k in q0 (if that bit is b,
1759		* then it contributes "b << k" to the value of q0), bit 1 of x will be
1760		* at rank k in q1, and so on.
1761		*
1762		* -- Ranks given to bits are in "row order" and are either all even, or
1763		* all odd. Two independent AES states are thus interleaved, one using
1764		* the even ranks, the other the odd ranks. Row order means:
1765		* a00 a01 a02 a03 a10 a11 a12 a13 a20 a21 a22 a23 a30 a31 a32 a33
1766		*
1767		* Converting input bytes from two AES blocks to bitslice representation
1768		* is done in the following way:
1769		* -- Decode first block into the four words q0 q2 q4 q6, in that order,
1770		* using little-endian convention.
1771		* -- Decode second block into the four words q1 q3 q5 q7, in that order,
1772		* using little-endian convention.
1773		* -- Call br_aes_ct_ortho().
1774		*
1775		* Converting back to bytes is done by using the reverse operations. Note
1776		* that br_aes_ct_ortho() is its own inverse.
1777		*/
1778
1779		/*
1780		* Perform bytewise orthogonalization of eight 32-bit words. Bytes
1781		* of q0..q7 are spread over all words: for a byte x that occurs
1782		* at rank i in q[j] (byte x uses bits 8i to 8i+7 in q[j]), the bit
1783		* of rank k in x (0 <= k <= 7) goes to q[k] at rank 8*i+j.
1784		*
1785		* This operation is an involution.
1786		*/
1787		void br_aes_ct_ortho(uint32_t *q);
1788
1789		/*
1790		* The AES S-box, as a bitsliced constant-time version. The input array
1791		* consists in eight 32-bit words; 32 S-box instances are computed in
1792		* parallel. Bits 0 to 7 of each S-box input (bit 0 is least significant)
1793		* are spread over the words 0 to 7, at the same rank.
1794		*/
1795		void br_aes_ct_bitslice_Sbox(uint32_t *q);
1796
1797		/*
1798		* Like br_aes_ct_bitslice_Sbox(), but for the inverse S-box.
1799		*/
1800		void br_aes_ct_bitslice_invSbox(uint32_t *q);
1801
1802		/*
1803		* Compute AES encryption on bitsliced data. Since input is stored on
1804		* eight 32-bit words, two block encryptions are actually performed
1805		* in parallel.
1806		*/
1807		void br_aes_ct_bitslice_encrypt(unsigned num_rounds,
1808		const uint32_t *skey, uint32_t *q);
1809
1810		/*
1811		* Compute AES decryption on bitsliced data. Since input is stored on
1812		* eight 32-bit words, two block decryptions are actually performed
1813		* in parallel.
1814		*/
1815		void br_aes_ct_bitslice_decrypt(unsigned num_rounds,
1816		const uint32_t *skey, uint32_t *q);
1817
1818		/*
1819		* AES key schedule, constant-time version. skey[] is filled with n+1
1820		* 128-bit subkeys, where n is the number of rounds (10 to 14, depending
1821		* on key size). The number of rounds is returned. If the key size is
1822		* invalid (not 16, 24 or 32), then 0 is returned.
1823		*/
1824		unsigned br_aes_ct_keysched(uint32_t *comp_skey,
1825		const void *key, size_t key_len);
1826
1827		/*
1828		* Expand AES subkeys as produced by br_aes_ct_keysched(), into
1829		* a larger array suitable for br_aes_ct_bitslice_encrypt() and
1830		* br_aes_ct_bitslice_decrypt().
1831		*/
1832		void br_aes_ct_skey_expand(uint32_t *skey,
1833		unsigned num_rounds, const uint32_t *comp_skey);
1834
1835		/*
1836		* For the ct64 implementation, the same bitslicing technique is used,
1837		* but four instances are interleaved. First instance uses bits 0, 4,
1838		* 8, 12,... of each word; second instance uses bits 1, 5, 9, 13,...
1839		* and so on.
1840		*/
1841
1842		/*
1843		* Perform bytewise orthogonalization of eight 64-bit words. Bytes
1844		* of q0..q7 are spread over all words: for a byte x that occurs
1845		* at rank i in q[j] (byte x uses bits 8i to 8i+7 in q[j]), the bit
1846		* of rank k in x (0 <= k <= 7) goes to q[k] at rank 8*i+j.
1847		*
1848		* This operation is an involution.
1849		*/
1850		void br_aes_ct64_ortho(uint64_t *q);
1851
1852		/*
1853		* Interleave bytes for an AES input block. If input bytes are
1854		* denoted 0123456789ABCDEF, and have been decoded with little-endian
1855		* convention (w[0] contains 0123, with '3' being most significant;
1856		* w[1] contains 4567, and so on), then output word q0 will be
1857		* set to 08192A3B (again little-endian convention) and q1 will
1858		* be set to 4C5D6E7F.
1859		*/
1860		void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w);
1861
1862		/*
1863		* Perform the opposite of br_aes_ct64_interleave_in().
1864		*/
1865		void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1);
1866
1867		/*
1868		* The AES S-box, as a bitsliced constant-time version. The input array
1869		* consists in eight 64-bit words; 64 S-box instances are computed in
1870		* parallel. Bits 0 to 7 of each S-box input (bit 0 is least significant)
1871		* are spread over the words 0 to 7, at the same rank.
1872		*/
1873		void br_aes_ct64_bitslice_Sbox(uint64_t *q);
1874
1875		/*
1876		* Like br_aes_ct64_bitslice_Sbox(), but for the inverse S-box.
1877		*/
1878		void br_aes_ct64_bitslice_invSbox(uint64_t *q);
1879
1880		/*
1881		* Compute AES encryption on bitsliced data. Since input is stored on
1882		* eight 64-bit words, four block encryptions are actually performed
1883		* in parallel.
1884		*/
1885		void br_aes_ct64_bitslice_encrypt(unsigned num_rounds,
1886		const uint64_t *skey, uint64_t *q);
1887
1888		/*
1889		* Compute AES decryption on bitsliced data. Since input is stored on
1890		* eight 64-bit words, four block decryptions are actually performed
1891		* in parallel.
1892		*/
1893		void br_aes_ct64_bitslice_decrypt(unsigned num_rounds,
1894		const uint64_t *skey, uint64_t *q);
1895
1896		/*
1897		* AES key schedule, constant-time version. skey[] is filled with n+1
1898		* 128-bit subkeys, where n is the number of rounds (10 to 14, depending
1899		* on key size). The number of rounds is returned. If the key size is
1900		* invalid (not 16, 24 or 32), then 0 is returned.
1901		*/
1902		unsigned br_aes_ct64_keysched(uint64_t *comp_skey,
1903		const void *key, size_t key_len);
1904
1905		/*
1906		* Expand AES subkeys as produced by br_aes_ct64_keysched(), into
1907		* a larger array suitable for br_aes_ct64_bitslice_encrypt() and
1908		* br_aes_ct64_bitslice_decrypt().
1909		*/
1910		void br_aes_ct64_skey_expand(uint64_t *skey,
1911		unsigned num_rounds, const uint64_t *comp_skey);
1912
1913		/*
1914		* Test support for AES-NI opcodes.
1915		*/
1916		int br_aes_x86ni_supported(void);
1917
1918		/*
1919		* AES key schedule, using x86 AES-NI instructions. This yields the
1920		* subkeys in the encryption direction. Number of rounds is returned.
1921		* Key size MUST be 16, 24 or 32 bytes; otherwise, 0 is returned.
1922		*/
1923		unsigned br_aes_x86ni_keysched_enc(unsigned char *skni,
1924		const void *key, size_t len);
1925
1926		/*
1927		* AES key schedule, using x86 AES-NI instructions. This yields the
1928		* subkeys in the decryption direction. Number of rounds is returned.
1929		* Key size MUST be 16, 24 or 32 bytes; otherwise, 0 is returned.
1930		*/
1931		unsigned br_aes_x86ni_keysched_dec(unsigned char *skni,
1932		const void *key, size_t len);
1933
1934		/*
1935		* Test support for AES POWER8 opcodes.
1936		*/
1937		int br_aes_pwr8_supported(void);
1938
1939		/*
1940		* AES key schedule, using POWER8 instructions. This yields the
1941		* subkeys in the encryption direction. Number of rounds is returned.
1942		* Key size MUST be 16, 24 or 32 bytes; otherwise, 0 is returned.
1943		*/
1944		unsigned br_aes_pwr8_keysched(unsigned char *skni,
1945		const void *key, size_t len);
1946
1947		/* ==================================================================== */
1948		/*
1949		* RSA.
1950		*/
1951
1952		/*
1953		* Apply proper PKCS#1 v1.5 padding (for signatures). 'hash_oid' is
1954		* the encoded hash function OID, or NULL.
1955		*/
1956		uint32_t br_rsa_pkcs1_sig_pad(const unsigned char *hash_oid,
1957		const unsigned char *hash, size_t hash_len,
1958		uint32_t n_bitlen, unsigned char *x);
1959
1960		/*
1961		* Check PKCS#1 v1.5 padding (for signatures). 'hash_oid' is the encoded
1962		* hash function OID, or NULL. The provided 'sig' value is _after_ the
1963		* modular exponentiation, i.e. it should be the padded hash. On
1964		* success, the hashed message is extracted.
1965		*/
1966		uint32_t br_rsa_pkcs1_sig_unpad(const unsigned char *sig, size_t sig_len,
1967		const unsigned char *hash_oid, size_t hash_len,
1968		unsigned char *hash_out);
1969
1970		/*
1971		* Apply proper PSS padding. The 'x' buffer is output only: it
1972		* receives the value that is to be exponentiated.
1973		*/
1974		uint32_t br_rsa_pss_sig_pad(const br_prng_class **rng,
1975		const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
1976		const unsigned char *hash, size_t salt_len,
1977		uint32_t n_bitlen, unsigned char *x);
1978
1979		/*
1980		* Check PSS padding. The provided value is the one _after_
1981		* the modular exponentiation; it is modified by this function.
1982		* This function infers the signature length from the public key
1983		* size, i.e. it assumes that this has already been verified (as
1984		* part of the exponentiation).
1985		*/
1986		uint32_t br_rsa_pss_sig_unpad(
1987		const br_hash_class *hf_data, const br_hash_class *hf_mgf1,
1988		const unsigned char *hash, size_t salt_len,
1989		const br_rsa_public_key *pk, unsigned char *x);
1990
1991		/*
1992		* Apply OAEP padding. Returned value is the actual padded string length,
1993		* or zero on error.
1994		*/
1995		size_t br_rsa_oaep_pad(const br_prng_class **rnd, const br_hash_class *dig,
1996		const void *label, size_t label_len, const br_rsa_public_key *pk,
1997		void *dst, size_t dst_nax_len, const void *src, size_t src_len);
1998
1999		/*
2000		* Unravel and check OAEP padding. If the padding is correct, then 1 is
2001		* returned, '*len' is adjusted to the length of the message, and the
2002		* data is moved to the start of the 'data' buffer. If the padding is
2003		* incorrect, then 0 is returned and '*len' is untouched. Either way,
2004		* the complete buffer contents are altered.
2005		*/
2006		uint32_t br_rsa_oaep_unpad(const br_hash_class *dig,
2007		const void *label, size_t label_len, void *data, size_t *len);
2008
2009		/*
2010		* Compute MGF1 for a given seed, and XOR the output into the provided
2011		* buffer.
2012		*/
2013		void br_mgf1_xor(void *data, size_t len,
2014		const br_hash_class *dig, const void *seed, size_t seed_len);
2015
2016		/*
2017		* Inner function for RSA key generation; used by the "i31" and "i62"
2018		* implementations.
2019		*/
2020		uint32_t br_rsa_i31_keygen_inner(const br_prng_class **rng,
2021		br_rsa_private_key *sk, void *kbuf_priv,
2022		br_rsa_public_key *pk, void *kbuf_pub,
2023		unsigned size, uint32_t pubexp, br_i31_modpow_opt_type mp31);
2024
2025		/* ==================================================================== */
2026		/*
2027		* Elliptic curves.
2028		*/
2029
2030		/*
2031		* Type for generic EC parameters: curve order (unsigned big-endian
2032		* encoding) and encoded conventional generator.
2033		*/
2034		typedef struct {
2035		int curve;
2036		const unsigned char *order;
2037		size_t order_len;
2038		const unsigned char *generator;
2039		size_t generator_len;
2040		} br_ec_curve_def;
2041
2042		extern const br_ec_curve_def br_secp256r1;
2043		extern const br_ec_curve_def br_secp384r1;
2044		extern const br_ec_curve_def br_secp521r1;
2045
2046		/*
2047		* For Curve25519, the advertised "order" really is 2^255-1, since the
2048		* point multiplication function really works over arbitrary 255-bit
2049		* scalars. This value is only meant as a hint for ECDH key generation;
2050		* only ECDSA uses the exact curve order, and ECDSA is not used with
2051		* that specific curve.
2052		*/
2053		extern const br_ec_curve_def br_curve25519;
2054
2055		/*
2056		* Decode some bytes as an i31 integer, with truncation (corresponding
2057		* to the 'bits2int' operation in RFC 6979). The target ENCODED bit
2058		* length is provided as last parameter. The resulting value will have
2059		* this declared bit length, and consists of the big-endian unsigned decoding
2060		* of exactly that many bits in the source (capped at the source length).
2061		*/
2062		void br_ecdsa_i31_bits2int(uint32_t *x,
2063		const void *src, size_t len, uint32_t ebitlen);
2064
2065		/*
2066		* Decode some bytes as an i15 integer, with truncation (corresponding
2067		* to the 'bits2int' operation in RFC 6979). The target ENCODED bit
2068		* length is provided as last parameter. The resulting value will have
2069		* this declared bit length, and consists of the big-endian unsigned decoding
2070		* of exactly that many bits in the source (capped at the source length).
2071		*/
2072		void br_ecdsa_i15_bits2int(uint16_t *x,
2073		const void *src, size_t len, uint32_t ebitlen);
2074
2075		/* ==================================================================== */
2076		/*
2077		* ASN.1 support functions.
2078		*/
2079
2080		/*
2081		* A br_asn1_uint structure contains encoding information about an
2082		* INTEGER nonnegative value: pointer to the integer contents (unsigned
2083		* big-endian representation), length of the integer contents,
2084		* and length of the encoded value. The data shall have minimal length:
2085		* - If the integer value is zero, then 'len' must be zero.
2086		* - If the integer value is not zero, then data[0] must be non-zero.
2087		*
2088		* Under these conditions, 'asn1len' is necessarily equal to either len
2089		* or len+1.
2090		*/
2091		typedef struct {
2092		const unsigned char *data;
2093		size_t len;
2094		size_t asn1len;
2095		} br_asn1_uint;
2096
2097		/*
2098		* Given an encoded integer (unsigned big-endian, with possible leading
2099		* bytes of value 0), return the "prepared INTEGER" structure.
2100		*/
2101		br_asn1_uint br_asn1_uint_prepare(const void *xdata, size_t xlen);
2102
2103		/*
2104		* Encode an ASN.1 length. The length of the encoded length is returned.
2105		* If 'dest' is NULL, then no encoding is performed, but the length of
2106		* the encoded length is still computed and returned.
2107		*/
2108		size_t br_asn1_encode_length(void *dest, size_t len);
2109
2110		/*
2111		* Convenient macro for computing lengths of lengths.
2112		*/
2113		#define len_of_len(len) br_asn1_encode_length(NULL, len)
2114
2115		/*
2116		* Encode a (prepared) ASN.1 INTEGER. The encoded length is returned.
2117		* If 'dest' is NULL, then no encoding is performed, but the length of
2118		* the encoded integer is still computed and returned.
2119		*/
2120		size_t br_asn1_encode_uint(void *dest, br_asn1_uint pp);
2121
2122		/*
2123		* Get the OID that identifies an elliptic curve. Returned value is
2124		* the DER-encoded OID, with the length (always one byte) but without
2125		* the tag. Thus, the first byte of the returned buffer contains the
2126		* number of subsequent bytes in the value. If the curve is not
2127		* recognised, NULL is returned.
2128		*/
2129		const unsigned char *br_get_curve_OID(int curve);
2130
2131		/*
2132		* Inner function for EC private key encoding. This is equivalent to
2133		* the API function br_encode_ec_raw_der(), except for an extra
2134		* parameter: if 'include_curve_oid' is zero, then the curve OID is
2135		* _not_ included in the output blob (this is for PKCS#8 support).
2136		*/
2137		size_t br_encode_ec_raw_der_inner(void *dest,
2138		const br_ec_private_key *sk, const br_ec_public_key *pk,
2139		int include_curve_oid);
2140
2141		/* ==================================================================== */
2142		/*
2143		* SSL/TLS support functions.
2144		*/
2145
2146		/*
2147		* Record types.
2148		*/
2149		#define BR_SSL_CHANGE_CIPHER_SPEC 20
2150		#define BR_SSL_ALERT 21
2151		#define BR_SSL_HANDSHAKE 22
2152		#define BR_SSL_APPLICATION_DATA 23
2153
2154		/*
2155		* Handshake message types.
2156		*/
2157		#define BR_SSL_HELLO_REQUEST 0
2158		#define BR_SSL_CLIENT_HELLO 1
2159		#define BR_SSL_SERVER_HELLO 2
2160		#define BR_SSL_CERTIFICATE 11
2161		#define BR_SSL_SERVER_KEY_EXCHANGE 12
2162		#define BR_SSL_CERTIFICATE_REQUEST 13
2163		#define BR_SSL_SERVER_HELLO_DONE 14
2164		#define BR_SSL_CERTIFICATE_VERIFY 15
2165		#define BR_SSL_CLIENT_KEY_EXCHANGE 16
2166		#define BR_SSL_FINISHED 20
2167
2168		/*
2169		* Alert levels.
2170		*/
2171		#define BR_LEVEL_WARNING 1
2172		#define BR_LEVEL_FATAL 2
2173
2174		/*
2175		* Low-level I/O state.
2176		*/
2177		#define BR_IO_FAILED 0
2178		#define BR_IO_IN 1
2179		#define BR_IO_OUT 2
2180		#define BR_IO_INOUT 3
2181
2182		/*
2183		* Mark a SSL engine as failed. The provided error code is recorded if
2184		* the engine was not already marked as failed. If 'err' is 0, then the
2185		* engine is marked as closed (without error).
2186		*/
2187		void br_ssl_engine_fail(br_ssl_engine_context *cc, int err);
2188
2189		/*
2190		* Report whether the engine has stopped (clean close or failure alike).
2191		*/
2192		static inline int
2193		br_ssl_engine_closed(const br_ssl_engine_context *cc)
2194		{
2195		return BR_IO_FAILED == cc->iomode;
2196		}
2197
2198		/*
2199		* Configure a new maximum fragment length. If possible, the maximum
2200		* length for outgoing records is immediately adjusted (if there are
2201		* not already too many buffered bytes for that).
2202		*/
2203		void br_ssl_engine_new_max_frag_len(
2204		br_ssl_engine_context *rc, unsigned max_frag_len);
2205
2206		/*
2207		* Test whether the current incoming record has been fully received
2208		* or not. This function returns 0 only if a complete record header
2209		* has been received, but some of the (possibly encrypted) payload
2210		* has not yet been obtained.
2211		*/
2212		int br_ssl_engine_recvrec_finished(const br_ssl_engine_context *rc);
2213
2214		/*
2215		* Flush the current record (if not empty). This is meant to be called
2216		* from the handshake processor only.
2217		*/
2218		void br_ssl_engine_flush_record(br_ssl_engine_context *cc);
2219
2220		/*
2221		* Report whether payload has accumulated: oxa differs from both oxb and oxc.
2222		*/
2223		static inline int
2224		br_ssl_engine_has_pld_to_send(const br_ssl_engine_context *rc)
2225		{
2226		return !(rc->oxa == rc->oxb || rc->oxa == rc->oxc);
2227		}
2228
2229		/*
2230		* Initialize RNG in engine. Returned value is 1 on success, 0 on error.
2231		* This function will try to use the OS-provided RNG, if available. If
2232		* there is no OS-provided RNG, or if it failed, and no entropy was
2233		* injected by the caller, then a failure will be reported. On error,
2234		* the context error code is set.
2235		*/
2236		int br_ssl_engine_init_rand(br_ssl_engine_context *cc);
2237
2238		/*
2239		* Reset the handshake-related parts of the engine.
2240		*/
2241		void br_ssl_engine_hs_reset(br_ssl_engine_context *cc,
2242		void (*hsinit)(void *), void (*hsrun)(void *));
2243
2244		/*
2245		* Get the PRF to use for this context, for the provided PRF hash
2246		* function ID.
2247		*/
2248		br_tls_prf_impl br_ssl_engine_get_PRF(br_ssl_engine_context *cc, int prf_id);
2249
2250		/*
2251		* Consume the provided pre-master secret and compute the corresponding
2252		* master secret. The 'prf_id' is the ID of the hash function to use
2253		* with the TLS 1.2 PRF (ignored if the version is TLS 1.0 or 1.1).
2254		*/
2255		void br_ssl_engine_compute_master(br_ssl_engine_context *cc,
2256		int prf_id, const void *pms, size_t len);
2257
2258		/*
2259		* Switch to CBC decryption for incoming records.
2260		* cc the engine context
2261		* is_client non-zero for a client, zero for a server
2262		* prf_id id of hash function for PRF (ignored if not TLS 1.2+)
2263		* mac_id id of hash function for HMAC
2264		* bc_impl block cipher implementation (CBC decryption)
2265		* cipher_key_len block cipher key length (in bytes)
2266		*/
2267		void br_ssl_engine_switch_cbc_in(br_ssl_engine_context *cc,
2268		int is_client, int prf_id, int mac_id,
2269		const br_block_cbcdec_class *bc_impl, size_t cipher_key_len);
2270
2271		/*
2272		* Switch to CBC encryption for outgoing records.
2273		* cc the engine context
2274		* is_client non-zero for a client, zero for a server
2275		* prf_id id of hash function for PRF (ignored if not TLS 1.2+)
2276		* mac_id id of hash function for HMAC
2277		* bc_impl block cipher implementation (CBC encryption)
2278		* cipher_key_len block cipher key length (in bytes)
2279		*/
2280		void br_ssl_engine_switch_cbc_out(br_ssl_engine_context *cc,
2281		int is_client, int prf_id, int mac_id,
2282		const br_block_cbcenc_class *bc_impl, size_t cipher_key_len);
2283
2284		/*
2285		* Switch to GCM decryption for incoming records.
2286		* cc the engine context
2287		* is_client non-zero for a client, zero for a server
2288		* prf_id id of hash function for PRF
2289		* bc_impl block cipher implementation (CTR)
2290		* cipher_key_len block cipher key length (in bytes)
2291		*/
2292		void br_ssl_engine_switch_gcm_in(br_ssl_engine_context *cc,
2293		int is_client, int prf_id,
2294		const br_block_ctr_class *bc_impl, size_t cipher_key_len);
2295
2296		/*
2297		* Switch to GCM encryption for outgoing records.
2298		* cc the engine context
2299		* is_client non-zero for a client, zero for a server
2300		* prf_id id of hash function for PRF
2301		* bc_impl block cipher implementation (CTR)
2302		* cipher_key_len block cipher key length (in bytes)
2303		*/
2304		void br_ssl_engine_switch_gcm_out(br_ssl_engine_context *cc,
2305		int is_client, int prf_id,
2306		const br_block_ctr_class *bc_impl, size_t cipher_key_len);
2307
2308		/*
2309		* Switch to ChaCha20+Poly1305 decryption for incoming records.
2310		* cc the engine context
2311		* is_client non-zero for a client, zero for a server
2312		* prf_id id of hash function for PRF
2313		*/
2314		void br_ssl_engine_switch_chapol_in(br_ssl_engine_context *cc,
2315		int is_client, int prf_id);
2316
2317		/*
2318		* Switch to ChaCha20+Poly1305 encryption for outgoing records.
2319		* cc the engine context
2320		* is_client non-zero for a client, zero for a server
2321		* prf_id id of hash function for PRF
2322		*/
2323		void br_ssl_engine_switch_chapol_out(br_ssl_engine_context *cc,
2324		int is_client, int prf_id);
2325
2326		/*
2327		* Switch to CCM decryption for incoming records.
2328		* cc the engine context
2329		* is_client non-zero for a client, zero for a server
2330		* prf_id id of hash function for PRF
2331		* bc_impl block cipher implementation (CTR+CBC)
2332		* cipher_key_len block cipher key length (in bytes)
2333		* tag_len tag length (in bytes)
2334		*/
2335		void br_ssl_engine_switch_ccm_in(br_ssl_engine_context *cc,
2336		int is_client, int prf_id,
2337		const br_block_ctrcbc_class *bc_impl,
2338		size_t cipher_key_len, size_t tag_len);
2339
2340		/*
2341		* Switch to CCM encryption for outgoing records.
2342		* cc the engine context
2343		* is_client non-zero for a client, zero for a server
2344		* prf_id id of hash function for PRF
2345		* bc_impl block cipher implementation (CTR+CBC)
2346		* cipher_key_len block cipher key length (in bytes)
2347		* tag_len tag length (in bytes)
2348		*/
2349		void br_ssl_engine_switch_ccm_out(br_ssl_engine_context *cc,
2350		int is_client, int prf_id,
2351		const br_block_ctrcbc_class *bc_impl,
2352		size_t cipher_key_len, size_t tag_len);
2353
2354		/*
2355		* Calls to T0-generated code.
2356		*/
2357		void br_ssl_hs_client_init_main(void *ctx);
2358		void br_ssl_hs_client_run(void *ctx);
2359		void br_ssl_hs_server_init_main(void *ctx);
2360		void br_ssl_hs_server_run(void *ctx);
2361
2362		/*
2363		* Get the hash function to use for signatures, given a bit mask of
2364		* supported hash functions. This implements a strict choice order
2365		* (namely SHA-256, SHA-384, SHA-512, SHA-224, SHA-1). If the mask
2366		* does not document support of any of these hash functions, then this
2367		* function returns 0.
2368		*/
2369		int br_ssl_choose_hash(unsigned bf);
2370
2371		/* ==================================================================== */
2372
2373		/*
2374		* PowerPC / POWER assembly stuff. The special BR_POWER_ASM_MACROS macro
2375		* must be defined before including this file; this is done by source
2376		* files that use some inline assembly for PowerPC / POWER machines.
2377		*/
2378
2379		#if BR_POWER_ASM_MACROS
2380
2381		#define lxvw4x(xt, ra, rb) lxvw4x_(xt, ra, rb)
2382		#define stxvw4x(xt, ra, rb) stxvw4x_(xt, ra, rb)
2383
2384		#define bdnz(foo) bdnz_(foo)
2385		#define bdz(foo) bdz_(foo)
2386		#define beq(foo) beq_(foo)
2387
2388		#define li(rx, value) li_(rx, value)
2389		#define addi(rx, ra, imm) addi_(rx, ra, imm)
2390		#define cmpldi(rx, imm) cmpldi_(rx, imm)
2391		#define mtctr(rx) mtctr_(rx)
2392		#define vspltb(vrt, vrb, uim) vspltb_(vrt, vrb, uim)
2393		#define vspltw(vrt, vrb, uim) vspltw_(vrt, vrb, uim)
2394		#define vspltisb(vrt, imm) vspltisb_(vrt, imm)
2395		#define vspltisw(vrt, imm) vspltisw_(vrt, imm)
2396		#define vrlw(vrt, vra, vrb) vrlw_(vrt, vra, vrb)
2397		#define vsbox(vrt, vra) vsbox_(vrt, vra)
2398		#define vxor(vrt, vra, vrb) vxor_(vrt, vra, vrb)
2399		#define vand(vrt, vra, vrb) vand_(vrt, vra, vrb)
2400		#define vsro(vrt, vra, vrb) vsro_(vrt, vra, vrb)
2401		#define vsl(vrt, vra, vrb) vsl_(vrt, vra, vrb)
2402		#define vsldoi(vt, va, vb, sh) vsldoi_(vt, va, vb, sh)
2403		#define vsr(vrt, vra, vrb) vsr_(vrt, vra, vrb)
2404		#define vaddcuw(vrt, vra, vrb) vaddcuw_(vrt, vra, vrb)
2405		#define vadduwm(vrt, vra, vrb) vadduwm_(vrt, vra, vrb)
2406		#define vsububm(vrt, vra, vrb) vsububm_(vrt, vra, vrb)
2407		#define vsubuwm(vrt, vra, vrb) vsubuwm_(vrt, vra, vrb)
2408		#define vsrw(vrt, vra, vrb) vsrw_(vrt, vra, vrb)
2409		#define vcipher(vt, va, vb) vcipher_(vt, va, vb)
2410		#define vcipherlast(vt, va, vb) vcipherlast_(vt, va, vb)
2411		#define vncipher(vt, va, vb) vncipher_(vt, va, vb)
2412		#define vncipherlast(vt, va, vb) vncipherlast_(vt, va, vb)
2413		#define vperm(vt, va, vb, vc) vperm_(vt, va, vb, vc)
2414		#define vpmsumd(vt, va, vb) vpmsumd_(vt, va, vb)
2415		#define xxpermdi(vt, va, vb, d) xxpermdi_(vt, va, vb, d)
2416
2417		#define lxvw4x_(xt, ra, rb) "\tlxvw4x\t" #xt "," #ra "," #rb "\n"
2418		#define stxvw4x_(xt, ra, rb) "\tstxvw4x\t" #xt "," #ra "," #rb "\n"
2419
2420		#define label(foo) #foo "%=:\n"
2421		#define bdnz_(foo) "\tbdnz\t" #foo "%=\n"
2422		#define bdz_(foo) "\tbdz\t" #foo "%=\n"
2423		#define beq_(foo) "\tbeq\t" #foo "%=\n"
2424
2425		#define li_(rx, value) "\tli\t" #rx "," #value "\n"
2426		#define addi_(rx, ra, imm) "\taddi\t" #rx "," #ra "," #imm "\n"
2427		#define cmpldi_(rx, imm) "\tcmpldi\t" #rx "," #imm "\n"
2428		#define mtctr_(rx) "\tmtctr\t" #rx "\n"
2429		#define vspltb_(vrt, vrb, uim) "\tvspltb\t" #vrt "," #vrb "," #uim "\n"
2430		#define vspltw_(vrt, vrb, uim) "\tvspltw\t" #vrt "," #vrb "," #uim "\n"
2431		#define vspltisb_(vrt, imm) "\tvspltisb\t" #vrt "," #imm "\n"
2432		#define vspltisw_(vrt, imm) "\tvspltisw\t" #vrt "," #imm "\n"
2433		#define vrlw_(vrt, vra, vrb) "\tvrlw\t" #vrt "," #vra "," #vrb "\n"
2434		#define vsbox_(vrt, vra) "\tvsbox\t" #vrt "," #vra "\n"
2435		#define vxor_(vrt, vra, vrb) "\tvxor\t" #vrt "," #vra "," #vrb "\n"
2436		#define vand_(vrt, vra, vrb) "\tvand\t" #vrt "," #vra "," #vrb "\n"
2437		#define vsro_(vrt, vra, vrb) "\tvsro\t" #vrt "," #vra "," #vrb "\n"
2438		#define vsl_(vrt, vra, vrb) "\tvsl\t" #vrt "," #vra "," #vrb "\n"
2439		#define vsldoi_(vt, va, vb, sh) "\tvsldoi\t" #vt "," #va "," #vb "," #sh "\n"
2440		#define vsr_(vrt, vra, vrb) "\tvsr\t" #vrt "," #vra "," #vrb "\n"
2441		#define vaddcuw_(vrt, vra, vrb) "\tvaddcuw\t" #vrt "," #vra "," #vrb "\n"
2442		#define vadduwm_(vrt, vra, vrb) "\tvadduwm\t" #vrt "," #vra "," #vrb "\n"
2443		#define vsububm_(vrt, vra, vrb) "\tvsububm\t" #vrt "," #vra "," #vrb "\n"
2444		#define vsubuwm_(vrt, vra, vrb) "\tvsubuwm\t" #vrt "," #vra "," #vrb "\n"
2445		#define vsrw_(vrt, vra, vrb) "\tvsrw\t" #vrt "," #vra "," #vrb "\n"
2446		#define vcipher_(vt, va, vb) "\tvcipher\t" #vt "," #va "," #vb "\n"
2447		#define vcipherlast_(vt, va, vb) "\tvcipherlast\t" #vt "," #va "," #vb "\n"
2448		#define vncipher_(vt, va, vb) "\tvncipher\t" #vt "," #va "," #vb "\n"
2449		#define vncipherlast_(vt, va, vb) "\tvncipherlast\t" #vt "," #va "," #vb "\n"
2450		#define vperm_(vt, va, vb, vc) "\tvperm\t" #vt "," #va "," #vb "," #vc "\n"
2451		#define vpmsumd_(vt, va, vb) "\tvpmsumd\t" #vt "," #va "," #vb "\n"
2452		#define xxpermdi_(vt, va, vb, d) "\txxpermdi\t" #vt "," #va "," #vb "," #d "\n"
2453
2454		#endif
2455
2456		/* ==================================================================== */
2457		/*
2458		* Special "activate intrinsics" code, needed for some compiler versions.
2459		* This is defined at the end of this file, so that it won't impact any
2460		* of the inline functions defined previously; and it is controlled by
2461		* a specific macro defined in the caller code.
2462		*
2463		* Calling code conventions:
2464		*
2465		* - Caller must define BR_ENABLE_INTRINSICS before including "inner.h".
2466		* - Functions that use intrinsics must be enclosed in an "enabled"
2467		* region (between BR_TARGETS_X86_UP and BR_TARGETS_X86_DOWN).
2468		* - Functions that use intrinsics must be tagged with the appropriate
2469		* BR_TARGET().
2470		*/
2471
2472		#if BR_ENABLE_INTRINSICS && (BR_GCC_4_4 || BR_CLANG_3_7 || BR_MSC_2005)
2473		
2474		/*
2475		* x86 intrinsics (both 32-bit and 64-bit).
2476		*/
2477		#if BR_i386 || BR_amd64
2478		
2479		/*
2480		* On GCC before version 5.0, we need to use the pragma to enable the
2481		* target options globally, because the 'target' function attribute
2482		* appears to be unreliable. Before 4.6 we must also avoid the
2483		* push_options / pop_options mechanism, because it tends to trigger
2484		* some internal compiler errors.
2485		*/
2486		#if BR_GCC && !BR_GCC_5_0
2487		#if BR_GCC_4_6
2488		#define BR_TARGETS_X86_UP \
2489		_Pragma("GCC push_options") \
2490		_Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul,rdrnd\")")
2491		#define BR_TARGETS_X86_DOWN \
2492		_Pragma("GCC pop_options")
2493		#else
2494		#define BR_TARGETS_X86_UP \
2495		_Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul\")")
2496		#define BR_TARGETS_X86_DOWN
2497		#endif
2498		#pragma GCC diagnostic ignored "-Wpsabi"
2499		#endif
2500		
2501		#if BR_CLANG && !BR_CLANG_3_8
2502		#undef __SSE2__
2503		#undef __SSE3__
2504		#undef __SSSE3__
2505		#undef __SSE4_1__
2506		#undef __AES__
2507		#undef __PCLMUL__
2508		#undef __RDRND__
2509		#define __SSE2__ 1
2510		#define __SSE3__ 1
2511		#define __SSSE3__ 1
2512		#define __SSE4_1__ 1
2513		#define __AES__ 1
2514		#define __PCLMUL__ 1
2515		#define __RDRND__ 1
2516		#endif
2517		
2518		#ifndef BR_TARGETS_X86_UP
2519		#define BR_TARGETS_X86_UP
2520		#endif
2521		#ifndef BR_TARGETS_X86_DOWN
2522		#define BR_TARGETS_X86_DOWN
2523		#endif
2524		
2525		#if BR_GCC || BR_CLANG
2526		BR_TARGETS_X86_UP
2527		#include <x86intrin.h>
2528		#include <cpuid.h>
2529		#define br_bswap32 __builtin_bswap32
2530		BR_TARGETS_X86_DOWN
2531		#endif
2532		
2533		#if BR_MSC
2534		#include <stdlib.h>
2535		#include <intrin.h>
2536		#include <immintrin.h>
2537		#define br_bswap32 _byteswap_ulong
2538		#endif
2539		/* Return 1 if CPUID leaf 1 reports every bit of each mask as set, else 0. */
2540		static inline int
2541		br_cpuid(uint32_t mask_eax, uint32_t mask_ebx,
2542		uint32_t mask_ecx, uint32_t mask_edx)
2543		{
2544		#if BR_GCC || BR_CLANG
2545		unsigned eax, ebx, ecx, edx;
2546		
2547		if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
2548		if ((eax & mask_eax) == mask_eax
2549		&& (ebx & mask_ebx) == mask_ebx
2550		&& (ecx & mask_ecx) == mask_ecx
2551		&& (edx & mask_edx) == mask_edx)
2552		{
2553		return 1;
2554		}
2555		}
2556		#elif BR_MSC
2557		int info[4];
2558		
2559		__cpuid(info, 1);
2560		if (((uint32_t)info[0] & mask_eax) == mask_eax
2561		&& ((uint32_t)info[1] & mask_ebx) == mask_ebx
2562		&& ((uint32_t)info[2] & mask_ecx) == mask_ecx
2563		&& ((uint32_t)info[3] & mask_edx) == mask_edx)
2564		{
2565		return 1;
2566		}
2567		#endif
2568		return 0;
2569		}
2570		
2571		#endif
2572		
2573		#endif
2574
2575		/* ==================================================================== */
2576
2577		#endif