File Coverage

src/symcipher/chacha20_sse2.c

Criterion	Covered	Total	%
statement	61	74	82.4
branch	6	8	75.0
condition			n/a
subroutine			n/a
pod			n/a
total	67	82	81.7

line	stmt	bran	code
1			/*
2			* Copyright (c) 2017 Thomas Pornin
3			*
4			* Permission is hereby granted, free of charge, to any person obtaining
5			* a copy of this software and associated documentation files (the
6			* "Software"), to deal in the Software without restriction, including
7			* without limitation the rights to use, copy, modify, merge, publish,
8			* distribute, sublicense, and/or sell copies of the Software, and to
9			* permit persons to whom the Software is furnished to do so, subject to
10			* the following conditions:
11			*
12			* The above copyright notice and this permission notice shall be
13			* included in all copies or substantial portions of the Software.
14			*
15			* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16			* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17			* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18			* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19			* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20			* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21			* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22			* SOFTWARE.
23			*/
24
25			#define BR_ENABLE_INTRINSICS 1
26			#include "inner.h"
27
28			#if BR_SSE2
29
30			/*
31			* This file contains a ChaCha20 implementation that leverages SSE2
32			* opcodes for better performance.
33			*/
34
35			/* see bearssl_block.h */
36			br_chacha20_run
37	2		br_chacha20_sse2_get(void)
38			{
39			/*
40			* If using 64-bit mode, then SSE2 opcodes should be automatically
41			* available, since they are part of the ABI.
42			*
43			* In 32-bit mode, we use CPUID to detect the SSE2 feature.
44			*/
45
46			#if BR_amd64
47	2		return &br_chacha20_sse2_run;
48			#else
49
50			/*
51			* SSE2 support is indicated by bit 26 in EDX.
52			*/
53			if (br_cpuid(0, 0, 0, 0x04000000)) {
54			return &br_chacha20_sse2_run;
55			} else {
56			return 0;
57			}
58			#endif
59			}
60
61			BR_TARGETS_X86_UP
62
63			/* see bearssl_block.h */
64			BR_TARGET("sse2")
65			uint32_t
66	24		br_chacha20_sse2_run(const void *key,
67			const void *iv, uint32_t cc, void *data, size_t len)
68			{
69			unsigned char *buf;
70			uint32_t ivtmp[4];
71			__m128i kw0, kw1;
72			__m128i iw, cw;
73			__m128i one;
74
75			static const uint32_t CW[] = {
76			0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
77			};
78
79	24		buf = data;
80	24		kw0 = _mm_loadu_si128(key);
81	24		kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
82	24		ivtmp[0] = cc;
83	24		memcpy(ivtmp + 1, iv, 12);
84	24		iw = _mm_loadu_si128((const void *)ivtmp);
85	24		cw = _mm_loadu_si128((const void *)CW);
86	24		one = _mm_set_epi32(0, 0, 0, 1);
87
88	24	50	while (len > 0) {
89			/*
90			* sj contains state words 4j to 4j+3.
91			*/
92			__m128i s0, s1, s2, s3;
93			int i;
94
95	24		s0 = cw;
96	24		s1 = kw0;
97	24		s2 = kw1;
98	24		s3 = iw;
99	264	100	for (i = 0; i < 10; i ++) {
100			/*
101			* Even round is straightforward application on
102			* the state words.
103			*/
104	240		s0 = _mm_add_epi32(s0, s1);
105	240		s3 = _mm_xor_si128(s3, s0);
106	720		s3 = _mm_or_si128(
107			_mm_slli_epi32(s3, 16),
108			_mm_srli_epi32(s3, 16));
109
110	240		s2 = _mm_add_epi32(s2, s3);
111	240		s1 = _mm_xor_si128(s1, s2);
112	720		s1 = _mm_or_si128(
113			_mm_slli_epi32(s1, 12),
114			_mm_srli_epi32(s1, 20));
115
116	240		s0 = _mm_add_epi32(s0, s1);
117	240		s3 = _mm_xor_si128(s3, s0);
118	720		s3 = _mm_or_si128(
119			_mm_slli_epi32(s3, 8),
120			_mm_srli_epi32(s3, 24));
121
122	240		s2 = _mm_add_epi32(s2, s3);
123	240		s1 = _mm_xor_si128(s1, s2);
124	480		s1 = _mm_or_si128(
125			_mm_slli_epi32(s1, 7),
126			_mm_srli_epi32(s1, 25));
127
128			/*
129			* For the odd round, we must rotate some state
130			* words so that the computations apply on the
131			* right combinations of words.
132			*/
133	240		s1 = _mm_shuffle_epi32(s1, 0x39);
134	240		s2 = _mm_shuffle_epi32(s2, 0x4E);
135	240		s3 = _mm_shuffle_epi32(s3, 0x93);
136
137	240		s0 = _mm_add_epi32(s0, s1);
138	240		s3 = _mm_xor_si128(s3, s0);
139	720		s3 = _mm_or_si128(
140			_mm_slli_epi32(s3, 16),
141			_mm_srli_epi32(s3, 16));
142
143	240		s2 = _mm_add_epi32(s2, s3);
144	240		s1 = _mm_xor_si128(s1, s2);
145	720		s1 = _mm_or_si128(
146			_mm_slli_epi32(s1, 12),
147			_mm_srli_epi32(s1, 20));
148
149	240		s0 = _mm_add_epi32(s0, s1);
150	240		s3 = _mm_xor_si128(s3, s0);
151	720		s3 = _mm_or_si128(
152			_mm_slli_epi32(s3, 8),
153			_mm_srli_epi32(s3, 24));
154
155	240		s2 = _mm_add_epi32(s2, s3);
156	240		s1 = _mm_xor_si128(s1, s2);
157	480		s1 = _mm_or_si128(
158			_mm_slli_epi32(s1, 7),
159			_mm_srli_epi32(s1, 25));
160
161			/*
162			* After the odd round, we rotate back the values
163			* to undo the rotate at the start of the odd round.
164			*/
165	240		s1 = _mm_shuffle_epi32(s1, 0x93);
166	240		s2 = _mm_shuffle_epi32(s2, 0x4E);
167	240		s3 = _mm_shuffle_epi32(s3, 0x39);
168			}
169
170			/*
171			* Addition with the initial state.
172			*/
173	24		s0 = _mm_add_epi32(s0, cw);
174	24		s1 = _mm_add_epi32(s1, kw0);
175	24		s2 = _mm_add_epi32(s2, kw1);
176	24		s3 = _mm_add_epi32(s3, iw);
177
178			/*
179			* Increment block counter.
180			*/
181	24		iw = _mm_add_epi32(iw, one);
182
183			/*
184			* XOR final state with the data.
185			*/
186	24	50	if (len < 64) {
187			unsigned char tmp[64];
188			size_t u;
189
190			_mm_storeu_si128((void *)(tmp + 0), s0);
191	24		_mm_storeu_si128((void *)(tmp + 16), s1);
192	24		_mm_storeu_si128((void *)(tmp + 32), s2);
193	24		_mm_storeu_si128((void *)(tmp + 48), s3);
194	528	100	for (u = 0; u < len; u ++) {
195	504		buf[u] ^= tmp[u];
196			}
197	24		break;
198			} else {
199			__m128i b0, b1, b2, b3;
200
201	0		b0 = _mm_loadu_si128((const void *)(buf + 0));
202	0		b1 = _mm_loadu_si128((const void *)(buf + 16));
203	0		b2 = _mm_loadu_si128((const void *)(buf + 32));
204	0		b3 = _mm_loadu_si128((const void *)(buf + 48));
205	0		b0 = _mm_xor_si128(b0, s0);
206	0		b1 = _mm_xor_si128(b1, s1);
207	0		b2 = _mm_xor_si128(b2, s2);
208	0		b3 = _mm_xor_si128(b3, s3);
209			_mm_storeu_si128((void *)(buf + 0), b0);
210	0		_mm_storeu_si128((void *)(buf + 16), b1);
211	0		_mm_storeu_si128((void *)(buf + 32), b2);
212	0		_mm_storeu_si128((void *)(buf + 48), b3);
213	0		buf += 64;
214	0		len -= 64;
215			}
216			}
217
218			/*
219			* _mm_extract_epi32() requires SSE4.1. We prefer to stick to
220			* raw SSE2, thus we use _mm_extract_epi16().
221			*/
222	24		return (uint32_t)_mm_extract_epi16(iw, 0)
223	24		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
224			}
225
226			BR_TARGETS_X86_DOWN
227
228			#else
229
230			/* see bearssl_block.h */
231			br_chacha20_run
232			br_chacha20_sse2_get(void)
233			{
234			return 0;
235			}
236
237			#endif