File Coverage

src/hash/ghash_pclmul.c

Criterion	Covered	Total	%
statement	87	87	100.0
branch	9	10	90.0
condition			n/a
subroutine			n/a
pod			n/a
total	96	97	98.9

line	stmt	bran	code
1			/*
2			* Copyright (c) 2017 Thomas Pornin
3			*
4			* Permission is hereby granted, free of charge, to any person obtaining
5			* a copy of this software and associated documentation files (the
6			* "Software"), to deal in the Software without restriction, including
7			* without limitation the rights to use, copy, modify, merge, publish,
8			* distribute, sublicense, and/or sell copies of the Software, and to
9			* permit persons to whom the Software is furnished to do so, subject to
10			* the following conditions:
11			*
12			* The above copyright notice and this permission notice shall be
13			* included in all copies or substantial portions of the Software.
14			*
15			* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16			* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17			* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18			* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19			* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20			* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21			* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22			* SOFTWARE.
23			*/
24
25			#define BR_ENABLE_INTRINSICS 1
26			#include "inner.h"
27
28			/*
29			* This is the GHASH implementation that leverages the pclmulqdq opcode
30			* (from the AES-NI instructions).
31			*/
32
33			#if BR_AES_X86NI
34
35			/*
36			* Test CPU support for PCLMULQDQ.
37			*/
38			static inline int
39	52		pclmul_supported(void)
40			{
41			/*
42			* Bit mask for features in ECX:
43			* 1 PCLMULQDQ support
44			*/
45	52		return br_cpuid(0, 0, 0x00000002, 0);
46			}
47
48			/* see bearssl_hash.h */
49			br_ghash
50	52		br_ghash_pclmul_get(void)
51			{
52	52	50	return pclmul_supported() ? &br_ghash_pclmul : 0;
53			}
54
55			BR_TARGETS_X86_UP
56
57			/*
58			* GHASH is defined over elements of GF(2^128) with "full little-endian"
59			* representation: leftmost byte is least significant, and, within each
60			* byte, leftmost _bit_ is least significant. The natural ordering in
61			* x86 is "mixed little-endian": bytes are ordered from least to most
62			* significant, but bits within a byte are in most-to-least significant
63			* order. Going to full little-endian representation would require
64			* reversing bits within each byte, which is doable but expensive.
65			*
66			* Instead, we go to full big-endian representation, by swapping bytes
67			* around, which is done with a single _mm_shuffle_epi8() opcode (it
68			* comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We
69			* can use a full big-endian representation because in a carryless
70			* multiplication, we have a nice bit reversal property:
71			*
72			* rev_128(x) * rev_128(y) = rev_255(x * y)
73			*
74			* So by using full big-endian, we still get the right result, except
75			* that it is right-shifted by 1 bit. The left-shift is relatively
76			* inexpensive, and it can be mutualised.
77			*
78			*
79			* Since SSE2 opcodes do not have facilities for shitfting full 128-bit
80			* values with bit precision, we have to break down values into 64-bit
81			* chunks. We number chunks from 0 to 3 in left to right order.
82			*/
83
84			/*
85			* Byte-swap a complete 128-bit value. This normally uses
86			* _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
87			* However, this crashes old Clang versions, so, for Clang before 3.8,
88			* we use an alternate (and less efficient) version.
89			*/
90			#if BR_CLANG && !BR_CLANG_3_8
91			#define BYTESWAP_DECL
92			#define BYTESWAP_PREP (void)0
93			#define BYTESWAP(x) do { \
94			__m128i byteswap1, byteswap2; \
95			byteswap1 = (x); \
96			byteswap2 = _mm_srli_epi16(byteswap1, 8); \
97			byteswap1 = _mm_slli_epi16(byteswap1, 8); \
98			byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
99			byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
100			byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
101			(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
102			} while (0)
103			#else
104			#define BYTESWAP_DECL __m128i byteswap_index;
105			#define BYTESWAP_PREP do { \
106			byteswap_index = _mm_set_epi8( \
107			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
108			} while (0)
109			#define BYTESWAP(x) do { \
110			(x) = _mm_shuffle_epi8((x), byteswap_index); \
111			} while (0)
112			#endif
113
114			/*
115			* Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
116			* for that compiler, we use inline assembly. Inline assembly is
117			* potentially a bit slower because the compiler does not understand
118			* what the opcode does, and thus cannot optimize instruction
119			* scheduling.
120			*
121			* We use a target of "sse2" only, so that Clang may still handle the
122			* '__m128i' type and allocate SSE2 registers.
123			*/
124			#if BR_CLANG
125			BR_TARGET("sse2")
126			static inline __m128i
127			pclmulqdq00(__m128i x, __m128i y)
128			{
129			__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
130			return x;
131			}
132			BR_TARGET("sse2")
133			static inline __m128i
134			pclmulqdq11(__m128i x, __m128i y)
135			{
136			__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
137			return x;
138			}
139			#else
140			#define pclmulqdq00(x, y) _mm_clmulepi64_si128(x, y, 0x00)
141			#define pclmulqdq11(x, y) _mm_clmulepi64_si128(x, y, 0x11)
142			#endif
143
144			/*
145			* From a 128-bit value kw, compute kx as the XOR of the two 64-bit
146			* halves of kw (into the right half of kx; left half is unspecified).
147			*/
148			#define BK(kw, kx) do { \
149			kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
150			} while (0)
151
152			/*
153			* Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
154			* the XOR of the two values (kx).
155			*/
156			#define PBK(k0, k1, kw, kx) do { \
157			kw = _mm_unpacklo_epi64(k1, k0); \
158			kx = _mm_xor_si128(k0, k1); \
159			} while (0)
160
161			/*
162			* Left-shift by 1 bit a 256-bit value (in four 64-bit words).
163			*/
164			#define SL_256(x0, x1, x2, x3) do { \
165			x0 = _mm_or_si128( \
166			_mm_slli_epi64(x0, 1), \
167			_mm_srli_epi64(x1, 63)); \
168			x1 = _mm_or_si128( \
169			_mm_slli_epi64(x1, 1), \
170			_mm_srli_epi64(x2, 63)); \
171			x2 = _mm_or_si128( \
172			_mm_slli_epi64(x2, 1), \
173			_mm_srli_epi64(x3, 63)); \
174			x3 = _mm_slli_epi64(x3, 1); \
175			} while (0)
176
177			/*
178			* Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
179			* result is written in x0..x1.
180			*/
181			#define REDUCE_F128(x0, x1, x2, x3) do { \
182			x1 = _mm_xor_si128( \
183			x1, \
184			_mm_xor_si128( \
185			_mm_xor_si128( \
186			x3, \
187			_mm_srli_epi64(x3, 1)), \
188			_mm_xor_si128( \
189			_mm_srli_epi64(x3, 2), \
190			_mm_srli_epi64(x3, 7)))); \
191			x2 = _mm_xor_si128( \
192			_mm_xor_si128( \
193			x2, \
194			_mm_slli_epi64(x3, 63)), \
195			_mm_xor_si128( \
196			_mm_slli_epi64(x3, 62), \
197			_mm_slli_epi64(x3, 57))); \
198			x0 = _mm_xor_si128( \
199			x0, \
200			_mm_xor_si128( \
201			_mm_xor_si128( \
202			x2, \
203			_mm_srli_epi64(x2, 1)), \
204			_mm_xor_si128( \
205			_mm_srli_epi64(x2, 2), \
206			_mm_srli_epi64(x2, 7)))); \
207			x1 = _mm_xor_si128( \
208			_mm_xor_si128( \
209			x1, \
210			_mm_slli_epi64(x2, 63)), \
211			_mm_xor_si128( \
212			_mm_slli_epi64(x2, 62), \
213			_mm_slli_epi64(x2, 57))); \
214			} while (0)
215
216			/*
217			* Square value kw into (dw,dx).
218			*/
219			#define SQUARE_F128(kw, dw, dx) do { \
220			__m128i z0, z1, z2, z3; \
221			z1 = pclmulqdq11(kw, kw); \
222			z3 = pclmulqdq00(kw, kw); \
223			z0 = _mm_shuffle_epi32(z1, 0x0E); \
224			z2 = _mm_shuffle_epi32(z3, 0x0E); \
225			SL_256(z0, z1, z2, z3); \
226			REDUCE_F128(z0, z1, z2, z3); \
227			PBK(z0, z1, dw, dx); \
228			} while (0)
229
230			/* see bearssl_hash.h */
231			BR_TARGET("ssse3,pclmul")
232			void
233	68		br_ghash_pclmul(void y, const void h, const void *data, size_t len)
234			{
235			const unsigned char buf1, buf2;
236			unsigned char tmp[64];
237			size_t num4, num1;
238			__m128i yw, h1w, h1x;
239			BYTESWAP_DECL
240
241			/*
242			* We split data into two chunks. First chunk starts at buf1
243			* and contains num4 blocks of 64-byte values. Second chunk
244			* starts at buf2 and contains num1 blocks of 16-byte values.
245			* We want the first chunk to be as large as possible.
246			*/
247	68		buf1 = data;
248	68		num4 = len >> 6;
249	68		len &= 63;
250	68		buf2 = buf1 + (num4 << 6);
251	68		num1 = (len + 15) >> 4;
252	68	100	if ((len & 15) != 0) {
253	16		memcpy(tmp, buf2, len);
254	16		memset(tmp + len, 0, (num1 << 4) - len);
255	16		buf2 = tmp;
256			}
257
258			/*
259			* Preparatory step for endian conversions.
260			*/
261	68		BYTESWAP_PREP;
262
263			/*
264			* Load y and h.
265			*/
266	68		yw = _mm_loadu_si128(y);
267	68		h1w = _mm_loadu_si128(h);
268	68		BYTESWAP(yw);
269	68		BYTESWAP(h1w);
270	68		BK(h1w, h1x);
271
272	68	100	if (num4 > 0) {
273			__m128i h2w, h2x, h3w, h3x, h4w, h4x;
274			__m128i t0, t1, t2, t3;
275
276			/*
277			* Compute h2 = h^2.
278			*/
279	76		SQUARE_F128(h1w, h2w, h2x);
280
281			/*
282			* Compute h3 = h^3 = h*(h^2).
283			*/
284	2		t1 = pclmulqdq11(h1w, h2w);
285	2		t3 = pclmulqdq00(h1w, h2w);
286	2		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
287			_mm_xor_si128(t1, t3));
288	2		t0 = _mm_shuffle_epi32(t1, 0x0E);
289	2		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
290	4		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
291	20		SL_256(t0, t1, t2, t3);
292	52		REDUCE_F128(t0, t1, t2, t3);
293	2		PBK(t0, t1, h3w, h3x);
294
295			/*
296			* Compute h4 = h^4 = (h^2)^2.
297			*/
298	76		SQUARE_F128(h2w, h4w, h4x);
299
300	4	100	while (num4 -- > 0) {
301			__m128i aw0, aw1, aw2, aw3;
302			__m128i ax0, ax1, ax2, ax3;
303
304	2		aw0 = _mm_loadu_si128((void *)(buf1 + 0));
305	2		aw1 = _mm_loadu_si128((void *)(buf1 + 16));
306	2		aw2 = _mm_loadu_si128((void *)(buf1 + 32));
307	4		aw3 = _mm_loadu_si128((void *)(buf1 + 48));
308	2		BYTESWAP(aw0);
309	2		BYTESWAP(aw1);
310	2		BYTESWAP(aw2);
311	2		BYTESWAP(aw3);
312	2		buf1 += 64;
313
314	2		aw0 = _mm_xor_si128(aw0, yw);
315	2		BK(aw1, ax1);
316	2		BK(aw2, ax2);
317	2		BK(aw3, ax3);
318	2		BK(aw0, ax0);
319
320	2		t1 = _mm_xor_si128(
321			_mm_xor_si128(
322	2		pclmulqdq11(aw0, h4w),
323	2		pclmulqdq11(aw1, h3w)),
324			_mm_xor_si128(
325	2		pclmulqdq11(aw2, h2w),
326	2		pclmulqdq11(aw3, h1w)));
327	2		t3 = _mm_xor_si128(
328			_mm_xor_si128(
329	2		pclmulqdq00(aw0, h4w),
330	2		pclmulqdq00(aw1, h3w)),
331			_mm_xor_si128(
332	2		pclmulqdq00(aw2, h2w),
333	2		pclmulqdq00(aw3, h1w)));
334	4		t2 = _mm_xor_si128(
335			_mm_xor_si128(
336	2		pclmulqdq00(ax0, h4x),
337	2		pclmulqdq00(ax1, h3x)),
338			_mm_xor_si128(
339	2		pclmulqdq00(ax2, h2x),
340	2		pclmulqdq00(ax3, h1x)));
341	2		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
342	2		t0 = _mm_shuffle_epi32(t1, 0x0E);
343	2		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
344	4		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
345	20		SL_256(t0, t1, t2, t3);
346	52		REDUCE_F128(t0, t1, t2, t3);
347	2		yw = _mm_unpacklo_epi64(t1, t0);
348			}
349			}
350
351	138	100	while (num1 -- > 0) {
352			__m128i aw, ax;
353			__m128i t0, t1, t2, t3;
354
355	70		aw = _mm_loadu_si128((void *)buf2);
356	70		BYTESWAP(aw);
357	70		buf2 += 16;
358
359	70		aw = _mm_xor_si128(aw, yw);
360	70		BK(aw, ax);
361
362	70		t1 = pclmulqdq11(aw, h1w);
363	70		t3 = pclmulqdq00(aw, h1w);
364	70		t2 = pclmulqdq00(ax, h1x);
365	70		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
366	70		t0 = _mm_shuffle_epi32(t1, 0x0E);
367	70		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
368	140		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
369	700		SL_256(t0, t1, t2, t3);
370	1820		REDUCE_F128(t0, t1, t2, t3);
371	70		yw = _mm_unpacklo_epi64(t1, t0);
372			}
373
374	68		BYTESWAP(yw);
375			_mm_storeu_si128(y, yw);
376	68		}
377
378			BR_TARGETS_X86_DOWN
379
380			#else
381
382			/* see bearssl_hash.h */
383			br_ghash
384			br_ghash_pclmul_get(void)
385			{
386			return 0;
387			}
388
389			#endif