File Coverage

src/hash/ghash_pclmul.c
Criterion Covered Total %
statement 87 87 100.0
branch 9 10 90.0
condition n/a
subroutine n/a
pod n/a
total 96 97 98.9


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2017 Thomas Pornin
3             *
4             * Permission is hereby granted, free of charge, to any person obtaining
5             * a copy of this software and associated documentation files (the
6             * "Software"), to deal in the Software without restriction, including
7             * without limitation the rights to use, copy, modify, merge, publish,
8             * distribute, sublicense, and/or sell copies of the Software, and to
9             * permit persons to whom the Software is furnished to do so, subject to
10             * the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be
13             * included in all copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16             * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17             * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18             * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19             * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20             * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21             * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22             * SOFTWARE.
23             */
24              
25             #define BR_ENABLE_INTRINSICS 1
26             #include "inner.h"
27              
28             /*
29             * This is the GHASH implementation that leverages the pclmulqdq opcode
30             * (from the AES-NI instructions).
31             */
32              
33             #if BR_AES_X86NI
34              
35             /*
36             * Test CPU support for PCLMULQDQ.
37             */
/*
 * Check CPU support for the pclmulqdq opcode.
 * Returns nonzero if the CPU reports PCLMULQDQ support, 0 otherwise.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * CPUID feature flags, ECX register:
	 *    bit 1: PCLMULQDQ support
	 */
	unsigned mask_ecx = 0x00000002;

	return br_cpuid(0, 0, mask_ecx, 0);
}
47              
48             /* see bearssl_hash.h */
49             br_ghash
50 52           br_ghash_pclmul_get(void)
51             {
52 52 50         return pclmul_supported() ? &br_ghash_pclmul : 0;
53             }
54              
55             BR_TARGETS_X86_UP
56              
57             /*
58             * GHASH is defined over elements of GF(2^128) with "full little-endian"
59             * representation: leftmost byte is least significant, and, within each
60             * byte, leftmost _bit_ is least significant. The natural ordering in
61             * x86 is "mixed little-endian": bytes are ordered from least to most
62             * significant, but bits within a byte are in most-to-least significant
63             * order. Going to full little-endian representation would require
64             * reversing bits within each byte, which is doable but expensive.
65             *
66             * Instead, we go to full big-endian representation, by swapping bytes
67             * around, which is done with a single _mm_shuffle_epi8() opcode (it
68             * comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We
69             * can use a full big-endian representation because in a carryless
70             * multiplication, we have a nice bit reversal property:
71             *
72             * rev_128(x) * rev_128(y) = rev_255(x * y)
73             *
74             * So by using full big-endian, we still get the right result, except
75             * that it is right-shifted by 1 bit. The left-shift is relatively
76             * inexpensive, and it can be mutualised.
77             *
78             *
79             * Since SSE2 opcodes do not have facilities for shifting full 128-bit
80             * values with bit precision, we have to break down values into 64-bit
81             * chunks. We number chunks from 0 to 3 in left to right order.
82             */
83              
/*
 * Byte-swap a complete 128-bit value. This normally uses
 * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
 * However, this crashes old Clang versions, so, for Clang before 3.8,
 * we use an alternate (and less efficient) version.
 */
#if BR_CLANG && !BR_CLANG_3_8
/* Fallback path: no preparatory state is needed. */
#define BYTESWAP_DECL
#define BYTESWAP_PREP   (void)0
#define BYTESWAP(x)   do { \
		__m128i byteswap1, byteswap2; \
		byteswap1 = (x); \
		/* Swap the two bytes within each 16-bit lane. */ \
		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
		/* 0x1B reverses the four 16-bit lanes in each half. */ \
		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
		/* 0x4E swaps the two 64-bit halves. */ \
		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
	} while (0)
#else
/* pshufb path: a control vector with indices 15..0 reverses all bytes. */
#define BYTESWAP_DECL   __m128i byteswap_index;
#define BYTESWAP_PREP   do { \
		byteswap_index = _mm_set_epi8( \
			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
	} while (0)
#define BYTESWAP(x)   do { \
		(x) = _mm_shuffle_epi8((x), byteswap_index); \
	} while (0)
#endif
113              
/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 */
#if BR_CLANG
/* Carryless multiply of the low 64-bit halves of x and y (imm8 = 0x00). */
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
	return x;
}

/* Carryless multiply of the high 64-bit halves of x and y (imm8 = 0x11). */
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif
143              
/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 * (The 0x0E shuffle moves the high 64-bit half into the low position.)
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)
151              
/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx). The low half of k1 lands in the low
 * half of kw, the low half of k0 in the high half.
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)
160              
/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 * x0 is the most significant word: each word takes in the bit shifted
 * out of the top of the word to its right; x3's incoming bit is zero.
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)
176              
/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 *
 * The shift amounts (right by 1, 2, 7; left by 63, 62, 57) fold each
 * excess word back according to the GCM modulus
 * X^128 + X^7 + X^2 + X + 1, as seen through the bit-reversed
 * representation used throughout this file. Each fold touches the two
 * adjacent words: x3 is folded into (x1, x2), then the updated x2 is
 * folded into (x0, x1).
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		/* Fold x3 into x1 and x2. */ \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		/* Fold the updated x2 into x0 and x1. */ \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)
215              
/*
 * Square value kw into (dw,dx). In GF(2)[X] the cross terms of a
 * square cancel out, so only the two half-squares are needed (no
 * middle Karatsuba product, unlike a general multiplication).
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = pclmulqdq11(kw, kw); \
		z3 = pclmulqdq00(kw, kw); \
		/* Spread each 128-bit square over two 64-bit words. */ \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		/* Undo the 1-bit right shift induced by the reversed operands. */ \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)
229              
/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	unsigned char tmp[64];
	size_t num4, num1;
	__m128i yw, h1w, h1x;
	BYTESWAP_DECL

	/*
	 * We split data into two chunks. First chunk starts at buf1
	 * and contains num4 blocks of 64-byte values. Second chunk
	 * starts at buf2 and contains num1 blocks of 16-byte values.
	 * We want the first chunk to be as large as possible.
	 */
	buf1 = data;
	num4 = len >> 6;
	len &= 63;
	buf2 = buf1 + (num4 << 6);
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		/*
		 * The tail is not a whole number of 16-byte blocks:
		 * copy it into the local buffer and zero-pad up to the
		 * next 16-byte boundary.
		 */
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	/*
	 * Preparatory step for endian conversions.
	 */
	BYTESWAP_PREP;

	/*
	 * Load y and h, and byte-swap them into the full big-endian
	 * representation described at the top of this file.
	 */
	yw = _mm_loadu_si128(y);
	h1w = _mm_loadu_si128(h);
	BYTESWAP(yw);
	BYTESWAP(h1w);
	BK(h1w, h1x);

	if (num4 > 0) {
		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
		__m128i t0, t1, t2, t3;

		/*
		 * The 4-blocks-at-a-time loop below needs the powers
		 * h^1..h^4 of the key (with their precomputed XORed
		 * halves for the Karatsuba middle products).
		 *
		 * Compute h2 = h^2.
		 */
		SQUARE_F128(h1w, h2w, h2x);

		/*
		 * Compute h3 = h^3 = h*(h^2). This is a full Karatsuba
		 * multiplication: high product (t1), low product (t3),
		 * and middle word t2 derived from the product of the
		 * XORed halves.
		 */
		t1 = pclmulqdq11(h1w, h2w);
		t3 = pclmulqdq00(h1w, h2w);
		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
			_mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		PBK(t0, t1, h3w, h3x);

		/*
		 * Compute h4 = h^4 = (h^2)^2.
		 */
		SQUARE_F128(h2w, h4w, h4x);

		/*
		 * Process four 16-byte blocks per iteration. The four
		 * blocks are multiplied by h^4, h^3, h^2 and h^1
		 * respectively, and all partial products are XORed
		 * together, so that a single shift + reduction covers
		 * all four blocks (aggregated reduction).
		 */
		while (num4 -- > 0) {
			__m128i aw0, aw1, aw2, aw3;
			__m128i ax0, ax1, ax2, ax3;

			aw0 = _mm_loadu_si128((void *)(buf1 + 0));
			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
			BYTESWAP(aw0);
			BYTESWAP(aw1);
			BYTESWAP(aw2);
			BYTESWAP(aw3);
			buf1 += 64;

			/*
			 * Fold the running state into the first block,
			 * then compute the XORed halves needed for the
			 * Karatsuba middle products.
			 */
			aw0 = _mm_xor_si128(aw0, yw);
			BK(aw1, ax1);
			BK(aw2, ax2);
			BK(aw3, ax3);
			BK(aw0, ax0);

			/* High 128-bit words of the four products. */
			t1 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq11(aw0, h4w),
					pclmulqdq11(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq11(aw2, h2w),
					pclmulqdq11(aw3, h1w)));
			/* Low 128-bit words of the four products. */
			t3 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(aw0, h4w),
					pclmulqdq00(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq00(aw2, h2w),
					pclmulqdq00(aw3, h1w)));
			/* Karatsuba middle words. */
			t2 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(ax0, h4x),
					pclmulqdq00(ax1, h3x)),
				_mm_xor_si128(
					pclmulqdq00(ax2, h2x),
					pclmulqdq00(ax3, h1x)));
			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
			/* Assemble the 256-bit product into t0..t3. */
			t0 = _mm_shuffle_epi32(t1, 0x0E);
			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
			/* Undo the reversal-induced right shift, reduce. */
			SL_256(t0, t1, t2, t3);
			REDUCE_F128(t0, t1, t2, t3);
			yw = _mm_unpacklo_epi64(t1, t0);
		}
	}

	/*
	 * Process remaining blocks one at a time: fold the block into
	 * the state, multiply by h, shift and reduce.
	 */
	while (num1 -- > 0) {
		__m128i aw, ax;
		__m128i t0, t1, t2, t3;

		aw = _mm_loadu_si128((void *)buf2);
		BYTESWAP(aw);
		buf2 += 16;

		aw = _mm_xor_si128(aw, yw);
		BK(aw, ax);

		/* Karatsuba: high, low, then derived middle word. */
		t1 = pclmulqdq11(aw, h1w);
		t3 = pclmulqdq00(aw, h1w);
		t2 = pclmulqdq00(ax, h1x);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		yw = _mm_unpacklo_epi64(t1, t0);
	}

	/*
	 * Swap the state back to its external representation and
	 * write it out.
	 */
	BYTESWAP(yw);
	_mm_storeu_si128(y, yw);
}
377              
378             BR_TARGETS_X86_DOWN
379              
380             #else
381              
/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	/*
	 * This file was compiled without BR_AES_X86NI: the pclmul
	 * implementation is not available on this build.
	 */
	return 0;
}
388              
389             #endif