| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* |
|
2
|
|
|
|
|
|
|
* Copyright (c) 2017 Thomas Pornin |
|
3
|
|
|
|
|
|
|
* |
|
4
|
|
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining |
|
5
|
|
|
|
|
|
|
* a copy of this software and associated documentation files (the |
|
6
|
|
|
|
|
|
|
* "Software"), to deal in the Software without restriction, including |
|
7
|
|
|
|
|
|
|
* without limitation the rights to use, copy, modify, merge, publish, |
|
8
|
|
|
|
|
|
|
* distribute, sublicense, and/or sell copies of the Software, and to |
|
9
|
|
|
|
|
|
|
* permit persons to whom the Software is furnished to do so, subject to |
|
10
|
|
|
|
|
|
|
* the following conditions: |
|
11
|
|
|
|
|
|
|
* |
|
12
|
|
|
|
|
|
|
* The above copyright notice and this permission notice shall be |
|
13
|
|
|
|
|
|
|
* included in all copies or substantial portions of the Software. |
|
14
|
|
|
|
|
|
|
* |
|
15
|
|
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
16
|
|
|
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|
17
|
|
|
|
|
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
|
18
|
|
|
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
|
19
|
|
|
|
|
|
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
|
20
|
|
|
|
|
|
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
|
21
|
|
|
|
|
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|
22
|
|
|
|
|
|
|
* SOFTWARE. |
|
23
|
|
|
|
|
|
|
*/ |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
#define BR_ENABLE_INTRINSICS 1 |
|
26
|
|
|
|
|
|
|
#include "inner.h" |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
/* |
|
29
|
|
|
|
|
|
|
* This is the GHASH implementation that leverages the pclmulqdq opcode |
|
30
|
|
|
|
|
|
|
* (from the AES-NI instructions). |
|
31
|
|
|
|
|
|
|
*/ |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
#if BR_AES_X86NI |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
/* |
|
36
|
|
|
|
|
|
|
* Test CPU support for PCLMULQDQ. |
|
37
|
|
|
|
|
|
|
*/ |
|
38
|
|
|
|
|
|
|
static inline int |
|
39
|
52
|
|
|
|
|
|
pclmul_supported(void) |
|
40
|
|
|
|
|
|
|
{ |
|
41
|
|
|
|
|
|
|
/* |
|
42
|
|
|
|
|
|
|
* Bit mask for features in ECX: |
|
43
|
|
|
|
|
|
|
* 1 PCLMULQDQ support |
|
44
|
|
|
|
|
|
|
*/ |
|
45
|
52
|
|
|
|
|
|
return br_cpuid(0, 0, 0x00000002, 0); |
|
46
|
|
|
|
|
|
|
} |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
/* see bearssl_hash.h */ |
|
49
|
|
|
|
|
|
|
br_ghash |
|
50
|
52
|
|
|
|
|
|
br_ghash_pclmul_get(void) |
|
51
|
|
|
|
|
|
|
{ |
|
52
|
52
|
50
|
|
|
|
|
return pclmul_supported() ? &br_ghash_pclmul : 0; |
|
53
|
|
|
|
|
|
|
} |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
BR_TARGETS_X86_UP |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
/* |
|
58
|
|
|
|
|
|
|
* GHASH is defined over elements of GF(2^128) with "full little-endian" |
|
59
|
|
|
|
|
|
|
* representation: leftmost byte is least significant, and, within each |
|
60
|
|
|
|
|
|
|
* byte, leftmost _bit_ is least significant. The natural ordering in |
|
61
|
|
|
|
|
|
|
* x86 is "mixed little-endian": bytes are ordered from least to most |
|
62
|
|
|
|
|
|
|
* significant, but bits within a byte are in most-to-least significant |
|
63
|
|
|
|
|
|
|
* order. Going to full little-endian representation would require |
|
64
|
|
|
|
|
|
|
* reversing bits within each byte, which is doable but expensive. |
|
65
|
|
|
|
|
|
|
* |
|
66
|
|
|
|
|
|
|
* Instead, we go to full big-endian representation, by swapping bytes |
|
67
|
|
|
|
|
|
|
* around, which is done with a single _mm_shuffle_epi8() opcode (it |
|
68
|
|
|
|
|
|
|
* comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We |
|
69
|
|
|
|
|
|
|
* can use a full big-endian representation because in a carryless |
|
70
|
|
|
|
|
|
|
* multiplication, we have a nice bit reversal property: |
|
71
|
|
|
|
|
|
|
* |
|
72
|
|
|
|
|
|
|
* rev_128(x) * rev_128(y) = rev_255(x * y) |
|
73
|
|
|
|
|
|
|
* |
|
74
|
|
|
|
|
|
|
* So by using full big-endian, we still get the right result, except |
|
75
|
|
|
|
|
|
|
* that it is right-shifted by 1 bit. The left-shift is relatively |
|
76
|
|
|
|
|
|
|
* inexpensive, and it can be mutualised. |
|
77
|
|
|
|
|
|
|
* |
|
78
|
|
|
|
|
|
|
* |
|
79
|
|
|
|
|
|
|
* Since SSE2 opcodes do not have facilities for shitfting full 128-bit |
|
80
|
|
|
|
|
|
|
* values with bit precision, we have to break down values into 64-bit |
|
81
|
|
|
|
|
|
|
* chunks. We number chunks from 0 to 3 in left to right order. |
|
82
|
|
|
|
|
|
|
*/ |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
/* |
|
85
|
|
|
|
|
|
|
* Byte-swap a complete 128-bit value. This normally uses |
|
86
|
|
|
|
|
|
|
* _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode). |
|
87
|
|
|
|
|
|
|
* However, this crashes old Clang versions, so, for Clang before 3.8, |
|
88
|
|
|
|
|
|
|
* we use an alternate (and less efficient) version. |
|
89
|
|
|
|
|
|
|
*/ |
|
90
|
|
|
|
|
|
|
#if BR_CLANG && !BR_CLANG_3_8 |
|
91
|
|
|
|
|
|
|
#define BYTESWAP_DECL |
|
92
|
|
|
|
|
|
|
#define BYTESWAP_PREP (void)0 |
|
93
|
|
|
|
|
|
|
#define BYTESWAP(x) do { \ |
|
94
|
|
|
|
|
|
|
__m128i byteswap1, byteswap2; \ |
|
95
|
|
|
|
|
|
|
byteswap1 = (x); \ |
|
96
|
|
|
|
|
|
|
byteswap2 = _mm_srli_epi16(byteswap1, 8); \ |
|
97
|
|
|
|
|
|
|
byteswap1 = _mm_slli_epi16(byteswap1, 8); \ |
|
98
|
|
|
|
|
|
|
byteswap1 = _mm_or_si128(byteswap1, byteswap2); \ |
|
99
|
|
|
|
|
|
|
byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \ |
|
100
|
|
|
|
|
|
|
byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \ |
|
101
|
|
|
|
|
|
|
(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \ |
|
102
|
|
|
|
|
|
|
} while (0) |
|
103
|
|
|
|
|
|
|
#else |
|
104
|
|
|
|
|
|
|
#define BYTESWAP_DECL __m128i byteswap_index; |
|
105
|
|
|
|
|
|
|
#define BYTESWAP_PREP do { \ |
|
106
|
|
|
|
|
|
|
byteswap_index = _mm_set_epi8( \ |
|
107
|
|
|
|
|
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ |
|
108
|
|
|
|
|
|
|
} while (0) |
|
109
|
|
|
|
|
|
|
#define BYTESWAP(x) do { \ |
|
110
|
|
|
|
|
|
|
(x) = _mm_shuffle_epi8((x), byteswap_index); \ |
|
111
|
|
|
|
|
|
|
} while (0) |
|
112
|
|
|
|
|
|
|
#endif |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
/* |
|
115
|
|
|
|
|
|
|
* Call pclmulqdq. Clang appears to have trouble with the intrinsic, so, |
|
116
|
|
|
|
|
|
|
* for that compiler, we use inline assembly. Inline assembly is |
|
117
|
|
|
|
|
|
|
* potentially a bit slower because the compiler does not understand |
|
118
|
|
|
|
|
|
|
* what the opcode does, and thus cannot optimize instruction |
|
119
|
|
|
|
|
|
|
* scheduling. |
|
120
|
|
|
|
|
|
|
* |
|
121
|
|
|
|
|
|
|
* We use a target of "sse2" only, so that Clang may still handle the |
|
122
|
|
|
|
|
|
|
* '__m128i' type and allocate SSE2 registers. |
|
123
|
|
|
|
|
|
|
*/ |
|
124
|
|
|
|
|
|
|
#if BR_CLANG |
|
125
|
|
|
|
|
|
|
BR_TARGET("sse2") |
|
126
|
|
|
|
|
|
|
static inline __m128i |
|
127
|
|
|
|
|
|
|
pclmulqdq00(__m128i x, __m128i y) |
|
128
|
|
|
|
|
|
|
{ |
|
129
|
|
|
|
|
|
|
__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y)); |
|
130
|
|
|
|
|
|
|
return x; |
|
131
|
|
|
|
|
|
|
} |
|
132
|
|
|
|
|
|
|
BR_TARGET("sse2") |
|
133
|
|
|
|
|
|
|
static inline __m128i |
|
134
|
|
|
|
|
|
|
pclmulqdq11(__m128i x, __m128i y) |
|
135
|
|
|
|
|
|
|
{ |
|
136
|
|
|
|
|
|
|
__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y)); |
|
137
|
|
|
|
|
|
|
return x; |
|
138
|
|
|
|
|
|
|
} |
|
139
|
|
|
|
|
|
|
#else |
|
140
|
|
|
|
|
|
|
#define pclmulqdq00(x, y) _mm_clmulepi64_si128(x, y, 0x00) |
|
141
|
|
|
|
|
|
|
#define pclmulqdq11(x, y) _mm_clmulepi64_si128(x, y, 0x11) |
|
142
|
|
|
|
|
|
|
#endif |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
/* |
|
145
|
|
|
|
|
|
|
* From a 128-bit value kw, compute kx as the XOR of the two 64-bit |
|
146
|
|
|
|
|
|
|
* halves of kw (into the right half of kx; left half is unspecified). |
|
147
|
|
|
|
|
|
|
*/ |
|
148
|
|
|
|
|
|
|
#define BK(kw, kx) do { \ |
|
149
|
|
|
|
|
|
|
kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \ |
|
150
|
|
|
|
|
|
|
} while (0) |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
/* |
|
153
|
|
|
|
|
|
|
* Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and |
|
154
|
|
|
|
|
|
|
* the XOR of the two values (kx). |
|
155
|
|
|
|
|
|
|
*/ |
|
156
|
|
|
|
|
|
|
#define PBK(k0, k1, kw, kx) do { \ |
|
157
|
|
|
|
|
|
|
kw = _mm_unpacklo_epi64(k1, k0); \ |
|
158
|
|
|
|
|
|
|
kx = _mm_xor_si128(k0, k1); \ |
|
159
|
|
|
|
|
|
|
} while (0) |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
/* |
|
162
|
|
|
|
|
|
|
* Left-shift by 1 bit a 256-bit value (in four 64-bit words). |
|
163
|
|
|
|
|
|
|
*/ |
|
164
|
|
|
|
|
|
|
#define SL_256(x0, x1, x2, x3) do { \ |
|
165
|
|
|
|
|
|
|
x0 = _mm_or_si128( \ |
|
166
|
|
|
|
|
|
|
_mm_slli_epi64(x0, 1), \ |
|
167
|
|
|
|
|
|
|
_mm_srli_epi64(x1, 63)); \ |
|
168
|
|
|
|
|
|
|
x1 = _mm_or_si128( \ |
|
169
|
|
|
|
|
|
|
_mm_slli_epi64(x1, 1), \ |
|
170
|
|
|
|
|
|
|
_mm_srli_epi64(x2, 63)); \ |
|
171
|
|
|
|
|
|
|
x2 = _mm_or_si128( \ |
|
172
|
|
|
|
|
|
|
_mm_slli_epi64(x2, 1), \ |
|
173
|
|
|
|
|
|
|
_mm_srli_epi64(x3, 63)); \ |
|
174
|
|
|
|
|
|
|
x3 = _mm_slli_epi64(x3, 1); \ |
|
175
|
|
|
|
|
|
|
} while (0) |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
/* |
|
178
|
|
|
|
|
|
|
* Perform reduction in GF(2^128). The 256-bit value is in x0..x3; |
|
179
|
|
|
|
|
|
|
* result is written in x0..x1. |
|
180
|
|
|
|
|
|
|
*/ |
|
181
|
|
|
|
|
|
|
#define REDUCE_F128(x0, x1, x2, x3) do { \ |
|
182
|
|
|
|
|
|
|
x1 = _mm_xor_si128( \ |
|
183
|
|
|
|
|
|
|
x1, \ |
|
184
|
|
|
|
|
|
|
_mm_xor_si128( \ |
|
185
|
|
|
|
|
|
|
_mm_xor_si128( \ |
|
186
|
|
|
|
|
|
|
x3, \ |
|
187
|
|
|
|
|
|
|
_mm_srli_epi64(x3, 1)), \ |
|
188
|
|
|
|
|
|
|
_mm_xor_si128( \ |
|
189
|
|
|
|
|
|
|
_mm_srli_epi64(x3, 2), \ |
|
190
|
|
|
|
|
|
|
_mm_srli_epi64(x3, 7)))); \ |
|
191
|
|
|
|
|
|
|
x2 = _mm_xor_si128( \ |
|
192
|
|
|
|
|
|
|
_mm_xor_si128( \ |
|
193
|
|
|
|
|
|
|
x2, \ |
|
194
|
|
|
|
|
|
|
_mm_slli_epi64(x3, 63)), \ |
|
195
|
|
|
|
|
|
|
_mm_xor_si128( \ |
|
196
|
|
|
|
|
|
|
_mm_slli_epi64(x3, 62), \ |
|
197
|
|
|
|
|
|
|
_mm_slli_epi64(x3, 57))); \ |
|
198
|
|
|
|
|
|
|
x0 = _mm_xor_si128( \ |
|
199
|
|
|
|
|
|
|
x0, \ |
|
200
|
|
|
|
|
|
|
_mm_xor_si128( \ |
|
201
|
|
|
|
|
|
|
_mm_xor_si128( \ |
|
202
|
|
|
|
|
|
|
x2, \ |
|
203
|
|
|
|
|
|
|
_mm_srli_epi64(x2, 1)), \ |
|
204
|
|
|
|
|
|
|
_mm_xor_si128( \ |
|
205
|
|
|
|
|
|
|
_mm_srli_epi64(x2, 2), \ |
|
206
|
|
|
|
|
|
|
_mm_srli_epi64(x2, 7)))); \ |
|
207
|
|
|
|
|
|
|
x1 = _mm_xor_si128( \ |
|
208
|
|
|
|
|
|
|
_mm_xor_si128( \ |
|
209
|
|
|
|
|
|
|
x1, \ |
|
210
|
|
|
|
|
|
|
_mm_slli_epi64(x2, 63)), \ |
|
211
|
|
|
|
|
|
|
_mm_xor_si128( \ |
|
212
|
|
|
|
|
|
|
_mm_slli_epi64(x2, 62), \ |
|
213
|
|
|
|
|
|
|
_mm_slli_epi64(x2, 57))); \ |
|
214
|
|
|
|
|
|
|
} while (0) |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
/* |
|
217
|
|
|
|
|
|
|
* Square value kw into (dw,dx). |
|
218
|
|
|
|
|
|
|
*/ |
|
219
|
|
|
|
|
|
|
#define SQUARE_F128(kw, dw, dx) do { \ |
|
220
|
|
|
|
|
|
|
__m128i z0, z1, z2, z3; \ |
|
221
|
|
|
|
|
|
|
z1 = pclmulqdq11(kw, kw); \ |
|
222
|
|
|
|
|
|
|
z3 = pclmulqdq00(kw, kw); \ |
|
223
|
|
|
|
|
|
|
z0 = _mm_shuffle_epi32(z1, 0x0E); \ |
|
224
|
|
|
|
|
|
|
z2 = _mm_shuffle_epi32(z3, 0x0E); \ |
|
225
|
|
|
|
|
|
|
SL_256(z0, z1, z2, z3); \ |
|
226
|
|
|
|
|
|
|
REDUCE_F128(z0, z1, z2, z3); \ |
|
227
|
|
|
|
|
|
|
PBK(z0, z1, dw, dx); \ |
|
228
|
|
|
|
|
|
|
} while (0) |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
/* see bearssl_hash.h */ |
|
231
|
|
|
|
|
|
|
BR_TARGET("ssse3,pclmul") |
|
232
|
|
|
|
|
|
|
void |
|
233
|
68
|
|
|
|
|
|
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len) |
|
234
|
|
|
|
|
|
|
{ |
|
235
|
|
|
|
|
|
|
const unsigned char *buf1, *buf2; |
|
236
|
|
|
|
|
|
|
unsigned char tmp[64]; |
|
237
|
|
|
|
|
|
|
size_t num4, num1; |
|
238
|
|
|
|
|
|
|
__m128i yw, h1w, h1x; |
|
239
|
|
|
|
|
|
|
BYTESWAP_DECL |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
/* |
|
242
|
|
|
|
|
|
|
* We split data into two chunks. First chunk starts at buf1 |
|
243
|
|
|
|
|
|
|
* and contains num4 blocks of 64-byte values. Second chunk |
|
244
|
|
|
|
|
|
|
* starts at buf2 and contains num1 blocks of 16-byte values. |
|
245
|
|
|
|
|
|
|
* We want the first chunk to be as large as possible. |
|
246
|
|
|
|
|
|
|
*/ |
|
247
|
68
|
|
|
|
|
|
buf1 = data; |
|
248
|
68
|
|
|
|
|
|
num4 = len >> 6; |
|
249
|
68
|
|
|
|
|
|
len &= 63; |
|
250
|
68
|
|
|
|
|
|
buf2 = buf1 + (num4 << 6); |
|
251
|
68
|
|
|
|
|
|
num1 = (len + 15) >> 4; |
|
252
|
68
|
100
|
|
|
|
|
if ((len & 15) != 0) { |
|
253
|
16
|
|
|
|
|
|
memcpy(tmp, buf2, len); |
|
254
|
16
|
|
|
|
|
|
memset(tmp + len, 0, (num1 << 4) - len); |
|
255
|
16
|
|
|
|
|
|
buf2 = tmp; |
|
256
|
|
|
|
|
|
|
} |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
/* |
|
259
|
|
|
|
|
|
|
* Preparatory step for endian conversions. |
|
260
|
|
|
|
|
|
|
*/ |
|
261
|
68
|
|
|
|
|
|
BYTESWAP_PREP; |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
/* |
|
264
|
|
|
|
|
|
|
* Load y and h. |
|
265
|
|
|
|
|
|
|
*/ |
|
266
|
68
|
|
|
|
|
|
yw = _mm_loadu_si128(y); |
|
267
|
68
|
|
|
|
|
|
h1w = _mm_loadu_si128(h); |
|
268
|
68
|
|
|
|
|
|
BYTESWAP(yw); |
|
269
|
68
|
|
|
|
|
|
BYTESWAP(h1w); |
|
270
|
68
|
|
|
|
|
|
BK(h1w, h1x); |
|
271
|
|
|
|
|
|
|
|
|
272
|
68
|
100
|
|
|
|
|
if (num4 > 0) { |
|
273
|
|
|
|
|
|
|
__m128i h2w, h2x, h3w, h3x, h4w, h4x; |
|
274
|
|
|
|
|
|
|
__m128i t0, t1, t2, t3; |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
/* |
|
277
|
|
|
|
|
|
|
* Compute h2 = h^2. |
|
278
|
|
|
|
|
|
|
*/ |
|
279
|
76
|
|
|
|
|
|
SQUARE_F128(h1w, h2w, h2x); |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
/* |
|
282
|
|
|
|
|
|
|
* Compute h3 = h^3 = h*(h^2). |
|
283
|
|
|
|
|
|
|
*/ |
|
284
|
2
|
|
|
|
|
|
t1 = pclmulqdq11(h1w, h2w); |
|
285
|
2
|
|
|
|
|
|
t3 = pclmulqdq00(h1w, h2w); |
|
286
|
2
|
|
|
|
|
|
t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x), |
|
287
|
|
|
|
|
|
|
_mm_xor_si128(t1, t3)); |
|
288
|
2
|
|
|
|
|
|
t0 = _mm_shuffle_epi32(t1, 0x0E); |
|
289
|
2
|
|
|
|
|
|
t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E)); |
|
290
|
4
|
|
|
|
|
|
t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E)); |
|
291
|
20
|
|
|
|
|
|
SL_256(t0, t1, t2, t3); |
|
292
|
52
|
|
|
|
|
|
REDUCE_F128(t0, t1, t2, t3); |
|
293
|
2
|
|
|
|
|
|
PBK(t0, t1, h3w, h3x); |
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
/* |
|
296
|
|
|
|
|
|
|
* Compute h4 = h^4 = (h^2)^2. |
|
297
|
|
|
|
|
|
|
*/ |
|
298
|
76
|
|
|
|
|
|
SQUARE_F128(h2w, h4w, h4x); |
|
299
|
|
|
|
|
|
|
|
|
300
|
4
|
100
|
|
|
|
|
while (num4 -- > 0) { |
|
301
|
|
|
|
|
|
|
__m128i aw0, aw1, aw2, aw3; |
|
302
|
|
|
|
|
|
|
__m128i ax0, ax1, ax2, ax3; |
|
303
|
|
|
|
|
|
|
|
|
304
|
2
|
|
|
|
|
|
aw0 = _mm_loadu_si128((void *)(buf1 + 0)); |
|
305
|
2
|
|
|
|
|
|
aw1 = _mm_loadu_si128((void *)(buf1 + 16)); |
|
306
|
2
|
|
|
|
|
|
aw2 = _mm_loadu_si128((void *)(buf1 + 32)); |
|
307
|
4
|
|
|
|
|
|
aw3 = _mm_loadu_si128((void *)(buf1 + 48)); |
|
308
|
2
|
|
|
|
|
|
BYTESWAP(aw0); |
|
309
|
2
|
|
|
|
|
|
BYTESWAP(aw1); |
|
310
|
2
|
|
|
|
|
|
BYTESWAP(aw2); |
|
311
|
2
|
|
|
|
|
|
BYTESWAP(aw3); |
|
312
|
2
|
|
|
|
|
|
buf1 += 64; |
|
313
|
|
|
|
|
|
|
|
|
314
|
2
|
|
|
|
|
|
aw0 = _mm_xor_si128(aw0, yw); |
|
315
|
2
|
|
|
|
|
|
BK(aw1, ax1); |
|
316
|
2
|
|
|
|
|
|
BK(aw2, ax2); |
|
317
|
2
|
|
|
|
|
|
BK(aw3, ax3); |
|
318
|
2
|
|
|
|
|
|
BK(aw0, ax0); |
|
319
|
|
|
|
|
|
|
|
|
320
|
2
|
|
|
|
|
|
t1 = _mm_xor_si128( |
|
321
|
|
|
|
|
|
|
_mm_xor_si128( |
|
322
|
2
|
|
|
|
|
|
pclmulqdq11(aw0, h4w), |
|
323
|
2
|
|
|
|
|
|
pclmulqdq11(aw1, h3w)), |
|
324
|
|
|
|
|
|
|
_mm_xor_si128( |
|
325
|
2
|
|
|
|
|
|
pclmulqdq11(aw2, h2w), |
|
326
|
2
|
|
|
|
|
|
pclmulqdq11(aw3, h1w))); |
|
327
|
2
|
|
|
|
|
|
t3 = _mm_xor_si128( |
|
328
|
|
|
|
|
|
|
_mm_xor_si128( |
|
329
|
2
|
|
|
|
|
|
pclmulqdq00(aw0, h4w), |
|
330
|
2
|
|
|
|
|
|
pclmulqdq00(aw1, h3w)), |
|
331
|
|
|
|
|
|
|
_mm_xor_si128( |
|
332
|
2
|
|
|
|
|
|
pclmulqdq00(aw2, h2w), |
|
333
|
2
|
|
|
|
|
|
pclmulqdq00(aw3, h1w))); |
|
334
|
4
|
|
|
|
|
|
t2 = _mm_xor_si128( |
|
335
|
|
|
|
|
|
|
_mm_xor_si128( |
|
336
|
2
|
|
|
|
|
|
pclmulqdq00(ax0, h4x), |
|
337
|
2
|
|
|
|
|
|
pclmulqdq00(ax1, h3x)), |
|
338
|
|
|
|
|
|
|
_mm_xor_si128( |
|
339
|
2
|
|
|
|
|
|
pclmulqdq00(ax2, h2x), |
|
340
|
2
|
|
|
|
|
|
pclmulqdq00(ax3, h1x))); |
|
341
|
2
|
|
|
|
|
|
t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3)); |
|
342
|
2
|
|
|
|
|
|
t0 = _mm_shuffle_epi32(t1, 0x0E); |
|
343
|
2
|
|
|
|
|
|
t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E)); |
|
344
|
4
|
|
|
|
|
|
t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E)); |
|
345
|
20
|
|
|
|
|
|
SL_256(t0, t1, t2, t3); |
|
346
|
52
|
|
|
|
|
|
REDUCE_F128(t0, t1, t2, t3); |
|
347
|
2
|
|
|
|
|
|
yw = _mm_unpacklo_epi64(t1, t0); |
|
348
|
|
|
|
|
|
|
} |
|
349
|
|
|
|
|
|
|
} |
|
350
|
|
|
|
|
|
|
|
|
351
|
138
|
100
|
|
|
|
|
while (num1 -- > 0) { |
|
352
|
|
|
|
|
|
|
__m128i aw, ax; |
|
353
|
|
|
|
|
|
|
__m128i t0, t1, t2, t3; |
|
354
|
|
|
|
|
|
|
|
|
355
|
70
|
|
|
|
|
|
aw = _mm_loadu_si128((void *)buf2); |
|
356
|
70
|
|
|
|
|
|
BYTESWAP(aw); |
|
357
|
70
|
|
|
|
|
|
buf2 += 16; |
|
358
|
|
|
|
|
|
|
|
|
359
|
70
|
|
|
|
|
|
aw = _mm_xor_si128(aw, yw); |
|
360
|
70
|
|
|
|
|
|
BK(aw, ax); |
|
361
|
|
|
|
|
|
|
|
|
362
|
70
|
|
|
|
|
|
t1 = pclmulqdq11(aw, h1w); |
|
363
|
70
|
|
|
|
|
|
t3 = pclmulqdq00(aw, h1w); |
|
364
|
70
|
|
|
|
|
|
t2 = pclmulqdq00(ax, h1x); |
|
365
|
70
|
|
|
|
|
|
t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3)); |
|
366
|
70
|
|
|
|
|
|
t0 = _mm_shuffle_epi32(t1, 0x0E); |
|
367
|
70
|
|
|
|
|
|
t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E)); |
|
368
|
140
|
|
|
|
|
|
t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E)); |
|
369
|
700
|
|
|
|
|
|
SL_256(t0, t1, t2, t3); |
|
370
|
1820
|
|
|
|
|
|
REDUCE_F128(t0, t1, t2, t3); |
|
371
|
70
|
|
|
|
|
|
yw = _mm_unpacklo_epi64(t1, t0); |
|
372
|
|
|
|
|
|
|
} |
|
373
|
|
|
|
|
|
|
|
|
374
|
68
|
|
|
|
|
|
BYTESWAP(yw); |
|
375
|
|
|
|
|
|
|
_mm_storeu_si128(y, yw); |
|
376
|
68
|
|
|
|
|
|
} |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
BR_TARGETS_X86_DOWN |
|
379
|
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
#else |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
/* see bearssl_hash.h */ |
|
383
|
|
|
|
|
|
|
br_ghash |
|
384
|
|
|
|
|
|
|
br_ghash_pclmul_get(void) |
|
385
|
|
|
|
|
|
|
{ |
|
386
|
|
|
|
|
|
|
return 0; |
|
387
|
|
|
|
|
|
|
} |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
#endif |