| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* |
|
2
|
|
|
|
|
|
|
* Copyright (c) 2017 Thomas Pornin |
|
3
|
|
|
|
|
|
|
* |
|
4
|
|
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining |
|
5
|
|
|
|
|
|
|
* a copy of this software and associated documentation files (the |
|
6
|
|
|
|
|
|
|
* "Software"), to deal in the Software without restriction, including |
|
7
|
|
|
|
|
|
|
* without limitation the rights to use, copy, modify, merge, publish, |
|
8
|
|
|
|
|
|
|
* distribute, sublicense, and/or sell copies of the Software, and to |
|
9
|
|
|
|
|
|
|
* permit persons to whom the Software is furnished to do so, subject to |
|
10
|
|
|
|
|
|
|
* the following conditions: |
|
11
|
|
|
|
|
|
|
* |
|
12
|
|
|
|
|
|
|
* The above copyright notice and this permission notice shall be |
|
13
|
|
|
|
|
|
|
* included in all copies or substantial portions of the Software. |
|
14
|
|
|
|
|
|
|
* |
|
15
|
|
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
16
|
|
|
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|
17
|
|
|
|
|
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
|
18
|
|
|
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
|
19
|
|
|
|
|
|
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
|
20
|
|
|
|
|
|
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
|
21
|
|
|
|
|
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|
22
|
|
|
|
|
|
|
* SOFTWARE. |
|
23
|
|
|
|
|
|
|
*/ |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
#define BR_POWER_ASM_MACROS 1 |
|
26
|
|
|
|
|
|
|
#include "inner.h" |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
/* |
|
29
|
|
|
|
|
|
|
* This is the GHASH implementation that leverages the POWER8 opcodes. |
|
30
|
|
|
|
|
|
|
*/ |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
#if BR_POWER8 |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
/* |
|
35
|
|
|
|
|
|
|
* Some symbolic names for registers. |
|
36
|
|
|
|
|
|
|
* HB0 = 16 bytes of value 0 |
|
37
|
|
|
|
|
|
|
* HB1 = 16 bytes of value 1 |
|
38
|
|
|
|
|
|
|
* HB2 = 16 bytes of value 2 |
|
39
|
|
|
|
|
|
|
* HB6 = 16 bytes of value 6 |
|
40
|
|
|
|
|
|
|
* HB7 = 16 bytes of value 7 |
|
41
|
|
|
|
|
|
|
* TT0, TT1 and TT2 are temporaries |
|
42
|
|
|
|
|
|
|
* |
|
43
|
|
|
|
|
|
|
* BSW holds the pattern for byteswapping 32-bit words; this is set only |
|
44
|
|
|
|
|
|
|
* on little-endian systems. XBSW is the same register with the +32 offset |
|
45
|
|
|
|
|
|
|
* for access with the VSX opcodes. |
|
46
|
|
|
|
|
|
|
*/ |
|
47
|
|
|
|
|
|
|
#define HB0 0   /* v0: 16 bytes of value 0 */
#define HB1 1   /* v1: 16 bytes of value 1 */
#define HB2 2   /* v2: 16 bytes of value 2 */
#define HB6 3   /* v3: 16 bytes of value 6 (name reflects contents, not register number) */
#define HB7 4   /* v4: 16 bytes of value 7 (idem) */
#define TT0 5   /* v5: temporary */
#define TT1 6   /* v6: temporary */
#define TT2 7   /* v7: temporary */

#define BSW 8   /* v8: byteswap pattern (set only on little-endian systems) */
#define XBSW 40 /* same register as BSW, with the +32 offset used by VSX opcodes */
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
/* |
|
60
|
|
|
|
|
|
|
* Macro to initialise the constants. |
|
61
|
|
|
|
|
|
|
*/ |
|
62
|
|
|
|
|
|
|
/*
 * Fill the constant registers. Must run before any use of HB0..HB7
 * or FIX_ENDIAN(). INIT_BSW expands to nothing on big-endian systems.
 */
#define INIT \
	vxor(HB0, HB0, HB0) /* HB0 <- all-zero */ \
	vspltisb(HB1, 1)    /* HB1 <- splat byte value 1 */ \
	vspltisb(HB2, 2)    /* HB2 <- splat byte value 2 */ \
	vspltisb(HB6, 6)    /* HB6 <- splat byte value 6 */ \
	vspltisb(HB7, 7)    /* HB7 <- splat byte value 7 */ \
	INIT_BSW            /* load byteswap pattern (little-endian only) */
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
/* |
|
71
|
|
|
|
|
|
|
* Fix endianness of a value after reading it or before writing it, if |
|
72
|
|
|
|
|
|
|
* necessary. |
|
73
|
|
|
|
|
|
|
*/ |
|
74
|
|
|
|
|
|
|
#if BR_POWER8_LE
/* Little-endian: load the 32-bit-word byteswap pattern from idx2be[],
   then use vperm to swap bytes within each 32-bit word. */
#define INIT_BSW lxvw4x(XBSW, 0, %[idx2be])
#define FIX_ENDIAN(xx) vperm(xx, xx, xx, BSW)
#else
/* Big-endian: memory layout already matches register convention;
   both macros expand to nothing. */
#define INIT_BSW
#define FIX_ENDIAN(xx)
#endif
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
/* |
|
83
|
|
|
|
|
|
|
* Left-shift x0:x1 by one bit to the left. This is a corrective action |
|
84
|
|
|
|
|
|
|
* needed because GHASH is defined in full little-endian specification, |
|
85
|
|
|
|
|
|
|
* while the opcodes use full big-endian convention, so the 255-bit product |
|
86
|
|
|
|
|
|
|
* ends up one bit to the right. |
|
87
|
|
|
|
|
|
|
*/ |
|
88
|
|
|
|
|
|
|
#define SL_256(x0, x1) \
	/* TT0 <- x1's most significant byte, moved into the least
	   significant byte position (15 zero bytes then x1 byte 0) */ \
	vsldoi(TT0, HB0, x1, 1) \
	vsl(x0, x0, HB1)        /* x0 <<= 1 (128-bit shift) */ \
	vsr(TT0, TT0, HB7)      /* keep only x1's top bit: the carry into x0 */ \
	vsl(x1, x1, HB1)        /* x1 <<= 1 (128-bit shift) */ \
	vxor(x0, x0, TT0)       /* inject the carry bit into x0 */
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
/* |
|
96
|
|
|
|
|
|
|
* Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as |
|
97
|
|
|
|
|
|
|
* x0 or x1, or a different register). x0 and x1 are modified. |
|
98
|
|
|
|
|
|
|
*/ |
|
99
|
|
|
|
|
|
|
#define REDUCE_F128(xd, x0, x1) \
	/* First folding pass: fold the high half x1 into x0. The 1, 2
	   and 7-bit right shifts correspond to the terms of the GHASH
	   reduction polynomial x^128 + x^7 + x^2 + x + 1 (in the
	   bit-reversed convention used by these opcodes). */ \
	vxor(x0, x0, x1) \
	vsr(TT0, x1, HB1) \
	vsr(TT1, x1, HB2) \
	vsr(TT2, x1, HB7) \
	vxor(x0, x0, TT0) \
	vxor(TT1, TT1, TT2) \
	vxor(x0, x0, TT1) \
	/* Second pass: the low byte of x1 still carries bits shifted out
	   above; isolate it (shift left by 15 bytes), multiply it by the
	   polynomial tail (shifts by 6 and 1 bits)... */ \
	vsldoi(x1, x1, HB0, 15) \
	vsl(TT1, x1, HB6) \
	vsl(TT2, x1, HB1) \
	vxor(x1, TT1, TT2) \
	/* ...and fold the result into x0 with the same 1/2/7-bit
	   shift pattern. Final reduced value lands in xd. */ \
	vsr(TT0, x1, HB1) \
	vsr(TT1, x1, HB2) \
	vsr(TT2, x1, HB7) \
	vxor(x0, x0, x1) \
	vxor(x0, x0, TT0) \
	vxor(TT1, TT1, TT2) \
	vxor(xd, x0, TT1)
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
/* see bearssl_hash.h */ |
|
120
|
|
|
|
|
|
|
/*
 * GHASH update using the POWER8 vpmsumd carry-less multiply opcode.
 *
 * y     16-byte GHASH accumulator, read and updated in place.
 * h     16-byte hash key (read only).
 * data  input bytes; a partial final 16-byte block is zero-padded
 *       (via the tmp[] copy below) before being absorbed.
 * len   number of input bytes.
 *
 * Vector registers are named by number inside the asm block; a VSX
 * register number is the vector register number plus 32 (e.g. VSX 41
 * is v9), as for XBSW above.
 */
void
br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	size_t num4, num1;
	unsigned char tmp[64];
	/*
	 * Block offsets 0/16/32/48 are kept in registers because the
	 * lxvw4x loads in the four-block loop take a register index.
	 */
	long cc0, cc1, cc2, cc3;

#if BR_POWER8_LE
	/* vperm pattern that byteswaps each 32-bit word (see FIX_ENDIAN). */
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif

	buf1 = data;

	/*
	 * Assembly code requires data into two chunks; first chunk
	 * must contain a number of blocks which is a multiple of 4.
	 * Since the processing for the first chunk is faster, we want
	 * to make it as big as possible.
	 *
	 * For the remainder, there are two possibilities:
	 * -- if the remainder size is a multiple of 16, then use it
	 * in place;
	 * -- otherwise, copy it to the tmp[] array and pad it with
	 * zeros.
	 */
	num4 = len >> 6;                 /* number of 4-block (64-byte) groups */
	buf2 = buf1 + (num4 << 6);       /* start of the remainder chunk */
	len &= 63;                       /* remainder size (0..63) */
	num1 = (len + 15) >> 4;          /* remainder block count, rounded up */
	if ((len & 15) != 0) {
		/* Partial last block: copy and zero-pad into tmp[]. */
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	cc0 = 0;
	cc1 = 16;
	cc2 = 32;
	cc3 = 48;
	asm volatile (
		INIT

		/*
		 * Load current h (denoted hereafter h1) in v9.
		 * (VSX register 41 is v9.)
		 */
		lxvw4x(41, 0, %[h])
		FIX_ENDIAN(9)

		/*
		 * Load current y into v28. (VSX register 60 is v28.)
		 */
		lxvw4x(60, 0, %[y])
		FIX_ENDIAN(28)

		/*
		 * Split h1 into three registers:
		 *   v17 = h1_1:h1_0   (doublewords swapped)
		 *   v18 = 0:h1_0
		 *   v19 = h1_1:0
		 */
		xxpermdi(49, 41, 41, 2)
		vsldoi(18, HB0, 9, 8)
		vsldoi(19, 9, HB0, 8)

		/*
		 * If num4 is 0, skip directly to the second chunk.
		 */
		cmpldi(%[num4], 0)
		beq(chunk1)

		/*
		 * Compute h2 = h*h in v10 (square by multiplying each
		 * half with itself, then shift and reduce).
		 */
		vpmsumd(10, 18, 18)
		vpmsumd(11, 19, 19)
		SL_256(10, 11)
		REDUCE_F128(10, 10, 11)

		/*
		 * Compute h3 = h*h*h in v11.
		 * We first split h2 into:
		 *   v10 = h2_0:h2_1
		 *   v11 = 0:h2_0
		 *   v12 = h2_1:0
		 * Then we do the product with h1, and reduce into v11.
		 */
		vsldoi(11, HB0, 10, 8)
		vsldoi(12, 10, HB0, 8)
		vpmsumd(13, 10, 17)
		vpmsumd(11, 11, 18)
		vpmsumd(12, 12, 19)
		vsldoi(14, HB0, 13, 8)
		vsldoi(15, 13, HB0, 8)
		vxor(11, 11, 14)
		vxor(12, 12, 15)
		SL_256(11, 12)
		REDUCE_F128(11, 11, 12)

		/*
		 * Compute h4 = h*h*h*h in v12. This is done by squaring h2.
		 */
		vsldoi(12, HB0, 10, 8)
		vsldoi(13, 10, HB0, 8)
		vpmsumd(12, 12, 12)
		vpmsumd(13, 13, 13)
		SL_256(12, 13)
		REDUCE_F128(12, 12, 13)

		/*
		 * Repack h1, h2, h3 and h4:
		 *   v13 = h4_0:h3_0
		 *   v14 = h4_1:h3_1
		 *   v15 = h2_0:h1_0
		 *   v16 = h2_1:h1_1
		 */
		xxpermdi(45, 44, 43, 0)
		xxpermdi(46, 44, 43, 3)
		xxpermdi(47, 42, 41, 0)
		xxpermdi(48, 42, 41, 3)

		/*
		 * Loop for each group of four blocks.
		 */
		mtctr(%[num4])
	label(loop4)
		/*
		 * Read the four next blocks.
		 *   v20 = y + a0 = b0   (accumulator folded into block 0)
		 *   v21 = a1 = b1
		 *   v22 = a2 = b2
		 *   v23 = a3 = b3
		 */
		lxvw4x(52, %[cc0], %[buf1])
		lxvw4x(53, %[cc1], %[buf1])
		lxvw4x(54, %[cc2], %[buf1])
		lxvw4x(55, %[cc3], %[buf1])
		FIX_ENDIAN(20)
		FIX_ENDIAN(21)
		FIX_ENDIAN(22)
		FIX_ENDIAN(23)
		addi(%[buf1], %[buf1], 64)
		vxor(20, 20, 28)

		/*
		 * Repack the blocks into v9, v10, v11 and v12.
		 *   v9  = b0_0:b1_0
		 *   v10 = b0_1:b1_1
		 *   v11 = b2_0:b3_0
		 *   v12 = b2_1:b3_1
		 */
		xxpermdi(41, 52, 53, 0)
		xxpermdi(42, 52, 53, 3)
		xxpermdi(43, 54, 55, 0)
		xxpermdi(44, 54, 55, 3)

		/*
		 * Compute the products (b0*h4 + b1*h3 + b2*h2 + b3*h1,
		 * two paired multiplications per vpmsumd):
		 *   v20 = b0_0*h4_0 + b1_0*h3_0
		 *   v21 = b0_1*h4_0 + b1_1*h3_0
		 *   v22 = b0_0*h4_1 + b1_0*h3_1
		 *   v23 = b0_1*h4_1 + b1_1*h3_1
		 *   v24 = b2_0*h2_0 + b3_0*h1_0
		 *   v25 = b2_1*h2_0 + b3_1*h1_0
		 *   v26 = b2_0*h2_1 + b3_0*h1_1
		 *   v27 = b2_1*h2_1 + b3_1*h1_1
		 */
		vpmsumd(20, 13, 9)
		vpmsumd(21, 13, 10)
		vpmsumd(22, 14, 9)
		vpmsumd(23, 14, 10)
		vpmsumd(24, 15, 11)
		vpmsumd(25, 15, 12)
		vpmsumd(26, 16, 11)
		vpmsumd(27, 16, 12)

		/*
		 * Sum products into a single 256-bit result in v11:v12.
		 * The middle terms (v9^v10, in v20) straddle the two
		 * halves and are added with an 8-byte offset.
		 */
		vxor(11, 20, 24)
		vxor(12, 23, 27)
		vxor( 9, 21, 22)
		vxor(10, 25, 26)
		vxor(20, 9, 10)
		vsldoi( 9, HB0, 20, 8)
		vsldoi(10, 20, HB0, 8)
		vxor(11, 11, 9)
		vxor(12, 12, 10)

		/*
		 * Fix and reduce in GF(2^128); this is the new y (in v28).
		 */
		SL_256(11, 12)
		REDUCE_F128(28, 11, 12)

		/*
		 * Loop for next group of four blocks.
		 */
		bdnz(loop4)

		/*
		 * Process second chunk, one block at a time.
		 */
	label(chunk1)
		cmpldi(%[num1], 0)
		beq(done)

		mtctr(%[num1])
	label(loop1)
		/*
		 * Load next data block and XOR it into y.
		 * (The #if is redundant with FIX_ENDIAN's own definition,
		 * but kept for clarity: swapping only happens on LE.)
		 */
		lxvw4x(41, 0, %[buf2])
#if BR_POWER8_LE
		FIX_ENDIAN(9)
#endif
		addi(%[buf2], %[buf2], 16)
		vxor(9, 28, 9)

		/*
		 * Split y into doublewords:
		 *   v9  = y_0:y_1
		 *   v10 = 0:y_0
		 *   v11 = y_1:0
		 */
		vsldoi(10, HB0, 9, 8)
		vsldoi(11, 9, HB0, 8)

		/*
		 * Compute products with h:
		 *   v12 = y_0 * h_0
		 *   v13 = y_1 * h_1
		 *   v14 = y_1 * h_0 + y_0 * h_1
		 */
		vpmsumd(14, 9, 17)
		vpmsumd(12, 10, 18)
		vpmsumd(13, 11, 19)

		/*
		 * Propagate v14 into v12:v13 to finalise product.
		 */
		vsldoi(10, HB0, 14, 8)
		vsldoi(11, 14, HB0, 8)
		vxor(12, 12, 10)
		vxor(13, 13, 11)

		/*
		 * Fix result and reduce into v28 (next value for y).
		 */
		SL_256(12, 13)
		REDUCE_F128(28, 12, 13)
		bdnz(loop1)

	label(done)
		/*
		 * Write back the new y.
		 */
		FIX_ENDIAN(28)
		stxvw4x(60, 0, %[y])

: [buf1] "+b" (buf1), [buf2] "+b" (buf2)
: [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
  [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
#if BR_POWER8_LE
, [idx2be] "b" (idx2be)
#endif
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
  "ctr", "memory"
	);
}
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
/* see bearssl_hash.h */ |
|
396
|
|
|
|
|
|
|
br_ghash |
|
397
|
|
|
|
|
|
|
br_ghash_pwr8_get(void) |
|
398
|
|
|
|
|
|
|
{ |
|
399
|
|
|
|
|
|
|
return &br_ghash_pwr8; |
|
400
|
|
|
|
|
|
|
} |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
#else |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
/* see bearssl_hash.h */ |
|
405
|
|
|
|
|
|
|
br_ghash
br_ghash_pwr8_get(void)
{
	/*
	 * POWER8 opcodes not enabled in this build: report the
	 * implementation as unavailable (null function pointer).
	 */
	return 0;
}
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
#endif |