File Coverage

src/symcipher/aes_x86ni_ctrcbc.c
Criterion Covered Total %
statement 2 338 0.5
branch 1 46 2.1
condition n/a
subroutine n/a
pod n/a
total 3 384 0.7


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2017 Thomas Pornin
3             *
4             * Permission is hereby granted, free of charge, to any person obtaining
5             * a copy of this software and associated documentation files (the
6             * "Software"), to deal in the Software without restriction, including
7             * without limitation the rights to use, copy, modify, merge, publish,
8             * distribute, sublicense, and/or sell copies of the Software, and to
9             * permit persons to whom the Software is furnished to do so, subject to
10             * the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be
13             * included in all copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16             * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17             * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18             * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19             * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20             * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21             * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22             * SOFTWARE.
23             */
24              
25             #define BR_ENABLE_INTRINSICS 1
26             #include "inner.h"
27              
28             #if BR_AES_X86NI
29              
30             /* see bearssl_block.h */
               /*
                * Returns the address of br_aes_x86ni_ctrcbc_vtable when
                * br_aes_x86ni_supported() yields a non-zero value, and NULL
                * otherwise, so callers can probe for this implementation.
                */
31             const br_block_ctrcbc_class *
32 52           br_aes_x86ni_ctrcbc_get_vtable(void)
33             {
34 52 50         return br_aes_x86ni_supported() ? &br_aes_x86ni_ctrcbc_vtable : NULL;
35             }
36              
37             /* see bearssl_block.h */
               /*
                * Initialises a key context: binds ctx to
                * br_aes_x86ni_ctrcbc_vtable and stores in ctx->num_rounds the
                * value returned by br_aes_x86ni_keysched_enc(), which receives
                * ctx->skey.skni, the key pointer and the key length.
                *
                * NOTE(review): presumably the helper writes the expanded
                * encryption subkeys into ctx->skey.skni and returns the round
                * count (the functions below test it against 10 and 12) --
                * confirm against its definition.
                */
38             void
39 0           br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys *ctx,
40             const void *key, size_t len)
41             {
42 0           ctx->vtable = &br_aes_x86ni_ctrcbc_vtable;
43 0           ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
44 0           }
45              
46             BR_TARGETS_X86_UP
47              
48             /* see bearssl_block.h */
               /*
                * CTR-mode processing: XORs the len bytes at data, in place,
                * with the AES encryption of successive counter values, four
                * blocks per loop iteration, then writes the advanced counter
                * back to ctr.
                *
                *   ctx    key context (subkeys and round count are read from it)
                *   ctr    16-byte counter, read and updated; handled as a
                *          big-endian 128-bit integer
                *   data   buffer transformed in place
                *   len    number of bytes to process
                */
49             BR_TARGET("sse2,sse4.1,aes")
50             void
51 0           br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys *ctx,
52             void *ctr, void *data, size_t len)
53             {
54             unsigned char *buf;
55             unsigned num_rounds;
56             __m128i sk[15];
57             __m128i ivx0, ivx1, ivx2, ivx3;
58             __m128i erev, zero, one, four, notthree;
59             unsigned u;
60              
61 0           buf = data;
62 0           num_rounds = ctx->num_rounds;
               /*
                * Copy the num_rounds + 1 subkeys (16 bytes each, hence the
                * byte offset u << 4) into local vectors. sk[] has 15 slots,
                * so num_rounds must not exceed 14.
                */
63 0 0         for (u = 0; u <= num_rounds; u ++) {
64 0           sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
65             }
66              
67             /*
68             * Some SSE2 constants.
               *
               * erev      shuffle mask that reverses the 16 bytes of a vector
               * one/four  the value 1 (resp. 4) in the low 64-bit lane, 0 in
               *           the high lane
               * notthree  zero - four per 64-bit lane: the low lane has every
               *           bit set except the two lowest, the high lane is 0
69             */
70 0           erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
71             8, 9, 10, 11, 12, 13, 14, 15);
72 0           zero = _mm_setzero_si128();
73 0           one = _mm_set_epi64x(0, 1);
74 0           four = _mm_set_epi64x(0, 4);
75 0           notthree = _mm_sub_epi64(zero, four);
76              
77             /*
78             * Decode the counter in big-endian and pre-increment the other
79             * three counters.
               *
               * Each increment adds 1 to the low 64-bit lane. If that lane
               * became zero, the comparison yields all-ones (-1) in the low
               * lane; the 8-byte left shift moves it to the high lane, and
               * subtracting -1 there propagates the carry.
80             */
81 0           ivx0 = _mm_shuffle_epi8(_mm_loadu_si128((void *)ctr), erev);
82 0           ivx1 = _mm_add_epi64(ivx0, one);
83 0           ivx1 = _mm_sub_epi64(ivx1,
84 0           _mm_slli_si128(_mm_cmpeq_epi64(ivx1, zero), 8));
85 0           ivx2 = _mm_add_epi64(ivx1, one);
86 0           ivx2 = _mm_sub_epi64(ivx2,
87 0           _mm_slli_si128(_mm_cmpeq_epi64(ivx2, zero), 8));
88 0           ivx3 = _mm_add_epi64(ivx2, one);
89 0           ivx3 = _mm_sub_epi64(ivx3,
90 0           _mm_slli_si128(_mm_cmpeq_epi64(ivx3, zero), 8));
91 0 0         while (len > 0) {
92             __m128i x0, x1, x2, x3;
93              
94             /*
95             * Load counter values; we need to byteswap them because
96             * the specification says that they use big-endian.
97             */
98 0           x0 = _mm_shuffle_epi8(ivx0, erev);
99 0           x1 = _mm_shuffle_epi8(ivx1, erev);
100 0           x2 = _mm_shuffle_epi8(ivx2, erev);
101 0           x3 = _mm_shuffle_epi8(ivx3, erev);
102              
               /*
                * AES encryption of the four counter blocks, interleaved
                * round by round: initial subkey XOR, rounds 1 to 9
                * unconditionally, then the remaining rounds depending on
                * num_rounds (10, 12, or otherwise the 14-round sequence).
                */
103 0           x0 = _mm_xor_si128(x0, sk[0]);
104 0           x1 = _mm_xor_si128(x1, sk[0]);
105 0           x2 = _mm_xor_si128(x2, sk[0]);
106 0           x3 = _mm_xor_si128(x3, sk[0]);
107 0           x0 = _mm_aesenc_si128(x0, sk[1]);
108 0           x1 = _mm_aesenc_si128(x1, sk[1]);
109 0           x2 = _mm_aesenc_si128(x2, sk[1]);
110 0           x3 = _mm_aesenc_si128(x3, sk[1]);
111 0           x0 = _mm_aesenc_si128(x0, sk[2]);
112 0           x1 = _mm_aesenc_si128(x1, sk[2]);
113 0           x2 = _mm_aesenc_si128(x2, sk[2]);
114 0           x3 = _mm_aesenc_si128(x3, sk[2]);
115 0           x0 = _mm_aesenc_si128(x0, sk[3]);
116 0           x1 = _mm_aesenc_si128(x1, sk[3]);
117 0           x2 = _mm_aesenc_si128(x2, sk[3]);
118 0           x3 = _mm_aesenc_si128(x3, sk[3]);
119 0           x0 = _mm_aesenc_si128(x0, sk[4]);
120 0           x1 = _mm_aesenc_si128(x1, sk[4]);
121 0           x2 = _mm_aesenc_si128(x2, sk[4]);
122 0           x3 = _mm_aesenc_si128(x3, sk[4]);
123 0           x0 = _mm_aesenc_si128(x0, sk[5]);
124 0           x1 = _mm_aesenc_si128(x1, sk[5]);
125 0           x2 = _mm_aesenc_si128(x2, sk[5]);
126 0           x3 = _mm_aesenc_si128(x3, sk[5]);
127 0           x0 = _mm_aesenc_si128(x0, sk[6]);
128 0           x1 = _mm_aesenc_si128(x1, sk[6]);
129 0           x2 = _mm_aesenc_si128(x2, sk[6]);
130 0           x3 = _mm_aesenc_si128(x3, sk[6]);
131 0           x0 = _mm_aesenc_si128(x0, sk[7]);
132 0           x1 = _mm_aesenc_si128(x1, sk[7]);
133 0           x2 = _mm_aesenc_si128(x2, sk[7]);
134 0           x3 = _mm_aesenc_si128(x3, sk[7]);
135 0           x0 = _mm_aesenc_si128(x0, sk[8]);
136 0           x1 = _mm_aesenc_si128(x1, sk[8]);
137 0           x2 = _mm_aesenc_si128(x2, sk[8]);
138 0           x3 = _mm_aesenc_si128(x3, sk[8]);
139 0           x0 = _mm_aesenc_si128(x0, sk[9]);
140 0           x1 = _mm_aesenc_si128(x1, sk[9]);
141 0           x2 = _mm_aesenc_si128(x2, sk[9]);
142 0           x3 = _mm_aesenc_si128(x3, sk[9]);
143 0 0         if (num_rounds == 10) {
144 0           x0 = _mm_aesenclast_si128(x0, sk[10]);
145 0           x1 = _mm_aesenclast_si128(x1, sk[10]);
146 0           x2 = _mm_aesenclast_si128(x2, sk[10]);
147 0           x3 = _mm_aesenclast_si128(x3, sk[10]);
148 0 0         } else if (num_rounds == 12) {
149 0           x0 = _mm_aesenc_si128(x0, sk[10]);
150 0           x1 = _mm_aesenc_si128(x1, sk[10]);
151 0           x2 = _mm_aesenc_si128(x2, sk[10]);
152 0           x3 = _mm_aesenc_si128(x3, sk[10]);
153 0           x0 = _mm_aesenc_si128(x0, sk[11]);
154 0           x1 = _mm_aesenc_si128(x1, sk[11]);
155 0           x2 = _mm_aesenc_si128(x2, sk[11]);
156 0           x3 = _mm_aesenc_si128(x3, sk[11]);
157 0           x0 = _mm_aesenclast_si128(x0, sk[12]);
158 0           x1 = _mm_aesenclast_si128(x1, sk[12]);
159 0           x2 = _mm_aesenclast_si128(x2, sk[12]);
160 0           x3 = _mm_aesenclast_si128(x3, sk[12]);
161             } else {
162 0           x0 = _mm_aesenc_si128(x0, sk[10]);
163 0           x1 = _mm_aesenc_si128(x1, sk[10]);
164 0           x2 = _mm_aesenc_si128(x2, sk[10]);
165 0           x3 = _mm_aesenc_si128(x3, sk[10]);
166 0           x0 = _mm_aesenc_si128(x0, sk[11]);
167 0           x1 = _mm_aesenc_si128(x1, sk[11]);
168 0           x2 = _mm_aesenc_si128(x2, sk[11]);
169 0           x3 = _mm_aesenc_si128(x3, sk[11]);
170 0           x0 = _mm_aesenc_si128(x0, sk[12]);
171 0           x1 = _mm_aesenc_si128(x1, sk[12]);
172 0           x2 = _mm_aesenc_si128(x2, sk[12]);
173 0           x3 = _mm_aesenc_si128(x3, sk[12]);
174 0           x0 = _mm_aesenc_si128(x0, sk[13]);
175 0           x1 = _mm_aesenc_si128(x1, sk[13]);
176 0           x2 = _mm_aesenc_si128(x2, sk[13]);
177 0           x3 = _mm_aesenc_si128(x3, sk[13]);
178 0           x0 = _mm_aesenclast_si128(x0, sk[14]);
179 0           x1 = _mm_aesenclast_si128(x1, sk[14]);
180 0           x2 = _mm_aesenclast_si128(x2, sk[14]);
181 0           x3 = _mm_aesenclast_si128(x3, sk[14]);
182             }
183 0 0         if (len >= 64) {
               /*
                * Full group: XOR the four keystream blocks into
                * the next 64 bytes of the buffer.
                */
184 0           x0 = _mm_xor_si128(x0,
185             _mm_loadu_si128((void *)(buf + 0)));
186 0           x1 = _mm_xor_si128(x1,
187 0           _mm_loadu_si128((void *)(buf + 16)));
188 0           x2 = _mm_xor_si128(x2,
189 0           _mm_loadu_si128((void *)(buf + 32)));
190 0           x3 = _mm_xor_si128(x3,
191 0           _mm_loadu_si128((void *)(buf + 48)));
192             _mm_storeu_si128((void *)(buf + 0), x0);
193 0           _mm_storeu_si128((void *)(buf + 16), x1);
194 0           _mm_storeu_si128((void *)(buf + 32), x2);
195 0           _mm_storeu_si128((void *)(buf + 48), x3);
196 0           buf += 64;
197 0           len -= 64;
198             } else {
               /*
                * Fewer than 64 bytes remain: dump the keystream to
                * a temporary and XOR only the first len bytes. Then
                * select as ivx0 the counter that follows the last
                * block consumed (len of 16, 32 or 48); any other
                * len leaves ivx0 unchanged. This is the final pass.
                */
199             unsigned char tmp[64];
200              
201             _mm_storeu_si128((void *)(tmp + 0), x0);
202 0           _mm_storeu_si128((void *)(tmp + 16), x1);
203 0           _mm_storeu_si128((void *)(tmp + 32), x2);
204 0           _mm_storeu_si128((void *)(tmp + 48), x3);
205 0 0         for (u = 0; u < len; u ++) {
206 0           buf[u] ^= tmp[u];
207             }
208 0           switch (len) {
209 0           case 16:
210 0           ivx0 = ivx1;
211 0           break;
212 0           case 32:
213 0           ivx0 = ivx2;
214 0           break;
215 0           case 48:
216 0           ivx0 = ivx3;
217 0           break;
218             }
219 0           break;
220             }
221              
222             /*
223             * Add 4 to each counter value. For carry propagation
224             * into the upper 64-bit words, we would need to compare
225             * the results with 4, but SSE2+ has only _signed_
226             * comparisons. Instead, we mask out the low two bits,
227             * and check whether the remaining bits are zero.
               *
               * The AND with notthree also clears the high lane, so
               * that lane always compares equal; the 8-byte shift
               * discards it and moves the low-lane result (all-ones
               * when a wrap occurred) into the high lane, where the
               * subtraction adds the carry.
228             */
229 0           ivx0 = _mm_add_epi64(ivx0, four);
230 0           ivx1 = _mm_add_epi64(ivx1, four);
231 0           ivx2 = _mm_add_epi64(ivx2, four);
232 0           ivx3 = _mm_add_epi64(ivx3, four);
233 0           ivx0 = _mm_sub_epi64(ivx0,
234 0           _mm_slli_si128(_mm_cmpeq_epi64(
235             _mm_and_si128(ivx0, notthree), zero), 8));
236 0           ivx1 = _mm_sub_epi64(ivx1,
237 0           _mm_slli_si128(_mm_cmpeq_epi64(
238             _mm_and_si128(ivx1, notthree), zero), 8));
239 0           ivx2 = _mm_sub_epi64(ivx2,
240 0           _mm_slli_si128(_mm_cmpeq_epi64(
241             _mm_and_si128(ivx2, notthree), zero), 8));
242 0           ivx3 = _mm_sub_epi64(ivx3,
243 0           _mm_slli_si128(_mm_cmpeq_epi64(
244             _mm_and_si128(ivx3, notthree), zero), 8));
245             }
246              
247             /*
248             * Write back new counter value. The loop took care to put the
249             * right counter value in ivx0.
250             */
251 0           _mm_storeu_si128((void *)ctr, _mm_shuffle_epi8(ivx0, erev));
252 0           }
253              
254             /* see bearssl_block.h */
               /*
                * CBC-MAC update: for each 16-byte block of data, the 16-byte
                * state at cbcmac is replaced with the AES encryption of
                * (block XOR state). The data itself is only read.
                *
                *   ctx     key context (subkeys and round count are read from it)
                *   cbcmac  16-byte running MAC value, read then overwritten
                *   data    input bytes
                *   len     input length in bytes
                *
                * NOTE(review): the loop removes 16 from len per pass and stops
                * only when len reaches 0; since len is an unsigned size_t, a
                * length that is not a multiple of 16 would wrap around instead
                * of terminating -- callers presumably guarantee a multiple of 16.
                */
255             BR_TARGET("sse2,sse4.1,aes")
256             void
257 0           br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys *ctx,
258             void *cbcmac, const void *data, size_t len)
259             {
260             const unsigned char *buf;
261             unsigned num_rounds;
262             __m128i sk[15], ivx;
263             unsigned u;
264              
265 0           buf = data;
266 0           ivx = _mm_loadu_si128(cbcmac);
267 0           num_rounds = ctx->num_rounds;
               /* Copy the num_rounds + 1 subkeys (16 bytes each) into locals. */
268 0 0         for (u = 0; u <= num_rounds; u ++) {
269 0           sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
270             }
271 0 0         while (len > 0) {
272             __m128i x;
273              
               /* Chain: XOR the next block with the running value. */
274 0           x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx);
               /*
                * AES encryption: initial subkey XOR, rounds 1 to 9, then
                * the tail selected by num_rounds (10, 12, or otherwise
                * the 14-round sequence).
                */
275 0           x = _mm_xor_si128(x, sk[0]);
276 0           x = _mm_aesenc_si128(x, sk[1]);
277 0           x = _mm_aesenc_si128(x, sk[2]);
278 0           x = _mm_aesenc_si128(x, sk[3]);
279 0           x = _mm_aesenc_si128(x, sk[4]);
280 0           x = _mm_aesenc_si128(x, sk[5]);
281 0           x = _mm_aesenc_si128(x, sk[6]);
282 0           x = _mm_aesenc_si128(x, sk[7]);
283 0           x = _mm_aesenc_si128(x, sk[8]);
284 0           x = _mm_aesenc_si128(x, sk[9]);
285 0 0         if (num_rounds == 10) {
286 0           x = _mm_aesenclast_si128(x, sk[10]);
287 0 0         } else if (num_rounds == 12) {
288 0           x = _mm_aesenc_si128(x, sk[10]);
289 0           x = _mm_aesenc_si128(x, sk[11]);
290 0           x = _mm_aesenclast_si128(x, sk[12]);
291             } else {
292 0           x = _mm_aesenc_si128(x, sk[10]);
293 0           x = _mm_aesenc_si128(x, sk[11]);
294 0           x = _mm_aesenc_si128(x, sk[12]);
295 0           x = _mm_aesenc_si128(x, sk[13]);
296 0           x = _mm_aesenclast_si128(x, sk[14]);
297             }
298 0           ivx = x;
299 0           buf += 16;
300 0           len -= 16;
301             }
               /* Store the updated running value back to the caller. */
302             _mm_storeu_si128(cbcmac, ivx);
303 0           }
304              
305             /* see bearssl_block.h */
               /*
                * Combined CTR encryption and CBC-MAC: each 16-byte block of
                * data is XORed in place with the encrypted counter, and the
                * resulting output block is folded into the CBC-MAC value.
                *
                *   ctx     key context (subkeys and round count are read from it)
                *   ctr     16-byte counter, read and updated; handled as a
                *           big-endian 128-bit integer
                *   cbcmac  16-byte running MAC value, read and updated
                *   data    buffer transformed in place
                *   len     number of bytes to process
                *
                * The MAC input (the output block) is only known once the CTR
                * encryption of the same pass is done, so the MAC encryption
                * runs one block late, in parallel with the next counter
                * encryption; an extra encryption after the last block
                * completes it.
                *
                * NOTE(review): the loop removes 16 from len per pass and the
                * final MAC encryption runs only when len reaches exactly 0 --
                * callers presumably guarantee a multiple of 16.
                */
306             BR_TARGET("sse2,sse4.1,aes")
307             void
308 0           br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
309             void *ctr, void *cbcmac, void *data, size_t len)
310             {
311             unsigned char *buf;
312             unsigned num_rounds;
313             __m128i sk[15];
314             __m128i ivx, cmx;
315             __m128i erev, zero, one;
316             unsigned u;
317             int first_iter;
318              
319 0           num_rounds = ctx->num_rounds;
               /* Copy the num_rounds + 1 subkeys (16 bytes each) into locals. */
320 0 0         for (u = 0; u <= num_rounds; u ++) {
321 0           sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
322             }
323              
324             /*
325             * Some SSE2 constants.
               *
               * erev  shuffle mask that reverses the 16 bytes of a vector
               * one   the value 1 in the low 64-bit lane, 0 in the high lane
326             */
327 0           erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
328             8, 9, 10, 11, 12, 13, 14, 15);
329 0           zero = _mm_setzero_si128();
330 0           one = _mm_set_epi64x(0, 1);
331              
332             /*
333             * Decode the counter in big-endian.
334             */
335 0           ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
336 0           cmx = _mm_loadu_si128(cbcmac);
337              
338 0           buf = data;
339 0           first_iter = 1;
340 0 0         while (len > 0) {
341             __m128i dx, x0, x1;
342              
343             /*
344             * Load initial values:
345             * dx block of data to encrypt (input, not yet encrypted)
346             * x0 counter (for CTR encryption)
347             * x1 input for CBC-MAC
348             */
349 0           dx = _mm_loadu_si128((void *)buf);
350 0           x0 = _mm_shuffle_epi8(ivx, erev);
351 0           x1 = cmx;
352              
               /*
                * Two interleaved AES encryptions (x0: counter, x1: pending
                * MAC input): initial subkey XOR, rounds 1 to 9, then the
                * tail selected by num_rounds.
                */
353 0           x0 = _mm_xor_si128(x0, sk[0]);
354 0           x1 = _mm_xor_si128(x1, sk[0]);
355 0           x0 = _mm_aesenc_si128(x0, sk[1]);
356 0           x1 = _mm_aesenc_si128(x1, sk[1]);
357 0           x0 = _mm_aesenc_si128(x0, sk[2]);
358 0           x1 = _mm_aesenc_si128(x1, sk[2]);
359 0           x0 = _mm_aesenc_si128(x0, sk[3]);
360 0           x1 = _mm_aesenc_si128(x1, sk[3]);
361 0           x0 = _mm_aesenc_si128(x0, sk[4]);
362 0           x1 = _mm_aesenc_si128(x1, sk[4]);
363 0           x0 = _mm_aesenc_si128(x0, sk[5]);
364 0           x1 = _mm_aesenc_si128(x1, sk[5]);
365 0           x0 = _mm_aesenc_si128(x0, sk[6]);
366 0           x1 = _mm_aesenc_si128(x1, sk[6]);
367 0           x0 = _mm_aesenc_si128(x0, sk[7]);
368 0           x1 = _mm_aesenc_si128(x1, sk[7]);
369 0           x0 = _mm_aesenc_si128(x0, sk[8]);
370 0           x1 = _mm_aesenc_si128(x1, sk[8]);
371 0           x0 = _mm_aesenc_si128(x0, sk[9]);
372 0           x1 = _mm_aesenc_si128(x1, sk[9]);
373 0 0         if (num_rounds == 10) {
374 0           x0 = _mm_aesenclast_si128(x0, sk[10]);
375 0           x1 = _mm_aesenclast_si128(x1, sk[10]);
376 0 0         } else if (num_rounds == 12) {
377 0           x0 = _mm_aesenc_si128(x0, sk[10]);
378 0           x1 = _mm_aesenc_si128(x1, sk[10]);
379 0           x0 = _mm_aesenc_si128(x0, sk[11]);
380 0           x1 = _mm_aesenc_si128(x1, sk[11]);
381 0           x0 = _mm_aesenclast_si128(x0, sk[12]);
382 0           x1 = _mm_aesenclast_si128(x1, sk[12]);
383             } else {
384 0           x0 = _mm_aesenc_si128(x0, sk[10]);
385 0           x1 = _mm_aesenc_si128(x1, sk[10]);
386 0           x0 = _mm_aesenc_si128(x0, sk[11]);
387 0           x1 = _mm_aesenc_si128(x1, sk[11]);
388 0           x0 = _mm_aesenc_si128(x0, sk[12]);
389 0           x1 = _mm_aesenc_si128(x1, sk[12]);
390 0           x0 = _mm_aesenc_si128(x0, sk[13]);
391 0           x1 = _mm_aesenc_si128(x1, sk[13]);
392 0           x0 = _mm_aesenclast_si128(x0, sk[14]);
393 0           x1 = _mm_aesenclast_si128(x1, sk[14]);
394             }
395              
               /*
                * x0 becomes the output block. On the first pass the
                * incoming cmx is used as is (its encryption x1 is
                * discarded) and XORed with the output block; on later
                * passes x1, the encryption of the previous cmx, is
                * XORed with the output block instead.
                */
396 0           x0 = _mm_xor_si128(x0, dx);
397 0 0         if (first_iter) {
398 0           cmx = _mm_xor_si128(cmx, x0);
399 0           first_iter = 0;
400             } else {
401 0           cmx = _mm_xor_si128(x1, x0);
402             }
403             _mm_storeu_si128((void *)buf, x0);
404              
405 0           buf += 16;
406 0           len -= 16;
407              
408             /*
409             * Increment the counter value.
               *
               * If the low 64-bit lane wrapped to zero, the comparison
               * yields all-ones there; shifted into the high lane and
               * subtracted, it adds the carry.
410             */
411 0           ivx = _mm_add_epi64(ivx, one);
412 0           ivx = _mm_sub_epi64(ivx,
413 0           _mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
414              
415             /*
416             * If this was the last iteration, then compute the
417             * extra block encryption to complete CBC-MAC.
418             */
419 0 0         if (len == 0) {
420 0           cmx = _mm_xor_si128(cmx, sk[0]);
421 0           cmx = _mm_aesenc_si128(cmx, sk[1]);
422 0           cmx = _mm_aesenc_si128(cmx, sk[2]);
423 0           cmx = _mm_aesenc_si128(cmx, sk[3]);
424 0           cmx = _mm_aesenc_si128(cmx, sk[4]);
425 0           cmx = _mm_aesenc_si128(cmx, sk[5]);
426 0           cmx = _mm_aesenc_si128(cmx, sk[6]);
427 0           cmx = _mm_aesenc_si128(cmx, sk[7]);
428 0           cmx = _mm_aesenc_si128(cmx, sk[8]);
429 0           cmx = _mm_aesenc_si128(cmx, sk[9]);
430 0 0         if (num_rounds == 10) {
431 0           cmx = _mm_aesenclast_si128(cmx, sk[10]);
432 0 0         } else if (num_rounds == 12) {
433 0           cmx = _mm_aesenc_si128(cmx, sk[10]);
434 0           cmx = _mm_aesenc_si128(cmx, sk[11]);
435 0           cmx = _mm_aesenclast_si128(cmx, sk[12]);
436             } else {
437 0           cmx = _mm_aesenc_si128(cmx, sk[10]);
438 0           cmx = _mm_aesenc_si128(cmx, sk[11]);
439 0           cmx = _mm_aesenc_si128(cmx, sk[12]);
440 0           cmx = _mm_aesenc_si128(cmx, sk[13]);
441 0           cmx = _mm_aesenclast_si128(cmx, sk[14]);
442             }
443 0           break;
444             }
445             }
446              
447             /*
448             * Write back new counter value and CBC-MAC value.
449             */
450 0           _mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
451             _mm_storeu_si128(cbcmac, cmx);
452 0           }
453              
454             /* see bearssl_block.h */
               /*
                * Combined CBC-MAC and CTR decryption: each 16-byte input block
                * is first folded into the CBC-MAC value (new value = AES
                * encryption of (value XOR block)), then XORed in place with
                * the encrypted counter. Both encryptions of a pass are
                * independent and run interleaved.
                *
                *   ctx     key context (subkeys and round count are read from it)
                *   ctr     16-byte counter, read and updated; handled as a
                *           big-endian 128-bit integer
                *   cbcmac  16-byte running MAC value, read and updated
                *   data    buffer transformed in place
                *   len     number of bytes to process
                *
                * NOTE(review): the loop removes 16 from len per pass and stops
                * only when len reaches 0 -- callers presumably guarantee a
                * multiple of 16.
                */
455             BR_TARGET("sse2,sse4.1,aes")
456             void
457 0           br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
458             void *ctr, void *cbcmac, void *data, size_t len)
459             {
460             unsigned char *buf;
461             unsigned num_rounds;
462             __m128i sk[15];
463             __m128i ivx, cmx;
464             __m128i erev, zero, one;
465             unsigned u;
466              
467 0           num_rounds = ctx->num_rounds;
               /* Copy the num_rounds + 1 subkeys (16 bytes each) into locals. */
468 0 0         for (u = 0; u <= num_rounds; u ++) {
469 0           sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
470             }
471              
472             /*
473             * Some SSE2 constants.
               *
               * erev  shuffle mask that reverses the 16 bytes of a vector
               * one   the value 1 in the low 64-bit lane, 0 in the high lane
474             */
475 0           erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
476             8, 9, 10, 11, 12, 13, 14, 15);
477 0           zero = _mm_setzero_si128();
478 0           one = _mm_set_epi64x(0, 1);
479              
480             /*
481             * Decode the counter in big-endian.
482             */
483 0           ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
484 0           cmx = _mm_loadu_si128(cbcmac);
485              
486 0           buf = data;
487 0 0         while (len > 0) {
488             __m128i dx, x0, x1;
489              
490             /*
491             * Load initial values:
492             * dx encrypted block of data
493             * x0 counter (for CTR encryption)
494             * x1 input for CBC-MAC
495             */
496 0           dx = _mm_loadu_si128((void *)buf);
497 0           x0 = _mm_shuffle_epi8(ivx, erev);
498 0           x1 = _mm_xor_si128(cmx, dx);
499              
               /*
                * Two interleaved AES encryptions (x0: counter, x1: MAC
                * input): initial subkey XOR, rounds 1 to 9, then the tail
                * selected by num_rounds.
                */
500 0           x0 = _mm_xor_si128(x0, sk[0]);
501 0           x1 = _mm_xor_si128(x1, sk[0]);
502 0           x0 = _mm_aesenc_si128(x0, sk[1]);
503 0           x1 = _mm_aesenc_si128(x1, sk[1]);
504 0           x0 = _mm_aesenc_si128(x0, sk[2]);
505 0           x1 = _mm_aesenc_si128(x1, sk[2]);
506 0           x0 = _mm_aesenc_si128(x0, sk[3]);
507 0           x1 = _mm_aesenc_si128(x1, sk[3]);
508 0           x0 = _mm_aesenc_si128(x0, sk[4]);
509 0           x1 = _mm_aesenc_si128(x1, sk[4]);
510 0           x0 = _mm_aesenc_si128(x0, sk[5]);
511 0           x1 = _mm_aesenc_si128(x1, sk[5]);
512 0           x0 = _mm_aesenc_si128(x0, sk[6]);
513 0           x1 = _mm_aesenc_si128(x1, sk[6]);
514 0           x0 = _mm_aesenc_si128(x0, sk[7]);
515 0           x1 = _mm_aesenc_si128(x1, sk[7]);
516 0           x0 = _mm_aesenc_si128(x0, sk[8]);
517 0           x1 = _mm_aesenc_si128(x1, sk[8]);
518 0           x0 = _mm_aesenc_si128(x0, sk[9]);
519 0           x1 = _mm_aesenc_si128(x1, sk[9]);
520 0 0         if (num_rounds == 10) {
521 0           x0 = _mm_aesenclast_si128(x0, sk[10]);
522 0           x1 = _mm_aesenclast_si128(x1, sk[10]);
523 0 0         } else if (num_rounds == 12) {
524 0           x0 = _mm_aesenc_si128(x0, sk[10]);
525 0           x1 = _mm_aesenc_si128(x1, sk[10]);
526 0           x0 = _mm_aesenc_si128(x0, sk[11]);
527 0           x1 = _mm_aesenc_si128(x1, sk[11]);
528 0           x0 = _mm_aesenclast_si128(x0, sk[12]);
529 0           x1 = _mm_aesenclast_si128(x1, sk[12]);
530             } else {
531 0           x0 = _mm_aesenc_si128(x0, sk[10]);
532 0           x1 = _mm_aesenc_si128(x1, sk[10]);
533 0           x0 = _mm_aesenc_si128(x0, sk[11]);
534 0           x1 = _mm_aesenc_si128(x1, sk[11]);
535 0           x0 = _mm_aesenc_si128(x0, sk[12]);
536 0           x1 = _mm_aesenc_si128(x1, sk[12]);
537 0           x0 = _mm_aesenc_si128(x0, sk[13]);
538 0           x1 = _mm_aesenc_si128(x1, sk[13]);
539 0           x0 = _mm_aesenclast_si128(x0, sk[14]);
540 0           x1 = _mm_aesenclast_si128(x1, sk[14]);
541             }
               /*
                * x0 becomes the output block written back to the buffer;
                * x1 is the new CBC-MAC value.
                */
542 0           x0 = _mm_xor_si128(x0, dx);
543 0           cmx = x1;
544             _mm_storeu_si128((void *)buf, x0);
545              
546 0           buf += 16;
547 0           len -= 16;
548              
549             /*
550             * Increment the counter value.
               *
               * If the low 64-bit lane wrapped to zero, the comparison
               * yields all-ones there; shifted into the high lane and
               * subtracted, it adds the carry.
551             */
552 0           ivx = _mm_add_epi64(ivx, one);
553 0           ivx = _mm_sub_epi64(ivx,
554 0           _mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
555             }
556              
557             /*
558             * Write back new counter value and CBC-MAC value.
559             */
560 0           _mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
561             _mm_storeu_si128(cbcmac, cmx);
562 0           }
563              
564             BR_TARGETS_X86_DOWN
565              
566             /* see bearssl_block.h */
               /*
                * Dispatch table for this implementation, filled positionally:
                * the context size, the constants 16 and 4, then the init,
                * encrypt, decrypt, ctr and mac entry points.
                *
                * NOTE(review): 16 and 4 are presumably the block size in bytes
                * and its base-2 logarithm -- confirm against the declaration
                * of br_block_ctrcbc_class.
                *
                * Each function is cast to a pointer type whose first parameter
                * is a pointer to the generic class pointer, whereas the
                * definitions in this file take a br_aes_x86ni_ctrcbc_keys
                * pointer.
                */
567             const br_block_ctrcbc_class br_aes_x86ni_ctrcbc_vtable = {
568             sizeof(br_aes_x86ni_ctrcbc_keys),
569             16,
570             4,
571             (void (*)(const br_block_ctrcbc_class **, const void *, size_t))
572             &br_aes_x86ni_ctrcbc_init,
573             (void (*)(const br_block_ctrcbc_class *const *,
574             void *, void *, void *, size_t))
575             &br_aes_x86ni_ctrcbc_encrypt,
576             (void (*)(const br_block_ctrcbc_class *const *,
577             void *, void *, void *, size_t))
578             &br_aes_x86ni_ctrcbc_decrypt,
579             (void (*)(const br_block_ctrcbc_class *const *,
580             void *, void *, size_t))
581             &br_aes_x86ni_ctrcbc_ctr,
582             (void (*)(const br_block_ctrcbc_class *const *,
583             void *, const void *, size_t))
584             &br_aes_x86ni_ctrcbc_mac
585             };
586              
587             #else
588              
589             /* see bearssl_block.h */
               /*
                * Variant compiled when BR_AES_X86NI is false: this
                * implementation is not built, so no vtable is available and
                * the function always returns NULL.
                */
590             const br_block_ctrcbc_class *
591             br_aes_x86ni_ctrcbc_get_vtable(void)
592             {
593             return NULL;
594             }
595              
596             #endif