File Coverage

src/symcipher/aes_x86ni_ctr.c
Criterion Covered Total %
statement 118 118 100.0
branch 13 14 92.8
condition n/a
subroutine n/a
pod n/a
total 131 132 99.2


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2017 Thomas Pornin
3             *
4             * Permission is hereby granted, free of charge, to any person obtaining
5             * a copy of this software and associated documentation files (the
6             * "Software"), to deal in the Software without restriction, including
7             * without limitation the rights to use, copy, modify, merge, publish,
8             * distribute, sublicense, and/or sell copies of the Software, and to
9             * permit persons to whom the Software is furnished to do so, subject to
10             * the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be
13             * included in all copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16             * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17             * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18             * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19             * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20             * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21             * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22             * SOFTWARE.
23             */
24              
25             #define BR_ENABLE_INTRINSICS 1
26             #include "inner.h"
27              
28             #if BR_AES_X86NI
29              
30             /* see bearssl_block.h */
31             const br_block_ctr_class *
32 52           br_aes_x86ni_ctr_get_vtable(void)
33             {
34 52 50         return br_aes_x86ni_supported() ? &br_aes_x86ni_ctr_vtable : NULL;
35             }
36              
37             /* see bearssl_block.h */
38             void
39 26           br_aes_x86ni_ctr_init(br_aes_x86ni_ctr_keys *ctx,
40             const void *key, size_t len)
41             {
42 26           ctx->vtable = &br_aes_x86ni_ctr_vtable;
43 26           ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
44 26           }
45              
46             BR_TARGETS_X86_UP
47              
48             /* see bearssl_block.h */
49             BR_TARGET("sse2,sse4.1,aes")
50             uint32_t
51 80           br_aes_x86ni_ctr_run(const br_aes_x86ni_ctr_keys *ctx,
52             const void *iv, uint32_t cc, void *data, size_t len)
53             {
54             unsigned char *buf;
55             unsigned char ivbuf[16];
56             unsigned num_rounds;
57             __m128i sk[15];
58             __m128i ivx;
59             unsigned u;
60              
61 80           buf = data;
62 80           memcpy(ivbuf, iv, 12);
63 80           num_rounds = ctx->num_rounds;
64 1100 100         for (u = 0; u <= num_rounds; u ++) {
65 2040           sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
66             }
67 80           ivx = _mm_loadu_si128((void *)ivbuf);
68 194 100         while (len > 0) {
69             __m128i x0, x1, x2, x3;
70              
71 174           x0 = _mm_insert_epi32(ivx, br_bswap32(cc + 0), 3);
72 174           x1 = _mm_insert_epi32(ivx, br_bswap32(cc + 1), 3);
73 174           x2 = _mm_insert_epi32(ivx, br_bswap32(cc + 2), 3);
74 174           x3 = _mm_insert_epi32(ivx, br_bswap32(cc + 3), 3);
75 174           x0 = _mm_xor_si128(x0, sk[0]);
76 174           x1 = _mm_xor_si128(x1, sk[0]);
77 174           x2 = _mm_xor_si128(x2, sk[0]);
78 174           x3 = _mm_xor_si128(x3, sk[0]);
79 174           x0 = _mm_aesenc_si128(x0, sk[1]);
80 174           x1 = _mm_aesenc_si128(x1, sk[1]);
81 174           x2 = _mm_aesenc_si128(x2, sk[1]);
82 174           x3 = _mm_aesenc_si128(x3, sk[1]);
83 174           x0 = _mm_aesenc_si128(x0, sk[2]);
84 174           x1 = _mm_aesenc_si128(x1, sk[2]);
85 174           x2 = _mm_aesenc_si128(x2, sk[2]);
86 174           x3 = _mm_aesenc_si128(x3, sk[2]);
87 174           x0 = _mm_aesenc_si128(x0, sk[3]);
88 174           x1 = _mm_aesenc_si128(x1, sk[3]);
89 174           x2 = _mm_aesenc_si128(x2, sk[3]);
90 174           x3 = _mm_aesenc_si128(x3, sk[3]);
91 174           x0 = _mm_aesenc_si128(x0, sk[4]);
92 174           x1 = _mm_aesenc_si128(x1, sk[4]);
93 174           x2 = _mm_aesenc_si128(x2, sk[4]);
94 174           x3 = _mm_aesenc_si128(x3, sk[4]);
95 174           x0 = _mm_aesenc_si128(x0, sk[5]);
96 174           x1 = _mm_aesenc_si128(x1, sk[5]);
97 174           x2 = _mm_aesenc_si128(x2, sk[5]);
98 174           x3 = _mm_aesenc_si128(x3, sk[5]);
99 174           x0 = _mm_aesenc_si128(x0, sk[6]);
100 174           x1 = _mm_aesenc_si128(x1, sk[6]);
101 174           x2 = _mm_aesenc_si128(x2, sk[6]);
102 174           x3 = _mm_aesenc_si128(x3, sk[6]);
103 174           x0 = _mm_aesenc_si128(x0, sk[7]);
104 174           x1 = _mm_aesenc_si128(x1, sk[7]);
105 174           x2 = _mm_aesenc_si128(x2, sk[7]);
106 174           x3 = _mm_aesenc_si128(x3, sk[7]);
107 174           x0 = _mm_aesenc_si128(x0, sk[8]);
108 174           x1 = _mm_aesenc_si128(x1, sk[8]);
109 174           x2 = _mm_aesenc_si128(x2, sk[8]);
110 174           x3 = _mm_aesenc_si128(x3, sk[8]);
111 174           x0 = _mm_aesenc_si128(x0, sk[9]);
112 174           x1 = _mm_aesenc_si128(x1, sk[9]);
113 174           x2 = _mm_aesenc_si128(x2, sk[9]);
114 174           x3 = _mm_aesenc_si128(x3, sk[9]);
115 174 100         if (num_rounds == 10) {
116 38           x0 = _mm_aesenclast_si128(x0, sk[10]);
117 38           x1 = _mm_aesenclast_si128(x1, sk[10]);
118 38           x2 = _mm_aesenclast_si128(x2, sk[10]);
119 76           x3 = _mm_aesenclast_si128(x3, sk[10]);
120 136 100         } else if (num_rounds == 12) {
121 8           x0 = _mm_aesenc_si128(x0, sk[10]);
122 8           x1 = _mm_aesenc_si128(x1, sk[10]);
123 8           x2 = _mm_aesenc_si128(x2, sk[10]);
124 8           x3 = _mm_aesenc_si128(x3, sk[10]);
125 8           x0 = _mm_aesenc_si128(x0, sk[11]);
126 8           x1 = _mm_aesenc_si128(x1, sk[11]);
127 8           x2 = _mm_aesenc_si128(x2, sk[11]);
128 8           x3 = _mm_aesenc_si128(x3, sk[11]);
129 8           x0 = _mm_aesenclast_si128(x0, sk[12]);
130 8           x1 = _mm_aesenclast_si128(x1, sk[12]);
131 8           x2 = _mm_aesenclast_si128(x2, sk[12]);
132 16           x3 = _mm_aesenclast_si128(x3, sk[12]);
133             } else {
134 128           x0 = _mm_aesenc_si128(x0, sk[10]);
135 128           x1 = _mm_aesenc_si128(x1, sk[10]);
136 128           x2 = _mm_aesenc_si128(x2, sk[10]);
137 128           x3 = _mm_aesenc_si128(x3, sk[10]);
138 128           x0 = _mm_aesenc_si128(x0, sk[11]);
139 128           x1 = _mm_aesenc_si128(x1, sk[11]);
140 128           x2 = _mm_aesenc_si128(x2, sk[11]);
141 128           x3 = _mm_aesenc_si128(x3, sk[11]);
142 128           x0 = _mm_aesenc_si128(x0, sk[12]);
143 128           x1 = _mm_aesenc_si128(x1, sk[12]);
144 128           x2 = _mm_aesenc_si128(x2, sk[12]);
145 128           x3 = _mm_aesenc_si128(x3, sk[12]);
146 128           x0 = _mm_aesenc_si128(x0, sk[13]);
147 128           x1 = _mm_aesenc_si128(x1, sk[13]);
148 128           x2 = _mm_aesenc_si128(x2, sk[13]);
149 128           x3 = _mm_aesenc_si128(x3, sk[13]);
150 128           x0 = _mm_aesenclast_si128(x0, sk[14]);
151 128           x1 = _mm_aesenclast_si128(x1, sk[14]);
152 128           x2 = _mm_aesenclast_si128(x2, sk[14]);
153 256           x3 = _mm_aesenclast_si128(x3, sk[14]);
154             }
155 174 100         if (len >= 64) {
156 114           x0 = _mm_xor_si128(x0,
157             _mm_loadu_si128((void *)(buf + 0)));
158 114           x1 = _mm_xor_si128(x1,
159 114           _mm_loadu_si128((void *)(buf + 16)));
160 114           x2 = _mm_xor_si128(x2,
161 114           _mm_loadu_si128((void *)(buf + 32)));
162 228           x3 = _mm_xor_si128(x3,
163 114           _mm_loadu_si128((void *)(buf + 48)));
164             _mm_storeu_si128((void *)(buf + 0), x0);
165 114           _mm_storeu_si128((void *)(buf + 16), x1);
166 114           _mm_storeu_si128((void *)(buf + 32), x2);
167 114           _mm_storeu_si128((void *)(buf + 48), x3);
168 114           buf += 64;
169 114           len -= 64;
170 114           cc += 4;
171             } else {
172             unsigned char tmp[64];
173              
174             _mm_storeu_si128((void *)(tmp + 0), x0);
175 60           _mm_storeu_si128((void *)(tmp + 16), x1);
176 60           _mm_storeu_si128((void *)(tmp + 32), x2);
177 60           _mm_storeu_si128((void *)(tmp + 48), x3);
178 1340 100         for (u = 0; u < len; u ++) {
179 1280           buf[u] ^= tmp[u];
180             }
181 60           cc += (uint32_t)len >> 4;
182 60           break;
183             }
184             }
185 80           return cc;
186             }
187              
188             BR_TARGETS_X86_DOWN
189              
190             /* see bearssl_block.h */
191             const br_block_ctr_class br_aes_x86ni_ctr_vtable = {
192             sizeof(br_aes_x86ni_ctr_keys),
193             16,
194             4,
195             (void (*)(const br_block_ctr_class **, const void *, size_t))
196             &br_aes_x86ni_ctr_init,
197             (uint32_t (*)(const br_block_ctr_class *const *,
198             const void *, uint32_t, void *, size_t))
199             &br_aes_x86ni_ctr_run
200             };
201              
202             #else
203              
204             /* see bearssl_block.h */
205             const br_block_ctr_class *
206             br_aes_x86ni_ctr_get_vtable(void)
207             {
208             return NULL;
209             }
210              
211             #endif