File Coverage

src/symcipher/aes_x86ni_cbcdec.c
Criterion Covered Total %
statement 106 129 82.1
branch 15 22 68.1
condition n/a
subroutine n/a
pod n/a
total 121 151 80.1


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2017 Thomas Pornin
3             *
4             * Permission is hereby granted, free of charge, to any person obtaining
5             * a copy of this software and associated documentation files (the
6             * "Software"), to deal in the Software without restriction, including
7             * without limitation the rights to use, copy, modify, merge, publish,
8             * distribute, sublicense, and/or sell copies of the Software, and to
9             * permit persons to whom the Software is furnished to do so, subject to
10             * the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be
13             * included in all copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16             * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17             * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18             * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19             * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20             * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21             * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22             * SOFTWARE.
23             */
24              
25             #define BR_ENABLE_INTRINSICS 1
26             #include "inner.h"
27              
28             #if BR_AES_X86NI
29              
30             /* see bearssl_block.h */
31             const br_block_cbcdec_class *
32 52           br_aes_x86ni_cbcdec_get_vtable(void)
33             {
34 52 50         return br_aes_x86ni_supported() ? &br_aes_x86ni_cbcdec_vtable : NULL;
35             }
36              
37             /* see bearssl_block.h */
38             void
39 11           br_aes_x86ni_cbcdec_init(br_aes_x86ni_cbcdec_keys *ctx,
40             const void *key, size_t len)
41             {
42 11           ctx->vtable = &br_aes_x86ni_cbcdec_vtable;
43 11           ctx->num_rounds = br_aes_x86ni_keysched_dec(ctx->skey.skni, key, len);
44 11           }
45              
46             BR_TARGETS_X86_UP
47              
48             /* see bearssl_block.h */
49             BR_TARGET("sse2,aes")
50             void
51 11           br_aes_x86ni_cbcdec_run(const br_aes_x86ni_cbcdec_keys *ctx,
52             void *iv, void *data, size_t len)
53             {
54             unsigned char *buf;
55             unsigned num_rounds;
56             __m128i sk[15], ivx;
57             unsigned u;
58              
59 11           buf = data;
60 11           ivx = _mm_loadu_si128(iv);
61 11           num_rounds = ctx->num_rounds;
62 176 100         for (u = 0; u <= num_rounds; u ++) {
63 330           sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
64             }
65 41 100         while (len > 0) {
66             __m128i x0, x1, x2, x3, e0, e1, e2, e3;
67              
68 36           x0 = _mm_loadu_si128((void *)(buf + 0));
69 36 100         if (len >= 64) {
70 30           x1 = _mm_loadu_si128((void *)(buf + 16));
71 30           x2 = _mm_loadu_si128((void *)(buf + 32));
72 60           x3 = _mm_loadu_si128((void *)(buf + 48));
73             } else {
74 6           x0 = _mm_loadu_si128((void *)(buf + 0));
75 6 50         if (len >= 32) {
76 6           x1 = _mm_loadu_si128((void *)(buf + 16));
77 6 50         if (len >= 48) {
78 0           x2 = _mm_loadu_si128(
79 0           (void *)(buf + 32));
80 0           x3 = x2;
81             } else {
82 6           x2 = x0;
83 6           x3 = x1;
84             }
85             } else {
86 0           x1 = x0;
87 0           x2 = x0;
88 0           x3 = x0;
89             }
90             }
91 36           e0 = x0;
92 36           e1 = x1;
93 36           e2 = x2;
94 36           e3 = x3;
95 36           x0 = _mm_xor_si128(x0, sk[0]);
96 36           x1 = _mm_xor_si128(x1, sk[0]);
97 36           x2 = _mm_xor_si128(x2, sk[0]);
98 36           x3 = _mm_xor_si128(x3, sk[0]);
99 36           x0 = _mm_aesdec_si128(x0, sk[1]);
100 36           x1 = _mm_aesdec_si128(x1, sk[1]);
101 36           x2 = _mm_aesdec_si128(x2, sk[1]);
102 36           x3 = _mm_aesdec_si128(x3, sk[1]);
103 36           x0 = _mm_aesdec_si128(x0, sk[2]);
104 36           x1 = _mm_aesdec_si128(x1, sk[2]);
105 36           x2 = _mm_aesdec_si128(x2, sk[2]);
106 36           x3 = _mm_aesdec_si128(x3, sk[2]);
107 36           x0 = _mm_aesdec_si128(x0, sk[3]);
108 36           x1 = _mm_aesdec_si128(x1, sk[3]);
109 36           x2 = _mm_aesdec_si128(x2, sk[3]);
110 36           x3 = _mm_aesdec_si128(x3, sk[3]);
111 36           x0 = _mm_aesdec_si128(x0, sk[4]);
112 36           x1 = _mm_aesdec_si128(x1, sk[4]);
113 36           x2 = _mm_aesdec_si128(x2, sk[4]);
114 36           x3 = _mm_aesdec_si128(x3, sk[4]);
115 36           x0 = _mm_aesdec_si128(x0, sk[5]);
116 36           x1 = _mm_aesdec_si128(x1, sk[5]);
117 36           x2 = _mm_aesdec_si128(x2, sk[5]);
118 36           x3 = _mm_aesdec_si128(x3, sk[5]);
119 36           x0 = _mm_aesdec_si128(x0, sk[6]);
120 36           x1 = _mm_aesdec_si128(x1, sk[6]);
121 36           x2 = _mm_aesdec_si128(x2, sk[6]);
122 36           x3 = _mm_aesdec_si128(x3, sk[6]);
123 36           x0 = _mm_aesdec_si128(x0, sk[7]);
124 36           x1 = _mm_aesdec_si128(x1, sk[7]);
125 36           x2 = _mm_aesdec_si128(x2, sk[7]);
126 36           x3 = _mm_aesdec_si128(x3, sk[7]);
127 36           x0 = _mm_aesdec_si128(x0, sk[8]);
128 36           x1 = _mm_aesdec_si128(x1, sk[8]);
129 36           x2 = _mm_aesdec_si128(x2, sk[8]);
130 36           x3 = _mm_aesdec_si128(x3, sk[8]);
131 36           x0 = _mm_aesdec_si128(x0, sk[9]);
132 36           x1 = _mm_aesdec_si128(x1, sk[9]);
133 36           x2 = _mm_aesdec_si128(x2, sk[9]);
134 36           x3 = _mm_aesdec_si128(x3, sk[9]);
135 36 50         if (num_rounds == 10) {
136 0           x0 = _mm_aesdeclast_si128(x0, sk[10]);
137 0           x1 = _mm_aesdeclast_si128(x1, sk[10]);
138 0           x2 = _mm_aesdeclast_si128(x2, sk[10]);
139 0           x3 = _mm_aesdeclast_si128(x3, sk[10]);
140 36 50         } else if (num_rounds == 12) {
141 0           x0 = _mm_aesdec_si128(x0, sk[10]);
142 0           x1 = _mm_aesdec_si128(x1, sk[10]);
143 0           x2 = _mm_aesdec_si128(x2, sk[10]);
144 0           x3 = _mm_aesdec_si128(x3, sk[10]);
145 0           x0 = _mm_aesdec_si128(x0, sk[11]);
146 0           x1 = _mm_aesdec_si128(x1, sk[11]);
147 0           x2 = _mm_aesdec_si128(x2, sk[11]);
148 0           x3 = _mm_aesdec_si128(x3, sk[11]);
149 0           x0 = _mm_aesdeclast_si128(x0, sk[12]);
150 0           x1 = _mm_aesdeclast_si128(x1, sk[12]);
151 0           x2 = _mm_aesdeclast_si128(x2, sk[12]);
152 0           x3 = _mm_aesdeclast_si128(x3, sk[12]);
153             } else {
154 36           x0 = _mm_aesdec_si128(x0, sk[10]);
155 36           x1 = _mm_aesdec_si128(x1, sk[10]);
156 36           x2 = _mm_aesdec_si128(x2, sk[10]);
157 36           x3 = _mm_aesdec_si128(x3, sk[10]);
158 36           x0 = _mm_aesdec_si128(x0, sk[11]);
159 36           x1 = _mm_aesdec_si128(x1, sk[11]);
160 36           x2 = _mm_aesdec_si128(x2, sk[11]);
161 36           x3 = _mm_aesdec_si128(x3, sk[11]);
162 36           x0 = _mm_aesdec_si128(x0, sk[12]);
163 36           x1 = _mm_aesdec_si128(x1, sk[12]);
164 36           x2 = _mm_aesdec_si128(x2, sk[12]);
165 36           x3 = _mm_aesdec_si128(x3, sk[12]);
166 36           x0 = _mm_aesdec_si128(x0, sk[13]);
167 36           x1 = _mm_aesdec_si128(x1, sk[13]);
168 36           x2 = _mm_aesdec_si128(x2, sk[13]);
169 36           x3 = _mm_aesdec_si128(x3, sk[13]);
170 36           x0 = _mm_aesdeclast_si128(x0, sk[14]);
171 36           x1 = _mm_aesdeclast_si128(x1, sk[14]);
172 36           x2 = _mm_aesdeclast_si128(x2, sk[14]);
173 72           x3 = _mm_aesdeclast_si128(x3, sk[14]);
174             }
175 36           x0 = _mm_xor_si128(x0, ivx);
176 36           x1 = _mm_xor_si128(x1, e0);
177 36           x2 = _mm_xor_si128(x2, e1);
178 36           x3 = _mm_xor_si128(x3, e2);
179 36           ivx = e3;
180             _mm_storeu_si128((void *)(buf + 0), x0);
181 36 100         if (len >= 64) {
182 30           _mm_storeu_si128((void *)(buf + 16), x1);
183 30           _mm_storeu_si128((void *)(buf + 32), x2);
184 30           _mm_storeu_si128((void *)(buf + 48), x3);
185 30           buf += 64;
186 30           len -= 64;
187             } else {
188 6 50         if (len >= 32) {
189 6           _mm_storeu_si128((void *)(buf + 16), x1);
190 6 50         if (len >= 48) {
191             _mm_storeu_si128(
192 0           (void *)(buf + 32), x2);
193             }
194             }
195 6           break;
196             }
197             }
198             _mm_storeu_si128(iv, ivx);
199 11           }
200              
201             BR_TARGETS_X86_DOWN
202              
203             /* see bearssl_block.h */
204             const br_block_cbcdec_class br_aes_x86ni_cbcdec_vtable = {
205             sizeof(br_aes_x86ni_cbcdec_keys),
206             16,
207             4,
208             (void (*)(const br_block_cbcdec_class **, const void *, size_t))
209             &br_aes_x86ni_cbcdec_init,
210             (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
211             &br_aes_x86ni_cbcdec_run
212             };
213              
214             #else
215              
216             /* see bearssl_block.h */
217             const br_block_cbcdec_class *
218             br_aes_x86ni_cbcdec_get_vtable(void)
219             {
220             return NULL;
221             }
222              
223             #endif