File Coverage

src/symcipher/des_ct.c
Criterion Covered Total %
statement 0 175 0.0
branch 0 10 0.0
condition n/a
subroutine n/a
pod n/a
total 0 185 0.0


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2016 Thomas Pornin
3             *
4             * Permission is hereby granted, free of charge, to any person obtaining
5             * a copy of this software and associated documentation files (the
6             * "Software"), to deal in the Software without restriction, including
7             * without limitation the rights to use, copy, modify, merge, publish,
8             * distribute, sublicense, and/or sell copies of the Software, and to
9             * permit persons to whom the Software is furnished to do so, subject to
10             * the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be
13             * included in all copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16             * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17             * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18             * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19             * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20             * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21             * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22             * SOFTWARE.
23             */
24              
25             #include "inner.h"
26              
27             /*
28             * During key schedule, we need to apply bit extraction PC-2 then permute
29             * things into our bitslice representation. PC-2 extracts 48 bits out
30             * of two 28-bit words (kl and kr), and we store these bits into two
31             * 32-bit words sk0 and sk1.
32             *
33             * -- bit 16+x of sk0 comes from bit QL0[x] of kl
34             * -- bit x of sk0 comes from bit QR0[x] of kr
35             * -- bit 16+x of sk1 comes from bit QL1[x] of kl
36             * -- bit x of sk1 comes from bit QR1[x] of kr
37             */
38              
39             static const unsigned char QL0[] = {
40             17, 4, 27, 23, 13, 22, 7, 18,
41             16, 24, 2, 20, 1, 8, 15, 26
42             };
43              
44             static const unsigned char QR0[] = {
45             25, 19, 9, 1, 5, 11, 23, 8,
46             17, 0, 22, 3, 6, 20, 27, 24
47             };
48              
49             static const unsigned char QL1[] = {
50             28, 28, 14, 11, 28, 28, 25, 0,
51             28, 28, 5, 9, 28, 28, 12, 21
52             };
53              
54             static const unsigned char QR1[] = {
55             28, 28, 15, 4, 28, 28, 26, 16,
56             28, 28, 12, 7, 28, 28, 10, 14
57             };
58              
59             /*
60             * 32-bit rotation. The C compiler is supposed to recognize it as a
61             * rotation and use the local architecture rotation opcode (if available).
62             */
63             static inline uint32_t
64 0           rotl(uint32_t x, int n)
65             {
66 0           return (x << n) | (x >> (32 - n));
67             }
68              
69             /*
70             * Compute key schedule for 8 key bytes (produces 32 subkey words).
71             */
72             static void
73 0           keysched_unit(uint32_t *skey, const void *key)
74             {
75             int i;
76              
77 0           br_des_keysched_unit(skey, key);
78              
79             /*
80             * Apply PC-2 + bitslicing.
81             */
82 0 0         for (i = 0; i < 16; i ++) {
83             uint32_t kl, kr, sk0, sk1;
84             int j;
85              
86 0           kl = skey[(i << 1) + 0];
87 0           kr = skey[(i << 1) + 1];
88 0           sk0 = 0;
89 0           sk1 = 0;
90 0 0         for (j = 0; j < 16; j ++) {
91 0           sk0 <<= 1;
92 0           sk1 <<= 1;
93 0           sk0 |= ((kl >> QL0[j]) & (uint32_t)1) << 16;
94 0           sk0 |= (kr >> QR0[j]) & (uint32_t)1;
95 0           sk1 |= ((kl >> QL1[j]) & (uint32_t)1) << 16;
96 0           sk1 |= (kr >> QR1[j]) & (uint32_t)1;
97             }
98              
99 0           skey[(i << 1) + 0] = sk0;
100 0           skey[(i << 1) + 1] = sk1;
101             }
102              
103             #if 0
104             /*
105             * Speed-optimized version for PC-2 + bitslicing.
106             * (Unused. Kept for reference only.)
107             */
108             sk0 = kl & (uint32_t)0x00100000;
109             sk0 |= (kl & (uint32_t)0x08008000) << 2;
110             sk0 |= (kl & (uint32_t)0x00400000) << 4;
111             sk0 |= (kl & (uint32_t)0x00800000) << 5;
112             sk0 |= (kl & (uint32_t)0x00040000) << 6;
113             sk0 |= (kl & (uint32_t)0x00010000) << 7;
114             sk0 |= (kl & (uint32_t)0x00000100) << 10;
115             sk0 |= (kl & (uint32_t)0x00022000) << 14;
116             sk0 |= (kl & (uint32_t)0x00000082) << 18;
117             sk0 |= (kl & (uint32_t)0x00000004) << 19;
118             sk0 |= (kl & (uint32_t)0x04000000) >> 10;
119             sk0 |= (kl & (uint32_t)0x00000010) << 26;
120             sk0 |= (kl & (uint32_t)0x01000000) >> 2;
121              
122             sk0 |= kr & (uint32_t)0x00000100;
123             sk0 |= (kr & (uint32_t)0x00000008) << 1;
124             sk0 |= (kr & (uint32_t)0x00000200) << 4;
125             sk0 |= rotl(kr & (uint32_t)0x08000021, 6);
126             sk0 |= (kr & (uint32_t)0x01000000) >> 24;
127             sk0 |= (kr & (uint32_t)0x00000002) << 11;
128             sk0 |= (kr & (uint32_t)0x00100000) >> 18;
129             sk0 |= (kr & (uint32_t)0x00400000) >> 17;
130             sk0 |= (kr & (uint32_t)0x00800000) >> 14;
131             sk0 |= (kr & (uint32_t)0x02020000) >> 10;
132             sk0 |= (kr & (uint32_t)0x00080000) >> 5;
133             sk0 |= (kr & (uint32_t)0x00000040) >> 3;
134             sk0 |= (kr & (uint32_t)0x00000800) >> 1;
135              
136             sk1 = kl & (uint32_t)0x02000000;
137             sk1 |= (kl & (uint32_t)0x00001000) << 5;
138             sk1 |= (kl & (uint32_t)0x00000200) << 11;
139             sk1 |= (kl & (uint32_t)0x00004000) << 15;
140             sk1 |= (kl & (uint32_t)0x00000020) << 16;
141             sk1 |= (kl & (uint32_t)0x00000800) << 17;
142             sk1 |= (kl & (uint32_t)0x00000001) << 24;
143             sk1 |= (kl & (uint32_t)0x00200000) >> 5;
144              
145             sk1 |= (kr & (uint32_t)0x00000010) << 8;
146             sk1 |= (kr & (uint32_t)0x04000000) >> 17;
147             sk1 |= (kr & (uint32_t)0x00004000) >> 14;
148             sk1 |= (kr & (uint32_t)0x00000400) >> 9;
149             sk1 |= (kr & (uint32_t)0x00010000) >> 8;
150             sk1 |= (kr & (uint32_t)0x00001000) >> 7;
151             sk1 |= (kr & (uint32_t)0x00000080) >> 3;
152             sk1 |= (kr & (uint32_t)0x00008000) >> 2;
153             #endif
154 0           }
155              
156             /* see inner.h */
157             unsigned
158 0           br_des_ct_keysched(uint32_t *skey, const void *key, size_t key_len)
159             {
160 0           switch (key_len) {
161 0           case 8:
162 0           keysched_unit(skey, key);
163 0           return 1;
164 0           case 16:
165 0           keysched_unit(skey, key);
166 0           keysched_unit(skey + 32, (const unsigned char *)key + 8);
167 0           br_des_rev_skey(skey + 32);
168 0           memcpy(skey + 64, skey, 32 * sizeof *skey);
169 0           return 3;
170 0           default:
171 0           keysched_unit(skey, key);
172 0           keysched_unit(skey + 32, (const unsigned char *)key + 8);
173 0           br_des_rev_skey(skey + 32);
174 0           keysched_unit(skey + 64, (const unsigned char *)key + 16);
175 0           return 3;
176             }
177             }
178              
179             /*
180             * DES confusion function. This function performs expansion E (32 to
181             * 48 bits), XOR with subkey, S-boxes, and permutation P.
182             */
183             static inline uint32_t
184 0           Fconf(uint32_t r0, const uint32_t *sk)
185             {
186             /*
187             * Each 6->4 S-box is virtually turned into four 6->1 boxes; we
188             * thus end up with 32 boxes that we call "T-boxes" here. We will
189             * evaluate them with bitslice code.
190             *
191             * Each T-box is a circuit of multiplexers (sort of) and thus
192             * takes 70 inputs: the 6 actual T-box inputs, and 64 constants
193             * that describe the T-box output for all combinations of the
194             * 6 inputs. With this model, all T-boxes are identical (with
195             * distinct inputs) and thus can be executed in parallel with
196             * bitslice code.
197             *
198             * T-boxes are numbered from 0 to 31, in least-to-most
199             * significant order. Thus, S-box S1 corresponds to T-boxes 31,
200             * 30, 29 and 28, in that order. T-box 'n' is computed with the
201             * bits at rank 'n' in the 32-bit words.
202             *
203             * Words x0 to x5 contain the T-box inputs 0 to 5.
204             */
205             uint32_t x0, x1, x2, x3, x4, x5, z0;
206             uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
207             uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
208             uint32_t y20, y21, y22, y23, y24, y25, y26, y27, y28, y29;
209             uint32_t y30;
210              
211             /*
212             * Spread input bits over the 6 input words x*.
213             */
214 0           x1 = r0 & (uint32_t)0x11111111;
215 0           x2 = (r0 >> 1) & (uint32_t)0x11111111;
216 0           x3 = (r0 >> 2) & (uint32_t)0x11111111;
217 0           x4 = (r0 >> 3) & (uint32_t)0x11111111;
218 0           x1 = (x1 << 4) - x1;
219 0           x2 = (x2 << 4) - x2;
220 0           x3 = (x3 << 4) - x3;
221 0           x4 = (x4 << 4) - x4;
222 0           x0 = (x4 << 4) | (x4 >> 28);
223 0           x5 = (x1 >> 4) | (x1 << 28);
224              
225             /*
226             * XOR with the subkey for this round.
227             */
228 0           x0 ^= sk[0];
229 0           x1 ^= sk[1];
230 0           x2 ^= sk[2];
231 0           x3 ^= sk[3];
232 0           x4 ^= sk[4];
233 0           x5 ^= sk[5];
234              
235             /*
236             * The T-boxes are done in parallel, since they all use a
237             * "tree of multiplexer". We use "fake multiplexers":
238             *
239             * y = a ^ (x & b)
240             *
241             * computes y as either 'a' (if x == 0) or 'a ^ b' (if x == 1).
242             */
243 0           y0 = (uint32_t)0xEFA72C4D ^ (x0 & (uint32_t)0xEC7AC69C);
244 0           y1 = (uint32_t)0xAEAAEDFF ^ (x0 & (uint32_t)0x500FB821);
245 0           y2 = (uint32_t)0x37396665 ^ (x0 & (uint32_t)0x40EFA809);
246 0           y3 = (uint32_t)0x68D7B833 ^ (x0 & (uint32_t)0xA5EC0B28);
247 0           y4 = (uint32_t)0xC9C755BB ^ (x0 & (uint32_t)0x252CF820);
248 0           y5 = (uint32_t)0x73FC3606 ^ (x0 & (uint32_t)0x40205801);
249 0           y6 = (uint32_t)0xA2A0A918 ^ (x0 & (uint32_t)0xE220F929);
250 0           y7 = (uint32_t)0x8222BD90 ^ (x0 & (uint32_t)0x44A3F9E1);
251 0           y8 = (uint32_t)0xD6B6AC77 ^ (x0 & (uint32_t)0x794F104A);
252 0           y9 = (uint32_t)0x3069300C ^ (x0 & (uint32_t)0x026F320B);
253 0           y10 = (uint32_t)0x6CE0D5CC ^ (x0 & (uint32_t)0x7640B01A);
254 0           y11 = (uint32_t)0x59A9A22D ^ (x0 & (uint32_t)0x238F1572);
255 0           y12 = (uint32_t)0xAC6D0BD4 ^ (x0 & (uint32_t)0x7A63C083);
256 0           y13 = (uint32_t)0x21C83200 ^ (x0 & (uint32_t)0x11CCA000);
257 0           y14 = (uint32_t)0xA0E62188 ^ (x0 & (uint32_t)0x202F69AA);
258             /* y15 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */
259 0           y16 = (uint32_t)0xAF7D655A ^ (x0 & (uint32_t)0x51B33BE9);
260 0           y17 = (uint32_t)0xF0168AA3 ^ (x0 & (uint32_t)0x3B0FE8AE);
261 0           y18 = (uint32_t)0x90AA30C6 ^ (x0 & (uint32_t)0x90BF8816);
262 0           y19 = (uint32_t)0x5AB2750A ^ (x0 & (uint32_t)0x09E34F9B);
263 0           y20 = (uint32_t)0x5391BE65 ^ (x0 & (uint32_t)0x0103BE88);
264 0           y21 = (uint32_t)0x93372BAF ^ (x0 & (uint32_t)0x49AC8E25);
265 0           y22 = (uint32_t)0xF288210C ^ (x0 & (uint32_t)0x922C313D);
266 0           y23 = (uint32_t)0x920AF5C0 ^ (x0 & (uint32_t)0x70EF31B0);
267 0           y24 = (uint32_t)0x63D312C0 ^ (x0 & (uint32_t)0x6A707100);
268 0           y25 = (uint32_t)0x537B3006 ^ (x0 & (uint32_t)0xB97C9011);
269 0           y26 = (uint32_t)0xA2EFB0A5 ^ (x0 & (uint32_t)0xA320C959);
270 0           y27 = (uint32_t)0xBC8F96A5 ^ (x0 & (uint32_t)0x6EA0AB4A);
271 0           y28 = (uint32_t)0xFAD176A5 ^ (x0 & (uint32_t)0x6953DDF8);
272 0           y29 = (uint32_t)0x665A14A3 ^ (x0 & (uint32_t)0xF74F3E2B);
273 0           y30 = (uint32_t)0xF2EFF0CC ^ (x0 & (uint32_t)0xF0306CAD);
274             /* y31 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */
275              
276 0           y0 = y0 ^ (x1 & y1);
277 0           y1 = y2 ^ (x1 & y3);
278 0           y2 = y4 ^ (x1 & y5);
279 0           y3 = y6 ^ (x1 & y7);
280 0           y4 = y8 ^ (x1 & y9);
281 0           y5 = y10 ^ (x1 & y11);
282 0           y6 = y12 ^ (x1 & y13);
283 0           y7 = y14; /* was: y14 ^ (x1 & y15) */
284 0           y8 = y16 ^ (x1 & y17);
285 0           y9 = y18 ^ (x1 & y19);
286 0           y10 = y20 ^ (x1 & y21);
287 0           y11 = y22 ^ (x1 & y23);
288 0           y12 = y24 ^ (x1 & y25);
289 0           y13 = y26 ^ (x1 & y27);
290 0           y14 = y28 ^ (x1 & y29);
291 0           y15 = y30; /* was: y30 ^ (x1 & y31) */
292              
293 0           y0 = y0 ^ (x2 & y1);
294 0           y1 = y2 ^ (x2 & y3);
295 0           y2 = y4 ^ (x2 & y5);
296 0           y3 = y6 ^ (x2 & y7);
297 0           y4 = y8 ^ (x2 & y9);
298 0           y5 = y10 ^ (x2 & y11);
299 0           y6 = y12 ^ (x2 & y13);
300 0           y7 = y14 ^ (x2 & y15);
301              
302 0           y0 = y0 ^ (x3 & y1);
303 0           y1 = y2 ^ (x3 & y3);
304 0           y2 = y4 ^ (x3 & y5);
305 0           y3 = y6 ^ (x3 & y7);
306              
307 0           y0 = y0 ^ (x4 & y1);
308 0           y1 = y2 ^ (x4 & y3);
309              
310 0           y0 = y0 ^ (x5 & y1);
311              
312             /*
313             * The P permutation:
314             * -- Each bit move is converted into a mask + left rotation.
315             * -- Rotations that use the same movement are coalesced together.
316             * -- Left and right shifts are used as alternatives to a rotation
317             * where appropriate (this will help architectures that do not have
318             * a rotation opcode).
319             */
320 0           z0 = (y0 & (uint32_t)0x00000004) << 3;
321 0           z0 |= (y0 & (uint32_t)0x00004000) << 4;
322 0           z0 |= rotl(y0 & 0x12020120, 5);
323 0           z0 |= (y0 & (uint32_t)0x00100000) << 6;
324 0           z0 |= (y0 & (uint32_t)0x00008000) << 9;
325 0           z0 |= (y0 & (uint32_t)0x04000000) >> 22;
326 0           z0 |= (y0 & (uint32_t)0x00000001) << 11;
327 0           z0 |= rotl(y0 & 0x20000200, 12);
328 0           z0 |= (y0 & (uint32_t)0x00200000) >> 19;
329 0           z0 |= (y0 & (uint32_t)0x00000040) << 14;
330 0           z0 |= (y0 & (uint32_t)0x00010000) << 15;
331 0           z0 |= (y0 & (uint32_t)0x00000002) << 16;
332 0           z0 |= rotl(y0 & 0x40801800, 17);
333 0           z0 |= (y0 & (uint32_t)0x00080000) >> 13;
334 0           z0 |= (y0 & (uint32_t)0x00000010) << 21;
335 0           z0 |= (y0 & (uint32_t)0x01000000) >> 10;
336 0           z0 |= rotl(y0 & 0x88000008, 24);
337 0           z0 |= (y0 & (uint32_t)0x00000480) >> 7;
338 0           z0 |= (y0 & (uint32_t)0x00442000) >> 6;
339 0           return z0;
340             }
341              
342             /*
343             * Process one block through 16 successive rounds, omitting the swap
344             * in the final round.
345             */
346             static void
347 0           process_block_unit(uint32_t *pl, uint32_t *pr, const uint32_t *sk_exp)
348             {
349             int i;
350             uint32_t l, r;
351              
352 0           l = *pl;
353 0           r = *pr;
354 0 0         for (i = 0; i < 16; i ++) {
355             uint32_t t;
356              
357 0           t = l ^ Fconf(r, sk_exp);
358 0           l = r;
359 0           r = t;
360 0           sk_exp += 6;
361             }
362 0           *pl = r;
363 0           *pr = l;
364 0           }
365              
366             /* see inner.h */
367             void
368 0           br_des_ct_process_block(unsigned num_rounds,
369             const uint32_t *sk_exp, void *block)
370             {
371             unsigned char *buf;
372             uint32_t l, r;
373              
374 0           buf = block;
375 0           l = br_dec32be(buf);
376 0           r = br_dec32be(buf + 4);
377 0           br_des_do_IP(&l, &r);
378 0 0         while (num_rounds -- > 0) {
379 0           process_block_unit(&l, &r, sk_exp);
380 0           sk_exp += 96;
381             }
382 0           br_des_do_invIP(&l, &r);
383 0           br_enc32be(buf, l);
384 0           br_enc32be(buf + 4, r);
385 0           }
386              
387             /* see inner.h */
388             void
389 0           br_des_ct_skey_expand(uint32_t *sk_exp,
390             unsigned num_rounds, const uint32_t *skey)
391             {
392 0           num_rounds <<= 4;
393 0 0         while (num_rounds -- > 0) {
394             uint32_t v, w0, w1, w2, w3;
395              
396 0           v = *skey ++;
397 0           w0 = v & 0x11111111;
398 0           w1 = (v >> 1) & 0x11111111;
399 0           w2 = (v >> 2) & 0x11111111;
400 0           w3 = (v >> 3) & 0x11111111;
401 0           *sk_exp ++ = (w0 << 4) - w0;
402 0           *sk_exp ++ = (w1 << 4) - w1;
403 0           *sk_exp ++ = (w2 << 4) - w2;
404 0           *sk_exp ++ = (w3 << 4) - w3;
405 0           v = *skey ++;
406 0           w0 = v & 0x11111111;
407 0           w1 = (v >> 1) & 0x11111111;
408 0           *sk_exp ++ = (w0 << 4) - w0;
409 0           *sk_exp ++ = (w1 << 4) - w1;
410             }
411 0           }