File Coverage

src/symcipher/des_ct.c

Criterion	Covered	Total	%
statement	0	175	0.0
branch	0	10	0.0
condition			n/a
subroutine			n/a
pod			n/a
total	0	185	0.0

line	stmt	bran	code
1			/*
2			* Copyright (c) 2016 Thomas Pornin
3			*
4			* Permission is hereby granted, free of charge, to any person obtaining
5			* a copy of this software and associated documentation files (the
6			* "Software"), to deal in the Software without restriction, including
7			* without limitation the rights to use, copy, modify, merge, publish,
8			* distribute, sublicense, and/or sell copies of the Software, and to
9			* permit persons to whom the Software is furnished to do so, subject to
10			* the following conditions:
11			*
12			* The above copyright notice and this permission notice shall be
13			* included in all copies or substantial portions of the Software.
14			*
15			* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16			* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17			* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18			* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19			* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20			* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21			* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22			* SOFTWARE.
23			*/
24
25			#include "inner.h"
26
27			/*
28			* During key schedule, we need to apply bit extraction PC-2 then permute
29			* things into our bitslice representation. PC-2 extracts 48 bits out
30			* of two 28-bit words (kl and kr), and we store these bits into two
31			* 32-bit words sk0 and sk1.
32			*
33			* -- bit 16+x of sk0 comes from bit QL0[x] of kl
34			* -- bit x of sk0 comes from bit QR0[x] of kr
35			* -- bit 16+x of sk1 comes from bit QL1[x] of kl
36			* -- bit x of sk1 comes from bit QR1[x] of kr
37			*/
38
39			static const unsigned char QL0[] = {
40			17, 4, 27, 23, 13, 22, 7, 18,
41			16, 24, 2, 20, 1, 8, 15, 26
42			};
43
44			static const unsigned char QR0[] = {
45			25, 19, 9, 1, 5, 11, 23, 8,
46			17, 0, 22, 3, 6, 20, 27, 24
47			};
48
49			static const unsigned char QL1[] = {
50			28, 28, 14, 11, 28, 28, 25, 0,
51			28, 28, 5, 9, 28, 28, 12, 21
52			};
53
54			static const unsigned char QR1[] = {
55			28, 28, 15, 4, 28, 28, 26, 16,
56			28, 28, 12, 7, 28, 28, 10, 14
57			};
58
59			/*
60			* 32-bit rotation. The C compiler is supposed to recognize it as a
61			* rotation and use the local architecture rotation opcode (if available).
62			*/
63			static inline uint32_t
64	0		rotl(uint32_t x, int n)
65			{
66	0		return (x << n) \| (x >> (32 - n));
67			}
68
69			/*
70			* Compute key schedule for 8 key bytes (produces 32 subkey words).
71			*/
72			static void
73	0		keysched_unit(uint32_t skey, const void key)
74			{
75			int i;
76
77	0		br_des_keysched_unit(skey, key);
78
79			/*
80			* Apply PC-2 + bitslicing.
81			*/
82	0	0	for (i = 0; i < 16; i ++) {
83			uint32_t kl, kr, sk0, sk1;
84			int j;
85
86	0		kl = skey[(i << 1) + 0];
87	0		kr = skey[(i << 1) + 1];
88	0		sk0 = 0;
89	0		sk1 = 0;
90	0	0	for (j = 0; j < 16; j ++) {
91	0		sk0 <<= 1;
92	0		sk1 <<= 1;
93	0		sk0 \|= ((kl >> QL0[j]) & (uint32_t)1) << 16;
94	0		sk0 \|= (kr >> QR0[j]) & (uint32_t)1;
95	0		sk1 \|= ((kl >> QL1[j]) & (uint32_t)1) << 16;
96	0		sk1 \|= (kr >> QR1[j]) & (uint32_t)1;
97			}
98
99	0		skey[(i << 1) + 0] = sk0;
100	0		skey[(i << 1) + 1] = sk1;
101			}
102
103			#if 0
104			/*
105			* Speed-optimized version for PC-2 + bitslicing.
106			* (Unused. Kept for reference only.)
107			*/
108			sk0 = kl & (uint32_t)0x00100000;
109			sk0 \|= (kl & (uint32_t)0x08008000) << 2;
110			sk0 \|= (kl & (uint32_t)0x00400000) << 4;
111			sk0 \|= (kl & (uint32_t)0x00800000) << 5;
112			sk0 \|= (kl & (uint32_t)0x00040000) << 6;
113			sk0 \|= (kl & (uint32_t)0x00010000) << 7;
114			sk0 \|= (kl & (uint32_t)0x00000100) << 10;
115			sk0 \|= (kl & (uint32_t)0x00022000) << 14;
116			sk0 \|= (kl & (uint32_t)0x00000082) << 18;
117			sk0 \|= (kl & (uint32_t)0x00000004) << 19;
118			sk0 \|= (kl & (uint32_t)0x04000000) >> 10;
119			sk0 \|= (kl & (uint32_t)0x00000010) << 26;
120			sk0 \|= (kl & (uint32_t)0x01000000) >> 2;
121
122			sk0 \|= kr & (uint32_t)0x00000100;
123			sk0 \|= (kr & (uint32_t)0x00000008) << 1;
124			sk0 \|= (kr & (uint32_t)0x00000200) << 4;
125			sk0 \|= rotl(kr & (uint32_t)0x08000021, 6);
126			sk0 \|= (kr & (uint32_t)0x01000000) >> 24;
127			sk0 \|= (kr & (uint32_t)0x00000002) << 11;
128			sk0 \|= (kr & (uint32_t)0x00100000) >> 18;
129			sk0 \|= (kr & (uint32_t)0x00400000) >> 17;
130			sk0 \|= (kr & (uint32_t)0x00800000) >> 14;
131			sk0 \|= (kr & (uint32_t)0x02020000) >> 10;
132			sk0 \|= (kr & (uint32_t)0x00080000) >> 5;
133			sk0 \|= (kr & (uint32_t)0x00000040) >> 3;
134			sk0 \|= (kr & (uint32_t)0x00000800) >> 1;
135
136			sk1 = kl & (uint32_t)0x02000000;
137			sk1 \|= (kl & (uint32_t)0x00001000) << 5;
138			sk1 \|= (kl & (uint32_t)0x00000200) << 11;
139			sk1 \|= (kl & (uint32_t)0x00004000) << 15;
140			sk1 \|= (kl & (uint32_t)0x00000020) << 16;
141			sk1 \|= (kl & (uint32_t)0x00000800) << 17;
142			sk1 \|= (kl & (uint32_t)0x00000001) << 24;
143			sk1 \|= (kl & (uint32_t)0x00200000) >> 5;
144
145			sk1 \|= (kr & (uint32_t)0x00000010) << 8;
146			sk1 \|= (kr & (uint32_t)0x04000000) >> 17;
147			sk1 \|= (kr & (uint32_t)0x00004000) >> 14;
148			sk1 \|= (kr & (uint32_t)0x00000400) >> 9;
149			sk1 \|= (kr & (uint32_t)0x00010000) >> 8;
150			sk1 \|= (kr & (uint32_t)0x00001000) >> 7;
151			sk1 \|= (kr & (uint32_t)0x00000080) >> 3;
152			sk1 \|= (kr & (uint32_t)0x00008000) >> 2;
153			#endif
154	0		}
155
156			/* see inner.h */
157			unsigned
158	0		br_des_ct_keysched(uint32_t skey, const void key, size_t key_len)
159			{
160	0		switch (key_len) {
161	0		case 8:
162	0		keysched_unit(skey, key);
163	0		return 1;
164	0		case 16:
165	0		keysched_unit(skey, key);
166	0		keysched_unit(skey + 32, (const unsigned char *)key + 8);
167	0		br_des_rev_skey(skey + 32);
168	0		memcpy(skey + 64, skey, 32 * sizeof *skey);
169	0		return 3;
170	0		default:
171	0		keysched_unit(skey, key);
172	0		keysched_unit(skey + 32, (const unsigned char *)key + 8);
173	0		br_des_rev_skey(skey + 32);
174	0		keysched_unit(skey + 64, (const unsigned char *)key + 16);
175	0		return 3;
176			}
177			}
178
179			/*
180			* DES confusion function. This function performs expansion E (32 to
181			* 48 bits), XOR with subkey, S-boxes, and permutation P.
182			*/
183			static inline uint32_t
184	0		Fconf(uint32_t r0, const uint32_t *sk)
185			{
186			/*
187			* Each 6->4 S-box is virtually turned into four 6->1 boxes; we
188			* thus end up with 32 boxes that we call "T-boxes" here. We will
189			* evaluate them with bitslice code.
190			*
191			* Each T-box is a circuit of multiplexers (sort of) and thus
192			* takes 70 inputs: the 6 actual T-box inputs, and 64 constants
193			* that describe the T-box output for all combinations of the
194			* 6 inputs. With this model, all T-boxes are identical (with
195			* distinct inputs) and thus can be executed in parallel with
196			* bitslice code.
197			*
198			* T-boxes are numbered from 0 to 31, in least-to-most
199			* significant order. Thus, S-box S1 corresponds to T-boxes 31,
200			* 30, 29 and 28, in that order. T-box 'n' is computed with the
201			* bits at rank 'n' in the 32-bit words.
202			*
203			* Words x0 to x5 contain the T-box inputs 0 to 5.
204			*/
205			uint32_t x0, x1, x2, x3, x4, x5, z0;
206			uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
207			uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
208			uint32_t y20, y21, y22, y23, y24, y25, y26, y27, y28, y29;
209			uint32_t y30;
210
211			/*
212			* Spread input bits over the 6 input words x*.
213			*/
214	0		x1 = r0 & (uint32_t)0x11111111;
215	0		x2 = (r0 >> 1) & (uint32_t)0x11111111;
216	0		x3 = (r0 >> 2) & (uint32_t)0x11111111;
217	0		x4 = (r0 >> 3) & (uint32_t)0x11111111;
218	0		x1 = (x1 << 4) - x1;
219	0		x2 = (x2 << 4) - x2;
220	0		x3 = (x3 << 4) - x3;
221	0		x4 = (x4 << 4) - x4;
222	0		x0 = (x4 << 4) \| (x4 >> 28);
223	0		x5 = (x1 >> 4) \| (x1 << 28);
224
225			/*
226			* XOR with the subkey for this round.
227			*/
228	0		x0 ^= sk[0];
229	0		x1 ^= sk[1];
230	0		x2 ^= sk[2];
231	0		x3 ^= sk[3];
232	0		x4 ^= sk[4];
233	0		x5 ^= sk[5];
234
235			/*
236			* The T-boxes are done in parallel, since they all use a
237			* "tree of multiplexer". We use "fake multiplexers":
238			*
239			* y = a ^ (x & b)
240			*
241			* computes y as either 'a' (if x == 0) or 'a ^ b' (if x == 1).
242			*/
243	0		y0 = (uint32_t)0xEFA72C4D ^ (x0 & (uint32_t)0xEC7AC69C);
244	0		y1 = (uint32_t)0xAEAAEDFF ^ (x0 & (uint32_t)0x500FB821);
245	0		y2 = (uint32_t)0x37396665 ^ (x0 & (uint32_t)0x40EFA809);
246	0		y3 = (uint32_t)0x68D7B833 ^ (x0 & (uint32_t)0xA5EC0B28);
247	0		y4 = (uint32_t)0xC9C755BB ^ (x0 & (uint32_t)0x252CF820);
248	0		y5 = (uint32_t)0x73FC3606 ^ (x0 & (uint32_t)0x40205801);
249	0		y6 = (uint32_t)0xA2A0A918 ^ (x0 & (uint32_t)0xE220F929);
250	0		y7 = (uint32_t)0x8222BD90 ^ (x0 & (uint32_t)0x44A3F9E1);
251	0		y8 = (uint32_t)0xD6B6AC77 ^ (x0 & (uint32_t)0x794F104A);
252	0		y9 = (uint32_t)0x3069300C ^ (x0 & (uint32_t)0x026F320B);
253	0		y10 = (uint32_t)0x6CE0D5CC ^ (x0 & (uint32_t)0x7640B01A);
254	0		y11 = (uint32_t)0x59A9A22D ^ (x0 & (uint32_t)0x238F1572);
255	0		y12 = (uint32_t)0xAC6D0BD4 ^ (x0 & (uint32_t)0x7A63C083);
256	0		y13 = (uint32_t)0x21C83200 ^ (x0 & (uint32_t)0x11CCA000);
257	0		y14 = (uint32_t)0xA0E62188 ^ (x0 & (uint32_t)0x202F69AA);
258			/* y15 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */
259	0		y16 = (uint32_t)0xAF7D655A ^ (x0 & (uint32_t)0x51B33BE9);
260	0		y17 = (uint32_t)0xF0168AA3 ^ (x0 & (uint32_t)0x3B0FE8AE);
261	0		y18 = (uint32_t)0x90AA30C6 ^ (x0 & (uint32_t)0x90BF8816);
262	0		y19 = (uint32_t)0x5AB2750A ^ (x0 & (uint32_t)0x09E34F9B);
263	0		y20 = (uint32_t)0x5391BE65 ^ (x0 & (uint32_t)0x0103BE88);
264	0		y21 = (uint32_t)0x93372BAF ^ (x0 & (uint32_t)0x49AC8E25);
265	0		y22 = (uint32_t)0xF288210C ^ (x0 & (uint32_t)0x922C313D);
266	0		y23 = (uint32_t)0x920AF5C0 ^ (x0 & (uint32_t)0x70EF31B0);
267	0		y24 = (uint32_t)0x63D312C0 ^ (x0 & (uint32_t)0x6A707100);
268	0		y25 = (uint32_t)0x537B3006 ^ (x0 & (uint32_t)0xB97C9011);
269	0		y26 = (uint32_t)0xA2EFB0A5 ^ (x0 & (uint32_t)0xA320C959);
270	0		y27 = (uint32_t)0xBC8F96A5 ^ (x0 & (uint32_t)0x6EA0AB4A);
271	0		y28 = (uint32_t)0xFAD176A5 ^ (x0 & (uint32_t)0x6953DDF8);
272	0		y29 = (uint32_t)0x665A14A3 ^ (x0 & (uint32_t)0xF74F3E2B);
273	0		y30 = (uint32_t)0xF2EFF0CC ^ (x0 & (uint32_t)0xF0306CAD);
274			/* y31 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */
275
276	0		y0 = y0 ^ (x1 & y1);
277	0		y1 = y2 ^ (x1 & y3);
278	0		y2 = y4 ^ (x1 & y5);
279	0		y3 = y6 ^ (x1 & y7);
280	0		y4 = y8 ^ (x1 & y9);
281	0		y5 = y10 ^ (x1 & y11);
282	0		y6 = y12 ^ (x1 & y13);
283	0		y7 = y14; /* was: y14 ^ (x1 & y15) */
284	0		y8 = y16 ^ (x1 & y17);
285	0		y9 = y18 ^ (x1 & y19);
286	0		y10 = y20 ^ (x1 & y21);
287	0		y11 = y22 ^ (x1 & y23);
288	0		y12 = y24 ^ (x1 & y25);
289	0		y13 = y26 ^ (x1 & y27);
290	0		y14 = y28 ^ (x1 & y29);
291	0		y15 = y30; /* was: y30 ^ (x1 & y31) */
292
293	0		y0 = y0 ^ (x2 & y1);
294	0		y1 = y2 ^ (x2 & y3);
295	0		y2 = y4 ^ (x2 & y5);
296	0		y3 = y6 ^ (x2 & y7);
297	0		y4 = y8 ^ (x2 & y9);
298	0		y5 = y10 ^ (x2 & y11);
299	0		y6 = y12 ^ (x2 & y13);
300	0		y7 = y14 ^ (x2 & y15);
301
302	0		y0 = y0 ^ (x3 & y1);
303	0		y1 = y2 ^ (x3 & y3);
304	0		y2 = y4 ^ (x3 & y5);
305	0		y3 = y6 ^ (x3 & y7);
306
307	0		y0 = y0 ^ (x4 & y1);
308	0		y1 = y2 ^ (x4 & y3);
309
310	0		y0 = y0 ^ (x5 & y1);
311
312			/*
313			* The P permutation:
314			* -- Each bit move is converted into a mask + left rotation.
315			* -- Rotations that use the same movement are coalesced together.
316			* -- Left and right shifts are used as alternatives to a rotation
317			* where appropriate (this will help architectures that do not have
318			* a rotation opcode).
319			*/
320	0		z0 = (y0 & (uint32_t)0x00000004) << 3;
321	0		z0 \|= (y0 & (uint32_t)0x00004000) << 4;
322	0		z0 \|= rotl(y0 & 0x12020120, 5);
323	0		z0 \|= (y0 & (uint32_t)0x00100000) << 6;
324	0		z0 \|= (y0 & (uint32_t)0x00008000) << 9;
325	0		z0 \|= (y0 & (uint32_t)0x04000000) >> 22;
326	0		z0 \|= (y0 & (uint32_t)0x00000001) << 11;
327	0		z0 \|= rotl(y0 & 0x20000200, 12);
328	0		z0 \|= (y0 & (uint32_t)0x00200000) >> 19;
329	0		z0 \|= (y0 & (uint32_t)0x00000040) << 14;
330	0		z0 \|= (y0 & (uint32_t)0x00010000) << 15;
331	0		z0 \|= (y0 & (uint32_t)0x00000002) << 16;
332	0		z0 \|= rotl(y0 & 0x40801800, 17);
333	0		z0 \|= (y0 & (uint32_t)0x00080000) >> 13;
334	0		z0 \|= (y0 & (uint32_t)0x00000010) << 21;
335	0		z0 \|= (y0 & (uint32_t)0x01000000) >> 10;
336	0		z0 \|= rotl(y0 & 0x88000008, 24);
337	0		z0 \|= (y0 & (uint32_t)0x00000480) >> 7;
338	0		z0 \|= (y0 & (uint32_t)0x00442000) >> 6;
339	0		return z0;
340			}
341
342			/*
343			* Process one block through 16 successive rounds, omitting the swap
344			* in the final round.
345			*/
346			static void
347	0		process_block_unit(uint32_t pl, uint32_t pr, const uint32_t *sk_exp)
348			{
349			int i;
350			uint32_t l, r;
351
352	0		l = *pl;
353	0		r = *pr;
354	0	0	for (i = 0; i < 16; i ++) {
355			uint32_t t;
356
357	0		t = l ^ Fconf(r, sk_exp);
358	0		l = r;
359	0		r = t;
360	0		sk_exp += 6;
361			}
362	0		*pl = r;
363	0		*pr = l;
364	0		}
365
366			/* see inner.h */
367			void
368	0		br_des_ct_process_block(unsigned num_rounds,
369			const uint32_t sk_exp, void block)
370			{
371			unsigned char *buf;
372			uint32_t l, r;
373
374	0		buf = block;
375	0		l = br_dec32be(buf);
376	0		r = br_dec32be(buf + 4);
377	0		br_des_do_IP(&l, &r);
378	0	0	while (num_rounds -- > 0) {
379	0		process_block_unit(&l, &r, sk_exp);
380	0		sk_exp += 96;
381			}
382	0		br_des_do_invIP(&l, &r);
383	0		br_enc32be(buf, l);
384	0		br_enc32be(buf + 4, r);
385	0		}
386
387			/* see inner.h */
388			void
389	0		br_des_ct_skey_expand(uint32_t *sk_exp,
390			unsigned num_rounds, const uint32_t *skey)
391			{
392	0		num_rounds <<= 4;
393	0	0	while (num_rounds -- > 0) {
394			uint32_t v, w0, w1, w2, w3;
395
396	0		v = *skey ++;
397	0		w0 = v & 0x11111111;
398	0		w1 = (v >> 1) & 0x11111111;
399	0		w2 = (v >> 2) & 0x11111111;
400	0		w3 = (v >> 3) & 0x11111111;
401	0		*sk_exp ++ = (w0 << 4) - w0;
402	0		*sk_exp ++ = (w1 << 4) - w1;
403	0		*sk_exp ++ = (w2 << 4) - w2;
404	0		*sk_exp ++ = (w3 << 4) - w3;
405	0		v = *skey ++;
406	0		w0 = v & 0x11111111;
407	0		w1 = (v >> 1) & 0x11111111;
408	0		*sk_exp ++ = (w0 << 4) - w0;
409	0		*sk_exp ++ = (w1 << 4) - w1;
410			}
411	0		}