File Coverage

_twofish.c
Criterion Covered Total %
statement 97 98 98.9
branch 19 20 95.0
condition n/a
subroutine n/a
pod n/a
total 116 118 98.3


line stmt bran cond sub pod time code
1             /*
2             * Copyright 1999 Dr. Brian Gladman
3             * Copyright 2001 Abhijit Menon-Sen
4             */
5              
6             /* Twofish is a 128-bit symmetric block cipher with a variable length
7             key, developed by Counterpane Labs. It is unpatented and free for all
8             uses, as described at
9             and .
10              
11             This implementation is based on code by Dr. Brian Gladman, at
12             .
13             Some of his comments are reproduced below:
14              
15             "Copyright in this implementation is held by Dr. B R Gladman but I
16             hereby give permission for its free direct or derivative use subject
17             to ackowledgement of its origin and compliance with any conditions
18             that the originators of the algorithm place on its exploitation.
19              
20             My thanks to Doug Whiting and Niels Ferguson for comments that led to
21             improvements in this implementation." */
22              
23             #include "twofish.h"
24             #include "tables.h"
25              
26             /* Extract the n'th byte from a 32-bit word */
27             #define byte(x,n) ((unsigned char)((x) >> (8 * n)))
28              
29             /* 32 bit rotate-left and right macros */
30             #define ror(x,n) (((x) >> ((int)(n))) | ((x) << (32 - (int)(n))))
31             #define rol(x,n) (((x) << ((int)(n))) | ((x) >> (32 - (int)(n))))
32              
33             /* Endian-independent byte -> word conversion */
34             #define strtonl(s) (uint32_t)(*(s)|*(s+1)<<8|*(s+2)<<16|*(s+3)<<24)
35              
36             #define nltostr(l, s) \
37             do { \
38             *(s )=(unsigned char)((l) ); \
39             *(s+1)=(unsigned char)((l) >> 8); \
40             *(s+2)=(unsigned char)((l) >> 16); \
41             *(s+3)=(unsigned char)((l) >> 24); \
42             } while (0)
43              
44             static uint32_t mds_rem(uint32_t a, uint32_t b);
45             static uint32_t h(int len, const int x, unsigned char *key, int odd);
46              
47             /* The key schedule takes a 128, 192, or 256-bit key, and provides 40
48             32-bit words of expanded key K0,...,K39 and the 4 key-dependent
49             S-boxes used in the g function. */
50              
51 20297           struct twofish *twofish_setup(unsigned char *key, int len)
52             {
53             int i;
54             uint32_t a, b, x;
55             struct twofish *t;
56             unsigned char *s, skey[16];
57            
58 20297 50         if ((t = malloc(sizeof(struct twofish))) == NULL)
59 0           return NULL;
60              
61             /* The key consists of k=len/8 (2, 3 or 4) 64-bit units. */
62 20297           t->len = len /= 8;
63              
64             /* We must derive three vectors Me, Mo, and S, each with k 32-bit
65             words, from the 2k words in the key.
66              
67             Me = (key[0], key[2], ..., key[2k-2]) (even words)
68             Mo = (key[1], key[3], ..., key[2k-1]) (odd words)
69              
70             The third vector is derived by multiplying each of the k groups
71             of 8 bytes from the key by a 4x8 matrix, to get k 32-bit words.
72              
73             S = (S[k-1], S[k-2], ..., S[0])
74              
75             where S[i] are the 4 bytes from the multiplication, interpreted
76             as a 32-bit word. As described later, mds_rem is equivalent to
77             the matrix multiplication, but faster.
78              
79             Since all these vectors are going to be used byte-by-byte, we
80             avoid converting them to words altogether, and write the bytes of
81             S into the array skey below: */
82              
83 20297           s = skey + 4*(len - 1);
84 61185 100         for (i = 0; i < len; i++) {
85 40888           x = mds_rem(strtonl(key+8*i), strtonl(key+8*i+4));
86 40888           nltostr(x, s);
87 40888           s -= 4;
88             }
89 20297           s = skey;
90              
91             /* The words of the expanded key K are defined using the h function:
92              
93             rho = 2^24 + 2^16 + 2^8 + 2^0 (0x01010101)
94             A[i] = h(2i*rho, Me)
95             B[i] = ROL(h(2(i+1)*rho, Mo), 8)
96             K[2i] = (A[i] + B[i]) mod 2^32
97             K[2i+1] = ROL((A[i] + 2B[i]) mod 2^32, 9)
98              
99             rho has the property that, for i = 0..255, the word i*rho
100             consists of four equal bytes, each with the value i. The function
101             h is only applied to words of this type, so we only pass it the
102             value of i.
103              
104             We also didn't generate the vectors Me and Mo separately: we pass
105             the entire key, and indicate whether we want the even or odd
106             words to be used. */
107              
108 426237 100         for (i = 0; i < 40; i += 2) {
109 405940           a = h(len, i, key, 0);
110 405940           b = rol(h(len, i+1, key, 1), 8);
111              
112 405940           t->K[i] = a+b;
113 405940           t->K[i+1] = rol(a+2*b, 9);
114             }
115              
116             /* The key-dependent S-boxes used in the g() function are created
117             below. They are defined by g(X) = h(X, S), where S is the vector
118             derived from the key. That is, for i=0..3, the S-box S[i] is
119             formed by mapping from x[i] to y[i] in the h function.
120              
121             The relevant lookup tables qN have been precomputed and stored in
122             tables.h; we also perform full key precomputations incorporating
123             the MDS matrix multiplications. */
124              
125 20297           switch (len) {
126             case 2:
127 5165957 100         for (i = 0; i < 256; i++) {
128 5145856           x = (unsigned char)i;
129 5145856           t->S[0][i] = m[0][q[0][q[0][x]^s[4]]^s[0]];
130 5145856           t->S[1][i] = m[1][q[0][q[1][x]^s[5]]^s[1]];
131 5145856           t->S[2][i] = m[2][q[1][q[0][x]^s[6]]^s[2]];
132 5145856           t->S[3][i] = m[3][q[1][q[1][x]^s[7]]^s[3]];
133             }
134 20101           break;
135             case 3:
136 25186 100         for (i = 0; i < 256; i++) {
137 25088           x = (unsigned char)i;
138 25088           t->S[0][i] = m[0][q[0][q[0][q[1][x]^s[ 8]]^s[4]]^s[0]];
139 25088           t->S[1][i] = m[1][q[0][q[1][q[1][x]^s[ 9]]^s[5]]^s[1]];
140 25088           t->S[2][i] = m[2][q[1][q[0][q[0][x]^s[10]]^s[6]]^s[2]];
141 25088           t->S[3][i] = m[3][q[1][q[1][q[0][x]^s[11]]^s[7]]^s[3]];
142              
143             }
144 98           break;
145             case 4:
146 25186 100         for (i = 0; i < 256; i++) {
147 25088           x = (unsigned char)i;
148 25088           t->S[0][i] = m[0][q[0][q[0][q[1][q[1][x]^s[12]]^s[ 8]]^s[4]]^s[0]];
149 25088           t->S[1][i] = m[1][q[0][q[1][q[1][q[0][x]^s[13]]^s[ 9]]^s[5]]^s[1]];
150 25088           t->S[2][i] = m[2][q[1][q[0][q[0][q[0][x]^s[14]]^s[10]]^s[6]]^s[2]];
151 25088           t->S[3][i] = m[3][q[1][q[1][q[0][q[1][x]^s[15]]^s[11]]^s[7]]^s[3]];
152             }
153 98           break;
154             }
155              
156 20297           return t;
157             }
158              
159 20297           void twofish_free(struct twofish *self)
160             {
161 20297           free(self);
162 20297           }
163              
164             /* The function g splits the input word x into four bytes; each byte is
165             run through its own key-dependent S-box. Each S-box is bijective,
166             takes 8 bits of input and produces 8 bits of output. The four results
167             are interpreted as a vector of length 4 over GF(2^8), and multiplied
168             by the 4x4 MDS matrix. The resulting vector is interpreted as a
169             32-bit word.
170              
171             Since we have performed the full key precomputations, g consists only
172             of four lookups and three XORs. g0 is g; g1 is a shortcut for
173             g(ROL(x, 8)). */
174              
175             #define g0(x) \
176             t->S[0][byte(x,0)]^t->S[1][byte(x,1)]^t->S[2][byte(x,2)]^t->S[3][byte(x,3)]
177              
178             #define g1(x) \
179             t->S[0][byte(x,3)]^t->S[1][byte(x,0)]^t->S[2][byte(x,1)]^t->S[3][byte(x,2)]
180              
181             /* F is a key-dependent permutation on 64-bit values. It takes two input
182             words R0 and R1, and a round number r:
183              
184             T0 = g(R0)
185             T1 = g(ROL(R1, 8))
186             F0 = (T0 + T1 + K[2r+8])
187             F1 = (T0 + 2*T1 + K[2r+9])
188              
189             Each of the 16 encryption rounds consists of the following operations:
190              
191             (F0, F1) = F(R0, R1, r)
192             R0 = ROR(R2 ^ F0, 1)
193             R1 = ROL(R3, 1) ^ F1
194             R2 = R0
195             R3 = R1
196              
197             For efficiency, two rounds are combined into one in the macros below. */
198              
199             #define f_2rounds(i) \
200             t0 = g0(R[0]); \
201             t1 = g1(R[1]); \
202             R[2] = ror(R[2] ^ (t0 + t1 + t->K[4*i+8]), 1); \
203             R[3] = rol(R[3], 1) ^ (t0 + 2*t1 + t->K[4*i+9]); \
204             t0 = g0(R[2]); \
205             t1 = g1(R[3]); \
206             R[0] = ror(R[0] ^ (t0 + t1 + t->K[4*i+10]), 1); \
207             R[1] = rol(R[1], 1) ^ (t0 + 2*t1 + t->K[4*i+11]);
208              
209             /* This is the inverse of f_2rounds */
210             #define i_2rounds(i) \
211             t0 = g0(R[0]); \
212             t1 = g1(R[1]); \
213             R[2] = rol(R[2], 1) ^ (t0 + t1 + t->K[4*i+10]); \
214             R[3] = ror(R[3] ^ (t0 + 2*t1 + t->K[4*i+11]), 1); \
215             t0 = g0(R[2]); \
216             t1 = g1(R[3]); \
217             R[0] = rol(R[0], 1) ^ (t0 + t1 + t->K[4*i+8]); \
218             R[1] = ror(R[1] ^ (t0 + 2*t1 + t->K[4*i+9]), 1)
219              
220             /* This function encrypts or decrypts 16 bytes of input data and writes
221             it to output, using the key defined in t. */
222              
223 40296           void twofish_crypt(struct twofish *t,
224             unsigned char *input, unsigned char *output,
225             int decrypt)
226             {
227             uint32_t t0, t1, R[4], out[4];
228              
229 40296 100         if (!decrypt) {
230             /* Whiten four 32-bit input words. */
231 20148           R[0] = t->K[0] ^ strtonl(input);
232 20148           R[1] = t->K[1] ^ strtonl(input+4);
233 20148           R[2] = t->K[2] ^ strtonl(input+8);
234 20148           R[3] = t->K[3] ^ strtonl(input+12);
235              
236             /* 16 rounds of encryption, combined into 8 pairs. */
237 20148           f_2rounds(0); f_2rounds(1); f_2rounds(2); f_2rounds(3);
238 20148           f_2rounds(4); f_2rounds(5); f_2rounds(6); f_2rounds(7);
239              
240             /* Output whitening; The order of R[n] undoes the last swap. */
241 20148           out[0] = t->K[4] ^ R[2];
242 20148           out[1] = t->K[5] ^ R[3];
243 20148           out[2] = t->K[6] ^ R[0];
244 20148           out[3] = t->K[7] ^ R[1];
245             } else {
246 20148           R[0] = t->K[4] ^ strtonl(input);
247 20148           R[1] = t->K[5] ^ strtonl(input+4);
248 20148           R[2] = t->K[6] ^ strtonl(input+8);
249 20148           R[3] = t->K[7] ^ strtonl(input+12);
250              
251 20148           i_2rounds(7); i_2rounds(6); i_2rounds(5); i_2rounds(4);
252 20148           i_2rounds(3); i_2rounds(2); i_2rounds(1); i_2rounds(0);
253              
254 20148           out[0] = t->K[0] ^ R[2];
255 20148           out[1] = t->K[1] ^ R[3];
256 20148           out[2] = t->K[2] ^ R[0];
257 20148           out[3] = t->K[3] ^ R[1];
258             }
259              
260             /* Write 16 output bytes. */
261 40296           nltostr(out[0], output);
262 40296           nltostr(out[1], output+4);
263 40296           nltostr(out[2], output+8);
264 40296           nltostr(out[3], output+12);
265 40296           }
266              
267             /* h takes a 32-bit word X, and a list, L = (L[0],...,L[k-1]), of 32-bit
268             words, and produces one word of output. During each of the k stages
269             of the function, the four bytes from X are each passed through a
270             fixed S-box, and XORed with a byte derived from the list. Finally,
271             the bytes are once again passed through an S-box and multiplied by
272             the MDS matrix, just as in g.
273              
274             We use the Lbyte macro to extract a given byte from the list L
275             (expressed in little endian). */
276              
277             #define Lbyte(w, b) L[4*(2*w+odd)+b]
278              
279 1217820           static uint32_t h(int len, const int X, unsigned char *L, int odd)
280             {
281             unsigned char b0, b1, b2, b3;
282              
283 1217820           b0 = b1 = b2 = b3 = (unsigned char)X;
284              
285 1217820           switch (len) {
286             case 4:
287 5880           b0 = q[1][b0] ^ Lbyte(3, 0);
288 5880           b1 = q[0][b1] ^ Lbyte(3, 1);
289 5880           b2 = q[0][b2] ^ Lbyte(3, 2);
290 5880           b3 = q[1][b3] ^ Lbyte(3, 3);
291             case 3:
292 11760           b0 = q[1][b0] ^ Lbyte(2, 0);
293 11760           b1 = q[1][b1] ^ Lbyte(2, 1);
294 11760           b2 = q[0][b2] ^ Lbyte(2, 2);
295 11760           b3 = q[0][b3] ^ Lbyte(2, 3);
296             case 2:
297 1217820           b0 = q[0][q[0][b0] ^ Lbyte(1, 0)] ^ Lbyte(0, 0);
298 1217820           b1 = q[0][q[1][b1] ^ Lbyte(1, 1)] ^ Lbyte(0, 1);
299 1217820           b2 = q[1][q[0][b2] ^ Lbyte(1, 2)] ^ Lbyte(0, 2);
300 1217820           b3 = q[1][q[1][b3] ^ Lbyte(1, 3)] ^ Lbyte(0, 3);
301             }
302              
303 1217820           return m[0][b0] ^ m[1][b1] ^ m[2][b2] ^ m[3][b3];
304             }
305              
306             /* The (12, 8) Reed Solomon code has the generator polynomial:
307              
308             g(x) = x^4 + (a + 1/a) * x^3 + a * x^2 + (a + 1/a) * x + 1
309              
310             where the coefficients are in the finite field GF(2^8) with a modular
311             polynomial a^8+a^6+a^3+a^2+1. To generate the remainder, we have to
312             start with a 12th order polynomial with our eight input bytes as the
313             coefficients of the 4th to 11th terms:
314              
315             m[7] * x^11 + m[6] * x^10 ... + m[0] * x^4 + 0 * x^3 +... + 0
316              
317             We then multiply the generator polynomial by m[7]*x^7 and subtract it
318             (XOR in GF(2^8)) from the above to eliminate the x^7 term (the
319             arithmetic on the coefficients is done in GF(2^8)). We then multiply
320             the generator polynomial by m[6]*x^6 and use this to remove the x^10
321             term, and so on until the x^4 term is removed, and we are left with:
322              
323             r[3] * x^3 + r[2] * x^2 + r[1] 8 x^1 + r[0]
324              
325             which give the resulting 4 bytes of the remainder. This is equivalent
326             to the matrix multiplication described in the Twofish paper, but is
327             much faster. */
328              
329 40888           static uint32_t mds_rem(uint32_t a, uint32_t b)
330             {
331             int i;
332             uint32_t t, u;
333             enum { G_MOD = 0x0000014d };
334              
335 367992 100         for (i = 0; i < 8; i++) {
336             /* Get most significant coefficient */
337 327104           t = b >> 24;
338              
339             /* Shift the others up */
340 327104           b = (b << 8) | (a >> 24);
341 327104           a <<= 8;
342              
343 327104           u = t << 1;
344              
345             /* Subtract the modular polynomial on overflow */
346 327104 100         if (t & 0x80)
347 263436           u ^= G_MOD;
348              
349             /* Remove t * (a * x^2 + 1) */
350 327104           b ^= t ^ (u << 16);
351              
352             /* Form u = a*t + t/a = t*(a + 1/a) */
353 327104           u ^= t >> 1;
354              
355             /* Add the modular polynomial on underflow */
356 327104 100         if (t & 0x01)
357 223293           u ^= G_MOD >> 1;
358              
359             /* Remove t * (a + 1/a) * (x^3 + x) */
360 327104           b ^= (u << 24) | (u << 8);
361             }
362              
363 40888           return b;
364             }