File Coverage

src/symcipher/poly1305_ctmul.c
Criterion Covered Total %
statement 0 99 0.0
branch 0 16 0.0
condition n/a
subroutine n/a
pod n/a
total 0 115 0.0


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2016 Thomas Pornin
3             *
4             * Permission is hereby granted, free of charge, to any person obtaining
5             * a copy of this software and associated documentation files (the
6             * "Software"), to deal in the Software without restriction, including
7             * without limitation the rights to use, copy, modify, merge, publish,
8             * distribute, sublicense, and/or sell copies of the Software, and to
9             * permit persons to whom the Software is furnished to do so, subject to
10             * the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be
13             * included in all copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16             * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17             * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18             * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19             * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20             * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21             * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22             * SOFTWARE.
23             */
24              
25             #include "inner.h"
26              
27             /*
28             * Perform the inner processing of blocks for Poly1305. The accumulator
29             * and the r key are provided as arrays of 26-bit words (these words
30             * are allowed to have an extra bit, i.e. use 27 bits).
31             *
32             * On output, all accumulator words fit on 26 bits, except acc[1], which
33             * may be slightly larger (but by a very small amount only).
               *
               * Parameters:
               *   acc   five-word accumulator; loaded on entry, updated once per
               *         block, and stored back on exit
               *   r     five-word key; only read
               *   data  input bytes
               *   len   input length in bytes; when 0 the loop body never runs and
               *         acc is stored back unchanged
               *
               * Input is consumed in 16-byte blocks. A trailing partial block is
               * copied into a zeroed 16-byte temporary (i.e. right-padded with
               * zeros) and then handled exactly like a full block, so the 2^128
               * "high bit" is added to it as well.
34             */
35             static void
36 0           poly1305_inner(uint32_t *acc, const uint32_t *r, const void *data, size_t len)
37             {
38             /*
39             * Implementation notes: we split the 130-bit values into five
40             * 26-bit words. This gives us some space for carries.
41             *
42             * This code is inspired from the public-domain code available
43             * on:
44             * https://github.com/floodyberry/poly1305-donna
45             *
46             * Since we compute modulo 2^130-5, the "upper words" become
47             * low words with a factor of 5; that is, x*2^130 = x*5 mod p.
48             */
49             const unsigned char *buf;
50             uint32_t a0, a1, a2, a3, a4;
51             uint32_t r0, r1, r2, r3, r4;
52             uint32_t u1, u2, u3, u4;
53              
54 0           r0 = r[0];
55 0           r1 = r[1];
56 0           r2 = r[2];
57 0           r3 = r[3];
58 0           r4 = r[4];
59              
               /*
               * Key words pre-multiplied by 5. They are used for the partial
               * products whose weight reaches 2^130 or more, which wrap around
               * to the low words with a factor of 5 (see the notes above).
               */
60 0           u1 = r1 * 5;
61 0           u2 = r2 * 5;
62 0           u3 = r3 * 5;
63 0           u4 = r4 * 5;
64              
65 0           a0 = acc[0];
66 0           a1 = acc[1];
67 0           a2 = acc[2];
68 0           a3 = acc[3];
69 0           a4 = acc[4];
70              
71 0           buf = data;
72 0 0         while (len > 0) {
73             uint64_t w0, w1, w2, w3, w4;
74             uint64_t c;
75             unsigned char tmp[16];
76              
77             /*
78             * If there is a partial block, right-pad it with zeros.
               * Setting len to 16 makes the "len -= 16" at the bottom of
               * the loop reach 0, so this is necessarily the last iteration.
79             */
80 0 0         if (len < 16) {
81 0           memset(tmp, 0, sizeof tmp);
82 0           memcpy(tmp, buf, len);
83 0           buf = tmp;
84 0           len = 16;
85             }
86              
87             /*
88             * Decode next block and apply the "high bit"; that value
89             * is added to the accumulator.
               *
               * Word i of the block starts at bit 26*i. Reading 32 bits at
               * byte offset 3*i and shifting right by 2*i bits brings that
               * word down to bit 0 (8*3*i + 2*i = 26*i). The last word only
               * has 24 data bits (bits 104..127); OR-ing 0x01000000 sets
               * bit 24 of that word, i.e. bit 128 of the block value.
90             */
91 0           a0 += br_dec32le(buf) & 0x03FFFFFF;
92 0           a1 += (br_dec32le(buf + 3) >> 2) & 0x03FFFFFF;
93 0           a2 += (br_dec32le(buf + 6) >> 4) & 0x03FFFFFF;
94 0           a3 += (br_dec32le(buf + 9) >> 6) & 0x03FFFFFF;
95 0           a4 += (br_dec32le(buf + 12) >> 8) | 0x01000000;
96              
97             /*
98             * Compute multiplication.
               * M() widens both operands to 64 bits before multiplying, so
               * each w* is a sum of five 64-bit products.
99             */
100             #define M(x, y) ((uint64_t)(x) * (uint64_t)(y))
101              
102 0           w0 = M(a0, r0) + M(a1, u4) + M(a2, u3) + M(a3, u2) + M(a4, u1);
103 0           w1 = M(a0, r1) + M(a1, r0) + M(a2, u4) + M(a3, u3) + M(a4, u2);
104 0           w2 = M(a0, r2) + M(a1, r1) + M(a2, r0) + M(a3, u4) + M(a4, u3);
105 0           w3 = M(a0, r3) + M(a1, r2) + M(a2, r1) + M(a3, r0) + M(a4, u4);
106 0           w4 = M(a0, r4) + M(a1, r3) + M(a2, r2) + M(a3, r1) + M(a4, r0);
107              
108             #undef M
109             /*
110             * Perform some (partial) modular reduction. This step is
111             * enough to keep values in ranges such that there won't
112             * be carry overflows. Most of the reduction was done in
113             * the multiplication step (by using the 'u*' values, and
114             * using the fact that 2^130 = -5 mod p); here we perform
115             * some carry propagation.
                *
                * Carries ripple w0 -> w1 -> w2 -> w3 -> w4. The carry out of
                * w4 has weight 2^130, so it is folded back into a0 multiplied
                * by 5. The resulting overflow of a0 is pushed into a1 without
                * re-masking a1, which is why a1 may end up slightly above
                * 26 bits.
116             */
117 0           c = w0 >> 26;
118 0           a0 = (uint32_t)w0 & 0x3FFFFFF;
119 0           w1 += c;
120 0           c = w1 >> 26;
121 0           a1 = (uint32_t)w1 & 0x3FFFFFF;
122 0           w2 += c;
123 0           c = w2 >> 26;
124 0           a2 = (uint32_t)w2 & 0x3FFFFFF;
125 0           w3 += c;
126 0           c = w3 >> 26;
127 0           a3 = (uint32_t)w3 & 0x3FFFFFF;
128 0           w4 += c;
129 0           c = w4 >> 26;
130 0           a4 = (uint32_t)w4 & 0x3FFFFFF;
131 0           a0 += (uint32_t)c * 5;
132 0           a1 += a0 >> 26;
133 0           a0 &= 0x3FFFFFF;
134              
135 0           buf += 16;
136 0           len -= 16;
137             }
138              
                /* Store the updated accumulator back for the caller. */
139 0           acc[0] = a0;
140 0           acc[1] = a1;
141 0           acc[2] = a2;
142 0           acc[3] = a3;
143 0           acc[4] = a4;
144 0           }
145              
146             /*
                * see bearssl_block.h
                *
                * Combined ChaCha20 + Poly1305 processing, as implemented here:
                *
                *  1. A 32-byte one-time key (pkey) is produced by running
                *     ichacha() over a zeroed buffer with third argument 0
                *     (presumably the initial block counter -- confirm against
                *     the br_chacha20_run contract).
                *  2. If 'encrypt' is non-zero, data[] is first processed in place
                *     with ichacha() (third argument 1).
                *  3. Poly1305 is computed over aad, then data, then a 16-byte
                *     footer holding aad_len and len (each written with
                *     br_enc64le()).
                *  4. The 16-byte result is written to tag[].
                *  5. If 'encrypt' is zero, data[] is processed in place with
                *     ichacha() (third argument 1) after the MAC computation.
                *
                * No tag comparison is performed in this function: the computed
                * tag is written to tag[] in both modes.
                */
147             void
148 0           br_poly1305_ctmul_run(const void *key, const void *iv,
149             void *data, size_t len, const void *aad, size_t aad_len,
150             void *tag, br_chacha20_run ichacha, int encrypt)
151             {
152             unsigned char pkey[32], foot[16];
153             uint32_t r[5], acc[5], cc, ctl, hi;
154             uint64_t w;
155             int i;
156              
157             /*
158             * Compute the MAC key. The 'r' value is the first 16 bytes of
159             * pkey[].
160             */
161 0           memset(pkey, 0, sizeof pkey);
162 0           ichacha(key, iv, 0, pkey, sizeof pkey);
163              
164             /*
165             * If encrypting, ChaCha20 must run first, followed by Poly1305.
166             * When decrypting, the operations are reversed.
                * Either way, Poly1305 below always runs over the ciphertext.
167             */
168 0 0         if (encrypt) {
169 0           ichacha(key, iv, 1, data, len);
170             }
171              
172             /*
173             * Run Poly1305. We must process the AAD, then ciphertext, then
174             * the footer (with the lengths). Note that the AAD and ciphertext
175             * are meant to be padded with zeros up to the next multiple of 16,
176             * and the length of the footer is 16 bytes as well.
                * The zero padding is applied inside poly1305_inner(), which is
                * why it is called separately for each of the three parts.
177             */
178              
179             /*
180             * Decode the 'r' value into 26-bit words, with the "clamping"
181             * operation applied.
                * The byte offsets and shifts split the 128-bit value at bits
                * 0, 26, 52, 78 and 104; the masks both truncate each word and
                * clear the bits that clamping forces to zero.
182             */
183 0           r[0] = br_dec32le(pkey) & 0x03FFFFFF;
184 0           r[1] = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03;
185 0           r[2] = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF;
186 0           r[3] = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF;
187 0           r[4] = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;
188              
189             /*
190             * Accumulator is 0.
191             */
192 0           memset(acc, 0, sizeof acc);
193              
194             /*
195             * Process the additional authenticated data, ciphertext, and
196             * footer in due order.
197             */
198 0           br_enc64le(foot, (uint64_t)aad_len);
199 0           br_enc64le(foot + 8, (uint64_t)len);
200 0           poly1305_inner(acc, r, aad, aad_len);
201 0           poly1305_inner(acc, r, data, len);
202 0           poly1305_inner(acc, r, foot, sizeof foot);
203              
204             /*
205             * Finalise modular reduction. This is done with carry propagation
206             * and applying the '2^130 = -5 mod p' rule. Note that the output
207             * of poly1305_inner() is already mostly reduced, since only
208             * acc[1] may be (very slightly) above 2^26. A single loop back
209             * to acc[1] will be enough to make the value fit in 130 bits.
                *
                * The loop visits the words in the order 1, 2, 3, 4, 0, 1.
                *
                * NOTE(review): when the loop wraps from acc[4] to acc[0], the
                * carry 'cc' has weight 2^130 but is added to acc[0] as-is, not
                * multiplied by 5. This looks like it relies on that carry being
                * zero in practice (it is non-zero only if acc[2..4] are all
                * saturated when the excess of acc[1] reaches them) -- confirm.
210             */
211 0           cc = 0;
212 0 0         for (i = 1; i <= 6; i ++) {
213             int j;
214              
215 0 0         j = (i >= 5) ? i - 5 : i;
216 0           acc[j] += cc;
217 0           cc = acc[j] >> 26;
218 0           acc[j] &= 0x03FFFFFF;
219             }
220              
221             /*
222             * We may still have a value in the 2^130-5..2^130-1 range, in
223             * which case we must reduce it again. The code below selects,
224             * in constant-time, between 'acc' and 'acc-p'.
                *
                * 'ctl' is set when acc[0] > 0x03FFFFFA and acc[1..4] are all
                * 0x03FFFFFF, i.e. when acc >= p. Subtracting p is done by
                * adding 5 and dropping the carry out of the top word
                * (acc - p = acc + 5 - 2^130). GT(), EQ() and MUX() are not
                * defined in this file (presumably constant-time helpers from
                * inner.h, with MUX(ctl, x, y) yielding x when ctl is set).
225             */
226 0           ctl = GT(acc[0], 0x03FFFFFA);
227 0 0         for (i = 1; i < 5; i ++) {
228 0           ctl &= EQ(acc[i], 0x03FFFFFF);
229             }
230 0           cc = 5;
231 0 0         for (i = 0; i < 5; i ++) {
232             uint32_t t;
233              
234 0           t = (acc[i] + cc);
235 0           cc = t >> 26;
236 0           t &= 0x03FFFFFF;
237 0           acc[i] = MUX(ctl, t, acc[i]);
238             }
239              
240             /*
241             * Convert back the accumulator to 32-bit words, and add the
242             * 's' value (second half of pkey[]). That addition is done
243             * modulo 2^128.
                *
                * The 26-bit words start at bits 26, 52 (= 32 + 20),
                * 78 (= 64 + 14) and 104 (= 96 + 8), hence the shifts by 26,
                * 20, 14 and 8 relative to successive 32-bit output words. At
                * each step 'w >> 32' carries over both the leftover high bits
                * and the addition carry. The last word is computed in a
                * uint32_t ('hi'), which discards anything above bit 127.
244             */
245 0           w = (uint64_t)acc[0] + ((uint64_t)acc[1] << 26) + br_dec32le(pkey + 16);
246 0           br_enc32le((unsigned char *)tag, (uint32_t)w);
247 0           w = (w >> 32) + ((uint64_t)acc[2] << 20) + br_dec32le(pkey + 20);
248 0           br_enc32le((unsigned char *)tag + 4, (uint32_t)w);
249 0           w = (w >> 32) + ((uint64_t)acc[3] << 14) + br_dec32le(pkey + 24);
250 0           br_enc32le((unsigned char *)tag + 8, (uint32_t)w);
251 0           hi = (uint32_t)(w >> 32) + (acc[4] << 8) + br_dec32le(pkey + 28);
252 0           br_enc32le((unsigned char *)tag + 12, hi);
253              
254             /*
255             * If decrypting, then ChaCha20 runs _after_ Poly1305.
256             */
257 0 0         if (!encrypt) {
258 0           ichacha(key, iv, 1, data, len);
259             }
260 0           }