File Coverage

src/symcipher/poly1305_ctmulq.c
Criterion Covered Total %
statement 120 204 58.8
branch 10 14 71.4
condition n/a
subroutine n/a
pod n/a
total 130 218 59.6


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2017 Thomas Pornin
3             *
4             * Permission is hereby granted, free of charge, to any person obtaining
5             * a copy of this software and associated documentation files (the
6             * "Software"), to deal in the Software without restriction, including
7             * without limitation the rights to use, copy, modify, merge, publish,
8             * distribute, sublicense, and/or sell copies of the Software, and to
9             * permit persons to whom the Software is furnished to do so, subject to
10             * the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be
13             * included in all copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16             * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17             * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18             * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19             * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20             * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21             * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22             * SOFTWARE.
23             */
24              
25             #include "inner.h"
26              
27             #if BR_INT128 || BR_UMUL128
28              
29             #if BR_INT128
30              
31             #define MUL128(hi, lo, x, y) do { \
32             unsigned __int128 mul128tmp; \
33             mul128tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \
34             (hi) = (uint64_t)(mul128tmp >> 64); \
35             (lo) = (uint64_t)mul128tmp; \
36             } while (0)
37              
38             #elif BR_UMUL128
39              
40             #include
41              
42             #define MUL128(hi, lo, x, y) do { \
43             (lo) = _umul128((x), (y), &(hi)); \
44             } while (0)
45              
46             #endif
47              
48             #define MASK42 ((uint64_t)0x000003FFFFFFFFFF)
49             #define MASK44 ((uint64_t)0x00000FFFFFFFFFFF)
50              
51             /*
52             * The "accumulator" word is nominally a 130-bit value. We split it into
53             * words of 44 bits, each held in a 64-bit variable.
54             *
55             * If the current accumulator is a = a0 + a1*W + a2*W^2 (where W = 2^44)
56             * and r = r0 + r1*W + r2*W^2, then:
57             *
58             * a*r = (a0*r0)
59             * + (a0*r1 + a1*r0) * W
60             * + (a0*r2 + a1*r1 + a2*r0) * W^2
61             * + (a1*r2 + a2*r1) * W^3
62             * + (a2*r2) * W^4
63             *
64             * We want to reduce that value modulo p = 2^130-5, so W^3 = 20 mod p,
65             * and W^4 = 20*W mod p. Thus, if we define u1 = 20*r1 and u2 = 20*r2,
66             * then the equations above become:
67             *
68             * b0 = a0*r0 + a1*u2 + a2*u1
69             * b1 = a0*r1 + a1*r0 + a2*u2
70             * b2 = a0*r2 + a1*r1 + a2*r0
71             *
72             * In order to make u1 fit in 44 bits, we can change these equations
73             * into:
74             *
75             * b0 = a0*r0 + a1*u2 + a2*t1
76             * b1 = a0*r1 + a1*r0 + a2*t2
77             * b2 = a0*r2 + a1*r1 + a2*r0
78             *
79             * Where t1 is u1 truncated to 44 bits, and t2 is u2 added to the extra
80             * bits of u1. Note that since r is clamped down to a 124-bit value, the
81             * values u2 and t2 fit on 44 bits too.
82             *
83             * The bx values are larger than 44 bits, so we may split them into a
84             * lower half (cx, 44 bits) and an upper half (dx). The new values for
85             * the accumulator are then:
86             *
87             * e0 = c0 + 20*d2
88             * e1 = c1 + d0
89             * e2 = c2 + d1
90             *
91             * The equations allow for some room, i.e. the ax values may be larger
92             * than 44 bits. Similarly, the ex values will usually be larger than
93             * the ax. Thus, some sort of carry propagation must be done regularly,
94             * though not necessarily at each iteration. In particular, we do not
95             * need to compute the additions (for the bx values) over 128-bit
96             * quantities; we can stick to 64-bit computations.
97             *
98             *
99             * Since the 128-bit result of a 64x64 multiplication is actually
100             * represented over two 64-bit registers, it is cheaper to arrange for
101             * any split that happens between the "high" and "low" halves to be on
102             * that 64-bit boundary. This is done by left shifting the rx, ux and tx
103             * by 20 bits (since they all fit on 44 bits each, this shift is
104             * always possible).
105             */
106              
107             static void
108 0           poly1305_inner_big(uint64_t *acc, uint64_t *r, const void *data, size_t len)
109             {
110              
111             #define MX(hi, lo, m0, m1, m2) do { \
112             uint64_t mxhi, mxlo; \
113             MUL128(mxhi, mxlo, a0, m0); \
114             (hi) = mxhi; \
115             (lo) = mxlo >> 20; \
116             MUL128(mxhi, mxlo, a1, m1); \
117             (hi) += mxhi; \
118             (lo) += mxlo >> 20; \
119             MUL128(mxhi, mxlo, a2, m2); \
120             (hi) += mxhi; \
121             (lo) += mxlo >> 20; \
122             } while (0)
123              
124             const unsigned char *buf;
125             uint64_t a0, a1, a2;
126             uint64_t r0, r1, r2, t1, t2, u2;
127              
128 0           r0 = r[0];
129 0           r1 = r[1];
130 0           r2 = r[2];
131 0           t1 = r[3];
132 0           t2 = r[4];
133 0           u2 = r[5];
134 0           a0 = acc[0];
135 0           a1 = acc[1];
136 0           a2 = acc[2];
137 0           buf = data;
138              
139 0 0         while (len > 0) {
140             uint64_t v0, v1, v2;
141             uint64_t c0, c1, c2, d0, d1, d2;
142              
143 0           v0 = br_dec64le(buf + 0);
144 0           v1 = br_dec64le(buf + 8);
145 0           v2 = v1 >> 24;
146 0           v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;
147 0           v0 &= MASK44;
148 0           a0 += v0;
149 0           a1 += v1;
150 0           a2 += v2 + ((uint64_t)1 << 40);
151 0           MX(d0, c0, r0, u2, t1);
152 0           MX(d1, c1, r1, r0, t2);
153 0           MX(d2, c2, r2, r1, r0);
154 0           a0 = c0 + 20 * d2;
155 0           a1 = c1 + d0;
156 0           a2 = c2 + d1;
157              
158 0           v0 = br_dec64le(buf + 16);
159 0           v1 = br_dec64le(buf + 24);
160 0           v2 = v1 >> 24;
161 0           v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;
162 0           v0 &= MASK44;
163 0           a0 += v0;
164 0           a1 += v1;
165 0           a2 += v2 + ((uint64_t)1 << 40);
166 0           MX(d0, c0, r0, u2, t1);
167 0           MX(d1, c1, r1, r0, t2);
168 0           MX(d2, c2, r2, r1, r0);
169 0           a0 = c0 + 20 * d2;
170 0           a1 = c1 + d0;
171 0           a2 = c2 + d1;
172              
173 0           v0 = br_dec64le(buf + 32);
174 0           v1 = br_dec64le(buf + 40);
175 0           v2 = v1 >> 24;
176 0           v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;
177 0           v0 &= MASK44;
178 0           a0 += v0;
179 0           a1 += v1;
180 0           a2 += v2 + ((uint64_t)1 << 40);
181 0           MX(d0, c0, r0, u2, t1);
182 0           MX(d1, c1, r1, r0, t2);
183 0           MX(d2, c2, r2, r1, r0);
184 0           a0 = c0 + 20 * d2;
185 0           a1 = c1 + d0;
186 0           a2 = c2 + d1;
187              
188 0           v0 = br_dec64le(buf + 48);
189 0           v1 = br_dec64le(buf + 56);
190 0           v2 = v1 >> 24;
191 0           v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;
192 0           v0 &= MASK44;
193 0           a0 += v0;
194 0           a1 += v1;
195 0           a2 += v2 + ((uint64_t)1 << 40);
196 0           MX(d0, c0, r0, u2, t1);
197 0           MX(d1, c1, r1, r0, t2);
198 0           MX(d2, c2, r2, r1, r0);
199 0           a0 = c0 + 20 * d2;
200 0           a1 = c1 + d0;
201 0           a2 = c2 + d1;
202              
203 0           a1 += a0 >> 44;
204 0           a0 &= MASK44;
205 0           a2 += a1 >> 44;
206 0           a1 &= MASK44;
207 0           a0 += 20 * (a2 >> 44);
208 0           a2 &= MASK44;
209              
210 0           buf += 64;
211 0           len -= 64;
212             }
213 0           acc[0] = a0;
214 0           acc[1] = a1;
215 0           acc[2] = a2;
216              
217             #undef MX
218 0           }
219              
220             static void
221 36           poly1305_inner_small(uint64_t *acc, uint64_t *r, const void *data, size_t len)
222             {
223             const unsigned char *buf;
224             uint64_t a0, a1, a2;
225             uint64_t r0, r1, r2, t1, t2, u2;
226              
227 36           r0 = r[0];
228 36           r1 = r[1];
229 36           r2 = r[2];
230 36           t1 = r[3];
231 36           t2 = r[4];
232 36           u2 = r[5];
233 36           a0 = acc[0];
234 36           a1 = acc[1];
235 36           a2 = acc[2];
236 36           buf = data;
237              
238 72 100         while (len > 0) {
239             uint64_t v0, v1, v2;
240             uint64_t c0, c1, c2, d0, d1, d2;
241             unsigned char tmp[16];
242              
243 36 100         if (len < 16) {
244 20           memcpy(tmp, buf, len);
245 20           memset(tmp + len, 0, (sizeof tmp) - len);
246 20           buf = tmp;
247 20           len = 16;
248             }
249 36           v0 = br_dec64le(buf + 0);
250 36           v1 = br_dec64le(buf + 8);
251              
252 36           v2 = v1 >> 24;
253 36           v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;
254 36           v0 &= MASK44;
255              
256 36           a0 += v0;
257 36           a1 += v1;
258 36           a2 += v2 + ((uint64_t)1 << 40);
259              
260             #define MX(hi, lo, m0, m1, m2) do { \
261             uint64_t mxhi, mxlo; \
262             MUL128(mxhi, mxlo, a0, m0); \
263             (hi) = mxhi; \
264             (lo) = mxlo >> 20; \
265             MUL128(mxhi, mxlo, a1, m1); \
266             (hi) += mxhi; \
267             (lo) += mxlo >> 20; \
268             MUL128(mxhi, mxlo, a2, m2); \
269             (hi) += mxhi; \
270             (lo) += mxlo >> 20; \
271             } while (0)
272              
273 36           MX(d0, c0, r0, u2, t1);
274 36           MX(d1, c1, r1, r0, t2);
275 36           MX(d2, c2, r2, r1, r0);
276              
277             #undef MX
278              
279 36           a0 = c0 + 20 * d2;
280 36           a1 = c1 + d0;
281 36           a2 = c2 + d1;
282              
283 36           a1 += a0 >> 44;
284 36           a0 &= MASK44;
285 36           a2 += a1 >> 44;
286 36           a1 &= MASK44;
287 36           a0 += 20 * (a2 >> 44);
288 36           a2 &= MASK44;
289              
290 36           buf += 16;
291 36           len -= 16;
292             }
293 36           acc[0] = a0;
294 36           acc[1] = a1;
295 36           acc[2] = a2;
296 36           }
297              
298             static inline void
299 24           poly1305_inner(uint64_t *acc, uint64_t *r, const void *data, size_t len)
300             {
301 24 50         if (len >= 64) {
302             size_t len2;
303              
304 0           len2 = len & ~(size_t)63;
305 0           poly1305_inner_big(acc, r, data, len2);
306 0           data = (const unsigned char *)data + len2;
307 0           len -= len2;
308             }
309 24 50         if (len > 0) {
310 24           poly1305_inner_small(acc, r, data, len);
311             }
312 24           }
313              
314             /* see bearssl_block.h */
315             void
316 12           br_poly1305_ctmulq_run(const void *key, const void *iv,
317             void *data, size_t len, const void *aad, size_t aad_len,
318             void *tag, br_chacha20_run ichacha, int encrypt)
319             {
320             unsigned char pkey[32], foot[16];
321             uint64_t r[6], acc[3], r0, r1;
322             uint32_t v0, v1, v2, v3, v4;
323             uint64_t w0, w1, w2, w3;
324             uint32_t ctl;
325              
326             /*
327             * Compute the MAC key. The 'r' value is the first 16 bytes of
328             * pkey[].
329             */
330 12           memset(pkey, 0, sizeof pkey);
331 12           ichacha(key, iv, 0, pkey, sizeof pkey);
332              
333             /*
334             * If encrypting, ChaCha20 must run first, followed by Poly1305.
335             * When decrypting, the operations are reversed.
336             */
337 12 100         if (encrypt) {
338 6           ichacha(key, iv, 1, data, len);
339             }
340              
341             /*
342             * Run Poly1305. We must process the AAD, then ciphertext, then
343             * the footer (with the lengths). Note that the AAD and ciphertext
344             * are meant to be padded with zeros up to the next multiple of 16,
345             * and the length of the footer is 16 bytes as well.
346             */
347              
348             /*
349             * Apply the "clamping" on r.
350             */
351 12           pkey[ 3] &= 0x0F;
352 12           pkey[ 4] &= 0xFC;
353 12           pkey[ 7] &= 0x0F;
354 12           pkey[ 8] &= 0xFC;
355 12           pkey[11] &= 0x0F;
356 12           pkey[12] &= 0xFC;
357 12           pkey[15] &= 0x0F;
358              
359             /*
360             * Decode the 'r' value into 44-bit words, left-shifted by 20 bits.
361             * Also compute the u1 and u2 values.
362             */
363 12           r0 = br_dec64le(pkey + 0);
364 12           r1 = br_dec64le(pkey + 8);
365 12           r[0] = r0 << 20;
366 12           r[1] = ((r0 >> 24) | (r1 << 40)) & ~(uint64_t)0xFFFFF;
367 12           r[2] = (r1 >> 4) & ~(uint64_t)0xFFFFF;
368 12           r1 = 20 * (r[1] >> 20);
369 12           r[3] = r1 << 20;
370 12           r[5] = 20 * r[2];
371 12           r[4] = (r[5] + (r1 >> 24)) & ~(uint64_t)0xFFFFF;
372              
373             /*
374             * Accumulator is 0.
375             */
376 12           acc[0] = 0;
377 12           acc[1] = 0;
378 12           acc[2] = 0;
379              
380             /*
381             * Process the additional authenticated data, ciphertext, and
382             * footer in due order.
383             */
384 12           br_enc64le(foot, (uint64_t)aad_len);
385 12           br_enc64le(foot + 8, (uint64_t)len);
386 12           poly1305_inner(acc, r, aad, aad_len);
387 12           poly1305_inner(acc, r, data, len);
388 12           poly1305_inner_small(acc, r, foot, sizeof foot);
389              
390             /*
391             * Finalise modular reduction. At that point, the value consists
392             * in three 44-bit values (the lowest one might be slightly above
393             * 2^44). Two loops shall be sufficient.
394             */
395 12           acc[1] += (acc[0] >> 44);
396 12           acc[0] &= MASK44;
397 12           acc[2] += (acc[1] >> 44);
398 12           acc[1] &= MASK44;
399 12           acc[0] += 5 * (acc[2] >> 42);
400 12           acc[2] &= MASK42;
401 12           acc[1] += (acc[0] >> 44);
402 12           acc[0] &= MASK44;
403 12           acc[2] += (acc[1] >> 44);
404 12           acc[1] &= MASK44;
405 12           acc[0] += 5 * (acc[2] >> 42);
406 12           acc[2] &= MASK42;
407              
408             /*
409             * The value may still fall in the 2^130-5..2^130-1 range, in
410             * which case we must reduce it again. The code below selects,
411             * in constant-time, between 'acc' and 'acc-p'. We encode the
412             * value over four 32-bit integers to finish the operation.
413             */
414 12           v0 = (uint32_t)acc[0];
415 12           v1 = (uint32_t)(acc[0] >> 32) | ((uint32_t)acc[1] << 12);
416 12           v2 = (uint32_t)(acc[1] >> 20) | ((uint32_t)acc[2] << 24);
417 12           v3 = (uint32_t)(acc[2] >> 8);
418 12           v4 = (uint32_t)(acc[2] >> 40);
419              
420 12           ctl = GT(v0, 0xFFFFFFFA);
421 12           ctl &= EQ(v1, 0xFFFFFFFF);
422 12           ctl &= EQ(v2, 0xFFFFFFFF);
423 12           ctl &= EQ(v3, 0xFFFFFFFF);
424 12           ctl &= EQ(v4, 0x00000003);
425 12           v0 = MUX(ctl, v0 + 5, v0);
426 12           v1 = MUX(ctl, 0, v1);
427 12           v2 = MUX(ctl, 0, v2);
428 12           v3 = MUX(ctl, 0, v3);
429              
430             /*
431             * Add the "s" value. This is done modulo 2^128. Don't forget
432             * carry propagation...
433             */
434 12           w0 = (uint64_t)v0 + (uint64_t)br_dec32le(pkey + 16);
435 12           w1 = (uint64_t)v1 + (uint64_t)br_dec32le(pkey + 20) + (w0 >> 32);
436 12           w2 = (uint64_t)v2 + (uint64_t)br_dec32le(pkey + 24) + (w1 >> 32);
437 12           w3 = (uint64_t)v3 + (uint64_t)br_dec32le(pkey + 28) + (w2 >> 32);
438 12           v0 = (uint32_t)w0;
439 12           v1 = (uint32_t)w1;
440 12           v2 = (uint32_t)w2;
441 12           v3 = (uint32_t)w3;
442              
443             /*
444             * Encode the tag.
445             */
446 12           br_enc32le((unsigned char *)tag + 0, v0);
447 12           br_enc32le((unsigned char *)tag + 4, v1);
448 12           br_enc32le((unsigned char *)tag + 8, v2);
449 12           br_enc32le((unsigned char *)tag + 12, v3);
450              
451             /*
452             * If decrypting, then ChaCha20 runs _after_ Poly1305.
453             */
454 12 100         if (!encrypt) {
455 6           ichacha(key, iv, 1, data, len);
456             }
457 12           }
458              
459             /* see bearssl_block.h */
460             br_poly1305_run
461 2           br_poly1305_ctmulq_get(void)
462             {
463 2           return &br_poly1305_ctmulq_run;
464             }
465              
466             #else
467              
468             /* see bearssl_block.h */
469             br_poly1305_run
470             br_poly1305_ctmulq_get(void)
471             {
472             return 0;
473             }
474              
475             #endif