File Coverage

src/symcipher/poly1305_ctmul32.c
Criterion Covered Total %
statement 0 129 0.0
branch 0 20 0.0
condition n/a
subroutine n/a
pod n/a
total 0 149 0.0


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2017 Thomas Pornin
3             *
4             * Permission is hereby granted, free of charge, to any person obtaining
5             * a copy of this software and associated documentation files (the
6             * "Software"), to deal in the Software without restriction, including
7             * without limitation the rights to use, copy, modify, merge, publish,
8             * distribute, sublicense, and/or sell copies of the Software, and to
9             * permit persons to whom the Software is furnished to do so, subject to
10             * the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be
13             * included in all copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16             * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17             * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18             * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19             * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20             * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21             * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22             * SOFTWARE.
23             */
24              
25             #include "inner.h"
26              
27             /*
28             * Perform the inner processing of blocks for Poly1305.
29             */
30             static void
31 0           poly1305_inner(uint32_t *a, const uint32_t *r, const void *data, size_t len)
32             {
33             /*
34             * Implementation notes: we split the 130-bit values into ten
35             * 13-bit words. This gives us some space for carries and allows
36             * using only 32x32->32 multiplications, which are way faster than
37             * 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also
38             * help in making constant-time code on the Cortex-M3.
39             *
40             * Since we compute modulo 2^130-5, the "upper words" become
41             * low words with a factor of 5; that is, x*2^130 = x*5 mod p.
42             * This has already been integrated in the r[] array, which
43             * is extended to the 0..18 range.
44             *
45             * In each loop iteration, a[] and r[] words are 13-bit each,
46             * except a[1] which may use 14 bits.
47             */
48             const unsigned char *buf;
49              
50 0           buf = data;
51 0 0         while (len > 0) {
52             unsigned char tmp[16];
53             uint32_t b[10];
54             unsigned u, v;
55             uint32_t z, cc1, cc2;
56              
57             /*
58             * If there is a partial block, right-pad it with zeros.
59             */
60 0 0         if (len < 16) {
61 0           memset(tmp, 0, sizeof tmp);
62 0           memcpy(tmp, buf, len);
63 0           buf = tmp;
64 0           len = 16;
65             }
66              
67             /*
68             * Decode next block and apply the "high bit"; that value
69             * is added to the accumulator.
70             */
71 0           v = br_dec16le(buf);
72 0           a[0] += v & 0x01FFF;
73 0           v >>= 13;
74 0           v |= buf[2] << 3;
75 0           v |= buf[3] << 11;
76 0           a[1] += v & 0x01FFF;
77 0           v >>= 13;
78 0           v |= buf[4] << 6;
79 0           a[2] += v & 0x01FFF;
80 0           v >>= 13;
81 0           v |= buf[5] << 1;
82 0           v |= buf[6] << 9;
83 0           a[3] += v & 0x01FFF;
84 0           v >>= 13;
85 0           v |= buf[7] << 4;
86 0           v |= buf[8] << 12;
87 0           a[4] += v & 0x01FFF;
88 0           v >>= 13;
89 0           v |= buf[9] << 7;
90 0           a[5] += v & 0x01FFF;
91 0           v >>= 13;
92 0           v |= buf[10] << 2;
93 0           v |= buf[11] << 10;
94 0           a[6] += v & 0x01FFF;
95 0           v >>= 13;
96 0           v |= buf[12] << 5;
97 0           a[7] += v & 0x01FFF;
98 0           v = br_dec16le(buf + 13);
99 0           a[8] += v & 0x01FFF;
100 0           v >>= 13;
101 0           v |= buf[15] << 3;
102 0           a[9] += v | 0x00800;
103              
104             /*
105             * At that point, all a[] values fit on 14 bits, while
106             * all r[] values fit on 13 bits. Thus products fit on
107             * 27 bits, and we can accumulate up to 31 of them in
108             * a 32-bit word and still have some room for carries.
109             */
110              
111             /*
112             * Now a[] contains words with values up to 14 bits each.
113             * We perform the multiplication with r[].
114             *
115             * The extended words of r[] may be larger than 13 bits
116             * (they are 5 times a 13-bit word) so the full summation
117             * may yield values up to 46 times a 27-bit word, which
118             * does not fit on a 32-bit word. To avoid that issue, we
119             * must split the loop below in two, with a carry
120             * propagation operation in the middle.
121             */
122 0           cc1 = 0;
123 0 0         for (u = 0; u < 10; u ++) {
124             uint32_t s;
125              
126 0           s = cc1
127 0           + MUL15(a[0], r[u + 9 - 0])
128 0           + MUL15(a[1], r[u + 9 - 1])
129 0           + MUL15(a[2], r[u + 9 - 2])
130 0           + MUL15(a[3], r[u + 9 - 3])
131 0           + MUL15(a[4], r[u + 9 - 4]);
132 0           b[u] = s & 0x1FFF;
133 0           cc1 = s >> 13;
134             }
135 0           cc2 = 0;
136 0 0         for (u = 0; u < 10; u ++) {
137             uint32_t s;
138              
139 0           s = b[u] + cc2
140 0           + MUL15(a[5], r[u + 9 - 5])
141 0           + MUL15(a[6], r[u + 9 - 6])
142 0           + MUL15(a[7], r[u + 9 - 7])
143 0           + MUL15(a[8], r[u + 9 - 8])
144 0           + MUL15(a[9], r[u + 9 - 9]);
145 0           b[u] = s & 0x1FFF;
146 0           cc2 = s >> 13;
147             }
148 0           memcpy(a, b, sizeof b);
149              
150             /*
151             * The two carries "loop back" with a factor of 5. We
152             * propagate them into a[0] and a[1].
153             */
154 0           z = cc1 + cc2;
155 0           z += (z << 2) + a[0];
156 0           a[0] = z & 0x1FFF;
157 0           a[1] += z >> 13;
158              
159 0           buf += 16;
160 0           len -= 16;
161             }
162 0           }
163              
164             /* see bearssl_block.h */
165             void
166 0           br_poly1305_ctmul32_run(const void *key, const void *iv,
167             void *data, size_t len, const void *aad, size_t aad_len,
168             void *tag, br_chacha20_run ichacha, int encrypt)
169             {
170             unsigned char pkey[32], foot[16];
171             uint32_t z, r[19], acc[10], cc, ctl;
172             int i;
173              
174             /*
175             * Compute the MAC key. The 'r' value is the first 16 bytes of
176             * pkey[].
177             */
178 0           memset(pkey, 0, sizeof pkey);
179 0           ichacha(key, iv, 0, pkey, sizeof pkey);
180              
181             /*
182             * If encrypting, ChaCha20 must run first, followed by Poly1305.
183             * When decrypting, the operations are reversed.
184             */
185 0 0         if (encrypt) {
186 0           ichacha(key, iv, 1, data, len);
187             }
188              
189             /*
190             * Run Poly1305. We must process the AAD, then ciphertext, then
191             * the footer (with the lengths). Note that the AAD and ciphertext
192             * are meant to be padded with zeros up to the next multiple of 16,
193             * and the length of the footer is 16 bytes as well.
194             */
195              
196             /*
197             * Decode the 'r' value into 13-bit words, with the "clamping"
198             * operation applied.
199             */
200 0           z = br_dec32le(pkey) & 0x03FFFFFF;
201 0           r[9] = z & 0x1FFF;
202 0           r[10] = z >> 13;
203 0           z = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03;
204 0           r[11] = z & 0x1FFF;
205 0           r[12] = z >> 13;
206 0           z = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF;
207 0           r[13] = z & 0x1FFF;
208 0           r[14] = z >> 13;
209 0           z = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF;
210 0           r[15] = z & 0x1FFF;
211 0           r[16] = z >> 13;
212 0           z = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;
213 0           r[17] = z & 0x1FFF;
214 0           r[18] = z >> 13;
215              
216             /*
217             * Extend r[] with the 5x factor pre-applied.
218             */
219 0 0         for (i = 0; i < 9; i ++) {
220 0           r[i] = MUL15(5, r[i + 10]);
221             }
222              
223             /*
224             * Accumulator is 0.
225             */
226 0           memset(acc, 0, sizeof acc);
227              
228             /*
229             * Process the additional authenticated data, ciphertext, and
230             * footer in due order.
231             */
232 0           br_enc64le(foot, (uint64_t)aad_len);
233 0           br_enc64le(foot + 8, (uint64_t)len);
234 0           poly1305_inner(acc, r, aad, aad_len);
235 0           poly1305_inner(acc, r, data, len);
236 0           poly1305_inner(acc, r, foot, sizeof foot);
237              
238             /*
239             * Finalise modular reduction. This is done with carry propagation
240             * and applying the '2^130 = -5 mod p' rule. Note that the output
241             * of poly1035_inner() is already mostly reduced, since only
242             * acc[1] may be (very slightly) above 2^13. A single loop back
243             * to acc[1] will be enough to make the value fit in 130 bits.
244             */
245 0           cc = 0;
246 0 0         for (i = 1; i < 10; i ++) {
247 0           z = acc[i] + cc;
248 0           acc[i] = z & 0x1FFF;
249 0           cc = z >> 13;
250             }
251 0           z = acc[0] + cc + (cc << 2);
252 0           acc[0] = z & 0x1FFF;
253 0           acc[1] += z >> 13;
254              
255             /*
256             * We may still have a value in the 2^130-5..2^130-1 range, in
257             * which case we must reduce it again. The code below selects,
258             * in constant-time, between 'acc' and 'acc-p',
259             */
260 0           ctl = GT(acc[0], 0x1FFA);
261 0 0         for (i = 1; i < 10; i ++) {
262 0           ctl &= EQ(acc[i], 0x1FFF);
263             }
264 0           acc[0] = MUX(ctl, acc[0] - 0x1FFB, acc[0]);
265 0 0         for (i = 1; i < 10; i ++) {
266 0           acc[i] &= ~(-ctl);
267             }
268              
269             /*
270             * Convert back the accumulator to 32-bit words, and add the
271             * 's' value (second half of pkey[]). That addition is done
272             * modulo 2^128.
273             */
274 0           z = acc[0] + (acc[1] << 13) + br_dec16le(pkey + 16);
275 0           br_enc16le((unsigned char *)tag, z & 0xFFFF);
276 0           z = (z >> 16) + (acc[2] << 10) + br_dec16le(pkey + 18);
277 0           br_enc16le((unsigned char *)tag + 2, z & 0xFFFF);
278 0           z = (z >> 16) + (acc[3] << 7) + br_dec16le(pkey + 20);
279 0           br_enc16le((unsigned char *)tag + 4, z & 0xFFFF);
280 0           z = (z >> 16) + (acc[4] << 4) + br_dec16le(pkey + 22);
281 0           br_enc16le((unsigned char *)tag + 6, z & 0xFFFF);
282 0           z = (z >> 16) + (acc[5] << 1) + (acc[6] << 14) + br_dec16le(pkey + 24);
283 0           br_enc16le((unsigned char *)tag + 8, z & 0xFFFF);
284 0           z = (z >> 16) + (acc[7] << 11) + br_dec16le(pkey + 26);
285 0           br_enc16le((unsigned char *)tag + 10, z & 0xFFFF);
286 0           z = (z >> 16) + (acc[8] << 8) + br_dec16le(pkey + 28);
287 0           br_enc16le((unsigned char *)tag + 12, z & 0xFFFF);
288 0           z = (z >> 16) + (acc[9] << 5) + br_dec16le(pkey + 30);
289 0           br_enc16le((unsigned char *)tag + 14, z & 0xFFFF);
290              
291             /*
292             * If decrypting, then ChaCha20 runs _after_ Poly1305.
293             */
294 0 0         if (!encrypt) {
295 0           ichacha(key, iv, 1, data, len);
296             }
297 0           }