File Coverage

src/symcipher/poly1305_ctmul32.c

Criterion	Covered	Total	%
statement	0	129	0.0
branch	0	20	0.0
condition			n/a
subroutine			n/a
pod			n/a
total	0	149	0.0

line	stmt	bran	code
1			/*
2			* Copyright (c) 2017 Thomas Pornin
3			*
4			* Permission is hereby granted, free of charge, to any person obtaining
5			* a copy of this software and associated documentation files (the
6			* "Software"), to deal in the Software without restriction, including
7			* without limitation the rights to use, copy, modify, merge, publish,
8			* distribute, sublicense, and/or sell copies of the Software, and to
9			* permit persons to whom the Software is furnished to do so, subject to
10			* the following conditions:
11			*
12			* The above copyright notice and this permission notice shall be
13			* included in all copies or substantial portions of the Software.
14			*
15			* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16			* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17			* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18			* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19			* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20			* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21			* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22			* SOFTWARE.
23			*/
24
25			#include "inner.h"
26
27			/*
28			* Perform the inner processing of blocks for Poly1305.
29			*/
30			static void
31	0		poly1305_inner(uint32_t a, const uint32_t r, const void *data, size_t len)
32			{
33			/*
34			* Implementation notes: we split the 130-bit values into ten
35			* 13-bit words. This gives us some space for carries and allows
36			* using only 32x32->32 multiplications, which are way faster than
37			* 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also
38			* help in making constant-time code on the Cortex-M3.
39			*
40			* Since we compute modulo 2^130-5, the "upper words" become
41			* low words with a factor of 5; that is, x2^130 = x5 mod p.
42			* This has already been integrated in the r[] array, which
43			* is extended to the 0..18 range.
44			*
45			* In each loop iteration, a[] and r[] words are 13-bit each,
46			* except a[1] which may use 14 bits.
47			*/
48			const unsigned char *buf;
49
50	0		buf = data;
51	0	0	while (len > 0) {
52			unsigned char tmp[16];
53			uint32_t b[10];
54			unsigned u, v;
55			uint32_t z, cc1, cc2;
56
57			/*
58			* If there is a partial block, right-pad it with zeros.
59			*/
60	0	0	if (len < 16) {
61	0		memset(tmp, 0, sizeof tmp);
62	0		memcpy(tmp, buf, len);
63	0		buf = tmp;
64	0		len = 16;
65			}
66
67			/*
68			* Decode next block and apply the "high bit"; that value
69			* is added to the accumulator.
70			*/
71	0		v = br_dec16le(buf);
72	0		a[0] += v & 0x01FFF;
73	0		v >>= 13;
74	0		v \|= buf[2] << 3;
75	0		v \|= buf[3] << 11;
76	0		a[1] += v & 0x01FFF;
77	0		v >>= 13;
78	0		v \|= buf[4] << 6;
79	0		a[2] += v & 0x01FFF;
80	0		v >>= 13;
81	0		v \|= buf[5] << 1;
82	0		v \|= buf[6] << 9;
83	0		a[3] += v & 0x01FFF;
84	0		v >>= 13;
85	0		v \|= buf[7] << 4;
86	0		v \|= buf[8] << 12;
87	0		a[4] += v & 0x01FFF;
88	0		v >>= 13;
89	0		v \|= buf[9] << 7;
90	0		a[5] += v & 0x01FFF;
91	0		v >>= 13;
92	0		v \|= buf[10] << 2;
93	0		v \|= buf[11] << 10;
94	0		a[6] += v & 0x01FFF;
95	0		v >>= 13;
96	0		v \|= buf[12] << 5;
97	0		a[7] += v & 0x01FFF;
98	0		v = br_dec16le(buf + 13);
99	0		a[8] += v & 0x01FFF;
100	0		v >>= 13;
101	0		v \|= buf[15] << 3;
102	0		a[9] += v \| 0x00800;
103
104			/*
105			* At that point, all a[] values fit on 14 bits, while
106			* all r[] values fit on 13 bits. Thus products fit on
107			* 27 bits, and we can accumulate up to 31 of them in
108			* a 32-bit word and still have some room for carries.
109			*/
110
111			/*
112			* Now a[] contains words with values up to 14 bits each.
113			* We perform the multiplication with r[].
114			*
115			* The extended words of r[] may be larger than 13 bits
116			* (they are 5 times a 13-bit word) so the full summation
117			* may yield values up to 46 times a 27-bit word, which
118			* does not fit on a 32-bit word. To avoid that issue, we
119			* must split the loop below in two, with a carry
120			* propagation operation in the middle.
121			*/
122	0		cc1 = 0;
123	0	0	for (u = 0; u < 10; u ++) {
124			uint32_t s;
125
126	0		s = cc1
127	0		+ MUL15(a[0], r[u + 9 - 0])
128	0		+ MUL15(a[1], r[u + 9 - 1])
129	0		+ MUL15(a[2], r[u + 9 - 2])
130	0		+ MUL15(a[3], r[u + 9 - 3])
131	0		+ MUL15(a[4], r[u + 9 - 4]);
132	0		b[u] = s & 0x1FFF;
133	0		cc1 = s >> 13;
134			}
135	0		cc2 = 0;
136	0	0	for (u = 0; u < 10; u ++) {
137			uint32_t s;
138
139	0		s = b[u] + cc2
140	0		+ MUL15(a[5], r[u + 9 - 5])
141	0		+ MUL15(a[6], r[u + 9 - 6])
142	0		+ MUL15(a[7], r[u + 9 - 7])
143	0		+ MUL15(a[8], r[u + 9 - 8])
144	0		+ MUL15(a[9], r[u + 9 - 9]);
145	0		b[u] = s & 0x1FFF;
146	0		cc2 = s >> 13;
147			}
148	0		memcpy(a, b, sizeof b);
149
150			/*
151			* The two carries "loop back" with a factor of 5. We
152			* propagate them into a[0] and a[1].
153			*/
154	0		z = cc1 + cc2;
155	0		z += (z << 2) + a[0];
156	0		a[0] = z & 0x1FFF;
157	0		a[1] += z >> 13;
158
159	0		buf += 16;
160	0		len -= 16;
161			}
162	0		}
163
164			/* see bearssl_block.h */
165			void
166	0		br_poly1305_ctmul32_run(const void key, const void iv,
167			void data, size_t len, const void aad, size_t aad_len,
168			void *tag, br_chacha20_run ichacha, int encrypt)
169			{
170			unsigned char pkey[32], foot[16];
171			uint32_t z, r[19], acc[10], cc, ctl;
172			int i;
173
174			/*
175			* Compute the MAC key. The 'r' value is the first 16 bytes of
176			* pkey[].
177			*/
178	0		memset(pkey, 0, sizeof pkey);
179	0		ichacha(key, iv, 0, pkey, sizeof pkey);
180
181			/*
182			* If encrypting, ChaCha20 must run first, followed by Poly1305.
183			* When decrypting, the operations are reversed.
184			*/
185	0	0	if (encrypt) {
186	0		ichacha(key, iv, 1, data, len);
187			}
188
189			/*
190			* Run Poly1305. We must process the AAD, then ciphertext, then
191			* the footer (with the lengths). Note that the AAD and ciphertext
192			* are meant to be padded with zeros up to the next multiple of 16,
193			* and the length of the footer is 16 bytes as well.
194			*/
195
196			/*
197			* Decode the 'r' value into 13-bit words, with the "clamping"
198			* operation applied.
199			*/
200	0		z = br_dec32le(pkey) & 0x03FFFFFF;
201	0		r[9] = z & 0x1FFF;
202	0		r[10] = z >> 13;
203	0		z = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03;
204	0		r[11] = z & 0x1FFF;
205	0		r[12] = z >> 13;
206	0		z = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF;
207	0		r[13] = z & 0x1FFF;
208	0		r[14] = z >> 13;
209	0		z = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF;
210	0		r[15] = z & 0x1FFF;
211	0		r[16] = z >> 13;
212	0		z = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;
213	0		r[17] = z & 0x1FFF;
214	0		r[18] = z >> 13;
215
216			/*
217			* Extend r[] with the 5x factor pre-applied.
218			*/
219	0	0	for (i = 0; i < 9; i ++) {
220	0		r[i] = MUL15(5, r[i + 10]);
221			}
222
223			/*
224			* Accumulator is 0.
225			*/
226	0		memset(acc, 0, sizeof acc);
227
228			/*
229			* Process the additional authenticated data, ciphertext, and
230			* footer in due order.
231			*/
232	0		br_enc64le(foot, (uint64_t)aad_len);
233	0		br_enc64le(foot + 8, (uint64_t)len);
234	0		poly1305_inner(acc, r, aad, aad_len);
235	0		poly1305_inner(acc, r, data, len);
236	0		poly1305_inner(acc, r, foot, sizeof foot);
237
238			/*
239			* Finalise modular reduction. This is done with carry propagation
240			* and applying the '2^130 = -5 mod p' rule. Note that the output
241			* of poly1035_inner() is already mostly reduced, since only
242			* acc[1] may be (very slightly) above 2^13. A single loop back
243			* to acc[1] will be enough to make the value fit in 130 bits.
244			*/
245	0		cc = 0;
246	0	0	for (i = 1; i < 10; i ++) {
247	0		z = acc[i] + cc;
248	0		acc[i] = z & 0x1FFF;
249	0		cc = z >> 13;
250			}
251	0		z = acc[0] + cc + (cc << 2);
252	0		acc[0] = z & 0x1FFF;
253	0		acc[1] += z >> 13;
254
255			/*
256			* We may still have a value in the 2^130-5..2^130-1 range, in
257			* which case we must reduce it again. The code below selects,
258			* in constant-time, between 'acc' and 'acc-p',
259			*/
260	0		ctl = GT(acc[0], 0x1FFA);
261	0	0	for (i = 1; i < 10; i ++) {
262	0		ctl &= EQ(acc[i], 0x1FFF);
263			}
264	0		acc[0] = MUX(ctl, acc[0] - 0x1FFB, acc[0]);
265	0	0	for (i = 1; i < 10; i ++) {
266	0		acc[i] &= ~(-ctl);
267			}
268
269			/*
270			* Convert back the accumulator to 32-bit words, and add the
271			* 's' value (second half of pkey[]). That addition is done
272			* modulo 2^128.
273			*/
274	0		z = acc[0] + (acc[1] << 13) + br_dec16le(pkey + 16);
275	0		br_enc16le((unsigned char *)tag, z & 0xFFFF);
276	0		z = (z >> 16) + (acc[2] << 10) + br_dec16le(pkey + 18);
277	0		br_enc16le((unsigned char *)tag + 2, z & 0xFFFF);
278	0		z = (z >> 16) + (acc[3] << 7) + br_dec16le(pkey + 20);
279	0		br_enc16le((unsigned char *)tag + 4, z & 0xFFFF);
280	0		z = (z >> 16) + (acc[4] << 4) + br_dec16le(pkey + 22);
281	0		br_enc16le((unsigned char *)tag + 6, z & 0xFFFF);
282	0		z = (z >> 16) + (acc[5] << 1) + (acc[6] << 14) + br_dec16le(pkey + 24);
283	0		br_enc16le((unsigned char *)tag + 8, z & 0xFFFF);
284	0		z = (z >> 16) + (acc[7] << 11) + br_dec16le(pkey + 26);
285	0		br_enc16le((unsigned char *)tag + 10, z & 0xFFFF);
286	0		z = (z >> 16) + (acc[8] << 8) + br_dec16le(pkey + 28);
287	0		br_enc16le((unsigned char *)tag + 12, z & 0xFFFF);
288	0		z = (z >> 16) + (acc[9] << 5) + br_dec16le(pkey + 30);
289	0		br_enc16le((unsigned char *)tag + 14, z & 0xFFFF);
290
291			/*
292			* If decrypting, then ChaCha20 runs _after_ Poly1305.
293			*/
294	0	0	if (!encrypt) {
295	0		ichacha(key, iv, 1, data, len);
296			}
297	0		}