| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* |
|
2
|
|
|
|
|
|
|
* Copyright (c) 2016 Thomas Pornin |
|
3
|
|
|
|
|
|
|
* |
|
4
|
|
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining |
|
5
|
|
|
|
|
|
|
* a copy of this software and associated documentation files (the |
|
6
|
|
|
|
|
|
|
* "Software"), to deal in the Software without restriction, including |
|
7
|
|
|
|
|
|
|
* without limitation the rights to use, copy, modify, merge, publish, |
|
8
|
|
|
|
|
|
|
* distribute, sublicense, and/or sell copies of the Software, and to |
|
9
|
|
|
|
|
|
|
* permit persons to whom the Software is furnished to do so, subject to |
|
10
|
|
|
|
|
|
|
* the following conditions: |
|
11
|
|
|
|
|
|
|
* |
|
12
|
|
|
|
|
|
|
* The above copyright notice and this permission notice shall be |
|
13
|
|
|
|
|
|
|
* included in all copies or substantial portions of the Software. |
|
14
|
|
|
|
|
|
|
* |
|
15
|
|
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
16
|
|
|
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|
17
|
|
|
|
|
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
|
18
|
|
|
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
|
19
|
|
|
|
|
|
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
|
20
|
|
|
|
|
|
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
|
21
|
|
|
|
|
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|
22
|
|
|
|
|
|
|
* SOFTWARE. |
|
23
|
|
|
|
|
|
|
*/ |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
#include "inner.h" |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
/*
 * Core Poly1305 block loop: for each 16-byte block of data[], add the
 * block (with its extra "high bit") to the accumulator, then multiply
 * the accumulator by r modulo p = 2^130-5.
 *
 *   acc    accumulator, five limbs of 26 bits each (least significant
 *          limb first); read on entry and written back on exit. Limbs
 *          are allowed to carry one extra bit (27 bits) on input.
 *   r      the multiplier, in the same five-limb layout (read only).
 *   data   input bytes.
 *   len    number of input bytes; a trailing partial block is
 *          right-padded with zeros to 16 bytes before processing.
 *
 * On exit every limb fits in 26 bits, except acc[1] which may exceed
 * that by a very small amount (it receives the last carry out of
 * acc[0] and is not masked again).
 */
static void
poly1305_inner(uint32_t *acc, const uint32_t *r, const void *data, size_t len)
{
	/*
	 * The 130-bit values are held as five 26-bit limbs, which leaves
	 * headroom in 32-bit words for carries and lets each limb
	 * product be accumulated in a 64-bit word. The approach follows
	 * the public-domain poly1305-donna code:
	 *    https://github.com/floodyberry/poly1305-donna
	 *
	 * Working modulo p = 2^130-5 means 2^130 = 5 mod p, so any limb
	 * product that would land at or above 2^130 is folded back onto
	 * the low limbs with an extra factor of 5.
	 */
	const uint32_t mask26 = 0x03FFFFFF;
	const unsigned char *src;
	unsigned char pad[16];
	uint32_t h0, h1, h2, h3, h4;
	uint32_t k0, k1, k2, k3, k4;
	uint32_t s1, s2, s3, s4;

	/* Local copies of the multiplier limbs... */
	k0 = r[0];
	k1 = r[1];
	k2 = r[2];
	k3 = r[3];
	k4 = r[4];

	/* ...and their pre-multiplied-by-5 versions for the wrapped terms. */
	s1 = k1 * 5;
	s2 = k2 * 5;
	s3 = k3 * 5;
	s4 = k4 * 5;

	/* Local copy of the accumulator. */
	h0 = acc[0];
	h1 = acc[1];
	h2 = acc[2];
	h3 = acc[3];
	h4 = acc[4];

#define MUL64(x, y)   ((uint64_t)(x) * (uint64_t)(y))

	for (src = data; len > 0; src += 16, len -= 16) {
		uint64_t d0, d1, d2, d3, d4;

		/*
		 * A short final block is copied into a zero-filled
		 * 16-byte buffer and then handled as a full block.
		 */
		if (len < 16) {
			memset(pad, 0, sizeof pad);
			memcpy(pad, src, len);
			src = pad;
			len = 16;
		}

		/*
		 * Split the 128-bit little-endian block into 26-bit
		 * limbs and add them to the accumulator. The constant
		 * OR'ed into the top limb is the "high bit" (bit 128 of
		 * the block value).
		 */
		h0 += br_dec32le(src) & mask26;
		h1 += (br_dec32le(src + 3) >> 2) & mask26;
		h2 += (br_dec32le(src + 6) >> 4) & mask26;
		h3 += (br_dec32le(src + 9) >> 6) & mask26;
		h4 += (br_dec32le(src + 12) >> 8) | 0x01000000;

		/*
		 * Schoolbook multiplication h*r, with the terms that
		 * overflow 2^130 already reduced through the s* values.
		 */
		d0 = MUL64(h0, k0) + MUL64(h1, s4) + MUL64(h2, s3)
			+ MUL64(h3, s2) + MUL64(h4, s1);
		d1 = MUL64(h0, k1) + MUL64(h1, k0) + MUL64(h2, s4)
			+ MUL64(h3, s3) + MUL64(h4, s2);
		d2 = MUL64(h0, k2) + MUL64(h1, k1) + MUL64(h2, k0)
			+ MUL64(h3, s4) + MUL64(h4, s3);
		d3 = MUL64(h0, k3) + MUL64(h1, k2) + MUL64(h2, k1)
			+ MUL64(h3, k0) + MUL64(h4, s4);
		d4 = MUL64(h0, k4) + MUL64(h1, k3) + MUL64(h2, k2)
			+ MUL64(h3, k1) + MUL64(h4, k0);

		/*
		 * Partial reduction by carry propagation: each 64-bit
		 * column keeps its low 26 bits and pushes the rest into
		 * the next column.
		 */
		d1 += d0 >> 26;
		h0 = (uint32_t)d0 & 0x3FFFFFF;
		d2 += d1 >> 26;
		h1 = (uint32_t)d1 & 0x3FFFFFF;
		d3 += d2 >> 26;
		h2 = (uint32_t)d2 & 0x3FFFFFF;
		d4 += d3 >> 26;
		h3 = (uint32_t)d3 & 0x3FFFFFF;
		h4 = (uint32_t)d4 & 0x3FFFFFF;

		/*
		 * The carry out of the top column has weight 2^130, so
		 * it re-enters at the bottom multiplied by 5. One more
		 * carry step into h1 is performed; h1 itself is not
		 * masked afterwards.
		 */
		h0 += (uint32_t)(d4 >> 26) * 5;
		h1 += h0 >> 26;
		h0 &= 0x3FFFFFF;
	}

#undef MUL64

	/* Write the updated accumulator back to the caller. */
	acc[0] = h0;
	acc[1] = h1;
	acc[2] = h2;
	acc[3] = h3;
	acc[4] = h4;
}
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
/* see bearssl_block.h */ |
|
147
|
|
|
|
|
|
|
void |
|
148
|
0
|
|
|
|
|
|
br_poly1305_ctmul_run(const void *key, const void *iv, |
|
149
|
|
|
|
|
|
|
void *data, size_t len, const void *aad, size_t aad_len, |
|
150
|
|
|
|
|
|
|
void *tag, br_chacha20_run ichacha, int encrypt) |
|
151
|
|
|
|
|
|
|
{ |
|
152
|
|
|
|
|
|
|
unsigned char pkey[32], foot[16]; |
|
153
|
|
|
|
|
|
|
uint32_t r[5], acc[5], cc, ctl, hi; |
|
154
|
|
|
|
|
|
|
uint64_t w; |
|
155
|
|
|
|
|
|
|
int i; |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
/* |
|
158
|
|
|
|
|
|
|
* Compute the MAC key. The 'r' value is the first 16 bytes of |
|
159
|
|
|
|
|
|
|
* pkey[]. |
|
160
|
|
|
|
|
|
|
*/ |
|
161
|
0
|
|
|
|
|
|
memset(pkey, 0, sizeof pkey); |
|
162
|
0
|
|
|
|
|
|
ichacha(key, iv, 0, pkey, sizeof pkey); |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
/* |
|
165
|
|
|
|
|
|
|
* If encrypting, ChaCha20 must run first, followed by Poly1305. |
|
166
|
|
|
|
|
|
|
* When decrypting, the operations are reversed. |
|
167
|
|
|
|
|
|
|
*/ |
|
168
|
0
|
0
|
|
|
|
|
if (encrypt) { |
|
169
|
0
|
|
|
|
|
|
ichacha(key, iv, 1, data, len); |
|
170
|
|
|
|
|
|
|
} |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
/* |
|
173
|
|
|
|
|
|
|
* Run Poly1305. We must process the AAD, then ciphertext, then |
|
174
|
|
|
|
|
|
|
* the footer (with the lengths). Note that the AAD and ciphertext |
|
175
|
|
|
|
|
|
|
* are meant to be padded with zeros up to the next multiple of 16, |
|
176
|
|
|
|
|
|
|
* and the length of the footer is 16 bytes as well. |
|
177
|
|
|
|
|
|
|
*/ |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
/* |
|
180
|
|
|
|
|
|
|
* Decode the 'r' value into 26-bit words, with the "clamping" |
|
181
|
|
|
|
|
|
|
* operation applied. |
|
182
|
|
|
|
|
|
|
*/ |
|
183
|
0
|
|
|
|
|
|
r[0] = br_dec32le(pkey) & 0x03FFFFFF; |
|
184
|
0
|
|
|
|
|
|
r[1] = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03; |
|
185
|
0
|
|
|
|
|
|
r[2] = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF; |
|
186
|
0
|
|
|
|
|
|
r[3] = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF; |
|
187
|
0
|
|
|
|
|
|
r[4] = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF; |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
/* |
|
190
|
|
|
|
|
|
|
* Accumulator is 0. |
|
191
|
|
|
|
|
|
|
*/ |
|
192
|
0
|
|
|
|
|
|
memset(acc, 0, sizeof acc); |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
/* |
|
195
|
|
|
|
|
|
|
* Process the additional authenticated data, ciphertext, and |
|
196
|
|
|
|
|
|
|
* footer in due order. |
|
197
|
|
|
|
|
|
|
*/ |
|
198
|
0
|
|
|
|
|
|
br_enc64le(foot, (uint64_t)aad_len); |
|
199
|
0
|
|
|
|
|
|
br_enc64le(foot + 8, (uint64_t)len); |
|
200
|
0
|
|
|
|
|
|
poly1305_inner(acc, r, aad, aad_len); |
|
201
|
0
|
|
|
|
|
|
poly1305_inner(acc, r, data, len); |
|
202
|
0
|
|
|
|
|
|
poly1305_inner(acc, r, foot, sizeof foot); |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
/* |
|
205
|
|
|
|
|
|
|
* Finalise modular reduction. This is done with carry propagation |
|
206
|
|
|
|
|
|
|
* and applying the '2^130 = -5 mod p' rule. Note that the output |
|
207
|
|
|
|
|
|
|
* of poly1035_inner() is already mostly reduced, since only |
|
208
|
|
|
|
|
|
|
* acc[1] may be (very slightly) above 2^26. A single loop back |
|
209
|
|
|
|
|
|
|
* to acc[1] will be enough to make the value fit in 130 bits. |
|
210
|
|
|
|
|
|
|
*/ |
|
211
|
0
|
|
|
|
|
|
cc = 0; |
|
212
|
0
|
0
|
|
|
|
|
for (i = 1; i <= 6; i ++) { |
|
213
|
|
|
|
|
|
|
int j; |
|
214
|
|
|
|
|
|
|
|
|
215
|
0
|
0
|
|
|
|
|
j = (i >= 5) ? i - 5 : i; |
|
216
|
0
|
|
|
|
|
|
acc[j] += cc; |
|
217
|
0
|
|
|
|
|
|
cc = acc[j] >> 26; |
|
218
|
0
|
|
|
|
|
|
acc[j] &= 0x03FFFFFF; |
|
219
|
|
|
|
|
|
|
} |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
/* |
|
222
|
|
|
|
|
|
|
* We may still have a value in the 2^130-5..2^130-1 range, in |
|
223
|
|
|
|
|
|
|
* which case we must reduce it again. The code below selects, |
|
224
|
|
|
|
|
|
|
* in constant-time, between 'acc' and 'acc-p', |
|
225
|
|
|
|
|
|
|
*/ |
|
226
|
0
|
|
|
|
|
|
ctl = GT(acc[0], 0x03FFFFFA); |
|
227
|
0
|
0
|
|
|
|
|
for (i = 1; i < 5; i ++) { |
|
228
|
0
|
|
|
|
|
|
ctl &= EQ(acc[i], 0x03FFFFFF); |
|
229
|
|
|
|
|
|
|
} |
|
230
|
0
|
|
|
|
|
|
cc = 5; |
|
231
|
0
|
0
|
|
|
|
|
for (i = 0; i < 5; i ++) { |
|
232
|
|
|
|
|
|
|
uint32_t t; |
|
233
|
|
|
|
|
|
|
|
|
234
|
0
|
|
|
|
|
|
t = (acc[i] + cc); |
|
235
|
0
|
|
|
|
|
|
cc = t >> 26; |
|
236
|
0
|
|
|
|
|
|
t &= 0x03FFFFFF; |
|
237
|
0
|
|
|
|
|
|
acc[i] = MUX(ctl, t, acc[i]); |
|
238
|
|
|
|
|
|
|
} |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
/* |
|
241
|
|
|
|
|
|
|
* Convert back the accumulator to 32-bit words, and add the |
|
242
|
|
|
|
|
|
|
* 's' value (second half of pkey[]). That addition is done |
|
243
|
|
|
|
|
|
|
* modulo 2^128. |
|
244
|
|
|
|
|
|
|
*/ |
|
245
|
0
|
|
|
|
|
|
w = (uint64_t)acc[0] + ((uint64_t)acc[1] << 26) + br_dec32le(pkey + 16); |
|
246
|
0
|
|
|
|
|
|
br_enc32le((unsigned char *)tag, (uint32_t)w); |
|
247
|
0
|
|
|
|
|
|
w = (w >> 32) + ((uint64_t)acc[2] << 20) + br_dec32le(pkey + 20); |
|
248
|
0
|
|
|
|
|
|
br_enc32le((unsigned char *)tag + 4, (uint32_t)w); |
|
249
|
0
|
|
|
|
|
|
w = (w >> 32) + ((uint64_t)acc[3] << 14) + br_dec32le(pkey + 24); |
|
250
|
0
|
|
|
|
|
|
br_enc32le((unsigned char *)tag + 8, (uint32_t)w); |
|
251
|
0
|
|
|
|
|
|
hi = (uint32_t)(w >> 32) + (acc[4] << 8) + br_dec32le(pkey + 28); |
|
252
|
0
|
|
|
|
|
|
br_enc32le((unsigned char *)tag + 12, hi); |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
/* |
|
255
|
|
|
|
|
|
|
* If decrypting, then ChaCha20 runs _after_ Poly1305. |
|
256
|
|
|
|
|
|
|
*/ |
|
257
|
0
|
0
|
|
|
|
|
if (!encrypt) { |
|
258
|
0
|
|
|
|
|
|
ichacha(key, iv, 1, data, len); |
|
259
|
|
|
|
|
|
|
} |
|
260
|
0
|
|
|
|
|
|
} |