File Coverage

src/ec/ec_c25519_m64.c

Criterion	Covered	Total	%
statement	249	254	98.0
branch	14	16	87.5
condition			n/a
subroutine			n/a
pod			n/a
total	263	270	97.4

line	stmt	bran	code
1			/*
2			* Copyright (c) 2018 Thomas Pornin
3			*
4			* Permission is hereby granted, free of charge, to any person obtaining
5			* a copy of this software and associated documentation files (the
6			* "Software"), to deal in the Software without restriction, including
7			* without limitation the rights to use, copy, modify, merge, publish,
8			* distribute, sublicense, and/or sell copies of the Software, and to
9			* permit persons to whom the Software is furnished to do so, subject to
10			* the following conditions:
11			*
12			* The above copyright notice and this permission notice shall be
13			* included in all copies or substantial portions of the Software.
14			*
15			* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16			* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17			* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18			* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19			* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20			* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21			* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22			* SOFTWARE.
23			*/
24
25			#include "inner.h"
26
27			#if BR_INT128 || BR_UMUL128
28
29			#if BR_UMUL128
30			#include <intrin.h>
31			#endif
32
33			static const unsigned char GEN[] = {
34			0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
35			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
36			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
37			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
38			};
39
40			static const unsigned char ORDER[] = {
41			0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
42			0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
43			0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
44			0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
45			};
46
47			static const unsigned char *
48	3		api_generator(int curve, size_t *len)
49			{
50			(void)curve;
51	3		*len = 32;
52	3		return GEN;
53			}
54
55			static const unsigned char *
56	2		api_order(int curve, size_t *len)
57			{
58			(void)curve;
59	2		*len = 32;
60	2		return ORDER;
61			}
62
63			static size_t
64	2		api_xoff(int curve, size_t *len)
65			{
66			(void)curve;
67	2		*len = 32;
68	2		return 0;
69			}
70
71			/*
72			* A field element is encoded as four 64-bit integers, in basis 2^64.
73			* Operations return partially reduced values, which may range up to
74			* 2^255+37.
75			*/
76
77			#define MASK63 (((uint64_t)1 << 63) - (uint64_t)1)
78
79			/*
80			* Swap two field elements, conditionally on a flag.
81			*/
82			static inline void
83	2048		f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
84			{
85			uint64_t m, w;
86
87	2048		m = -(uint64_t)ctl;
88	2048		w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
89	2048		w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
90	2048		w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
91	2048		w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
92	2048		}
93
94			/*
95			* Addition in the field.
96			*/
97			static inline void
98	4080		f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
99			{
100			#if BR_INT128
101
102			uint64_t t0, t1, t2, t3, cc;
103			unsigned __int128 z;
104
105	4080		z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
106	4080		t0 = (uint64_t)z;
107	4080		z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
108	4080		t1 = (uint64_t)z;
109	4080		z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
110	4080		t2 = (uint64_t)z;
111	4080		z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
112	4080		t3 = (uint64_t)z & MASK63;
113	4080		cc = (uint64_t)(z >> 63);
114
115			/*
116			* Since operands are at most 2^255+37, the sum is at most
117			* 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
118			*
119			* We use: 2^255 = 19 mod p.
120			* Since we add 0, 19 or 38 to a value that fits on 255 bits,
121			* the result is at most 2^255+37.
122			*/
123	4080		z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
124	4080		d[0] = (uint64_t)z;
125	4080		z = (unsigned __int128)t1 + (z >> 64);
126	4080		d[1] = (uint64_t)z;
127	4080		z = (unsigned __int128)t2 + (z >> 64);
128	4080		d[2] = (uint64_t)z;
129	4080		d[3] = t3 + (uint64_t)(z >> 64);
130
131			#elif BR_UMUL128
132
133			uint64_t t0, t1, t2, t3, cc;
134			unsigned char k;
135
136			k = _addcarry_u64(0, a[0], b[0], &t0);
137			k = _addcarry_u64(k, a[1], b[1], &t1);
138			k = _addcarry_u64(k, a[2], b[2], &t2);
139			k = _addcarry_u64(k, a[3], b[3], &t3);
140			cc = (k << 1) + (t3 >> 63);
141			t3 &= MASK63;
142
143			/*
144			* Since operands are at most 2^255+37, the sum is at most
145			* 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
146			*
147			* We use: 2^255 = 19 mod p.
148			* Since we add 0, 19 or 38 to a value that fits on 255 bits,
149			* the result is at most 2^255+37.
150			*/
151			k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
152			k = _addcarry_u64(k, t1, 0, &d[1]);
153			k = _addcarry_u64(k, t2, 0, &d[2]);
154			(void)_addcarry_u64(k, t3, 0, &d[3]);
155
156			#endif
157	4080		}
158
159			/*
160			* Subtraction.
161			*/
162			static inline void
163	4080		f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
164			{
165			#if BR_INT128
166
167			/*
168			* We compute t = 2^256 - 38 + a - b, which is necessarily
169			* positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
170			* and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
171			* on the two upper bits of t (bits 255 and 256).
172			*/
173
174			uint64_t t0, t1, t2, t3, t4, cc;
175			unsigned __int128 z;
176
177	4080		z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
178	4080		t0 = (uint64_t)z;
179	4080		cc = -(uint64_t)(z >> 64);
180	4080		z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
181	4080		- (unsigned __int128)cc;
182	4080		t1 = (uint64_t)z;
183	4080		cc = -(uint64_t)(z >> 64);
184	4080		z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
185	4080		- (unsigned __int128)cc;
186	4080		t2 = (uint64_t)z;
187	4080		cc = -(uint64_t)(z >> 64);
188	4080		z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
189	4080		- (unsigned __int128)cc;
190	4080		t3 = (uint64_t)z;
191	4080		t4 = 1 + (uint64_t)(z >> 64);
192
193			/*
194			* We have a 257-bit result. The two top bits can be 00, 01 or 10,
195			* but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
196			* Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
197			* This guarantees that the result is at most 2^255+37.
198			*/
199	4080		cc = (38 & -t4) + (19 & -(t3 >> 63));
200	4080		t3 &= MASK63;
201	4080		z = (unsigned __int128)t0 + (unsigned __int128)cc;
202	4080		d[0] = (uint64_t)z;
203	4080		z = (unsigned __int128)t1 + (z >> 64);
204	4080		d[1] = (uint64_t)z;
205	4080		z = (unsigned __int128)t2 + (z >> 64);
206	4080		d[2] = (uint64_t)z;
207	4080		d[3] = t3 + (uint64_t)(z >> 64);
208
209			#elif BR_UMUL128
210
211			/*
212			* We compute t = 2^256 - 38 + a - b, which is necessarily
213			* positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
214			* and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
215			* on the two upper bits of t (bits 255 and 256).
216			*/
217
218			uint64_t t0, t1, t2, t3, t4;
219			unsigned char k;
220
221			k = _subborrow_u64(0, a[0], b[0], &t0);
222			k = _subborrow_u64(k, a[1], b[1], &t1);
223			k = _subborrow_u64(k, a[2], b[2], &t2);
224			k = _subborrow_u64(k, a[3], b[3], &t3);
225			(void)_subborrow_u64(k, 1, 0, &t4);
226
227			k = _subborrow_u64(0, t0, 38, &t0);
228			k = _subborrow_u64(k, t1, 0, &t1);
229			k = _subborrow_u64(k, t2, 0, &t2);
230			k = _subborrow_u64(k, t3, 0, &t3);
231			(void)_subborrow_u64(k, t4, 0, &t4);
232
233			/*
234			* We have a 257-bit result. The two top bits can be 00, 01 or 10,
235			* but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
236			* Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
237			* This guarantees that the result is at most 2^255+37.
238			*/
239			t4 = (38 & -t4) + (19 & -(t3 >> 63));
240			t3 &= MASK63;
241			k = _addcarry_u64(0, t0, t4, &d[0]);
242			k = _addcarry_u64(k, t1, 0, &d[1]);
243			k = _addcarry_u64(k, t2, 0, &d[2]);
244			(void)_addcarry_u64(k, t3, 0, &d[3]);
245
246			#endif
247	4080		}
248
249			/*
250			* Multiplication.
251			*/
252			static inline void
253	10368		f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
254			{
255			#if BR_INT128
256
257			unsigned __int128 z;
258			uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
259
260			/*
261			* Compute the product a*b over plain integers.
262			*/
263	10368		z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
264	10368		t0 = (uint64_t)z;
265	10368		z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
266	10368		t1 = (uint64_t)z;
267	10368		z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
268	10368		t2 = (uint64_t)z;
269	10368		z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
270	10368		t3 = (uint64_t)z;
271	10368		t4 = (uint64_t)(z >> 64);
272
273	10368		z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
274	10368		+ (unsigned __int128)t1;
275	10368		t1 = (uint64_t)z;
276	10368		z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
277	10368		+ (unsigned __int128)t2 + (z >> 64);
278	10368		t2 = (uint64_t)z;
279	10368		z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
280	10368		+ (unsigned __int128)t3 + (z >> 64);
281	10368		t3 = (uint64_t)z;
282	10368		z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
283	10368		+ (unsigned __int128)t4 + (z >> 64);
284	10368		t4 = (uint64_t)z;
285	10368		t5 = (uint64_t)(z >> 64);
286
287	10368		z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
288	10368		+ (unsigned __int128)t2;
289	10368		t2 = (uint64_t)z;
290	10368		z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
291	10368		+ (unsigned __int128)t3 + (z >> 64);
292	10368		t3 = (uint64_t)z;
293	10368		z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
294	10368		+ (unsigned __int128)t4 + (z >> 64);
295	10368		t4 = (uint64_t)z;
296	10368		z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
297	10368		+ (unsigned __int128)t5 + (z >> 64);
298	10368		t5 = (uint64_t)z;
299	10368		t6 = (uint64_t)(z >> 64);
300
301	10368		z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
302	10368		+ (unsigned __int128)t3;
303	10368		t3 = (uint64_t)z;
304	10368		z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
305	10368		+ (unsigned __int128)t4 + (z >> 64);
306	10368		t4 = (uint64_t)z;
307	10368		z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
308	10368		+ (unsigned __int128)t5 + (z >> 64);
309	10368		t5 = (uint64_t)z;
310	10368		z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
311	10368		+ (unsigned __int128)t6 + (z >> 64);
312	10368		t6 = (uint64_t)z;
313	10368		t7 = (uint64_t)(z >> 64);
314
315			/*
316			* Modulo p, we have:
317			*
318			* 2^255 = 19
319			* 2^510 = 19*19 = 361
320			*
321			* We split the intermediate t into three parts, in basis
322			* 2^255. The low one will be in t0..t3; the middle one in t4..t7.
323			* The upper one can only be a single bit (th), since the
324			* multiplication operands are at most 2^255+37 each.
325			*/
326	10368		th = t7 >> 62;
327	10368		t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
328	10368		t6 = (t6 << 1) | (t5 >> 63);
329	10368		t5 = (t5 << 1) | (t4 >> 63);
330	10368		t4 = (t4 << 1) | (t3 >> 63);
331	10368		t3 &= MASK63;
332
333			/*
334			* Multiply the middle part (t4..t7) by 19. We truncate it to
335			* 255 bits; the extra bits will go along with th.
336			*/
337	10368		z = (unsigned __int128)t4 * 19;
338	10368		t4 = (uint64_t)z;
339	10368		z = (unsigned __int128)t5 * 19 + (z >> 64);
340	10368		t5 = (uint64_t)z;
341	10368		z = (unsigned __int128)t6 * 19 + (z >> 64);
342	10368		t6 = (uint64_t)z;
343	10368		z = (unsigned __int128)t7 * 19 + (z >> 64);
344	10368		t7 = (uint64_t)z & MASK63;
345
346	10368		th = (361 & -th) + (19 * (uint64_t)(z >> 63));
347
348			/*
349			* Add elements together.
350			* At this point:
351			* t0..t3 fits on 255 bits.
352			* t4..t7 fits on 255 bits.
353			* th <= 361 + 342 = 703.
354			*/
355	10368		z = (unsigned __int128)t0 + (unsigned __int128)t4
356	10368		+ (unsigned __int128)th;
357	10368		t0 = (uint64_t)z;
358	10368		z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
359	10368		t1 = (uint64_t)z;
360	10368		z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
361	10368		t2 = (uint64_t)z;
362	10368		z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
363	10368		t3 = (uint64_t)z & MASK63;
364	10368		th = (uint64_t)(z >> 63);
365
366			/*
367			* Since the sum is at most 2^256 + 703, the two upper bits, in th,
368			* can only have value 0, 1 or 2. We just add th*19, which
369			* guarantees a result of at most 2^255+37.
370			*/
371	10368		z = (unsigned __int128)t0 + (19 * th);
372	10368		d[0] = (uint64_t)z;
373	10368		z = (unsigned __int128)t1 + (z >> 64);
374	10368		d[1] = (uint64_t)z;
375	10368		z = (unsigned __int128)t2 + (z >> 64);
376	10368		d[2] = (uint64_t)z;
377	10368		d[3] = t3 + (uint64_t)(z >> 64);
378
379			#elif BR_UMUL128
380
381			uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
382			uint64_t h0, h1, h2, h3;
383			unsigned char k;
384
385			/*
386			* Compute the product a*b over plain integers.
387			*/
388			t0 = _umul128(a[0], b[0], &h0);
389			t1 = _umul128(a[0], b[1], &h1);
390			k = _addcarry_u64(0, t1, h0, &t1);
391			t2 = _umul128(a[0], b[2], &h2);
392			k = _addcarry_u64(k, t2, h1, &t2);
393			t3 = _umul128(a[0], b[3], &h3);
394			k = _addcarry_u64(k, t3, h2, &t3);
395			(void)_addcarry_u64(k, h3, 0, &t4);
396
397			k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
398			k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
399			k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
400			k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
401			t5 = k;
402			k = _addcarry_u64(0, t2, h0, &t2);
403			k = _addcarry_u64(k, t3, h1, &t3);
404			k = _addcarry_u64(k, t4, h2, &t4);
405			(void)_addcarry_u64(k, t5, h3, &t5);
406
407			k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
408			k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
409			k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
410			k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
411			t6 = k;
412			k = _addcarry_u64(0, t3, h0, &t3);
413			k = _addcarry_u64(k, t4, h1, &t4);
414			k = _addcarry_u64(k, t5, h2, &t5);
415			(void)_addcarry_u64(k, t6, h3, &t6);
416
417			k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
418			k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
419			k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
420			k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
421			t7 = k;
422			k = _addcarry_u64(0, t4, h0, &t4);
423			k = _addcarry_u64(k, t5, h1, &t5);
424			k = _addcarry_u64(k, t6, h2, &t6);
425			(void)_addcarry_u64(k, t7, h3, &t7);
426
427			/*
428			* Modulo p, we have:
429			*
430			* 2^255 = 19
431			* 2^510 = 19*19 = 361
432			*
433			* We split the intermediate t into three parts, in basis
434			* 2^255. The low one will be in t0..t3; the middle one in t4..t7.
435			* The upper one can only be a single bit (th), since the
436			* multiplication operands are at most 2^255+37 each.
437			*/
438			th = t7 >> 62;
439			t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
440			t6 = (t6 << 1) | (t5 >> 63);
441			t5 = (t5 << 1) | (t4 >> 63);
442			t4 = (t4 << 1) | (t3 >> 63);
443			t3 &= MASK63;
444
445			/*
446			* Multiply the middle part (t4..t7) by 19. We truncate it to
447			* 255 bits; the extra bits will go along with th.
448			*/
449			t4 = _umul128(t4, 19, &h0);
450			t5 = _umul128(t5, 19, &h1);
451			t6 = _umul128(t6, 19, &h2);
452			t7 = _umul128(t7, 19, &h3);
453			k = _addcarry_u64(0, t5, h0, &t5);
454			k = _addcarry_u64(k, t6, h1, &t6);
455			k = _addcarry_u64(k, t7, h2, &t7);
456			(void)_addcarry_u64(k, h3, 0, &h3);
457			th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
458			t7 &= MASK63;
459
460			/*
461			* Add elements together.
462			* At this point:
463			* t0..t3 fits on 255 bits.
464			* t4..t7 fits on 255 bits.
465			* th <= 361 + 342 = 703.
466			*/
467			k = _addcarry_u64(0, t0, t4, &t0);
468			k = _addcarry_u64(k, t1, t5, &t1);
469			k = _addcarry_u64(k, t2, t6, &t2);
470			k = _addcarry_u64(k, t3, t7, &t3);
471			t4 = k;
472			k = _addcarry_u64(0, t0, th, &t0);
473			k = _addcarry_u64(k, t1, 0, &t1);
474			k = _addcarry_u64(k, t2, 0, &t2);
475			k = _addcarry_u64(k, t3, 0, &t3);
476			(void)_addcarry_u64(k, t4, 0, &t4);
477
478			th = (t4 << 1) + (t3 >> 63);
479			t3 &= MASK63;
480
481			/*
482			* Since the sum is at most 2^256 + 703, the two upper bits, in th,
483			* can only have value 0, 1 or 2. We just add th*19, which
484			* guarantees a result of at most 2^255+37.
485			*/
486			k = _addcarry_u64(0, t0, 19 * th, &d[0]);
487			k = _addcarry_u64(k, t1, 0, &d[1]);
488			k = _addcarry_u64(k, t2, 0, &d[2]);
489			(void)_addcarry_u64(k, t3, 0, &d[3]);
490
491			#endif
492	10368		}
493
494			/*
495			* Multiplication by A24 = 121665.
496			*/
497			static inline void
498	1020		f255_mul_a24(uint64_t *d, const uint64_t *a)
499			{
500			#if BR_INT128
501
502			uint64_t t0, t1, t2, t3;
503			unsigned __int128 z;
504
505	1020		z = (unsigned __int128)a[0] * 121665;
506	1020		t0 = (uint64_t)z;
507	1020		z = (unsigned __int128)a[1] * 121665 + (z >> 64);
508	1020		t1 = (uint64_t)z;
509	1020		z = (unsigned __int128)a[2] * 121665 + (z >> 64);
510	1020		t2 = (uint64_t)z;
511	1020		z = (unsigned __int128)a[3] * 121665 + (z >> 64);
512	1020		t3 = (uint64_t)z & MASK63;
513
514	1020		z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
515	1020		t0 = (uint64_t)z;
516	1020		z = (unsigned __int128)t1 + (z >> 64);
517	1020		t1 = (uint64_t)z;
518	1020		z = (unsigned __int128)t2 + (z >> 64);
519	1020		t2 = (uint64_t)z;
520	1020		t3 = t3 + (uint64_t)(z >> 64);
521
522	1020		z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
523	1020		d[0] = (uint64_t)z;
524	1020		z = (unsigned __int128)t1 + (z >> 64);
525	1020		d[1] = (uint64_t)z;
526	1020		z = (unsigned __int128)t2 + (z >> 64);
527	1020		d[2] = (uint64_t)z;
528	1020		d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);
529
530			#elif BR_UMUL128
531
532			uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
533			unsigned char k;
534
535			t0 = _umul128(a[0], 121665, &h0);
536			t1 = _umul128(a[1], 121665, &h1);
537			k = _addcarry_u64(0, t1, h0, &t1);
538			t2 = _umul128(a[2], 121665, &h2);
539			k = _addcarry_u64(k, t2, h1, &t2);
540			t3 = _umul128(a[3], 121665, &h3);
541			k = _addcarry_u64(k, t3, h2, &t3);
542			(void)_addcarry_u64(k, h3, 0, &t4);
543
544			t4 = (t4 << 1) + (t3 >> 63);
545			t3 &= MASK63;
546			k = _addcarry_u64(0, t0, 19 * t4, &t0);
547			k = _addcarry_u64(k, t1, 0, &t1);
548			k = _addcarry_u64(k, t2, 0, &t2);
549			(void)_addcarry_u64(k, t3, 0, &t3);
550
551			t4 = 19 & -(t3 >> 63);
552			t3 &= MASK63;
553			k = _addcarry_u64(0, t0, t4, &d[0]);
554			k = _addcarry_u64(k, t1, 0, &d[1]);
555			k = _addcarry_u64(k, t2, 0, &d[2]);
556			(void)_addcarry_u64(k, t3, 0, &d[3]);
557
558			#endif
559	1020		}
560
561			/*
562			* Finalize reduction.
563			*/
564			static inline void
565	4		f255_final_reduce(uint64_t *a)
566			{
567			#if BR_INT128
568
569			uint64_t t0, t1, t2, t3, m;
570			unsigned __int128 z;
571
572			/*
573			* We add 19. If the result (in t) is below 2^255, then a[]
574			* is already less than 2^255-19, thus already reduced.
575			* Otherwise, we subtract 2^255 from t[], in which case we
576			* have t = a - (2^255-19), and that's our result.
577			*/
578	4		z = (unsigned __int128)a[0] + 19;
579	4		t0 = (uint64_t)z;
580	4		z = (unsigned __int128)a[1] + (z >> 64);
581	4		t1 = (uint64_t)z;
582	4		z = (unsigned __int128)a[2] + (z >> 64);
583	4		t2 = (uint64_t)z;
584	4		t3 = a[3] + (uint64_t)(z >> 64);
585
586	4		m = -(t3 >> 63);
587	4		t3 &= MASK63;
588	4		a[0] ^= m & (a[0] ^ t0);
589	4		a[1] ^= m & (a[1] ^ t1);
590	4		a[2] ^= m & (a[2] ^ t2);
591	4		a[3] ^= m & (a[3] ^ t3);
592
593			#elif BR_UMUL128
594
595			uint64_t t0, t1, t2, t3, m;
596			unsigned char k;
597
598			/*
599			* We add 19. If the result (in t) is below 2^255, then a[]
600			* is already less than 2^255-19, thus already reduced.
601			* Otherwise, we subtract 2^255 from t[], in which case we
602			* have t = a - (2^255-19), and that's our result.
603			*/
604			k = _addcarry_u64(0, a[0], 19, &t0);
605			k = _addcarry_u64(k, a[1], 0, &t1);
606			k = _addcarry_u64(k, a[2], 0, &t2);
607			(void)_addcarry_u64(k, a[3], 0, &t3);
608
609			m = -(t3 >> 63);
610			t3 &= MASK63;
611			a[0] ^= m & (a[0] ^ t0);
612			a[1] ^= m & (a[1] ^ t1);
613			a[2] ^= m & (a[2] ^ t2);
614			a[3] ^= m & (a[3] ^ t3);
615
616			#endif
617	4		}
618
619			static uint32_t
620	4		api_mul(unsigned char *G, size_t Glen,
621			const unsigned char *kb, size_t kblen, int curve)
622			{
623			unsigned char k[32];
624			uint64_t x1[4], x2[4], z2[4], x3[4], z3[4];
625			uint32_t swap;
626			int i;
627
628			(void)curve;
629
630			/*
631			* Points are encoded over exactly 32 bytes. Multipliers must fit
632			* in 32 bytes as well.
633			*/
634	4	50	if (Glen != 32 || kblen > 32) {
		50
635	0		return 0;
636			}
637
638			/*
639			* RFC 7748 mandates that the high bit of the last point byte must
640			* be ignored/cleared.
641			*/
642	4		x1[0] = br_dec64le(&G[ 0]);
643	4		x1[1] = br_dec64le(&G[ 8]);
644	4		x1[2] = br_dec64le(&G[16]);
645	4		x1[3] = br_dec64le(&G[24]) & MASK63;
646
647			/*
648			* We can use memset() to clear values, because exact-width types
649			* like uint64_t are guaranteed to have no padding bits or
650			* trap representations.
651			*/
652	4		memset(x2, 0, sizeof x2);
653	4		x2[0] = 1;
654	4		memset(z2, 0, sizeof z2);
655	4		memcpy(x3, x1, sizeof x1);
656	4		memcpy(z3, x2, sizeof x2);
657
658			/*
659			* The multiplier is provided in big-endian notation, and
660			* possibly shorter than 32 bytes.
661			*/
662	4		memset(k, 0, (sizeof k) - kblen);
663	4		memcpy(k + (sizeof k) - kblen, kb, kblen);
664	4		k[31] &= 0xF8;
665	4		k[0] &= 0x7F;
666	4		k[0] |= 0x40;
667
668	4		swap = 0;
669
670	1024	100	for (i = 254; i >= 0; i --) {
671			uint64_t a[4], aa[4], b[4], bb[4], e[4];
672			uint64_t c[4], d[4], da[4], cb[4];
673			uint32_t kt;
674
675	1020		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
676	1020		swap ^= kt;
677	1020		f255_cswap(x2, x3, swap);
678	1020		f255_cswap(z2, z3, swap);
679	1020		swap = kt;
680
681			/* A = x_2 + z_2 */
682	1020		f255_add(a, x2, z2);
683
684			/* AA = A^2 */
685	1020		f255_mul(aa, a, a);
686
687			/* B = x_2 - z_2 */
688	1020		f255_sub(b, x2, z2);
689
690			/* BB = B^2 */
691	1020		f255_mul(bb, b, b);
692
693			/* E = AA - BB */
694	1020		f255_sub(e, aa, bb);
695
696			/* C = x_3 + z_3 */
697	1020		f255_add(c, x3, z3);
698
699			/* D = x_3 - z_3 */
700	1020		f255_sub(d, x3, z3);
701
702			/* DA = D * A */
703	1020		f255_mul(da, d, a);
704
705			/* CB = C * B */
706	1020		f255_mul(cb, c, b);
707
708			/* x_3 = (DA + CB)^2 */
709	1020		f255_add(x3, da, cb);
710	1020		f255_mul(x3, x3, x3);
711
712			/* z_3 = x_1 * (DA - CB)^2 */
713	1020		f255_sub(z3, da, cb);
714	1020		f255_mul(z3, z3, z3);
715	1020		f255_mul(z3, x1, z3);
716
717			/* x_2 = AA * BB */
718	1020		f255_mul(x2, aa, bb);
719
720			/* z_2 = E * (AA + a24 * E) */
721	1020		f255_mul_a24(z2, e);
722	1020		f255_add(z2, aa, z2);
723	1020		f255_mul(z2, e, z2);
724			}
725
726	4		f255_cswap(x2, x3, swap);
727	4		f255_cswap(z2, z3, swap);
728
729			/*
730			* Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
731			* most non-squarings. We use x1 and x3, now useless, as temporaries.
732			*/
733	4		memcpy(x1, z2, sizeof z2);
734	64	100	for (i = 0; i < 15; i ++) {
735	60		f255_mul(x1, x1, x1);
736	60		f255_mul(x1, x1, z2);
737			}
738	4		memcpy(x3, x1, sizeof x1);
739	60	100	for (i = 0; i < 14; i ++) {
740			int j;
741
742	952	100	for (j = 0; j < 16; j ++) {
743	896		f255_mul(x3, x3, x3);
744			}
745	56		f255_mul(x3, x3, x1);
746			}
747	64	100	for (i = 14; i >= 0; i --) {
748	60		f255_mul(x3, x3, x3);
749	60	100	if ((0xFFEB >> i) & 1) {
750	52		f255_mul(x3, z2, x3);
751			}
752			}
753
754			/*
755			* Compute x2/z2. We have 1/z2 in x3.
756			*/
757	4		f255_mul(x2, x2, x3);
758	4		f255_final_reduce(x2);
759
760			/*
761			* Encode the final x2 value in little-endian.
762			*/
763	4		br_enc64le(G, x2[0]);
764	4		br_enc64le(G + 8, x2[1]);
765	4		br_enc64le(G + 16, x2[2]);
766	4		br_enc64le(G + 24, x2[3]);
767	4		return 1;
768			}
769
770			static size_t
771	2		api_mulgen(unsigned char *R,
772			const unsigned char *x, size_t xlen, int curve)
773			{
774			const unsigned char *G;
775			size_t Glen;
776
777	2		G = api_generator(curve, &Glen);
778	2		memcpy(R, G, Glen);
779	2		api_mul(R, Glen, x, xlen, curve);
780	2		return Glen;
781			}
782
783			static uint32_t
784	0		api_muladd(unsigned char *A, const unsigned char *B, size_t len,
785			const unsigned char *x, size_t xlen,
786			const unsigned char *y, size_t ylen, int curve)
787			{
788			/*
789			* We don't implement this method, since it is used for ECDSA
790			* only, and there is no ECDSA over Curve25519 (which instead
791			* uses EdDSA).
792			*/
793			(void)A;
794			(void)B;
795			(void)len;
796			(void)x;
797			(void)xlen;
798			(void)y;
799			(void)ylen;
800			(void)curve;
801	0		return 0;
802			}
803
804			/* see bearssl_ec.h */
805			const br_ec_impl br_ec_c25519_m64 = {
806			(uint32_t)0x20000000,
807			&api_generator,
808			&api_order,
809			&api_xoff,
810			&api_mul,
811			&api_mulgen,
812			&api_muladd
813			};
814
815			/* see bearssl_ec.h */
816			const br_ec_impl *
817	0		br_ec_c25519_m64_get(void)
818			{
819	0		return &br_ec_c25519_m64;
820			}
821
822			#else
823
824			/* see bearssl_ec.h */
825			const br_ec_impl *
826			br_ec_c25519_m64_get(void)
827			{
828			return 0;
829			}
830
831			#endif