File Coverage

src/ec/ec_p256_m64.c

Criterion	Covered	Total	%
statement	390	399	97.7
branch	39	46	84.7
condition			n/a
subroutine			n/a
pod			n/a
total	429	445	96.4

line	stmt	bran	code
1			/*
2			* Copyright (c) 2018 Thomas Pornin
3			*
4			* Permission is hereby granted, free of charge, to any person obtaining
5			* a copy of this software and associated documentation files (the
6			* "Software"), to deal in the Software without restriction, including
7			* without limitation the rights to use, copy, modify, merge, publish,
8			* distribute, sublicense, and/or sell copies of the Software, and to
9			* permit persons to whom the Software is furnished to do so, subject to
10			* the following conditions:
11			*
12			* The above copyright notice and this permission notice shall be
13			* included in all copies or substantial portions of the Software.
14			*
15			* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16			* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17			* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18			* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19			* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20			* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21			* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22			* SOFTWARE.
23			*/
24
25			#include "inner.h"
26
27			#if BR_INT128 || BR_UMUL128
28
29			#if BR_UMUL128
30			#include <intrin.h>
31			#endif
32
33			static const unsigned char P256_G[] = {
34			0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
35			0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
36			0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
37			0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
38			0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
39			0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
40			0x68, 0x37, 0xBF, 0x51, 0xF5
41			};
42
43			static const unsigned char P256_N[] = {
44			0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
45			0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
46			0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
47			0x25, 0x51
48			};
49
50			static const unsigned char *
51	2		api_generator(int curve, size_t *len)
52			{
53			(void)curve;
54	2		*len = sizeof P256_G;
55	2		return P256_G;
56			}
57
58			static const unsigned char *
59	4		api_order(int curve, size_t *len)
60			{
61			(void)curve;
62	4		*len = sizeof P256_N;
63	4		return P256_N;
64			}
65
66			static size_t
67	2		api_xoff(int curve, size_t *len)
68			{
69			(void)curve;
70	2		*len = 32;
71	2		return 1;
72			}
73
74			/*
75			* A field element is encoded as four 64-bit integers, in basis 2^64.
76			* Values may reach up to 2^256-1. Montgomery multiplication is used.
77			*/
78
79			/* R = 2^256 mod p */
80			static const uint64_t F256_R[] = {
81			0x0000000000000001, 0xFFFFFFFF00000000,
82			0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFE
83			};
84
85			/* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p
86			(Montgomery representation of B). */
87			static const uint64_t P256_B_MONTY[] = {
88			0xD89CDF6229C4BDDF, 0xACF005CD78843090,
89			0xE5A220ABF7212ED6, 0xDC30061D04874834
90			};
91
92			/*
93			* Addition in the field.
94			*/
95			static inline void
96	12707		f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
97			{
98			#if BR_INT128
99			unsigned __int128 w;
100			uint64_t t;
101
102			/*
103			* Do the addition, with an extra carry in t.
104			*/
105	12707		w = (unsigned __int128)a[0] + b[0];
106	12707		d[0] = (uint64_t)w;
107	12707		w = (unsigned __int128)a[1] + b[1] + (w >> 64);
108	12707		d[1] = (uint64_t)w;
109	12707		w = (unsigned __int128)a[2] + b[2] + (w >> 64);
110	12707		d[2] = (uint64_t)w;
111	12707		w = (unsigned __int128)a[3] + b[3] + (w >> 64);
112	12707		d[3] = (uint64_t)w;
113	12707		t = (uint64_t)(w >> 64);
114
115			/*
116			* Fold carry t, using: 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p.
117			*/
118	12707		w = (unsigned __int128)d[0] + t;
119	12707		d[0] = (uint64_t)w;
120	12707		w = (unsigned __int128)d[1] + (w >> 64) - (t << 32);
121	12707		d[1] = (uint64_t)w;
122			/* Here, carry "w >> 64" can only be 0 or -1 */
123	12707		w = (unsigned __int128)d[2] - ((w >> 64) & 1);
124	12707		d[2] = (uint64_t)w;
125			/* Again, carry is 0 or -1. But there can be carry only if t = 1,
126			in which case the addition of (t << 32) - t is positive. */
127	12707		w = (unsigned __int128)d[3] - ((w >> 64) & 1) + (t << 32) - t;
128	12707		d[3] = (uint64_t)w;
129	12707		t = (uint64_t)(w >> 64);
130
131			/*
132			* There can be an extra carry here, which we must fold again.
133			*/
134	12707		w = (unsigned __int128)d[0] + t;
135	12707		d[0] = (uint64_t)w;
136	12707		w = (unsigned __int128)d[1] + (w >> 64) - (t << 32);
137	12707		d[1] = (uint64_t)w;
138	12707		w = (unsigned __int128)d[2] - ((w >> 64) & 1);
139	12707		d[2] = (uint64_t)w;
140	12707		d[3] += (t << 32) - t - (uint64_t)((w >> 64) & 1);
141
142			#elif BR_UMUL128
143
144			unsigned char cc;
145			uint64_t t;
146
147			cc = _addcarry_u64(0, a[0], b[0], &d[0]);
148			cc = _addcarry_u64(cc, a[1], b[1], &d[1]);
149			cc = _addcarry_u64(cc, a[2], b[2], &d[2]);
150			cc = _addcarry_u64(cc, a[3], b[3], &d[3]);
151
152			/*
153			* If there is a carry, then we want to subtract p, which we
154			* do by adding 2^256 - p.
155			*/
156			t = cc;
157			cc = _addcarry_u64(cc, d[0], 0, &d[0]);
158			cc = _addcarry_u64(cc, d[1], -(t << 32), &d[1]);
159			cc = _addcarry_u64(cc, d[2], -t, &d[2]);
160			cc = _addcarry_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
161
162			/*
163			* We have to do it again if there still is a carry.
164			*/
165			t = cc;
166			cc = _addcarry_u64(cc, d[0], 0, &d[0]);
167			cc = _addcarry_u64(cc, d[1], -(t << 32), &d[1]);
168			cc = _addcarry_u64(cc, d[2], -t, &d[2]);
169			(void)_addcarry_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
170
171			#endif
172	12707		}
173
174			/*
175			* Subtraction in the field.
176			*/
177			static inline void
178	12366		f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
179			{
180			#if BR_INT128
181
182			unsigned __int128 w;
183			uint64_t t;
184
185	12366		w = (unsigned __int128)a[0] - b[0];
186	12366		d[0] = (uint64_t)w;
187	12366		w = (unsigned __int128)a[1] - b[1] - ((w >> 64) & 1);
188	12366		d[1] = (uint64_t)w;
189	12366		w = (unsigned __int128)a[2] - b[2] - ((w >> 64) & 1);
190	12366		d[2] = (uint64_t)w;
191	12366		w = (unsigned __int128)a[3] - b[3] - ((w >> 64) & 1);
192	12366		d[3] = (uint64_t)w;
193	12366		t = (uint64_t)(w >> 64) & 1;
194
195			/*
196			* If there is a borrow (t = 1), then we must add the modulus
197			* p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
198			*/
199	12366		w = (unsigned __int128)d[0] - t;
200	12366		d[0] = (uint64_t)w;
201	12366		w = (unsigned __int128)d[1] + (t << 32) - ((w >> 64) & 1);
202	12366		d[1] = (uint64_t)w;
203			/* Here, carry "w >> 64" can only be 0 or +1 */
204	12366		w = (unsigned __int128)d[2] + (w >> 64);
205	12366		d[2] = (uint64_t)w;
206			/* Again, carry is 0 or +1 */
207	12366		w = (unsigned __int128)d[3] + (w >> 64) - (t << 32) + t;
208	12366		d[3] = (uint64_t)w;
209	12366		t = (uint64_t)(w >> 64) & 1;
210
211			/*
212			* There may be again a borrow, in which case we must add the
213			* modulus again.
214			*/
215	12366		w = (unsigned __int128)d[0] - t;
216	12366		d[0] = (uint64_t)w;
217	12366		w = (unsigned __int128)d[1] + (t << 32) - ((w >> 64) & 1);
218	12366		d[1] = (uint64_t)w;
219	12366		w = (unsigned __int128)d[2] + (w >> 64);
220	12366		d[2] = (uint64_t)w;
221	12366		d[3] += (uint64_t)(w >> 64) - (t << 32) + t;
222
223			#elif BR_UMUL128
224
225			unsigned char cc;
226			uint64_t t;
227
228			cc = _subborrow_u64(0, a[0], b[0], &d[0]);
229			cc = _subborrow_u64(cc, a[1], b[1], &d[1]);
230			cc = _subborrow_u64(cc, a[2], b[2], &d[2]);
231			cc = _subborrow_u64(cc, a[3], b[3], &d[3]);
232
233			/*
234			* If there is a borrow, then we need to add p. We (virtually)
235			* add 2^256, then subtract 2^256 - p.
236			*/
237			t = cc;
238			cc = _subborrow_u64(0, d[0], t, &d[0]);
239			cc = _subborrow_u64(cc, d[1], -(t << 32), &d[1]);
240			cc = _subborrow_u64(cc, d[2], -t, &d[2]);
241			cc = _subborrow_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
242
243			/*
244			* If there still is a borrow, then we need to add p again.
245			*/
246			t = cc;
247			cc = _subborrow_u64(0, d[0], t, &d[0]);
248			cc = _subborrow_u64(cc, d[1], -(t << 32), &d[1]);
249			cc = _subborrow_u64(cc, d[2], -t, &d[2]);
250			(void)_subborrow_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
251
252			#endif
253	12366		}
254
255			/*
256			* Montgomery multiplication in the field.
257			*/
258			static void
259	22873		f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b)
260			{
261			#if BR_INT128
262
263			uint64_t x, f, t0, t1, t2, t3, t4;
264			unsigned __int128 z, ff;
265			int i;
266
267			/*
268			* When computing d <- d + a[u]*b, we also add f*p such
269			* that d + a[u]*b + f*p is a multiple of 2^64. Since
270			* p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64.
271			*/
272
273			/*
274			* Step 1: t <- (a[0]*b + f*p) / 2^64
275			* We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this
276			* ensures that (a[0]*b + f*p) is a multiple of 2^64.
277			*
278			* We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f.
279			*/
280	22873		x = a[0];
281	22873		z = (unsigned __int128)b[0] * x;
282	22873		f = (uint64_t)z;
283	22873		z = (unsigned __int128)b[1] * x + (z >> 64) + (uint64_t)(f << 32);
284	22873		t0 = (uint64_t)z;
285	22873		z = (unsigned __int128)b[2] * x + (z >> 64) + (uint64_t)(f >> 32);
286	22873		t1 = (uint64_t)z;
287	22873		z = (unsigned __int128)b[3] * x + (z >> 64) + f;
288	22873		t2 = (uint64_t)z;
289	22873		t3 = (uint64_t)(z >> 64);
290	22873		ff = ((unsigned __int128)f << 64) - ((unsigned __int128)f << 32);
291	22873		z = (unsigned __int128)t2 + (uint64_t)ff;
292	22873		t2 = (uint64_t)z;
293	22873		z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64);
294	22873		t3 = (uint64_t)z;
295	22873		t4 = (uint64_t)(z >> 64);
296
297			/*
298			* Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64
299			*/
300	91492	100	for (i = 1; i < 4; i ++) {
301	68619		x = a[i];
302
303			/* t <- (t + x*b - f) / 2^64 */
304	68619		z = (unsigned __int128)b[0] * x + t0;
305	68619		f = (uint64_t)z;
306	68619		z = (unsigned __int128)b[1] * x + t1 + (z >> 64);
307	68619		t0 = (uint64_t)z;
308	68619		z = (unsigned __int128)b[2] * x + t2 + (z >> 64);
309	68619		t1 = (uint64_t)z;
310	68619		z = (unsigned __int128)b[3] * x + t3 + (z >> 64);
311	68619		t2 = (uint64_t)z;
312	68619		z = t4 + (z >> 64);
313	68619		t3 = (uint64_t)z;
314	68619		t4 = (uint64_t)(z >> 64);
315
316			/* t <- t + f*2^32, carry in the upper half of z */
317	68619		z = (unsigned __int128)t0 + (uint64_t)(f << 32);
318	68619		t0 = (uint64_t)z;
319	68619		z = (z >> 64) + (unsigned __int128)t1 + (uint64_t)(f >> 32);
320	68619		t1 = (uint64_t)z;
321
322			/* t <- t + f*2^192 - f*2^160 + f*2^128 */
323	68619		ff = ((unsigned __int128)f << 64)
324	68619		- ((unsigned __int128)f << 32) + f;
325	68619		z = (z >> 64) + (unsigned __int128)t2 + (uint64_t)ff;
326	68619		t2 = (uint64_t)z;
327	68619		z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64);
328	68619		t3 = (uint64_t)z;
329	68619		t4 += (uint64_t)(z >> 64);
330			}
331
332			/*
333			* At that point, we have computed t = (a*b + F*p) / 2^256, where
334			* F is a 256-bit integer whose limbs are the "f" coefficients
335			* in the steps above. We have:
336			* a <= 2^256-1
337			* b <= 2^256-1
338			* F <= 2^256-1
339			* Hence:
340			* a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1)
341			* a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p
342			* Therefore:
343			* t < 2^256 + p - 2
344			* Since p < 2^256, it follows that:
345			* t4 can be only 0 or 1
346			* t - p < 2^256
347			* We can therefore subtract p from t, conditionally on t4, to
348			* get a nonnegative result that fits on 256 bits.
349			*/
350	22873		z = (unsigned __int128)t0 + t4;
351	22873		t0 = (uint64_t)z;
352	22873		z = (unsigned __int128)t1 - (t4 << 32) + (z >> 64);
353	22873		t1 = (uint64_t)z;
354	22873		z = (unsigned __int128)t2 - (z >> 127);
355	22873		t2 = (uint64_t)z;
356	22873		t3 = t3 - (uint64_t)(z >> 127) - t4 + (t4 << 32);
357
358	22873		d[0] = t0;
359	22873		d[1] = t1;
360	22873		d[2] = t2;
361	22873		d[3] = t3;
362
363			#elif BR_UMUL128
364
365			uint64_t x, f, t0, t1, t2, t3, t4;
366			uint64_t zl, zh, ffl, ffh;
367			unsigned char k, m;
368			int i;
369
370			/*
371			* When computing d <- d + a[u]*b, we also add f*p such
372			* that d + a[u]*b + f*p is a multiple of 2^64. Since
373			* p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64.
374			*/
375
376			/*
377			* Step 1: t <- (a[0]*b + f*p) / 2^64
378			* We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this
379			* ensures that (a[0]*b + f*p) is a multiple of 2^64.
380			*
381			* We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f.
382			*/
383			x = a[0];
384
385			zl = _umul128(b[0], x, &zh);
386			f = zl;
387			t0 = zh;
388
389			zl = _umul128(b[1], x, &zh);
390			k = _addcarry_u64(0, zl, t0, &zl);
391			(void)_addcarry_u64(k, zh, 0, &zh);
392			k = _addcarry_u64(0, zl, f << 32, &zl);
393			(void)_addcarry_u64(k, zh, 0, &zh);
394			t0 = zl;
395			t1 = zh;
396
397			zl = _umul128(b[2], x, &zh);
398			k = _addcarry_u64(0, zl, t1, &zl);
399			(void)_addcarry_u64(k, zh, 0, &zh);
400			k = _addcarry_u64(0, zl, f >> 32, &zl);
401			(void)_addcarry_u64(k, zh, 0, &zh);
402			t1 = zl;
403			t2 = zh;
404
405			zl = _umul128(b[3], x, &zh);
406			k = _addcarry_u64(0, zl, t2, &zl);
407			(void)_addcarry_u64(k, zh, 0, &zh);
408			k = _addcarry_u64(0, zl, f, &zl);
409			(void)_addcarry_u64(k, zh, 0, &zh);
410			t2 = zl;
411			t3 = zh;
412
413			t4 = _addcarry_u64(0, t3, f, &t3);
414			k = _subborrow_u64(0, t2, f << 32, &t2);
415			k = _subborrow_u64(k, t3, f >> 32, &t3);
416			(void)_subborrow_u64(k, t4, 0, &t4);
417
418			/*
419			* Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64
420			*/
421			for (i = 1; i < 4; i ++) {
422			x = a[i];
423			/* f = t0 + x * b[0]; -- computed below */
424
425			/* t <- (t + x*b - f) / 2^64 */
426			zl = _umul128(b[0], x, &zh);
427			k = _addcarry_u64(0, zl, t0, &f);
428			(void)_addcarry_u64(k, zh, 0, &t0);
429
430			zl = _umul128(b[1], x, &zh);
431			k = _addcarry_u64(0, zl, t0, &zl);
432			(void)_addcarry_u64(k, zh, 0, &zh);
433			k = _addcarry_u64(0, zl, t1, &t0);
434			(void)_addcarry_u64(k, zh, 0, &t1);
435
436			zl = _umul128(b[2], x, &zh);
437			k = _addcarry_u64(0, zl, t1, &zl);
438			(void)_addcarry_u64(k, zh, 0, &zh);
439			k = _addcarry_u64(0, zl, t2, &t1);
440			(void)_addcarry_u64(k, zh, 0, &t2);
441
442			zl = _umul128(b[3], x, &zh);
443			k = _addcarry_u64(0, zl, t2, &zl);
444			(void)_addcarry_u64(k, zh, 0, &zh);
445			k = _addcarry_u64(0, zl, t3, &t2);
446			(void)_addcarry_u64(k, zh, 0, &t3);
447
448			t4 = _addcarry_u64(0, t3, t4, &t3);
449
450			/* t <- t + f*2^32, carry in k */
451			k = _addcarry_u64(0, t0, f << 32, &t0);
452			k = _addcarry_u64(k, t1, f >> 32, &t1);
453
454			/* t <- t + f*2^192 - f*2^160 + f*2^128 */
455			m = _subborrow_u64(0, f, f << 32, &ffl);
456			(void)_subborrow_u64(m, f, f >> 32, &ffh);
457			k = _addcarry_u64(k, t2, ffl, &t2);
458			k = _addcarry_u64(k, t3, ffh, &t3);
459			(void)_addcarry_u64(k, t4, 0, &t4);
460			}
461
462			/*
463			* At that point, we have computed t = (a*b + F*p) / 2^256, where
464			* F is a 256-bit integer whose limbs are the "f" coefficients
465			* in the steps above. We have:
466			* a <= 2^256-1
467			* b <= 2^256-1
468			* F <= 2^256-1
469			* Hence:
470			* a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1)
471			* a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p
472			* Therefore:
473			* t < 2^256 + p - 2
474			* Since p < 2^256, it follows that:
475			* t4 can be only 0 or 1
476			* t - p < 2^256
477			* We can therefore subtract p from t, conditionally on t4, to
478			* get a nonnegative result that fits on 256 bits.
479			*/
480			k = _addcarry_u64(0, t0, t4, &t0);
481			k = _addcarry_u64(k, t1, -(t4 << 32), &t1);
482			k = _addcarry_u64(k, t2, -t4, &t2);
483			(void)_addcarry_u64(k, t3, (t4 << 32) - (t4 << 1), &t3);
484
485			d[0] = t0;
486			d[1] = t1;
487			d[2] = t2;
488			d[3] = t3;
489
490			#endif
491	22873		}
492
493			/*
494			* Montgomery squaring in the field; currently a basic wrapper around
495			* multiplication (inline, should be optimized away).
496			* TODO: see if some extra speed can be gained here.
497			*/
498			static inline void
499	11040		f256_montysquare(uint64_t *d, const uint64_t *a)
500			{
501	11040		f256_montymul(d, a, a);
502	11040		}
503
504			/*
505			* Convert to Montgomery representation.
506			*/
507			static void
508	6		f256_tomonty(uint64_t *d, const uint64_t *a)
509			{
510			/*
511			* R2 = 2^512 mod p.
512			* If R = 2^256 mod p, then R2 = R^2 mod p; and the Montgomery
513			* multiplication of a by R2 is: a*R2/R = a*R mod p, i.e. the
514			* conversion to Montgomery representation.
515			*/
516			static const uint64_t R2[] = {
517			0x0000000000000003,
518			0xFFFFFFFBFFFFFFFF,
519			0xFFFFFFFFFFFFFFFE,
520			0x00000004FFFFFFFD
521			};
522
523	6		f256_montymul(d, a, R2);
524	6		}
525
526			/*
527			* Convert from Montgomery representation.
528			*/
529			static void
530	12		f256_frommonty(uint64_t *d, const uint64_t *a)
531			{
532			/*
533			* Montgomery multiplication by 1 is division by 2^256 modulo p.
534			*/
535			static const uint64_t one[] = { 1, 0, 0, 0 };
536
537	12		f256_montymul(d, a, one);
538	12		}
539
540			/*
541			* Inversion in the field. If the source value is 0 modulo p, then this
542			* returns 0 or p. This function uses Montgomery representation.
543			*/
544			static void
545	9		f256_invert(uint64_t *d, const uint64_t *a)
546			{
547			/*
548			* We compute a^(p-2) mod p. The exponent pattern (from high to
549			* low) is:
550			* - 32 bits of value 1
551			* - 31 bits of value 0
552			* - 1 bit of value 1
553			* - 96 bits of value 0
554			* - 94 bits of value 1
555			* - 1 bit of value 0
556			* - 1 bit of value 1
557			* To speed up the square-and-multiply algorithm, we precompute
558			* a^(2^31-1).
559			*/
560
561			uint64_t r[4], t[4];
562			int i;
563
564	9		memcpy(t, a, sizeof t);
565	279	100	for (i = 0; i < 30; i ++) {
566	270		f256_montysquare(t, t);
567	270		f256_montymul(t, t, a);
568			}
569
570	9		memcpy(r, t, sizeof t);
571	2034	100	for (i = 224; i >= 0; i --) {
572	2025		f256_montysquare(r, r);
573	2025		switch (i) {
574	36		case 0:
575			case 2:
576			case 192:
577			case 224:
578	36		f256_montymul(r, r, a);
579	36		break;
580	27		case 3:
581			case 34:
582			case 65:
583	27		f256_montymul(r, r, t);
584	27		break;
585			}
586			}
587	9		memcpy(d, r, sizeof r);
588	9		}
589
590			/*
591			* Finalize reduction.
592			* Input value fits on 256 bits. This function subtracts p if and only
593			* if the input is greater than or equal to p.
594			*/
595			static inline void
596	486		f256_final_reduce(uint64_t *a)
597			{
598			#if BR_INT128
599
600			uint64_t t0, t1, t2, t3, cc;
601			unsigned __int128 z;
602
603			/*
604			* We add 2^224 - 2^192 - 2^96 + 1 to a. If there is no carry,
605			* then a < p; otherwise, the addition result we computed is
606			* the value we must return.
607			*/
608	486		z = (unsigned __int128)a[0] + 1;
609	486		t0 = (uint64_t)z;
610	486		z = (unsigned __int128)a[1] + (z >> 64) - ((uint64_t)1 << 32);
611	486		t1 = (uint64_t)z;
612	486		z = (unsigned __int128)a[2] - (z >> 127);
613	486		t2 = (uint64_t)z;
614	486		z = (unsigned __int128)a[3] - (z >> 127) + 0xFFFFFFFF;
615	486		t3 = (uint64_t)z;
616	486		cc = -(uint64_t)(z >> 64);
617
618	486		a[0] ^= cc & (a[0] ^ t0);
619	486		a[1] ^= cc & (a[1] ^ t1);
620	486		a[2] ^= cc & (a[2] ^ t2);
621	486		a[3] ^= cc & (a[3] ^ t3);
622
623			#elif BR_UMUL128
624
625			uint64_t t0, t1, t2, t3, m;
626			unsigned char k;
627
628			k = _addcarry_u64(0, a[0], (uint64_t)1, &t0);
629			k = _addcarry_u64(k, a[1], -((uint64_t)1 << 32), &t1);
630			k = _addcarry_u64(k, a[2], -(uint64_t)1, &t2);
631			k = _addcarry_u64(k, a[3], ((uint64_t)1 << 32) - 2, &t3);
632			m = -(uint64_t)k;
633
634			a[0] ^= m & (a[0] ^ t0);
635			a[1] ^= m & (a[1] ^ t1);
636			a[2] ^= m & (a[2] ^ t2);
637			a[3] ^= m & (a[3] ^ t3);
638
639			#endif
640	486		}
641
642			/*
643			* Points in affine and Jacobian coordinates.
644			*
645			* - In affine coordinates, the point-at-infinity cannot be encoded.
646			* - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3);
647			* if Z = 0 then this is the point-at-infinity.
648			*/
649			typedef struct {
650			uint64_t x[4];
651			uint64_t y[4];
652			} p256_affine;
653
654			typedef struct {
655			uint64_t x[4];
656			uint64_t y[4];
657			uint64_t z[4];
658			} p256_jacobian;
659
660			/*
661			* Decode a point. The returned point is in Jacobian coordinates, but
662			* with z = 1. If the encoding is invalid, or encodes a point which is
663			* not on the curve, or encodes the point at infinity, then this function
664			* returns 0. Otherwise, 1 is returned.
665			*
666			* The buffer is assumed to have length exactly 65 bytes.
667			*/
668			static uint32_t
669	3		point_decode(p256_jacobian *P, const unsigned char *buf)
670			{
671			uint64_t x[4], y[4], t[4], x3[4], tt;
672			uint32_t r;
673
674			/*
675			* Header byte shall be 0x04.
676			*/
677	3		r = EQ(buf[0], 0x04);
678
679			/*
680			* Decode X and Y coordinates, and convert them into
681			* Montgomery representation.
682			*/
683	3		x[3] = br_dec64be(buf + 1);
684	3		x[2] = br_dec64be(buf + 9);
685	3		x[1] = br_dec64be(buf + 17);
686	3		x[0] = br_dec64be(buf + 25);
687	3		y[3] = br_dec64be(buf + 33);
688	3		y[2] = br_dec64be(buf + 41);
689	3		y[1] = br_dec64be(buf + 49);
690	3		y[0] = br_dec64be(buf + 57);
691	3		f256_tomonty(x, x);
692	3		f256_tomonty(y, y);
693
694			/*
695			* Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3.
696			* Note that the Montgomery representation of 0 is 0. We must
697			* take care to apply the final reduction to make sure we have
698			* 0 and not p.
699			*/
700	3		f256_montysquare(t, y);
701	3		f256_montysquare(x3, x);
702	3		f256_montymul(x3, x3, x);
703	3		f256_sub(t, t, x3);
704	3		f256_add(t, t, x);
705	3		f256_add(t, t, x);
706	3		f256_add(t, t, x);
707	3		f256_sub(t, t, P256_B_MONTY);
708	3		f256_final_reduce(t);
709	3		tt = t[0] | t[1] | t[2] | t[3];
710	3		r &= EQ((uint32_t)(tt | (tt >> 32)), 0);
711
712			/*
713			* Return the point in Jacobian coordinates (and Montgomery
714			* representation).
715			*/
716	3		memcpy(P->x, x, sizeof x);
717	3		memcpy(P->y, y, sizeof y);
718	3		memcpy(P->z, F256_R, sizeof F256_R);
719	3		return r;
720			}
721
722			/*
723			* Final conversion for a point:
724			* - The point is converted back to affine coordinates.
725			* - Final reduction is performed.
726			* - The point is encoded into the provided buffer.
727			*
728			* If the point is the point-at-infinity, all operations are performed,
729			* but the buffer contents are indeterminate, and 0 is returned. Otherwise,
730			* the encoded point is written in the buffer, and 1 is returned.
731			*/
732			static uint32_t
733	6		point_encode(unsigned char *buf, const p256_jacobian *P)
734			{
735			uint64_t t1[4], t2[4], z;
736
737			/* Set t1 = 1/z^2 and t2 = 1/z^3. */
738	6		f256_invert(t2, P->z);
739	6		f256_montysquare(t1, t2);
740	6		f256_montymul(t2, t2, t1);
741
742			/* Compute affine coordinates x (in t1) and y (in t2). */
743	6		f256_montymul(t1, P->x, t1);
744	6		f256_montymul(t2, P->y, t2);
745
746			/* Convert back from Montgomery representation, and finalize
747			reductions. */
748	6		f256_frommonty(t1, t1);
749	6		f256_frommonty(t2, t2);
750	6		f256_final_reduce(t1);
751	6		f256_final_reduce(t2);
752
753			/* Encode. */
754	6		buf[0] = 0x04;
755	6		br_enc64be(buf + 1, t1[3]);
756	6		br_enc64be(buf + 9, t1[2]);
757	6		br_enc64be(buf + 17, t1[1]);
758	6		br_enc64be(buf + 25, t1[0]);
759	6		br_enc64be(buf + 33, t2[3]);
760	6		br_enc64be(buf + 41, t2[2]);
761	6		br_enc64be(buf + 49, t2[1]);
762	6		br_enc64be(buf + 57, t2[0]);
763
764			/* Return success if and only if P->z != 0. */
765	6		z = P->z[0] | P->z[1] | P->z[2] | P->z[3];
766	6		return NEQ((uint32_t)(z | z >> 32), 0);
767			}
768
769			/*
770			* Point doubling in Jacobian coordinates: point P is doubled.
771			* Note: if the source point is the point-at-infinity, then the result is
772			* still the point-at-infinity, which is correct. Moreover, if the three
773			* coordinates were zero, then they still are zero in the returned value.
774			*
775			* (Note: this is true even without the final reduction: if the three
776			* coordinates are encoded as four words of value zero each, then the
777			* result will also have all-zero coordinate encodings, not the alternate
778			* encoding as the integer p.)
779			*/
780			static void
781	1814		p256_double(p256_jacobian *P)
782			{
783			/*
784			* Doubling formulas are:
785			*
786			* s = 4*x*y^2
787			* m = 3*(x + z^2)*(x - z^2)
788			* x' = m^2 - 2*s
789			* y' = m*(s - x') - 8*y^4
790			* z' = 2*y*z
791			*
792			* These formulas work for all points, including points of order 2
793			* and points at infinity:
794			* - If y = 0 then z' = 0. But there is no such point in P-256
795			* anyway.
796			* - If z = 0 then z' = 0.
797			*/
798			uint64_t t1[4], t2[4], t3[4], t4[4];
799
800			/*
801			* Compute z^2 in t1.
802			*/
803	1814		f256_montysquare(t1, P->z);
804
805			/*
806			* Compute x+z^2 in t2 and x-z^2 in t1.
807			*/
808	1814		f256_add(t2, P->x, t1);
809	1814		f256_sub(t1, P->x, t1);
810
811			/*
812			* Compute 3*(x+z^2)*(x-z^2) in t1.
813			*/
814	1814		f256_montymul(t3, t1, t2);
815	1814		f256_add(t1, t3, t3);
816	1814		f256_add(t1, t3, t1);
817
818			/*
819			* Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
820			*/
821	1814		f256_montysquare(t3, P->y);
822	1814		f256_add(t3, t3, t3);
823	1814		f256_montymul(t2, P->x, t3);
824	1814		f256_add(t2, t2, t2);
825
826			/*
827			* Compute x' = m^2 - 2*s.
828			*/
829	1814		f256_montysquare(P->x, t1);
830	1814		f256_sub(P->x, P->x, t2);
831	1814		f256_sub(P->x, P->x, t2);
832
833			/*
834			* Compute z' = 2*y*z.
835			*/
836	1814		f256_montymul(t4, P->y, P->z);
837	1814		f256_add(P->z, t4, t4);
838
839			/*
840			* Compute y' = m*(s - x') - 8*y^4. Note that we already have
841			* 2*y^2 in t3.
842			*/
843	1814		f256_sub(t2, t2, P->x);
844	1814		f256_montymul(P->y, t1, t2);
845	1814		f256_montysquare(t4, t3);
846	1814		f256_add(t4, t4, t4);
847	1814		f256_sub(P->y, P->y, t4);
848	1814		}
849
850			/*
851			* Point addition (Jacobian coordinates): P1 is replaced with P1+P2.
852			* This function computes the wrong result in the following cases:
853			*
854			* - If P1 == 0 but P2 != 0
855			* - If P1 != 0 but P2 == 0
856			* - If P1 == P2
857			*
858			* In all three cases, P1 is set to the point at infinity.
859			*
860			* Returned value is 0 if one of the following occurs:
861			*
862			* - P1 and P2 have the same Y coordinate.
863			* - P1 == 0 and P2 == 0.
864			* - The Y coordinate of one of the points is 0 and the other point is
865			* the point at infinity.
866			*
867			* The third case cannot actually happen with valid points, since a point
868			* with Y == 0 is a point of order 2, and there is no point of order 2 on
869			* curve P-256.
870			*
871			* Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
872			* can apply the following:
873			*
874			* - If the result is not the point at infinity, then it is correct.
875			* - Otherwise, if the returned value is 1, then this is a case of
876			* P1+P2 == 0, so the result is indeed the point at infinity.
877			* - Otherwise, P1 == P2, so a "double" operation should have been
878			* performed.
879			*
880			* Note that you can get a returned value of 0 with a correct result,
881			* e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates.
882			*/
883			static uint32_t
884	22		p256_add(p256_jacobian *P1, const p256_jacobian *P2)
885			{
886			/*
887			* Addition formulas are:
888			*
889			* u1 = x1 * z2^2
890			* u2 = x2 * z1^2
891			* s1 = y1 * z2^3
892			* s2 = y2 * z1^3
893			* h = u2 - u1
894			* r = s2 - s1
895			* x3 = r^2 - h^3 - 2 * u1 * h^2
896			* y3 = r * (u1 * h^2 - x3) - s1 * h^3
897			* z3 = h * z1 * z2
898			*/
899			uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt;
900			uint32_t ret;
901
902			/*
903			* Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
904			*/
905	22		f256_montysquare(t3, P2->z);
906	22		f256_montymul(t1, P1->x, t3);
907	22		f256_montymul(t4, P2->z, t3);
908	22		f256_montymul(t3, P1->y, t4);
909
910			/*
911			* Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
912			*/
913	22		f256_montysquare(t4, P1->z);
914	22		f256_montymul(t2, P2->x, t4);
915	22		f256_montymul(t5, P1->z, t4);
916	22		f256_montymul(t4, P2->y, t5);
917
918			/*
919			* Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
920			* We need to test whether r is zero, so we will do some extra
921			* reduce.
922			*/
923	22		f256_sub(t2, t2, t1);
924	22		f256_sub(t4, t4, t3);
925	22		f256_final_reduce(t4);
926	22		tt = t4[0] | t4[1] | t4[2] | t4[3];
927	22		ret = (uint32_t)(tt | (tt >> 32));
928	22		ret = (ret | -ret) >> 31;
929
930			/*
931			* Compute u1*h^2 (in t6) and h^3 (in t5);
932			*/
933	22		f256_montysquare(t7, t2);
934	22		f256_montymul(t6, t1, t7);
935	22		f256_montymul(t5, t7, t2);
936
937			/*
938			* Compute x3 = r^2 - h^3 - 2*u1*h^2.
939			*/
940	22		f256_montysquare(P1->x, t4);
941	22		f256_sub(P1->x, P1->x, t5);
942	22		f256_sub(P1->x, P1->x, t6);
943	22		f256_sub(P1->x, P1->x, t6);
944
945			/*
946			* Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
947			*/
948	22		f256_sub(t6, t6, P1->x);
949	22		f256_montymul(P1->y, t4, t6);
950	22		f256_montymul(t1, t5, t3);
951	22		f256_sub(P1->y, P1->y, t1);
952
953			/*
954			* Compute z3 = h*z1*z2.
955			*/
956	22		f256_montymul(t1, P1->z, P2->z);
957	22		f256_montymul(P1->z, t1, t2);
958
959	22		return ret;
960			}
961
962			/*
963			* Point addition (mixed coordinates): P1 is replaced with P1+P2.
964			* This is a specialised function for the case when P2 is a non-zero point
965			* in affine coordinates.
966			*
967			* This function computes the wrong result in the following cases:
968			*
969			* - If P1 == 0
970			* - If P1 == P2
971			*
972			* In both cases, P1 is set to the point at infinity.
973			*
974			* Returned value is 0 if one of the following occurs:
975			*
976			* - P1 and P2 have the same Y (affine) coordinate.
977			* - The Y coordinate of P2 is 0 and P1 is the point at infinity.
978			*
979			* The second case cannot actually happen with valid points, since a point
980			* with Y == 0 is a point of order 2, and there is no point of order 2 on
981			* curve P-256.
982			*
983			* Therefore, assuming that P1 != 0 on input, then the caller
984			* can apply the following:
985			*
986			* - If the result is not the point at infinity, then it is correct.
987			* - Otherwise, if the returned value is 1, then this is a case of
988			* P1+P2 == 0, so the result is indeed the point at infinity.
989			* - Otherwise, P1 == P2, so a "double" operation should have been
990			* performed.
991			*
992			* Again, a value of 0 may be returned in some cases where the addition
993			* result is correct.
994			*/
995			static uint32_t
996	448		p256_add_mixed(p256_jacobian *P1, const p256_affine *P2)
997			{
998			/*
999			* Addition formulas are:
1000			*
1001			* u1 = x1
1002			* u2 = x2 * z1^2
1003			* s1 = y1
1004			* s2 = y2 * z1^3
1005			* h = u2 - u1
1006			* r = s2 - s1
1007			* x3 = r^2 - h^3 - 2 * u1 * h^2
1008			* y3 = r * (u1 * h^2 - x3) - s1 * h^3
1009			* z3 = h * z1
1010			*/
1011			uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt;
1012			uint32_t ret;
1013
1014			/*
1015			* Compute u1 = x1 (in t1) and s1 = y1 (in t3).
1016			*/
1017	448		memcpy(t1, P1->x, sizeof t1);
1018	448		memcpy(t3, P1->y, sizeof t3);
1019
1020			/*
1021			* Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1022			*/
1023	448		f256_montysquare(t4, P1->z);
1024	448		f256_montymul(t2, P2->x, t4);
1025	448		f256_montymul(t5, P1->z, t4);
1026	448		f256_montymul(t4, P2->y, t5);
1027
1028			/*
1029			* Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
1030			* We need to test whether r is zero, so we will do an extra
1031			* reduction.
1032			*/
1033	448		f256_sub(t2, t2, t1);
1034	448		f256_sub(t4, t4, t3);
1035	448		f256_final_reduce(t4);
1036	448		tt = t4[0] \| t4[1] \| t4[2] \| t4[3];
1037	448		ret = (uint32_t)(tt \| (tt >> 32));
1038	448		ret = (ret \| -ret) >> 31;
1039
1040			/*
1041			* Compute u1*h^2 (in t6) and h^3 (in t5);
1042			*/
1043	448		f256_montysquare(t7, t2);
1044	448		f256_montymul(t6, t1, t7);
1045	448		f256_montymul(t5, t7, t2);
1046
1047			/*
1048			* Compute x3 = r^2 - h^3 - 2*u1*h^2.
1049			*/
1050	448		f256_montysquare(P1->x, t4);
1051	448		f256_sub(P1->x, P1->x, t5);
1052	448		f256_sub(P1->x, P1->x, t6);
1053	448		f256_sub(P1->x, P1->x, t6);
1054
1055			/*
1056			* Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1057			*/
1058	448		f256_sub(t6, t6, P1->x);
1059	448		f256_montymul(P1->y, t4, t6);
1060	448		f256_montymul(t1, t5, t3);
1061	448		f256_sub(P1->y, P1->y, t1);
1062
1063			/*
1064			* Compute z3 = h*z1 (z2 = 1 implicitly, since P2 is affine).
1065			*/
1066	448		f256_montymul(P1->z, P1->z, t2);
1067
1068	448		return ret;
1069			}
1070
1071			#if 0
1072			/* unused */
1073			/*
1074			* Point addition (mixed coordinates, complete): P1 is replaced with P1+P2.
1075			* This is a specialised function for the case when P2 is a non-zero point
1076			* in affine coordinates.
1077			*
1078			* This function returns the correct result in all cases.
1079			*/
1080			static uint32_t
1081			p256_add_complete_mixed(p256_jacobian P1, const p256_affine P2)
1082			{
1083			/*
1084			* Addition formulas, in the general case, are:
1085			*
1086			* u1 = x1
1087			* u2 = x2 * z1^2
1088			* s1 = y1
1089			* s2 = y2 * z1^3
1090			* h = u2 - u1
1091			* r = s2 - s1
1092			* x3 = r^2 - h^3 - 2 * u1 * h^2
1093			* y3 = r * (u1 * h^2 - x3) - s1 * h^3
1094			* z3 = h * z1
1095			*
1096			* These formulas mishandle the two following cases:
1097			*
1098			* - If P1 is the point-at-infinity (z1 = 0), then z3 is
1099			* incorrectly set to 0.
1100			*
1101			* - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3
1102			* are all set to 0.
1103			*
1104			* However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then
1105			* we correctly get z3 = 0 (the point-at-infinity).
1106			*
1107			* To fix the case P1 = 0, we perform at the end a copy of P2
1108			* over P1, conditional to z1 = 0.
1109			*
1110			* For P1 = P2: in that case, both h and r are set to 0, and
1111			* we get x3, y3 and z3 equal to 0. We can test for that
1112			* occurrence to make a mask which will be all-one if P1 = P2,
1113			* or all-zero otherwise; then we can compute the double of P2
1114			* and add it, combined with the mask, to (x3,y3,z3).
1115			*
1116			* Using the doubling formulas in p256_double() on (x2,y2),
1117			* simplifying since P2 is affine (i.e. z2 = 1, implicitly),
1118			* we get:
1119			* s = 4*x2*y2^2
1120			* m = 3*(x2 + 1)*(x2 - 1)
1121			* x' = m^2 - 2*s
1122			* y' = m*(s - x') - 8*y2^4
1123			* z' = 2*y2
1124			* which requires only 6 multiplications. Added to the 11
1125			* multiplications of the normal mixed addition in Jacobian
1126			* coordinates, we get a cost of 17 multiplications in total.
1127			*/
1128			uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt, zz;
1129			int i;
1130
1131			/*
1132			* Set zz to -1 if P1 is the point at infinity, 0 otherwise.
1133			*/
1134			zz = P1->z[0] \| P1->z[1] \| P1->z[2] \| P1->z[3];
1135			zz = ((zz \| -zz) >> 63) - (uint64_t)1;
1136
1137			/*
1138			* Compute u1 = x1 (in t1) and s1 = y1 (in t3).
1139			*/
1140			memcpy(t1, P1->x, sizeof t1);
1141			memcpy(t3, P1->y, sizeof t3);
1142
1143			/*
1144			* Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1145			*/
1146			f256_montysquare(t4, P1->z);
1147			f256_montymul(t2, P2->x, t4);
1148			f256_montymul(t5, P1->z, t4);
1149			f256_montymul(t4, P2->y, t5);
1150
1151			/*
1152			* Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
1153			* Both are fully reduced below so that they can be tested for zero.
1154			*/
1155			f256_sub(t2, t2, t1);
1156			f256_sub(t4, t4, t3);
1157
1158			/*
1159			* If both h = 0 and r = 0, then P1 = P2, and we want to set
1160			* the mask tt to -1; otherwise, the mask will be 0.
1161			*/
1162			f256_final_reduce(t2);
1163			f256_final_reduce(t4);
1164			tt = t2[0] \| t2[1] \| t2[2] \| t2[3] \| t4[0] \| t4[1] \| t4[2] \| t4[3];
1165			tt = ((tt \| -tt) >> 63) - (uint64_t)1;
1166
1167			/*
1168			* Compute u1*h^2 (in t6) and h^3 (in t5);
1169			*/
1170			f256_montysquare(t7, t2);
1171			f256_montymul(t6, t1, t7);
1172			f256_montymul(t5, t7, t2);
1173
1174			/*
1175			* Compute x3 = r^2 - h^3 - 2*u1*h^2.
1176			*/
1177			f256_montysquare(P1->x, t4);
1178			f256_sub(P1->x, P1->x, t5);
1179			f256_sub(P1->x, P1->x, t6);
1180			f256_sub(P1->x, P1->x, t6);
1181
1182			/*
1183			* Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1184			*/
1185			f256_sub(t6, t6, P1->x);
1186			f256_montymul(P1->y, t4, t6);
1187			f256_montymul(t1, t5, t3);
1188			f256_sub(P1->y, P1->y, t1);
1189
1190			/*
1191			* Compute z3 = h*z1.
1192			*/
1193			f256_montymul(P1->z, P1->z, t2);
1194
1195			/*
1196			* The "double" result, in case P1 = P2.
1197			*/
1198
1199			/*
1200			* Compute z' = 2*y2 (in t1).
1201			*/
1202			f256_add(t1, P2->y, P2->y);
1203
1204			/*
1205			* Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3).
1206			*/
1207			f256_montysquare(t2, P2->y);
1208			f256_add(t2, t2, t2);
1209			f256_add(t3, t2, t2);
1210			f256_montymul(t3, P2->x, t3);
1211
1212			/*
1213			* Compute m = 3*(x2^2 - 1) (in t4).
1214			*/
1215			f256_montysquare(t4, P2->x);
1216			f256_sub(t4, t4, F256_R);
1217			f256_add(t5, t4, t4);
1218			f256_add(t4, t4, t5);
1219
1220			/*
1221			* Compute x' = m^2 - 2*s (in t5).
1222			*/
1223			f256_montysquare(t5, t4);
1224			f256_sub(t5, t3);
1225			f256_sub(t5, t3);
1226
1227			/*
1228			* Compute y' = m*(s - x') - 8*y2^4 (in t6).
1229			*/
1230			f256_sub(t6, t3, t5);
1231			f256_montymul(t6, t6, t4);
1232			f256_montysquare(t7, t2);
1233			f256_sub(t6, t6, t7);
1234			f256_sub(t6, t6, t7);
1235
1236			/*
1237			* We now have the alternate (doubling) coordinates in (t5,t6,t1).
1238			* We combine them with (x3,y3,z3).
1239			*/
1240			for (i = 0; i < 4; i ++) {
1241			P1->x[i] \|= tt & t5[i];
1242			P1->y[i] \|= tt & t6[i];
1243			P1->z[i] \|= tt & t1[i];
1244			}
1245
1246			/*
1247			* If P1 = 0, then we get z3 = 0 (which is invalid); if z1 is 0,
1248			* then we want to replace the result with a copy of P2. The
1249			* test on z1 was done at the start, in the zz mask.
1250			*/
1251			for (i = 0; i < 4; i ++) {
1252			P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]);
1253			P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]);
1254			P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]);
1255			}
1256			}
1257			#endif
1258
1259			/*
1260			* Inner function for computing a point multiplication. A window is
1261			* provided, with points 1P to 15P in affine coordinates.
1262			*
1263			* Assumptions:
1264			* - All provided points are valid points on the curve.
1265			* - Multiplier is non-zero, and smaller than the curve order.
1266			* - Everything is in Montgomery representation.
1267			*/
1268			static void
1269	7		point_mul_inner(p256_jacobian R, const p256_affine W,
1270			const unsigned char *k, size_t klen)
1271			{
1272			p256_jacobian Q;
1273			uint32_t qz;
1274
1275	7		memset(&Q, 0, sizeof Q);
1276	7		qz = 1;
1277	231	100	while (klen -- > 0) {
1278			int i;
1279			unsigned bk;
1280
1281	224		bk = *k ++;
1282	672	100	for (i = 0; i < 2; i ++) {
1283			uint32_t bits;
1284			uint32_t bnz;
1285			p256_affine T;
1286			p256_jacobian U;
1287			uint32_t n;
1288			int j;
1289			uint64_t m;
1290
1291	448		p256_double(&Q);
1292	448		p256_double(&Q);
1293	448		p256_double(&Q);
1294	448		p256_double(&Q);
1295	448		bits = (bk >> 4) & 0x0F;
1296	448		bnz = NEQ(bits, 0);
1297
1298			/*
1299			* Lookup point in window. If the bits are 0,
1300			* we get something invalid, which is not a
1301			* problem because we will use it only if the
1302			* bits are non-zero.
1303			*/
1304	448		memset(&T, 0, sizeof T);
1305	7168	100	for (n = 0; n < 15; n ++) {
1306	6720		m = -(uint64_t)EQ(bits, n + 1);
1307	6720		T.x[0] \|= m & W[n].x[0];
1308	6720		T.x[1] \|= m & W[n].x[1];
1309	6720		T.x[2] \|= m & W[n].x[2];
1310	6720		T.x[3] \|= m & W[n].x[3];
1311	6720		T.y[0] \|= m & W[n].y[0];
1312	6720		T.y[1] \|= m & W[n].y[1];
1313	6720		T.y[2] \|= m & W[n].y[2];
1314	6720		T.y[3] \|= m & W[n].y[3];
1315			}
1316
1317	448		U = Q;
1318	448		p256_add_mixed(&U, &T);
1319
1320			/*
1321			* If qz is still 1, then Q was all-zeros, and this
1322			* is conserved through p256_double().
1323			*/
1324	448		m = -(uint64_t)(bnz & qz);
1325	2240	100	for (j = 0; j < 4; j ++) {
1326	1792		Q.x[j] \|= m & T.x[j];
1327	1792		Q.y[j] \|= m & T.y[j];
1328	1792		Q.z[j] \|= m & F256_R[j];
1329			}
1330	448		CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1331	448		qz &= ~bnz;
1332	448		bk <<= 4;
1333			}
1334			}
1335	7		*R = Q;
1336	7		}
1337
1338			/*
1339			* Convert a window from Jacobian to affine coordinates. A single
1340			* field inversion is used. This function works for windows up to
1341			* 32 elements.
1342			*
1343			* The destination array (aff[]) and the source array (jac[]) may
1344			* overlap, provided that the start of aff[] is not after the start of
1345			* jac[]. Even if the arrays do _not_ overlap, the source array is
1346			* modified.
1347			*/
1348			static void
1349	3		window_to_affine(p256_affine aff, p256_jacobian jac, int num)
1350			{
1351			/*
1352			* Convert the window points to affine coordinates. We use the
1353			* following trick to mutualize the inversion computation: if
1354			* we have z1, z2, z3, and z4, and want to inverse all of them,
1355			* we compute u = 1/(z1*z2*z3*z4), and then we have:
1356			* 1/z1 = u*z2*z3*z4
1357			* 1/z2 = u*z1*z3*z4
1358			* 1/z3 = u*z1*z2*z4
1359			* 1/z4 = u*z1*z2*z3
1360			*
1361			* The partial products are computed recursively:
1362			*
1363			* - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2
1364			* - on input (z_1,z_2,... z_n):
1365			* recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1
1366			* recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2
1367			* multiply elements of r1 by m2 -> s1
1368			* multiply elements of r2 by m1 -> s2
1369			* return r1\|\|r2 and m1*m2
1370			*
1371			* In the example below, we suppose that we have 14 elements.
1372			* Let z1, z2,... zE be the 14 values to invert (index noted in
1373			* hexadecimal, starting at 1).
1374			*
1375			* - Depth 1:
1376			* swap(z1, z2); z12 = z1*z2
1377			* swap(z3, z4); z34 = z3*z4
1378			* swap(z5, z6); z56 = z5*z6
1379			* swap(z7, z8); z78 = z7*z8
1380			* swap(z9, zA); z9A = z9*zA
1381			* swap(zB, zC); zBC = zB*zC
1382			* swap(zD, zE); zDE = zD*zE
1383			*
1384			* - Depth 2:
1385			* z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12
1386			* z1234 = z12*z34
1387			* z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56
1388			* z5678 = z56*z78
1389			* z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A
1390			* z9ABC = z9A*zBC
1391			*
1392			* - Depth 3:
1393			* z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678
1394			* z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234
1395			* z12345678 = z1234*z5678
1396			* z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE
1397			* zD <- zD*z9ABC, zE <- zE*z9ABC
1398			* z9ABCDE = z9ABC*zDE
1399			*
1400			* - Depth 4:
1401			* multiply z1..z8 by z9ABCDE
1402			* multiply z9..zE by z12345678
1403			* final z = z12345678*z9ABCDE
1404			*/
1405
1406			uint64_t z[16][4];
1407			int i, k, s;
1408			#define zt (z[15])
1409			#define zu (z[14])
1410			#define zv (z[13])
1411
1412			/*
1413			* First recursion step (pairwise swapping and multiplication).
1414			* If there is an odd number of elements, then we "invent" an
1415			* extra one with coordinate Z = 1 (in Montgomery representation).
1416			*/
1417	24	100	for (i = 0; (i + 1) < num; i += 2) {
1418	21		memcpy(zt, jac[i].z, sizeof zt);
1419	21		memcpy(jac[i].z, jac[i + 1].z, sizeof zt);
1420	21		memcpy(jac[i + 1].z, zt, sizeof zt);
1421	21		f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z);
1422			}
1423	3	50	if ((num & 1) != 0) {
1424	3		memcpy(z[num >> 1], jac[num - 1].z, sizeof zt);
1425	3		memcpy(jac[num - 1].z, F256_R, sizeof F256_R);
1426			}
1427
1428			/*
1429			* Perform further recursion steps. At the entry of each step,
1430			* the process has been done for groups of 's' points. The
1431			* integer k is the log2 of s.
1432			*/
1433	12	100	for (k = 1, s = 2; s < num; k ++, s <<= 1) {
1434			int n;
1435
1436	144	100	for (i = 0; i < num; i ++) {
1437	135		f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]);
1438			}
1439	9		n = (num + s - 1) >> k;
1440	30	100	for (i = 0; i < (n >> 1); i ++) {
1441	21		f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]);
1442			}
1443	9	50	if ((n & 1) != 0) {
1444	0		memmove(z[n >> 1], z[n], sizeof zt);
1445			}
1446			}
1447
1448			/*
1449			* Invert the final result, and convert all points.
1450			*/
1451	3		f256_invert(zt, z[0]);
1452	48	100	for (i = 0; i < num; i ++) {
1453	45		f256_montymul(zv, jac[i].z, zt);
1454	45		f256_montysquare(zu, zv);
1455	45		f256_montymul(zv, zv, zu);
1456	45		f256_montymul(aff[i].x, jac[i].x, zu);
1457	45		f256_montymul(aff[i].y, jac[i].y, zv);
1458			}
1459	3		}
1460
1461			/*
1462			* Multiply the provided point by an integer.
1463			* Assumptions:
1464			* - Source point is a valid curve point.
1465			* - Source point is not the point-at-infinity.
1466			* - Integer is not 0, and is lower than the curve order.
1467			* If these conditions are not met, then the result is indeterminate
1468			* (but the process is still constant-time).
1469			*/
1470			static void
1471	3		p256_mul(p256_jacobian P, const unsigned char k, size_t klen)
1472			{
1473			union {
1474			p256_affine aff[15];
1475			p256_jacobian jac[15];
1476			} window;
1477			int i;
1478
1479			/*
1480			* Compute window, in Jacobian coordinates.
1481			*/
1482	3		window.jac[0] = *P;
1483	45	100	for (i = 2; i < 16; i ++) {
1484	42		window.jac[i - 1] = window.jac[(i >> 1) - 1];
1485	42	100	if ((i & 1) == 0) {
1486	21		p256_double(&window.jac[i - 1]);
1487			} else {
1488	21		p256_add(&window.jac[i - 1], &window.jac[i >> 1]);
1489			}
1490			}
1491
1492			/*
1493			* Convert the window points to affine coordinates. Point
1494			* window[0] is the source point, already in affine coordinates.
1495			*/
1496	3		window_to_affine(window.aff, window.jac, 15);
1497
1498			/*
1499			* Perform point multiplication.
1500			*/
1501	3		point_mul_inner(P, window.aff, k, klen);
1502	3		}
1503
1504			/*
1505			* Precomputed window for the conventional generator: P256_Gwin[n]
1506			* contains (n+1)*G (affine coordinates, in Montgomery representation).
1507			*/
1508			static const p256_affine P256_Gwin[] = {
1509			{
1510			{ 0x79E730D418A9143C, 0x75BA95FC5FEDB601,
1511			0x79FB732B77622510, 0x18905F76A53755C6 },
1512			{ 0xDDF25357CE95560A, 0x8B4AB8E4BA19E45C,
1513			0xD2E88688DD21F325, 0x8571FF1825885D85 }
1514			},
1515			{
1516			{ 0x850046D410DDD64D, 0xAA6AE3C1A433827D,
1517			0x732205038D1490D9, 0xF6BB32E43DCF3A3B },
1518			{ 0x2F3648D361BEE1A5, 0x152CD7CBEB236FF8,
1519			0x19A8FB0E92042DBE, 0x78C577510A5B8A3B }
1520			},
1521			{
1522			{ 0xFFAC3F904EEBC127, 0xB027F84A087D81FB,
1523			0x66AD77DD87CBBC98, 0x26936A3FB6FF747E },
1524			{ 0xB04C5C1FC983A7EB, 0x583E47AD0861FE1A,
1525			0x788208311A2EE98E, 0xD5F06A29E587CC07 }
1526			},
1527			{
1528			{ 0x74B0B50D46918DCC, 0x4650A6EDC623C173,
1529			0x0CDAACACE8100AF2, 0x577362F541B0176B },
1530			{ 0x2D96F24CE4CBABA6, 0x17628471FAD6F447,
1531			0x6B6C36DEE5DDD22E, 0x84B14C394C5AB863 }
1532			},
1533			{
1534			{ 0xBE1B8AAEC45C61F5, 0x90EC649A94B9537D,
1535			0x941CB5AAD076C20C, 0xC9079605890523C8 },
1536			{ 0xEB309B4AE7BA4F10, 0x73C568EFE5EB882B,
1537			0x3540A9877E7A1F68, 0x73A076BB2DD1E916 }
1538			},
1539			{
1540			{ 0x403947373E77664A, 0x55AE744F346CEE3E,
1541			0xD50A961A5B17A3AD, 0x13074B5954213673 },
1542			{ 0x93D36220D377E44B, 0x299C2B53ADFF14B5,
1543			0xF424D44CEF639F11, 0xA4C9916D4A07F75F }
1544			},
1545			{
1546			{ 0x0746354EA0173B4F, 0x2BD20213D23C00F7,
1547			0xF43EAAB50C23BB08, 0x13BA5119C3123E03 },
1548			{ 0x2847D0303F5B9D4D, 0x6742F2F25DA67BDD,
1549			0xEF933BDC77C94195, 0xEAEDD9156E240867 }
1550			},
1551			{
1552			{ 0x27F14CD19499A78F, 0x462AB5C56F9B3455,
1553			0x8F90F02AF02CFC6B, 0xB763891EB265230D },
1554			{ 0xF59DA3A9532D4977, 0x21E3327DCF9EBA15,
1555			0x123C7B84BE60BBF0, 0x56EC12F27706DF76 }
1556			},
1557			{
1558			{ 0x75C96E8F264E20E8, 0xABE6BFED59A7A841,
1559			0x2CC09C0444C8EB00, 0xE05B3080F0C4E16B },
1560			{ 0x1EB7777AA45F3314, 0x56AF7BEDCE5D45E3,
1561			0x2B6E019A88B12F1A, 0x086659CDFD835F9B }
1562			},
1563			{
1564			{ 0x2C18DBD19DC21EC8, 0x98F9868A0FCF8139,
1565			0x737D2CD648250B49, 0xCC61C94724B3428F },
1566			{ 0x0C2B407880DD9E76, 0xC43A8991383FBE08,
1567			0x5F7D2D65779BE5D2, 0x78719A54EB3B4AB5 }
1568			},
1569			{
1570			{ 0xEA7D260A6245E404, 0x9DE407956E7FDFE0,
1571			0x1FF3A4158DAC1AB5, 0x3E7090F1649C9073 },
1572			{ 0x1A7685612B944E88, 0x250F939EE57F61C8,
1573			0x0C0DAA891EAD643D, 0x68930023E125B88E }
1574			},
1575			{
1576			{ 0x04B71AA7D2697768, 0xABDEDEF5CA345A33,
1577			0x2409D29DEE37385E, 0x4EE1DF77CB83E156 },
1578			{ 0x0CAC12D91CBB5B43, 0x170ED2F6CA895637,
1579			0x28228CFA8ADE6D66, 0x7FF57C9553238ACA }
1580			},
1581			{
1582			{ 0xCCC425634B2ED709, 0x0E356769856FD30D,
1583			0xBCBCD43F559E9811, 0x738477AC5395B759 },
1584			{ 0x35752B90C00EE17F, 0x68748390742ED2E3,
1585			0x7CD06422BD1F5BC1, 0xFBC08769C9E7B797 }
1586			},
1587			{
1588			{ 0xA242A35BB0CF664A, 0x126E48F77F9707E3,
1589			0x1717BF54C6832660, 0xFAAE7332FD12C72E },
1590			{ 0x27B52DB7995D586B, 0xBE29569E832237C2,
1591			0xE8E4193E2A65E7DB, 0x152706DC2EAA1BBB }
1592			},
1593			{
1594			{ 0x72BCD8B7BC60055B, 0x03CC23EE56E27E4B,
1595			0xEE337424E4819370, 0xE2AA0E430AD3DA09 },
1596			{ 0x40B8524F6383C45D, 0xD766355442A41B25,
1597			0x64EFA6DE778A4797, 0x2042170A7079ADF4 }
1598			}
1599			};
1600
1601			/*
1602			* Multiply the conventional generator of the curve by the provided
1603			* integer. Return is written in *P.
1604			*
1605			* Assumptions:
1606			* - Integer is not 0, and is lower than the curve order.
1607			* If this condition is not met, then the result is indeterminate
1608			* (but the process is still constant-time).
1609			*/
1610			static void
1611	4		p256_mulgen(p256_jacobian P, const unsigned char k, size_t klen)
1612			{
1613	4		point_mul_inner(P, P256_Gwin, k, klen);
1614	4		}
1615
1616			/*
1617			* Return 1 if all of the following hold:
1618			* - klen <= 32
1619			* - k != 0
1620			* - k is lower than the curve order
1621			* Otherwise, return 0.
1622			*
1623			* Constant-time behaviour: only klen may be observable.
1624			*/
1625			static uint32_t
1626	2		check_scalar(const unsigned char *k, size_t klen)
1627			{
1628			uint32_t z;
1629			int32_t c;
1630			size_t u;
1631
1632	2	50	if (klen > 32) {
1633	0		return 0;
1634			}
1635	2		z = 0;
1636	66	100	for (u = 0; u < klen; u ++) {
1637	64		z \|= k[u];
1638			}
1639	2	50	if (klen == 32) {
1640	2		c = 0;
1641	66	100	for (u = 0; u < klen; u ++) {
1642	64		c \|= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]);
1643			}
1644			} else {
1645	0		c = -1;
1646			}
1647	2		return NEQ(z, 0) & LT0(c);
1648			}
1649
1650			static uint32_t
1651	2		api_mul(unsigned char *G, size_t Glen,
1652			const unsigned char *k, size_t klen, int curve)
1653			{
1654			uint32_t r;
1655			p256_jacobian P;
1656
1657			(void)curve;
1658	2	50	if (Glen != 65) {
1659	0		return 0;
1660			}
1661	2		r = check_scalar(k, klen);
1662	2		r &= point_decode(&P, G);
1663	2		p256_mul(&P, k, klen);
1664	2		r &= point_encode(G, &P);
1665	2		return r;
1666			}
1667
1668			static size_t
1669	3		api_mulgen(unsigned char *R,
1670			const unsigned char *k, size_t klen, int curve)
1671			{
1672			p256_jacobian P;
1673
1674			(void)curve;
1675	3		p256_mulgen(&P, k, klen);
1676	3		point_encode(R, &P);
1677	3		return 65;
1678			}
1679
1680			static uint32_t
1681	1		api_muladd(unsigned char A, const unsigned char B, size_t len,
1682			const unsigned char *x, size_t xlen,
1683			const unsigned char *y, size_t ylen, int curve)
1684			{
1685			/*
1686			* We might want to use Shamir's trick here: make a composite
1687			* window of uP+vQ points, to merge the two doubling-ladders
1688			* into one. This, however, has some complications:
1689			*
1690			* - During the computation, we may hit the point-at-infinity.
1691			* Thus, we would need p256_add_complete_mixed() (complete
1692			* formulas for point addition), with a higher cost (17 muls
1693			* instead of 11).
1694			*
1695			* - A 4-bit window would be too large, since it would involve
1696			* 16*16-1 = 255 points. For the same window size as in the
1697			* p256_mul() case, we would need to reduce the window size
1698			* to 2 bits, and thus perform twice as many non-doubling
1699			* point additions.
1700			*
1701			* - The window may itself contain the point-at-infinity, and
1702			* thus cannot, in all generality, be made of affine points.
1703			* Instead, we would need to make it a window of points in
1704			* Jacobian coordinates. Even p256_add_complete_mixed() would
1705			* be inappropriate.
1706			*
1707			* For these reasons, the code below performs two separate
1708			* point multiplications, then computes the final point addition
1709			* (which is both a "normal" addition, and a doubling, to handle
1710			* all cases).
1711			*/
1712
1713			p256_jacobian P, Q;
1714			uint32_t r, t, s;
1715			uint64_t z;
1716
1717			(void)curve;
1718	1	50	if (len != 65) {
1719	0		return 0;
1720			}
1721	1		r = point_decode(&P, A);
1722	1		p256_mul(&P, x, xlen);
1723	1	50	if (B == NULL) {
1724	1		p256_mulgen(&Q, y, ylen);
1725			} else {
1726	0		r &= point_decode(&Q, B);
1727	0		p256_mul(&Q, y, ylen);
1728			}
1729
1730			/*
1731			* The final addition may fail in case both points are equal.
1732			*/
1733	1		t = p256_add(&P, &Q);
1734	1		f256_final_reduce(P.z);
1735	1		z = P.z[0] \| P.z[1] \| P.z[2] \| P.z[3];
1736	1		s = EQ((uint32_t)(z \| (z >> 32)), 0);
1737	1		p256_double(&Q);
1738
1739			/*
1740			* If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1741			* have the following:
1742			*
1743			* s = 0, t = 0 return P (normal addition)
1744			* s = 0, t = 1 return P (normal addition)
1745			* s = 1, t = 0 return Q (a 'double' case)
1746			* s = 1, t = 1 report an error (P+Q = 0)
1747			*/
1748	1		CCOPY(s & ~t, &P, &Q, sizeof Q);
1749	1		point_encode(A, &P);
1750	1		r &= ~(s & t);
1751	1		return r;
1752			}
1753
1754			/* see bearssl_ec.h */
1755			const br_ec_impl br_ec_p256_m64 = {
1756			(uint32_t)0x00800000,
1757			&api_generator,
1758			&api_order,
1759			&api_xoff,
1760			&api_mul,
1761			&api_mulgen,
1762			&api_muladd
1763			};
1764
1765			/* see bearssl_ec.h */
1766			const br_ec_impl *
1767	0		br_ec_p256_m64_get(void)
1768			{
1769	0		return &br_ec_p256_m64;
1770			}
1771
1772			#else
1773
1774			/* see bearssl_ec.h */
1775			const br_ec_impl *
1776			br_ec_p256_m64_get(void)
1777			{
1778			return 0;
1779			}
1780
1781			#endif