File Coverage

src/ec/ec_p256_m31.c
Criterion Covered Total %
statement 0 532 0.0
branch 0 68 0.0
condition n/a
subroutine n/a
pod n/a
total 0 600 0.0


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2017 Thomas Pornin
3             *
4             * Permission is hereby granted, free of charge, to any person obtaining
5             * a copy of this software and associated documentation files (the
6             * "Software"), to deal in the Software without restriction, including
7             * without limitation the rights to use, copy, modify, merge, publish,
8             * distribute, sublicense, and/or sell copies of the Software, and to
9             * permit persons to whom the Software is furnished to do so, subject to
10             * the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be
13             * included in all copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16             * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17             * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18             * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19             * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20             * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21             * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22             * SOFTWARE.
23             */
24              
25             #include "inner.h"
26              
27             /*
28             * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29             * that right-shifting a signed negative integer copies the sign bit
30             * (arithmetic right-shift). This is "implementation-defined behaviour",
31             * i.e. it is not undefined, but it may differ between compilers. Each
32             * compiler is supposed to document its behaviour in that respect. GCC
33             * explicitly defines that an arithmetic right shift is used. We expect
34             * all other compilers to do the same, because underlying CPU offer an
35             * arithmetic right shift opcode that could not be used otherwise.
36             */
37             #if BR_NO_ARITH_SHIFT
38             #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
39             | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40             #define ARSHW(x, n) (((uint64_t)(x) >> (n)) \
41             | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
42             #else
43             #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
44             #define ARSHW(x, n) ((*(int64_t *)&(x)) >> (n))
45             #endif
46              
47             /*
48             * Convert an integer from unsigned big-endian encoding to a sequence of
49             * 30-bit words in little-endian order. The final "partial" word is
50             * returned.
51             */
52             static uint32_t
53 0           be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
54             {
55             uint32_t acc;
56             int acc_len;
57              
58 0           acc = 0;
59 0           acc_len = 0;
60 0 0         while (len -- > 0) {
61             uint32_t b;
62              
63 0           b = src[len];
64 0 0         if (acc_len < 22) {
65 0           acc |= b << acc_len;
66 0           acc_len += 8;
67             } else {
68 0           *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
69 0           acc = b >> (30 - acc_len);
70 0           acc_len -= 22;
71             }
72             }
73 0           return acc;
74             }
75              
76             /*
77             * Convert an integer (30-bit words, little-endian) to unsigned
78             * big-endian encoding. The total encoding length is provided; all
79             * the destination bytes will be filled.
80             */
81             static void
82 0           le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
83             {
84             uint32_t acc;
85             int acc_len;
86              
87 0           acc = 0;
88 0           acc_len = 0;
89 0 0         while (len -- > 0) {
90 0 0         if (acc_len < 8) {
91             uint32_t w;
92              
93 0           w = *src ++;
94 0           dst[len] = (unsigned char)(acc | (w << acc_len));
95 0           acc = w >> (8 - acc_len);
96 0           acc_len += 22;
97             } else {
98 0           dst[len] = (unsigned char)acc;
99 0           acc >>= 8;
100 0           acc_len -= 8;
101             }
102             }
103 0           }
104              
105             /*
106             * Multiply two integers. Source integers are represented as arrays of
107             * nine 30-bit words, for values up to 2^270-1. Result is encoded over
108             * 18 words of 30 bits each.
109             */
110             static void
111 0           mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
112             {
113             /*
114             * Maximum intermediate result is no more than
115             * 10376293531797946367, which fits in 64 bits. Reason:
116             *
117             * 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
118             * 10376293531797946367 < 9663676407 * 2^30
119             *
120             * Thus, adding together 9 products of 30-bit integers, with
121             * a carry of at most 9663676406, yields an integer that fits
122             * on 64 bits and generates a carry of at most 9663676406.
123             */
124             uint64_t t[17];
125             uint64_t cc;
126             int i;
127              
128 0           t[ 0] = MUL31(a[0], b[0]);
129 0           t[ 1] = MUL31(a[0], b[1])
130 0           + MUL31(a[1], b[0]);
131 0           t[ 2] = MUL31(a[0], b[2])
132 0           + MUL31(a[1], b[1])
133 0           + MUL31(a[2], b[0]);
134 0           t[ 3] = MUL31(a[0], b[3])
135 0           + MUL31(a[1], b[2])
136 0           + MUL31(a[2], b[1])
137 0           + MUL31(a[3], b[0]);
138 0           t[ 4] = MUL31(a[0], b[4])
139 0           + MUL31(a[1], b[3])
140 0           + MUL31(a[2], b[2])
141 0           + MUL31(a[3], b[1])
142 0           + MUL31(a[4], b[0]);
143 0           t[ 5] = MUL31(a[0], b[5])
144 0           + MUL31(a[1], b[4])
145 0           + MUL31(a[2], b[3])
146 0           + MUL31(a[3], b[2])
147 0           + MUL31(a[4], b[1])
148 0           + MUL31(a[5], b[0]);
149 0           t[ 6] = MUL31(a[0], b[6])
150 0           + MUL31(a[1], b[5])
151 0           + MUL31(a[2], b[4])
152 0           + MUL31(a[3], b[3])
153 0           + MUL31(a[4], b[2])
154 0           + MUL31(a[5], b[1])
155 0           + MUL31(a[6], b[0]);
156 0           t[ 7] = MUL31(a[0], b[7])
157 0           + MUL31(a[1], b[6])
158 0           + MUL31(a[2], b[5])
159 0           + MUL31(a[3], b[4])
160 0           + MUL31(a[4], b[3])
161 0           + MUL31(a[5], b[2])
162 0           + MUL31(a[6], b[1])
163 0           + MUL31(a[7], b[0]);
164 0           t[ 8] = MUL31(a[0], b[8])
165 0           + MUL31(a[1], b[7])
166 0           + MUL31(a[2], b[6])
167 0           + MUL31(a[3], b[5])
168 0           + MUL31(a[4], b[4])
169 0           + MUL31(a[5], b[3])
170 0           + MUL31(a[6], b[2])
171 0           + MUL31(a[7], b[1])
172 0           + MUL31(a[8], b[0]);
173 0           t[ 9] = MUL31(a[1], b[8])
174 0           + MUL31(a[2], b[7])
175 0           + MUL31(a[3], b[6])
176 0           + MUL31(a[4], b[5])
177 0           + MUL31(a[5], b[4])
178 0           + MUL31(a[6], b[3])
179 0           + MUL31(a[7], b[2])
180 0           + MUL31(a[8], b[1]);
181 0           t[10] = MUL31(a[2], b[8])
182 0           + MUL31(a[3], b[7])
183 0           + MUL31(a[4], b[6])
184 0           + MUL31(a[5], b[5])
185 0           + MUL31(a[6], b[4])
186 0           + MUL31(a[7], b[3])
187 0           + MUL31(a[8], b[2]);
188 0           t[11] = MUL31(a[3], b[8])
189 0           + MUL31(a[4], b[7])
190 0           + MUL31(a[5], b[6])
191 0           + MUL31(a[6], b[5])
192 0           + MUL31(a[7], b[4])
193 0           + MUL31(a[8], b[3]);
194 0           t[12] = MUL31(a[4], b[8])
195 0           + MUL31(a[5], b[7])
196 0           + MUL31(a[6], b[6])
197 0           + MUL31(a[7], b[5])
198 0           + MUL31(a[8], b[4]);
199 0           t[13] = MUL31(a[5], b[8])
200 0           + MUL31(a[6], b[7])
201 0           + MUL31(a[7], b[6])
202 0           + MUL31(a[8], b[5]);
203 0           t[14] = MUL31(a[6], b[8])
204 0           + MUL31(a[7], b[7])
205 0           + MUL31(a[8], b[6]);
206 0           t[15] = MUL31(a[7], b[8])
207 0           + MUL31(a[8], b[7]);
208 0           t[16] = MUL31(a[8], b[8]);
209              
210             /*
211             * Propagate carries.
212             */
213 0           cc = 0;
214 0 0         for (i = 0; i < 17; i ++) {
215             uint64_t w;
216              
217 0           w = t[i] + cc;
218 0           d[i] = (uint32_t)w & 0x3FFFFFFF;
219 0           cc = w >> 30;
220             }
221 0           d[17] = (uint32_t)cc;
222 0           }
223              
224             /*
225             * Square a 270-bit integer, represented as an array of nine 30-bit words.
226             * Result uses 18 words of 30 bits each.
227             */
228             static void
229 0           square9(uint32_t *d, const uint32_t *a)
230             {
231             uint64_t t[17];
232             uint64_t cc;
233             int i;
234              
235 0           t[ 0] = MUL31(a[0], a[0]);
236 0           t[ 1] = ((MUL31(a[0], a[1])) << 1);
237 0           t[ 2] = MUL31(a[1], a[1])
238 0           + ((MUL31(a[0], a[2])) << 1);
239 0           t[ 3] = ((MUL31(a[0], a[3])
240 0           + MUL31(a[1], a[2])) << 1);
241 0           t[ 4] = MUL31(a[2], a[2])
242 0           + ((MUL31(a[0], a[4])
243 0           + MUL31(a[1], a[3])) << 1);
244 0           t[ 5] = ((MUL31(a[0], a[5])
245 0           + MUL31(a[1], a[4])
246 0           + MUL31(a[2], a[3])) << 1);
247 0           t[ 6] = MUL31(a[3], a[3])
248 0           + ((MUL31(a[0], a[6])
249 0           + MUL31(a[1], a[5])
250 0           + MUL31(a[2], a[4])) << 1);
251 0           t[ 7] = ((MUL31(a[0], a[7])
252 0           + MUL31(a[1], a[6])
253 0           + MUL31(a[2], a[5])
254 0           + MUL31(a[3], a[4])) << 1);
255 0           t[ 8] = MUL31(a[4], a[4])
256 0           + ((MUL31(a[0], a[8])
257 0           + MUL31(a[1], a[7])
258 0           + MUL31(a[2], a[6])
259 0           + MUL31(a[3], a[5])) << 1);
260 0           t[ 9] = ((MUL31(a[1], a[8])
261 0           + MUL31(a[2], a[7])
262 0           + MUL31(a[3], a[6])
263 0           + MUL31(a[4], a[5])) << 1);
264 0           t[10] = MUL31(a[5], a[5])
265 0           + ((MUL31(a[2], a[8])
266 0           + MUL31(a[3], a[7])
267 0           + MUL31(a[4], a[6])) << 1);
268 0           t[11] = ((MUL31(a[3], a[8])
269 0           + MUL31(a[4], a[7])
270 0           + MUL31(a[5], a[6])) << 1);
271 0           t[12] = MUL31(a[6], a[6])
272 0           + ((MUL31(a[4], a[8])
273 0           + MUL31(a[5], a[7])) << 1);
274 0           t[13] = ((MUL31(a[5], a[8])
275 0           + MUL31(a[6], a[7])) << 1);
276 0           t[14] = MUL31(a[7], a[7])
277 0           + ((MUL31(a[6], a[8])) << 1);
278 0           t[15] = ((MUL31(a[7], a[8])) << 1);
279 0           t[16] = MUL31(a[8], a[8]);
280              
281             /*
282             * Propagate carries.
283             */
284 0           cc = 0;
285 0 0         for (i = 0; i < 17; i ++) {
286             uint64_t w;
287              
288 0           w = t[i] + cc;
289 0           d[i] = (uint32_t)w & 0x3FFFFFFF;
290 0           cc = w >> 30;
291             }
292 0           d[17] = (uint32_t)cc;
293 0           }
294              
295             /*
296             * Base field modulus for P-256.
297             */
298             static const uint32_t F256[] = {
299              
300             0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
301             0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
302             };
303              
304             /*
305             * The 'b' curve equation coefficient for P-256.
306             */
307             static const uint32_t P256_B[] = {
308              
309             0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
310             0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
311             };
312              
313             /*
314             * Addition in the field. Source operands shall fit on 257 bits; output
315             * will be lower than twice the modulus.
316             */
317             static void
318 0           add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
319             {
320             uint32_t w, cc;
321             int i;
322              
323 0           cc = 0;
324 0 0         for (i = 0; i < 9; i ++) {
325 0           w = a[i] + b[i] + cc;
326 0           d[i] = w & 0x3FFFFFFF;
327 0           cc = w >> 30;
328             }
329 0           w >>= 16;
330 0           d[8] &= 0xFFFF;
331 0           d[3] -= w << 6;
332 0           d[6] -= w << 12;
333 0           d[7] += w << 14;
334 0           cc = w;
335 0 0         for (i = 0; i < 9; i ++) {
336 0           w = d[i] + cc;
337 0           d[i] = w & 0x3FFFFFFF;
338 0           cc = ARSH(w, 30);
339             }
340 0           }
341              
342             /*
343             * Subtraction in the field. Source operands shall be smaller than twice
344             * the modulus; the result will fulfil the same property.
345             */
346             static void
347 0           sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
348             {
349             uint32_t w, cc;
350             int i;
351              
352             /*
353             * We really compute a - b + 2*p to make sure that the result is
354             * positive.
355             */
356 0           w = a[0] - b[0] - 0x00002;
357 0           d[0] = w & 0x3FFFFFFF;
358 0           w = a[1] - b[1] + ARSH(w, 30);
359 0           d[1] = w & 0x3FFFFFFF;
360 0           w = a[2] - b[2] + ARSH(w, 30);
361 0           d[2] = w & 0x3FFFFFFF;
362 0           w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
363 0           d[3] = w & 0x3FFFFFFF;
364 0           w = a[4] - b[4] + ARSH(w, 30);
365 0           d[4] = w & 0x3FFFFFFF;
366 0           w = a[5] - b[5] + ARSH(w, 30);
367 0           d[5] = w & 0x3FFFFFFF;
368 0           w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
369 0           d[6] = w & 0x3FFFFFFF;
370 0           w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
371 0           d[7] = w & 0x3FFFFFFF;
372 0           w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
373 0           d[8] = w & 0xFFFF;
374 0           w >>= 16;
375 0           d[8] &= 0xFFFF;
376 0           d[3] -= w << 6;
377 0           d[6] -= w << 12;
378 0           d[7] += w << 14;
379 0           cc = w;
380 0 0         for (i = 0; i < 9; i ++) {
381 0           w = d[i] + cc;
382 0           d[i] = w & 0x3FFFFFFF;
383 0           cc = ARSH(w, 30);
384             }
385 0           }
386              
387             /*
388             * Compute a multiplication in F256. Source operands shall be less than
389             * twice the modulus.
390             */
391             static void
392 0           mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
393             {
394             uint32_t t[18];
395             uint64_t s[18];
396             uint64_t cc, x;
397             uint32_t z, c;
398             int i;
399              
400 0           mul9(t, a, b);
401              
402             /*
403             * Modular reduction: each high word in added/subtracted where
404             * necessary.
405             *
406             * The modulus is:
407             * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
408             * Therefore:
409             * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
410             *
411             * For a word x at bit offset n (n >= 256), we have:
412             * x*2^n = x*2^(n-32) - x*2^(n-64)
413             * - x*2^(n - 160) + x*2^(n-256) mod p
414             *
415             * Thus, we can nullify the high word if we reinject it at some
416             * proper emplacements.
417             *
418             * We use 64-bit intermediate words to allow for carries to
419             * accumulate easily, before performing the final propagation.
420             */
421 0 0         for (i = 0; i < 18; i ++) {
422 0           s[i] = t[i];
423             }
424              
425 0 0         for (i = 17; i >= 9; i --) {
426             uint64_t y;
427              
428 0           y = s[i];
429 0           s[i - 1] += ARSHW(y, 2);
430 0           s[i - 2] += (y << 28) & 0x3FFFFFFF;
431 0           s[i - 2] -= ARSHW(y, 4);
432 0           s[i - 3] -= (y << 26) & 0x3FFFFFFF;
433 0           s[i - 5] -= ARSHW(y, 10);
434 0           s[i - 6] -= (y << 20) & 0x3FFFFFFF;
435 0           s[i - 8] += ARSHW(y, 16);
436 0           s[i - 9] += (y << 14) & 0x3FFFFFFF;
437             }
438              
439             /*
440             * Carry propagation must be signed. Moreover, we may have overdone
441             * it a bit, and obtain a negative result.
442             *
443             * The loop above ran 9 times; each time, each word was augmented
444             * by at most one extra word (in absolute value). Thus, the top
445             * word must in fine fit in 39 bits, so the carry below will fit
446             * on 9 bits.
447             */
448 0           cc = 0;
449 0 0         for (i = 0; i < 9; i ++) {
450 0           x = s[i] + cc;
451 0           d[i] = (uint32_t)x & 0x3FFFFFFF;
452 0           cc = ARSHW(x, 30);
453             }
454              
455             /*
456             * All nine words fit on 30 bits, but there may be an extra
457             * carry for a few bits (at most 9), and that carry may be
458             * negative. Moreover, we want the result to fit on 257 bits.
459             * The two lines below ensure that the word in d[] has length
460             * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
461             * significant length of cc is less than 24 bits, so we will be
462             * able to switch to 32-bit operations.
463             */
464 0           cc = ARSHW(x, 16);
465 0           d[8] &= 0xFFFF;
466              
467             /*
468             * One extra round of reduction, for cc*2^256, which means
469             * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
470             * value. If cc is negative, then it may happen (rarely, but
471             * not neglectibly so) that the result would be negative. In
472             * order to avoid that, if cc is negative, then we add the
473             * modulus once. Note that if cc is negative, then propagating
474             * that carry must yield a value lower than the modulus, so
475             * adding the modulus once will keep the final result under
476             * twice the modulus.
477             */
478 0           z = (uint32_t)cc;
479 0           d[3] -= z << 6;
480 0           d[6] -= (z << 12) & 0x3FFFFFFF;
481 0           d[7] -= ARSH(z, 18);
482 0           d[7] += (z << 14) & 0x3FFFFFFF;
483 0           d[8] += ARSH(z, 16);
484 0           c = z >> 31;
485 0           d[0] -= c;
486 0           d[3] += c << 6;
487 0           d[6] += c << 12;
488 0           d[7] -= c << 14;
489 0           d[8] += c << 16;
490 0 0         for (i = 0; i < 9; i ++) {
491             uint32_t w;
492              
493 0           w = d[i] + z;
494 0           d[i] = w & 0x3FFFFFFF;
495 0           z = ARSH(w, 30);
496             }
497 0           }
498              
499             /*
500             * Compute a square in F256. Source operand shall be less than
501             * twice the modulus.
502             */
503             static void
504 0           square_f256(uint32_t *d, const uint32_t *a)
505             {
506             uint32_t t[18];
507             uint64_t s[18];
508             uint64_t cc, x;
509             uint32_t z, c;
510             int i;
511              
512 0           square9(t, a);
513              
514             /*
515             * Modular reduction: each high word in added/subtracted where
516             * necessary.
517             *
518             * The modulus is:
519             * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
520             * Therefore:
521             * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
522             *
523             * For a word x at bit offset n (n >= 256), we have:
524             * x*2^n = x*2^(n-32) - x*2^(n-64)
525             * - x*2^(n - 160) + x*2^(n-256) mod p
526             *
527             * Thus, we can nullify the high word if we reinject it at some
528             * proper emplacements.
529             *
530             * We use 64-bit intermediate words to allow for carries to
531             * accumulate easily, before performing the final propagation.
532             */
533 0 0         for (i = 0; i < 18; i ++) {
534 0           s[i] = t[i];
535             }
536              
537 0 0         for (i = 17; i >= 9; i --) {
538             uint64_t y;
539              
540 0           y = s[i];
541 0           s[i - 1] += ARSHW(y, 2);
542 0           s[i - 2] += (y << 28) & 0x3FFFFFFF;
543 0           s[i - 2] -= ARSHW(y, 4);
544 0           s[i - 3] -= (y << 26) & 0x3FFFFFFF;
545 0           s[i - 5] -= ARSHW(y, 10);
546 0           s[i - 6] -= (y << 20) & 0x3FFFFFFF;
547 0           s[i - 8] += ARSHW(y, 16);
548 0           s[i - 9] += (y << 14) & 0x3FFFFFFF;
549             }
550              
551             /*
552             * Carry propagation must be signed. Moreover, we may have overdone
553             * it a bit, and obtain a negative result.
554             *
555             * The loop above ran 9 times; each time, each word was augmented
556             * by at most one extra word (in absolute value). Thus, the top
557             * word must in fine fit in 39 bits, so the carry below will fit
558             * on 9 bits.
559             */
560 0           cc = 0;
561 0 0         for (i = 0; i < 9; i ++) {
562 0           x = s[i] + cc;
563 0           d[i] = (uint32_t)x & 0x3FFFFFFF;
564 0           cc = ARSHW(x, 30);
565             }
566              
567             /*
568             * All nine words fit on 30 bits, but there may be an extra
569             * carry for a few bits (at most 9), and that carry may be
570             * negative. Moreover, we want the result to fit on 257 bits.
571             * The two lines below ensure that the word in d[] has length
572             * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
573             * significant length of cc is less than 24 bits, so we will be
574             * able to switch to 32-bit operations.
575             */
576 0           cc = ARSHW(x, 16);
577 0           d[8] &= 0xFFFF;
578              
579             /*
580             * One extra round of reduction, for cc*2^256, which means
581             * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
582             * value. If cc is negative, then it may happen (rarely, but
583             * not neglectibly so) that the result would be negative. In
584             * order to avoid that, if cc is negative, then we add the
585             * modulus once. Note that if cc is negative, then propagating
586             * that carry must yield a value lower than the modulus, so
587             * adding the modulus once will keep the final result under
588             * twice the modulus.
589             */
590 0           z = (uint32_t)cc;
591 0           d[3] -= z << 6;
592 0           d[6] -= (z << 12) & 0x3FFFFFFF;
593 0           d[7] -= ARSH(z, 18);
594 0           d[7] += (z << 14) & 0x3FFFFFFF;
595 0           d[8] += ARSH(z, 16);
596 0           c = z >> 31;
597 0           d[0] -= c;
598 0           d[3] += c << 6;
599 0           d[6] += c << 12;
600 0           d[7] -= c << 14;
601 0           d[8] += c << 16;
602 0 0         for (i = 0; i < 9; i ++) {
603             uint32_t w;
604              
605 0           w = d[i] + z;
606 0           d[i] = w & 0x3FFFFFFF;
607 0           z = ARSH(w, 30);
608             }
609 0           }
610              
611             /*
612             * Perform a "final reduction" in field F256 (field for curve P-256).
613             * The source value must be less than twice the modulus. If the value
614             * is not lower than the modulus, then the modulus is subtracted and
615             * this function returns 1; otherwise, it leaves it untouched and it
616             * returns 0.
617             */
618             static uint32_t
619 0           reduce_final_f256(uint32_t *d)
620             {
621             uint32_t t[9];
622             uint32_t cc;
623             int i;
624              
625 0           cc = 0;
626 0 0         for (i = 0; i < 9; i ++) {
627             uint32_t w;
628              
629 0           w = d[i] - F256[i] - cc;
630 0           cc = w >> 31;
631 0           t[i] = w & 0x3FFFFFFF;
632             }
633 0           cc ^= 1;
634 0           CCOPY(cc, d, t, sizeof t);
635 0           return cc;
636             }
637              
638             /*
639             * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
640             * are such that:
641             * X = x / z^2
642             * Y = y / z^3
643             * For the point at infinity, z = 0.
644             * Each point thus admits many possible representations.
645             *
646             * Coordinates are represented in arrays of 32-bit integers, each holding
647             * 30 bits of data. Values may also be slightly greater than the modulus,
648             * but they will always be lower than twice the modulus.
649             */
650             typedef struct {
651             uint32_t x[9];
652             uint32_t y[9];
653             uint32_t z[9];
654             } p256_jacobian;
655              
656             /*
657             * Convert a point to affine coordinates:
658             * - If the point is the point at infinity, then all three coordinates
659             * are set to 0.
660             * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
661             * coordinates are the 'X' and 'Y' affine coordinates.
662             * The coordinates are guaranteed to be lower than the modulus.
663             */
664             static void
665 0           p256_to_affine(p256_jacobian *P)
666             {
667             uint32_t t1[9], t2[9];
668             int i;
669              
670             /*
671             * Invert z with a modular exponentiation: the modulus is
672             * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
673             * p-2. Exponent bit pattern (from high to low) is:
674             * - 32 bits of value 1
675             * - 31 bits of value 0
676             * - 1 bit of value 1
677             * - 96 bits of value 0
678             * - 94 bits of value 1
679             * - 1 bit of value 0
680             * - 1 bit of value 1
681             * Thus, we precompute z^(2^31-1) to speed things up.
682             *
683             * If z = 0 (point at infinity) then the modular exponentiation
684             * will yield 0, which leads to the expected result (all three
685             * coordinates set to 0).
686             */
687              
688             /*
689             * A simple square-and-multiply for z^(2^31-1). We could save about
690             * two dozen multiplications here with an addition chain, but
691             * this would require a bit more code, and extra stack buffers.
692             */
693 0           memcpy(t1, P->z, sizeof P->z);
694 0 0         for (i = 0; i < 30; i ++) {
695 0           square_f256(t1, t1);
696 0           mul_f256(t1, t1, P->z);
697             }
698              
699             /*
700             * Square-and-multiply. Apart from the squarings, we have a few
701             * multiplications to set bits to 1; we multiply by the original z
702             * for setting 1 bit, and by t1 for setting 31 bits.
703             */
704 0           memcpy(t2, P->z, sizeof P->z);
705 0 0         for (i = 1; i < 256; i ++) {
706 0           square_f256(t2, t2);
707 0           switch (i) {
708 0           case 31:
709             case 190:
710             case 221:
711             case 252:
712 0           mul_f256(t2, t2, t1);
713 0           break;
714 0           case 63:
715             case 253:
716             case 255:
717 0           mul_f256(t2, t2, P->z);
718 0           break;
719             }
720             }
721              
722             /*
723             * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
724             */
725 0           mul_f256(t1, t2, t2);
726 0           mul_f256(P->x, t1, P->x);
727 0           mul_f256(t1, t1, t2);
728 0           mul_f256(P->y, t1, P->y);
729 0           reduce_final_f256(P->x);
730 0           reduce_final_f256(P->y);
731              
732             /*
733             * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
734             * this will set z to 1.
735             */
736 0           mul_f256(P->z, P->z, t2);
737 0           reduce_final_f256(P->z);
738 0           }
739              
740             /*
741             * Double a point in P-256. This function works for all valid points,
742             * including the point at infinity.
743             */
744             static void
745 0           p256_double(p256_jacobian *Q)
746             {
747             /*
748             * Doubling formulas are:
749             *
750             * s = 4*x*y^2
751             * m = 3*(x + z^2)*(x - z^2)
752             * x' = m^2 - 2*s
753             * y' = m*(s - x') - 8*y^4
754             * z' = 2*y*z
755             *
756             * These formulas work for all points, including points of order 2
757             * and points at infinity:
758             * - If y = 0 then z' = 0. But there is no such point in P-256
759             * anyway.
760             * - If z = 0 then z' = 0.
761             */
762             uint32_t t1[9], t2[9], t3[9], t4[9];
763              
764             /*
765             * Compute z^2 in t1.
766             */
767 0           square_f256(t1, Q->z);
768              
769             /*
770             * Compute x-z^2 in t2 and x+z^2 in t1.
771             */
772 0           add_f256(t2, Q->x, t1);
773 0           sub_f256(t1, Q->x, t1);
774              
775             /*
776             * Compute 3*(x+z^2)*(x-z^2) in t1.
777             */
778 0           mul_f256(t3, t1, t2);
779 0           add_f256(t1, t3, t3);
780 0           add_f256(t1, t3, t1);
781              
782             /*
783             * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
784             */
785 0           square_f256(t3, Q->y);
786 0           add_f256(t3, t3, t3);
787 0           mul_f256(t2, Q->x, t3);
788 0           add_f256(t2, t2, t2);
789              
790             /*
791             * Compute x' = m^2 - 2*s.
792             */
793 0           square_f256(Q->x, t1);
794 0           sub_f256(Q->x, Q->x, t2);
795 0           sub_f256(Q->x, Q->x, t2);
796              
797             /*
798             * Compute z' = 2*y*z.
799             */
800 0           mul_f256(t4, Q->y, Q->z);
801 0           add_f256(Q->z, t4, t4);
802              
803             /*
804             * Compute y' = m*(s - x') - 8*y^4. Note that we already have
805             * 2*y^2 in t3.
806             */
807 0           sub_f256(t2, t2, Q->x);
808 0           mul_f256(Q->y, t1, t2);
809 0           square_f256(t4, t3);
810 0           add_f256(t4, t4, t4);
811 0           sub_f256(Q->y, Q->y, t4);
812 0           }
813              
814             /*
815             * Add point P2 to point P1.
816             *
817             * This function computes the wrong result in the following cases:
818             *
819             * - If P1 == 0 but P2 != 0
820             * - If P1 != 0 but P2 == 0
821             * - If P1 == P2
822             *
823             * In all three cases, P1 is set to the point at infinity.
824             *
825             * Returned value is 0 if one of the following occurs:
826             *
827             * - P1 and P2 have the same Y coordinate
828             * - P1 == 0 and P2 == 0
829             * - The Y coordinate of one of the points is 0 and the other point is
830             * the point at infinity.
831             *
832             * The third case cannot actually happen with valid points, since a point
833             * with Y == 0 is a point of order 2, and there is no point of order 2 on
834             * curve P-256.
835             *
836             * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
837             * can apply the following:
838             *
839             * - If the result is not the point at infinity, then it is correct.
840             * - Otherwise, if the returned value is 1, then this is a case of
841             * P1+P2 == 0, so the result is indeed the point at infinity.
842             * - Otherwise, P1 == P2, so a "double" operation should have been
843             * performed.
844             */
845             static uint32_t
846 0           p256_add(p256_jacobian *P1, const p256_jacobian *P2)
847             {
848             /*
849             * Addtions formulas are:
850             *
851             * u1 = x1 * z2^2
852             * u2 = x2 * z1^2
853             * s1 = y1 * z2^3
854             * s2 = y2 * z1^3
855             * h = u2 - u1
856             * r = s2 - s1
857             * x3 = r^2 - h^3 - 2 * u1 * h^2
858             * y3 = r * (u1 * h^2 - x3) - s1 * h^3
859             * z3 = h * z1 * z2
860             */
861             uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
862             uint32_t ret;
863             int i;
864              
865             /*
866             * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
867             */
868 0           square_f256(t3, P2->z);
869 0           mul_f256(t1, P1->x, t3);
870 0           mul_f256(t4, P2->z, t3);
871 0           mul_f256(t3, P1->y, t4);
872              
873             /*
874             * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
875             */
876 0           square_f256(t4, P1->z);
877 0           mul_f256(t2, P2->x, t4);
878 0           mul_f256(t5, P1->z, t4);
879 0           mul_f256(t4, P2->y, t5);
880              
881             /*
882             * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
883             * We need to test whether r is zero, so we will do some extra
884             * reduce.
885             */
886 0           sub_f256(t2, t2, t1);
887 0           sub_f256(t4, t4, t3);
888 0           reduce_final_f256(t4);
889 0           ret = 0;
890 0 0         for (i = 0; i < 9; i ++) {
891 0           ret |= t4[i];
892             }
893 0           ret = (ret | -ret) >> 31;
894              
895             /*
896             * Compute u1*h^2 (in t6) and h^3 (in t5);
897             */
898 0           square_f256(t7, t2);
899 0           mul_f256(t6, t1, t7);
900 0           mul_f256(t5, t7, t2);
901              
902             /*
903             * Compute x3 = r^2 - h^3 - 2*u1*h^2.
904             */
905 0           square_f256(P1->x, t4);
906 0           sub_f256(P1->x, P1->x, t5);
907 0           sub_f256(P1->x, P1->x, t6);
908 0           sub_f256(P1->x, P1->x, t6);
909              
910             /*
911             * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
912             */
913 0           sub_f256(t6, t6, P1->x);
914 0           mul_f256(P1->y, t4, t6);
915 0           mul_f256(t1, t5, t3);
916 0           sub_f256(P1->y, P1->y, t1);
917              
918             /*
919             * Compute z3 = h*z1*z2.
920             */
921 0           mul_f256(t1, P1->z, P2->z);
922 0           mul_f256(P1->z, t1, t2);
923              
924 0           return ret;
925             }
926              
927             /*
928             * Add point P2 to point P1. This is a specialised function for the
929             * case when P2 is a non-zero point in affine coordinate.
930             *
931             * This function computes the wrong result in the following cases:
932             *
933             * - If P1 == 0
934             * - If P1 == P2
935             *
936             * In both cases, P1 is set to the point at infinity.
937             *
938             * Returned value is 0 if one of the following occurs:
939             *
940             * - P1 and P2 have the same Y coordinate
941             * - The Y coordinate of P2 is 0 and P1 is the point at infinity.
942             *
943             * The second case cannot actually happen with valid points, since a point
944             * with Y == 0 is a point of order 2, and there is no point of order 2 on
945             * curve P-256.
946             *
947             * Therefore, assuming that P1 != 0 on input, then the caller
948             * can apply the following:
949             *
950             * - If the result is not the point at infinity, then it is correct.
951             * - Otherwise, if the returned value is 1, then this is a case of
952             * P1+P2 == 0, so the result is indeed the point at infinity.
953             * - Otherwise, P1 == P2, so a "double" operation should have been
954             * performed.
955             */
956             static uint32_t
957 0           p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
958             {
959             /*
960             * Addtions formulas are:
961             *
962             * u1 = x1
963             * u2 = x2 * z1^2
964             * s1 = y1
965             * s2 = y2 * z1^3
966             * h = u2 - u1
967             * r = s2 - s1
968             * x3 = r^2 - h^3 - 2 * u1 * h^2
969             * y3 = r * (u1 * h^2 - x3) - s1 * h^3
970             * z3 = h * z1
971             */
972             uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
973             uint32_t ret;
974             int i;
975              
976             /*
977             * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
978             */
979 0           memcpy(t1, P1->x, sizeof t1);
980 0           memcpy(t3, P1->y, sizeof t3);
981              
982             /*
983             * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
984             */
985 0           square_f256(t4, P1->z);
986 0           mul_f256(t2, P2->x, t4);
987 0           mul_f256(t5, P1->z, t4);
988 0           mul_f256(t4, P2->y, t5);
989              
990             /*
991             * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
992             * We need to test whether r is zero, so we will do some extra
993             * reduce.
994             */
995 0           sub_f256(t2, t2, t1);
996 0           sub_f256(t4, t4, t3);
997 0           reduce_final_f256(t4);
998 0           ret = 0;
999 0 0         for (i = 0; i < 9; i ++) {
1000 0           ret |= t4[i];
1001             }
1002 0           ret = (ret | -ret) >> 31;
1003              
1004             /*
1005             * Compute u1*h^2 (in t6) and h^3 (in t5);
1006             */
1007 0           square_f256(t7, t2);
1008 0           mul_f256(t6, t1, t7);
1009 0           mul_f256(t5, t7, t2);
1010              
1011             /*
1012             * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1013             */
1014 0           square_f256(P1->x, t4);
1015 0           sub_f256(P1->x, P1->x, t5);
1016 0           sub_f256(P1->x, P1->x, t6);
1017 0           sub_f256(P1->x, P1->x, t6);
1018              
1019             /*
1020             * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1021             */
1022 0           sub_f256(t6, t6, P1->x);
1023 0           mul_f256(P1->y, t4, t6);
1024 0           mul_f256(t1, t5, t3);
1025 0           sub_f256(P1->y, P1->y, t1);
1026              
1027             /*
1028             * Compute z3 = h*z1*z2.
1029             */
1030 0           mul_f256(P1->z, P1->z, t2);
1031              
1032 0           return ret;
1033             }
1034              
1035             /*
1036             * Decode a P-256 point. This function does not support the point at
1037             * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1038             */
1039             static uint32_t
1040 0           p256_decode(p256_jacobian *P, const void *src, size_t len)
1041             {
1042             const unsigned char *buf;
1043             uint32_t tx[9], ty[9], t1[9], t2[9];
1044             uint32_t bad;
1045             int i;
1046              
1047 0 0         if (len != 65) {
1048 0           return 0;
1049             }
1050 0           buf = src;
1051              
1052             /*
1053             * First byte must be 0x04 (uncompressed format). We could support
1054             * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1055             * least significant bit of the Y coordinate), but it is explicitly
1056             * forbidden by RFC 5480 (section 2.2).
1057             */
1058 0           bad = NEQ(buf[0], 0x04);
1059              
1060             /*
1061             * Decode the coordinates, and check that they are both lower
1062             * than the modulus.
1063             */
1064 0           tx[8] = be8_to_le30(tx, buf + 1, 32);
1065 0           ty[8] = be8_to_le30(ty, buf + 33, 32);
1066 0           bad |= reduce_final_f256(tx);
1067 0           bad |= reduce_final_f256(ty);
1068              
1069             /*
1070             * Check curve equation.
1071             */
1072 0           square_f256(t1, tx);
1073 0           mul_f256(t1, tx, t1);
1074 0           square_f256(t2, ty);
1075 0           sub_f256(t1, t1, tx);
1076 0           sub_f256(t1, t1, tx);
1077 0           sub_f256(t1, t1, tx);
1078 0           add_f256(t1, t1, P256_B);
1079 0           sub_f256(t1, t1, t2);
1080 0           reduce_final_f256(t1);
1081 0 0         for (i = 0; i < 9; i ++) {
1082 0           bad |= t1[i];
1083             }
1084              
1085             /*
1086             * Copy coordinates to the point structure.
1087             */
1088 0           memcpy(P->x, tx, sizeof tx);
1089 0           memcpy(P->y, ty, sizeof ty);
1090 0           memset(P->z, 0, sizeof P->z);
1091 0           P->z[0] = 1;
1092 0           return EQ(bad, 0);
1093             }
1094              
1095             /*
1096             * Encode a point into a buffer. This function assumes that the point is
1097             * valid, in affine coordinates, and not the point at infinity.
1098             */
1099             static void
1100 0           p256_encode(void *dst, const p256_jacobian *P)
1101             {
1102             unsigned char *buf;
1103              
1104 0           buf = dst;
1105 0           buf[0] = 0x04;
1106 0           le30_to_be8(buf + 1, 32, P->x);
1107 0           le30_to_be8(buf + 33, 32, P->y);
1108 0           }
1109              
1110             /*
1111             * Multiply a curve point by an integer. The integer is assumed to be
1112             * lower than the curve order, and the base point must not be the point
1113             * at infinity.
1114             */
1115             static void
1116 0           p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1117             {
1118             /*
1119             * qz is a flag that is initially 1, and remains equal to 1
1120             * as long as the point is the point at infinity.
1121             *
1122             * We use a 2-bit window to handle multiplier bits by pairs.
1123             * The precomputed window really is the points P2 and P3.
1124             */
1125             uint32_t qz;
1126             p256_jacobian P2, P3, Q, T, U;
1127              
1128             /*
1129             * Compute window values.
1130             */
1131 0           P2 = *P;
1132 0           p256_double(&P2);
1133 0           P3 = *P;
1134 0           p256_add(&P3, &P2);
1135              
1136             /*
1137             * We start with Q = 0. We process multiplier bits 2 by 2.
1138             */
1139 0           memset(&Q, 0, sizeof Q);
1140 0           qz = 1;
1141 0 0         while (xlen -- > 0) {
1142             int k;
1143              
1144 0 0         for (k = 6; k >= 0; k -= 2) {
1145             uint32_t bits;
1146             uint32_t bnz;
1147              
1148 0           p256_double(&Q);
1149 0           p256_double(&Q);
1150 0           T = *P;
1151 0           U = Q;
1152 0           bits = (*x >> k) & (uint32_t)3;
1153 0           bnz = NEQ(bits, 0);
1154 0           CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1155 0           CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1156 0           p256_add(&U, &T);
1157 0           CCOPY(bnz & qz, &Q, &T, sizeof Q);
1158 0           CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1159 0           qz &= ~bnz;
1160             }
1161 0           x ++;
1162             }
1163 0           *P = Q;
1164 0           }
1165              
1166             /*
1167             * Precomputed window: k*G points, where G is the curve generator, and k
1168             * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1169             * the point are encoded as 9 words of 30 bits each (little-endian
1170             * order).
1171             */
1172             static const uint32_t Gwin[15][18] = {
1173              
1174             { 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
1175             0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
1176             0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
1177             0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
1178             0x10B8BF86, 0x00004FE3 },
1179              
1180             { 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
1181             0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
1182             0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
1183             0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
1184             0x154436E3, 0x00000777 },
1185              
1186             { 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
1187             0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
1188             0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
1189             0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
1190             0x19031266, 0x00008734 },
1191              
1192             { 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
1193             0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
1194             0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
1195             0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
1196             0x15D69318, 0x0000E0F1 },
1197              
1198             { 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
1199             0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
1200             0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
1201             0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
1202             0x1F6A2412, 0x0000E0C1 },
1203              
1204             { 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
1205             0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
1206             0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
1207             0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
1208             0x041D0C8D, 0x0000E85C },
1209              
1210             { 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
1211             0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
1212             0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
1213             0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
1214             0x076F780C, 0x000073EB },
1215              
1216             { 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
1217             0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
1218             0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
1219             0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
1220             0x332F647A, 0x0000AD5A },
1221              
1222             { 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
1223             0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
1224             0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
1225             0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
1226             0x11325CB2, 0x00002A27 },
1227              
1228             { 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
1229             0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
1230             0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
1231             0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
1232             0x18A88A6A, 0x00008786 },
1233              
1234             { 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
1235             0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
1236             0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
1237             0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
1238             0x0826B331, 0x00009099 },
1239              
1240             { 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
1241             0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
1242             0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
1243             0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
1244             0x2D1AA70E, 0x00000770 },
1245              
1246             { 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
1247             0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
1248             0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
1249             0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
1250             0x163353AF, 0x000063BB },
1251              
1252             { 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
1253             0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
1254             0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
1255             0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
1256             0x3C6ECA7D, 0x0000F599 },
1257              
1258             { 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
1259             0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
1260             0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
1261             0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
1262             0x0FB8D64B, 0x0000B5B9 }
1263             };
1264              
1265             /*
1266             * Lookup one of the Gwin[] values, by index. This is constant-time.
1267             */
1268             static void
1269 0           lookup_Gwin(p256_jacobian *T, uint32_t idx)
1270             {
1271             uint32_t xy[18];
1272             uint32_t k;
1273             size_t u;
1274              
1275 0           memset(xy, 0, sizeof xy);
1276 0 0         for (k = 0; k < 15; k ++) {
1277             uint32_t m;
1278              
1279 0           m = -EQ(idx, k + 1);
1280 0 0         for (u = 0; u < 18; u ++) {
1281 0           xy[u] |= m & Gwin[k][u];
1282             }
1283             }
1284 0           memcpy(T->x, &xy[0], sizeof T->x);
1285 0           memcpy(T->y, &xy[9], sizeof T->y);
1286 0           memset(T->z, 0, sizeof T->z);
1287 0           T->z[0] = 1;
1288 0           }
1289              
1290             /*
1291             * Multiply the generator by an integer. The integer is assumed non-zero
1292             * and lower than the curve order.
1293             */
1294             static void
1295 0           p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1296             {
1297             /*
1298             * qz is a flag that is initially 1, and remains equal to 1
1299             * as long as the point is the point at infinity.
1300             *
1301             * We use a 4-bit window to handle multiplier bits by groups
1302             * of 4. The precomputed window is constant static data, with
1303             * points in affine coordinates; we use a constant-time lookup.
1304             */
1305             p256_jacobian Q;
1306             uint32_t qz;
1307              
1308 0           memset(&Q, 0, sizeof Q);
1309 0           qz = 1;
1310 0 0         while (xlen -- > 0) {
1311             int k;
1312             unsigned bx;
1313              
1314 0           bx = *x ++;
1315 0 0         for (k = 0; k < 2; k ++) {
1316             uint32_t bits;
1317             uint32_t bnz;
1318             p256_jacobian T, U;
1319              
1320 0           p256_double(&Q);
1321 0           p256_double(&Q);
1322 0           p256_double(&Q);
1323 0           p256_double(&Q);
1324 0           bits = (bx >> 4) & 0x0F;
1325 0           bnz = NEQ(bits, 0);
1326 0           lookup_Gwin(&T, bits);
1327 0           U = Q;
1328 0           p256_add_mixed(&U, &T);
1329 0           CCOPY(bnz & qz, &Q, &T, sizeof Q);
1330 0           CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1331 0           qz &= ~bnz;
1332 0           bx <<= 4;
1333             }
1334             }
1335 0           *P = Q;
1336 0           }
1337              
1338             static const unsigned char P256_G[] = {
1339             0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1340             0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1341             0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1342             0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1343             0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1344             0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1345             0x68, 0x37, 0xBF, 0x51, 0xF5
1346             };
1347              
1348             static const unsigned char P256_N[] = {
1349             0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1350             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1351             0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1352             0x25, 0x51
1353             };
1354              
1355             static const unsigned char *
1356 0           api_generator(int curve, size_t *len)
1357             {
1358             (void)curve;
1359 0           *len = sizeof P256_G;
1360 0           return P256_G;
1361             }
1362              
1363             static const unsigned char *
1364 0           api_order(int curve, size_t *len)
1365             {
1366             (void)curve;
1367 0           *len = sizeof P256_N;
1368 0           return P256_N;
1369             }
1370              
1371             static size_t
1372 0           api_xoff(int curve, size_t *len)
1373             {
1374             (void)curve;
1375 0           *len = 32;
1376 0           return 1;
1377             }
1378              
1379             static uint32_t
1380 0           api_mul(unsigned char *G, size_t Glen,
1381             const unsigned char *x, size_t xlen, int curve)
1382             {
1383             uint32_t r;
1384             p256_jacobian P;
1385              
1386             (void)curve;
1387 0 0         if (Glen != 65) {
1388 0           return 0;
1389             }
1390 0           r = p256_decode(&P, G, Glen);
1391 0           p256_mul(&P, x, xlen);
1392 0           p256_to_affine(&P);
1393 0           p256_encode(G, &P);
1394 0           return r;
1395             }
1396              
1397             static size_t
1398 0           api_mulgen(unsigned char *R,
1399             const unsigned char *x, size_t xlen, int curve)
1400             {
1401             p256_jacobian P;
1402              
1403             (void)curve;
1404 0           p256_mulgen(&P, x, xlen);
1405 0           p256_to_affine(&P);
1406 0           p256_encode(R, &P);
1407 0           return 65;
1408             }
1409              
1410             static uint32_t
1411 0           api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1412             const unsigned char *x, size_t xlen,
1413             const unsigned char *y, size_t ylen, int curve)
1414             {
1415             p256_jacobian P, Q;
1416             uint32_t r, t, z;
1417             int i;
1418              
1419             (void)curve;
1420 0 0         if (len != 65) {
1421 0           return 0;
1422             }
1423 0           r = p256_decode(&P, A, len);
1424 0           p256_mul(&P, x, xlen);
1425 0 0         if (B == NULL) {
1426 0           p256_mulgen(&Q, y, ylen);
1427             } else {
1428 0           r &= p256_decode(&Q, B, len);
1429 0           p256_mul(&Q, y, ylen);
1430             }
1431              
1432             /*
1433             * The final addition may fail in case both points are equal.
1434             */
1435 0           t = p256_add(&P, &Q);
1436 0           reduce_final_f256(P.z);
1437 0           z = 0;
1438 0 0         for (i = 0; i < 9; i ++) {
1439 0           z |= P.z[i];
1440             }
1441 0           z = EQ(z, 0);
1442 0           p256_double(&Q);
1443              
1444             /*
1445             * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1446             * have the following:
1447             *
1448             * z = 0, t = 0 return P (normal addition)
1449             * z = 0, t = 1 return P (normal addition)
1450             * z = 1, t = 0 return Q (a 'double' case)
1451             * z = 1, t = 1 report an error (P+Q = 0)
1452             */
1453 0           CCOPY(z & ~t, &P, &Q, sizeof Q);
1454 0           p256_to_affine(&P);
1455 0           p256_encode(A, &P);
1456 0           r &= ~(z & t);
1457 0           return r;
1458             }
1459              
1460             /* see bearssl_ec.h */
1461             const br_ec_impl br_ec_p256_m31 = {
1462             (uint32_t)0x00800000,
1463             &api_generator,
1464             &api_order,
1465             &api_xoff,
1466             &api_mul,
1467             &api_mulgen,
1468             &api_muladd
1469             };