File Coverage

src/int/i15_moddiv.c
Criterion Covered Total %
statement 0 146 0.0
branch 0 22 0.0
condition n/a
subroutine n/a
pod n/a
total 0 168 0.0


line stmt bran cond sub pod time code
1             /*
2             * Copyright (c) 2018 Thomas Pornin
3             *
4             * Permission is hereby granted, free of charge, to any person obtaining
5             * a copy of this software and associated documentation files (the
6             * "Software"), to deal in the Software without restriction, including
7             * without limitation the rights to use, copy, modify, merge, publish,
8             * distribute, sublicense, and/or sell copies of the Software, and to
9             * permit persons to whom the Software is furnished to do so, subject to
10             * the following conditions:
11             *
12             * The above copyright notice and this permission notice shall be
13             * included in all copies or substantial portions of the Software.
14             *
15             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16             * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17             * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18             * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19             * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20             * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21             * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22             * SOFTWARE.
23             */
24              
25             #include "inner.h"
26              
27             /*
28             * In this file, we handle big integers with a custom format, i.e.
29             * without the usual one-word header. Value is split into 15-bit words,
30             * each stored in a 16-bit slot (top bit is zero) in little-endian
31             * order. The length (in words) is provided explicitly. In some cases,
32             * the value can be negative (using two's complement representation). In
33             * some cases, the top word is allowed to have a 16th bit.
34             */
35              
36             /*
37             * Negate big integer conditionally. The value consists of 'len' words,
38             * with 15 bits in each word (the top bit of each word should be 0,
39             * except possibly for the last word). If 'ctl' is 1, the negation is
40             * computed; otherwise, if 'ctl' is 0, then the value is unchanged.
41             */
42             static void
43 0           cond_negate(uint16_t *a, size_t len, uint32_t ctl)
44             {
45             size_t k;
46             uint32_t cc, xm;
47              
48 0           cc = ctl;
49 0           xm = 0x7FFF & -ctl;
50 0 0         for (k = 0; k < len; k ++) {
51             uint32_t aw;
52              
53 0           aw = a[k];
54 0           aw = (aw ^ xm) + cc;
55 0           a[k] = aw & 0x7FFF;
56 0           cc = (aw >> 15) & 1;
57             }
58 0           }
59              
60             /*
61             * Finish modular reduction. Rules on input parameters:
62             *
63             * if neg = 1, then -m <= a < 0
64             * if neg = 0, then 0 <= a < 2*m
65             *
66             * If neg = 0, then the top word of a[] may use 16 bits.
67             *
68             * Also, modulus m must be odd.
69             */
70             static void
71 0           finish_mod(uint16_t *a, size_t len, const uint16_t *m, uint32_t neg)
72             {
73             size_t k;
74             uint32_t cc, xm, ym;
75              
76             /*
77             * First pass: compare a (assumed nonnegative) with m.
78             */
79 0           cc = 0;
80 0 0         for (k = 0; k < len; k ++) {
81             uint32_t aw, mw;
82              
83 0           aw = a[k];
84 0           mw = m[k];
85 0           cc = (aw - mw - cc) >> 31;
86             }
87              
88             /*
89             * At this point:
90             * if neg = 1, then we must add m (regardless of cc)
91             * if neg = 0 and cc = 0, then we must subtract m
92             * if neg = 0 and cc = 1, then we must do nothing
93             */
94 0           xm = 0x7FFF & -neg;
95 0           ym = -(neg | (1 - cc));
96 0           cc = neg;
97 0 0         for (k = 0; k < len; k ++) {
98             uint32_t aw, mw;
99              
100 0           aw = a[k];
101 0           mw = (m[k] ^ xm) & ym;
102 0           aw = aw - mw - cc;
103 0           a[k] = aw & 0x7FFF;
104 0           cc = aw >> 31;
105             }
106 0           }
107              
108             /*
109             * Compute:
110             * a <- (a*pa+b*pb)/(2^15)
111             * b <- (a*qa+b*qb)/(2^15)
112             * The division is assumed to be exact (i.e. the low word is dropped).
113             * If the final a is negative, then it is negated. Similarly for b.
114             * Returned value is the combination of two bits:
115             * bit 0: 1 if a had to be negated, 0 otherwise
116             * bit 1: 1 if b had to be negated, 0 otherwise
117             *
118             * Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
119             * Source integers a and b must be nonnegative; top word is not allowed
120             * to contain an extra 16th bit.
121             */
122             static uint32_t
123 0           co_reduce(uint16_t *a, uint16_t *b, size_t len,
124             int32_t pa, int32_t pb, int32_t qa, int32_t qb)
125             {
126             size_t k;
127             int32_t cca, ccb;
128             uint32_t nega, negb;
129              
130 0           cca = 0;
131 0           ccb = 0;
132 0 0         for (k = 0; k < len; k ++) {
133             uint32_t wa, wb, za, zb;
134             uint16_t tta, ttb;
135              
136             /*
137             * Since:
138             * |pa| <= 2^15
139             * |pb| <= 2^15
140             * 0 <= wa <= 2^15 - 1
141             * 0 <= wb <= 2^15 - 1
142             * |cca| <= 2^16 - 1
143             * Then:
144             * |za| <= (2^15-1)*(2^16) + (2^16-1) = 2^31 - 1
145             *
146             * Thus, the new value of cca is such that |cca| <= 2^16 - 1.
147             * The same applies to ccb.
148             */
149 0           wa = a[k];
150 0           wb = b[k];
151 0           za = wa * (uint32_t)pa + wb * (uint32_t)pb + (uint32_t)cca;
152 0           zb = wa * (uint32_t)qa + wb * (uint32_t)qb + (uint32_t)ccb;
153 0 0         if (k > 0) {
154 0           a[k - 1] = za & 0x7FFF;
155 0           b[k - 1] = zb & 0x7FFF;
156             }
157 0           tta = za >> 15;
158 0           ttb = zb >> 15;
159 0           cca = *(int16_t *)&tta;
160 0           ccb = *(int16_t *)&ttb;
161             }
162 0           a[len - 1] = (uint16_t)cca;
163 0           b[len - 1] = (uint16_t)ccb;
164 0           nega = (uint32_t)cca >> 31;
165 0           negb = (uint32_t)ccb >> 31;
166 0           cond_negate(a, len, nega);
167 0           cond_negate(b, len, negb);
168 0           return nega | (negb << 1);
169             }
170              
171             /*
172             * Compute:
173             * a <- (a*pa+b*pb)/(2^15) mod m
174             * b <- (a*qa+b*qb)/(2^15) mod m
175             *
176             * m0i is equal to -1/m[0] mod 2^15.
177             *
178             * Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
179             * Source integers a and b must be nonnegative; top word is not allowed
180             * to contain an extra 16th bit.
181             */
182             static void
183 0           co_reduce_mod(uint16_t *a, uint16_t *b, size_t len,
184             int32_t pa, int32_t pb, int32_t qa, int32_t qb,
185             const uint16_t *m, uint16_t m0i)
186             {
187             size_t k;
188             int32_t cca, ccb, fa, fb;
189              
190 0           cca = 0;
191 0           ccb = 0;
192 0           fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFF;
193 0           fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFF;
194 0 0         for (k = 0; k < len; k ++) {
195             uint32_t wa, wb, za, zb;
196             uint32_t tta, ttb;
197              
198             /*
199             * In this loop, carries 'cca' and 'ccb' always fit on
200             * 17 bits (in absolute value).
201             */
202 0           wa = a[k];
203 0           wb = b[k];
204 0           za = wa * (uint32_t)pa + wb * (uint32_t)pb
205 0           + m[k] * (uint32_t)fa + (uint32_t)cca;
206 0           zb = wa * (uint32_t)qa + wb * (uint32_t)qb
207 0           + m[k] * (uint32_t)fb + (uint32_t)ccb;
208 0 0         if (k > 0) {
209 0           a[k - 1] = za & 0x7FFF;
210 0           b[k - 1] = zb & 0x7FFF;
211             }
212              
213             /*
214             * The XOR-and-sub construction below does an arithmetic
215             * right shift in a portable way (technically, right-shifting
216             * a negative signed value is implementation-defined in C).
217             */
218             #define M ((uint32_t)1 << 16)
219 0           tta = za >> 15;
220 0           ttb = zb >> 15;
221 0           tta = (tta ^ M) - M;
222 0           ttb = (ttb ^ M) - M;
223 0           cca = *(int32_t *)&tta;
224 0           ccb = *(int32_t *)&ttb;
225             #undef M
226             }
227 0           a[len - 1] = (uint32_t)cca;
228 0           b[len - 1] = (uint32_t)ccb;
229              
230             /*
231             * At this point:
232             * -m <= a < 2*m
233             * -m <= b < 2*m
234             * (this is a case of Montgomery reduction)
235             * The top word of 'a' and 'b' may have a 16-th bit set.
236             * We may have to add or subtract the modulus.
237             */
238 0           finish_mod(a, len, m, (uint32_t)cca >> 31);
239 0           finish_mod(b, len, m, (uint32_t)ccb >> 31);
240 0           }
241              
242             /* see inner.h */
243             uint32_t
244 0           br_i15_moddiv(uint16_t *x, const uint16_t *y, const uint16_t *m, uint16_t m0i,
245             uint16_t *t)
246             {
247             /*
248             * Algorithm is an extended binary GCD. We maintain four values
249             * a, b, u and v, with the following invariants:
250             *
251             * a * x = y * u mod m
252             * b * x = y * v mod m
253             *
254             * Starting values are:
255             *
256             * a = y
257             * b = m
258             * u = x
259             * v = 0
260             *
261             * The formal definition of the algorithm is a sequence of steps:
262             *
263             * - If a is even, then a <- a/2 and u <- u/2 mod m.
264             * - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
265             * - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
266             * - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
267             *
268             * Algorithm stops when a = b. At that point, they both are equal
269             * to GCD(y,m); the modular division succeeds if that value is 1.
270             * The result of the modular division is then u (or v: both are
271             * equal at that point).
272             *
273             * Each step makes either a or b shrink by at least one bit; hence,
274             * if m has bit length k bits, then 2k-2 steps are sufficient.
275             *
276             *
277             * Though complexity is quadratic in the size of m, the bit-by-bit
278             * processing is not very efficient. We can speed up processing by
279             * remarking that the decisions are taken based only on observation
280             * of the top and low bits of a and b.
281             *
282             * In the loop below, at each iteration, we use the two top words
283             * of a and b, and the low words of a and b, to compute reduction
284             * parameters pa, pb, qa and qb such that the new values for a
285             * and b are:
286             *
287             * a' = (a*pa + b*pb) / (2^15)
288             * b' = (a*qa + b*qb) / (2^15)
289             *
290             * the division being exact.
291             *
292             * Since the choices are based on the top words, they may be slightly
293             * off, requiring an optional correction: if a' < 0, then we replace
294             * pa with -pa, and pb with -pb. The total length of a and b is
295             * thus reduced by at least 14 bits at each iteration.
296             *
297             * The stopping conditions are still the same, though: when a
298             * and b become equal, they must be both odd (since m is odd,
299             * the GCD cannot be even), therefore the next operation is a
300             * subtraction, and one of the values becomes 0. At that point,
301             * nothing else happens, i.e. one value is stuck at 0, and the
302             * other one is the GCD.
303             */
304             size_t len, k;
305             uint16_t *a, *b, *u, *v;
306             uint32_t num, r;
307              
308 0           len = (m[0] + 15) >> 4;
309 0           a = t;
310 0           b = a + len;
311 0           u = x + 1;
312 0           v = b + len;
313 0           memcpy(a, y + 1, len * sizeof *y);
314 0           memcpy(b, m + 1, len * sizeof *m);
315 0           memset(v, 0, len * sizeof *v);
316              
317             /*
318             * Loop below ensures that a and b are reduced by some bits each,
319             * for a total of at least 14 bits.
320             */
321 0 0         for (num = ((m[0] - (m[0] >> 4)) << 1) + 14; num >= 14; num -= 14) {
322             size_t j;
323             uint32_t c0, c1;
324             uint32_t a0, a1, b0, b1;
325             uint32_t a_hi, b_hi, a_lo, b_lo;
326             int32_t pa, pb, qa, qb;
327             int i;
328              
329             /*
330             * Extract top words of a and b. If j is the highest
331             * index >= 1 such that a[j] != 0 or b[j] != 0, then we want
332             * (a[j] << 15) + a[j - 1], and (b[j] << 15) + b[j - 1].
333             * If a and b are down to one word each, then we use a[0]
334             * and b[0].
335             */
336 0           c0 = (uint32_t)-1;
337 0           c1 = (uint32_t)-1;
338 0           a0 = 0;
339 0           a1 = 0;
340 0           b0 = 0;
341 0           b1 = 0;
342 0           j = len;
343 0 0         while (j -- > 0) {
344             uint32_t aw, bw;
345              
346 0           aw = a[j];
347 0           bw = b[j];
348 0           a0 ^= (a0 ^ aw) & c0;
349 0           a1 ^= (a1 ^ aw) & c1;
350 0           b0 ^= (b0 ^ bw) & c0;
351 0           b1 ^= (b1 ^ bw) & c1;
352 0           c1 = c0;
353 0           c0 &= (((aw | bw) + 0xFFFF) >> 16) - (uint32_t)1;
354             }
355              
356             /*
357             * If c1 = 0, then we grabbed two words for a and b.
358             * If c1 != 0 but c0 = 0, then we grabbed one word. It
359             * is not possible that c1 != 0 and c0 != 0, because that
360             * would mean that both integers are zero.
361             */
362 0           a1 |= a0 & c1;
363 0           a0 &= ~c1;
364 0           b1 |= b0 & c1;
365 0           b0 &= ~c1;
366 0           a_hi = (a0 << 15) + a1;
367 0           b_hi = (b0 << 15) + b1;
368 0           a_lo = a[0];
369 0           b_lo = b[0];
370              
371             /*
372             * Compute reduction factors:
373             *
374             * a' = a*pa + b*pb
375             * b' = a*qa + b*qb
376             *
377             * such that a' and b' are both multiple of 2^15, but are
378             * only marginally larger than a and b.
379             */
380 0           pa = 1;
381 0           pb = 0;
382 0           qa = 0;
383 0           qb = 1;
384 0 0         for (i = 0; i < 15; i ++) {
385             /*
386             * At each iteration:
387             *
388             * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
389             * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
390             * a <- a/2 if: a is even
391             * b <- b/2 if: a is odd, b is even
392             *
393             * We multiply a_lo and b_lo by 2 at each
394             * iteration, thus a division by 2 really is a
395             * non-multiplication by 2.
396             */
397             uint32_t r, oa, ob, cAB, cBA, cA;
398              
399             /*
400             * cAB = 1 if b must be subtracted from a
401             * cBA = 1 if a must be subtracted from b
402             * cA = 1 if a is divided by 2, 0 otherwise
403             *
404             * Rules:
405             *
406             * cAB and cBA cannot be both 1.
407             * if a is not divided by 2, b is.
408             */
409 0           r = GT(a_hi, b_hi);
410 0           oa = (a_lo >> i) & 1;
411 0           ob = (b_lo >> i) & 1;
412 0           cAB = oa & ob & r;
413 0           cBA = oa & ob & NOT(r);
414 0           cA = cAB | NOT(oa);
415              
416             /*
417             * Conditional subtractions.
418             */
419 0           a_lo -= b_lo & -cAB;
420 0           a_hi -= b_hi & -cAB;
421 0           pa -= qa & -(int32_t)cAB;
422 0           pb -= qb & -(int32_t)cAB;
423 0           b_lo -= a_lo & -cBA;
424 0           b_hi -= a_hi & -cBA;
425 0           qa -= pa & -(int32_t)cBA;
426 0           qb -= pb & -(int32_t)cBA;
427              
428             /*
429             * Shifting.
430             */
431 0           a_lo += a_lo & (cA - 1);
432 0           pa += pa & ((int32_t)cA - 1);
433 0           pb += pb & ((int32_t)cA - 1);
434 0           a_hi ^= (a_hi ^ (a_hi >> 1)) & -cA;
435 0           b_lo += b_lo & -cA;
436 0           qa += qa & -(int32_t)cA;
437 0           qb += qb & -(int32_t)cA;
438 0           b_hi ^= (b_hi ^ (b_hi >> 1)) & (cA - 1);
439             }
440              
441             /*
442             * Replace a and b with new values a' and b'.
443             */
444 0           r = co_reduce(a, b, len, pa, pb, qa, qb);
445 0           pa -= pa * ((r & 1) << 1);
446 0           pb -= pb * ((r & 1) << 1);
447 0           qa -= qa * (r & 2);
448 0           qb -= qb * (r & 2);
449 0           co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);
450             }
451              
452             /*
453             * Now one of the arrays should be 0, and the other contains
454             * the GCD. If a is 0, then u is 0 as well, and v contains
455             * the division result.
456             * Result is correct if and only if GCD is 1.
457             */
458 0           r = (a[0] | b[0]) ^ 1;
459 0           u[0] |= v[0];
460 0 0         for (k = 1; k < len; k ++) {
461 0           r |= a[k] | b[k];
462 0           u[k] |= v[k];
463             }
464 0           return EQ0(r);
465             }