File Coverage

src/int/i15_moddiv.c

Criterion	Covered	Total	%
statement	0	146	0.0
branch	0	22	0.0
condition			n/a
subroutine			n/a
pod			n/a
total	0	168	0.0

line	stmt	bran	code
1			/*
2			* Copyright (c) 2018 Thomas Pornin
3			*
4			* Permission is hereby granted, free of charge, to any person obtaining
5			* a copy of this software and associated documentation files (the
6			* "Software"), to deal in the Software without restriction, including
7			* without limitation the rights to use, copy, modify, merge, publish,
8			* distribute, sublicense, and/or sell copies of the Software, and to
9			* permit persons to whom the Software is furnished to do so, subject to
10			* the following conditions:
11			*
12			* The above copyright notice and this permission notice shall be
13			* included in all copies or substantial portions of the Software.
14			*
15			* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16			* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17			* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18			* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19			* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20			* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21			* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22			* SOFTWARE.
23			*/
24
25			#include "inner.h"
26
27			/*
28			* In this file, we handle big integers with a custom format, i.e.
29			* without the usual one-word header. Value is split into 15-bit words,
30			* each stored in a 16-bit slot (top bit is zero) in little-endian
31			* order. The length (in words) is provided explicitly. In some cases,
32			* the value can be negative (using two's complement representation). In
33			* some cases, the top word is allowed to have a 16th bit.
34			*/
35
36			/*
37			* Negate big integer conditionally. The value consists of 'len' words,
38			* with 15 bits in each word (the top bit of each word should be 0,
39			* except possibly for the last word). If 'ctl' is 1, the negation is
40			* computed; otherwise, if 'ctl' is 0, then the value is unchanged.
41			*/
42			static void
43	0		cond_negate(uint16_t *a, size_t len, uint32_t ctl)
44			{
45			size_t k;
46			uint32_t cc, xm;
47
48	0		cc = ctl;
49	0		xm = 0x7FFF & -ctl;
50	0	0	for (k = 0; k < len; k ++) {
51			uint32_t aw;
52
53	0		aw = a[k];
54	0		aw = (aw ^ xm) + cc;
55	0		a[k] = aw & 0x7FFF;
56	0		cc = (aw >> 15) & 1;
57			}
58	0		}
59
60			/*
61			* Finish modular reduction. Rules on input parameters:
62			*
63			* if neg = 1, then -m <= a < 0
64			* if neg = 0, then 0 <= a < 2*m
65			*
66			* If neg = 0, then the top word of a[] may use 16 bits.
67			*
68			* Also, modulus m must be odd.
69			*/
70			static void
71	0		finish_mod(uint16_t a, size_t len, const uint16_t m, uint32_t neg)
72			{
73			size_t k;
74			uint32_t cc, xm, ym;
75
76			/*
77			* First pass: compare a (assumed nonnegative) with m.
78			*/
79	0		cc = 0;
80	0	0	for (k = 0; k < len; k ++) {
81			uint32_t aw, mw;
82
83	0		aw = a[k];
84	0		mw = m[k];
85	0		cc = (aw - mw - cc) >> 31;
86			}
87
88			/*
89			* At this point:
90			* if neg = 1, then we must add m (regardless of cc)
91			* if neg = 0 and cc = 0, then we must subtract m
92			* if neg = 0 and cc = 1, then we must do nothing
93			*/
94	0		xm = 0x7FFF & -neg;
95	0		ym = -(neg \| (1 - cc));
96	0		cc = neg;
97	0	0	for (k = 0; k < len; k ++) {
98			uint32_t aw, mw;
99
100	0		aw = a[k];
101	0		mw = (m[k] ^ xm) & ym;
102	0		aw = aw - mw - cc;
103	0		a[k] = aw & 0x7FFF;
104	0		cc = aw >> 31;
105			}
106	0		}
107
108			/*
109			* Compute:
110			* a <- (apa+bpb)/(2^15)
111			* b <- (aqa+bqb)/(2^15)
112			* The division is assumed to be exact (i.e. the low word is dropped).
113			* If the final a is negative, then it is negated. Similarly for b.
114			* Returned value is the combination of two bits:
115			* bit 0: 1 if a had to be negated, 0 otherwise
116			* bit 1: 1 if b had to be negated, 0 otherwise
117			*
118			* Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
119			* Source integers a and b must be nonnegative; top word is not allowed
120			* to contain an extra 16th bit.
121			*/
122			static uint32_t
123	0		co_reduce(uint16_t a, uint16_t b, size_t len,
124			int32_t pa, int32_t pb, int32_t qa, int32_t qb)
125			{
126			size_t k;
127			int32_t cca, ccb;
128			uint32_t nega, negb;
129
130	0		cca = 0;
131	0		ccb = 0;
132	0	0	for (k = 0; k < len; k ++) {
133			uint32_t wa, wb, za, zb;
134			uint16_t tta, ttb;
135
136			/*
137			* Since:
138			* \|pa\| <= 2^15
139			* \|pb\| <= 2^15
140			* 0 <= wa <= 2^15 - 1
141			* 0 <= wb <= 2^15 - 1
142			* \|cca\| <= 2^16 - 1
143			* Then:
144			* \|za\| <= (2^15-1)*(2^16) + (2^16-1) = 2^31 - 1
145			*
146			* Thus, the new value of cca is such that \|cca\| <= 2^16 - 1.
147			* The same applies to ccb.
148			*/
149	0		wa = a[k];
150	0		wb = b[k];
151	0		za = wa * (uint32_t)pa + wb * (uint32_t)pb + (uint32_t)cca;
152	0		zb = wa * (uint32_t)qa + wb * (uint32_t)qb + (uint32_t)ccb;
153	0	0	if (k > 0) {
154	0		a[k - 1] = za & 0x7FFF;
155	0		b[k - 1] = zb & 0x7FFF;
156			}
157	0		tta = za >> 15;
158	0		ttb = zb >> 15;
159	0		cca = (int16_t )&tta;
160	0		ccb = (int16_t )&ttb;
161			}
162	0		a[len - 1] = (uint16_t)cca;
163	0		b[len - 1] = (uint16_t)ccb;
164	0		nega = (uint32_t)cca >> 31;
165	0		negb = (uint32_t)ccb >> 31;
166	0		cond_negate(a, len, nega);
167	0		cond_negate(b, len, negb);
168	0		return nega \| (negb << 1);
169			}
170
171			/*
172			* Compute:
173			* a <- (apa+bpb)/(2^15) mod m
174			* b <- (aqa+bqb)/(2^15) mod m
175			*
176			* m0i is equal to -1/m[0] mod 2^15.
177			*
178			* Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
179			* Source integers a and b must be nonnegative; top word is not allowed
180			* to contain an extra 16th bit.
181			*/
182			static void
183	0		co_reduce_mod(uint16_t a, uint16_t b, size_t len,
184			int32_t pa, int32_t pb, int32_t qa, int32_t qb,
185			const uint16_t *m, uint16_t m0i)
186			{
187			size_t k;
188			int32_t cca, ccb, fa, fb;
189
190	0		cca = 0;
191	0		ccb = 0;
192	0		fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFF;
193	0		fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFF;
194	0	0	for (k = 0; k < len; k ++) {
195			uint32_t wa, wb, za, zb;
196			uint32_t tta, ttb;
197
198			/*
199			* In this loop, carries 'cca' and 'ccb' always fit on
200			* 17 bits (in absolute value).
201			*/
202	0		wa = a[k];
203	0		wb = b[k];
204	0		za = wa * (uint32_t)pa + wb * (uint32_t)pb
205	0		+ m[k] * (uint32_t)fa + (uint32_t)cca;
206	0		zb = wa * (uint32_t)qa + wb * (uint32_t)qb
207	0		+ m[k] * (uint32_t)fb + (uint32_t)ccb;
208	0	0	if (k > 0) {
209	0		a[k - 1] = za & 0x7FFF;
210	0		b[k - 1] = zb & 0x7FFF;
211			}
212
213			/*
214			* The XOR-and-sub construction below does an arithmetic
215			* right shift in a portable way (technically, right-shifting
216			* a negative signed value is implementation-defined in C).
217			*/
218			#define M ((uint32_t)1 << 16)
219	0		tta = za >> 15;
220	0		ttb = zb >> 15;
221	0		tta = (tta ^ M) - M;
222	0		ttb = (ttb ^ M) - M;
223	0		cca = (int32_t )&tta;
224	0		ccb = (int32_t )&ttb;
225			#undef M
226			}
227	0		a[len - 1] = (uint32_t)cca;
228	0		b[len - 1] = (uint32_t)ccb;
229
230			/*
231			* At this point:
232			* -m <= a < 2*m
233			* -m <= b < 2*m
234			* (this is a case of Montgomery reduction)
235			* The top word of 'a' and 'b' may have a 16-th bit set.
236			* We may have to add or subtract the modulus.
237			*/
238	0		finish_mod(a, len, m, (uint32_t)cca >> 31);
239	0		finish_mod(b, len, m, (uint32_t)ccb >> 31);
240	0		}
241
242			/* see inner.h */
243			uint32_t
244	0		br_i15_moddiv(uint16_t x, const uint16_t y, const uint16_t *m, uint16_t m0i,
245			uint16_t *t)
246			{
247			/*
248			* Algorithm is an extended binary GCD. We maintain four values
249			* a, b, u and v, with the following invariants:
250			*
251			* a * x = y * u mod m
252			* b * x = y * v mod m
253			*
254			* Starting values are:
255			*
256			* a = y
257			* b = m
258			* u = x
259			* v = 0
260			*
261			* The formal definition of the algorithm is a sequence of steps:
262			*
263			* - If a is even, then a <- a/2 and u <- u/2 mod m.
264			* - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
265			* - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
266			* - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
267			*
268			* Algorithm stops when a = b. At that point, they both are equal
269			* to GCD(y,m); the modular division succeeds if that value is 1.
270			* The result of the modular division is then u (or v: both are
271			* equal at that point).
272			*
273			* Each step makes either a or b shrink by at least one bit; hence,
274			* if m has bit length k bits, then 2k-2 steps are sufficient.
275			*
276			*
277			* Though complexity is quadratic in the size of m, the bit-by-bit
278			* processing is not very efficient. We can speed up processing by
279			* remarking that the decisions are taken based only on observation
280			* of the top and low bits of a and b.
281			*
282			* In the loop below, at each iteration, we use the two top words
283			* of a and b, and the low words of a and b, to compute reduction
284			* parameters pa, pb, qa and qb such that the new values for a
285			* and b are:
286			*
287			* a' = (apa + bpb) / (2^15)
288			* b' = (aqa + bqb) / (2^15)
289			*
290			* the division being exact.
291			*
292			* Since the choices are based on the top words, they may be slightly
293			* off, requiring an optional correction: if a' < 0, then we replace
294			* pa with -pa, and pb with -pb. The total length of a and b is
295			* thus reduced by at least 14 bits at each iteration.
296			*
297			* The stopping conditions are still the same, though: when a
298			* and b become equal, they must be both odd (since m is odd,
299			* the GCD cannot be even), therefore the next operation is a
300			* subtraction, and one of the values becomes 0. At that point,
301			* nothing else happens, i.e. one value is stuck at 0, and the
302			* other one is the GCD.
303			*/
304			size_t len, k;
305			uint16_t a, b, u, v;
306			uint32_t num, r;
307
308	0		len = (m[0] + 15) >> 4;
309	0		a = t;
310	0		b = a + len;
311	0		u = x + 1;
312	0		v = b + len;
313	0		memcpy(a, y + 1, len * sizeof *y);
314	0		memcpy(b, m + 1, len * sizeof *m);
315	0		memset(v, 0, len * sizeof *v);
316
317			/*
318			* Loop below ensures that a and b are reduced by some bits each,
319			* for a total of at least 14 bits.
320			*/
321	0	0	for (num = ((m[0] - (m[0] >> 4)) << 1) + 14; num >= 14; num -= 14) {
322			size_t j;
323			uint32_t c0, c1;
324			uint32_t a0, a1, b0, b1;
325			uint32_t a_hi, b_hi, a_lo, b_lo;
326			int32_t pa, pb, qa, qb;
327			int i;
328
329			/*
330			* Extract top words of a and b. If j is the highest
331			* index >= 1 such that a[j] != 0 or b[j] != 0, then we want
332			* (a[j] << 15) + a[j - 1], and (b[j] << 15) + b[j - 1].
333			* If a and b are down to one word each, then we use a[0]
334			* and b[0].
335			*/
336	0		c0 = (uint32_t)-1;
337	0		c1 = (uint32_t)-1;
338	0		a0 = 0;
339	0		a1 = 0;
340	0		b0 = 0;
341	0		b1 = 0;
342	0		j = len;
343	0	0	while (j -- > 0) {
344			uint32_t aw, bw;
345
346	0		aw = a[j];
347	0		bw = b[j];
348	0		a0 ^= (a0 ^ aw) & c0;
349	0		a1 ^= (a1 ^ aw) & c1;
350	0		b0 ^= (b0 ^ bw) & c0;
351	0		b1 ^= (b1 ^ bw) & c1;
352	0		c1 = c0;
353	0		c0 &= (((aw \| bw) + 0xFFFF) >> 16) - (uint32_t)1;
354			}
355
356			/*
357			* If c1 = 0, then we grabbed two words for a and b.
358			* If c1 != 0 but c0 = 0, then we grabbed one word. It
359			* is not possible that c1 != 0 and c0 != 0, because that
360			* would mean that both integers are zero.
361			*/
362	0		a1 \|= a0 & c1;
363	0		a0 &= ~c1;
364	0		b1 \|= b0 & c1;
365	0		b0 &= ~c1;
366	0		a_hi = (a0 << 15) + a1;
367	0		b_hi = (b0 << 15) + b1;
368	0		a_lo = a[0];
369	0		b_lo = b[0];
370
371			/*
372			* Compute reduction factors:
373			*
374			* a' = apa + bpb
375			* b' = aqa + bqb
376			*
377			* such that a' and b' are both multiple of 2^15, but are
378			* only marginally larger than a and b.
379			*/
380	0		pa = 1;
381	0		pb = 0;
382	0		qa = 0;
383	0		qb = 1;
384	0	0	for (i = 0; i < 15; i ++) {
385			/*
386			* At each iteration:
387			*
388			* a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
389			* b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
390			* a <- a/2 if: a is even
391			* b <- b/2 if: a is odd, b is even
392			*
393			* We multiply a_lo and b_lo by 2 at each
394			* iteration, thus a division by 2 really is a
395			* non-multiplication by 2.
396			*/
397			uint32_t r, oa, ob, cAB, cBA, cA;
398
399			/*
400			* cAB = 1 if b must be subtracted from a
401			* cBA = 1 if a must be subtracted from b
402			* cA = 1 if a is divided by 2, 0 otherwise
403			*
404			* Rules:
405			*
406			* cAB and cBA cannot be both 1.
407			* if a is not divided by 2, b is.
408			*/
409	0		r = GT(a_hi, b_hi);
410	0		oa = (a_lo >> i) & 1;
411	0		ob = (b_lo >> i) & 1;
412	0		cAB = oa & ob & r;
413	0		cBA = oa & ob & NOT(r);
414	0		cA = cAB \| NOT(oa);
415
416			/*
417			* Conditional subtractions.
418			*/
419	0		a_lo -= b_lo & -cAB;
420	0		a_hi -= b_hi & -cAB;
421	0		pa -= qa & -(int32_t)cAB;
422	0		pb -= qb & -(int32_t)cAB;
423	0		b_lo -= a_lo & -cBA;
424	0		b_hi -= a_hi & -cBA;
425	0		qa -= pa & -(int32_t)cBA;
426	0		qb -= pb & -(int32_t)cBA;
427
428			/*
429			* Shifting.
430			*/
431	0		a_lo += a_lo & (cA - 1);
432	0		pa += pa & ((int32_t)cA - 1);
433	0		pb += pb & ((int32_t)cA - 1);
434	0		a_hi ^= (a_hi ^ (a_hi >> 1)) & -cA;
435	0		b_lo += b_lo & -cA;
436	0		qa += qa & -(int32_t)cA;
437	0		qb += qb & -(int32_t)cA;
438	0		b_hi ^= (b_hi ^ (b_hi >> 1)) & (cA - 1);
439			}
440
441			/*
442			* Replace a and b with new values a' and b'.
443			*/
444	0		r = co_reduce(a, b, len, pa, pb, qa, qb);
445	0		pa -= pa * ((r & 1) << 1);
446	0		pb -= pb * ((r & 1) << 1);
447	0		qa -= qa * (r & 2);
448	0		qb -= qb * (r & 2);
449	0		co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);
450			}
451
452			/*
453			* Now one of the arrays should be 0, and the other contains
454			* the GCD. If a is 0, then u is 0 as well, and v contains
455			* the division result.
456			* Result is correct if and only if GCD is 1.
457			*/
458	0		r = (a[0] \| b[0]) ^ 1;
459	0		u[0] \|= v[0];
460	0	0	for (k = 1; k < len; k ++) {
461	0		r \|= a[k] \| b[k];
462	0		u[k] \|= v[k];
463			}
464	0		return EQ0(r);
465			}