| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* |
|
2
|
|
|
|
|
|
|
* Copyright (c) 2018 Thomas Pornin |
|
3
|
|
|
|
|
|
|
* |
|
4
|
|
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining |
|
5
|
|
|
|
|
|
|
* a copy of this software and associated documentation files (the |
|
6
|
|
|
|
|
|
|
* "Software"), to deal in the Software without restriction, including |
|
7
|
|
|
|
|
|
|
* without limitation the rights to use, copy, modify, merge, publish, |
|
8
|
|
|
|
|
|
|
* distribute, sublicense, and/or sell copies of the Software, and to |
|
9
|
|
|
|
|
|
|
* permit persons to whom the Software is furnished to do so, subject to |
|
10
|
|
|
|
|
|
|
* the following conditions: |
|
11
|
|
|
|
|
|
|
* |
|
12
|
|
|
|
|
|
|
* The above copyright notice and this permission notice shall be |
|
13
|
|
|
|
|
|
|
* included in all copies or substantial portions of the Software. |
|
14
|
|
|
|
|
|
|
* |
|
15
|
|
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
16
|
|
|
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|
17
|
|
|
|
|
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
|
18
|
|
|
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
|
19
|
|
|
|
|
|
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
|
20
|
|
|
|
|
|
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
|
21
|
|
|
|
|
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|
22
|
|
|
|
|
|
|
* SOFTWARE. |
|
23
|
|
|
|
|
|
|
*/ |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
#include "inner.h" |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
/* |
|
28
|
|
|
|
|
|
|
* In this file, we handle big integers with a custom format, i.e. |
|
29
|
|
|
|
|
|
|
* without the usual one-word header. Value is split into 15-bit words, |
|
30
|
|
|
|
|
|
|
* each stored in a 16-bit slot (top bit is zero) in little-endian |
|
31
|
|
|
|
|
|
|
* order. The length (in words) is provided explicitly. In some cases, |
|
32
|
|
|
|
|
|
|
* the value can be negative (using two's complement representation). In |
|
33
|
|
|
|
|
|
|
* some cases, the top word is allowed to have a 16th bit. |
|
34
|
|
|
|
|
|
|
*/ |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
/* |
|
37
|
|
|
|
|
|
|
* Negate big integer conditionally. The value consists of 'len' words, |
|
38
|
|
|
|
|
|
|
* with 15 bits in each word (the top bit of each word should be 0, |
|
39
|
|
|
|
|
|
|
* except possibly for the last word). If 'ctl' is 1, the negation is |
|
40
|
|
|
|
|
|
|
* computed; otherwise, if 'ctl' is 0, then the value is unchanged. |
|
41
|
|
|
|
|
|
|
*/ |
|
42
|
|
|
|
|
|
|
static void |
|
43
|
0
|
|
|
|
|
|
cond_negate(uint16_t *a, size_t len, uint32_t ctl) |
|
44
|
|
|
|
|
|
|
{ |
|
45
|
|
|
|
|
|
|
size_t k; |
|
46
|
|
|
|
|
|
|
uint32_t cc, xm; |
|
47
|
|
|
|
|
|
|
|
|
48
|
0
|
|
|
|
|
|
cc = ctl; |
|
49
|
0
|
|
|
|
|
|
xm = 0x7FFF & -ctl; |
|
50
|
0
|
0
|
|
|
|
|
for (k = 0; k < len; k ++) { |
|
51
|
|
|
|
|
|
|
uint32_t aw; |
|
52
|
|
|
|
|
|
|
|
|
53
|
0
|
|
|
|
|
|
aw = a[k]; |
|
54
|
0
|
|
|
|
|
|
aw = (aw ^ xm) + cc; |
|
55
|
0
|
|
|
|
|
|
a[k] = aw & 0x7FFF; |
|
56
|
0
|
|
|
|
|
|
cc = (aw >> 15) & 1; |
|
57
|
|
|
|
|
|
|
} |
|
58
|
0
|
|
|
|
|
|
} |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
/* |
|
61
|
|
|
|
|
|
|
* Finish modular reduction. Rules on input parameters: |
|
62
|
|
|
|
|
|
|
* |
|
63
|
|
|
|
|
|
|
* if neg = 1, then -m <= a < 0 |
|
64
|
|
|
|
|
|
|
* if neg = 0, then 0 <= a < 2*m |
|
65
|
|
|
|
|
|
|
* |
|
66
|
|
|
|
|
|
|
* If neg = 0, then the top word of a[] may use 16 bits. |
|
67
|
|
|
|
|
|
|
* |
|
68
|
|
|
|
|
|
|
* Also, modulus m must be odd. |
|
69
|
|
|
|
|
|
|
*/ |
|
70
|
|
|
|
|
|
|
static void |
|
71
|
0
|
|
|
|
|
|
finish_mod(uint16_t *a, size_t len, const uint16_t *m, uint32_t neg) |
|
72
|
|
|
|
|
|
|
{ |
|
73
|
|
|
|
|
|
|
size_t k; |
|
74
|
|
|
|
|
|
|
uint32_t cc, xm, ym; |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
/* |
|
77
|
|
|
|
|
|
|
* First pass: compare a (assumed nonnegative) with m. |
|
78
|
|
|
|
|
|
|
*/ |
|
79
|
0
|
|
|
|
|
|
cc = 0; |
|
80
|
0
|
0
|
|
|
|
|
for (k = 0; k < len; k ++) { |
|
81
|
|
|
|
|
|
|
uint32_t aw, mw; |
|
82
|
|
|
|
|
|
|
|
|
83
|
0
|
|
|
|
|
|
aw = a[k]; |
|
84
|
0
|
|
|
|
|
|
mw = m[k]; |
|
85
|
0
|
|
|
|
|
|
cc = (aw - mw - cc) >> 31; |
|
86
|
|
|
|
|
|
|
} |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
/* |
|
89
|
|
|
|
|
|
|
* At this point: |
|
90
|
|
|
|
|
|
|
* if neg = 1, then we must add m (regardless of cc) |
|
91
|
|
|
|
|
|
|
* if neg = 0 and cc = 0, then we must subtract m |
|
92
|
|
|
|
|
|
|
* if neg = 0 and cc = 1, then we must do nothing |
|
93
|
|
|
|
|
|
|
*/ |
|
94
|
0
|
|
|
|
|
|
xm = 0x7FFF & -neg; |
|
95
|
0
|
|
|
|
|
|
ym = -(neg | (1 - cc)); |
|
96
|
0
|
|
|
|
|
|
cc = neg; |
|
97
|
0
|
0
|
|
|
|
|
for (k = 0; k < len; k ++) { |
|
98
|
|
|
|
|
|
|
uint32_t aw, mw; |
|
99
|
|
|
|
|
|
|
|
|
100
|
0
|
|
|
|
|
|
aw = a[k]; |
|
101
|
0
|
|
|
|
|
|
mw = (m[k] ^ xm) & ym; |
|
102
|
0
|
|
|
|
|
|
aw = aw - mw - cc; |
|
103
|
0
|
|
|
|
|
|
a[k] = aw & 0x7FFF; |
|
104
|
0
|
|
|
|
|
|
cc = aw >> 31; |
|
105
|
|
|
|
|
|
|
} |
|
106
|
0
|
|
|
|
|
|
} |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
/* |
|
109
|
|
|
|
|
|
|
* Compute: |
|
110
|
|
|
|
|
|
|
* a <- (a*pa+b*pb)/(2^15) |
|
111
|
|
|
|
|
|
|
* b <- (a*qa+b*qb)/(2^15) |
|
112
|
|
|
|
|
|
|
* The division is assumed to be exact (i.e. the low word is dropped). |
|
113
|
|
|
|
|
|
|
* If the final a is negative, then it is negated. Similarly for b. |
|
114
|
|
|
|
|
|
|
* Returned value is the combination of two bits: |
|
115
|
|
|
|
|
|
|
* bit 0: 1 if a had to be negated, 0 otherwise |
|
116
|
|
|
|
|
|
|
* bit 1: 1 if b had to be negated, 0 otherwise |
|
117
|
|
|
|
|
|
|
* |
|
118
|
|
|
|
|
|
|
* Factors pa, pb, qa and qb must be at most 2^15 in absolute value. |
|
119
|
|
|
|
|
|
|
* Source integers a and b must be nonnegative; top word is not allowed |
|
120
|
|
|
|
|
|
|
* to contain an extra 16th bit. |
|
121
|
|
|
|
|
|
|
*/ |
|
122
|
|
|
|
|
|
|
static uint32_t |
|
123
|
0
|
|
|
|
|
|
co_reduce(uint16_t *a, uint16_t *b, size_t len, |
|
124
|
|
|
|
|
|
|
int32_t pa, int32_t pb, int32_t qa, int32_t qb) |
|
125
|
|
|
|
|
|
|
{ |
|
126
|
|
|
|
|
|
|
size_t k; |
|
127
|
|
|
|
|
|
|
int32_t cca, ccb; |
|
128
|
|
|
|
|
|
|
uint32_t nega, negb; |
|
129
|
|
|
|
|
|
|
|
|
130
|
0
|
|
|
|
|
|
cca = 0; |
|
131
|
0
|
|
|
|
|
|
ccb = 0; |
|
132
|
0
|
0
|
|
|
|
|
for (k = 0; k < len; k ++) { |
|
133
|
|
|
|
|
|
|
uint32_t wa, wb, za, zb; |
|
134
|
|
|
|
|
|
|
uint16_t tta, ttb; |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
/* |
|
137
|
|
|
|
|
|
|
* Since: |
|
138
|
|
|
|
|
|
|
* |pa| <= 2^15 |
|
139
|
|
|
|
|
|
|
* |pb| <= 2^15 |
|
140
|
|
|
|
|
|
|
* 0 <= wa <= 2^15 - 1 |
|
141
|
|
|
|
|
|
|
* 0 <= wb <= 2^15 - 1 |
|
142
|
|
|
|
|
|
|
* |cca| <= 2^16 - 1 |
|
143
|
|
|
|
|
|
|
* Then: |
|
144
|
|
|
|
|
|
|
* |za| <= (2^15-1)*(2^16) + (2^16-1) = 2^31 - 1 |
|
145
|
|
|
|
|
|
|
* |
|
146
|
|
|
|
|
|
|
* Thus, the new value of cca is such that |cca| <= 2^16 - 1. |
|
147
|
|
|
|
|
|
|
* The same applies to ccb. |
|
148
|
|
|
|
|
|
|
*/ |
|
149
|
0
|
|
|
|
|
|
wa = a[k]; |
|
150
|
0
|
|
|
|
|
|
wb = b[k]; |
|
151
|
0
|
|
|
|
|
|
za = wa * (uint32_t)pa + wb * (uint32_t)pb + (uint32_t)cca; |
|
152
|
0
|
|
|
|
|
|
zb = wa * (uint32_t)qa + wb * (uint32_t)qb + (uint32_t)ccb; |
|
153
|
0
|
0
|
|
|
|
|
if (k > 0) { |
|
154
|
0
|
|
|
|
|
|
a[k - 1] = za & 0x7FFF; |
|
155
|
0
|
|
|
|
|
|
b[k - 1] = zb & 0x7FFF; |
|
156
|
|
|
|
|
|
|
} |
|
157
|
0
|
|
|
|
|
|
tta = za >> 15; |
|
158
|
0
|
|
|
|
|
|
ttb = zb >> 15; |
|
159
|
0
|
|
|
|
|
|
cca = *(int16_t *)&tta; |
|
160
|
0
|
|
|
|
|
|
ccb = *(int16_t *)&ttb; |
|
161
|
|
|
|
|
|
|
} |
|
162
|
0
|
|
|
|
|
|
a[len - 1] = (uint16_t)cca; |
|
163
|
0
|
|
|
|
|
|
b[len - 1] = (uint16_t)ccb; |
|
164
|
0
|
|
|
|
|
|
nega = (uint32_t)cca >> 31; |
|
165
|
0
|
|
|
|
|
|
negb = (uint32_t)ccb >> 31; |
|
166
|
0
|
|
|
|
|
|
cond_negate(a, len, nega); |
|
167
|
0
|
|
|
|
|
|
cond_negate(b, len, negb); |
|
168
|
0
|
|
|
|
|
|
return nega | (negb << 1); |
|
169
|
|
|
|
|
|
|
} |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
/* |
|
172
|
|
|
|
|
|
|
* Compute: |
|
173
|
|
|
|
|
|
|
* a <- (a*pa+b*pb)/(2^15) mod m |
|
174
|
|
|
|
|
|
|
* b <- (a*qa+b*qb)/(2^15) mod m |
|
175
|
|
|
|
|
|
|
* |
|
176
|
|
|
|
|
|
|
* m0i is equal to -1/m[0] mod 2^15. |
|
177
|
|
|
|
|
|
|
* |
|
178
|
|
|
|
|
|
|
* Factors pa, pb, qa and qb must be at most 2^15 in absolute value. |
|
179
|
|
|
|
|
|
|
* Source integers a and b must be nonnegative; top word is not allowed |
|
180
|
|
|
|
|
|
|
* to contain an extra 16th bit. |
|
181
|
|
|
|
|
|
|
*/ |
|
182
|
|
|
|
|
|
|
static void |
|
183
|
0
|
|
|
|
|
|
co_reduce_mod(uint16_t *a, uint16_t *b, size_t len, |
|
184
|
|
|
|
|
|
|
int32_t pa, int32_t pb, int32_t qa, int32_t qb, |
|
185
|
|
|
|
|
|
|
const uint16_t *m, uint16_t m0i) |
|
186
|
|
|
|
|
|
|
{ |
|
187
|
|
|
|
|
|
|
size_t k; |
|
188
|
|
|
|
|
|
|
int32_t cca, ccb, fa, fb; |
|
189
|
|
|
|
|
|
|
|
|
190
|
0
|
|
|
|
|
|
cca = 0; |
|
191
|
0
|
|
|
|
|
|
ccb = 0; |
|
192
|
0
|
|
|
|
|
|
fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFF; |
|
193
|
0
|
|
|
|
|
|
fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFF; |
|
194
|
0
|
0
|
|
|
|
|
for (k = 0; k < len; k ++) { |
|
195
|
|
|
|
|
|
|
uint32_t wa, wb, za, zb; |
|
196
|
|
|
|
|
|
|
uint32_t tta, ttb; |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
/* |
|
199
|
|
|
|
|
|
|
* In this loop, carries 'cca' and 'ccb' always fit on |
|
200
|
|
|
|
|
|
|
* 17 bits (in absolute value). |
|
201
|
|
|
|
|
|
|
*/ |
|
202
|
0
|
|
|
|
|
|
wa = a[k]; |
|
203
|
0
|
|
|
|
|
|
wb = b[k]; |
|
204
|
0
|
|
|
|
|
|
za = wa * (uint32_t)pa + wb * (uint32_t)pb |
|
205
|
0
|
|
|
|
|
|
+ m[k] * (uint32_t)fa + (uint32_t)cca; |
|
206
|
0
|
|
|
|
|
|
zb = wa * (uint32_t)qa + wb * (uint32_t)qb |
|
207
|
0
|
|
|
|
|
|
+ m[k] * (uint32_t)fb + (uint32_t)ccb; |
|
208
|
0
|
0
|
|
|
|
|
if (k > 0) { |
|
209
|
0
|
|
|
|
|
|
a[k - 1] = za & 0x7FFF; |
|
210
|
0
|
|
|
|
|
|
b[k - 1] = zb & 0x7FFF; |
|
211
|
|
|
|
|
|
|
} |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
/* |
|
214
|
|
|
|
|
|
|
* The XOR-and-sub construction below does an arithmetic |
|
215
|
|
|
|
|
|
|
* right shift in a portable way (technically, right-shifting |
|
216
|
|
|
|
|
|
|
* a negative signed value is implementation-defined in C). |
|
217
|
|
|
|
|
|
|
*/ |
|
218
|
|
|
|
|
|
|
#define M ((uint32_t)1 << 16) |
|
219
|
0
|
|
|
|
|
|
tta = za >> 15; |
|
220
|
0
|
|
|
|
|
|
ttb = zb >> 15; |
|
221
|
0
|
|
|
|
|
|
tta = (tta ^ M) - M; |
|
222
|
0
|
|
|
|
|
|
ttb = (ttb ^ M) - M; |
|
223
|
0
|
|
|
|
|
|
cca = *(int32_t *)&tta; |
|
224
|
0
|
|
|
|
|
|
ccb = *(int32_t *)&ttb; |
|
225
|
|
|
|
|
|
|
#undef M |
|
226
|
|
|
|
|
|
|
} |
|
227
|
0
|
|
|
|
|
|
a[len - 1] = (uint32_t)cca; |
|
228
|
0
|
|
|
|
|
|
b[len - 1] = (uint32_t)ccb; |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
/* |
|
231
|
|
|
|
|
|
|
* At this point: |
|
232
|
|
|
|
|
|
|
* -m <= a < 2*m |
|
233
|
|
|
|
|
|
|
* -m <= b < 2*m |
|
234
|
|
|
|
|
|
|
* (this is a case of Montgomery reduction) |
|
235
|
|
|
|
|
|
|
* The top word of 'a' and 'b' may have a 16-th bit set. |
|
236
|
|
|
|
|
|
|
* We may have to add or subtract the modulus. |
|
237
|
|
|
|
|
|
|
*/ |
|
238
|
0
|
|
|
|
|
|
finish_mod(a, len, m, (uint32_t)cca >> 31); |
|
239
|
0
|
|
|
|
|
|
finish_mod(b, len, m, (uint32_t)ccb >> 31); |
|
240
|
0
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
/* see inner.h */ |
|
243
|
|
|
|
|
|
|
uint32_t |
|
244
|
0
|
|
|
|
|
|
br_i15_moddiv(uint16_t *x, const uint16_t *y, const uint16_t *m, uint16_t m0i, |
|
245
|
|
|
|
|
|
|
uint16_t *t) |
|
246
|
|
|
|
|
|
|
{ |
|
247
|
|
|
|
|
|
|
/* |
|
248
|
|
|
|
|
|
|
* Algorithm is an extended binary GCD. We maintain four values |
|
249
|
|
|
|
|
|
|
* a, b, u and v, with the following invariants: |
|
250
|
|
|
|
|
|
|
* |
|
251
|
|
|
|
|
|
|
* a * x = y * u mod m |
|
252
|
|
|
|
|
|
|
* b * x = y * v mod m |
|
253
|
|
|
|
|
|
|
* |
|
254
|
|
|
|
|
|
|
* Starting values are: |
|
255
|
|
|
|
|
|
|
* |
|
256
|
|
|
|
|
|
|
* a = y |
|
257
|
|
|
|
|
|
|
* b = m |
|
258
|
|
|
|
|
|
|
* u = x |
|
259
|
|
|
|
|
|
|
* v = 0 |
|
260
|
|
|
|
|
|
|
* |
|
261
|
|
|
|
|
|
|
* The formal definition of the algorithm is a sequence of steps: |
|
262
|
|
|
|
|
|
|
* |
|
263
|
|
|
|
|
|
|
* - If a is even, then a <- a/2 and u <- u/2 mod m. |
|
264
|
|
|
|
|
|
|
* - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m. |
|
265
|
|
|
|
|
|
|
* - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m. |
|
266
|
|
|
|
|
|
|
* - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m. |
|
267
|
|
|
|
|
|
|
* |
|
268
|
|
|
|
|
|
|
* Algorithm stops when a = b. At that point, they both are equal |
|
269
|
|
|
|
|
|
|
* to GCD(y,m); the modular division succeeds if that value is 1. |
|
270
|
|
|
|
|
|
|
* The result of the modular division is then u (or v: both are |
|
271
|
|
|
|
|
|
|
* equal at that point). |
|
272
|
|
|
|
|
|
|
* |
|
273
|
|
|
|
|
|
|
* Each step makes either a or b shrink by at least one bit; hence, |
|
274
|
|
|
|
|
|
|
* if m has bit length k bits, then 2k-2 steps are sufficient. |
|
275
|
|
|
|
|
|
|
* |
|
276
|
|
|
|
|
|
|
* |
|
277
|
|
|
|
|
|
|
* Though complexity is quadratic in the size of m, the bit-by-bit |
|
278
|
|
|
|
|
|
|
* processing is not very efficient. We can speed up processing by |
|
279
|
|
|
|
|
|
|
* remarking that the decisions are taken based only on observation |
|
280
|
|
|
|
|
|
|
* of the top and low bits of a and b. |
|
281
|
|
|
|
|
|
|
* |
|
282
|
|
|
|
|
|
|
* In the loop below, at each iteration, we use the two top words |
|
283
|
|
|
|
|
|
|
* of a and b, and the low words of a and b, to compute reduction |
|
284
|
|
|
|
|
|
|
* parameters pa, pb, qa and qb such that the new values for a |
|
285
|
|
|
|
|
|
|
* and b are: |
|
286
|
|
|
|
|
|
|
* |
|
287
|
|
|
|
|
|
|
* a' = (a*pa + b*pb) / (2^15) |
|
288
|
|
|
|
|
|
|
* b' = (a*qa + b*qb) / (2^15) |
|
289
|
|
|
|
|
|
|
* |
|
290
|
|
|
|
|
|
|
* the division being exact. |
|
291
|
|
|
|
|
|
|
* |
|
292
|
|
|
|
|
|
|
* Since the choices are based on the top words, they may be slightly |
|
293
|
|
|
|
|
|
|
* off, requiring an optional correction: if a' < 0, then we replace |
|
294
|
|
|
|
|
|
|
* pa with -pa, and pb with -pb. The total length of a and b is |
|
295
|
|
|
|
|
|
|
* thus reduced by at least 14 bits at each iteration. |
|
296
|
|
|
|
|
|
|
* |
|
297
|
|
|
|
|
|
|
* The stopping conditions are still the same, though: when a |
|
298
|
|
|
|
|
|
|
* and b become equal, they must be both odd (since m is odd, |
|
299
|
|
|
|
|
|
|
* the GCD cannot be even), therefore the next operation is a |
|
300
|
|
|
|
|
|
|
* subtraction, and one of the values becomes 0. At that point, |
|
301
|
|
|
|
|
|
|
* nothing else happens, i.e. one value is stuck at 0, and the |
|
302
|
|
|
|
|
|
|
* other one is the GCD. |
|
303
|
|
|
|
|
|
|
*/ |
|
304
|
|
|
|
|
|
|
size_t len, k; |
|
305
|
|
|
|
|
|
|
uint16_t *a, *b, *u, *v; |
|
306
|
|
|
|
|
|
|
uint32_t num, r; |
|
307
|
|
|
|
|
|
|
|
|
308
|
0
|
|
|
|
|
|
len = (m[0] + 15) >> 4; |
|
309
|
0
|
|
|
|
|
|
a = t; |
|
310
|
0
|
|
|
|
|
|
b = a + len; |
|
311
|
0
|
|
|
|
|
|
u = x + 1; |
|
312
|
0
|
|
|
|
|
|
v = b + len; |
|
313
|
0
|
|
|
|
|
|
memcpy(a, y + 1, len * sizeof *y); |
|
314
|
0
|
|
|
|
|
|
memcpy(b, m + 1, len * sizeof *m); |
|
315
|
0
|
|
|
|
|
|
memset(v, 0, len * sizeof *v); |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
/* |
|
318
|
|
|
|
|
|
|
* Loop below ensures that a and b are reduced by some bits each, |
|
319
|
|
|
|
|
|
|
* for a total of at least 14 bits. |
|
320
|
|
|
|
|
|
|
*/ |
|
321
|
0
|
0
|
|
|
|
|
for (num = ((m[0] - (m[0] >> 4)) << 1) + 14; num >= 14; num -= 14) { |
|
322
|
|
|
|
|
|
|
size_t j; |
|
323
|
|
|
|
|
|
|
uint32_t c0, c1; |
|
324
|
|
|
|
|
|
|
uint32_t a0, a1, b0, b1; |
|
325
|
|
|
|
|
|
|
uint32_t a_hi, b_hi, a_lo, b_lo; |
|
326
|
|
|
|
|
|
|
int32_t pa, pb, qa, qb; |
|
327
|
|
|
|
|
|
|
int i; |
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
/* |
|
330
|
|
|
|
|
|
|
* Extract top words of a and b. If j is the highest |
|
331
|
|
|
|
|
|
|
* index >= 1 such that a[j] != 0 or b[j] != 0, then we want |
|
332
|
|
|
|
|
|
|
* (a[j] << 15) + a[j - 1], and (b[j] << 15) + b[j - 1]. |
|
333
|
|
|
|
|
|
|
* If a and b are down to one word each, then we use a[0] |
|
334
|
|
|
|
|
|
|
* and b[0]. |
|
335
|
|
|
|
|
|
|
*/ |
|
336
|
0
|
|
|
|
|
|
c0 = (uint32_t)-1; |
|
337
|
0
|
|
|
|
|
|
c1 = (uint32_t)-1; |
|
338
|
0
|
|
|
|
|
|
a0 = 0; |
|
339
|
0
|
|
|
|
|
|
a1 = 0; |
|
340
|
0
|
|
|
|
|
|
b0 = 0; |
|
341
|
0
|
|
|
|
|
|
b1 = 0; |
|
342
|
0
|
|
|
|
|
|
j = len; |
|
343
|
0
|
0
|
|
|
|
|
while (j -- > 0) { |
|
344
|
|
|
|
|
|
|
uint32_t aw, bw; |
|
345
|
|
|
|
|
|
|
|
|
346
|
0
|
|
|
|
|
|
aw = a[j]; |
|
347
|
0
|
|
|
|
|
|
bw = b[j]; |
|
348
|
0
|
|
|
|
|
|
a0 ^= (a0 ^ aw) & c0; |
|
349
|
0
|
|
|
|
|
|
a1 ^= (a1 ^ aw) & c1; |
|
350
|
0
|
|
|
|
|
|
b0 ^= (b0 ^ bw) & c0; |
|
351
|
0
|
|
|
|
|
|
b1 ^= (b1 ^ bw) & c1; |
|
352
|
0
|
|
|
|
|
|
c1 = c0; |
|
353
|
0
|
|
|
|
|
|
c0 &= (((aw | bw) + 0xFFFF) >> 16) - (uint32_t)1; |
|
354
|
|
|
|
|
|
|
} |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
/* |
|
357
|
|
|
|
|
|
|
* If c1 = 0, then we grabbed two words for a and b. |
|
358
|
|
|
|
|
|
|
* If c1 != 0 but c0 = 0, then we grabbed one word. It |
|
359
|
|
|
|
|
|
|
* is not possible that c1 != 0 and c0 != 0, because that |
|
360
|
|
|
|
|
|
|
* would mean that both integers are zero. |
|
361
|
|
|
|
|
|
|
*/ |
|
362
|
0
|
|
|
|
|
|
a1 |= a0 & c1; |
|
363
|
0
|
|
|
|
|
|
a0 &= ~c1; |
|
364
|
0
|
|
|
|
|
|
b1 |= b0 & c1; |
|
365
|
0
|
|
|
|
|
|
b0 &= ~c1; |
|
366
|
0
|
|
|
|
|
|
a_hi = (a0 << 15) + a1; |
|
367
|
0
|
|
|
|
|
|
b_hi = (b0 << 15) + b1; |
|
368
|
0
|
|
|
|
|
|
a_lo = a[0]; |
|
369
|
0
|
|
|
|
|
|
b_lo = b[0]; |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
/* |
|
372
|
|
|
|
|
|
|
* Compute reduction factors: |
|
373
|
|
|
|
|
|
|
* |
|
374
|
|
|
|
|
|
|
* a' = a*pa + b*pb |
|
375
|
|
|
|
|
|
|
* b' = a*qa + b*qb |
|
376
|
|
|
|
|
|
|
* |
|
377
|
|
|
|
|
|
|
* such that a' and b' are both multiple of 2^15, but are |
|
378
|
|
|
|
|
|
|
* only marginally larger than a and b. |
|
379
|
|
|
|
|
|
|
*/ |
|
380
|
0
|
|
|
|
|
|
pa = 1; |
|
381
|
0
|
|
|
|
|
|
pb = 0; |
|
382
|
0
|
|
|
|
|
|
qa = 0; |
|
383
|
0
|
|
|
|
|
|
qb = 1; |
|
384
|
0
|
0
|
|
|
|
|
for (i = 0; i < 15; i ++) { |
|
385
|
|
|
|
|
|
|
/* |
|
386
|
|
|
|
|
|
|
* At each iteration: |
|
387
|
|
|
|
|
|
|
* |
|
388
|
|
|
|
|
|
|
* a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi |
|
389
|
|
|
|
|
|
|
* b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi |
|
390
|
|
|
|
|
|
|
* a <- a/2 if: a is even |
|
391
|
|
|
|
|
|
|
* b <- b/2 if: a is odd, b is even |
|
392
|
|
|
|
|
|
|
* |
|
393
|
|
|
|
|
|
|
* We multiply a_lo and b_lo by 2 at each |
|
394
|
|
|
|
|
|
|
* iteration, thus a division by 2 really is a |
|
395
|
|
|
|
|
|
|
* non-multiplication by 2. |
|
396
|
|
|
|
|
|
|
*/ |
|
397
|
|
|
|
|
|
|
uint32_t r, oa, ob, cAB, cBA, cA; |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
/* |
|
400
|
|
|
|
|
|
|
* cAB = 1 if b must be subtracted from a |
|
401
|
|
|
|
|
|
|
* cBA = 1 if a must be subtracted from b |
|
402
|
|
|
|
|
|
|
* cA = 1 if a is divided by 2, 0 otherwise |
|
403
|
|
|
|
|
|
|
* |
|
404
|
|
|
|
|
|
|
* Rules: |
|
405
|
|
|
|
|
|
|
* |
|
406
|
|
|
|
|
|
|
* cAB and cBA cannot be both 1. |
|
407
|
|
|
|
|
|
|
* if a is not divided by 2, b is. |
|
408
|
|
|
|
|
|
|
*/ |
|
409
|
0
|
|
|
|
|
|
r = GT(a_hi, b_hi); |
|
410
|
0
|
|
|
|
|
|
oa = (a_lo >> i) & 1; |
|
411
|
0
|
|
|
|
|
|
ob = (b_lo >> i) & 1; |
|
412
|
0
|
|
|
|
|
|
cAB = oa & ob & r; |
|
413
|
0
|
|
|
|
|
|
cBA = oa & ob & NOT(r); |
|
414
|
0
|
|
|
|
|
|
cA = cAB | NOT(oa); |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
/* |
|
417
|
|
|
|
|
|
|
* Conditional subtractions. |
|
418
|
|
|
|
|
|
|
*/ |
|
419
|
0
|
|
|
|
|
|
a_lo -= b_lo & -cAB; |
|
420
|
0
|
|
|
|
|
|
a_hi -= b_hi & -cAB; |
|
421
|
0
|
|
|
|
|
|
pa -= qa & -(int32_t)cAB; |
|
422
|
0
|
|
|
|
|
|
pb -= qb & -(int32_t)cAB; |
|
423
|
0
|
|
|
|
|
|
b_lo -= a_lo & -cBA; |
|
424
|
0
|
|
|
|
|
|
b_hi -= a_hi & -cBA; |
|
425
|
0
|
|
|
|
|
|
qa -= pa & -(int32_t)cBA; |
|
426
|
0
|
|
|
|
|
|
qb -= pb & -(int32_t)cBA; |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
/* |
|
429
|
|
|
|
|
|
|
* Shifting. |
|
430
|
|
|
|
|
|
|
*/ |
|
431
|
0
|
|
|
|
|
|
a_lo += a_lo & (cA - 1); |
|
432
|
0
|
|
|
|
|
|
pa += pa & ((int32_t)cA - 1); |
|
433
|
0
|
|
|
|
|
|
pb += pb & ((int32_t)cA - 1); |
|
434
|
0
|
|
|
|
|
|
a_hi ^= (a_hi ^ (a_hi >> 1)) & -cA; |
|
435
|
0
|
|
|
|
|
|
b_lo += b_lo & -cA; |
|
436
|
0
|
|
|
|
|
|
qa += qa & -(int32_t)cA; |
|
437
|
0
|
|
|
|
|
|
qb += qb & -(int32_t)cA; |
|
438
|
0
|
|
|
|
|
|
b_hi ^= (b_hi ^ (b_hi >> 1)) & (cA - 1); |
|
439
|
|
|
|
|
|
|
} |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
/* |
|
442
|
|
|
|
|
|
|
* Replace a and b with new values a' and b'. |
|
443
|
|
|
|
|
|
|
*/ |
|
444
|
0
|
|
|
|
|
|
r = co_reduce(a, b, len, pa, pb, qa, qb); |
|
445
|
0
|
|
|
|
|
|
pa -= pa * ((r & 1) << 1); |
|
446
|
0
|
|
|
|
|
|
pb -= pb * ((r & 1) << 1); |
|
447
|
0
|
|
|
|
|
|
qa -= qa * (r & 2); |
|
448
|
0
|
|
|
|
|
|
qb -= qb * (r & 2); |
|
449
|
0
|
|
|
|
|
|
co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i); |
|
450
|
|
|
|
|
|
|
} |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
/* |
|
453
|
|
|
|
|
|
|
* Now one of the arrays should be 0, and the other contains |
|
454
|
|
|
|
|
|
|
* the GCD. If a is 0, then u is 0 as well, and v contains |
|
455
|
|
|
|
|
|
|
* the division result. |
|
456
|
|
|
|
|
|
|
* Result is correct if and only if GCD is 1. |
|
457
|
|
|
|
|
|
|
*/ |
|
458
|
0
|
|
|
|
|
|
r = (a[0] | b[0]) ^ 1; |
|
459
|
0
|
|
|
|
|
|
u[0] |= v[0]; |
|
460
|
0
|
0
|
|
|
|
|
for (k = 1; k < len; k ++) { |
|
461
|
0
|
|
|
|
|
|
r |= a[k] | b[k]; |
|
462
|
0
|
|
|
|
|
|
u[k] |= v[k]; |
|
463
|
|
|
|
|
|
|
} |
|
464
|
0
|
|
|
|
|
|
return EQ0(r); |
|
465
|
|
|
|
|
|
|
} |