| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#ifndef MPU_MONTMATH_H |
|
2
|
|
|
|
|
|
|
#define MPU_MONTMATH_H |
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
#include "ptypes.h" |
|
5
|
|
|
|
|
|
|
#include "mulmod.h" |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
#if BITS_PER_WORD == 64 && HAVE_UINT64 && defined(__GNUC__) && defined(__x86_64__) |
|
8
|
|
|
|
|
|
|
#define USE_MONTMATH 1 |
|
9
|
|
|
|
|
|
|
#else |
|
10
|
|
|
|
|
|
|
#define USE_MONTMATH 0 |
|
11
|
|
|
|
|
|
|
#endif |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
#if USE_MONTMATH

/* Montgomery-form helpers.  R = 2^64 throughout; a value x is carried in
 * Montgomery form as x*R mod n.  The macros below reference the caller's
 * locals npi and mont1 by name, so callers must declare:
 *   uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
 */

/* 1 in Montgomery form: R mod n (computed as (1<<64)/n remainder) */
#define mont_get1(n) _u64div(1,n)
/* Must have npi = mont_inverse(n), mont1 = mont_get1(n) */
/* 2 in Montgomery form: mont1 + mont1 mod n */
#define mont_get2(n) addmod(mont1,mont1,n)
/* Convert a into Montgomery form: a*R mod n */
#define mont_geta(a,n) mulmod(a,mont1,n)
/* Multiply / square / exponentiate, operands and results in Montgomery form */
#define mont_mulmod(a,b,n) _mulredc(a,b,n,npi)
#define mont_sqrmod(a,n) _mulredc(a,a,n,npi)
#define mont_powmod(a,k,n) _powredc(a,k,mont1,n,npi)
/* Convert back out of Montgomery form: a/R mod n */
#define mont_recover(a,n) mont_mulmod(a,1,n)

/* Save one branch if desired by calling directly */
#define mont_mulmod63(a,b,n) _mulredc63(a,b,n,npi)
#define mont_mulmod64(a,b,n) _mulredc64(a,b,n,npi)

/* See https://arxiv.org/pdf/1303.0328.pdf for lots of details on this.
 * The 128-entry table solution is about 20% faster */
|
30
|
16613
|
|
|
|
|
|
/* Compute -n^{-1} mod 2^64 for odd n (the npi value used by the
 * _mulredc* reductions).  Uses Newton-Hensel iteration: the seed
 * (3n)^2 is correct to 5 low-order bits and each step doubles the
 * number of correct bits, so four steps exceed 64 bits. */
static INLINE uint64_t mont_inverse(const uint64_t n) {
  uint64_t inv = (3*n) ^ 2;               /* seed: good to 5 bits */
  int step;
  for (step = 0; step < 4; step++)
    inv *= (uint64_t)2 - n * inv;         /* Newton step: 5 -> 10 -> 20 -> 40 -> 80 bits */
  return (uint64_t)0 - inv;               /* negate: REDC wants -1/n mod R */
}
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
/* MULREDC asm from Ben Buhrow */
/* Montgomery multiply-and-reduce for odd n < 2^63: returns a*b/R mod n
 * with R = 2^64, given npi = mont_inverse(n).  Because n < 2^63, the
 * 128-bit sum x + m*n cannot carry out of the top word, which lets this
 * version skip the extra correction _mulredc64 needs. */
static INLINE uint64_t _mulredc63(uint64_t a, uint64_t b, uint64_t n, uint64_t npi) {
  __asm__ ("mulq %2 \n\t"            /* rdx:rax = x = a*b                     */
           "movq %%rax, %%r10 \n\t"  /* r10 = low word of x                   */
           "movq %%rdx, %%r11 \n\t"  /* r11 = high word of x                  */
           "mulq %3 \n\t"            /* rax = m = low(x)*npi mod R            */
           "mulq %4 \n\t"            /* rdx:rax = m*n                         */
           "addq %%r10, %%rax \n\t"  /* x + m*n, low half (sums to 0 mod R)   */
           "adcq %%r11, %%rdx \n\t"  /* rdx = (x + m*n)/R; no carry: n < 2^63 */
           "xorq %%rax, %%rax \n\t"
           "subq %4, %%rdx \n\t"     /* conditional final reduction: rdx -= n */
           "cmovc %4, %%rax \n\t"    /* if that borrowed, stage n to add back */
           "addq %%rdx, %%rax \n\t"
           : "=a"(a)
           : "0"(a), "r"(b), "r"(npi), "r"(n)
           : "rdx", "r10", "r11", "cc");
  return a;
}
|
57
|
1755
|
|
|
|
|
|
/* Montgomery multiply-and-reduce for any odd 64-bit n, including
 * n >= 2^63 where x + m*n can carry out of the 128-bit sum.  When that
 * carry occurs, the implicit +2^64 already absorbs the subtraction of n,
 * so the borrow fix-up must be suppressed (r12 holds n only when no
 * carry occurred). */
static INLINE uint64_t _mulredc64(uint64_t a, uint64_t b, uint64_t n, uint64_t npi) {
  __asm__ ("mulq %1 \n\t"             /* rdx:rax = x = a*b                      */
           "movq %%rax, %%r10 \n\t"   /* r10 = low word of x                    */
           "movq %%rdx, %%r11 \n\t"   /* r11 = high word of x                   */
           "movq $0, %%r12 \n\t"      /* r12 = 0 (default fix-up value)         */
           "mulq %2 \n\t"             /* rax = m = low(x)*npi mod R             */
           "mulq %3 \n\t"             /* rdx:rax = m*n                          */
           "addq %%r10, %%rax \n\t"   /* x + m*n, low half (sums to 0 mod R)    */
           "adcq %%r11, %%rdx \n\t"   /* rdx = high half; CF = 128-bit carry    */
           "cmovae %3, %%r12 \n\t"    /* no carry -> r12 = n (borrow fix-up ok) */
           "xorq %%rax, %%rax \n\t"
           "subq %3, %%rdx \n\t"      /* conditional final reduction: rdx -= n  */
           "cmovc %%r12, %%rax \n\t"  /* borrowed: add n back only if no carry  */
           "addq %%rdx, %%rax \n\t"
           : "+&a"(a)
           : "r"(b), "r"(npi), "r"(n)
           : "rdx", "r10", "r11", "r12", "cc");
  return a;
}
|
76
|
|
|
|
|
|
|
#define _mulredc(a,b,n,npi) ((n & 0x8000000000000000ULL) ? _mulredc64(a,b,n,npi) : _mulredc63(a,b,n,npi)) |
|
77
|
|
|
|
|
|
|
|
|
78
|
3663
|
|
|
|
|
|
/* Right-to-left binary exponentiation entirely in Montgomery form:
 * returns a^k (Montgomery) mod n, where 'one' is mont_get1(n) and
 * npi = mont_inverse(n).  Returns 'one' when k == 0. */
static INLINE UV _powredc(uint64_t a, uint64_t k, uint64_t one, uint64_t n, uint64_t npi) {
  uint64_t result = one;
  for (;;) {
    if (k & 1)
      result = mont_mulmod(result, a, n);   /* multiply in this bit */
    k >>= 1;
    if (!k)
      break;                                /* skip the final (useless) squaring */
    a = mont_sqrmod(a, n);
  }
  return result;
}
|
87
|
|
|
|
|
|
|
|
|
88
|
16613
|
|
|
|
|
|
/* Returns (c * 2^64) mod n using the 128/64-bit divide: divq divides
 * rdx:rax (here c:0) by n, leaving the remainder in rdx.  With c = 1
 * this yields R mod n, i.e. 1 in Montgomery form (see mont_get1).
 * NOTE(review): requires c < n, otherwise the quotient overflows and
 * divq raises #DE — callers here only pass c = 1. */
static INLINE uint64_t _u64div(uint64_t c, uint64_t n) {
  __asm__("divq %4"
      : "=a"(c), "=d"(n)        /* quotient -> c, remainder -> n */
      : "1"(c), "0"(0), "r"(n));
  return n;                     /* the remainder */
}
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
#endif /* use_montmath */ |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
#if 0 |
|
98
|
|
|
|
|
|
|
/* AArch64 */ |
|
99
|
|
|
|
|
|
|
/* https://www.pure.ed.ac.uk/ws/portalfiles/portal/412503872/Concurrency_and_Computation_-_2023_-_Jesus_-_Vectorizing_and_distributing_number_theoretic_transform_to_count_Goldbach.pdf */ |
|
100
|
|
|
|
|
|
|
/* https://era.ed.ac.uk/server/api/core/bitstreams/ed9176f7-d8ed-4af9-aa20-3a82c5f8e353/content */ |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
/* 64x64 -> 128-bit multiply for AArch64: sets ph to the high 64 bits of
 * m0*m1 (via the umulh instruction) and pl to the low 64 bits (plain
 * multiply).  Arguments are captured once into locals, so side-effecting
 * expressions are safe. */
#define umul128(ph, pl, m0, m1) do { \
    uint64_t __m0 = (m0), __m1 = (m1); \
    __asm__ ("umulh\t%0, %1, %2" \
      : "=r" (ph) \
      : "r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while(0)
|
109
|
|
|
|
|
|
|
/* Montgomery multiply-and-reduce (AArch64 sketch): returns a*b/R mod n
 * with R = 2^64, given npi = mont_inverse(n).
 * Fixes vs the original sketch: the 'm =' statement was missing its
 * semicolon, and the final reduction compared against an undeclared
 * 'N' instead of 'n' — as written this block would not compile if the
 * surrounding #if 0 were ever enabled. */
static INLINE uint64_t montmul(uint64_t a, uint64_t b, uint64_t n, uint64_t npi) {
  uint64_t m, xh, xl, yh, yl, z;
  umul128(xh, xl, a, b);            /* x = a * b */
  m = (uint64_t)(xl * npi);         /* m = (x*N') mod R */
  umul128(yh, yl, m, n);            /* y = m * N */
  z = xh+yh+((xl+yl) < xl);         /* z = (x+y)/R; low words cancel, keep their carry */
  return z >= n ? z-n : z;          /* final conditional subtraction into [0,n) */
}
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
/* |
|
119
|
|
|
|
|
|
|
Q = R^2 mod N |
|
120
|
|
|
|
|
|
|
to_mont(x) = xR mod N = montmul(x,Q) |
|
121
|
|
|
|
|
|
|
from_mont(X) = X/R mod N = montmul(X,1) |
|
122
|
|
|
|
|
|
|
*/ |
|
123
|
|
|
|
|
|
|
#endif |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
#endif |