| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#ifndef MPU_MONTMATH_H |
|
2
|
|
|
|
|
|
|
#define MPU_MONTMATH_H |
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
#include "ptypes.h" |
|
5
|
|
|
|
|
|
|
#include "mulmod.h" |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
#if BITS_PER_WORD == 64 && HAVE_UINT64 && defined(__GNUC__) && defined(__x86_64__) |
|
8
|
|
|
|
|
|
|
#define USE_MONTMATH 1 |
|
9
|
|
|
|
|
|
|
#else |
|
10
|
|
|
|
|
|
|
#define USE_MONTMATH 0 |
|
11
|
|
|
|
|
|
|
#endif |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
#if USE_MONTMATH

/* Montgomery-form helpers.  R = 2^64 throughout; a value x is carried in
 * Montgomery form as x*R mod n.  The macros below reference the caller's
 * locals npi and mont1 by name, so callers must declare:
 *   uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
 */

/* 1 in Montgomery form: R mod n (computed as (1<<64)/n remainder) */
#define mont_get1(n) _u64div(1,n)
/* Must have npi = mont_inverse(n), mont1 = mont_get1(n) */
/* 2 in Montgomery form: mont1 + mont1 mod n */
#define mont_get2(n) addmod(mont1,mont1,n)
/* Convert a into Montgomery form: a*R mod n */
#define mont_geta(a,n) mulmod(a,mont1,n)
/* Multiply / square / exponentiate, operands and results in Montgomery form */
#define mont_mulmod(a,b,n) _mulredc(a,b,n,npi)
#define mont_sqrmod(a,n) _mulredc(a,a,n,npi)
#define mont_powmod(a,k,n) _powredc(a,k,mont1,n,npi)
/* Convert back out of Montgomery form: a/R mod n */
#define mont_recover(a,n) mont_mulmod(a,1,n)

/* Save one branch if desired by calling directly */
#define mont_mulmod63(a,b,n) _mulredc63(a,b,n,npi)
#define mont_mulmod64(a,b,n) _mulredc64(a,b,n,npi)

/* See https://arxiv.org/pdf/1303.0328.pdf for lots of details on this.
 * The 128-entry table solution is about 20% faster */
|
30
|
16613
|
|
|
|
|
|
/* Compute -n^{-1} mod 2^64 for odd n (the npi value used by the
 * _mulredc* reductions).  Uses Newton-Hensel iteration: the seed
 * (3n)^2 is correct to 5 low-order bits and each step doubles the
 * number of correct bits, so four steps exceed 64 bits. */
static INLINE uint64_t mont_inverse(const uint64_t n) {
  uint64_t inv = (3*n) ^ 2;               /* seed: good to 5 bits */
  int step;
  for (step = 0; step < 4; step++)
    inv *= (uint64_t)2 - n * inv;         /* Newton step: 5 -> 10 -> 20 -> 40 -> 80 bits */
  return (uint64_t)0 - inv;               /* negate: REDC wants -1/n mod R */
}
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
/* MULREDC asm from Ben Buhrow */
/* Montgomery multiply-and-reduce for odd n < 2^63: returns a*b/R mod n
 * with R = 2^64, given npi = mont_inverse(n).  Because n < 2^63, the
 * 128-bit sum x + m*n cannot carry out of the top word, which lets this
 * version skip the extra correction _mulredc64 needs. */
static INLINE uint64_t _mulredc63(uint64_t a, uint64_t b, uint64_t n, uint64_t npi) {
  __asm__ ("mulq %2 \n\t"            /* rdx:rax = x = a*b                     */
           "movq %%rax, %%r10 \n\t"  /* r10 = low word of x                   */
           "movq %%rdx, %%r11 \n\t"  /* r11 = high word of x                  */
           "mulq %3 \n\t"            /* rax = m = low(x)*npi mod R            */
           "mulq %4 \n\t"            /* rdx:rax = m*n                         */
           "addq %%r10, %%rax \n\t"  /* x + m*n, low half (sums to 0 mod R)   */
           "adcq %%r11, %%rdx \n\t"  /* rdx = (x + m*n)/R; no carry: n < 2^63 */
           "xorq %%rax, %%rax \n\t"
           "subq %4, %%rdx \n\t"     /* conditional final reduction: rdx -= n */
           "cmovc %4, %%rax \n\t"    /* if that borrowed, stage n to add back */
           "addq %%rdx, %%rax \n\t"
           : "=a"(a)
           : "0"(a), "r"(b), "r"(npi), "r"(n)
           : "rdx", "r10", "r11", "cc");
  return a;
}
|
57
|
1755
|
|
|
|
|
|
/* Montgomery multiply-and-reduce for any odd 64-bit n, including
 * n >= 2^63 where x + m*n can carry out of the 128-bit sum.  When that
 * carry occurs, the implicit +2^64 already absorbs the subtraction of n,
 * so the borrow fix-up must be suppressed (r12 holds n only when no
 * carry occurred). */
static INLINE uint64_t _mulredc64(uint64_t a, uint64_t b, uint64_t n, uint64_t npi) {
  __asm__ ("mulq %1 \n\t"             /* rdx:rax = x = a*b                      */
           "movq %%rax, %%r10 \n\t"   /* r10 = low word of x                    */
           "movq %%rdx, %%r11 \n\t"   /* r11 = high word of x                   */
           "movq $0, %%r12 \n\t"      /* r12 = 0 (default fix-up value)         */
           "mulq %2 \n\t"             /* rax = m = low(x)*npi mod R             */
           "mulq %3 \n\t"             /* rdx:rax = m*n                          */
           "addq %%r10, %%rax \n\t"   /* x + m*n, low half (sums to 0 mod R)    */
           "adcq %%r11, %%rdx \n\t"   /* rdx = high half; CF = 128-bit carry    */
           "cmovae %3, %%r12 \n\t"    /* no carry -> r12 = n (borrow fix-up ok) */
           "xorq %%rax, %%rax \n\t"
           "subq %3, %%rdx \n\t"      /* conditional final reduction: rdx -= n  */
           "cmovc %%r12, %%rax \n\t"  /* borrowed: add n back only if no carry  */
           "addq %%rdx, %%rax \n\t"
           : "+&a"(a)
           : "r"(b), "r"(npi), "r"(n)
           : "rdx", "r10", "r11", "r12", "cc");
  return a;
}
|
76
|
|
|
|
|
|
|
#define _mulredc(a,b,n,npi) ((n & 0x8000000000000000ULL) ? _mulredc64(a,b,n,npi) : _mulredc63(a,b,n,npi)) |
|
77
|
|
|
|
|
|
|
|
|
78
|
3663
|
|
|
|
|
|
/* Right-to-left binary exponentiation entirely in Montgomery form:
 * returns a^k (Montgomery) mod n, where 'one' is mont_get1(n) and
 * npi = mont_inverse(n).  Returns 'one' when k == 0. */
static INLINE UV _powredc(uint64_t a, uint64_t k, uint64_t one, uint64_t n, uint64_t npi) {
  uint64_t result = one;
  for (;;) {
    if (k & 1)
      result = mont_mulmod(result, a, n);   /* multiply in this bit */
    k >>= 1;
    if (!k)
      break;                                /* skip the final (useless) squaring */
    a = mont_sqrmod(a, n);
  }
  return result;
}
|
87
|
|
|
|
|
|
|
|
|
88
|
16613
|
|
|
|
|
|
/* Returns (c * 2^64) mod n using the 128/64-bit divide: divq divides
 * rdx:rax (here c:0) by n, leaving the remainder in rdx.  With c = 1
 * this yields R mod n, i.e. 1 in Montgomery form (see mont_get1).
 * NOTE(review): requires c < n, otherwise the quotient overflows and
 * divq raises #DE — callers here only pass c = 1. */
static INLINE uint64_t _u64div(uint64_t c, uint64_t n) {
  __asm__("divq %4"
      : "=a"(c), "=d"(n)        /* quotient -> c, remainder -> n */
      : "1"(c), "0"(0), "r"(n));
  return n;                     /* the remainder */
}
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
#endif /* use_montmath */ |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
#if 0 |
|
98
|
|
|
|
|
|
|
/* AArch64 */ |
|
99
|
|
|
|
|
|
|
/* https://www.pure.ed.ac.uk/ws/portalfiles/portal/412503872/Concurrency_and_Computation_-_2023_-_Jesus_-_Vectorizing_and_distributing_number_theoretic_transform_to_count_Goldbach.pdf */ |
|
100
|
|
|
|
|
|
|
/* https://era.ed.ac.uk/server/api/core/bitstreams/ed9176f7-d8ed-4af9-aa20-3a82c5f8e353/content */ |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
/* 64x64 -> 128-bit multiply for AArch64: sets ph to the high 64 bits of
 * m0*m1 (via the umulh instruction) and pl to the low 64 bits (plain
 * multiply).  Arguments are captured once into locals, so side-effecting
 * expressions are safe. */
#define umul128(ph, pl, m0, m1) do { \
    uint64_t __m0 = (m0), __m1 = (m1); \
    __asm__ ("umulh\t%0, %1, %2" \
      : "=r" (ph) \
      : "r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while(0)
|
109
|
|
|
|
|
|
|
/* Montgomery multiply-and-reduce (AArch64 sketch): returns a*b/R mod n
 * with R = 2^64, given npi = mont_inverse(n).
 * Fixes vs the original sketch: the 'm =' statement was missing its
 * semicolon, and the final reduction compared against an undeclared
 * 'N' instead of 'n' — as written this block would not compile if the
 * surrounding #if 0 were ever enabled. */
static INLINE uint64_t montmul(uint64_t a, uint64_t b, uint64_t n, uint64_t npi) {
  uint64_t m, xh, xl, yh, yl, z;
  umul128(xh, xl, a, b);            /* x = a * b */
  m = (uint64_t)(xl * npi);         /* m = (x*N') mod R */
  umul128(yh, yl, m, n);            /* y = m * N */
  z = xh+yh+((xl+yl) < xl);         /* z = (x+y)/R; low words cancel, keep their carry */
  return z >= n ? z-n : z;          /* final conditional subtraction into [0,n) */
}
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
/* |
|
119
|
|
|
|
|
|
|
Q = R^2 mod N |
|
120
|
|
|
|
|
|
|
to_mont(x) = xR mod N = montmul(x,Q) |
|
121
|
|
|
|
|
|
|
from_mont(X) = X/R mod N = montmul(X,1) |
|
122
|
|
|
|
|
|
|
*/ |
|
123
|
|
|
|
|
|
|
#endif |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
#endif |