File Coverage

util.c

Criterion	Covered	Total	%
statement	1418	1644	86.2
branch	1398	1968	71.0
condition			n/a
subroutine			n/a
pod			n/a
total	2816	3612	77.9

line	stmt	bran	code
1			#include
2			#include
3			#include
4			#include
5			#include
6
7			/* Use long double to get a little more precision when we're calculating the
8			* math functions -- especially those calculated with a series. Long double
9			* is defined in C89 (ISO C). Note that 'long double' on many platforms is
10			* identical to 'double so it may buy us nothing. But it's worth trying.
11			*
12			* While the type was in C89, math functions using it are in C99. Some
13			* systems didn't really get it right (e.g. NetBSD which left out some
14			* functions for 13 years).
15			*/
16			#include
17			#if _MSC_VER \|\| defined(__IBMC__) \| defined(__IBMCPP__) \|\| (defined(__STDC_VERSION__) && __STDC_VERSION >= 199901L)
18			/* math.h should give us these as functions or macros.
19			*
20			* extern long double fabsl(long double);
21			* extern long double floorl(long double);
22			* extern long double ceill(long double);
23			* extern long double sqrtl(long double);
24			* extern long double powl(long double, long double);
25			* extern long double expl(long double);
26			* extern long double logl(long double);
27			*/
28			#else
29			#define fabsl(x) (long double) fabs( (double) (x) )
30			#define floorl(x) (long double) floor( (double) (x) )
31			#define ceill(x) (long double) ceil( (double) (x) )
32			#define sqrtl(x) (long double) sqrt( (double) (x) )
33			#define powl(x, y) (long double) pow( (double) (x), (double) (y) )
34			#define expl(x) (long double) exp( (double) (x) )
35			#define logl(x) (long double) log( (double) (x) )
36			#endif
37
38			#ifdef LDBL_INFINITY
39			#undef INFINITY
40			#define INFINITY LDBL_INFINITY
41			#elif !defined(INFINITY)
42			#define INFINITY (DBL_MAX + DBL_MAX)
43			#endif
44			#ifndef LDBL_EPSILON
45			#define LDBL_EPSILON 1e-16
46			#endif
47			#ifndef LDBL_MAX
48			#define LDBL_MAX DBL_MAX
49			#endif
50
51			#include "ptypes.h"
52			#define FUNC_isqrt 1
53			#define FUNC_icbrt 1
54			#define FUNC_lcm_ui 1
55			#define FUNC_ctz 1
56			#define FUNC_log2floor 1
57			#define FUNC_is_perfect_square
58			#define FUNC_is_perfect_cube
59			#define FUNC_is_perfect_fifth
60			#define FUNC_is_perfect_seventh
61			#define FUNC_next_prime_in_sieve 1
62			#define FUNC_prev_prime_in_sieve 1
63			#define FUNC_ipow 1
64			#include "util.h"
65			#include "sieve.h"
66			#include "primality.h"
67			#include "cache.h"
68			#include "lmo.h"
69			#include "factor.h"
70			#include "mulmod.h"
71			#include "constants.h"
72			#include "montmath.h"
73			#include "csprng.h"
74			#include "keyval.h"
75
76			#define KAHAN_INIT(s) \
77			LNV s ## _y, s ## _t; \
78			LNV s ## _c = 0.0; \
79			LNV s = 0.0;
80
81			#define KAHAN_SUM(s, term) \
82			do { \
83			s ## _y = (term) - s ## _c; \
84			s ## _t = s + s ## _y; \
85			s ## _c = (s ## _t - s) - s ## _y; \
86			s = s ## _t; \
87			} while (0)
88
89	57866		int _numcmp(const void a, const void b) {
90	57866		const UV x = a, y = b;
91	57866	100	return (x > y) ? 1 : (x < y) ? -1 : 0;
		50
92			}
93
94			static int _verbose = 0;
95	8		void _XS_set_verbose(int v) { _verbose = v; }
96	34524		int _XS_get_verbose(void) { return _verbose; }
97
98			static int _call_gmp = 0;
99	73		void _XS_set_callgmp(int v) { _call_gmp = v; }
100	19596		int _XS_get_callgmp(void) { return _call_gmp; }
101
102			static int _secure = 0;
103	0		void _XS_set_secure(void) { _secure = 1; }
104	22		int _XS_get_secure(void) { return _secure; }
105
106			/* We'll use this little static sieve to quickly answer small values of
107			* is_prime, next_prime, prev_prime, prime_count
108			* for non-threaded Perl it's basically the same as getting the primary
109			* cache. It guarantees we'll have an answer with no waiting on any version.
110			*/
111			static const unsigned char prime_sieve30[] =
112			{0x01,0x20,0x10,0x81,0x49,0x24,0xc2,0x06,0x2a,0xb0,0xe1,0x0c,0x15,0x59,0x12,
113			0x61,0x19,0xf3,0x2c,0x2c,0xc4,0x22,0xa6,0x5a,0x95,0x98,0x6d,0x42,0x87,0xe1,
114			0x59,0xa9,0xa9,0x1c,0x52,0xd2,0x21,0xd5,0xb3,0xaa,0x26,0x5c,0x0f,0x60,0xfc,
115			0xab,0x5e,0x07,0xd1,0x02,0xbb,0x16,0x99,0x09,0xec,0xc5,0x47,0xb3,0xd4,0xc5,
116			0xba,0xee,0x40,0xab,0x73,0x3e,0x85,0x4c,0x37,0x43,0x73,0xb0,0xde,0xa7,0x8e,
117			0x8e,0x64,0x3e,0xe8,0x10,0xab,0x69,0xe5,0xf7,0x1a,0x7c,0x73,0xb9,0x8d,0x04,
118			0x51,0x9a,0x6d,0x70,0xa7,0x78,0x2d,0x6d,0x27,0x7e,0x9a,0xd9,0x1c,0x5f,0xee,
119			0xc7,0x38,0xd9,0xc3,0x7e,0x14,0x66,0x72,0xae,0x77,0xc1,0xdb,0x0c,0xcc,0xb2,
120			0xa5,0x74,0xe3,0x58,0xd5,0x4b,0xa7,0xb3,0xb1,0xd9,0x09,0xe6,0x7d,0x23,0x7c,
121			0x3c,0xd3,0x0e,0xc7,0xfd,0x4a,0x32,0x32,0xfd,0x4d,0xb5,0x6b,0xf3,0xa8,0xb3,
122			0x85,0xcf,0xbc,0xf4,0x0e,0x34,0xbb,0x93,0xdb,0x07,0xe6,0xfe,0x6a,0x57,0xa3,
123			0x8c,0x15,0x72,0xdb,0x69,0xd4,0xaf,0x59,0xdd,0xe1,0x3b,0x2e,0xb7,0xf9,0x2b,
124			0xc5,0xd0,0x8b,0x63,0xf8,0x95,0xfa,0x77,0x40,0x97,0xea,0xd1,0x9f,0xaa,0x1c,
125			0x48,0xae,0x67,0xf7,0xeb,0x79,0xa5,0x55,0xba,0xb2,0xb6,0x8f,0xd8,0x2d,0x6c,
126			0x2a,0x35,0x54,0xfd,0x7c,0x9e,0xfa,0xdb,0x31,0x78,0xdd,0x3d,0x56,0x52,0xe7,
127			0x73,0xb2,0x87,0x2e,0x76,0xe9,0x4f,0xa8,0x38,0x9d,0x5d,0x3f,0xcb,0xdb,0xad,
128			0x51,0xa5,0xbf,0xcd,0x72,0xde,0xf7,0xbc,0xcb,0x49,0x2d,0x49,0x26,0xe6,0x1e,
129			0x9f,0x98,0xe5,0xc6,0x9f,0x2f,0xbb,0x85,0x6b,0x65,0xf6,0x77,0x7c,0x57,0x8b,
130			0xaa,0xef,0xd8,0x5e,0xa2,0x97,0xe1,0xdc,0x37,0xcd,0x1f,0xe6,0xfc,0xbb,0x8c,
131			0xb7,0x4e,0xc7,0x3c,0x19,0xd5,0xa8,0x9e,0x67,0x4a,0xe3,0xf5,0x97,0x3a,0x7e,
132			0x70,0x53,0xfd,0xd6,0xe5,0xb8,0x1c,0x6b,0xee,0xb1,0x9b,0xd1,0xeb,0x34,0xc2,
133			0x23,0xeb,0x3a,0xf9,0xef,0x16,0xd6,0x4e,0x7d,0x16,0xcf,0xb8,0x1c,0xcb,0xe6,
134			0x3c,0xda,0xf5,0xcf};
135			#define NPRIME_SIEVE30 (sizeof(prime_sieve30)/sizeof(prime_sieve30[0]))
136
137			static const unsigned short primes_tiny[] =
138			{0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
139			101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,
140			193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,
141			293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,
142			409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503};
143			#define NPRIMES_TINY (sizeof(primes_tiny)/sizeof(primes_tiny[0]))
144
145			/* Return of 2 if n is prime, 0 if not. Do it fast. */
146	444016		int is_prime(UV n)
147			{
148	444016	100	if (n <= 10)
149	12983	100	return (n == 2 \|\| n == 3 \|\| n == 5 \|\| n == 7) ? 2 : 0;
		100
		100
		100
150
151	431033	100	if (n < UVCONST(200000000)) {
152	429111		UV d = n/30;
153	429111		UV m = n - d*30;
154	429111		unsigned char mtab = masktab30[m]; /* Bitmask in mod30 wheel */
155			const unsigned char* sieve;
156
157			/* Return 0 if a multiple of 2, 3, or 5 */
158	429111	100	if (mtab == 0)
159	417231		return 0;
160
161			/* Check static tiny sieve */
162	188552	100	if (d < NPRIME_SIEVE30)
163	18329	100	return (prime_sieve30[d] & mtab) ? 0 : 2;
164
165	170223	100	if (!(n%7) \|\| !(n%11) \|\| !(n%13)) return 0;
		100
		100
166
167			/* Check primary cache */
168	117147	100	if (n <= get_prime_cache(0,0)) {
169	105267		int isprime = -1;
170	105267	50	if (n <= get_prime_cache(0, &sieve))
171	105267	100	isprime = 2*((sieve[d] & mtab) == 0);
172			release_prime_cache(sieve);
173	105267	50	if (isprime >= 0)
174	117147		return isprime;
175			}
176			}
177	13802		return is_prob_prime(n);
178			}
179
180
181	25132		UV next_prime(UV n)
182			{
183			UV m, next;
184
185	25132	100	if (n < 30*NPRIME_SIEVE30) {
186	18862		next = next_prime_in_sieve(prime_sieve30, n, 30*NPRIME_SIEVE30);
187	18862	100	if (next != 0) return next;
188			}
189
190	6271	50	if (n >= MPU_MAX_PRIME) return 0; /* Overflow */
191
192	6271	100	if (n < get_prime_cache(0,0)) {
193			const unsigned char* sieve;
194	4068		UV sieve_size = get_prime_cache(0, &sieve);
195	4068	50	next = (n < sieve_size) ? next_prime_in_sieve(sieve, n, sieve_size) : 0;
196			release_prime_cache(sieve);
197	4068	50	if (next != 0) return next;
198			}
199
200	2203		m = n % 30;
201			do { /* Move forward one. */
202	9727		n += wheeladvance30[m];
203	9727		m = nextwheel30[m];
204	9727	100	} while (!is_prob_prime(n));
205	2203		return n;
206			}
207
208
209	28359		UV prev_prime(UV n)
210			{
211			UV m, prev;
212
213	28359	100	if (n < 30*NPRIME_SIEVE30)
214	20172		return prev_prime_in_sieve(prime_sieve30, n);
215
216	8187	100	if (n < get_prime_cache(0,0)) {
217			const unsigned char* sieve;
218	4023		UV sieve_size = get_prime_cache(0, &sieve);
219	4023	50	prev = (n < sieve_size) ? prev_prime_in_sieve(sieve, n) : 0;
220			release_prime_cache(sieve);
221	4023	50	if (prev != 0) return prev;
222			}
223
224	4164		m = n % 30;
225			do { /* Move back one. */
226	14269		n -= wheelretreat30[m];
227	14269		m = prevwheel30[m];
228	14269	100	} while (!is_prob_prime(n));
229	4164		return n;
230			}
231
232			/******************************************************************************/
233			/* PRINTING */
234			/******************************************************************************/
235
236	0		static int my_sprint(char* ptr, UV val) {
237			int nchars;
238			UV t;
239	0		char *s = ptr;
240			do {
241	0		t = val / 10; s++ = (char) ('0' + val - 10 t);
242	0	0	} while ((val = t));
243	0		nchars = s - ptr + 1; *s = '\n';
244	0	0	while (--s > ptr) { char c = s; s = ptr; ptr++ = c; }
245	0		return nchars;
246			}
247	0		static char* write_buf(int fd, char* buf, char* bend) {
248	0		int res = (int) write(fd, buf, bend-buf);
249	0	0	if (res == -1) croak("print_primes write error");
250	0		return buf;
251			}
252	0		void print_primes(UV low, UV high, int fd) {
253			char buf[8000+25];
254	0		char* bend = buf;
255	0	0	if ((low <= 2) && (high >= 2)) bend += my_sprint(bend,2);
		0
256	0	0	if ((low <= 3) && (high >= 3)) bend += my_sprint(bend,3);
		0
257	0	0	if ((low <= 5) && (high >= 5)) bend += my_sprint(bend,5);
		0
258	0	0	if (low < 7) low = 7;
259
260	0	0	if (low <= high) {
261			unsigned char* segment;
262			UV seg_base, seg_low, seg_high;
263	0		void* ctx = start_segment_primes(low, high, &segment);
264	0	0	while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
265	0	0	START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high )
		0
		0
		0
		0
266	0		bend += my_sprint(bend,p);
267	0	0	if (bend-buf > 8000) { bend = write_buf(fd, buf, bend); }
268	0		END_DO_FOR_EACH_SIEVE_PRIME
269			}
270	0		end_segment_primes(ctx);
271			}
272	0	0	if (bend > buf) { bend = write_buf(fd, buf, bend); }
273	0		}
274
275			/******************************************************************************/
276			/* TOTIENT, MOEBIUS, MERTENS */
277			/******************************************************************************/
278
279			/* Return a char array with lo-hi+1 elements. mu[k-lo] = µ(k) for k = lo .. hi.
280			* It is the callers responsibility to call Safefree on the result. */
281			#define PGTLO(ip,p,lo) ((ip)>=(lo)) ? (ip) : ((p)*((lo)/(p)) + (((lo)%(p))?(p):0))
282	69		signed char* range_moebius(UV lo, UV hi)
283			{
284			signed char* mu;
285	69		UV i, sqrtn = isqrt(hi), count = hi-lo+1;
286
287			/* Kuznetsov indicates that the Deléglise & Rivat (1996) method can be
288			* modified to work on logs, which allows us to operate with no
289			* intermediate memory at all. Same time as the D&R method, less memory. */
290			unsigned char logp;
291			UV nextlog, nextlogi;
292
293	69		Newz(0, mu, count, signed char);
294	69	100	if (sqrtn*sqrtn != hi && sqrtn < (UVCONST(1)<<(BITS_PER_WORD/2))-1) sqrtn++;
		50
295
296			/* For small ranges, do it by hand */
297	69	100	if (hi < 100 \|\| count <= 10 \|\| (hi > (1UL<<25) && count < icbrt(hi)/4)) {
		50
		50
		0
298	706	100	for (i = 0; i < count; i++)
299	671		mu[i] = moebius(lo+i);
300	35		return mu;
301			}
302
303	34		logp = 1; nextlog = 3; /* 2+1 */
304	438	50	START_DO_FOR_EACH_PRIME(2, sqrtn) {
		100
		100
		100
		100
		50
		50
		50
		50
		100
305	404		UV p2 = p*p;
306	404	100	if (p > nextlog) {
307	77		logp += 2; /* logp is 1 \| ceil(log(p)/log(2)) */
308	77		nextlog = ((nextlog-1)*4)+1;
309			}
310	146749	50	for (i = PGTLO(p, p, lo); i >= lo && i <= hi; i += p)
		0
		50
		100
311	146345		mu[i-lo] += logp;
312	37943	50	for (i = PGTLO(p2, p2, lo); i >= lo && i <= hi; i += p2)
		0
		50
		100
313	37539		mu[i-lo] = 0x80;
314	404		} END_DO_FOR_EACH_PRIME
315
316	34	100	logp = log2floor(lo);
317	34		nextlogi = (UVCONST(2) << logp) - lo;
318	84025	100	for (i = 0; i < count; i++) {
319	83991		unsigned char a = mu[i];
320	83991	100	if (i >= nextlogi) nextlogi = (UVCONST(2) << ++logp) - lo;
321	83991	100	if (a & 0x80) { a = 0; }
322	51111	100	else if (a >= logp) { a = 1 - 2*(a&1); }
323	38965		else { a = -1 + 2*(a&1); }
324	83991		mu[i] = a;
325			}
326	34	100	if (lo == 0) mu[0] = 0;
327
328	34		return mu;
329			}
330
331	8		UV* range_totient(UV lo, UV hi) {
332			UV* totients;
333	8		UV i, seg_base, seg_low, seg_high, count = hi-lo+1;
334			unsigned char* segment;
335			void* ctx;
336
337	8	50	if (hi < lo) croak("range_totient error hi %"UVuf" < lo %"UVuf"\n", hi, lo);
338	8	50	New(0, totients, count, UV);
339
340			/* Do via factoring if very small or if we have a small range */
341	8	100	if (hi < 100 \|\| count <= 10 \|\| hi/count > 1000) {
		50
		50
342	41	100	for (i = 0; i < count; i++)
343	35		totients[i] = totient(lo+i);
344	6		return totients;
345			}
346
347	2	50	if (hi == UV_MAX) {
348	0		totients[--count] = totient(UV_MAX);
349	0		hi--;
350			}
351
352			/* If doing a full sieve, do it monolithic. Faster. */
353	2	100	if (lo == 0) {
354			UV* prime;
355	1		double loghi = log(hi);
356	1	50	UV max_index = (hi < 67) ? 18
		50
357	1		: (hi < 355991) ? 15+(hi/(loghi-1.09))
358	0		: (hi/loghi) * (1.0+1.0/loghi+2.51/(loghi*loghi));
359	1		UV j, index, nprimes = 0;
360
361	1	50	New(0, prime, max_index, UV); /* could use prime_count_upper(hi) */
362	1		memset(totients, 0, count * sizeof(UV));
363	120	100	for (i = 2; i <= hi/2; i++) {
364	119		index = 2*i;
365	119	100	if ( !(i&1) ) {
366	60	100	if (i == 2) { totients[2] = 1; prime[nprimes++] = 2; }
367	60		totients[index] = totients[i]*2;
368			} else {
369	59	100	if (totients[i] == 0) {
370	29		totients[i] = i-1;
371	29		prime[nprimes++] = i;
372			}
373	167	50	for (j=0; j < nprimes && index <= hi; index = i*prime[++j]) {
		100
374	127	100	if (i % prime[j] == 0) {
375	19		totients[index] = totients[i]*prime[j];
376	19		break;
377			} else {
378	108		totients[index] = totients[i]*(prime[j]-1);
379			}
380			}
381			}
382			}
383	1		Safefree(prime);
384			/* All totient values have been filled in except the primes. Mark them. */
385	61	100	for (i = ((hi/2) + 1) \| 1; i <= hi; i += 2)
386	60	100	if (totients[i] == 0)
387	22		totients[i] = i-1;
388	1		totients[1] = 1;
389	1		totients[0] = 0;
390	1		return totients;
391			}
392
393	26	100	for (i = 0; i < count; i++) {
394	25		UV v = lo+i, nv = v;
395	25	100	if (v % 2 == 0) nv -= nv/2;
396	25	100	if (v % 3 == 0) nv -= nv/3;
397	25	100	if (v % 5 == 0) nv -= nv/5;
398	25		totients[i] = nv;
399			}
400
401	1		ctx = start_segment_primes(7, hi/2, &segment);
402	2	100	while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
403	137	50	START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
		100
		50
		100
		100
404	163	100	for (i = PGTLO(2*p,p,lo); i >= lo && i <= hi; i += p)
		100
		50
		100
405	31		totients[i-lo] -= totients[i-lo]/p;
406	4		} END_DO_FOR_EACH_SIEVE_PRIME
407			}
408	1		end_segment_primes(ctx);
409
410			/* Fill in all primes */
411	14	100	for (i = (lo \| 1) - lo; i < count; i += 2)
412	13	100	if (totients[i] == i+lo)
413	2		totients[i]--;
414	1	50	if (lo <= 1) totients[1-lo] = 1;
415
416	8		return totients;
417			}
418
419	45		IV mertens(UV n) {
420			/* See Deléglise and Rivat (1996) for O(n^2/3 log(log(n))^1/3) algorithm.
421			* This implementation uses their lemma 2.1 directly, so is ~ O(n).
422			* In serial it is quite a bit faster than segmented summation of mu
423			* ranges, though the latter seems to be a favored method for GPUs.
424			*/
425			UV u, j, m, nmk, maxmu;
426			signed char* mu;
427			short* M; /* 16 bits is enough range for all 32-bit M => 64-bit n */
428			IV sum;
429
430	45	100	if (n <= 1) return n;
431	44		u = isqrt(n);
432	44		maxmu = (n/(u+1)); /* maxmu lets us handle u < sqrt(n) */
433	44	100	if (maxmu < u) maxmu = u;
434	44		mu = range_moebius(0, maxmu);
435	44	50	New(0, M, maxmu+1, short); /* Works up to maxmu < 7613644886 */
436	44		M[0] = 0;
437	56574	100	for (j = 1; j <= maxmu; j++)
438	56530		M[j] = M[j-1] + mu[j];
439	44		sum = M[u];
440	56574	100	for (m = 1; m <= u; m++) {
441	56530	100	if (mu[m] != 0) {
442	34416		IV inner_sum = 0;
443	34416		UV lower = (u/m) + 1;
444	34416		UV last_nmk = n/(m*lower);
445	34416		UV this_k = 0;
446	34416		UV next_k = n/(m*1);
447	34416		UV nmkm = m * 2;
448	228957633	100	for (nmk = 1; nmk <= last_nmk; nmk++, nmkm += m) {
449	228923217		this_k = next_k;
450	228923217		next_k = n/nmkm;
451	228923217		inner_sum += M[nmk] * (this_k - next_k);
452			}
453	34416	100	sum += (mu[m] > 0) ? -inner_sum : inner_sum;
454			}
455			}
456	44		Safefree(M);
457	44		Safefree(mu);
458	44		return sum;
459			}
460
461			/******************************************************************************/
462			/* POWERS and ROOTS */
463			/******************************************************************************/
464
465			/* There are at least 4 ways to do this, plus hybrids.
466			* 1) use a table. Great for 32-bit, too big for 64-bit.
467			* 2) Use pow() to check. Relatively slow and FP is always dangerous.
468			* 3) factor or trial factor. Slow for 64-bit.
469			* 4) Dietzfelbinger algorithm 2.3.5. Quite slow.
470			* This currently uses a hybrid of 1 and 2.
471			*/
472	15369		int powerof(UV n) {
473			UV t;
474	15369	100	if ((n <= 3) \|\| (n == UV_MAX)) return 1;
		50
475	15297	100	if ((n & (n-1)) == 0) return ctz(n); /* powers of 2 */
		50
476	15282	100	if (is_perfect_square(n)) return 2 * powerof(isqrt(n));
477	15039	100	if (is_perfect_cube(n)) return 3 * powerof(icbrt(n));
478
479			/* Simple rejection filter for non-powers of 5-37. Rejects 47.85%. */
480	14989	100	t = n & 511; if ((t77855451) & (t4598053) & 862) return 1;
481
482	6673	100	if (is_perfect_fifth(n)) return 5 * powerof(rootof(n,5));
483	6658	100	if (is_perfect_seventh(n)) return 7 * powerof(rootof(n,7));
484
485	6651	100	if (n > 177146 && n <= UVCONST(1977326743)) {
		100
486	1858		switch (n) { /* Check for powers of 11, 13, 17, 19 within 32 bits */
487	4		case 177147: case 48828125: case 362797056: case 1977326743: return 11;
488	0		case 1594323: case 1220703125: return 13;
489	0		case 129140163: return 17;
490	0		case 1162261467: return 19;
491	1854		default: break;
492			}
493			}
494			#if BITS_PER_WORD == 64
495	6647	100	if (n >= UVCONST(8589934592)) {
496			/* The Bloom filters reject about 90% of inputs each, about 99% for two.
497			* Bach/Sorenson type sieves do about as well, but are much slower due
498			* to using a powmod. */
499	163	100	if ( (t = n %121, !((t19706187) & (t61524433) & 876897796)) &&
		50
500	2		(t = n % 89, !((t28913398) & (t69888189) & 2705511937U)) ) {
501			/* (t = n % 67, !((t117621317) & (t48719734) & 537242019)) ) { */
502	0		UV root = rootof(n,11);
503	0	0	if (n == ipow(root,11)) return 11;
504			}
505	163	100	if ( (t = n %131, !((t1545928325) & (t1355660813) & 2771533888U)) &&
		100
506	26		(t = n % 79, !((t48902028) & (t48589927) & 404082779)) ) {
507			/* (t = n % 53, !((t79918293) & (t236846524) & 694943819)) ) { */
508	2		UV root = rootof(n,13);
509	2	50	if (n == ipow(root,13)) return 13;
510			}
511	161		switch (n) {
512			case UVCONST(762939453125):
513			case UVCONST(16926659444736):
514			case UVCONST(232630513987207):
515			case UVCONST(100000000000000000):
516			case UVCONST(505447028499293771):
517			case UVCONST(2218611106740436992):
518	5		case UVCONST(8650415919381337933): return 17;
519			case UVCONST(19073486328125):
520			case UVCONST(609359740010496):
521			case UVCONST(11398895185373143):
522	4		case UVCONST(10000000000000000000): return 19;
523			case UVCONST(94143178827):
524			case UVCONST(11920928955078125):
525	1		case UVCONST(789730223053602816): return 23;
526	0		case UVCONST(68630377364883): return 29;
527	0		case UVCONST(617673396283947): return 31;
528	0		case UVCONST(450283905890997363): return 37;
529	151		default: break;
530			}
531			}
532			#endif
533	6635		return 1;
534			}
535	10448		int is_power(UV n, UV a)
536			{
537			int ret;
538	10448	100	if (a > 0) {
539	22	50	if (a == 1 \|\| n <= 1) return 1;
		100
540	19	100	if ((a % 2) == 0)
541	17	100	return !is_perfect_square(n) ? 0 : (a == 2) ? 1 : is_power(isqrt(n),a>>1);
		50
542	2	50	if ((a % 3) == 0)
543	2	50	return !is_perfect_cube(n) ? 0 : (a == 3) ? 1 : is_power(icbrt(n),a/3);
		50
544	0	0	if ((a % 5) == 0)
545	0	0	return !is_perfect_fifth(n) ? 0 : (a == 5) ? 1 :is_power(rootof(n,5),a/5);
		0
546			}
547	10426		ret = powerof(n);
548	10426	50	if (a != 0) return !(ret % a); /* Is the max power divisible by a? */
549	10426	100	return (ret == 1) ? 0 : ret;
550			}
551
552			#if BITS_PER_WORD == 64
553			#define ROOT_MAX_3 41
554			static const uint32_t root_max[ROOT_MAX_3] = {0,0,0,2642245,65535,7131,1625,565,255,138,84,56,40,30,23,19,15,13,11,10,9,8,7,6,6,5,5,5,4,4,4,4,3,3,3,3,3,3,3,3,3};
555			#else
556			#define ROOT_MAX_3 21
557			static const uint32_t root_max[ROOT_MAX_3] = {0,0,0,1625,255,84,40,23,15,11,9,7,6,5,4,4,3,3,3,3,3};
558			#endif
559
560	532		UV rootof(UV n, UV k) {
561			UV lo, hi, max;
562	532	50	if (k == 0) return 0;
563	532	100	if (k == 1) return n;
564	486	100	if (k == 2) return isqrt(n);
565	364	100	if (k == 3) return icbrt(n);
566
567			/* Bracket between powers of 2, but never exceed max power so ipow works */
568	352	100	max = 1 + ((k >= ROOT_MAX_3) ? 2 : root_max[k]);
569	352	50	lo = UVCONST(1) << (log2floor(n)/k);
570	352		hi = ((lo2) < max) ? lo2 : max;
571
572			/* Binary search */
573	1225	100	while (lo < hi) {
574	873		UV mid = lo + (hi-lo)/2;
575	873	100	if (ipow(mid,k) <= n) lo = mid+1;
576	394		else hi = mid;
577			}
578	352		return lo-1;
579			}
580
581	10274		int primepower(UV n, UV* prime)
582			{
583	10274		int power = 0;
584	10274	100	if (n < 2) return 0;
585			/* Check for small divisors */
586	10268	100	if (!(n&1)) {
587	24	100	if (n & (n-1)) return 0;
588	9		*prime = 2;
589	9	50	return ctz(n);
590			}
591	10244	100	if ((n%3) == 0) {
592			/* if (UVCONST(12157665459056928801) % n) return 0; */
593	156	100	do { n /= 3; power++; } while (n > 1 && (n%3) == 0);
		100
594	15	100	if (n != 1) return 0;
595	11		*prime = 3;
596	11		return power;
597			}
598	10229	100	if ((n%5) == 0) {
599	2573	100	do { n /= 5; power++; } while (n > 1 && (n%5) == 0);
		100
600	2007	100	if (n != 1) return 0;
601	10		*prime = 5;
602	10		return power;
603			}
604	8222	100	if ((n%7) == 0) {
605	1403	100	do { n /= 7; power++; } while (n > 1 && (n%7) == 0);
		100
606	1149	100	if (n != 1) return 0;
607	9		*prime = 7;
608	9		return power;
609			}
610	7073	100	if (is_prob_prime(n))
611	2690		{ *prime = n; return 1; }
612			/* Composite. Test for perfect power with prime root. */
613	4383		power = powerof(n);
614	4383	100	if (power == 1) power = 0;
615	4383	100	if (power) {
616	121		UV root = rootof(n, (UV)power);
617	121	100	if (is_prob_prime(root))
618	103		*prime = root;
619			else
620	18		power = 0;
621			}
622	4383		return power;
623			}
624
625	333		UV valuation(UV n, UV k)
626			{
627	333		UV v = 0;
628	333		UV kpower = k;
629	333	100	if (k < 2 \|\| n < 2) return 0;
		100
630	323	100	if (k == 2) return ctz(n);
		50
631	274	100	while ( !(n % kpower) ) {
632	46		kpower *= k;
633	46		v++;
634			}
635	228		return v;
636			}
637
638	601		UV logint(UV n, UV b)
639			{
640			/* UV e; for (e=0; n; n /= b) e++; return e-1; */
641	601		UV v, e = 0;
642	601	100	if (b == 2)
643	200	50	return log2floor(n);
644	401	50	if (n > UV_MAX/b) {
645	0		n /= b;
646	0		e = 1;
647			}
648	1541	100	for (v = b; v <= n; v *= b)
649	1140		e++;
650	401		return e;
651			}
652
653	2		UV mpu_popcount_string(const char* ptr, uint32_t len)
654			{
655	2		uint32_t count = 0, i, j, d, v, power, slen, s, sptr;
656
657	2	50	while (len > 0 && (ptr == '0' \|\| ptr == '+' \|\| *ptr == '-'))
		50
		50
		50
658	0		{ ptr++; len--; }
659
660			/* Create s as array of base 10^8 numbers */
661	2		slen = (len + 7) / 8;
662	2	50	Newz(0, s, slen, uint32_t);
663	19	100	for (i = 0; i < slen; i++) { /* Chunks of 8 digits */
664	145	100	for (j = 0, d = 0, power = 1; j < 8 && len > 0; j++, power *= 10) {
		100
665	128		v = ptr[--len] - '0';
666	128	50	if (v > 9) croak("Parameter '%s' must be a positive integer",ptr);
667	128		d += power * v;
668			}
669	17		s[slen - 1 - i] = d;
670			}
671			/* Repeatedly count and divide by 2 across s */
672	374	100	while (slen > 1) {
673	372	100	if (s[slen-1] & 1) count++;
674	372		sptr = s;
675	372	100	if (s[0] == 1) {
676	15	50	if (--slen == 0) break;
677	15		*++sptr += 100000000;
678			}
679	2293	100	for (i = 0; i < slen; i++) {
680	1921	100	if ( (i+1) < slen && sptr[i] & 1 ) sptr[i+1] += 100000000;
		100
681	1921		s[i] = sptr[i] >> 1;
682			}
683			}
684			/* For final base 10^8 number just do naive popcnt */
685	56	100	for (d = s[0]; d > 0; d >>= 1)
686	54	100	if (d & 1)
687	25		count++;
688	2		Safefree(s);
689	2		return count;
690			}
691
692
693			/* How many times does 2 divide n? */
694			#define padic2(n) ctz(n)
695			#define IS_MOD8_3OR5(x) (((x)&7)==3 \|\| ((x)&7)==5)
696
697	1165		static int kronecker_uu_sign(UV a, UV b, int s) {
698	4542	100	while (a) {
699	3377	50	int r = padic2(a);
700	3377	100	if (r) {
701	1583	100	if ((r&1) && IS_MOD8_3OR5(b)) s = -s;
		100
		100
702	1583		a >>= r;
703			}
704	3377	100	if (a & b & 2) s = -s;
705	3377		{ UV t = b % a; b = a; a = t; }
706			}
707	1165	100	return (b == 1) ? s : 0;
708			}
709
710	1156		int kronecker_uu(UV a, UV b) {
711			int r, s;
712	1156	100	if (b & 1) return kronecker_uu_sign(a, b, 1);
713	10	100	if (!(a&1)) return 0;
714	7		s = 1;
715	7	100	r = padic2(b);
716	7	50	if (r) {
717	7	100	if ((r&1) && IS_MOD8_3OR5(a)) s = -s;
		50
		0
718	7		b >>= r;
719			}
720	7		return kronecker_uu_sign(a, b, s);
721			}
722
723	49		int kronecker_su(IV a, UV b) {
724			int r, s;
725	49	100	if (a >= 0) return kronecker_uu(a, b);
726	14	100	if (b == 0) return (a == 1 \|\| a == -1) ? 1 : 0;
		50
		100
727	12		s = 1;
728	12	50	r = padic2(b);
729	12	100	if (r) {
730	2	50	if (!(a&1)) return 0;
731	2	50	if ((r&1) && IS_MOD8_3OR5(a)) s = -s;
		50
		50
732	2		b >>= r;
733			}
734	12		a %= (IV) b;
735	12	100	if (a < 0) a += b;
736	12		return kronecker_uu_sign(a, b, s);
737			}
738
739	18		int kronecker_ss(IV a, IV b) {
740	18	100	if (a >= 0 && b >= 0)
		50
741	0	0	return (b & 1) ? kronecker_uu_sign(a, b, 1) : kronecker_uu(a,b);
742	18	100	if (b >= 0)
743	7		return kronecker_su(a, b);
744	11	100	return kronecker_su(a, -b) * ((a < 0) ? -1 : 1);
745			}
746
747	2752		UV factorial(UV n) {
748	2752		UV i, r = 1;
749	2752	100	if ( (n > 12 && sizeof(UV) <= 4) \|\| (n > 20 && sizeof(UV) <= 8) ) return 0;
750	16151	100	for (i = 2; i <= n; i++)
751	13871		r *= i;
752	2280		return r;
753			}
754
755	25124		UV binomial(UV n, UV k) { /* Thanks to MJD and RosettaCode for ideas */
756	25124		UV d, g, r = 1;
757	25124	100	if (k == 0) return 1;
758	24938	100	if (k == 1) return n;
759	22904	100	if (k >= n) return (k == n);
760	21351	100	if (k > n/2) k = n-k;
761	213307	100	for (d = 1; d <= k; d++) {
762	197192	100	if (r >= UV_MAX/n) { /* Possible overflow */
763			UV nr, dr; /* reduced numerator / denominator */
764	14824		g = gcd_ui(n, d); nr = n/g; dr = d/g;
765	14824		g = gcd_ui(r, dr); r = r/g; dr = dr/g;
766	14824	100	if (r >= UV_MAX/nr) return 0; /* Unavoidable overflow */
767	9588		r *= nr;
768	9588		r /= dr;
769	9588		n--;
770			} else {
771	182368		r *= n--;
772	182368		r /= d;
773			}
774			}
775	16115		return r;
776			}
777
778	190		UV stirling3(UV n, UV m) { /* Lah numbers */
779			UV f1, f2;
780
781	190	50	if (m == n) return 1;
782	190	50	if (n == 0 \|\| m == 0 \|\| m > n) return 0;
		50
		50
783	190	100	if (m == 1) return factorial(n);
784
785	171		f1 = binomial(n, m);
786	171	50	if (f1 == 0) return 0;
787	171		f2 = binomial(n-1, m-1);
788	171	50	if (f2 == 0 \|\| f1 >= UV_MAX/f2) return 0;
		50
789	171		f1 *= f2;
790	171		f2 = factorial(n-m);
791	171	50	if (f2 == 0 \|\| f1 >= UV_MAX/f2) return 0;
		100
792	166		return f1 * f2;
793			}
794
795	1789		IV stirling2(UV n, UV m) {
796			UV f;
797	1789		IV j, k, t, s = 0;
798
799	1789	50	if (m == n) return 1;
800	1789	50	if (n == 0 \|\| m == 0 \|\| m > n) return 0;
		50
		50
801	1789	100	if (m == 1) return 1;
802
803	1549	100	if ((f = factorial(m)) == 0) return 0;
804	7661	100	for (j = 1; j <= (IV)m; j++) {
805	6586		t = binomial(m, j);
806	121380	100	for (k = 1; k <= (IV)n; k++) {
807	115103	50	if (t == 0 \|\| j >= IV_MAX/t) return 0;
		100
808	114794		t *= j;
809			}
810	6277	100	if ((m-j) & 1) t *= -1;
811	6277		s += t;
812			}
813	1075		return s/f;
814			}
815
816	192		IV stirling1(UV n, UV m) {
817	192		IV k, t, b1, b2, s2, s = 0;
818
819	192	50	if (m == n) return 1;
820	192	50	if (n == 0 \|\| m == 0 \|\| m > n) return 0;
		50
		50
821	192	100	if (m == 1) {
822	19		UV f = factorial(n-1);
823	19	50	if (f>(UV)IV_MAX) return 0;
824	19	100	return (n&1) ? ((IV)f) : -((IV)f);
825			}
826
827	942	100	for (k = 1; k <= (IV)(n-m); k++) {
828	816		b1 = binomial(k + n - 1, n - m + k);
829	816		b2 = binomial(2 * n - m, n - m - k);
830	816		s2 = stirling2(n - m + k, k);
831	816	100	if (b1 == 0 \|\| b2 == 0 \|\| s2 == 0 \|\| b1 > IV_MAX/b2) return 0;
		50
		100
		50
832	804		t = b1 * b2;
833	804	100	if (s2 > IV_MAX/t) return 0;
834	769		t *= s2;
835	769	100	s += (k & 1) ? -t : t;
836			}
837	126		return s;
838			}
839
840	686		UV totient(UV n) {
841			UV i, nfacs, totient, lastf, facs[MPU_MAX_FACTORS+1];
842	686	100	if (n <= 1) return n;
843	590		totient = 1;
844			/* phi(2m) = 2phi(m) if m even, phi(m) if m odd */
845	733	100	while ((n & 0x3) == 0) { n >>= 1; totient <<= 1; }
846	590	100	if ((n & 0x1) == 0) { n >>= 1; }
847			/* factor and calculate totient */
848	590		nfacs = factor(n, facs);
849	590		lastf = 0;
850	1213	100	for (i = 0; i < nfacs; i++) {
851	623		UV f = facs[i];
852	623	100	if (f == lastf) { totient *= f; }
853	576		else { totient *= f-1; lastf = f; }
854			}
855	686		return totient;
856			}
857
858			static const UV jordan_overflow[5] =
859			#if BITS_PER_WORD == 64
860			{UVCONST(4294967311), 2642249, 65537, 7133, 1627};
861			#else
862			{UVCONST( 65537), 1627, 257, 85, 41};
863			#endif
864	490		UV jordan_totient(UV k, UV n) {
865			UV factors[MPU_MAX_FACTORS+1];
866			int nfac, i;
867			UV totient;
868	490	50	if (k == 0 \|\| n <= 1) return (n == 1);
		100
869	479	100	if (k > 6 \|\| (k > 1 && n >= jordan_overflow[k-2])) return 0;
		100
		50
870
871	457		totient = 1;
872			/* Similar to Euler totient, shortcut even inputs */
873	662	100	while ((n & 0x3) == 0) { n >>= 1; totient *= (1<
874	457	100	if ((n & 0x1) == 0) { n >>= 1; totient *= ((1<
875	457		nfac = factor(n,factors);
876	960	100	for (i = 0; i < nfac; i++) {
877	503		UV p = factors[i];
878	503		UV pk = ipow(p,k);
879	503		totient *= (pk-1);
880	580	100	while (i+1 < nfac && p == factors[i+1]) {
		100
881	77		i++;
882	77		totient *= pk;
883			}
884			}
885	490		return totient;
886			}
887
888	270		UV carmichael_lambda(UV n) {
889			UV fac[MPU_MAX_FACTORS+1];
890			int i, nfactors;
891	270		UV lambda = 1;
892
893	270	100	if (n < 8) return totient(n);
894	262	100	if ((n & (n-1)) == 0) return n >> 2;
895
896	253	50	i = ctz(n);
897	253	100	if (i > 0) {
898	95		n >>= i;
899	95	100	lambda <<= (i>2) ? i-2 : i-1;
900			}
901	253		nfactors = factor(n, fac);
902	600	100	for (i = 0; i < nfactors; i++) {
903	347		UV p = fac[i], pk = p-1;
904	394	100	while (i+1 < nfactors && p == fac[i+1]) {
		100
905	47		i++;
906	47		pk *= p;
907			}
908	347		lambda = lcm_ui(lambda, pk);
909			}
910	270		return lambda;
911			}
912
913	20000		int is_carmichael(UV n) {
914			UV fac[MPU_MAX_FACTORS+1];
915			UV exp[MPU_MAX_FACTORS+1];
916			int i, nfactors;
917
918			/* Small or even is not a Carmichael number */
919	20000	100	if (n < 561 \|\| !(n&1)) return 0;
		100
920
921			/* Simple pre-test for square free (odds only) */
922	9720	100	if (!(n% 9) \|\| !(n%25) \|\| !(n%49) \|\| !(n%121) \|\| !(n%169))
		100
		100
		100
		100
923	1709		return 0;
924
925			/* Check Korselt's criterion for small divisors */
926	8011	100	if (!(n% 5) && ((n-1) % 4 != 0)) return 0;
		100
927	7345	100	if (!(n% 7) && ((n-1) % 6 != 0)) return 0;
		100
928	6771	100	if (!(n%11) && ((n-1) % 10 != 0)) return 0;
		100
929	6333	100	if (!(n%13) && ((n-1) % 12 != 0)) return 0;
		100
930	5984	100	if (!(n%17) && ((n-1) % 16 != 0)) return 0;
		100
931	5678	100	if (!(n%19) && ((n-1) % 18 != 0)) return 0;
		100
932	5417	100	if (!(n%23) && ((n-1) % 22 != 0)) return 0;
		100
933
934			/* Fast check without having to factor */
935	5197	50	if (n > 5000000) {
936	0	0	if (!(n%29) && ((n-1) % 28 != 0)) return 0;
		0
937	0	0	if (!(n%31) && ((n-1) % 30 != 0)) return 0;
		0
938	0	0	if (!(n%37) && ((n-1) % 36 != 0)) return 0;
		0
939	0	0	if (!(n%41) && ((n-1) % 40 != 0)) return 0;
		0
940	0	0	if (!(n%43) && ((n-1) % 42 != 0)) return 0;
		0
941	0	0	if (!is_pseudoprime(n,2)) return 0;
942			}
943
944	5197		nfactors = factor_exp(n, fac, exp);
945	5197	100	if (nfactors < 3)
946	4664		return 0;
947	1330	100	for (i = 0; i < nfactors; i++) {
948	1321	50	if (exp[i] > 1 \|\| ((n-1) % (fac[i]-1)) != 0)
		100
949	524		return 0;
950			}
951	20000		return 1;
952			}
953
954	8230		static int is_quasi_base(int nfactors, UV *fac, UV p, UV b) {
955			int i;
956	13159	100	for (i = 0; i < nfactors; i++) {
957	13015		UV d = fac[i] - b;
958	13015	50	if (d == 0 \|\| (p % d) != 0)
		100
959	8086		return 0;
960			}
961	144		return 1;
962			}
963
964			/* Returns number of bases that pass */
965	5402		UV is_quasi_carmichael(UV n) {
966			UV nbases, fac[MPU_MAX_FACTORS+1], exp[MPU_MAX_FACTORS+1];
967			UV spf, lpf, ndivisors, *divs;
968			int i, nfactors;
969
970	5402	100	if (n < 35) return 0;
971
972			/* Simple pre-test for square free */
973	5334	100	if (!(n% 4) \|\| !(n% 9) \|\| !(n%25) \|\| !(n%49) \|\| !(n%121) \|\| !(n%169))
		100
		100
		100
		100
		100
974	2041		return 0;
975
976	3293		nfactors = factor_exp(n, fac, exp);
977			/* Must be composite */
978	3293	100	if (nfactors < 2)
979	741		return 0;
980			/* Must be square free */
981	8889	100	for (i = 0; i < nfactors; i++)
982	6371	100	if (exp[i] > 1)
983	34		return 0;
984
985	2518		nbases = 0;
986	2518		spf = fac[0];
987	2518		lpf = fac[nfactors-1];
988
989			/* Algorithm from Hiroaki Yamanouchi, 2015 */
990	2518	100	if (nfactors == 2) {
991	1448		divs = _divisor_list(n / spf - 1, &ndivisors);
992	7401	50	for (i = 0; i < (int)ndivisors; i++) {
993	5953		UV d = divs[i];
994	5953		UV k = spf - d;
995	5953	100	if (d >= spf) break;
996	4505	100	if (is_quasi_base(nfactors, fac, n-k, k))
997	92		nbases++;
998			}
999			} else {
1000	1070		divs = _divisor_list(lpf * (n / lpf - 1), &ndivisors);
1001	9523	100	for (i = 0; i < (int)ndivisors; i++) {
1002	8453		UV d = divs[i];
1003	8453		UV k = lpf - d;
1004	8453	100	if (lpf > d && k >= spf) continue;
		100
1005	4795	100	if (k != 0 && is_quasi_base(nfactors, fac, n-k, k))
		100
1006	52		nbases++;
1007			}
1008			}
1009	2518		Safefree(divs);
1010	5402		return nbases;
1011			}
1012
1013	80268		int is_semiprime(UV n) {
1014			UV sp, p, n3, factors[2];
1015
1016	80268	100	if (n < 6) return (n == 4);
1017	80256	100	if (!(n&1)) return !!is_prob_prime(n>>1);
1018	40458	100	if (!(n%3)) return !!is_prob_prime(n/3);
1019	27048	100	if (!(n%5)) return !!is_prob_prime(n/5);
1020			/* 27% of random inputs left */
1021	21687		n3 = icbrt(n);
1022	87163	100	for (sp = 4; sp < 60; sp++) {
1023	87127		p = primes_tiny[sp];
1024	87127	100	if (p > n3)
1025	15103		break;
1026	72024	100	if ((n % p) == 0)
1027	6548		return !!is_prob_prime(n/p);
1028			}
1029			/* 9.8% of random inputs left */
1030	15139	100	if (is_def_prime(n)) return 0;
		100
1031	5220	100	if (p > n3) return 1; /* past this, n is a composite and larger than p^3 */
1032			/* 4-8% of random inputs left */
1033	25	50	if (factor_one(n, factors, 0, 0) != 2) return 0;
1034	80268	50	return (is_def_prime(factors[0]) && is_def_prime(factors[1]));
		50
		0
		100
		100
		50
1035			}
1036
1037	102		int is_fundamental(UV n, int neg) {
1038	102		UV r = n & 15;
1039	102	100	if (r) {
1040	94	100	if (!neg) {
1041	47		switch (r & 3) {
1042	9	100	case 0: return (r == 4) ? 0 : is_square_free(n >> 2);
		50
1043	13		case 1: return is_square_free(n);
1044	25		default: break;
1045			}
1046			} else {
1047	47		switch (r & 3) {
1048	9	100	case 0: return (r == 12) ? 0 : is_square_free(n >> 2);
		100
1049	12		case 3: return is_square_free(n);
1050	26		default: break;
1051			}
1052			}
1053			}
1054	59		return 0;
1055			}
1056
1057	461		static int _totpred(UV n, UV maxd) {
1058			UV i, ndivisors, *divs;
1059			int res;
1060
1061	461	100	if (n & 1) return 0;
1062	255		n >>= 1;
1063	255	100	if (n == 1) return 1;
1064	249	100	if (n < maxd && is_prime(2*n+1)) return 1;
		100
1065
1066	230		divs = _divisor_list(n, &ndivisors);
1067	1086	100	for (i = 0, res = 0; i < ndivisors && divs[i] < maxd && res == 0; i++) {
		100
		100
1068	856		UV r, d = divs[i], p = 2*d+1;
1069	856	100	if (!is_prime(p)) continue;
1070	349		r = n/d;
1071			while (1) {
1072	401	100	if (r == p \|\| _totpred(r, d)) { res = 1; break; }
		100
1073	385	100	if (r % p) break;
1074	52		r /= p;
1075	52		}
1076			}
1077	230		Safefree(divs);
1078	461		return res;
1079			}
1080
1081	124		int is_totient(UV n) {
1082	124	100	return (n == 0 \|\| (n & 1)) ? (n==1) : _totpred(n,n);
		100
1083			}
1084
1085
1086	103		UV inverse_totient_count(UV n) {
1087			set_t set, sumset;
1088			keyval_t keyval;
1089			UV res, i, ndivisors, *divs;
1090
1091	103	100	if (n == 1) return 2;
1092	102	100	if (n < 1 \|\| n & 1) return 0;
		100
1093	52	100	if (is_prime(n >> 1)) { /* Coleman Remark 3.3 (Thm 3.1) and Prop 6.2 */
1094	15	100	if (!is_prime(n+1)) return 0;
1095	7	100	if (n >= 10) return 2;
1096			}
1097
1098	39		divs = _divisor_list(n, &ndivisors);
1099
1100	39		init_set(&set, 2*ndivisors);
1101	39		keyval.key = 1; keyval.val = 1;
1102	39		set_addsum(&set, keyval);
1103
1104	502	100	for (i = 0; i < ndivisors; i++) {
1105	463		UV d = divs[i], p = d+1;
1106	463	100	if (is_prime(p)) {
1107	245		UV j, np = d, v = valuation(n, p);
1108	245		init_set(&sumset, ndivisors/2);
1109	626	100	for (j = 0; j <= v; j++) {
1110	381		UV k, ndiv = n/np; /* Loop over divisors of n/np */
1111	381	100	if (np == 1) {
1112	39		keyval_t kv; kv.key = 1; kv.val = 1;
1113	39		set_addsum(&sumset, kv);
1114			} else {
1115	8912	50	for (k = 0; k < ndivisors && divs[k] <= ndiv; k++) {
		100
1116	8570		UV val, d2 = divs[k];
1117	8570	100	if ((ndiv % d2) != 0) continue;
1118	4078		val = set_getval(set, d2);
1119	4078	100	if (val > 0) {
1120	1896		keyval_t kv; kv.key = d2*np; kv.val = val;
1121	1896		set_addsum(&sumset, kv);
1122			}
1123			}
1124			}
1125			/* if (j < v && np > UV_MAX/p) croak("overflow np d %lu", d); */
1126	381		np *= p;
1127			}
1128	245		set_merge(&set, sumset);
1129	245		free_set(&sumset);
1130			}
1131			}
1132	39		Safefree(divs);
1133	39		res = set_getval(set, n);
1134	39		free_set(&set);
1135	103		return res;
1136			}
1137
1138	2		UV* inverse_totient_list(UV *ntotients, UV n) {
1139			set_list_t setlist, divlist;
1140			UV i, ndivisors, divs, tlist;
1141	2		UV *totlist = 0;
1142
1143	2	50	MPUassert(n <= UV_MAX/7.5, "inverse_totient_list n too large");
1144
1145	2	50	if (n == 1) {
1146	0		New(0, totlist, 2, UV);
1147	0		totlist[0] = 1; totlist[1] = 2;
1148	0		*ntotients = 2;
1149	0		return totlist;
1150			}
1151	2	50	if (n < 1 \|\| n & 1) {
		50
1152	0		*ntotients = 0;
1153	0		return totlist;
1154			}
1155	2	50	if (is_prime(n >> 1)) { /* Coleman Remark 3.3 (Thm 3.1) and Prop 6.2 */
1156	0	0	if (!is_prime(n+1)) {
1157	0		*ntotients = 0;
1158	0		return totlist;
1159			}
1160	0	0	if (n >= 10) {
1161	0		New(0, totlist, 2, UV);
1162	0		totlist[0] = n+1; totlist[1] = 2*n+2;
1163	0		*ntotients = 2;
1164	0		return totlist;
1165			}
1166			}
1167
1168	2		divs = _divisor_list(n, &ndivisors);
1169
1170	2		init_setlist(&setlist, 2*ndivisors);
1171	2		setlist_addval(&setlist, 1, 1); /* Add 1 => [1] */
1172
1173	74	100	for (i = 0; i < ndivisors; i++) {
1174	72		UV d = divs[i], p = d+1;
1175	72	100	if (is_prime(p)) {
1176	23		UV j, dp = d, pp = p, v = valuation(n, p);
1177	23		init_setlist(&divlist, ndivisors/2);
1178	56	100	for (j = 0; j <= v; j++) {
1179	33		UV k, ndiv = n/dp; /* Loop over divisors of n/dp */
1180	33	100	if (dp == 1) {
1181	2		setlist_addval(&divlist, 1, 2); /* Add 1 => [2] */
1182			} else {
1183	888	50	for (k = 0; k < ndivisors && divs[k] <= ndiv; k++) {
		100
1184	857		UV nvals, *vals, d2 = divs[k];
1185	857	100	if ((ndiv % d2) != 0) continue;
1186	406		vals = setlist_getlist(&nvals, setlist, d2);
1187	406	100	if (vals != 0)
1188	406		setlist_addlist(&divlist, d2 * dp, nvals, vals, pp);
1189			}
1190			}
1191	33		dp *= p;
1192	33		pp *= p;
1193			}
1194	23		setlist_merge(&setlist, divlist);
1195	23		free_setlist(&divlist);
1196			}
1197			}
1198	2		Safefree(divs);
1199	2		tlist = setlist_getlist(ntotients, setlist, n);
1200	2	50	if (tlist != 0 && *ntotients > 0) {
		50
1201	2	50	New(0, totlist, *ntotients, UV);
1202	2		memcpy(totlist, tlist, ntotients sizeof(UV));
1203	2		qsort(totlist, *ntotients, sizeof(UV), _numcmp);
1204			}
1205	2		free_setlist(&setlist);
1206	2		return totlist;
1207			}
1208
1209
1210	993		UV pillai_v(UV n) {
1211			UV v, fac;
1212	993	100	if (n == 0) return 0;
1213	104101	100	for (v = 8, fac = 5040 % n; v < n-1 && fac != 0; v++) {
		100
1214	103194	50	fac = (n < HALF_WORD) ? (fac*v) % n : mulmod(fac,v,n);
1215	103194	100	if (fac == n-1 && (n % v) != 1)
		100
1216	85		return v;
1217			}
1218	907		return 0;
1219			}
1220
1221
1222	29732		int moebius(UV n) {
1223			UV factors[MPU_MAX_FACTORS+1];
1224			int i, nfactors;
1225
1226	29732	100	if (n <= 5) return (n == 1) ? 1 : (n % 4) ? -1 : 0;
		100
		100
1227	28856	100	if (n >= 49 && (!(n % 4) \|\| !(n % 9) \|\| !(n % 25) \|\| !(n % 49)))
		100
		100
		100
		100
1228	10111		return 0;
1229	18745	100	if (n >= 361 && (!(n % 121) \|\| !(n % 169) \|\| !(n % 289) \|\| !(n % 361)))
		100
		100
		100
		100
1230	322		return 0;
1231	18423	100	if (n >= 961 && (!(n % 529) \|\| !(n % 841) \|\| !(n % 961)))
		100
		100
		100
1232	63		return 0;
1233
1234	18360		nfactors = factor(n, factors);
1235	38637	100	for (i = 1; i < nfactors; i++)
1236	21038	100	if (factors[i] == factors[i-1])
1237	761		return 0;
1238	29732	100	return (nfactors % 2) ? -1 : 1;
1239			}
1240
1241	20		UV exp_mangoldt(UV n) {
1242			UV p;
1243	20	100	if (!primepower(n,&p)) return 1; /* Not a prime power */
1244	20		return p;
1245			}
1246
1247
1248	75		UV znorder(UV a, UV n) {
1249			UV fac[MPU_MAX_FACTORS+1];
1250			UV exp[MPU_MAX_FACTORS+1];
1251			int i, nfactors;
1252			UV k, phi;
1253
1254	75	100	if (n <= 1) return n; /* znorder(x,0) = 0, znorder(x,1) = 1 */
1255	69	100	if (a <= 1) return a; /* znorder(0,x) = 0, znorder(1,x) = 1 (x > 1) */
1256	66	100	if (gcd_ui(a,n) > 1) return 0;
1257
1258			/* Cohen 1.4.3 using Carmichael Lambda */
1259	60		phi = carmichael_lambda(n);
1260	60		nfactors = factor_exp(phi, fac, exp);
1261	60		k = phi;
1262			#if USE_MONTMATH
1263	60	100	if (n & 1) {
1264	54		const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
1265	54		UV ma = mont_geta(a, n);
1266	266	100	for (i = 0; i < nfactors; i++) {
1267	212		UV b, a1, ek, pi = fac[i], ei = exp[i];
1268	212		b = ipow(pi,ei);
1269	212		k /= b;
1270	212		a1 = mont_powmod(ma, k, n);
1271	373	100	for (ek = 0; a1 != mont1 && ek++ <= ei; a1 = mont_powmod(a1, pi, n))
		50
1272	161		k *= pi;
1273	212	50	if (ek > ei) return 0;
1274			}
1275			} else
1276			#endif
1277	13	100	for (i = 0; i < nfactors; i++) {
1278	7		UV b, a1, ek, pi = fac[i], ei = exp[i];
1279	7		b = ipow(pi,ei);
1280	7		k /= b;
1281	7		a1 = powmod(a, k, n);
1282	14	100	for (ek = 0; a1 != 1 && ek++ <= ei; a1 = powmod(a1, pi, n))
		50
1283	7		k *= pi;
1284	7	50	if (ek > ei) return 0;
1285			}
1286	75		return k;
1287			}
1288
1289	25		UV znprimroot(UV n) {
1290			UV fac[MPU_MAX_FACTORS+1];
1291			UV phi_div_fac[MPU_MAX_FACTORS+1];
1292			UV a, phi, on, r;
1293			int i, nfactors;
1294
1295	25	100	if (n <= 4) return (n == 0) ? 0 : n-1;
		100
1296	20	100	if (n % 4 == 0) return 0;
1297
1298	19	100	on = (n&1) ? n : (n>>1);
1299	19		a = powerof(on);
1300	19		r = rootof(on, a);
1301	19	100	if (!is_prob_prime(r)) return 0; /* c^a or 2c^a */
1302	17		phi = (r-1) * (on/r); /* p^a or 2p^a */
1303
1304	17		nfactors = factor_exp(phi, fac, 0);
1305	83	100	for (i = 0; i < nfactors; i++)
1306	66		phi_div_fac[i] = phi / fac[i];
1307
1308			#if USE_MONTMATH
1309	17	100	if (n & 1) {
1310	14		const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
1311	834	50	for (a = 2; a < n; a++) {
1312	834	100	if (a == 4 \|\| a == 8 \|\| a == 9) continue; /* Skip some perfect powers */
		100
		100
1313			/* Skip values we know can't be right: (a\|n) = 0 (or 1 for odd primes) */
1314	813	100	if (phi == n-1) { if (kronecker_uu(a, n) != -1) continue; }
		100
1315	1	50	else { if (gcd_ui(a,n) != 1) continue; }
1316	160		r = mont_geta(a, n);
1317	436	100	for (i = 0; i < nfactors; i++)
1318	422	100	if (mont_powmod(r, phi_div_fac[i], n) == mont1)
1319	146		break;
1320	160	100	if (i == nfactors) return a;
1321			}
1322			} else
1323			#endif
1324	36	50	for (a = 2; a < n; a++) {
1325	36	100	if (a == 4 \|\| a == 8 \|\| a == 9) continue; /* Skip some perfect powers */
		100
		100
1326			/* Skip values we know can't be right: (a\|n) = 0 (or 1 for odd primes) */
1327	32	50	if (phi == n-1) { if (kronecker_uu(a, n) != -1) continue; }
		0
1328	32	100	else { if (gcd_ui(a,n) != 1) continue; }
1329	23	100	for (i = 0; i < nfactors; i++)
1330	20	100	if (powmod(a, phi_div_fac[i], n) == 1)
1331	13		break;
1332	16	100	if (i == nfactors) return a;
1333			}
1334	25		return 0;
1335			}
1336
1337	47		int is_primitive_root(UV a, UV n, int nprime) {
1338			UV s, fac[MPU_MAX_FACTORS+1];
1339			int i, nfacs;
1340
1341	47	100	if (n <= 1) return n;
1342	45	100	if (a >= n) a %= n;
1343	45	100	if (n <= 4) return a == n-1;
1344	38	100	if (n % 4 == 0) return 0;
1345
1346			/* Very simple, but not fast:
1347			* s = nprime ? n-1 : totient(n);
1348			* return s == znorder(a, n);
1349			*/
1350
1351	37	100	if (gcd_ui(a,n) != 1) return 0;
1352	36	100	if (nprime) {
1353	15		s = n-1;
1354			} else {
1355	21	100	UV on = (n&1) ? n : (n>>1);
1356	21		UV k = powerof(on);
1357	21		UV r = rootof(on, k);
1358	21	100	if (!is_prob_prime(r)) return 0; /* c^a or 2c^a */
1359	19		s = (r-1) * (on/r); /* p^a or 2p^a */
1360			}
1361	34	100	if (s == n-1 && kronecker_uu(a,n) != -1) return 0;
		100
1362
1363			/* a^x can be a primitive root only if gcd(x,s) = 1 */
1364	26		i = is_power(a,0);
1365	26	100	if (i > 1 && gcd_ui(i, s) != 1) return 0;
		50
1366
1367			#if USE_MONTMATH
1368	26	100	if (n & 1) {
1369	23		const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
1370	23		a = mont_geta(a, n);
1371			/* Quick check for small factors before full factor */
1372	23	50	if ((s % 2) == 0 && mont_powmod(a, s/2, n) == mont1) return 0;
		50
1373	23	100	if ((s % 3) == 0 && mont_powmod(a, s/3, n) == mont1) return 0;
		100
1374	22	100	if ((s % 5) == 0 && mont_powmod(a, s/5, n) == mont1) return 0;
		50
1375	22		nfacs = factor_exp(s, fac, 0);
1376	97	100	for (i = 0; i < nfacs; i++)
1377	75	100	if (fac[i] > 5 && mont_powmod(a, s/fac[i], n) == mont1)
		50
1378	0		return 0;
1379			} else
1380			#endif
1381			{
1382			/* Quick check for small factors before full factor */
1383	3	50	if ((s % 2) == 0 && powmod(a, s/2, n) == 1) return 0;
		50
1384	3	50	if ((s % 3) == 0 && powmod(a, s/3, n) == 1) return 0;
		0
1385	3	100	if ((s % 5) == 0 && powmod(a, s/5, n) == 1) return 0;
		50
1386			/* Complete factor and check each one not found above. */
1387	2		nfacs = factor_exp(s, fac, 0);
1388	4	100	for (i = 0; i < nfacs; i++)
1389	2	50	if (fac[i] > 5 && powmod(a, s/fac[i], n) == 1)
		0
1390	0		return 0;
1391			}
1392	47		return 1;
1393			}
1394
1395	50		IV gcdext(IV a, IV b, IV* u, IV* v, IV* cs, IV* ct) {
1396	50		IV s = 0; IV os = 1;
1397	50		IV t = 1; IV ot = 0;
1398	50		IV r = b; IV or = a;
1399	50	100	if (a == 0 && b == 0) { os = 0; t = 0; }
		100
1400	335	100	while (r != 0) {
1401	285		IV quot = or / r;
1402	285		{ IV tmp = r; r = or - quot * r; or = tmp; }
1403	285		{ IV tmp = s; s = os - quot * s; os = tmp; }
1404	285		{ IV tmp = t; t = ot - quot * t; ot = tmp; }
1405			}
1406	50	100	if (or < 0) /* correct sign */
1407	4		{ or = -or; os = -os; ot = -ot; }
1408	50	50	if (u != 0) *u = os;
1409	50	50	if (v != 0) *v = ot;
1410	50	100	if (cs != 0) *cs = s;
1411	50	100	if (ct != 0) *ct = t;
1412	50		return or;
1413			}
1414
1415			/* Calculate 1/a mod n. */
1416	161		UV modinverse(UV a, UV n) {
1417	161		IV t = 0; UV nt = 1;
1418	161		UV r = n; UV nr = a;
1419	3833	100	while (nr != 0) {
1420	3672		UV quot = r / nr;
1421	3672		{ UV tmp = nt; nt = t - quot*nt; t = tmp; }
1422	3672		{ UV tmp = nr; nr = r - quot*nr; r = tmp; }
1423			}
1424	161	100	if (r > 1) return 0; /* No inverse */
1425	120	100	if (t < 0) t += n;
1426	120		return t;
1427			}
1428
1429	2		UV divmod(UV a, UV b, UV n) { /* a / b mod n */
1430	2		UV binv = modinverse(b, n);
1431	2	50	if (binv == 0) return 0;
1432	2		return mulmod(a, binv, n);
1433			}
1434
1435	183072		static UV _powfactor(UV p, UV d, UV m) {
1436	183072		UV e = 0;
1437	366571	100	do { d /= p; e += d; } while (d > 0);
1438	183072		return powmod(p, e, m);
1439			}
1440
1441	821		UV factorialmod(UV n, UV m) { /* n! mod m */
1442	821		UV i, d = n, res = 1;
1443
1444	821	50	if (n >= m \|\| m == 1) return 0;
		100
1445
1446	820	100	if (n <= 10) { /* Keep things simple for small n */
1447	1672	100	for (i = 2; i <= n && res != 0; i++)
		100
1448	1288		res = (res * i) % m;
1449	384		return res;
1450			}
1451
1452	436	100	if (n > m/2 && is_prime(m)) /* Check if we can go backwards */
		100
1453	74		d = m-n-1;
1454	436	100	if (d < 2)
1455	14	100	return (d == 0) ? m-1 : 1; /* Wilson's Theorem: n = m-1 and n = m-2 */
1456
1457	422	100	if (d == n && d > 5000000) { /* Check for composite m that leads to 0 */
		100
1458			UV fac[MPU_MAX_FACTORS+1], exp[MPU_MAX_FACTORS+1];
1459	1		int j, k, nfacs = factor_exp(m, fac, exp);
1460	2	100	for (j = 0; j < nfacs; j++) {
1461	1		UV t = fac[j];
1462	3	100	for (k = 1; (UV)k < exp[j]; k++)
1463	2		t *= fac[j];
1464	1	50	if (n >= t) return 0;
1465			}
1466			}
1467
1468			#if USE_MONTMATH
1469	618	100	if (m & 1 && d < 40000) {
		100
1470	196		const uint64_t npi = mont_inverse(m), mont1 = mont_get1(m);
1471	196		uint64_t monti = mont1;
1472	196		res = mont1;
1473	1828	100	for (i = 2; i <= d && res != 0; i++) {
		100
1474	1632		monti = addmod(monti,mont1,m);
1475	1632	50	res = mont_mulmod(res,monti,m);
1476			}
1477	196	50	res = mont_recover(res, m);
1478			} else
1479			#endif
1480	226	100	if (d < 10000) {
1481	2031	100	for (i = 2; i <= d && res != 0; i++)
		100
1482	1806		res = mulmod(res,i,m);
1483			} else {
1484			#if 0 /* Monolithic prime walk */
1485			START_DO_FOR_EACH_PRIME(2, d) {
1486			UV k = (p > (d>>1)) ? p : _powfactor(p, d, m);
1487			res = mulmod(res, k, m);
1488			if (res == 0) break;
1489			} END_DO_FOR_EACH_PRIME;
1490			#else /* Segmented prime walk */
1491			unsigned char* segment;
1492			UV seg_base, seg_low, seg_high;
1493	1		void* ctx = start_segment_primes(7, d, &segment);
1494	4	100	for (i = 1; i <= 3; i++) /* Handle 2,3,5 assume d>10*/
1495	3		res = mulmod(res, _powfactor(2*i - (i>1), d, m), m);
1496	7	50	while (res != 0 && next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
		100
1497	369350	50	START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high )
		100
		50
		100
		100
1498	348510	100	UV k = (p > (d>>1)) ? p : _powfactor(p, d, m);
1499	348510		res = mulmod(res, k, m);
1500	348510	50	if (res == 0) break;
1501	20834		END_DO_FOR_EACH_SIEVE_PRIME
1502			}
1503	1		end_segment_primes(ctx);
1504			#endif
1505			}
1506
1507	422	100	if (d != n && res != 0) { /* Handle backwards case */
		50
1508	60	100	if (!(d&1)) res = submod(m,res,m);
1509	60		res = modinverse(res,m);
1510			}
1511
1512	422		return res;
1513			}
1514
1515	15		static int verify_sqrtmod(UV s, UV *rs, UV a, UV p) {
1516	15	100	if (p-s < s) s = p-s;
1517	15	50	if (mulmod(s, s, p) != a) return 0;
1518	15		*rs = s;
1519	15		return 1;
1520			}
1521			#if !USE_MONTMATH
1522			UV _sqrtmod_prime(UV a, UV p) {
1523			if ((p % 4) == 3) {
1524			return powmod(a, (p+1)>>2, p);
1525			}
1526			if ((p % 8) == 5) { /* Atkin's algorithm. Faster than Legendre. */
1527			UV a2, alpha, beta, b;
1528			a2 = addmod(a,a,p);
1529			alpha = powmod(a2,(p-5)>>3,p);
1530			beta = mulmod(a2,sqrmod(alpha,p),p);
1531			b = mulmod(alpha, mulmod(a, (beta ? beta-1 : p-1), p), p);
1532			return b;
1533			}
1534			if ((p % 16) == 9) { /* Müller's algorithm extending Atkin */
1535			UV a2, alpha, beta, b, d = 1;
1536			a2 = addmod(a,a,p);
1537			alpha = powmod(a2, (p-9)>>4, p);
1538			beta = mulmod(a2, sqrmod(alpha,p), p);
1539			if (sqrmod(beta,p) != p-1) {
1540			do { d += 2; } while (kronecker_uu(d,p) != -1 && d < p);
1541			alpha = mulmod(alpha, powmod(d,(p-9)>>3,p), p);
1542			beta = mulmod(a2, mulmod(sqrmod(d,p),sqrmod(alpha,p),p), p);
1543			}
1544			b = mulmod(alpha, mulmod(a, mulmod(d,(beta ? beta-1 : p-1),p),p),p);
1545			return b;
1546			}
1547
1548			/* Verify Euler condition for odd p */
1549			if ((p & 1) && powmod(a,(p-1)>>1,p) != 1) return 0;
1550
1551			{
1552			UV x, q, e, t, z, r, m, b;
1553			q = p-1;
1554			e = valuation(q, 2);
1555			q >>= e;
1556			t = 3;
1557			while (kronecker_uu(t, p) != -1) {
1558			t += 2;
1559			if (t == 201) { /* exit if p looks like a composite */
1560			if ((p % 2) == 0 \|\| powmod(2, p-1, p) != 1 \|\| powmod(3, p-1, p) != 1)
1561			return 0;
1562			} else if (t >= 20000) { /* should never happen */
1563			return 0;
1564			}
1565			}
1566			z = powmod(t, q, p);
1567			b = powmod(a, q, p);
1568			r = e;
1569			q = (q+1) >> 1;
1570			x = powmod(a, q, p);
1571			while (b != 1) {
1572			t = b;
1573			for (m = 0; m < r && t != 1; m++)
1574			t = sqrmod(t, p);
1575			if (m >= r) break;
1576			t = powmod(z, UVCONST(1) << (r-m-1), p);
1577			x = mulmod(x, t, p);
1578			z = mulmod(t, t, p);
1579			b = mulmod(b, z, p);
1580			r = m;
1581			}
1582			return x;
1583			}
1584			return 0;
1585			}
1586			#else
1587	8		UV _sqrtmod_prime(UV a, UV p) {
1588	8		const uint64_t npi = mont_inverse(p), mont1 = mont_get1(p);
1589	8		a = mont_geta(a,p);
1590
1591	8	100	if ((p % 4) == 3) {
1592	1		UV b = mont_powmod(a, (p+1)>>2, p);
1593	1	50	return mont_recover(b, p);
1594			}
1595
1596	7	100	if ((p % 8) == 5) { /* Atkin's algorithm. Faster than Legendre. */
1597			UV a2, alpha, beta, b;
1598	5		a2 = addmod(a,a,p);
1599	5		alpha = mont_powmod(a2,(p-5)>>3,p);
1600	5	50	beta = mont_mulmod(a2,mont_sqrmod(alpha,p),p);
		0
		50
1601	5		beta = submod(beta, mont1, p);
1602	5	50	b = mont_mulmod(alpha, mont_mulmod(a, beta, p), p);
		0
		50
1603	5	50	return mont_recover(b, p);
1604			}
1605	2	100	if ((p % 16) == 9) { /* Müller's algorithm extending Atkin */
1606	1		UV a2, alpha, beta, b, d = 1;
1607	1		a2 = addmod(a,a,p);
1608	1		alpha = mont_powmod(a2, (p-9)>>4, p);
1609	1	50	beta = mont_mulmod(a2, mont_sqrmod(alpha,p), p);
		0
		50
1610	1	50	if (mont_sqrmod(beta,p) != submod(0,mont1,p)) {
		50
1611	5	100	do { d += 2; } while (kronecker_uu(d,p) != -1 && d < p);
		50
1612	1		d = mont_geta(d,p);
1613	1	50	alpha = mont_mulmod(alpha, mont_powmod(d,(p-9)>>3,p), p);
1614	1	50	beta = mont_mulmod(a2, mont_mulmod(mont_sqrmod(d,p),mont_sqrmod(alpha,p),p), p);
		0
		0
		0
		0
		0
		50
		0
		0
		50
		50
1615	1	50	beta = mont_mulmod(submod(beta,mont1,p), d, p);
1616			} else {
1617	0		beta = submod(beta, mont1, p);
1618			}
1619	1	50	b = mont_mulmod(alpha, mont_mulmod(a, beta, p), p);
		0
		50
1620	1	50	return mont_recover(b, p);
1621			}
1622
1623			/* Verify Euler condition for odd p */
1624	1	50	if ((p & 1) && mont_powmod(a,(p-1)>>1,p) != mont1) return 0;
		50
1625
1626			{
1627			UV x, q, e, t, z, r, m, b;
1628	1		q = p-1;
1629	1		e = valuation(q, 2);
1630	1		q >>= e;
1631	1		t = 3;
1632	1	50	while (kronecker_uu(t, p) != -1) {
1633	0		t += 2;
1634	0	0	if (t == 201) { /* exit if p looks like a composite */
1635	0	0	if ((p % 2) == 0 \|\| powmod(2, p-1, p) != 1 \|\| powmod(3, p-1, p) != 1)
		0
		0
1636	0		return 0;
1637	0	0	} else if (t >= 20000) { /* should never happen */
1638	0		return 0;
1639			}
1640			}
1641	1		t = mont_geta(t, p);
1642	1		z = mont_powmod(t, q, p);
1643	1		b = mont_powmod(a, q, p);
1644	1		r = e;
1645	1		q = (q+1) >> 1;
1646	1		x = mont_powmod(a, q, p);
1647	3	100	while (b != mont1) {
1648	2		t = b;
1649	5	50	for (m = 0; m < r && t != mont1; m++)
		100
1650	3	50	t = mont_sqrmod(t, p);
1651	2	50	if (m >= r) break;
1652	2		t = mont_powmod(z, UVCONST(1) << (r-m-1), p);
1653	2	50	x = mont_mulmod(x, t, p);
1654	2	50	z = mont_mulmod(t, t, p);
1655	2	50	b = mont_mulmod(b, z, p);
1656	2		r = m;
1657			}
1658	1	50	return mont_recover(x, p);
1659			}
1660			return 0;
1661			}
1662			#endif
1663
1664	10		int sqrtmod(UV *s, UV a, UV p) {
1665	10	50	if (p == 0) return 0;
1666	10	100	if (a >= p) a %= p;
1667	10	50	if (p <= 2 \|\| a <= 1) return verify_sqrtmod(a, s,a,p);
		100
1668
1669	8		return verify_sqrtmod(_sqrtmod_prime(a,p), s,a,p);
1670			}
1671
1672	7		int sqrtmod_composite(UV *s, UV a, UV n) {
1673			UV fac[MPU_MAX_FACTORS+1];
1674			UV exp[MPU_MAX_FACTORS+1];
1675			UV sqr[MPU_MAX_FACTORS+1];
1676			UV p, j, k, gcdan;
1677			int i, nfactors;
1678
1679	7	100	if (n == 0) return 0;
1680	5	50	if (a >= n) a %= n;
1681	5	100	if (n <= 2 \|\| a <= 1) return verify_sqrtmod(a, s,a,n);
		50
1682
1683			/* Simple existence check. It's still possible no solution exists.*/
1684	3	50	if (kronecker_uu(a, ((n%4) == 2) ? n/2 : n) == -1) return 0;
		50
1685
1686			/* if 8\|n 'a' must = 1 mod 8, else if 4\|n 'a' must = 1 mod 4 */
1687	3	50	if ((n % 4) == 0) {
1688	0	0	if ((n % 8) == 0) {
1689	0	0	if ((a % 8) != 1) return 0;
1690			} else {
1691	0	0	if ((a % 4) != 1) return 0;
1692			}
1693			}
1694
1695			/* More detailed existence check before factoring. Still possible. */
1696	3		gcdan = gcd_ui(a, n);
1697	3	50	if (gcdan == 1) {
1698	0	0	if ((n % 3) == 0 && kronecker_uu(a, 3) != 1) return 0;
		0
1699	0	0	if ((n % 5) == 0 && kronecker_uu(a, 5) != 1) return 0;
		0
1700	0	0	if ((n % 7) == 0 && kronecker_uu(a, 7) != 1) return 0;
		0
1701			}
1702
1703			/* Factor n */
1704	3		nfactors = factor_exp(n, fac, exp);
1705
1706			/* If gcd(a,n)==1, this answers comclusively if a solution exists. */
1707	3	50	if (gcdan == 1) {
1708	0	0	for (i = 0; i < nfactors; i++)
1709	0	0	if (fac[i] > 7 && kronecker_uu(a, fac[i]) != 1) return 0;
		0
1710			}
1711
1712	11	100	for (i = 0; i < nfactors; i++) {
1713
1714			/* Powers of 2 */
1715	8	100	if (fac[i] == 2) {
1716	3	50	if (exp[i] == 1) {
1717	3		sqr[i] = a & 1;
1718	0	0	} else if (exp[i] == 2) {
1719	0		sqr[i] = 1; /* and 3 */
1720			} else {
1721			UV this_roots[256], next_roots[256];
1722	0		UV nthis = 0, nnext = 0;
1723	0		this_roots[nthis++] = 1;
1724	0		this_roots[nthis++] = 3;
1725	0	0	for (j = 2; j < exp[i]; j++) {
1726	0		p = UVCONST(1) << (j+1);
1727	0		nnext = 0;
1728	0	0	for (k = 0; k < nthis && nnext < 254; k++) {
		0
1729	0		UV r = this_roots[k];
1730	0	0	if (sqrmod(r,p) == (a % p))
1731	0		next_roots[nnext++] = r;
1732	0	0	if (sqrmod(p-r,p) == (a % p))
1733	0		next_roots[nnext++] = p-r;
1734			}
1735	0	0	if (nnext == 0) return 0;
1736			/* copy next exponent's found roots to this one */
1737	0		nthis = nnext;
1738	0	0	for (k = 0; k < nnext; k++)
1739	0		this_roots[k] = next_roots[k];
1740			}
1741	0		sqr[i] = this_roots[0];
1742			}
1743	3		continue;
1744			}
1745
1746			/* p is an odd prime */
1747	5		p = fac[i];
1748	5	50	if (!sqrtmod(&(sqr[i]), a, p))
1749	0		return 0;
1750
1751			/* Lift solution of x^2 = a mod p to x^2 = a mod p^e */
1752	5	50	for (j = 1; j < exp[i]; j++) {
1753			UV xk2, yk, expect, sol;
1754	0		xk2 = addmod(sqr[i],sqr[i],p);
1755	0		yk = modinverse(xk2, p);
1756	0		expect = mulmod(xk2,yk,p);
1757	0		p *= fac[i];
1758	0		sol = submod(sqr[i], mulmod(submod(sqrmod(sqr[i],p), a % p, p), yk, p), p);
1759	0	0	if (expect != 1 \|\| sqrmod(sol,p) != (a % p)) {
		0
1760			/* printf("a %lu failure to lift to %lu^%d\n", a, fac[i], j+1); */
1761	0		return 0;
1762			}
1763	0		sqr[i] = sol;
1764			}
1765			}
1766
1767			/* raise fac[i] */
1768	11	100	for (i = 0; i < nfactors; i++)
1769	8		fac[i] = ipow(fac[i], exp[i]);
1770
1771	3		p = chinese(sqr, fac, nfactors, &i);
1772	7	50	return (i == 1) ? verify_sqrtmod(p, s, a, n) : 0;
1773			}
1774
1775			/* works only for co-prime inputs and also slower than the algorithm below,
1776			* but handles the case where IV_MAX < lcm <= UV_MAX.
1777			*/
1778	4		static UV _simple_chinese(UV* a, UV* n, UV num, int* status) {
1779	4		UV i, lcm = 1, res = 0;
1780	4		*status = 0;
1781	4	50	if (num == 0) return 0;
1782
1783	8	50	for (i = 0; i < num; i++) {
1784	8		UV ni = n[i];
1785	8		UV gcd = gcd_ui(lcm, ni);
1786	8	100	if (gcd != 1) return 0; /* not coprime */
1787	5		ni /= gcd;
1788	5	100	if (ni > (UV_MAX/lcm)) return 0; /* lcm overflow */
1789	4		lcm *= ni;
1790			}
1791	0	0	for (i = 0; i < num; i++) {
1792			UV p, inverse, term;
1793	0		p = lcm / n[i];
1794	0		inverse = modinverse(p, n[i]);
1795	0	0	if (inverse == 0) return 0; /* n's coprime so should never happen */
1796	0		term = mulmod(p, mulmod(a[i], inverse, lcm), lcm);
1797	0		res = addmod(res, term, lcm);
1798			}
1799	0		*status = 1;
1800	0		return res;
1801			}
1802
1803			/* status: 1 ok, -1 no inverse, 0 overflow */
1804	30		UV chinese(UV* a, UV* n, UV num, int* status) {
1805			static unsigned short sgaps[] = {7983,3548,1577,701,301,132,57,23,10,4,1,0};
1806			UV gcd, i, j, lcm, sum, gi, gap;
1807	30		*status = 1;
1808	30	100	if (num == 0) return 0;
1809
1810			/* Sort modulii, largest first */
1811	348	100	for (gi = 0, gap = sgaps[gi]; gap >= 1; gap = sgaps[++gi]) {
1812	361	100	for (i = gap; i < num; i++) {
1813	42		UV tn = n[i], ta = a[i];
1814	84	100	for (j = i; j >= gap && n[j-gap] < tn; j -= gap)
		100
1815	42		{ n[j] = n[j-gap]; a[j] = a[j-gap]; }
1816	42		n[j] = tn; a[j] = ta;
1817			}
1818			}
1819
1820	29	100	if (n[0] > IV_MAX) return _simple_chinese(a,n,num,status);
1821	28		lcm = n[0]; sum = a[0] % n[0];
1822	58	100	for (i = 1; i < num; i++) {
1823			IV u, v, t, s;
1824			UV vs, ut;
1825	38		gcd = gcdext(lcm, n[i], &u, &v, &s, &t);
1826	41	100	if (gcd != 1 && ((sum % gcd) != (a[i] % gcd))) { *status = -1; return 0; }
		100
1827	33	100	if (s < 0) s = -s;
1828	33	100	if (t < 0) t = -t;
1829	33	100	if (s > (IV)(IV_MAX/lcm)) return _simple_chinese(a,n,num,status);
1830	30		lcm *= s;
1831	30	100	if (u < 0) u += lcm;
1832	30	100	if (v < 0) v += lcm;
1833	30		vs = mulmod((UV)v, (UV)s, lcm);
1834	30		ut = mulmod((UV)u, (UV)t, lcm);
1835	30		sum = addmod( mulmod(vs, sum, lcm), mulmod(ut, a[i], lcm), lcm );
1836			}
1837	20		return sum;
1838			}
1839
1840	9		NV chebyshev_psi(UV n)
1841			{
1842			UV k;
1843	9		KAHAN_INIT(sum);
1844
1845	58	100	for (k = log2floor(n); k > 0; k--) {
		100
1846	49		KAHAN_SUM(sum, chebyshev_theta(rootof(n,k)));
1847			}
1848	9		return sum;
1849			}
1850
1851			#if BITS_PER_WORD == 64
1852			typedef struct {
1853			UV n;
1854			LNV theta;
1855			} cheby_theta_t;
1856			static const cheby_theta_t _cheby_theta[] = { /* >= quad math precision */
1857			{ UVCONST( 67108864),LNVCONST( 67100507.6357700963903836828562472350035880) },
1858			{ UVCONST( 100000000),LNVCONST( 99987730.0180220043832124342600487053812729) },
1859			{ UVCONST( 134217728),LNVCONST( 134204014.5735572091791081610859055728165544) },
1860			{ UVCONST( 268435456),LNVCONST( 268419741.6134308193112682817754501071404173) },
1861			{ UVCONST( 536870912),LNVCONST( 536842885.8045763840625719515011160692495056) },
1862			{ UVCONST( 1000000000),LNVCONST( 999968978.5775661447991262386023331863364793) },
1863			{ UVCONST( 1073741824),LNVCONST( 1073716064.8860663337617909073555831842945484) },
1864			{ UVCONST( 2147483648),LNVCONST( 2147432200.2475857676814950053003448716360822) },
1865			{ UVCONST( 4294967296),LNVCONST( 4294889489.1735446386752045191908417183337361) },
1866			{ UVCONST( 8589934592),LNVCONST( 8589863179.5654263491545135406516173629373070) },
1867			{ UVCONST( 10000000000),LNVCONST( 9999939830.6577573841592219954033850595228736) },
1868			{ UVCONST( 12884901888),LNVCONST( 12884796620.4324254952601520445848183460347362) },
1869			{ UVCONST( 17179869184),LNVCONST( 17179757715.9924077567777285147574707468995695) },
1870			{ UVCONST( 21474836480),LNVCONST( 21474693322.0998273969188369449626287713082943) },
1871			{ UVCONST( 25769803776),LNVCONST( 25769579799.3751535467593954636665656772211515) },
1872			{ UVCONST( 30064771072),LNVCONST( 30064545001.2305211029215168703433831598544454) },
1873			{ UVCONST( 34359738368),LNVCONST( 34359499180.0126643918259085362039638823175054) },
1874			{ UVCONST( 51539607552),LNVCONST( 51539356394.9531019037592855639826469993402730) },
1875			{ UVCONST( 68719476736),LNVCONST( 68719165213.6369838785284711480925219076501720) },
1876			{ UVCONST( 85899345920),LNVCONST( 85899083852.3471545629838432726841470626910905) },
1877			{ UVCONST( 100000000000),LNVCONST( 99999737653.1074446948519125729820679772770146) },
1878			{ UVCONST( 103079215104),LNVCONST(103079022007.113299711630969211422868856259124) },
1879			{ UVCONST( 120259084288),LNVCONST(120258614516.787336970535750737470005730125261) },
1880			{ UVCONST( 137438953472),LNVCONST(137438579206.444595884982301543904849253294539) },
1881			{ UVCONST( 171798691840),LNVCONST(171798276885.585945657918751085729734540334501) },
1882			{ UVCONST( 206158430208),LNVCONST(206158003808.160276853604927822609009916573462) },
1883			{ UVCONST( 240518168576),LNVCONST(240517893445.995868018331936763125264759516048) },
1884			{ UVCONST( 274877906944),LNVCONST(274877354651.045354829956619821889825596300686) },
1885			{ UVCONST( 309237645312),LNVCONST(309237050379.850690561796126460858271984023198) },
1886			{ UVCONST( 343597383680),LNVCONST(343596855806.595496630500062749631211394707114) },
1887			{ UVCONST( 377957122048),LNVCONST(377956498560.227794386327526022452943941537993) },
1888			{ UVCONST( 412316860416),LNVCONST(412316008796.349553568121442261222464590518293) },
1889			{ UVCONST( 446676598784),LNVCONST(446675972485.936512329625489223180824947531484) },
1890			{ UVCONST( 481036337152),LNVCONST(481035608287.572961376833237046440177624505864) },
1891			{ UVCONST( 515396075520),LNVCONST(515395302740.633513931333424447688399032397200) },
1892			{ UVCONST( 549755813888),LNVCONST(549755185085.539613556787409928561107952681488) },
1893			{ UVCONST( 584115552256),LNVCONST(584115015741.698143680148976236958207248900725) },
1894			{ UVCONST( 618475290624),LNVCONST(618474400071.621528348965919774195984612254220) },
1895			{ UVCONST( 652835028992),LNVCONST(652834230470.583317059774197550110194348469358) },
1896			{ UVCONST( 687194767360),LNVCONST(687193697328.927006867624832386534836384752774) },
1897			{ UVCONST( 721554505728),LNVCONST(721553211683.605313067593521060195071837766347) },
1898			{ UVCONST( 755914244096),LNVCONST(755913502349.878525212441903698096011352015192) },
1899			{ UVCONST( 790273982464),LNVCONST(790273042590.053075430445971969285969445183076) },
1900			{ UVCONST( 824633720832),LNVCONST(824633080997.428352876758261549475609957696369) },
1901			{ UVCONST( 858993459200),LNVCONST(858992716288.318498931165663742671579465316192) },
1902			{ UVCONST( 893353197568),LNVCONST(893352235882.851072417721659027263613727927680) },
1903			{ UVCONST( 927712935936),LNVCONST(927711881043.628817668337317445143018372892386) },
1904			{ UVCONST( 962072674304),LNVCONST(962071726126.508938539006575212272731584070786) },
1905			{ UVCONST( 996432412672),LNVCONST(996431411588.361462717402562171913706963939018) },
1906			{ UVCONST( 1099511627776),LNVCONST(1099510565082.05800550569923209414874779035972) },
1907			{ UVCONST( 1168231104512),LNVCONST(1168230478726.83399452743801182220790107593115) },
1908			{ UVCONST( 1236950581248),LNVCONST(1236949680081.02610603189530371762093291521116) },
1909			{ UVCONST( 1305670057984),LNVCONST(1305668780900.04255251887970870257110498423202) },
1910			{ UVCONST( 1374389534720),LNVCONST(1374388383792.63751003694755359184583212193880) },
1911			{ UVCONST( 1443109011456),LNVCONST(1443107961091.80955496949174183091839841371227) },
1912			{ UVCONST( 1511828488192),LNVCONST(1511827317611.91227277802426032456922797572429) },
1913			{ UVCONST( 1580547964928),LNVCONST(1580546753969.30607547506449941085747942395437) },
1914			{ UVCONST( 1649267441664),LNVCONST(1649265973878.75361554498682516738256005501353) },
1915			{ UVCONST( 1717986918400),LNVCONST(1717985403764.24562741452793071287954107946922) },
1916			{ UVCONST( 1786706395136),LNVCONST(1786704769212.04241689416220650800274263053933) },
1917			{ UVCONST( 1855425871872),LNVCONST(1855425013030.54920163513184322741954734357404) },
1918			{ UVCONST( 1924145348608),LNVCONST(1924143701943.02957992419280264060220278182021) },
1919			{ UVCONST( 1992864825344),LNVCONST(1992863373568.84039296068619447120308124302086) },
1920			{ UVCONST( 2061584302080),LNVCONST(2061583632335.91985095534685076604018573279204) },
1921			{ UVCONST( 2130303778816),LNVCONST(2113122935598.01727180199783433992649406589029) },
1922			{ UVCONST( 2199023255552),LNVCONST(2199021399611.18488312543276191461914978761981) },
1923			{ UVCONST( 2267742732288),LNVCONST(2267740947106.05038218811506263712808318234921) },
1924			{ UVCONST( 2336462209024),LNVCONST(2336460081480.34962633829077377680844065198307) },
1925			{ UVCONST( 2405181685760),LNVCONST(2405179969505.38642629423585641169740223940265) },
1926			{ UVCONST( 2473901162496),LNVCONST(2473899311193.37872375168104562948639924654178) },
1927			{ UVCONST( 2542620639232),LNVCONST(2542619362554.88893589220737167756411653816418) },
1928			{ UVCONST( 2611340115968),LNVCONST(2611338370515.94936514022501267847930999670553) },
1929			{ UVCONST( 2680059592704),LNVCONST(2680057722824.52981820001574883706268873541107) },
1930			{ UVCONST( 2748779069440),LNVCONST(2748777610452.18903407570165081726781627254885) },
1931			{ UVCONST( 2817498546176),LNVCONST(2817497017165.31924616507392971415494161401775) },
1932			{ UVCONST( 2886218022912),LNVCONST(2886216579432.32232322707222172612181994322081) },
1933			{ UVCONST( 2954937499648),LNVCONST(2954936100812.97301730406598982753121204977388) },
1934			{ UVCONST( 3023656976384),LNVCONST(3023654789503.82041452274471455184651411931920) },
1935			{ UVCONST( 3298534883328),LNVCONST(3298533215621.76606493931157388037915263658637) },
1936			{ UVCONST( 3573412790272),LNVCONST(3573411344351.74163523704886736624674718378131) },
1937			{ UVCONST( 3848290697216),LNVCONST(3848288415701.82534219216958446478503907262807) },
1938			{ UVCONST( 4123168604160),LNVCONST(4123166102085.86116301709394219323327831487542) },
1939			{ UVCONST( 4398046511104),LNVCONST(4398044965678.05143041707871320554940671182665) },
1940			{ UVCONST( 4672924418048),LNVCONST(4672922414672.04998927945349278916525727295687) },
1941			{ UVCONST( 4947802324992),LNVCONST(4947800056419.04384937181159608905993450182729) },
1942			{ UVCONST( 5222680231936),LNVCONST(5222678728087.69487334278665824384732845008859) },
1943			{ UVCONST( 5497558138880),LNVCONST(5497555766573.55159115560501595606332808978878) },
1944			{ UVCONST( 5772436045824),LNVCONST(5772433560746.27053256770924553245647027548204) },
1945			{ UVCONST( 6047313952768),LNVCONST(6047310750621.24497633828761530843255989494448) },
1946			{ UVCONST( 6322191859712),LNVCONST(6322189275338.39747421237532473168802646234745) },
1947			{ UVCONST( 6597069766656),LNVCONST(6579887620000.56226807898107616294821989189226) },
1948			{ UVCONST( 6871947673600),LNVCONST(6871945430474.61791600096091374271286154432006) },
1949			{ UVCONST( 7146825580544),LNVCONST(7146823258390.34361980709600216319269118247416) },
1950			{ UVCONST( 7421703487488),LNVCONST(7421700443390.35536080251964387835425662360121) },
1951			{ UVCONST( 7696581394432),LNVCONST(7696578975137.73249441643024336954233783264803) },
1952			{ UVCONST( 7971459301376),LNVCONST(7971457197928.90863708984184849978605273042512) },
1953			{ UVCONST( 8246337208320),LNVCONST(8246333982863.77146812177727648999195989358960) },
1954			{ UVCONST( 8521215115264),LNVCONST(8529802085075.55635100929751669785228592926043) },
1955			{ UVCONST( 8796093022208),LNVCONST(8796089836425.34909684634625258535266362465034) },
1956			{ UVCONST( 9345848836096),LNVCONST(9345845828116.77456046925508587313) },
1957			{ UVCONST( 9895604649984),LNVCONST(9895601077915.26821447819584407150) },
1958			{ UVCONST(10000000000000),LNVCONST(9999996988293.03419965318214160284) },
1959			{ UVCONST(15000000000000),LNVCONST(14999996482301.7098815115045166858) },
1960			{ UVCONST(20000000000000),LNVCONST(19999995126082.2286880312461318496) },
1961			{ UVCONST(25000000000000),LNVCONST(24999994219058.4086216020475916538) },
1962			{ UVCONST(30000000000000),LNVCONST(29999995531389.8454274046657200568) },
1963			{ UVCONST(35000000000000),LNVCONST(34999992921190.8049427456456479005) },
1964			{ UVCONST(40000000000000),LNVCONST(39999993533724.3168289589273168844) },
1965			{ UVCONST(45000000000000),LNVCONST(44999993567606.9795798378256194424) },
1966			{ UVCONST(50000000000000),LNVCONST(49999992543194.2636545758235373677) },
1967			{ UVCONST(55000000000000),LNVCONST(54999990847877.2435105757522625171) },
1968			{ UVCONST(60000000000000),LNVCONST(59999990297033.6261976055811111726) },
1969			{ UVCONST(65000000000000),LNVCONST(64999990861395.5522142429859245014) },
1970			{ UVCONST(70000000000000),LNVCONST(69999994316409.8717306521862685981) },
1971			{ UVCONST(75000000000000),LNVCONST(74999990126219.8344899338374090165) },
1972			{ UVCONST(80000000000000),LNVCONST(79999990160858.3042387288372250950) },
1973			{ UVCONST(85000000000000),LNVCONST(84999987096970.5915212896832780715) },
1974			{ UVCONST(90000000000000),LNVCONST(89999989501395.0738966599857919767) },
1975			{ UVCONST(95000000000000),LNVCONST(94999990785908.6672552042792168144) },
1976			{ UVCONST(100000000000000),LNVCONST(99999990573246.9785384070303475639) },
1977			};
1978			#define NCHEBY_VALS (sizeof(_cheby_theta)/sizeof(_cheby_theta[0]))
1979			#endif
1980
1981	58		NV chebyshev_theta(UV n)
1982			{
1983	58		uint16_t i = 0;
1984			UV tp, startn, seg_base, seg_low, seg_high;
1985			unsigned char* segment;
1986			void* ctx;
1987	58		LNV initial_sum, prod = LNV_ONE;
1988	58		KAHAN_INIT(sum);
1989
1990	58	100	if (n < 500) {
1991	379	100	for (i = 1; (tp = primes_tiny[i]) <= n; i++) {
1992	326		KAHAN_SUM(sum, loglnv(tp));
1993			}
1994	53		return sum;
1995			}
1996
1997			#if defined NCHEBY_VALS
1998	5	50	if (n >= _cheby_theta[0].n) {
1999	0	0	for (i = 1; i < NCHEBY_VALS; i++)
2000	0	0	if (n < _cheby_theta[i].n)
2001	0		break;
2002	0		startn = _cheby_theta[i-1].n;
2003	0		initial_sum = _cheby_theta[i-1].theta;
2004			} else
2005			#endif
2006			{
2007	5		KAHAN_SUM(sum, loglnv(235711*13));
2008	5		startn = 17;
2009	5		initial_sum = 0;
2010			}
2011
2012	5		ctx = start_segment_primes(startn, n, &segment);
2013			#if 0
2014			while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
2015			START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
2016			KAHAN_SUM(sum, loglnv(p));
2017			} END_DO_FOR_EACH_SIEVE_PRIME
2018			}
2019			#else
2020	12	100	while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
2021	225425	50	START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
		100
		100
		100
		100
2022	214078		prod *= (LNV) p;
2023	214078	100	if (++i >= (LNV_IS_QUAD ? 64 : 8)) {
2024	26758		KAHAN_SUM(sum, loglnv(prod));
2025	26758		prod = LNV_ONE;
2026	26758		i = 0;
2027			}
2028	11325		} END_DO_FOR_EACH_SIEVE_PRIME
2029			}
2030	5	50	if (prod > 1.0) { KAHAN_SUM(sum, loglnv(prod)); prod = LNV_ONE; }
2031			#endif
2032	5		end_segment_primes(ctx);
2033
2034	5	50	if (initial_sum > 0) KAHAN_SUM(sum, initial_sum);
2035	58		return sum;
2036			}
2037
2038
2039
2040			/*
2041			* See:
2042			* "Multiple-Precision Exponential Integral and Related Functions"
2043			* by David M. Smith
2044			* "On the Evaluation of the Complex-Valued Exponential Integral"
2045			* by Vincent Pegoraro and Philipp Slusallek
2046			* "Numerical Recipes" 3rd edition
2047			* by William H. Press et al.
2048			* "Rational Chevyshev Approximations for the Exponential Integral E_1(x)"
2049			* by W. J. Cody and Henry C. Thacher, Jr.
2050			*
2051			* Any mistakes here are completely my fault. This code has not been
2052			* verified for anything serious. For better results, see:
2053			* http://www.trnicely.net/pi/pix_0000.htm
2054			* which although the author claims are demonstration programs, will
2055			* undoubtedly produce more reliable results than this code does (I don't
2056			* know of any obvious issues with this code, but it just hasn't been used
2057			* by many people).
2058			*/
2059
2060			static LNV const euler_mascheroni = LNVCONST(0.57721566490153286060651209008240243104215933593992);
2061			static LNV const li2 = LNVCONST(1.045163780117492784844588889194613136522615578151);
2062
2063	234		NV Ei(NV x) {
2064			LNV val, term;
2065			unsigned int n;
2066	234		KAHAN_INIT(sum);
2067
2068	234	50	if (x == 0) croak("Invalid input to ExponentialIntegral: x must be != 0");
2069			/* Protect against messed up rounding modes */
2070	234	50	if (x >= 12000) return INFINITY;
2071	234	50	if (x <= -12000) return 0;
2072
2073	234	100	if (x < -1) {
2074			/* Continued fraction, good for x < -1 */
2075	1		LNV lc = 0;
2076	1		LNV ld = LNV_ONE / (LNV_ONE - (LNV)x);
2077	1		val = ld * (-explnv(x));
2078	21	50	for (n = 1; n <= 100000; n++) {
2079			LNV old, t, n2;
2080	20		t = (LNV)(2*n + 1) - (LNV) x;
2081	20		n2 = n * n;
2082	20		lc = LNV_ONE / (t - n2 * lc);
2083	20		ld = LNV_ONE / (t - n2 * ld);
2084	20		old = val;
2085	20		val *= ld/lc;
2086	20	100	if ( fabslnv(val-old) <= LNV_EPSILON*fabslnv(val) )
2087	1		break;
2088			}
2089	233	100	} else if (x < 0) {
2090			/* Rational Chebyshev approximation (Cody, Thacher), good for -1 < x < 0 */
2091			static const LNV C6p[7] = { LNVCONST(-148151.02102575750838086),
2092			LNVCONST( 150260.59476436982420737),
2093			LNVCONST( 89904.972007457256553251),
2094			LNVCONST( 15924.175980637303639884),
2095			LNVCONST( 2150.0672908092918123209),
2096			LNVCONST( 116.69552669734461083368),
2097			LNVCONST( 5.0196785185439843791020) };
2098			static const LNV C6q[7] = { LNVCONST( 256664.93484897117319268),
2099			LNVCONST( 184340.70063353677359298),
2100			LNVCONST( 52440.529172056355429883),
2101			LNVCONST( 8125.8035174768735759866),
2102			LNVCONST( 750.43163907103936624165),
2103			LNVCONST( 40.205465640027706061433),
2104			LNVCONST( 1.0000000000000000000000) };
2105	5		LNV sumn = C6p[0]-x(C6p[1]-x(C6p[2]-x(C6p[3]-x(C6p[4]-x(C6p[5]-xC6p[6])))));
2106	5		LNV sumd = C6q[0]-x(C6q[1]-x(C6q[2]-x(C6q[3]-x(C6q[4]-x(C6q[5]-xC6q[6])))));
2107	5		val = loglnv(-x) - sumn/sumd;
2108	228	50	} else if (x < (-2 * loglnv(LNV_EPSILON))) {
2109			/* Convergent series. Accurate but slow especially with large x. */
2110	228		LNV fact_n = x;
2111	15320	50	for (n = 2; n <= 200; n++) {
2112	15320		LNV invn = LNV_ONE / n;
2113	15320		fact_n = (LNV)x invn;
2114	15320		term = fact_n * invn;
2115	15320		KAHAN_SUM(sum, term);
2116			/* printf("C after adding %.20Lf, val = %.20Lf\n", term, sum); */
2117	15320	100	if (term < LNV_EPSILON*sum) break;
2118			}
2119	228		KAHAN_SUM(sum, euler_mascheroni);
2120	228		KAHAN_SUM(sum, loglnv(x));
2121	228		KAHAN_SUM(sum, x);
2122	228		val = sum;
2123	0	0	} else if (x >= 24) {
2124			/* Cody / Thacher rational Chebyshev */
2125			static const LNV P2[10] = {
2126			LNVCONST( 1.75338801265465972390E02),
2127			LNVCONST(-2.23127670777632409550E02),
2128			LNVCONST(-1.81949664929868906455E01),
2129			LNVCONST(-2.79798528624305389340E01),
2130			LNVCONST(-7.63147701620253630855E00),
2131			LNVCONST(-1.52856623636929636839E01),
2132			LNVCONST(-7.06810977895029358836E00),
2133			LNVCONST(-5.00006640413131002475E00),
2134			LNVCONST(-3.00000000320981265753E00),
2135			LNVCONST( 1.00000000000000485503E00) };
2136			static const LNV Q2[9] = {
2137			LNVCONST( 3.97845977167414720840E04),
2138			LNVCONST( 3.97277109100414518365E00),
2139			LNVCONST( 1.37790390235747998793E02),
2140			LNVCONST( 1.17179220502086455287E02),
2141			LNVCONST( 7.04831847180424675988E01),
2142			LNVCONST(-1.20187763547154743238E01),
2143			LNVCONST(-7.99243595776339741065E00),
2144			LNVCONST(-2.99999894040324959612E00),
2145			LNVCONST( 1.99999999999048104167E00) };
2146	0		LNV invx = LNV_ONE / x, frac = 0.0;
2147	0	0	for (n = 0; n <= 8; n++)
2148	0		frac = Q2[n] / (P2[n] + x + frac);
2149	0		frac += P2[9];
2150	0		val = explnv(x) * (invx + invxinvxfrac);
2151			} else {
2152			/* Asymptotic divergent series */
2153	0		LNV invx = LNV_ONE / x;
2154	0		term = 1.0;
2155	0	0	for (n = 1; n <= 200; n++) {
2156	0		LNV last_term = term;
2157	0		term = term * ( (LNV)n * invx );
2158	0	0	if (term < LNV_EPSILON*sum) break;
2159	0	0	if (term < last_term) {
2160	0		KAHAN_SUM(sum, term);
2161			/* printf("A after adding %.20llf, sum = %.20llf\n", term, sum); */
2162			} else {
2163	0		KAHAN_SUM(sum, (-last_term/3) );
2164			/* printf("A after adding %.20llf, sum = %.20llf\n", -last_term/3, sum); */
2165	0		break;
2166			}
2167			}
2168	0		KAHAN_SUM(sum, LNV_ONE);
2169	0		val = explnv(x) * sum * invx;
2170			}
2171
2172	234		return val;
2173			}
2174
2175	2227		NV Li(NV x) {
2176	2227	50	if (x == 0) return 0;
2177	2227	50	if (x == 1) return -INFINITY;
2178	2227	50	if (x == 2) return li2;
2179	2227	50	if (x < 0) croak("Invalid input to LogarithmicIntegral: x must be >= 0");
2180	2227	50	if (x >= NV_MAX) return INFINITY;
2181
2182			/* Calculate directly using Ramanujan's series. */
2183	2227	50	if (x > 1) {
2184	2227		const LNV logx = loglnv(x);
2185	2227		LNV sum = 0, inner_sum = 0, old_sum, factorial = 1, power2 = 1;
2186	2227		LNV q, p = -1;
2187	2227		int k = 0, n = 0;
2188
2189	89801	50	for (n = 1, k = 0; n < 200; n++) {
2190	89801		factorial *= n;
2191	89801		p *= -logx;
2192	89801		q = factorial * power2;
2193	89801		power2 *= 2;
2194	135349	100	for (; k <= (n - 1) / 2; k++)
2195	45548		inner_sum += LNV_ONE / (2 * k + 1);
2196	89801		old_sum = sum;
2197	89801		sum += (p / q) * inner_sum;
2198	89801	100	if (fabslnv(sum - old_sum) <= LNV_EPSILON) break;
2199			}
2200	2227		return euler_mascheroni + loglnv(logx) + sqrtlnv(x) * sum;
2201			}
2202
2203	0		return Ei(loglnv(x));
2204			}
2205
2206	94		static long double ld_inverse_li(long double lx) {
2207			int i;
2208	94		long double t, term, old_term = 0;
2209			/* Iterate Halley's method until error grows. */
2210	94	50	t = (lx <= 2) ? 2 : lx * logl(lx);
2211	424	100	for (i = 0; i < 4; i++) {
2212	376		long double dn = Li(t) - lx;
2213	376		term = dnlogl(t) / (1.0L + dn/(2t));
2214	376	100	if (i > 0 && fabsl(term) >= fabsl(old_term)) { t -= term/4; break; }
		100
2215	330		old_term = term;
2216	330		t -= term;
2217			}
2218	94		return t;
2219			}
2220
2221	97		UV inverse_li(UV x) {
2222			UV r, i;
2223	97		long double lx = (long double) x;
2224
2225	97	100	if (x <= 2) return x + (x > 0);
2226	94		r = (UV) ceill( ld_inverse_li(lx) );
2227			/* Meet our more stringent goal of an exact answer. */
2228	94	50	i = (x > 4e16) ? 2048 : 128;
2229	94	50	if (Li(r-1) >= lx) {
2230	0	0	while (Li(r-i) >= lx) r -= i;
2231	0	0	for (i = i/2; i > 0; i /= 2)
2232	0	0	if (Li(r-i) >= lx) r -= i;
2233			} else {
2234	94	50	while (Li(r+i-1) < lx) r += i;
2235	752	100	for (i = i/2; i > 0; i /= 2)
2236	658	50	if (Li(r+i-1) < lx) r += i;
2237			}
2238	94		return r;
2239			}
2240
2241	23		static long double ld_inverse_R(long double lx) {
2242			int i;
2243	23		long double t, dn, term, old_term = 0;
2244
2245			/* Rough estimate */
2246	23	50	if (lx <= 3.5) {
2247	0		t = lx + 2.24*(lx-1)/2;
2248			} else {
2249	23		t = lx * logl(lx);
2250	23	50	if (lx < 50) { t *= 1.2; }
2251	23	100	else if (lx < 1000) { t *= 1.15; }
2252			else { /* use inverse Li (one iteration) for first inverse R approx */
2253	22		dn = Li(t) - lx;
2254	22		term = dn * logl(t) / (1.0L + dn/(2*t));
2255	22		t -= term;
2256			}
2257			}
2258			/* Iterate 1-n rounds of Halley, usually only 3 needed. */
2259	116	50	for (i = 0; i < 100; i++) {
2260	116		dn = RiemannR(t) - lx;
2261			#if 1 /* Use f(t) = li(t) for derivatives */
2262	116		term = dn * logl(t) / (1.0L + dn/(2*t));
2263			#else /* Use f(t) = li(t) - li(sqrt(t))/2 for derivatives */
2264			long double logt = logl(t);
2265			long double sqrtt = sqrtl(t);
2266			long double FA = 2 * sqrtt * logt;
2267			long double FB = 2 * sqrtt - 1;
2268			long double ifz = FA / FB;
2269			long double iffz = (logt - 2FB) / (2 sqrtt * FA * FA * FA * FA);
2270			term = dn * ifz * (1.0L - dn * iffz);
2271			#endif
2272	116	100	if (i > 0 && fabsl(term) >= fabsl(old_term)) { t -= term/4; break; }
		100
2273	93		old_term = term;
2274	93		t -= term;
2275			}
2276	23		return t;
2277			}
2278
2279	23		UV inverse_R(UV x) {
2280	23	50	if (x < 2) return x + (x > 0);
2281	23		return (UV) ceill( ld_inverse_R( (long double) x) );
2282			}
2283
2284
2285			/*
2286			* Storing the first 10-20 Zeta values makes sense. Past that it is purely
2287			* to avoid making the call to generate them ourselves. We could cache the
2288			* calculated values. These all have 1 subtracted from them. */
2289			static const long double riemann_zeta_table[] = {
2290			0.6449340668482264364724151666460251892L, /* zeta(2) */
2291			0.2020569031595942853997381615114499908L,
2292			0.0823232337111381915160036965411679028L,
2293			0.0369277551433699263313654864570341681L,
2294			0.0173430619844491397145179297909205279L,
2295			0.0083492773819228268397975498497967596L,
2296			0.0040773561979443393786852385086524653L,
2297			0.0020083928260822144178527692324120605L,
2298			0.0009945751278180853371459589003190170L,
2299			0.0004941886041194645587022825264699365L,
2300			0.0002460865533080482986379980477396710L,
2301			0.0001227133475784891467518365263573957L,
2302			0.0000612481350587048292585451051353337L,
2303			0.0000305882363070204935517285106450626L,
2304			0.0000152822594086518717325714876367220L,
2305			0.0000076371976378997622736002935630292L, /* zeta(17) Past here all we're */
2306			0.0000038172932649998398564616446219397L, /* zeta(18) getting is speed. */
2307			0.0000019082127165539389256569577951013L,
2308			0.0000009539620338727961131520386834493L,
2309			0.0000004769329867878064631167196043730L,
2310			0.0000002384505027277329900036481867530L,
2311			0.0000001192199259653110730677887188823L,
2312			0.0000000596081890512594796124402079358L,
2313			0.0000000298035035146522801860637050694L,
2314			0.0000000149015548283650412346585066307L,
2315			0.0000000074507117898354294919810041706L,
2316			0.0000000037253340247884570548192040184L,
2317			0.0000000018626597235130490064039099454L,
2318			0.0000000009313274324196681828717647350L,
2319			0.0000000004656629065033784072989233251L,
2320			0.0000000002328311833676505492001455976L,
2321			0.0000000001164155017270051977592973835L,
2322			0.0000000000582077208790270088924368599L,
2323			0.0000000000291038504449709968692942523L,
2324			0.0000000000145519218910419842359296322L,
2325			0.0000000000072759598350574810145208690L,
2326			0.0000000000036379795473786511902372363L,
2327			0.0000000000018189896503070659475848321L,
2328			0.0000000000009094947840263889282533118L,
2329			0.0000000000004547473783042154026799112L,
2330			0.0000000000002273736845824652515226821L,
2331			0.0000000000001136868407680227849349105L,
2332			0.0000000000000568434198762758560927718L,
2333			0.0000000000000284217097688930185545507L,
2334			0.0000000000000142108548280316067698343L,
2335			0.00000000000000710542739521085271287735L,
2336			0.00000000000000355271369133711367329847L,
2337			0.00000000000000177635684357912032747335L,
2338			0.000000000000000888178421093081590309609L,
2339			0.000000000000000444089210314381336419777L,
2340			0.000000000000000222044605079804198399932L,
2341			0.000000000000000111022302514106613372055L,
2342			0.0000000000000000555111512484548124372374L,
2343			0.0000000000000000277555756213612417258163L,
2344			0.0000000000000000138777878097252327628391L,
2345			};
2346			#define NPRECALC_ZETA (sizeof(riemann_zeta_table)/sizeof(riemann_zeta_table[0]))
2347
2348			/* Riemann Zeta on the real line, with 1 subtracted.
2349			* Compare to Math::Cephes zetac. Also zeta with q=1 and subtracting 1.
2350			*
2351			* The Cephes zeta function uses a series (2k)!/B_2k which converges rapidly
2352			* and has a very wide range of values. We use it here for some values.
2353			*
2354			* Note: Calculations here are done on long doubles and we try to generate as
2355			* much accuracy as possible. They will get returned to Perl as an NV,
2356			* which is typically a 64-bit double with 15 digits.
2357			*
2358			* For values 0.5 to 5, this code uses the rational Chebyshev approximation
2359			* from Cody and Thacher. This method is extraordinarily fast and very
2360			* accurate over its range (slightly better than Cephes for most values). If
2361			* we had quad floats, we could use the 9-term polynomial.
2362			*/
2363	3049		long double ld_riemann_zeta(long double x) {
2364			int i;
2365
2366	3049	50	if (x < 0) croak("Invalid input to RiemannZeta: x must be >= 0");
2367	3049	50	if (x == 1) return INFINITY;
2368
2369	3049	100	if (x == (unsigned int)x) {
2370	3045		int k = x - 2;
2371	3045	50	if ((k >= 0) && (k < (int)NPRECALC_ZETA))
		100
2372	2		return riemann_zeta_table[k];
2373			}
2374
2375			/* Cody / Thacher rational Chebyshev approximation for small values */
2376	3047	50	if (x >= 0.5 && x <= 5.0) {
		100
2377			static const long double C8p[9] = { 1.287168121482446392809e10L,
2378			1.375396932037025111825e10L,
2379			5.106655918364406103683e09L,
2380			8.561471002433314862469e08L,
2381			7.483618124380232984824e07L,
2382			4.860106585461882511535e06L,
2383			2.739574990221406087728e05L,
2384			4.631710843183427123061e03L,
2385			5.787581004096660659109e01L };
2386			static const long double C8q[9] = { 2.574336242964846244667e10L,
2387			5.938165648679590160003e09L,
2388			9.006330373261233439089e08L,
2389			8.042536634283289888587e07L,
2390			5.609711759541920062814e06L,
2391			2.247431202899137523543e05L,
2392			7.574578909341537560115e03L,
2393			-2.373835781373772623086e01L,
2394			1.000000000000000000000L };
2395	2		long double sumn = C8p[0]+x(C8p[1]+x(C8p[2]+x(C8p[3]+x(C8p[4]+x(C8p[5]+x(C8p[6]+x(C8p[7]+xC8p[8])))))));
2396	2		long double sumd = C8q[0]+x(C8q[1]+x(C8q[2]+x(C8q[3]+x(C8q[4]+x(C8q[5]+x(C8q[6]+x(C8q[7]+xC8q[8])))))));
2397	2		long double sum = (sumn - (x-1)sumd) / ((x-1)sumd);
2398	2		return sum;
2399			}
2400
2401	3045	50	if (x > 17000.0)
2402	0		return 0.0;
2403
2404			#if 0
2405			{
2406			KAHAN_INIT(sum);
2407			/* Simple defining series, works well. */
2408			for (i = 5; i <= 1000000; i++) {
2409			long double term = powl(i, -x);
2410			KAHAN_SUM(sum, term);
2411			if (term < LDBL_EPSILON*sum) break;
2412			}
2413			KAHAN_SUM(sum, powl(4, -x) );
2414			KAHAN_SUM(sum, powl(3, -x) );
2415			KAHAN_SUM(sum, powl(2, -x) );
2416			return sum;
2417			}
2418			#endif
2419
2420			/* The 2n!/B_2k series used by the Cephes library. */
2421			{
2422			/* gp/pari:
2423			* for(i=1,13,printf("%.38g\n",(2i)!/bernreal(2i)))
2424			* MPU:
2425			* use bignum;
2426			* say +(factorial(2$_)/bernreal(2$_))->bround(38) for 1..13;
2427			*/
2428			static const long double A[] = {
2429			12.0L,
2430			-720.0L,
2431			30240.0L,
2432			-1209600.0L,
2433			47900160.0L,
2434			-1892437580.3183791606367583212735166425L,
2435			74724249600.0L,
2436			-2950130727918.1642244954382084600497650L,
2437			116467828143500.67248729113000661089201L,
2438			-4597978722407472.6105457273596737891656L,
2439			181521054019435467.73425331153534235290L,
2440			-7166165256175667011.3346447367083352775L,
2441			282908877253042996618.18640556532523927L,
2442			};
2443			long double a, b, s, t;
2444	3045		const long double w = 10.0;
2445	3045		s = 0.0;
2446	3045		b = 0.0;
2447	9778	100	for (i = 2; i < 11; i++) {
2448	9776		b = powl( i, -x );
2449	9776		s += b;
2450	9776	100	if (fabsl(b) < fabsl(LDBL_EPSILON * s))
2451	3043		return s;
2452			}
2453	2		s = s + bw/(x-1.0) - 0.5 b;
2454	2		a = 1.0;
2455	19	50	for (i = 0; i < 13; i++) {
2456	19		long double k = 2*i;
2457	19		a *= x + k;
2458	19		b /= w;
2459	19		t = a*b/A[i];
2460	19		s = s + t;
2461	19	100	if (fabsl(t) < fabsl(LDBL_EPSILON * s))
2462	2		break;
2463	17		a *= x + k + 1.0;
2464	17		b /= w;
2465			}
2466	2		return s;
2467			}
2468			}
2469
2470	150		long double RiemannR(long double x) {
2471			long double part_term, term, flogx, ki, old_sum;
2472			unsigned int k;
2473	150		KAHAN_INIT(sum);
2474
2475	150	50	if (x <= 0) croak("Invalid input to RiemannR: x must be > 0");
2476
2477	150	100	if (x > 1e19) {
2478	2		const signed char* amob = range_moebius(0, 100);
2479	2		KAHAN_SUM(sum, Li(x));
2480	132	50	for (k = 2; k <= 100; k++) {
2481	132	100	if (amob[k] == 0) continue;
2482	82		ki = 1.0L / (long double) k;
2483	82		part_term = powl(x,ki);
2484	82	50	if (part_term > LDBL_MAX) return INFINITY;
2485	82		term = amob[k] * ki * Li(part_term);
2486	82		old_sum = sum;
2487	82		KAHAN_SUM(sum, term);
2488	82	100	if (fabsl(sum - old_sum) <= LDBL_EPSILON) break;
2489			}
2490	2		Safefree(amob);
2491	2		return sum;
2492			}
2493
2494	148		KAHAN_SUM(sum, 1.0);
2495	148		flogx = logl(x);
2496	148		part_term = 1;
2497
2498	10829	50	for (k = 1; k <= 10000; k++) {
2499	10829	100	ki = (k-1 < NPRECALC_ZETA) ? riemann_zeta_table[k-1] : ld_riemann_zeta(k+1);
2500	10829		part_term *= flogx / k;
2501	10829		term = part_term / (k + k * ki);
2502	10829		old_sum = sum;
2503	10829		KAHAN_SUM(sum, term);
2504			/* printf("R %5d after adding %.18Lg, sum = %.19Lg (%Lg)\n", k, term, sum, fabsl(sum-old_sum)); */
2505	10829	100	if (fabsl(sum - old_sum) <= LDBL_EPSILON) break;
2506			}
2507
2508	148		return sum;
2509			}
2510
2511	8		static long double _lambertw_approx(long double x) {
2512			/* See Veberic 2009 for other approximations */
2513	8	100	if (x < -0.060) { /* Pade(3,2) */
2514	2		long double ti = 5.4365636569180904707205749L * x + 2.0L;
2515	2	50	long double t = (ti <= 0.0L) ? 0.0L : sqrtl(ti);
2516	2		long double t2 = t*t;
2517	2		long double t3 = t*t2;
2518	2		return (-1.0L + (1.0L/6.0L)t + (257.0L/720.0L)t2 + (13.0L/720.0L)t3) / (1.0L + (5.0L/6.0L)t + (103.0L/720.0L)*t2);
2519	6	100	} else if (x < 1.363) { /* Winitzki 2003 section 3.5 */
2520	2		long double l1 = logl(1.0L+x);
2521	2		return l1 * (1.0L - logl(1.0L+l1) / (2.0L+l1));
2522	4	50	} else if (x < 3.7) { /* Modification of Vargas 2013 */
2523	0		long double l1 = logl(x);
2524	0		long double l2 = logl(l1);
2525	0		return l1 - l2 - logl(1.0L - l2/l1)/2.0L;
2526			} else { /* Corless et al. 1993, page 22 */
2527	4		long double l1 = logl(x);
2528	4		long double l2 = logl(l1);
2529	4		long double d1 = 2.0Ll1l1;
2530	4		long double d2 = 3.0Ll1d1;
2531	4		long double d3 = 2.0Ll1d2;
2532	4		long double d4 = 5.0Ll1d3;
2533	4		long double w = l1 - l2 + l2/l1 + l2*(l2-2.0L)/d1;
2534	4		w += l2(6.0L+l2(-9.0L+2.0L*l2))/d2;
2535	4		w += l2(-12.0L+l2(36.0L+l2(-22.0L+3.0Ll2)))/d3;
2536	4		w += l2(60.0L+l2(-300.0L+l2(350.0L+l2(-125.0L+12.0L*l2))))/d4;
2537	4		return w;
2538			}
2539			}
2540
2541	9		NV lambertw(NV x) {
2542			long double w;
2543			int i;
2544
2545	9	50	if (x < -0.36787944117145L)
2546	0		croak("Invalid input to LambertW: x must be >= -1/e");
2547	9	100	if (x == 0.0L) return 0.0L;
2548
2549			/* Estimate initial value */
2550	8		w = _lambertw_approx(x);
2551			/* If input is too small, return .99999.... */
2552	8	50	if (w <= -1.0L) return -1.0L + 8*LDBL_EPSILON;
2553			/* For very small inputs, don't iterate, return approx directly. */
2554	8	100	if (x < -0.36783) return w;
2555
2556			#if 0 /* Halley */
2557			lastw = w;
2558			for (i = 0; i < 100; i++) {
2559			long double ew = expl(w);
2560			long double wew = w * ew;
2561			long double wewx = wew - x;
2562			long double w1 = w + 1;
2563			w = w - wewx / (ew * w1 - (w+2) * wewx/(2*w1));
2564			if (w != 0.0L && fabsl((w-lastw)/w) <= 8*LDBL_EPSILON) break;
2565			lastw = w;
2566			}
2567			#else /* Fritsch, see Veberic 2009. 1-2 iterations are enough. */
2568	30	100	for (i = 0; i < 6 && w != 0.0L; i++) {
		50
2569	28		long double w1 = 1 + w;
2570	28		long double zn = logl((long double)x/w) - w;
2571	28		long double qn = 2 * w1 * (w1+(2.0L/3.0L)*zn);
2572	28		long double en = (zn/w1) * (qn-zn)/(qn-2.0L*zn);
2573			/* w = 1.0L + en; if (fabsl(en) <= 16LDBL_EPSILON) break; */
2574	28		long double wen = w * en;
2575	28		w += wen;
2576	28	100	if (fabsl(wen) <= 64*LDBL_EPSILON) break;
2577			}
2578			#endif
2579			#if LNV_IS_QUAD /* For quadmath, one high precision correction */
2580			if (w != LNV_ZERO) {
2581			LNV lw = w;
2582			LNV w1 = LNV_ONE + lw;
2583			LNV zn = loglnv((LNV)x/lw) - lw;
2584			LNV qn = LNVCONST(2.0) * w1 * (w1+(LNVCONST(2.0)/LNVCONST(3.0))*zn);
2585			LNV en = (zn/w1) * (qn-zn)/(qn-LNVCONST(2.0)*zn);
2586			return lw + lw * en;
2587			}
2588			#endif
2589
2590	7		return w;
2591			}
2592
2593			#if HAVE_STD_U64
2594			#define U64T uint64_t
2595			#else
2596			#define U64T UV
2597			#endif
2598
2599			/* Spigot from Arndt, Haenel, Winter, and Flammenkamp. */
2600			/* Modified for larger digits and rounding by Dana Jacobsen */
2601	987		char* pidigits(int digits)
2602			{
2603			char* out;
2604			uint32_t *a, b, c, d, e, g, i, d4, d3, d2, d1;
2605	987		uint32_t const f = 10000;
2606			U64T d64; /* 64-bit intermediate for 2210000b > 2^32 (~30k digits) /
2607
2608	987	50	if (digits <= 0) return 0;
2609	987	100	if (digits <= DBL_DIG && digits <= 18) {
		50
2610	15		Newz(0, out, 19, char);
2611	15		(void)sprintf(out, "%.*lf", (digits-1), 3.141592653589793238);
2612	15		return out;
2613			}
2614	972		digits++; /* For rounding */
2615	972		c = 14*(digits/4 + 2);
2616	972		New(0, out, digits+5+1, char);
2617	972		out++ = '3'; / We'll turn "31415..." into "3.1415..." */
2618	972	50	New(0, a, c, uint32_t);
2619	1776998	100	for (b = 0; b < c; b++) a[b] = 2000;
2620
2621	972		d = i = 0;
2622	126616	100	while ((b = c -= 14) > 0 && i < (uint32_t)digits) {
		100
2623	125644		d = e = d % f;
2624	125644	50	if (b > 107000) { /* Use 64-bit intermediate while necessary. */
2625	0	0	for (d64 = d; --b > 107000; ) {
2626	0		g = (b << 1) - 1;
2627	0		d64 = d64 * b + f * (U64T)a[b];
2628	0		a[b] = d64 % g;
2629	0		d64 /= g;
2630			}
2631	0		d = d64;
2632	0		b++;
2633			}
2634	148467144	100	while (--b > 0) {
2635	148341500		g = (b << 1) - 1;
2636	148341500		d = d * b + f * a[b];
2637	148341500		a[b] = d % g;
2638	148341500		d /= g;
2639			}
2640			/* sprintf(out+i, "%04d", e+d/f); i += 4; */
2641	125644		d4 = e + d/f;
2642	125644	50	if (d4 > 9999) {
2643	0		d4 -= 10000;
2644	0		out[i-1]++;
2645	0	0	for (b=i-1; out[b] == '0'+1; b--) { out[b]='0'; out[b-1]++; }
2646			}
2647	125644		d3 = d4/10; d2 = d3/10; d1 = d2/10;
2648	125644		out[i++] = '0' + d1;
2649	125644		out[i++] = '0' + d2-d1*10;
2650	125644		out[i++] = '0' + d3-d2*10;
2651	125644		out[i++] = '0' + d4-d3*10;
2652			}
2653	972		Safefree(a);
2654	972	100	if (out[digits-1] >= '5') out[digits-2]++; /* Round */
2655	1042	100	for (i = digits-2; out[i] == '9'+1; i--) /* Keep rounding */
2656	70		{ out[i] = '0'; out[i-1]++; }
2657	972		digits--; /* Undo the extra digit we used for rounding */
2658	972		out[digits] = '\0';
2659	972		*out-- = '.';
2660	972		return out;
2661			}
2662
2663			/* 1. Perform signed integer validation on b/blen.
2664			* 2. Compare to a/alen using min or max based on first arg.
2665			* 3. Return 0 to select a, 1 to select b.
2666			*/
2667	4		int strnum_minmax(int min, char* a, STRLEN alen, char* b, STRLEN blen)
2668			{
2669			int aneg, bneg;
2670			STRLEN i;
2671			/* a is checked, process b */
2672	4	50	if (b == 0 \|\| blen == 0) croak("Parameter must be a positive integer");
		50
2673	4		bneg = (b[0] == '-');
2674	4	50	if (b[0] == '-' \|\| b[0] == '+') { b++; blen--; }
		50
2675	4	50	while (blen > 0 && *b == '0') { b++; blen--; }
		50
2676	84	100	for (i = 0; i < blen; i++)
2677	80	50	if (!isDIGIT(b[i]))
2678	0		break;
2679	4	50	if (blen == 0 \|\| i < blen)
		50
2680	0		croak("Parameter must be a positive integer");
2681
2682	4	100	if (a == 0) return 1;
2683
2684	2		aneg = (a[0] == '-');
2685	2	50	if (a[0] == '-' \|\| a[0] == '+') { a++; alen--; }
		50
2686	2	50	while (alen > 0 && *a == '0') { a++; alen--; }
		50
2687
2688	2	50	if (aneg != bneg) return min ? (bneg == 1) : (aneg == 1);
		0
2689	2	50	if (aneg == 1) min = !min;
2690	2	50	if (alen != blen) return min ? (alen > blen) : (blen > alen);
		0
2691
2692	2	50	for (i = 0; i < blen; i++)
2693	2	50	if (a[i] != b[i])
2694	2	100	return min ? (a[i] > b[i]) : (b[i] > a[i]);
2695	0		return 0; /* equal */
2696			}
2697
2698	6		int from_digit_string(UV* rn, const char* s, int base)
2699			{
2700	6		UV max, n = 0;
2701			int i, len;
2702
2703			/* Skip leading -/+ and zeros */
2704	6	50	if (s[0] == '-' \|\| s[0] == '+') s++;
		50
2705	6	50	while (s[0] == '0') s++;
2706
2707	6		len = strlen(s);
2708	6		max = (UV_MAX-base+1)/base;
2709
2710	39	100	for (i = 0, len = strlen(s); i < len; i++) {
2711	34		const char c = s[i];
2712	34	50	int d = !isalnum(c) ? 255 : (c <= '9') ? c-'0' : (c <= 'Z') ? c-'A'+10 : c-'a'+10;
		100
		50
2713	34	50	if (d >= base) croak("Invalid digit for base %d", base);
2714	34	100	if (n > max) return 0; /* Overflow */
2715	33		n = n * base + d;
2716			}
2717	5		*rn = n;
2718	5		return 1;
2719			}
2720
2721	13		int from_digit_to_UV(UV* rn, UV* r, int len, int base)
2722			{
2723	13		UV d, n = 0;
2724			int i;
2725	13	50	if (len < 0 \|\| len > BITS_PER_WORD)
		50
2726	0		return 0;
2727	76	100	for (i = 0; i < len; i++) {
2728	63		d = r[i];
2729	63	50	if (n > (UV_MAX-d)/base) break; /* overflow */
2730	63		n = n * base + d;
2731			}
2732	13		*rn = n;
2733	13		return (i >= len);
2734			}
2735
2736
2737	0		int from_digit_to_str(char** rstr, UV* r, int len, int base)
2738			{
2739			char so, s;
2740			int i;
2741
2742	0	0	if (len < 0 \|\| !(base == 2 \|\| base == 10 \|\| base == 16)) return 0;
		0
		0
		0
2743
2744	0	0	if (r[0] >= (UV) base) return 0; /* TODO: We don't apply extended carry */
2745
2746	0		New(0, so, len + 3, char);
2747	0		s = so;
2748	0	0	if (base == 2 \|\| base == 16) {
		0
2749	0		*s++ = '0';
2750	0	0	*s++ = (base == 2) ? 'b' : 'x';
2751			}
2752	0	0	for (i = 0; i < len; i++) {
2753	0		UV d = r[i];
2754	0	0	s[i] = (d < 10) ? '0'+d : 'a'+d-10;
2755			}
2756	0		s[len] = '\0';
2757	0		*rstr = so;
2758	0		return 1;
2759			}
2760
2761	17		int to_digit_array(int* bits, UV n, int base, int length)
2762			{
2763			int d;
2764
2765	17	50	if (base < 2 \|\| length > 128) return -1;
		50
2766
2767	17	100	if (base == 2) {
2768	87	100	for (d = 0; n; n >>= 1)
2769	77		bits[d++] = n & 1;
2770			} else {
2771	31	100	for (d = 0; n; n /= base)
2772	24		bits[d++] = n % base;
2773			}
2774	17	100	if (length < 0) length = d;
2775	43	100	while (d < length)
2776	26		bits[d++] = 0;
2777	17		return length;
2778			}
2779
2780	3		int to_digit_string(char* s, UV n, int base, int length)
2781			{
2782			int digits[128];
2783	3		int i, len = to_digit_array(digits, n, base, length);
2784
2785	3	50	if (len < 0) return -1;
2786	3	50	if (base > 36) croak("invalid base for string: %d", base);
2787
2788	21	100	for (i = 0; i < len; i++) {
2789	18		int dig = digits[len-i-1];
2790	18	50	s[i] = (dig < 10) ? '0'+dig : 'a'+dig-10;
2791			}
2792	3		s[len] = '\0';
2793	3		return len;
2794			}
2795
2796	1		int to_string_128(char str[40], IV hi, UV lo)
2797			{
2798	1		int i, slen = 0, isneg = 0;
2799
2800	1	50	if (hi < 0) {
2801	0		isneg = 1;
2802	0		hi = -(hi+1);
2803	0		lo = UV_MAX - lo + 1;
2804			}
2805			#if BITS_PER_WORD == 64 && HAVE_UINT128
2806			{
2807	1		uint128_t dd, sum = (((uint128_t) hi) << 64) + lo;
2808			do {
2809	20		dd = sum / 10;
2810	20		str[slen++] = '0' + (sum - dd*10);
2811	20		sum = dd;
2812	20	100	} while (sum);
2813			}
2814			#else
2815			{
2816			UV d, r;
2817			uint32_t a[4];
2818			a[0] = hi >> (BITS_PER_WORD/2);
2819			a[1] = hi & (UV_MAX >> (BITS_PER_WORD/2));
2820			a[2] = lo >> (BITS_PER_WORD/2);
2821			a[3] = lo & (UV_MAX >> (BITS_PER_WORD/2));
2822			do {
2823			r = a[0];
2824			d = r/10; r = ((r-d*10) << (BITS_PER_WORD/2)) + a[1]; a[0] = d;
2825			d = r/10; r = ((r-d*10) << (BITS_PER_WORD/2)) + a[2]; a[1] = d;
2826			d = r/10; r = ((r-d*10) << (BITS_PER_WORD/2)) + a[3]; a[2] = d;
2827			d = r/10; r = r-d*10; a[3] = d;
2828			str[slen++] = '0'+(r%10);
2829			} while (a[0] \|\| a[1] \|\| a[2] \|\| a[3]);
2830			}
2831			#endif
2832			/* Reverse the order */
2833	11	100	for (i=0; i < slen/2; i++) {
2834	10		char t=str[i];
2835	10		str[i]=str[slen-i-1];
2836	10		str[slen-i-1] = t;
2837			}
2838			/* Prepend a negative sign if needed */
2839	1	50	if (isneg) {
2840	0	0	for (i = slen; i > 0; i--)
2841	0		str[i] = str[i-1];
2842	0		str[0] = '-';
2843	0		slen++;
2844			}
2845			/* Add terminator */
2846	1		str[slen] = '\0';
2847	1		return slen;
2848			}
2849
2850			/* Oddball primality test.
2851			* In this file rather than primality.c because it uses factoring (!).
2852			* Algorithm from Charles R Greathouse IV, 2015 */
2853	472642		static INLINE uint32_t _catalan_v32(uint32_t n, uint32_t p) {
2854	472642		uint32_t s = 0;
2855	946183	100	while (n /= p) s += n % 2;
2856	472642		return s;
2857			}
2858	0		static INLINE uint32_t _catalan_v(UV n, UV p) {
2859	0		uint32_t s = 0;
2860	0	0	while (n /= p) s += n % 2;
2861	0		return s;
2862			}
2863	901451		static UV _catalan_mult(UV m, UV p, UV n, UV a) {
2864	901451	100	if (p > a) {
2865	428809		m = mulmod(m, p, n);
2866			} else {
2867	472642	50	UV pow = (n <= 4294967295UL) ? _catalan_v32(a<<1,p) : _catalan_v(a<<1,p);
2868	472642		m = (pow == 0) ? m
2869	658853	100	: (pow == 1) ? mulmod(m,p,n)
2870	186211	100	: mulmod(m,powmod(p,pow,n),n);
2871			}
2872	901451		return m;
2873			}
2874	5		static int _catalan_vtest(UV n, UV p) {
2875	18	100	while (n /= p)
2876	13	50	if (n % 2)
2877	0		return 1;
2878	5		return 0;
2879			}
2880	3		int is_catalan_pseudoprime(UV n) {
2881			UV m, a;
2882			int i;
2883
2884	3	50	if (n < 2 \|\| ((n % 2) == 0 && n != 2)) return 0;
		50
		0
2885	3	50	if (is_prob_prime(n)) return 1;
2886
2887	3		m = 1;
2888	3		a = n >> 1;
2889			/*
2890			* Ideally we could use some of the requirements for a mod 4/8/64 here:
2891			* http://www.combinatorics.net/conf/Z60/sp/sp/Shu-Chung%20Liu.pdf
2892			* But, how do we make +/-2 = X mod n into a solution for x = X mod 8?
2893			*
2894			* We could also exploit the exhaustive testing that shows there only
2895			* exist three below 1e10: 5907, 1194649, and 12327121.
2896			*/
2897			{
2898			UV factors[MPU_MAX_FACTORS+1];
2899	3		int nfactors = factor_exp(n, factors, 0);
2900			#if BITS_PER_WORD == 32
2901			if (nfactors == 2) return 0; /* Page 9, all 32-bit semiprimes */
2902			#else
2903	3	50	if (nfactors == 2) { /* Conditions from Aebi and Cairns (2008) */
2904	0	0	if (n < UVCONST(10000000000)) return 0; /* Page 9 */
2905	0	0	if (2factors[0]+1 >= factors[1]) return 0; / Corollary 2 and 3 */
2906			}
2907			#endif
2908			/* Test every factor */
2909	8	100	for (i = 0; i < nfactors; i++) {
2910	5	50	if (_catalan_vtest(a << 1, factors[i]))
2911	0		return 0;
2912			}
2913			}
2914			{
2915			UV seg_base, seg_low, seg_high;
2916			unsigned char* segment;
2917			void* ctx;
2918	3		m = _catalan_mult(m, 2, n, a);
2919	3		m = _catalan_mult(m, 3, n, a);
2920	3		m = _catalan_mult(m, 5, n, a);
2921	3		ctx = start_segment_primes(7, n, &segment);
2922	19	100	while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
2923	957825	50	START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
		100
		50
		100
		100
2924	901442		m = _catalan_mult(m, p, n, a);
2925	56367		} END_DO_FOR_EACH_SIEVE_PRIME
2926			}
2927	3		end_segment_primes(ctx);
2928			}
2929	3	100	return (a & 1) ? (m==(n-1)) : (m==1);
2930			}
2931
2932			/* If we have fast CTZ, use this GCD. See Brent Alg V and FLINT Abhinav Baid */
2933	35602		UV gcdz(UV x, UV y) {
2934			UV f, x2, y2;
2935
2936	35602	100	if (x == 0) return y;
2937
2938	35592	100	if (y & 1) { /* Optimize y odd */
2939	19773	50	x >>= ctz(x);
2940	390518	100	while (x != y) {
2941	370745	100	if (x < y) { y -= x; y >>= ctz(y); }
		50
2942	318953	50	else { x -= y; x >>= ctz(x); }
2943			}
2944	19773		return x;
2945			}
2946
2947	15819	100	if (y == 0) return x;
2948
2949			/* Alternately: f = ctz(x\|y); x >>= ctz(x); y >>= ctz(y); */
2950	15818	50	x2 = ctz(x);
2951	15818	50	y2 = ctz(y);
2952	15818		f = (x2 <= y2) ? x2 : y2;
2953	15818		x >>= x2;
2954	15818		y >>= y2;
2955
2956	203676	100	while (x != y) {
2957	187858	100	if (x < y) {
2958	14906		y -= x;
2959	14906	50	y >>= ctz(y);
2960			} else {
2961	172952		x -= y;
2962	172952	50	x >>= ctz(x);
2963			}
2964			}
2965	15818		return x << f;
2966			}
2967
2968			/* The intermediate values are so large that we can only stay in 64-bit
2969			* up to 53 or so using the divisor_sum calculations. So just use a table.
2970			* Save space by just storing the 32-bit values. */
2971			static const int32_t tau_table[] = {
2972			0,1,-24,252,-1472,4830,-6048,-16744,84480,-113643,-115920,534612,-370944,-577738,401856,1217160,987136,-6905934,2727432,10661420,-7109760,-4219488,-12830688,18643272,21288960,-25499225,13865712,-73279080,24647168,128406630,-29211840,-52843168,-196706304,134722224,165742416,-80873520,167282496,-182213314,-255874080,-145589976,408038400,308120442,101267712,-17125708,-786948864,-548895690,-447438528
2973			};
2974			#define NTAU (sizeof(tau_table)/sizeof(tau_table[0]))
2975	10		IV ramanujan_tau(UV n) {
2976	10	100	return (n < NTAU) ? tau_table[n] : 0;
2977			}
2978
2979	405		static UV _count_class_div(UV s, UV b2) {
2980	405		UV h = 0, i, ndivisors, *divs, lim;
2981
2982	405		lim = isqrt(b2);
2983	405	100	if (lim*lim == b2) lim--;
2984	405	100	if (s > lim) return 0;
2985
2986	362	100	if ((lim-s) < 70) { /* Iterate looking for divisors */
2987	7104	100	for (i = s; i <= lim; i++)
2988	6767	100	if (b2 % i == 0)
2989	106		h++;
2990			} else { /* Walk through all the divisors */
2991	25		divs = _divisor_list(b2, &ndivisors);
2992	56	50	for (i = 0; i < ndivisors && divs[i] <= lim; i++)
		100
2993	31	100	if (divs[i] >= s)
2994	6		h++;
2995	25		Safefree(divs);
2996			}
2997	405		return h;
2998			}
2999
3000			/* Returns 12 * H(n). See Cohen 5.3.5 or Pari/GP.
3001			* Pari/GP uses a different method for n > 500000, which is quite a bit
3002			* faster, but assumes the GRH. */
3003	96		IV hclassno(UV n) {
3004	96		UV nmod4 = n % 4, b2, b, h;
3005			int square;
3006
3007	96	100	if (n == 0) return -1;
3008	95	100	if (nmod4 == 1 \|\| nmod4 == 2) return 0;
		100
3009	74	100	if (n == 3) return 4;
3010
3011	73		b = n & 1;
3012	73		b2 = (n+1) >> 2;
3013	73		square = is_perfect_square(b2);
3014
3015	73		h = divisor_sum(b2,0) >> 1;
3016	73	100	if (b == 1)
3017	18		h = 1 + square + ((h - 1) << 1);
3018	73		b += 2;
3019
3020	478	100	for (; b2 = (n + bb) >> 2, 3b2 < n; b += 2) {
3021	810		h += (b2 % b == 0)
3022	405		+ is_perfect_square(b2)
3023	405		+ (_count_class_div(b+1, b2) << 1);
3024			}
3025	73	100	return 12h + ((b23 == n) ? 4 : square && !(n&1) ? 6 : 0);
		100
		100
3026			}
3027
3028	25300		UV polygonal_root(UV n, UV k, int* overflow) {
3029			UV D, R;
3030	25300	50	MPUassert(k >= 3, "is_polygonal root < 3");
3031	25300		*overflow = 0;
3032	25300	100	if (n <= 1) return n;
3033	25254	100	if (k == 4) return is_perfect_square(n) ? isqrt(n) : 0;
		100
3034	25056	100	if (k == 3) {
3035	108	50	if (n >= UV_MAX/8) *overflow = 1;
3036	108		D = n << 3;
3037	108		R = 1;
3038			} else {
3039	24948	50	if (k > UV_MAX/k \|\| n > UV_MAX/(8k-16)) overflow = 1;
		50
3040	24948		D = (8k-16) n;
3041	24948		R = (k-4) * (k-4);
3042			}
3043	25056	50	if (D+R <= D) *overflow = 1;
3044	25056		D += R;
3045	25056	50	if (*overflow \|\| !is_perfect_square(D)) return 0;
		100
3046	918		D = isqrt(D) + (k-4);
3047	918		R = 2*k - 4;
3048	918	100	if ((D % R) != 0) return 0;
3049	396		return D/R;
3050			}
3051
3052			/* These rank/unrank are O(n^2) algorithms using O(n) in-place space.
3053			* Bonet 2008 gives O(n log n) algorithms using a bit more space.
3054			*/
3055
3056	5		int num_to_perm(UV k, int n, int *vec) {
3057	5		int i, j, t, si = 0;
3058	5		UV f = factorial(n-1);
3059
3060	8	100	while (f == 0) /* We can handle n! overflow if we have a valid k */
3061	3		f = factorial(n - 1 - ++si);
3062
3063	5	100	if (k/f >= (UV)n)
3064	1		k %= f*n;
3065
3066	50	100	for (i = 0; i < n; i++)
3067	45		vec[i] = i;
3068	42	100	for (i = si; i < n-1; i++) {
3069	37		UV p = k/f;
3070	37		k -= p*f;
3071	37		f /= n-i-1;
3072	37	100	if (p > 0) {
3073	66	100	for (j = i+p, t = vec[j]; j > i; j--)
3074	47		vec[j] = vec[j-1];
3075	19		vec[i] = t;
3076			}
3077			}
3078	5		return 1;
3079			}
3080
3081	6		int perm_to_num(int n, int vec, UV rank) {
3082			int i, j, k;
3083	6		UV f, num = 0;
3084	6		f = factorial(n-1);
3085	6	100	if (f == 0) return 0;
3086	42	100	for (i = 0; i < n-1; i++) {
3087	340	100	for (j = i+1, k = 0; j < n; j++)
3088	302	100	if (vec[j] < vec[i])
3089	137		k++;
3090	38	50	if ((UV)k > (UV_MAX-num)/f) return 0; /* overflow */
3091	38		num += k*f;
3092	38		f /= n-i-1;
3093			}
3094	4		*rank = num;
3095	4		return 1;
3096			}
3097
3098			/*
3099			* For k
3100			* https://www.math.upenn.edu/~wilf/website/CombinatorialAlgorithms.pdf
3101			* Note it requires an O(k) complete shuffle as the results are sorted.
3102			*
3103			* This seems to be 4-100x faster than NumPy's random.{permutation,choice}
3104			* for n under 100k or so. It's even faster with larger n. For example
3105			* from numpy.random import choice; choice(100000000, 4, replace=False)
3106			* uses 774MB and takes 55 seconds. We take less than 1 microsecond.
3107			*/
3108	3		void randperm(void* ctx, UV n, UV k, UV *S) {
3109			UV i, j;
3110
3111	3	50	if (k > n) k = n;
3112
3113	3	50	if (k == 0) { /* 0 of n */
3114	3	100	} else if (k == 1) { /* 1 of n. Pick one at random */
3115	1		S[0] = urandomm64(ctx,n);
3116	2	50	} else if (k == 2 && n == 2) { /* 2 of 2. Flip a coin */
		0
3117	0		S[0] = urandomb(ctx,1);
3118	0		S[1] = 1-S[0];
3119	2	50	} else if (k == 2) { /* 2 of n. Pick 2 skipping dup */
3120	0		S[0] = urandomm64(ctx,n);
3121	0		S[1] = urandomm64(ctx,n-1);
3122	0	0	if (S[1] >= S[0]) S[1]++;
3123	2	50	} else if (k < n/100 && k < 30) { /* k of n. Pick k with loop */
		0
3124	0	0	for (i = 0; i < k; i++) {
3125			do {
3126	0		S[i] = urandomm64(ctx,n);
3127	0	0	for (j = 0; j < i; j++)
3128	0	0	if (S[j] == S[i])
3129	0		break;
3130	0	0	} while (j < i);
3131			}
3132	2	50	} else if (k < n/100 && n > 1000000) {/* k of n. Pick k with dedup retry */
		0
3133	0	0	for (j = 0; j < k; ) {
3134	0	0	for (i = j; i < k; i++) /* Fill S[j .. k-1] then sort S */
3135	0		S[i] = urandomm64(ctx,n);
3136	0		qsort(S, k, sizeof(UV), _numcmp);
3137	0	0	for (j = 0, i = 1; i < k; i++) /* Find and remove dups. O(n). */
3138	0	0	if (S[j] != S[i])
3139	0		S[++j] = S[i];
3140	0		j++;
3141			}
3142			/* S is sorted unique k-selection of 0 to n-1. Shuffle. */
3143	0	0	for (i = 0; i < k; i++) {
3144	0		j = urandomm64(ctx,k-i);
3145	0		{ UV t = S[i]; S[i] = S[i+j]; S[i+j] = t; }
3146			}
3147	2	100	} else if (k < n/4) { /* k of n. Pick k with mask */
3148	1		uint32_t *mask, smask[8] = {0};
3149	1	50	if (n <= 32*8) mask = smask;
3150	0	0	else Newz(0, mask, n/32 + ((n%32)?1:0), uint32_t);
		0
		0
3151	5	100	for (i = 0; i < k; i++) {
3152			do {
3153	4		j = urandomm64(ctx,n);
3154	4	50	} while ( mask[j>>5] & (1U << (j&0x1F)) );
3155	4		S[i] = j;
3156	4		mask[j>>5] \|= (1U << (j&0x1F));
3157			}
3158	1	50	if (mask != smask) Safefree(mask);
3159	1	50	} else if (k < n) { /* k of n. FYK shuffle n, pick k */
3160			UV *T;
3161	0	0	New(0, T, n, UV);
3162	0	0	for (i = 0; i < n; i++)
3163	0		T[i] = i;
3164	0	0	for (i = 0; i < k && i <= n-2; i++) {
		0
3165	0		j = urandomm64(ctx,n-i);
3166	0		S[i] = T[i+j];
3167	0		T[i+j] = T[i];
3168			}
3169	0		Safefree(T);
3170			} else { /* n of n. FYK shuffle. */
3171	101	100	for (i = 0; i < n; i++)
3172	100		S[i] = i;
3173	100	50	for (i = 0; i < k && i <= n-2; i++) {
		100
3174	99		j = urandomm64(ctx,n-i);
3175	99		{ UV t = S[i]; S[i] = S[i+j]; S[i+j] = t; }
3176			}
3177			}
3178	3		}
3179
3180	1		UV random_factored_integer(void* ctx, UV n, int nf, UV factors) {
3181			UV r, s, nfac;
3182	1	50	if (n < 1)
3183	0		return 0;
3184			#if BITS_PER_WORD == 64 && (USE_MONTMATH \|\| MULMODS_ARE_FAST)
3185			if (1) /* Our factoring is very fast, just use it */
3186			#elif BITS_PER_WORD == 64
3187			if (n < UVCONST(1000000000000))
3188			#endif
3189			{
3190	1		r = 1 + urandomm64(ctx, n);
3191	1		*nf = factor(r, factors);
3192	1		return r;
3193			}
3194			do { /* Kalai's algorithm */
3195			for (s = n, r = 1, nfac = 0; s > 1; ) {
3196			s = 1 + urandomm64(ctx, s);
3197			if (!is_prime(s)) continue;
3198			if (s > n / r) { r = 0; break; } /* overflow */
3199			r *= s;
3200			factors[nfac++] = s;
3201			}
3202			} while (r == 0 \|\| r > n \|\| (1 + urandomm64(ctx,n)) > r);
3203			*nf = nfac;
3204			return r;
3205			}