File Coverage

factor.c

Criterion	Covered	Total	%
statement	624	859	72.6
branch	465	902	51.5
condition			n/a
subroutine			n/a
pod			n/a
total	1089	1761	61.8

line	stmt	bran	code
1			#include
2			#include
3			#include
4			#include
5
6			#define FUNC_isqrt 1
7			#define FUNC_gcd_ui 1
8			#define FUNC_is_perfect_square 1
9			#define FUNC_clz 1
10			#include "ptypes.h"
11			#include "factor.h"
12			#include "sieve.h"
13			#include "util.h"
14			#include "mulmod.h"
15			#include "cache.h"
16			#include "primality.h"
17			#include "montmath.h"
18
19			/*
20			* You need to remember to use UV for unsigned and IV for signed types that
21			* are large enough to hold our data.
22			* If you use int, that's 32-bit on LP64 and LLP64 machines. You lose.
23			* If you use long, that's 32-bit on LLP64 machines. You lose.
24			* If you use long long, you may be too large which isn't so bad, but some
25			* compilers may not understand the type at all.
26			* perl.h already figured all this out, and provided us with these types which
27			* match the native integer type used inside our Perl, so just use those.
28			*/
29
30			static const unsigned short primes_small[] =
31			{0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
32			101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,
33			193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,
34			293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,
35			409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503,509,
36			521,523,541,547,557,563,569,571,577,587,593,599,601,607,613,617,619,631,
37			641,643,647,653,659,661,673,677,683,691,701,709,719,727,733,739,743,751,
38			757,761,769,773,787,797,809,811,821,823,827,829,839,853,857,859,863,877,
39			881,883,887,907,911,919,929,937,941,947,953,967,971,977,983,991,997,1009,
40			1013,1019,1021,1031,1033,1039,1049,1051,1061,1063,1069,1087,1091,1093,
41			1097,1103,1109,1117,1123,1129,1151,1153,1163,1171,1181,1187,1193,1201,
42			1213,1217,1223,1229,1231,1237,1249,1259,1277,1279,1283,1289,1291,1297,
43			1301,1303,1307,1319,1321,1327,1361,1367,1373,1381,1399,1409,1423,1427,
44			1429,1433,1439,1447,1451,1453,1459,1471,1481,1483,1487,1489,1493,1499,
45			1511,1523,1531,1543,1549,1553,1559,1567,1571,1579,1583,1597,1601,1607,
46			1609,1613,1619,1621,1627,1637,1657,1663,1667,1669,1693,1697,1699,1709,
47			1721,1723,1733,1741,1747,1753,1759,1777,1783,1787,1789,1801,1811,1823,
48			1831,1847,1861,1867,1871,1873,1877,1879,1889,1901,1907,1913,1931,1933,
49			1949,1951,1973,1979,1987,1993,1997,1999,2003,2011};
50			#define NPRIMES_SMALL (sizeof(primes_small)/sizeof(primes_small[0]))
51
52
53			/* The main factoring loop */
54			/* Puts factors in factors[] and returns the number found. */
55	33626		int factor(UV n, UV *factors)
56			{
57	33626		int nsmallfactors, nfactors = 0; /* Number of factored in factors result */
58	33626		unsigned int f = 7;
59
60	33626	100	if (n > 1) {
61	47739	100	while ( (n & 1) == 0 ) { factors[nfactors++] = 2; n /= 2; }
62	43709	100	while ( (n % 3) == 0 ) { factors[nfactors++] = 3; n /= 3; }
63	39357	100	while ( (n % 5) == 0 ) { factors[nfactors++] = 5; n /= 5; }
64			}
65
66	33626	100	if (f*f <= n) {
67	26727		UV const lastsp = 83;
68	26727		UV sp = 4;
69			/* Trial division from 7 to 421. Use 32-bit if possible. */
70	26727	100	if (n <= 4294967295U) {
71	26639		unsigned int un = n;
72	226594	100	while (sp < lastsp) {
73	239613	100	while ( (un%f) == 0 ) {
74	13129		factors[nfactors++] = f;
75	13129		un /= f;
76			}
77	226484		f = primes_small[++sp];
78	226484	100	if (f*f > un) break;
79			}
80	26639		n = un;
81			} else {
82	6312	100	while (sp < lastsp) {
83	6404	100	while ( (n%f) == 0 ) {
84	168		factors[nfactors++] = f;
85	168		n /= f;
86			}
87	6236		f = primes_small[++sp];
88	6236	100	if (f*f > n) break;
89			}
90			}
91			/* If n is small and still composite, finish it here */
92	26727	100	if (n < 20112011 && ff <= n) { /* Trial division from 431 to 2003 */
		100
93	22		unsigned int un = n;
94	1568	50	while (sp < NPRIMES_SMALL) {
95	1582	100	while ( (un%f) == 0 ) {
96	14		factors[nfactors++] = f;
97	14		un /= f;
98			}
99	1568		f = primes_small[++sp];
100	1568	100	if (f*f > un) break;
101			}
102	22		n = un;
103			}
104			}
105	33626	100	if (f*f > n) {
106	33462	100	if (n != 1) factors[nfactors++] = n;
107	33462		return nfactors;
108			}
109			#if BITS_PER_WORD == 64 /* this is slower in 32-bit */
110			/* For small values less than f^3, use simple factor to split semiprime */
111	164	100	if (n < 200000000 && n < fff) {
		100
112	46	100	if (is_prob_prime(n)) factors[nfactors++] = n;
113	23		else nfactors += holf_factor(n, factors+nfactors, 1000000);
114	46		return nfactors;
115			}
116			#endif
117
118	118		nsmallfactors = nfactors;
119
120			/* Perfect powers. Factor root only once. */
121			{
122	118		int i, j, k = powerof(n);
123	118	100	if (k > 1) {
124	4		UV p = rootof(n, k);
125	4		nfactors = factor(p, factors+nsmallfactors);
126	14	100	for (i = nfactors; i >= 0; i--)
127	30	100	for (j = 0; j < k; j++)
128	20		factors[nsmallfactors+k*i+j] = factors[nsmallfactors+i];
129	4		return nsmallfactors + k*nfactors;
130			}
131			}
132
133			{
134			UV tofac_stack[MPU_MAX_FACTORS+1];
135	114		int i, j, ntofac = 0;
136	114		int const verbose = _XS_get_verbose();
137
138			/* loop over each remaining factor, until ntofac == 0 */
139			do {
140	256	100	while ( (n >= f*f) && (!is_prob_prime(n)) ) {
		100
141	71		int split_success = 0;
142			/* Adjust the number of rounds based on the number size and speed */
143	71	50	UV const nbits = BITS_PER_WORD - clz(n);
144			#if USE_MONTMATH
145	71	100	UV const br_rounds = 8000 + (9000 * ((nbits <= 45) ? 0 : (nbits-45)));
146			#elif MULMODS_ARE_FAST
147			UV const br_rounds = 500 + ( 200 * ((nbits <= 45) ? 0 : (nbits-45)));
148			#else
149			UV const br_rounds = (nbits < 63) ? 0 : 10000;
150			#endif
151	71		UV const sq_rounds = 200000; /* 20k 91%, 40k 98%, 80k 99.9%, 120k 99.99%*/
152
153			/* For small semiprimes the fastest solution is HOLF under 32, then
154			* Lehman (no trial) under 38. However on random inputs, HOLF is
155			* best only under 28-30 bits, and adding Lehman is always slower. */
156	71	50	if (!split_success && nbits <= 28) { /* This should always succeed */
		100
157	16		split_success = holf_factor(n, tofac_stack+ntofac, 1000000)-1;
158	16	50	if (verbose) printf("holf %d\n", split_success);
159			}
160			/* 99.7% of 32-bit, 94% of 64-bit random inputs factored here */
161	71	100	if (!split_success && br_rounds > 0) {
		50
162	55		split_success = pbrent_factor(n, tofac_stack+ntofac, br_rounds, 1)-1;
163	55	50	if (verbose) printf("pbrent %d\n", split_success);
164			}
165			#if USE_MONTMATH
166	71	50	if (!split_success) {
167	0		split_success = pbrent_factor(n, tofac_stack+ntofac, 2*br_rounds, 3)-1;
168	0	0	if (verbose) printf("second pbrent %d\n", split_success);
169			}
170			#endif
171			/* SQUFOF with these parameters gets 99.9% of everything left */
172	71	50	if (!split_success && nbits <= 62) {
		0
173	0		split_success = squfof_factor(n,tofac_stack+ntofac, sq_rounds)-1;
174	0	0	if (verbose) printf("squfof %d\n", split_success);
175			}
176			/* At this point we should only have 16+ digit semiprimes. */
177	71	50	if (!split_success) {
178	0		split_success = pminus1_factor(n, tofac_stack+ntofac, 8000, 120000)-1;
179	0	0	if (verbose) printf("pminus1 %d\n", split_success);
180			/* Get the stragglers */
181	0	0	if (!split_success) {
182	0		split_success = prho_factor(n, tofac_stack+ntofac, 120000)-1;
183	0	0	if (verbose) printf("long prho %d\n", split_success);
184	0	0	if (!split_success) {
185	0		split_success = pbrent_factor(n, tofac_stack+ntofac, 500000, 5)-1;
186	0	0	if (verbose) printf("long pbrent %d\n", split_success);
187			}
188			}
189			}
190
191	71	50	if (split_success) {
192	71	50	MPUassert( split_success == 1, "split factor returned more than 2 factors");
193	71		ntofac++; /* Leave one on the to-be-factored stack */
194	71	50	if ((tofac_stack[ntofac] == n) \|\| (tofac_stack[ntofac] == 1))
		50
195	0		croak("bad factor\n");
196	71		n = tofac_stack[ntofac]; /* Set n to the other one */
197			} else {
198			/* Factor via trial division. Nothing should ever get here. */
199	0		UV m = f % 30;
200	0		UV limit = isqrt(n);
201	0	0	if (verbose) printf("doing trial on %"UVuf"\n", n);
202	0	0	while (f <= limit) {
203	0	0	if ( (n%f) == 0 ) {
204			do {
205	0		n /= f;
206	0		factors[nfactors++] = f;
207	0	0	} while ( (n%f) == 0 );
208	0		limit = isqrt(n);
209			}
210	0		f += wheeladvance30[m];
211	0		m = nextwheel30[m];
212			}
213	0		break; /* We just factored n via trial division. Exit loop. */
214			}
215			}
216			/* n is now prime (or 1), so add to already-factored stack */
217	185	50	if (n != 1) factors[nfactors++] = n;
218			/* Pop the next number off the to-factor stack */
219	185	100	if (ntofac > 0) n = tofac_stack[ntofac-1];
220	185	100	} while (ntofac-- > 0);
221
222			/* Sort the non-small factors */
223	185	100	for (i = nsmallfactors+1; i < nfactors; i++) {
224	71		UV fi = factors[i];
225	142	100	for (j = i; j > 0 && factors[j-1] > fi; j--)
		100
226	71		factors[j] = factors[j-1];
227	71		factors[j] = fi;
228			}
229			}
230	114		return nfactors;
231			}
232
233
234	12215		int factor_exp(UV n, UV factors, UV exponents)
235			{
236	12215		int i = 1, j = 1, nfactors;
237
238	12215	100	if (n == 1) return 0;
239	12211		nfactors = factor(n, factors);
240
241	12211	100	if (exponents == 0) {
242	131	100	for (; i < nfactors; i++)
243	92	100	if (factors[i] != factors[i-1])
244	62		factors[j++] = factors[i];
245			} else {
246	12172		exponents[0] = 1;
247	27828	100	for (; i < nfactors; i++) {
248	15656	100	if (factors[i] != factors[i-1]) {
249	12314		exponents[j] = 1;
250	12314		factors[j++] = factors[i];
251			} else {
252	3342		exponents[j-1]++;
253			}
254			}
255			}
256	12211		return j;
257			}
258
259
260	1619		int trial_factor(UV n, UV *factors, UV maxtrial)
261			{
262	1619		int nfactors = 0;
263
264	1619	100	if (maxtrial == 0) maxtrial = UV_MAX;
265
266			/* Cover the cases 0/1/2/3 now */
267	1619	100	if (n < 4 \|\| maxtrial < 2) {
		50
268	8		factors[0] = n;
269	8		return (n == 1) ? 0 : 1;
270			}
271			/* Trial division for 2, 3, 5 immediately */
272	3232	100	while ( (n & 1) == 0 ) { factors[nfactors++] = 2; n /= 2; }
273	2415	100	if (3<=maxtrial) while ( (n % 3) == 0 ) { factors[nfactors++] = 3; n /= 3; }
		100
274	1982	100	if (5<=maxtrial) while ( (n % 5) == 0 ) { factors[nfactors++] = 5; n /= 5; }
		100
275
276	1611	100	if (7*7 <= n) {
277	1255		UV f, sp = 3;
278	5245	100	while (++sp < NPRIMES_SMALL) {
279	5242		f = primes_small[sp];
280	5242	100	if (f*f > n \|\| f > maxtrial) break;
		100
281	4374	100	while ( (n%f) == 0 ) {
282	384		factors[nfactors++] = f;
283	384		n /= f;
284			}
285			}
286			/* Trial division using a mod-30 wheel for larger values */
287	1255	100	if (f*f <= n && f <= maxtrial) {
		100
288	3		UV m, newlimit, limit = isqrt(n);
289	3	100	if (limit > maxtrial) limit = maxtrial;
290	3		m = f % 30;
291	7906	100	while (f <= limit) {
292	7903	100	if ( (n%f) == 0 ) {
293			do {
294	2		factors[nfactors++] = f;
295	2		n /= f;
296	2	50	} while ( (n%f) == 0 );
297	2		newlimit = isqrt(n);
298	2	50	if (newlimit < limit) limit = newlimit;
299			}
300	7903		f += wheeladvance30[m];
301	7903		m = nextwheel30[m];
302			}
303			}
304			}
305			/* All done! */
306	1611	100	if (n != 1)
307	1486		factors[nfactors++] = n;
308	1611		return nfactors;
309			}
310
311
312	3468		static void _divisors_from_factors(UV nfactors, UV* fp, UV* fe, UV* res) {
313	3468		UV s, count = 1;
314
315	3468		res[0] = 1;
316	11425	100	for (s = 0; s < nfactors; s++) {
317	7957		UV i, j, scount = count, p = fp[s], e = fe[s], mult = 1;
318	18887	100	for (j = 0; j < e; j++) {
319	10930		mult *= p;
320	35405	100	for (i = 0; i < scount; i++)
321	24475		res[count++] = res[i] * mult;
322			}
323			}
324	3468		}
325
326	48171		static int numcmp(const void a, const void b)
327	48171	100	{ const UV x = a, y = b; return (x > y) ? 1 : (x < y) ? -1 : 0; }
		50
328
329	3472		UV* _divisor_list(UV n, UV *num_divisors)
330			{
331			UV factors[MPU_MAX_FACTORS+1];
332			UV exponents[MPU_MAX_FACTORS+1];
333			UV* divs;
334			int i, nfactors, ndivisors;
335
336	3472	100	if (n <= 1) {
337	4		New(0, divs, 2, UV);
338	4	100	if (n == 0) { divs[0] = 0; divs[1] = 1; *num_divisors = 2; }
339	4	100	if (n == 1) { divs[0] = 1; *num_divisors = 1; }
340	4		return divs;
341			}
342			/* Factor and convert to factor/exponent pair */
343	3468		nfactors = factor_exp(n, factors, exponents);
344			/* Calculate number of divisors, allocate space, fill with divisors */
345	3468		ndivisors = exponents[0] + 1;
346	7957	100	for (i = 1; i < nfactors; i++)
347	4489		ndivisors *= (exponents[i] + 1);
348	3468	50	New(0, divs, ndivisors, UV);
349	3468		_divisors_from_factors(nfactors, factors, exponents, divs);
350			/* Sort divisors (numeric ascending) */
351	3468		qsort(divs, ndivisors, sizeof(UV), numcmp);
352			/* Return number of divisors and list */
353	3468		*num_divisors = ndivisors;
354	3472		return divs;
355			}
356
357
358			/* The usual method, on OEIS for instance, is:
359			* (p^(k*(e+1))-1) / (p^k-1)
360			* but that overflows quicky. Instead we rearrange as:
361			* 1 + p^k + p^k^2 + ... p^k^e
362			* Return 0 if the result overflowed.
363			*/
364			static const UV sigma_overflow[11] =
365			#if BITS_PER_WORD == 64
366			{UVCONST(3000000000000000000),UVCONST(3000000000),2487240,64260,7026,
367			1622, 566, 256, 139, 85, 57};
368			#else
369			{UVCONST(845404560), 52560, 1548, 252, 84, 41, 24, 16, 12, 10, 8};
370			#endif
371	1505		UV divisor_sum(UV n, UV k)
372			{
373			UV factors[MPU_MAX_FACTORS+1];
374			int nfac, i, j;
375	1505		UV product = 1;
376
377	1505	50	if (k > 11 \|\| (k > 0 && n >= sigma_overflow[k-1])) return 0;
		100
		50
378	1505	100	if (n <= 1) /* n=0 divisors are [0,1] */
379	287	100	return (n == 1) ? 1 : (k == 0) ? 2 : 1; /* n=1 divisors are [1] */
		100
380	1218		nfac = factor(n,factors);
381	1218	100	if (k == 0) {
382	2339	100	for (i = 0; i < nfac; i++) {
383	1365		UV e = 1, f = factors[i];
384	1806	100	while (i+1 < nfac && f == factors[i+1]) { e++; i++; }
		100
385	1365		product *= (e+1);
386			}
387	244	100	} else if (k == 1) {
388	427	100	for (i = 0; i < nfac; i++) {
389	270		UV f = factors[i];
390	270		UV pke = f, fmult = 1 + f;
391	372	100	while (i+1 < nfac && f == factors[i+1]) {
		100
392	102		pke *= f;
393	102		fmult += pke;
394	102		i++;
395			}
396	270		product *= fmult;
397			}
398			} else {
399	222	100	for (i = 0; i < nfac; i++) {
400	135		UV f = factors[i];
401	135		UV fmult, pke, pk = f;
402	328	100	for (j = 1; j < (int)k; j++) pk *= f;
403	135		fmult = 1 + pk;
404	135		pke = pk;
405	188	100	while (i+1 < nfac && f == factors[i+1]) {
		100
406	53		pke *= pk;
407	53		fmult += pke;
408	53		i++;
409			}
410	135		product *= fmult;
411			}
412			}
413	1505		return product;
414			}
415
416
417
418
419	110		static int found_factor(UV n, UV f, UV* factors)
420			{
421	110		UV f2 = n/f;
422	110		int i = f > f2;
423	110	50	if (f == 1 \|\| f2 == 1) {
		50
424	0		factors[0] = n;
425	0		return 1;
426			}
427	110		factors[i] = f;
428	110		factors[1-i] = f2;
429	110	50	MPUassert( factors[0] * factors[1] == n , "incorrect factoring");
430	110		return 2;
431			}
432
433			/* Knuth volume 2, algorithm C.
434			* Very fast for small numbers, grows rapidly.
435			* SQUFOF is better for numbers nearing the 64-bit limit.
436			*/
437	2		int fermat_factor(UV n, UV *factors, UV rounds)
438			{
439			IV sqn, x, y, r;
440	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in fermat_factor");
		50
441	2		sqn = isqrt(n);
442	2		x = 2 * sqn + 1;
443	2		y = 1;
444	2		r = (sqn*sqn) - n;
445
446	10	100	while (r != 0) {
447	8	50	if (rounds-- == 0) { factors[0] = n; return 1; }
448	8		r += x;
449	8		x += 2;
450			do {
451	26		r -= y;
452	26		y += 2;
453	26	100	} while (r > 0);
454			}
455	2		r = (x-y)/2;
456	2		return found_factor(n, r, factors);
457			}
458
459			/* Hart's One Line Factorization. */
460	41		int holf_factor(UV n, UV *factors, UV rounds)
461			{
462			UV i, s, m, f;
463
464	41	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in holf_factor");
		50
465
466			/* We skip the perfect-square test for s in the loop, so we
467			* will never succeed if n is a perfect square. Test that now. */
468	41	100	if (is_perfect_square(n))
469	1		return found_factor(n, isqrt(n), factors);
470
471	40	50	if (n <= (UV_MAX >> 6)) { /* Try with premultiplier first */
472	40	50	UV npre = n * ( (n <= (UV_MAX >> 13)) ? 720 :
		0
		0
		0
473			(n <= (UV_MAX >> 11)) ? 480 :
474			(n <= (UV_MAX >> 10)) ? 360 :
475			(n <= (UV_MAX >> 8)) ? 60 : 30 );
476	40		UV ni = npre;
477			#if 0 /* Straightforward */
478			while (rounds--) {
479			s = isqrt(ni) + 1;
480			m = (s*s) - ni;
481			if (is_perfect_square(m)) {
482			f = gcd_ui(n, s - isqrt(m));
483			if (f > 1 && f < n)
484			return found_factor(n, f, factors);
485			}
486			if (ni >= (ni+npre)) break;
487			ni += npre;
488			}
489			#else /* More optimized */
490	2111	50	while (rounds--) {
491	2111		s = 1 + (UV)sqrt((double)ni);
492	2111		m = (s*s) - ni;
493	2111		f = m & 127;
494	2111	100	if (!((f0x8bc40d7d) & (f0xa1e2f5d1) & 0x14020a)) {
495	1577		f = (UV)sqrt((double)m);
496	1577	100	if (m == f*f) {
497	40		f = gcd_ui(n, s - f);
498	40	50	if (f > 1 && f < n)
		50
499	40		return found_factor(n, f, factors);
500			}
501			}
502	2071	50	if (ni >= (ni+npre)) break;
503	2071		ni += npre;
504			}
505			#endif
506			}
507
508	0	0	for (i = 1; i <= rounds; i++) {
509	0		s = (UV) sqrt( (double)n * (double)i );
510			/* Assume s^2 isn't a perfect square. We're rapidly losing precision
511			* so we won't be able to accurately detect it anyway. */
512	0		s++; /* s = ceil(sqrt(ni)) /
513	0		m = sqrmod(s, n);
514	0	0	if (is_perfect_square(m)) {
515	0		f = isqrt(m);
516	0	0	f = gcd_ui( (s>f) ? s-f : f-s, n);
517			/* This should always succeed, but with overflow concerns.... */
518	0		return found_factor(n, f, factors);
519			}
520			}
521	0		factors[0] = n;
522	0		return 1;
523			}
524
525
526			#if USE_MONTMATH
527			#define ABSDIFF(x,y) (x>y) ? x-y : y-x
528			/* Pollard / Brent. Brent's modifications to Pollard's Rho. Maybe faster. */
529	59		int pbrent_factor(UV n, UV *factors, UV rounds, UV a)
530			{
531	59	50	UV const nbits = BITS_PER_WORD - clz(n);
532	59	100	const UV inner = (nbits <= 31) ? 32 : (nbits <= 35) ? 64 : (nbits <= 40) ? 160 : (nbits <= 52) ? 256 : 320;
		100
		100
		100
533			UV f, m, r, rleft, Xi, Xm, Xs;
534	59		int irounds, fails = 6;
535	59		const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
536
537	59	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pbrent_factor");
		50
538	59		r = f = 1;
539	59		Xi = Xm = mont1;
540	59		a = mont_geta(a,n);
541
542	522	50	while (rounds > 0) {
543	522		rleft = (r > rounds) ? rounds : r;
544	522		Xm = Xi;
545			/* Do rleft rounds, inner at a time */
546	1778	100	while (rleft > 0) {
547	1315		irounds = (rleft > (UV)inner) ? inner : rleft;
548	1315		rleft -= irounds;
549	1315		rounds -= irounds;
550	1315		Xs = Xi;
551	1315	100	if (n < (1ULL << 63)) {
552	961		Xi = mont_mulmod63(Xi,Xi+a,n);
553	961	100	m = ABSDIFF(Xi,Xm);
554	166560	100	while (--irounds > 0) {
555	165599		Xi = mont_mulmod63(Xi,Xi+a,n);
556	165599	100	f = ABSDIFF(Xi,Xm);
557	165599		m = mont_mulmod63(m, f, n);
558			}
559	354	50	} else if (a == mont1) {
560	354		Xi = mont_mulmod64(Xi,Xi+a,n);
561	354	100	m = ABSDIFF(Xi,Xm);
562	104189	100	while (--irounds > 0) {
563	103835		Xi = mont_mulmod64(Xi,Xi+a,n);
564	103835	100	f = ABSDIFF(Xi,Xm);
565	103835		m = mont_mulmod64(m, f, n);
566			}
567			} else {
568	0		Xi = addmod(mont_mulmod64(Xi,Xi,n), a, n);
569	0	0	m = ABSDIFF(Xi,Xm);
570	0	0	while (--irounds > 0) {
571	0		Xi = addmod(mont_mulmod64(Xi,Xi,n), a, n);
572	0	0	f = ABSDIFF(Xi,Xm);
573	0		m = mont_mulmod64(m, f, n);
574			}
575			}
576	1315		f = gcd_ui(m, n);
577	1315	100	if (f != 1)
578	59		break;
579			}
580			/* If f == 1, then we didn't find a factor. Move on. */
581	522	100	if (f == 1) {
582	463		r *= 2;
583	463		continue;
584			}
585	59	50	if (f == n) { /* back up, with safety */
586	0		Xi = Xs;
587			do {
588	0	0	if (n < (1ULL << 63) \|\| a == mont1)
		0
589	0	0	Xi = mont_mulmod(Xi,Xi+a,n);
590			else
591	0	0	Xi = addmod(mont_mulmod(Xi,Xi,n),a,n);
592	0	0	m = ABSDIFF(Xi,Xm);
593	0		f = gcd_ui(m, n);
594	0	0	} while (f == 1 && r-- != 0);
		0
595			}
596	59	50	if (f == 0 \|\| f == n) {
		50
597	0	0	if (fails-- <= 0) break;
598	0		Xi = Xm = mont1;
599	0		a = addmod(a, mont_geta(11,n), n);
600	0		continue;
601			}
602	59		return found_factor(n, f, factors);
603			}
604	0		factors[0] = n;
605	0		return 1;
606			}
607			#else
608			/* Pollard / Brent. Brent's modifications to Pollard's Rho. Maybe faster. */
609			int pbrent_factor(UV n, UV *factors, UV rounds, UV a)
610			{
611			UV f, m, r;
612			UV Xi = 2;
613			UV Xm = 2;
614			const UV inner = (n <= 4000000000UL) ? 32 : 160;
615			int fails = 6;
616
617			MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pbrent_factor");
618
619			r = 1;
620			f = 1;
621			while (rounds > 0) {
622			UV rleft = (r > rounds) ? rounds : r;
623			UV saveXi = Xi;
624			/* Do rleft rounds, inner at a time */
625			while (rleft > 0) {
626			UV dorounds = (rleft > inner) ? inner : rleft;
627			saveXi = Xi;
628			rleft -= dorounds;
629			rounds -= dorounds;
630			Xi = sqraddmod(Xi, a, n); /* First iteration, no mulmod needed */
631			m = (Xi>Xm) ? Xi-Xm : Xm-Xi;
632			while (--dorounds > 0) { /* Now do inner-1=63 more iterations */
633			Xi = sqraddmod(Xi, a, n);
634			f = (Xi>Xm) ? Xi-Xm : Xm-Xi;
635			m = mulmod(m, f, n);
636			}
637			f = gcd_ui(m, n);
638			if (f != 1)
639			break;
640			}
641			/* If f == 1, then we didn't find a factor. Move on. */
642			if (f == 1) {
643			r *= 2;
644			Xm = Xi;
645			continue;
646			}
647			if (f == n) { /* back up, with safety */
648			Xi = saveXi;
649			do {
650			Xi = sqraddmod(Xi, a, n);
651			f = gcd_ui( (Xi>Xm) ? Xi-Xm : Xm-Xi, n);
652			} while (f == 1 && r-- != 0);
653			}
654			if (f == 0 \|\| f == n) {
655			if (fails-- <= 0) break;
656			Xm = addmod(Xm, 2, n);
657			Xi = Xm;
658			a++;
659			continue;
660			}
661			return found_factor(n, f, factors);
662			}
663			factors[0] = n;
664			return 1;
665			}
666			#endif
667
668			/* Pollard's Rho. */
669	2		int prho_factor(UV n, UV *factors, UV rounds)
670			{
671			UV a, f, i, m, oldU, oldV;
672	2		const UV inner = 64;
673	2		UV U = 7;
674	2		UV V = 7;
675	2		int fails = 3;
676
677	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in prho_factor");
		50
678
679			/* We could just as well say a = 1 */
680	2		switch (n%8) {
681	0		case 1: a = 1; break;
682	2		case 3: a = 2; break;
683	0		case 5: a = 3; break;
684	0		case 7: a = 5; break;
685	0		default: a = 7; break;
686			}
687
688	2		rounds = (rounds + inner - 1) / inner;
689
690	2	50	while (rounds-- > 0) {
691	2		m = 1; oldU = U; oldV = V;
692	130	100	for (i = 0; i < inner; i++) {
693	128		U = sqraddmod(U, a, n);
694	128		V = sqraddmod(V, a, n);
695	128		V = sqraddmod(V, a, n);
696	128	100	f = (U > V) ? U-V : V-U;
697	128		m = mulmod(m, f, n);
698			}
699	2		f = gcd_ui(m, n);
700	2	50	if (f == 1)
701	0		continue;
702	2	50	if (f == n) { /* back up to find a factor*/
703	2		U = oldU; V = oldV;
704	2		i = inner;
705			do {
706	6		U = sqraddmod(U, a, n);
707	6		V = sqraddmod(V, a, n);
708	6		V = sqraddmod(V, a, n);
709	6	100	f = gcd_ui( (U > V) ? U-V : V-U, n);
710	6	100	} while (f == 1 && i-- != 0);
		50
711			}
712	2	50	if (f == 0 \|\| f == n) {
		50
713	0	0	if (fails-- <= 0) break;
714	0		U = addmod(U,2,n);
715	0		V = U;
716	0		a++;
717	0		continue;
718			}
719	2		return found_factor(n, f, factors);
720			}
721	0		factors[0] = n;
722	0		return 1;
723			}
724
725			/* Pollard's P-1 */
726	2		int pminus1_factor(UV n, UV *factors, UV B1, UV B2)
727			{
728			UV f, k, kmin;
729	2		UV a = 2, q = 2;
730	2		UV savea = 2, saveq = 2;
731	2		UV j = 1;
732	2		UV sqrtB1 = isqrt(B1);
733			#if USE_MONTMATH
734	2		const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
735	2		UV ma = mont_geta(a,n);
736			#define PMINUS1_APPLY_POWER ma = mont_powmod(ma, k, n)
737			#define PMINUS1_RECOVER_A a = mont_recover(ma,n)
738			#else
739			#define PMINUS1_APPLY_POWER a = powmod(a, k, n)
740			#define PMINUS1_RECOVER_A
741			#endif
742	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pminus1_factor");
		50
743
744	2	50	if (B1 <= primes_small[NPRIMES_SMALL-2]) {
745			UV i;
746	0	0	for (i = 1; primes_small[i] <= B1; i++) {
747	0		q = k = primes_small[i];
748	0	0	if (q <= sqrtB1) {
749	0		k = q*q; kmin = B1/q;
750	0	0	while (k <= kmin) k *= q;
751			}
752	0		PMINUS1_APPLY_POWER;
753	0	0	if ( (j++ % 32) == 0) {
754	0	0	PMINUS1_RECOVER_A;
755	0	0	if (a == 0 \|\| gcd_ui(a-1, n) != 1)
		0
756			break;
757	0		savea = a; saveq = q;
758			}
759			}
760	0	0	PMINUS1_RECOVER_A;
761			} else {
762	64	50	START_DO_FOR_EACH_PRIME(2, B1) {
		100
		100
		100
		100
		50
		50
		50
		50
		50
763	64		q = k = p;
764	64	50	if (q <= sqrtB1) {
765	64		k = q*q; kmin = B1/q;
766	200	100	while (k <= kmin) k *= q;
767			}
768	64		PMINUS1_APPLY_POWER;
769	64	100	if ( (j++ % 32) == 0) {
770	2	50	PMINUS1_RECOVER_A;
771	2	50	if (a == 0 \|\| gcd_ui(a-1, n) != 1)
		50
772			break;
773	0		savea = a; saveq = q;
774			}
775	62		} END_DO_FOR_EACH_PRIME
776	2	50	PMINUS1_RECOVER_A;
777			}
778	2	50	if (a == 0) { factors[0] = n; return 1; }
779	2		f = gcd_ui(a-1, n);
780
781			/* If we found more than one factor in stage 1, backup and single step */
782	2	50	if (f == n) {
783	2		a = savea;
784	4	50	START_DO_FOR_EACH_PRIME(saveq, B1) {
		50
		100
		50
		0
		0
		0
		0
		0
		50
785	4		k = p; kmin = B1/p;
786	62	100	while (k <= kmin) k *= p;
787	4		a = powmod(a, k, n);
788	4		f = gcd_ui(a-1, n);
789	4		q = p;
790	4	100	if (f != 1)
791	2		break;
792	4		} END_DO_FOR_EACH_PRIME
793			/* If f == n again, we could do:
794			* for (savea = 3; f == n && savea < 100; savea = next_prime(savea)) {
795			* a = savea;
796			* for (q = 2; q <= B1; q = next_prime(q)) {
797			* ...
798			* }
799			* }
800			* but this could be a huge time sink if B1 is large, so just fail.
801			*/
802			}
803
804			/* STAGE 2 */
805	2	50	if (f == 1 && B2 > B1) {
		0
806	0		UV bm = a;
807	0		UV b = 1;
808			UV bmdiff;
809	0		UV precomp_bm[111] = {0}; /* Enough for B2 = 189M */
810
811			/* calculate (a^q)^2, (a^q)^4, etc. */
812	0		bmdiff = sqrmod(bm, n);
813	0		precomp_bm[0] = bmdiff;
814	0	0	for (j = 1; j < 20; j++) {
815	0		bmdiff = mulmod(bmdiff,bm,n);
816	0		bmdiff = mulmod(bmdiff,bm,n);
817	0		precomp_bm[j] = bmdiff;
818			}
819
820	0		a = powmod(a, q, n);
821	0		j = 1;
822	0	0	START_DO_FOR_EACH_PRIME( q+1, B2 ) {
		0
		0
		0
		0
		0
		0
		0
		0
		0
823	0		UV lastq = q;
824			UV qdiff;
825	0		q = p;
826			/* compute a^q = a^lastq * a^(q-lastq) */
827	0		qdiff = (q - lastq) / 2 - 1;
828	0	0	if (qdiff >= 111) {
829	0		bmdiff = powmod(bm, q-lastq, n); /* Big gap */
830			} else {
831	0		bmdiff = precomp_bm[qdiff];
832	0	0	if (bmdiff == 0) {
833	0	0	if (precomp_bm[qdiff-1] != 0)
834	0		bmdiff = mulmod(mulmod(precomp_bm[qdiff-1],bm,n),bm,n);
835			else
836	0		bmdiff = powmod(bm, q-lastq, n);
837	0		precomp_bm[qdiff] = bmdiff;
838			}
839			}
840	0		a = mulmod(a, bmdiff, n);
841	0	0	if (a == 0) break;
842	0		b = mulmod(b, a-1, n); /* if b == 0, we found multiple factors */
843	0	0	if ( (j++ % 64) == 0 ) {
844	0		f = gcd_ui(b, n);
845	0	0	if (f != 1)
846	0		break;
847			}
848	0		} END_DO_FOR_EACH_PRIME
849	0		f = gcd_ui(b, n);
850			}
851	2		return found_factor(n, f, factors);
852			}
853
854			/* Simple Williams p+1 */
855	6		static void pp1_pow(UV *cX, UV exp, UV n)
856			{
857	6		UV X0 = *cX;
858	6		UV X = *cX;
859	6		UV Y = mulsubmod(X, X, 2, n);
860	6	50	UV bit = UVCONST(1) << (clz(exp)-1);
861	345	100	while (bit) {
862	339		UV T = mulsubmod(X, Y, X0, n);
863	339	100	if ( exp & bit ) {
864	15		X = T;
865	15		Y = mulsubmod(Y, Y, 2, n);
866			} else {
867	324		Y = T;
868	324		X = mulsubmod(X, X, 2, n);
869			}
870	339		bit >>= 1;
871			}
872	6		*cX = X;
873	6		}
874	2		int pplus1_factor(UV n, UV *factors, UV B1)
875			{
876			UV X1, X2, f;
877	2		UV sqrtB1 = isqrt(B1);
878	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pplus1_factor");
		50
879
880	2		X1 = 7 % n;
881	2		X2 = 11 % n;
882	2		f = 1;
883	4	50	START_DO_FOR_EACH_PRIME(2, B1) {
		50
		100
		100
		0
		0
		0
		0
		0
		50
884	4		UV k = p;
885	4	50	if (p < sqrtB1) {
886	4		UV kmin = B1/p;
887	21	100	while (k <= kmin)
888	17		k *= p;
889			}
890	4		pp1_pow(&X1, k, n);
891	4	50	if (X1 != 2) {
892	4		f = gcd_ui( submod(X1, 2, n) , n);
893	4	100	if (f != 1 && f != n) break;
		50
894			}
895	2		pp1_pow(&X2, k, n);
896	2	50	if (X2 != 2) {
897	0		f = gcd_ui( submod(X2, 2, n) , n);
898	0	0	if (f != 1 && f != n) break;
		0
899			}
900	2		} END_DO_FOR_EACH_PRIME
901
902	2		return found_factor(n, f, factors);
903			}
904
905
906			/* SQUFOF, based on Ben Buhrow's racing version. */
907			#if 1
908			/* limit to 62-bit inputs, use 32-bit types, faster */
909			#define SQUFOF_TYPE uint32_t
910			#define SQUFOF_MAX (UV_MAX >> 2)
911			#else
912			/* All 64-bit inputs possible, though we severely limit multipliers */
913			#define SQUFOF_TYPE UV
914			#define SQUFOF_MAX UV_MAX
915			#endif
916			typedef struct
917			{
918			int valid;
919			SQUFOF_TYPE P;
920			SQUFOF_TYPE bn;
921			SQUFOF_TYPE Qn;
922			SQUFOF_TYPE Q0;
923			SQUFOF_TYPE b0;
924			SQUFOF_TYPE it;
925			SQUFOF_TYPE imax;
926			SQUFOF_TYPE mult;
927			} mult_t;
928
929			/* N < 2^63 (or 2^31). Returns 0 or a factor */
930	3		static UV squfof_unit(UV n, mult_t* mult_save)
931			{
932			SQUFOF_TYPE imax,i,Q0,Qn,bn,b0,P,bbn,Ro,S,So,t1,t2;
933
934	3		P = mult_save->P;
935	3		bn = mult_save->bn;
936	3		Qn = mult_save->Qn;
937	3		Q0 = mult_save->Q0;
938	3		b0 = mult_save->b0;
939	3		i = mult_save->it;
940	3		imax = i + mult_save->imax;
941
942			#define SQUARE_SEARCH_ITERATION \
943			t1 = P; \
944			P = bn*Qn - P; \
945			t2 = Qn; \
946			Qn = Q0 + bn*(t1-P); \
947			Q0 = t2; \
948			bn = (b0 + P) / Qn; \
949			i++;
950
951			while (1) {
952	3		int j = 0;
953	3	50	if (i & 0x1) {
954	0		SQUARE_SEARCH_ITERATION;
955			}
956			/* i is now even */
957			while (1) {
958			/* We need to know P, bn, Qn, Q0, iteration count, i from prev */
959	4	50	if (i >= imax) {
960			/* save state and try another multiplier. */
961	0		mult_save->P = P;
962	0		mult_save->bn = bn;
963	0		mult_save->Qn = Qn;
964	0		mult_save->Q0 = Q0;
965	0		mult_save->it = i;
966	0		return 0;
967			}
968
969	4		SQUARE_SEARCH_ITERATION;
970
971			/* Even iteration. Check for square: Qn = SS /
972	4		t2 = Qn & 127;
973	4	100	if (!((t20x8bc40d7d) & (t20xa1e2f5d1) & 0x14020a)) {
974	3		t1 = (uint32_t) sqrt(Qn);
975	3	50	if (Qn == t1*t1)
976	3		break;
977			}
978
979			/* Odd iteration. */
980	1		SQUARE_SEARCH_ITERATION;
981	1		}
982	3		S = isqrt(Qn);
983	3		mult_save->it = i;
984
985			/* Reduce to G0 */
986	3		Ro = P + S*((b0 - P)/S);
987	3		So = (n - (UV)Ro*(UV)Ro)/(UV)S;
988	3		bbn = (b0+Ro)/So;
989
990			/* Search for symmetry point */
991			#define SYMMETRY_POINT_ITERATION \
992			t1 = Ro; \
993			Ro = bbn*So - Ro; \
994			t2 = So; \
995			So = S + bbn*(t1-Ro); \
996			S = t2; \
997			bbn = (b0+Ro)/So; \
998			if (Ro == t1) break;
999
1000	3		j = 0;
1001			while (1) {
1002	3	100	SYMMETRY_POINT_ITERATION;
1003	1	50	SYMMETRY_POINT_ITERATION;
1004	0	0	SYMMETRY_POINT_ITERATION;
1005	0	0	SYMMETRY_POINT_ITERATION;
1006	0	0	if (j++ > 2000000) {
1007	0		mult_save->valid = 0;
1008	0		return 0;
1009			}
1010	0		}
1011
1012	3		t1 = gcd_ui(Ro, n);
1013	3	50	if (t1 > 1)
1014	3		return t1;
1015	0		}
1016			}
1017
1018			/* Gower and Wagstaff 2008:
1019			* http://www.ams.org/journals/mcom/2008-77-261/S0025-5718-07-02010-8/
1020			* Section 5.3. I've added some with 13,17,19. Sorted by F(). */
1021			static const UV squfof_multipliers[] =
1022			/* { 35711, 357, 3511, 35, 3711, 37, 5711, 57,
1023			311, 3, 511, 5, 711, 7, 11, 1 }; /
1024			{ 35711, 357, 3571113, 35713, 3571117, 3511,
1025			35717, 35, 3571119, 351113,35719, 3571317,
1026			3513, 3711, 37, 5711, 3713, 57,
1027			3517, 5713, 3519, 311, 37*17, 3,
1028			31113, 511, 3719, 313, 5, 51113,
1029			5719, 513, 711, 7, 317, 713,
1030			11, 1 };
1031			#define NSQUFOF_MULT (sizeof(squfof_multipliers)/sizeof(squfof_multipliers[0]))
1032
1033	2		int squfof_factor(UV n, UV *factors, UV rounds)
1034			{
1035			mult_t mult_save[NSQUFOF_MULT];
1036			int still_racing;
1037			UV i, nn64, mult, f64;
1038	2		UV rounds_done = 0;
1039
1040			/* Caller should have handled these trivial cases */
1041	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in squfof_factor");
		50
1042
1043			/* Too big */
1044	2	50	if (n > SQUFOF_MAX) {
1045	0		factors[0] = n; return 1;
1046			}
1047
1048	78	100	for (i = 0; i < NSQUFOF_MULT; i++) {
1049	76		mult_save[i].valid = -1;
1050	76		mult_save[i].it = 0;
1051			}
1052
1053			/* Process the multipliers a little at a time: 0.33(nmult)^1/4: 20-20k */
1054			do {
1055	2		still_racing = 0;
1056	3	50	for (i = 0; i < NSQUFOF_MULT; i++) {
1057	3	50	if (mult_save[i].valid == 0) continue;
1058	3		mult = squfof_multipliers[i];
1059	3		nn64 = n * mult;
1060	3	50	if (mult_save[i].valid == -1) {
1061	3	50	if ((SQUFOF_MAX / mult) < n) {
1062	0		mult_save[i].valid = 0; /* This multiplier would overflow 64-bit */
1063	0		continue;
1064			}
1065	3		mult_save[i].valid = 1;
1066	3		mult_save[i].b0 = isqrt(nn64);
1067	3		mult_save[i].imax = (UV) (sqrt(mult_save[i].b0) / 16);
1068	3	50	if (mult_save[i].imax < 20) mult_save[i].imax = 20;
1069	3	50	if (mult_save[i].imax > rounds) mult_save[i].imax = rounds;
1070	3		mult_save[i].Q0 = 1;
1071	3		mult_save[i].P = mult_save[i].b0;
1072	3		mult_save[i].Qn = (SQUFOF_TYPE)(nn64 - ((UV)mult_save[i].b0 * (UV)mult_save[i].b0));
1073	3	50	if (mult_save[i].Qn == 0)
1074	0		return found_factor(n, mult_save[i].b0, factors);
1075	3		mult_save[i].bn = ((UV)mult_save[i].b0 + (UV)mult_save[i].P) / (UV)mult_save[i].Qn;
1076	3		mult_save[i].it = 0;
1077	3		mult_save[i].mult = mult;
1078			}
1079	3		f64 = squfof_unit(nn64, &mult_save[i]);
1080	3	50	if (f64 > 1) {
1081	3	50	if (f64 != mult) {
1082	3		f64 /= gcd_ui(f64, mult);
1083	3	100	if (f64 != 1) {
1084			/*
1085			unsigned long totiter = 0;
1086			{int K; for (K = 0; K < NSQUFOF_MULT; K++) totiter += mult_save[K].it; }
1087			printf("f2 %lu mult %lu it %lu (%lu)\n",n,mult,totiter,(UV)mult_save[i].it);
1088			*/
1089	2		return found_factor(n, f64, factors);
1090			}
1091			}
1092			/* Found trivial factor. Quit working with this multiplier. */
1093	1		mult_save[i].valid = 0;
1094			}
1095	1	50	if (mult_save[i].valid == 1)
1096	0		still_racing = 1;
1097	1		rounds_done += mult_save[i].imax;
1098	1	50	if (rounds_done >= rounds)
1099	0		break;
1100			}
1101	0	0	} while (still_racing && rounds_done < rounds);
		0
1102
1103			/* No factors found */
1104	0		factors[0] = n;
1105	2		return 1;
1106			}
1107
1108			#define SQR_TAB_SIZE 512
1109			static int sqr_tab_init = 0;
1110			static double sqr_tab[SQR_TAB_SIZE];
1111	0		static void make_sqr_tab(void) {
1112			int i;
1113	0	0	for (i = 0; i < SQR_TAB_SIZE; i++)
1114	0		sqr_tab[i] = sqrt((double)i);
1115	0		sqr_tab_init = 1;
1116	0		}
1117
1118			/* Lehman written and tuned by Warren D. Smith.
1119			* Revised by Ben Buhrow and Dana Jacobsen. */
1120	0		int lehman_factor(UV n, UV *factors, int do_trial) {
1121	0	0	const double Tune = ((n >> 31) >> 5) ? 3.5 : 5.0;
1122			double x, sqrtn;
1123			UV a,c,kN,kN4,B2;
1124			uint32_t b,p,k,r,B,U,Bred,inc,ip;
1125
1126	0	0	if (!(n&1)) return found_factor(n, 2, factors);
1127
1128	0		B = Tune * (1+rootof(n,3));
1129
1130	0	0	if (do_trial) {
1131	0		uint32_t FirstCut = 0.1 * B;
1132	0	0	if (FirstCut < 84) FirstCut = 84;
1133	0	0	if (FirstCut > 65535) FirstCut = 65535;
1134	0	0	for (ip = 2; ip < NPRIMES_SMALL; ip++) {
1135	0		p = primes_small[ip];
1136	0	0	if (p >= FirstCut)
1137	0		break;
1138	0	0	if (n % p == 0)
1139	0		return found_factor(n, p, factors);
1140			}
1141			}
1142
1143	0	0	if (n >= UVCONST(8796393022207)) { factors[0] = n; return 1; }
1144	0		Bred = B / (Tune * Tune * Tune);
1145	0		B2 = B*B;
1146	0		kN = 0;
1147
1148	0	0	if (!sqr_tab_init) make_sqr_tab();
1149	0		sqrtn = sqrt(n);
1150
1151	0	0	for (k = 1; k <= Bred; k++) {
1152	0	0	if (k&1) { inc = 4; r = (k+n) % 4; }
1153	0		else { inc = 2; r = 1; }
1154	0		kN += n;
1155	0	0	if (kN >= UVCONST(1152921504606846976)) { factors[0] = n; return 1; }
1156	0		kN4 = kN*4;
1157
1158	0	0	x = (k < SQR_TAB_SIZE) ? sqrtn * sqr_tab[k] : sqrt((double)kN);
1159	0		a = x;
1160	0	0	if ((UV)a * (UV)a == kN)
1161	0		return found_factor(n, gcd_ui(a,n), factors);
1162	0		x *= 2;
1163	0		a = x + 0.9999999665; /* Magic constant */
1164	0		b = a % inc;
1165	0		b = a + (inc+r-b) % inc;
1166	0		c = (UV)b*(UV)b - kN4;
1167	0		U = x + B2/(2*x);
1168	0	0	for (a = b; a <= U; c += inc*(a+a+inc), a += inc) {
1169			/* Check for perfect square */
1170	0		b = c & 127;
1171	0	0	if (!((b0x8bc40d7d) & (b0xa1e2f5d1) & 0x14020a)) {
1172	0		b = (uint32_t) sqrt(c);
1173	0	0	if (c == b*b) {
1174	0		B2 = gcd_ui(a+b, n);
1175	0		return found_factor(n, B2, factors);
1176			}
1177			}
1178			}
1179			}
1180	0	0	if (do_trial) {
1181	0	0	if (B > 65535) B = 65535;
1182			/* trial divide from primes[ip] to B. We could:
1183			* 1) use table of 6542 shorts for the primes.
1184			* 2) use a wheel
1185			* 3) bail to trial_factor which duplicates some work
1186			*/
1187	0		return trial_factor(n, factors, B);
1188			}
1189	0		factors[0] = n;
1190	0		return 1;
1191			}
1192
1193	23		static UV dlp_trial(UV a, UV g, UV p, UV maxrounds) {
1194			UV k, t;
1195	23	50	if (maxrounds > p) maxrounds = p;
1196
1197			#if USE_MONTMATH
1198	23	100	if (p&1) {
1199	18		const uint64_t npi = mont_inverse(p), mont1 = mont_get1(p);
1200	18		g = mont_geta(g, p);
1201	18		a = mont_geta(a, p);
1202	13930	50	for (t = g, k = 1; k < maxrounds; k++) {
1203	13930	100	if (t == a)
1204	18		return k;
1205	13912	50	t = mont_mulmod(t, g, p);
1206	13912	50	if (t == g) break; /* Stop at cycle */
1207			}
1208			} else
1209			#endif
1210			{
1211	9	50	for (t = g, k = 1; k < maxrounds; k++) {
1212	9	100	if (t == a)
1213	4		return k;
1214	5		t = mulmod(t, g, p);
1215	5	100	if (t == g) break; /* Stop at cycle */
1216			}
1217			}
1218	1		return 0;
1219			}
1220
1221			/******************************************************************************/
1222			/* DLP - Pollard Rho */
1223			/******************************************************************************/
1224
1225			/* Compare with Pomerance paper (dartmouth dtalk4):
1226			* Type I/II/III = our case 1, 0, 2.
1227			* x_i = u, a_i = v, b_i = w
1228			*
1229			* Also see Bai/Brent 2008 for many ideas to speed this up.
1230			* https://maths-people.anu.edu.au/~brent/pd/rpb231.pdf
1231			* E.g. Teske adding-walk, Brent's cycle algo, Teske modified cycle
1232			*/
1233			#define pollard_rho_cycle(u,v,w,p,n,a,g) \
1234			switch (u % 3) { \
1235			case 0: u = mulmod(u,u,p); v = mulmod(v,2,n); w = mulmod(w,2,n); break;\
1236			case 1: u = mulmod(u,a,p); v = addmod(v,1,n); break;\
1237			case 2: u = mulmod(u,g,p); w = addmod(w,1,n); break;\
1238			}
1239
1240			typedef struct prho_state_t {
1241			UV u;
1242			UV v;
1243			UV w;
1244			UV U;
1245			UV V;
1246			UV W;
1247			UV round;
1248			int failed;
1249			int verbose;
1250			} prho_state_t;
1251
1252	4		static UV dlp_prho_uvw(UV a, UV g, UV p, UV n, UV rounds, prho_state_t *s) {
1253	4		UV i, k = 0;
1254	4		UV u=s->u, v=s->v, w=s->w;
1255	4		UV U=s->U, V=s->V, W=s->W;
1256	4		int const verbose = s->verbose;
1257
1258	4	50	if (s->failed) return 0;
1259	4	50	if (s->round + rounds > n) rounds = n - s->round;
1260
1261	26787	100	for (i = 1; i <= rounds; i++) {
1262	26785		pollard_rho_cycle(u,v,w,p,n,a,g); /* xi, ai, bi */
1263	26785		pollard_rho_cycle(U,V,W,p,n,a,g);
1264	26785		pollard_rho_cycle(U,V,W,p,n,a,g); /* x2i, a2i, b2i */
1265	26785	50	if (verbose > 3) printf( "%3"UVuf" %4"UVuf" %3"UVuf" %3"UVuf" %4"UVuf" %3"UVuf" %3"UVuf"\n", i, u, v, w, U, V, W );
1266	26785	100	if (u == U) {
1267			UV r1, r2, G, G2;
1268	2		r1 = submod(v, V, n);
1269	2	50	if (r1 == 0) {
1270	0	0	if (verbose) printf("DLP Rho failure, r=0\n");
1271	0		s->failed = 1;
1272	0		k = 0;
1273	0		break;
1274			}
1275	2		r2 = submod(W, w, n);
1276
1277	2		G = gcd_ui(r1,n);
1278	2		G2 = gcd_ui(G,r2);
1279	2		k = divmod(r2/G2, r1/G2, n/G2);
1280	2	50	if (G > 1) {
1281	0	0	if (powmod(g,k,p) == a) {
1282	0	0	if (verbose > 2) printf(" common GCD %"UVuf"\n", G2);
1283			} else {
1284	0		UV m, l = divmod(r2, r1, n/G);
1285	0	0	for (m = 0; m < G; m++) {
1286	0		k = addmod(l, mulmod(m,(n/G),n), n);
1287	0	0	if (powmod(g,k,p) == a) break;
1288			}
1289	0	0	if (m 2) printf(" GCD %"UVuf", found with m=%"UVuf"\n", G, m);
		0
1290			}
1291			}
1292
1293	2	50	if (powmod(g,k,p) != a) {
1294	0	0	if (verbose > 2) printf("r1 = %"UVuf" r2 = %"UVuf" k = %"UVuf"\n", r1, r2, k);
1295	0	0	if (verbose) printf("Incorrect DLP Rho solution: %"UVuf"\n", k);
1296	0		s->failed = 1;
1297	0		k = 0;
1298			}
1299	2		break;
1300			}
1301			}
1302	4		s->round += i-1;
1303	4	50	if (verbose && k) printf("DLP Rho solution found after %"UVuf" steps\n", s->round + 1);
		0
1304	4		s->u = u; s->v = v; s->w = w; s->U = U; s->V = V; s->W = W;
1305	4		return k;
1306			}
1307
1308			#if 0
1309			static UV dlp_prho(UV a, UV g, UV p, UV n, UV maxrounds) {
1310			#ifdef DEBUG
1311			int const verbose = _XS_get_verbose()
1312			#else
1313			int const verbose = 0;
1314			#endif
1315			prho_state_t s = {1, 0, 0, 1, 0, 0, 0, 0, verbose};
1316			return dlp_prho_uvw(a, g, p, n, maxrounds, &s);
1317			}
1318			#endif
1319
1320
1321			/******************************************************************************/
1322			/* DLP - BSGS */
1323			/******************************************************************************/
1324
1325			typedef struct bsgs_hash_t {
1326			UV M; /* The baby step index */
1327			UV V; /* The powmod value */
1328			struct bsgs_hash_t* next;
1329			} bsgs_hash_t;
1330
1331			/****************************************/
1332			/* Simple and limited pool allocation */
1333			#define BSGS_ENTRIES_PER_PAGE 8000
1334			typedef struct bsgs_page_top_t {
1335			struct bsgs_page_t* first;
1336			bsgs_hash_t** table;
1337			UV size;
1338			int nused;
1339			int npages;
1340			} bsgs_page_top_t;
1341
1342			typedef struct bsgs_page_t {
1343			bsgs_hash_t entries[BSGS_ENTRIES_PER_PAGE];
1344			struct bsgs_page_t* next;
1345			} bsgs_page_t;
1346
1347	4139		static bsgs_hash_t* get_entry(bsgs_page_top_t* top) {
1348	4139	100	if (top->nused == 0 \|\| top->nused >= BSGS_ENTRIES_PER_PAGE) {
		50
1349			bsgs_page_t* newpage;
1350	2		Newz(0, newpage, 1, bsgs_page_t);
1351	2		newpage->next = top->first;
1352	2		top->first = newpage;
1353	2		top->nused = 0;
1354	2		top->npages++;
1355			}
1356	4139		return top->first->entries + top->nused++;
1357			}
1358	2		static void destroy_pages(bsgs_page_top_t* top) {
1359	2		bsgs_page_t* head = top->first;
1360	4	100	while (head != 0) {
1361	2		bsgs_page_t* next = head->next;
1362	2		Safefree(head);
1363	2		head = next;
1364			}
1365	2		top->first = 0;
1366	2		}
1367			/****************************************/
1368
1369	2		static void bsgs_hash_put(bsgs_page_top_t* pagetop, UV v, UV i) {
1370	2		UV idx = v % pagetop->size;
1371	2		bsgs_hash_t** table = pagetop->table;
1372	2		bsgs_hash_t* entry = table[idx];
1373
1374	2	50	while (entry && entry->V != v)
		0
1375	0		entry = entry->next;
1376
1377	2	50	if (!entry) {
1378	2		entry = get_entry(pagetop);
1379	2		entry->M = i;
1380	2		entry->V = v;
1381	2		entry->next = table[idx];
1382	2		table[idx] = entry;
1383			}
1384	2		}
1385
1386	0		static UV bsgs_hash_get(bsgs_page_top_t* pagetop, UV v) {
1387	0		bsgs_hash_t* entry = pagetop->table[v % pagetop->size];
1388	0	0	while (entry && entry->V != v)
		0
1389	0		entry = entry->next;
1390	0	0	return (entry) ? entry->M : 0;
1391			}
1392
1393	4137		static UV bsgs_hash_put_get(bsgs_page_top_t* pagetop, UV v, UV i) {
1394	4137		UV idx = v % pagetop->size;
1395	4137		bsgs_hash_t** table = pagetop->table;
1396	4137		bsgs_hash_t* entry = table[idx];
1397
1398	4260	100	while (entry && entry->V != v)
		50
1399	123		entry = entry->next;
1400
1401	4137	50	if (entry)
1402	0		return entry->M;
1403
1404	4137		entry = get_entry(pagetop);
1405	4137		entry->M = i;
1406	4137		entry->V = v;
1407	4137		entry->next = table[idx];
1408	4137		table[idx] = entry;
1409	4137		return 0;
1410			}
1411
1412	3		static UV dlp_bsgs(UV a, UV g, UV p, UV n, UV maxent, int race_rho) {
1413			bsgs_page_top_t PAGES;
1414			UV i, m, maxm, hashmap_count;
1415			UV aa, S, gm, T, gs_i, bs_i;
1416	3		UV result = 0;
1417			#ifdef DEBUG
1418			int const verbose = _XS_get_verbose();
1419			#else
1420	3		int const verbose = 0;
1421			#endif
1422	3		prho_state_t rho_state = {1, 0, 0, 1, 0, 0, 0, 0, verbose};
1423
1424	3	50	if (n <= 2) return 0; /* Shouldn't be here with gorder this low */
1425
1426	3	50	if (race_rho) {
1427	3		result = dlp_prho_uvw(a, g, p, n, 10000, &rho_state);
1428	3	100	if (result) {
1429	1	50	if (verbose) printf("rho found solution in BSGS step 0\n");
1430	1		return result;
1431			}
1432			}
1433
1434	2	50	if (a == 0) return 0; /* We don't handle this case */
1435
1436	2		maxm = isqrt(n);
1437	2		m = (maxent > maxm) ? maxm : maxent;
1438
1439	2	50	hashmap_count = (m < 65537) ? 65537 :
1440	0	0	(m > 40000000) ? 40000003 :
1441			next_prime(m); /* Ave depth around 2 */
1442
1443			/* Create table. Size: 8hashmap_count bytes. /
1444	2		PAGES.size = hashmap_count;
1445	2		PAGES.first = 0;
1446	2		PAGES.nused = 0;
1447	2		PAGES.npages = 0;
1448	2	50	Newz(0, PAGES.table, hashmap_count, bsgs_hash_t*);
1449
1450	2		aa = mulmod(a,a,p);
1451	2		S = a;
1452	2		gm = powmod(g, m, p);
1453	2		T = gm;
1454	2		gs_i = 0;
1455	2		bs_i = 0;
1456
1457	2		bsgs_hash_put(&PAGES, S, 0); /* First baby step */
1458	2		S = mulmod(S, g, p);
1459			/* Interleaved Baby Step Giant Step */
1460	2069	50	for (i = 1; i <= m; i++) {
1461	2069		gs_i = bsgs_hash_put_get(&PAGES, S, i);
1462	2069	50	if (gs_i) { bs_i = i; break; }
1463	2069		S = mulmod(S, g, p);
1464	2069	100	if (S == aa) { /* We discovered the solution! */
1465	1	50	if (verbose) printf(" dlp bsgs: solution at BS step %"UVuf"\n", i+1);
1466	1		result = i+1;
1467	1		break;
1468			}
1469	2068		bs_i = bsgs_hash_put_get(&PAGES, T, i);
1470	2068	50	if (bs_i) { gs_i = i; break; }
1471	2068		T = mulmod(T, gm, p);
1472	2068	50	if (race_rho && (i % 2048) == 0) {
		100
1473	1		result = dlp_prho_uvw(a, g, p, n, 100000, &rho_state);
1474	1	50	if (result) {
1475	1	50	if (verbose) printf("rho found solution in BSGS step %"UVuf"\n", i);
1476	1		break;
1477			}
1478			}
1479			}
1480
1481	2	50	if (!result) {
1482			/* Extend Giant Step search */
1483	0	0	if (!(gs_i \|\| bs_i)) {
		0
1484	0		UV b = (p+m-1)/m;
1485	0	0	if (m < maxm && b > 8m) b = 8m;
		0
1486	0	0	for (i = m+1; i < b; i++) {
1487	0		bs_i = bsgs_hash_get(&PAGES, T);
1488	0	0	if (bs_i) { gs_i = i; break; }
1489	0		T = mulmod(T, gm, p);
1490	0	0	if (race_rho && (i % 2048) == 0) {
		0
1491	0		result = dlp_prho_uvw(a, g, p, n, 100000, &rho_state);
1492	0	0	if (result) {
1493	0	0	if (verbose) printf("rho found solution in BSGS step %"UVuf"\n", i);
1494	0		break;
1495			}
1496			}
1497			}
1498			}
1499
1500	0	0	if (gs_i \|\| bs_i) {
		0
1501	0		result = submod(mulmod(gs_i, m, p), bs_i, p);
1502			}
1503			}
1504	2	50	if (verbose) printf(" dlp bsgs using %d pages (%.1fMB+%.1fMB) for hash\n", PAGES.npages, ((double)PAGES.npages * sizeof(bsgs_page_t)) / (10241024), ((double)hashmap_count sizeof(bsgs_hash_t)) / (10241024));
1505
1506	2		destroy_pages(&PAGES);
1507	2		Safefree(PAGES.table);
1508	2	50	if (result != 0 && powmod(g,result,p) != a) {
		50
1509	0	0	if (verbose) printf("Incorrect DLP BSGS solution: %"UVuf"\n", result);
1510	0		result = 0;
1511			}
1512	2	50	if (race_rho && result == 0) {
		50
1513	0		result = dlp_prho_uvw(a, g, p, n, 2000000000U, &rho_state);
1514			}
1515	3		return result;
1516			}
1517
1518			/* Find smallest k where a = g^k mod p */
1519			#define DLP_TRIAL_NUM 10000
1520	16		static UV znlog_solve(UV a, UV g, UV p, UV n) {
1521			UV k, sqrtn;
1522	16		const int verbose = _XS_get_verbose();
1523
1524	16	50	if (a >= p) a %= p;
1525	16	50	if (g >= p) g %= p;
1526
1527	16	100	if (a == 1 \|\| g == 0 \|\| p <= 2)
		50
		50
1528	1		return 0;
1529
1530	15	50	if (verbose > 1 && n != p-1) printf(" g=%"UVuf" p=%"UVuf", order %"UVuf"\n", g, p, n);
		0
1531
1532			/* printf(" solving znlog(%"UVuf",%"UVuf",%"UVuf") n=%"UVuf"\n", a, g, p, n); */
1533
1534	15	50	if (n == 0 \|\| n <= DLP_TRIAL_NUM) {
		100
1535	12		k = dlp_trial(a, g, p, DLP_TRIAL_NUM);
1536	12	50	if (verbose) printf(" dlp trial 10k %s\n", (k!=0 \|\| p <= DLP_TRIAL_NUM) ? "success" : "failure");
		0
		0
1537	12	50	if (k != 0 \|\| (n > 0 && n <= DLP_TRIAL_NUM)) return k;
		0
		0
1538			}
1539
1540			{ /* Existence checks */
1541	3		UV aorder, gorder = n;
1542	3	50	if (gorder != 0 && powmod(a, gorder, p) != 1) return 0;
		50
1543	3		aorder = znorder(a,p);
1544	3	50	if (aorder == 0 && gorder != 0) return 0;
		0
1545	3	50	if (aorder != 0 && gorder % aorder != 0) return 0;
		50
1546			}
1547
1548	3	50	sqrtn = (n == 0) ? 0 : isqrt(n);
1549	3	50	if (n == 0) n = p-1;
1550
1551			{
1552	3	50	UV maxent = (sqrtn > 0) ? sqrtn+1 : 100000;
1553	3		k = dlp_bsgs(a, g, p, n, maxent/2, /* race rho */ 1);
1554	3	50	if (verbose) printf(" dlp bsgs %"UVuf"k %s\n", maxent/1000, k!=0 ? "success" : "failure");
		0
1555	3	50	if (k != 0) return k;
1556	0	0	if (sqrtn > 0 && sqrtn < maxent) return 0;
		0
1557			}
1558
1559	0	0	if (verbose) printf(" dlp doing exhaustive trial\n");
1560	0		k = dlp_trial(a, g, p, p);
1561	0		return k;
1562			}
1563
1564			/* Silver-Pohlig-Hellman */
1565	5		static UV znlog_ph(UV a, UV g, UV p, UV p1) {
1566			UV fac[MPU_MAX_FACTORS+1];
1567			UV exp[MPU_MAX_FACTORS+1];
1568			int i, nfactors;
1569			UV x, j;
1570
1571	5	50	if (p1 == 0) return 0; /* TODO: Should we plow on with p1=p-1? */
1572	5		nfactors = factor_exp(p1, fac, exp);
1573	5	50	if (nfactors == 1)
1574	0		return znlog_solve(a, g, p, p1);
1575	21	100	for (i = 0; i < nfactors; i++) {
1576			UV pi, delta, gamma;
1577	17	100	pi = fac[i]; for (j = 1; j < exp[i]; j++) pi *= fac[i];
1578	16		delta = powmod(a,p1/pi,p);
1579	16		gamma = powmod(g,p1/pi,p);
1580			/* printf(" solving znlog(%"UVuf",%"UVuf",%"UVuf")\n", delta, gamma, p); */
1581	16		fac[i] = znlog_solve( delta, gamma, p, znorder(gamma,p) );
1582	16		exp[i] = pi;
1583			}
1584	5		x = chinese(fac, exp, nfactors, &i);
1585	5	50	if (i == 1 && powmod(g, x, p) == a)
		50
1586	5		return x;
1587	5		return 0;
1588			}
1589
1590			/* Find smallest k where a = g^k mod p */
1591	20		UV znlog(UV a, UV g, UV p) {
1592			UV k, gorder, aorder;
1593	20		const int verbose = _XS_get_verbose();
1594
1595	20	50	if (a >= p) a %= p;
1596	20	50	if (g >= p) g %= p;
1597
1598	20	100	if (a == 1 \|\| g == 0 \|\| p <= 2)
		50
		50
1599	2		return 0;
1600
1601			/* TODO: We call znorder with the same p many times. We should have a
1602			* method for znorder given {phi,nfactors,fac,exp} */
1603
1604	18		gorder = znorder(g,p);
1605	18	100	if (gorder != 0 && powmod(a, gorder, p) != 1) return 0;
		100
1606	16		aorder = znorder(a,p);
1607	16	100	if (aorder == 0 && gorder != 0) return 0;
		50
1608	16	100	if (aorder != 0 && gorder % aorder != 0) return 0;
		50
1609
1610			/* TODO: Come up with a better solution for a=0 */
1611	16	100	if (a == 0 \|\| p < DLP_TRIAL_NUM \|\| (gorder > 0 && gorder < DLP_TRIAL_NUM)) {
		100
		50
		50
1612	11	50	if (verbose > 1) printf(" dlp trial znlog(%"UVuf",%"UVuf",%"UVuf")\n",a,g,p);
1613	11		k = dlp_trial(a, g, p, p);
1614	11		return k;
1615			}
1616
1617	5	50	if (!is_prob_prime(gorder)) {
1618	5		k = znlog_ph(a, g, p, gorder);
1619	5	50	if (verbose) printf(" dlp PH %s\n", k!=0 ? "success" : "failure");
		0
1620	5	50	if (k != 0) return k;
1621			}
1622
1623	0		return znlog_solve(a, g, p, gorder);
1624			}
1625
1626
1627			/* Compile with:
1628			* gcc -O3 -fomit-frame-pointer -march=native -Wall -DFACTOR_STANDALONE -DSTANDALONE factor.c util.c sieve.c cache.c primality.c lmo.c -lm
1629			*/
1630			#ifdef FACTOR_STANDALONE
1631			#include
1632			int main(int argc, char *argv[])
1633			{
1634			UV n;
1635			UV factors[MPU_MAX_FACTORS+1];
1636			int nfactors, i, a;
1637
1638			if (argc <= 1) { printf("usage: %s \n", argv[0]); return(1); }
1639
1640			for (a = 1; a < argc; a++) {
1641			n = strtoul(argv[a], 0, 10);
1642			if (n == ULONG_MAX && errno == ERANGE) { printf("Argument larger than ULONG_MAX\n"); return(-1); }
1643			nfactors = factor(n, factors);
1644			printf("%"UVuf":", n);
1645			for (i = 0; i < nfactors; i++)
1646			printf(" %"UVuf"", factors[i]);
1647			printf("\n");
1648			}
1649
1650			return(0);
1651			}
1652			#endif