File Coverage

factor.c

Criterion	Covered	Total	%
statement	642	880	72.9
branch	483	930	51.9
condition			n/a
subroutine			n/a
pod			n/a
total	1125	1810	62.1

line	stmt	bran	code
1			#include
2			#include
3			#include
4			#include
5
6			#define FUNC_isqrt 1
7			#define FUNC_gcd_ui 1
8			#define FUNC_is_perfect_square 1
9			#define FUNC_clz 1
10			#include "ptypes.h"
11			#include "factor.h"
12			#include "sieve.h"
13			#include "util.h"
14			#include "mulmod.h"
15			#include "cache.h"
16			#include "primality.h"
17			#include "montmath.h"
18			static int holf32(uint32_t n, UV *factors, uint32_t rounds);
19
20			/*
21			* You need to remember to use UV for unsigned and IV for signed types that
22			* are large enough to hold our data.
23			* If you use int, that's 32-bit on LP64 and LLP64 machines. You lose.
24			* If you use long, that's 32-bit on LLP64 machines. You lose.
25			* If you use long long, you may be too large which isn't so bad, but some
26			* compilers may not understand the type at all.
27			* perl.h already figured all this out, and provided us with these types which
28			* match the native integer type used inside our Perl, so just use those.
29			*/
30
31			static const unsigned short primes_small[] =
32			{0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
33			101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,
34			193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,
35			293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,
36			409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503,509,
37			521,523,541,547,557,563,569,571,577,587,593,599,601,607,613,617,619,631,
38			641,643,647,653,659,661,673,677,683,691,701,709,719,727,733,739,743,751,
39			757,761,769,773,787,797,809,811,821,823,827,829,839,853,857,859,863,877,
40			881,883,887,907,911,919,929,937,941,947,953,967,971,977,983,991,997,1009,
41			1013,1019,1021,1031,1033,1039,1049,1051,1061,1063,1069,1087,1091,1093,
42			1097,1103,1109,1117,1123,1129,1151,1153,1163,1171,1181,1187,1193,1201,
43			1213,1217,1223,1229,1231,1237,1249,1259,1277,1279,1283,1289,1291,1297,
44			1301,1303,1307,1319,1321,1327,1361,1367,1373,1381,1399,1409,1423,1427,
45			1429,1433,1439,1447,1451,1453,1459,1471,1481,1483,1487,1489,1493,1499,
46			1511,1523,1531,1543,1549,1553,1559,1567,1571,1579,1583,1597,1601,1607,
47			1609,1613,1619,1621,1627,1637,1657,1663,1667,1669,1693,1697,1699,1709,
48			1721,1723,1733,1741,1747,1753,1759,1777,1783,1787,1789,1801,1811,1823,
49			1831,1847,1861,1867,1871,1873,1877,1879,1889,1901,1907,1913,1931,1933,
50			1949,1951,1973,1979,1987,1993,1997,1999,2003,2011};
51			#define NPRIMES_SMALL (sizeof(primes_small)/sizeof(primes_small[0]))
52
53
54			/* The main factoring loop */
55			/* Puts factors in factors[] and returns the number found. */
56	34152		int factor(UV n, UV *factors)
57			{
58	34152		int nsmallfactors, nfactors = 0; /* Number of factored in factors result */
59	34152		uint32_t f = 7;
60
61	34152	100	if (n > 1) {
62	48604	100	while ( (n & 1) == 0 ) { factors[nfactors++] = 2; n /= 2; }
63	44406	100	while ( (n % 3) == 0 ) { factors[nfactors++] = 3; n /= 3; }
64	39937	100	while ( (n % 5) == 0 ) { factors[nfactors++] = 5; n /= 5; }
65			}
66
67	34152	100	if (f*f <= n) {
68	27135		UV const lastsp = 83;
69	27135		UV sp = 4;
70			/* Trial division from 7 to 421. Use 32-bit if possible. */
71	27135	100	if (n <= 4294967295U) {
72	26909		unsigned int un = n;
73	240805	100	while (sp < lastsp) {
74	253903	100	while ( (un%f) == 0 ) {
75	13320		factors[nfactors++] = f;
76	13320		un /= f;
77			}
78	240583		f = primes_small[++sp];
79	240583	100	if (f*f > un) break;
80			}
81	26909		n = un;
82			} else {
83	17352	100	while (sp < lastsp) {
84	17388	100	while ( (n%f) == 0 ) {
85	250		factors[nfactors++] = f;
86	250		n /= f;
87			}
88	17138		f = primes_small[++sp];
89	17138	100	if (f*f > n) break;
90			}
91			}
92			/* If n is small and still composite, finish it here */
93	27135	100	if (n < 20112011 && ff <= n) { /* Trial division from 431 to 2003 */
		100
94	62		unsigned int un = n;
95	5698	50	while (sp < NPRIMES_SMALL) {
96	5719	100	while ( (un%f) == 0 ) {
97	21		factors[nfactors++] = f;
98	21		un /= f;
99			}
100	5698		f = primes_small[++sp];
101	5698	100	if (f*f > un) break;
102			}
103	62		n = un;
104			}
105			}
106	34152	100	if (f*f > n) {
107	33778	100	if (n != 1) factors[nfactors++] = n;
108	33778		return nfactors;
109			}
110			#if BITS_PER_WORD == 64
111			/* For small values less than f^3, use simple factor to split semiprime */
112	374	100	if (n < 100000000 && n < fff) {
		100
113	108	100	if (MR32(n)) factors[nfactors++] = n;
114	54		else nfactors += holf32(n, factors+nfactors, 10000);
115	108		return nfactors;
116			}
117			#endif
118
119	266		nsmallfactors = nfactors;
120
121			/* Perfect powers. Factor root only once. */
122			{
123	266		int i, j, k = powerof(n);
124	266	100	if (k > 1) {
125	5		UV p = rootof(n, k);
126	5		nfactors = factor(p, factors+nsmallfactors);
127	17	100	for (i = nfactors; i >= 0; i--)
128	38	100	for (j = 0; j < k; j++)
129	26		factors[nsmallfactors+k*i+j] = factors[nsmallfactors+i];
130	5		return nsmallfactors + k*nfactors;
131			}
132			}
133
134			{
135			UV tofac_stack[MPU_MAX_FACTORS+1];
136	261		int i, j, ntofac = 0;
137	261		int const verbose = _XS_get_verbose();
138
139			/* loop over each remaining factor, until ntofac == 0 */
140			do {
141	729	100	while ( (n >= f*f) && (!is_def_prime(n)) ) {
		100
		100
		100
142	234		int split_success = 0;
143			/* Adjust the number of rounds based on the number size and speed */
144	234	50	UV const nbits = BITS_PER_WORD - clz(n);
145			#if USE_MONTMATH
146	234	100	UV const br_rounds = 8000 + (9000 * ((nbits <= 45) ? 0 : (nbits-45)));
147	234		UV const sq_rounds = 200000;
148			#elif MULMODS_ARE_FAST
149			UV const br_rounds = 500 + ( 200 * ((nbits <= 45) ? 0 : (nbits-45)));
150			UV const sq_rounds = 100000;
151			#else
152			UV const br_rounds = (nbits >= 63) ? 120000 : (nbits >= 58) ? 500 : 0;
153			UV const sq_rounds = 200000;
154			#endif
155
156			#if BITS_PER_WORD == 64
157			/* For small semiprimes the fastest solution is HOLF under 32, then
158			* Lehman (no trial) under 38. However on random inputs, HOLF is
159			* best only under 28-30 bits, and adding Lehman is always slower. */
160	234	50	if (!split_success && nbits <= 30) { /* This should always succeed */
		100
161	37		split_success = holf32(n, tofac_stack+ntofac, 1000000)-1;
162	37	50	if (verbose) printf("holf %d\n", split_success);
163			}
164			#endif
165			/* Almost all inputs are factored here */
166	234	100	if (!split_success && br_rounds > 0) {
		50
167	197		split_success = pbrent_factor(n, tofac_stack+ntofac, br_rounds, 1)-1;
168	197	50	if (verbose) printf("pbrent %d\n", split_success);
169			}
170			#if USE_MONTMATH
171	234	50	if (!split_success) {
172	0		split_success = pbrent_factor(n, tofac_stack+ntofac, 2*br_rounds, 3)-1;
173	0	0	if (verbose) printf("second pbrent %d\n", split_success);
174			}
175			#endif
176			/* Random 64-bit inputs at this point:
177			* About 3.1% are small enough that we did with HOLF.
178			* montmath: 96.89% pbrent, 0.01% pbrent2
179			* fast: 73.43% pbrent, 21.97% squfof, 1.09% p-1, 0.49% prho, long
180			* slow: 75.34% squfof, 19.47% pbrent, 0.20% p-1, 0.06% prho
181			*/
182			/* SQUFOF with these parameters gets 99.9% of everything left */
183	234	50	if (!split_success && nbits <= 62) {
		0
184	0		split_success = squfof_factor(n,tofac_stack+ntofac, sq_rounds)-1;
185	0	0	if (verbose) printf("squfof %d\n", split_success);
186			}
187			/* At this point we should only have 16+ digit semiprimes. */
188	234	50	if (!split_success) {
189	0		split_success = pminus1_factor(n, tofac_stack+ntofac, 8000, 120000)-1;
190	0	0	if (verbose) printf("pminus1 %d\n", split_success);
191			/* Get the stragglers */
192	0	0	if (!split_success) {
193	0		split_success = prho_factor(n, tofac_stack+ntofac, 120000)-1;
194	0	0	if (verbose) printf("long prho %d\n", split_success);
195	0	0	if (!split_success) {
196	0		split_success = pbrent_factor(n, tofac_stack+ntofac, 500000, 5)-1;
197	0	0	if (verbose) printf("long pbrent %d\n", split_success);
198			}
199			}
200			}
201
202	234	50	if (split_success) {
203	234	50	MPUassert( split_success == 1, "split factor returned more than 2 factors");
204	234		ntofac++; /* Leave one on the to-be-factored stack */
205	234	50	if ((tofac_stack[ntofac] == n) \|\| (tofac_stack[ntofac] == 1))
		50
206	0		croak("bad factor\n");
207	234		n = tofac_stack[ntofac]; /* Set n to the other one */
208			} else {
209			/* Nothing should ever get here, but we're paranoid. */
210	0		UV m = f % 30;
211	0		UV limit = isqrt(n);
212	0	0	if (verbose) printf("doing trial on %"UVuf"\n", n);
213	0	0	while (f <= limit) {
214	0	0	if ( (n%f) == 0 ) {
215			do {
216	0		n /= f;
217	0		factors[nfactors++] = f;
218	0	0	} while ( (n%f) == 0 );
219	0		limit = isqrt(n);
220			}
221	0		f += wheeladvance30[m];
222	0		m = nextwheel30[m];
223			}
224	0		break; /* We just factored n via trial division. Exit loop. */
225			}
226			}
227			/* n is now prime (or 1), so add to already-factored stack */
228	495	50	if (n != 1) factors[nfactors++] = n;
229			/* Pop the next number off the to-factor stack */
230	495	100	if (ntofac > 0) n = tofac_stack[ntofac-1];
231	495	100	} while (ntofac-- > 0);
232
233			/* Sort the non-small factors */
234	495	100	for (i = nsmallfactors+1; i < nfactors; i++) {
235	234		UV fi = factors[i];
236	518	100	for (j = i; j > 0 && factors[j-1] > fi; j--)
		100
237	284		factors[j] = factors[j-1];
238	234		factors[j] = fi;
239			}
240			}
241	261		return nfactors;
242			}
243
244
245	12707		int factor_exp(UV n, UV factors, UV exponents)
246			{
247	12707		int i = 1, j = 1, nfactors;
248
249	12707	100	if (n == 1) return 0;
250	12695		nfactors = factor(n, factors);
251
252	12695	100	if (exponents == 0) {
253	131	100	for (; i < nfactors; i++)
254	92	100	if (factors[i] != factors[i-1])
255	62		factors[j++] = factors[i];
256			} else {
257	12656		exponents[0] = 1;
258	29239	100	for (; i < nfactors; i++) {
259	16583	100	if (factors[i] != factors[i-1]) {
260	13080		exponents[j] = 1;
261	13080		factors[j++] = factors[i];
262			} else {
263	3503		exponents[j-1]++;
264			}
265			}
266			}
267	12695		return j;
268			}
269
270
271	1619		int trial_factor(UV n, UV *factors, UV f, UV last)
272			{
273	1619		int sp, nfactors = 0;
274
275	1619	50	if (f < 2) f = 2;
276	1619	100	if (last == 0 \|\| last*last > n) last = UV_MAX;
		100
277
278	1619	100	if (n < 4 \|\| last < f) {
		50
279	8		factors[0] = n;
280	8		return (n == 1) ? 0 : 1;
281			}
282
283			/* possibly do uint32_t specific code here */
284
285	1611	50	if (f < primes_small[NPRIMES_SMALL-1]) {
286	3232	100	while ( (n & 1) == 0 ) { factors[nfactors++] = 2; n >>= 1; }
287	2415	100	if (3<=last) while ( (n % 3) == 0 ) { factors[nfactors++] = 3; n /= 3; }
		100
288	1983	100	if (5<=last) while ( (n % 5) == 0 ) { factors[nfactors++] = 5; n /= 5; }
		100
289	5601	100	for (sp = 4; sp < (int)NPRIMES_SMALL; sp++) {
290	5598		f = primes_small[sp];
291	5598	100	if (f*f > n \|\| f > last) break;
		100
292	4374	100	while ( (n%f) == 0 ) {
293	384		factors[nfactors++] = f;
294	384		n /= f;
295			}
296			}
297			}
298			/* Trial division using a mod-30 wheel for larger values */
299	1611	100	if (f*f <= n && f <= last) {
		100
300	3		UV m, newlimit, limit = isqrt(n);
301	3	100	if (limit > last) limit = last;
302	3		m = f % 30;
303	7906	100	while (f <= limit) {
304	7903	100	if ( (n%f) == 0 ) {
305			do {
306	2		factors[nfactors++] = f;
307	2		n /= f;
308	2	50	} while ( (n%f) == 0 );
309	2		newlimit = isqrt(n);
310	2	50	if (newlimit < limit) limit = newlimit;
311			}
312	7903		f += wheeladvance30[m];
313	7903		m = nextwheel30[m];
314			}
315			}
316			/* All done! */
317	1611	100	if (n != 1)
318	1485		factors[nfactors++] = n;
319	1611		return nfactors;
320			}
321
322
323	3862		static void _divisors_from_factors(UV nfactors, UV* fp, UV* fe, UV* res) {
324	3862		UV s, count = 1;
325
326	3862		res[0] = 1;
327	12942	100	for (s = 0; s < nfactors; s++) {
328	9080		UV i, j, scount = count, p = fp[s], e = fe[s], mult = 1;
329	21275	100	for (j = 0; j < e; j++) {
330	12195		mult *= p;
331	41155	100	for (i = 0; i < scount; i++)
332	28960		res[count++] = res[i] * mult;
333			}
334			}
335	3862		}
336
337	58951		static int numcmp(const void a, const void b)
338	58951	100	{ const UV x = a, y = b; return (x > y) ? 1 : (x < y) ? -1 : 0; }
		50
339
340	3866		UV* _divisor_list(UV n, UV *num_divisors)
341			{
342			UV factors[MPU_MAX_FACTORS+1];
343			UV exponents[MPU_MAX_FACTORS+1];
344			UV* divs;
345			int i, nfactors, ndivisors;
346
347	3866	100	if (n <= 1) {
348	4		New(0, divs, 2, UV);
349	4	100	if (n == 0) { divs[0] = 0; divs[1] = 1; *num_divisors = 2; }
350	4	100	if (n == 1) { divs[0] = 1; *num_divisors = 1; }
351	4		return divs;
352			}
353			/* Factor and convert to factor/exponent pair */
354	3862		nfactors = factor_exp(n, factors, exponents);
355			/* Calculate number of divisors, allocate space, fill with divisors */
356	3862		ndivisors = exponents[0] + 1;
357	9080	100	for (i = 1; i < nfactors; i++)
358	5218		ndivisors *= (exponents[i] + 1);
359	3862	50	New(0, divs, ndivisors, UV);
360	3862		_divisors_from_factors(nfactors, factors, exponents, divs);
361			/* Sort divisors (numeric ascending) */
362	3862		qsort(divs, ndivisors, sizeof(UV), numcmp);
363			/* Return number of divisors and list */
364	3862		*num_divisors = ndivisors;
365	3866		return divs;
366			}
367
368
369			/* The usual method, on OEIS for instance, is:
370			* (p^(k*(e+1))-1) / (p^k-1)
371			* but that overflows quicky. Instead we rearrange as:
372			* 1 + p^k + p^k^2 + ... p^k^e
373			* Return 0 if the result overflowed.
374			*/
375			static const UV sigma_overflow[11] =
376			#if BITS_PER_WORD == 64
377			{UVCONST(3000000000000000000),UVCONST(3000000000),2487240,64260,7026,
378			1622, 566, 256, 139, 85, 57};
379			#else
380			{UVCONST(845404560), 52560, 1548, 252, 84, 41, 24, 16, 12, 10, 8};
381			#endif
382	1505		UV divisor_sum(UV n, UV k)
383			{
384			UV factors[MPU_MAX_FACTORS+1];
385			int nfac, i, j;
386	1505		UV product = 1;
387
388	1505	50	if (k > 11 \|\| (k > 0 && n >= sigma_overflow[k-1])) return 0;
		100
		50
389	1505	100	if (n <= 1) /* n=0 divisors are [0,1] */
390	287	100	return (n == 1) ? 1 : (k == 0) ? 2 : 1; /* n=1 divisors are [1] */
		100
391	1218		nfac = factor(n,factors);
392	1218	100	if (k == 0) {
393	2339	100	for (i = 0; i < nfac; i++) {
394	1365		UV e = 1, f = factors[i];
395	1806	100	while (i+1 < nfac && f == factors[i+1]) { e++; i++; }
		100
396	1365		product *= (e+1);
397			}
398	244	100	} else if (k == 1) {
399	427	100	for (i = 0; i < nfac; i++) {
400	270		UV f = factors[i];
401	270		UV pke = f, fmult = 1 + f;
402	372	100	while (i+1 < nfac && f == factors[i+1]) {
		100
403	102		pke *= f;
404	102		fmult += pke;
405	102		i++;
406			}
407	270		product *= fmult;
408			}
409			} else {
410	222	100	for (i = 0; i < nfac; i++) {
411	135		UV f = factors[i];
412	135		UV fmult, pke, pk = f;
413	328	100	for (j = 1; j < (int)k; j++) pk *= f;
414	135		fmult = 1 + pk;
415	135		pke = pk;
416	188	100	while (i+1 < nfac && f == factors[i+1]) {
		100
417	53		pke *= pk;
418	53		fmult += pke;
419	53		i++;
420			}
421	135		product *= fmult;
422			}
423			}
424	1505		return product;
425			}
426
427
428
429
430	303		static int found_factor(UV n, UV f, UV* factors)
431			{
432	303		UV f2 = n/f;
433	303		int i = f > f2;
434	303	50	if (f == 1 \|\| f2 == 1) {
		50
435	0		factors[0] = n;
436	0		return 1;
437			}
438	303		factors[i] = f;
439	303		factors[1-i] = f2;
440	303	50	MPUassert( factors[0] * factors[1] == n , "incorrect factoring");
441	303		return 2;
442			}
443
444			/* Knuth volume 2, algorithm C.
445			* Can't compete with HOLF, SQUFOF, pbrent, etc.
446			*/
447	2		int fermat_factor(UV n, UV *factors, UV rounds)
448			{
449			IV sqn, x, y, r;
450	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in fermat_factor");
		50
451	2		sqn = isqrt(n);
452	2		x = 2 * sqn + 1;
453	2		y = 1;
454	2		r = (sqn*sqn) - n;
455
456	10	100	while (r != 0) {
457	8	50	if (rounds-- == 0) { factors[0] = n; return 1; }
458	8		r += x;
459	8		x += 2;
460			do {
461	26		r -= y;
462	26		y += 2;
463	26	100	} while (r > 0);
464			}
465	2		r = (x-y)/2;
466	2		return found_factor(n, r, factors);
467			}
468
469			/* Hart's One Line Factorization. */
470	2		int holf_factor(UV n, UV *factors, UV rounds)
471			{
472			UV i, s, m, f;
473
474	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in holf_factor");
		50
475
476			/* We skip the perfect-square test for s in the loop, so we
477			* will never succeed if n is a perfect square. Test that now. */
478	2	50	if (is_perfect_square(n))
479	0		return found_factor(n, isqrt(n), factors);
480
481	2	50	if (n <= (UV_MAX >> 6)) { /* Try with premultiplier first */
482	2	50	UV npre = n * ( (n <= (UV_MAX >> 13)) ? 720 :
		0
		0
		0
483			(n <= (UV_MAX >> 11)) ? 480 :
484			(n <= (UV_MAX >> 10)) ? 360 :
485			(n <= (UV_MAX >> 8)) ? 60 : 30 );
486	2		UV ni = npre;
487			#if 0 /* Straightforward */
488			while (rounds--) {
489			s = isqrt(ni) + 1;
490			m = (s*s) - ni;
491			if (is_perfect_square(m)) {
492			f = gcd_ui(n, s - isqrt(m));
493			if (f > 1 && f < n)
494			return found_factor(n, f, factors);
495			}
496			if (ni >= (ni+npre)) break;
497			ni += npre;
498			}
499			#else /* More optimized */
500	3	50	while (rounds--) {
501	3		s = 1 + (UV)sqrt((double)ni);
502	3		m = (s*s) - ni;
503	3		f = m & 127;
504	3	100	if (!((f0x8bc40d7d) & (f0xa1e2f5d1) & 0x14020a)) {
505	2		f = (UV)sqrt((double)m);
506	2	50	if (m == f*f) {
507	2		f = gcd_ui(n, s - f);
508	2	50	if (f > 1 && f < n)
		50
509	2		return found_factor(n, f, factors);
510			}
511			}
512	1	50	if (ni >= (ni+npre)) break;
513	1		ni += npre;
514			}
515			#endif
516			}
517
518	0	0	for (i = 1; i <= rounds; i++) {
519	0		s = (UV) sqrt( (double)n * (double)i );
520			/* Assume s^2 isn't a perfect square. We're rapidly losing precision
521			* so we won't be able to accurately detect it anyway. */
522	0		s++; /* s = ceil(sqrt(ni)) /
523	0		m = sqrmod(s, n);
524	0	0	if (is_perfect_square(m)) {
525	0		f = isqrt(m);
526	0	0	f = gcd_ui( (s>f) ? s-f : f-s, n);
527			/* This should always succeed, but with overflow concerns.... */
528	0		return found_factor(n, f, factors);
529			}
530			}
531	0		factors[0] = n;
532	0		return 1;
533			}
534	91		static int holf32(uint32_t n, UV *factors, uint32_t rounds) {
535			UV npre, ni; /* These should be 64-bit */
536			uint32_t s, m, f;
537
538	91	50	if (n < 3) { factors[0] = n; return 1; }
539	91	50	if (!(n&1)) { factors[0] = 2; factors[1] = n/2; return 2; }
540	91	100	if (is_perfect_square(n)) { factors[0] = factors[1] = isqrt(n); return 2; }
541
542	90		ni = npre = (UV) n * ((BITS_PER_WORD == 64) ? 5040 : 1);
543	2729	50	while (rounds--) {
544	2729		s = 1 + (uint32_t)sqrt((double)ni);
545	2729		m = ((UV)s*(UV)s) - ni;
546	2729		f = m & 127;
547	2729	100	if (!((f0x8bc40d7d) & (f0xa1e2f5d1) & 0x14020a)) {
548	2101		f = (uint32_t)sqrt((double)m);
549	2101	100	if (m == f*f) {
550	90		f = gcd_ui(n, s - f);
551	90	50	if (f > 1 && f < n)
		50
552	90		return found_factor(n, f, factors);
553			}
554			}
555	2639	50	if (ni >= (ni+npre)) break; /* We've overflowed */
556	2639		ni += npre;
557			}
558	0		factors[0] = n;
559	0		return 1;
560			}
561
562
563			#define ABSDIFF(x,y) (x>y) ? x-y : y-x
564			#if USE_MONTMATH
565			/* Pollard Rho with Brent's updates, using Montgomery reduction. */
566	201		int pbrent_factor(UV n, UV *factors, UV rounds, UV a)
567			{
568	201	50	UV const nbits = BITS_PER_WORD - clz(n);
569	201	100	const UV inner = (nbits <= 31) ? 32 : (nbits <= 35) ? 64 : (nbits <= 40) ? 160 : (nbits <= 52) ? 256 : 320;
		100
		100
		100
570			UV f, m, r, rleft, Xi, Xm, Xs;
571	201		int irounds, fails = 6;
572	201		const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
573
574	201	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pbrent_factor");
		50
575	201		r = f = 1;
576	201		Xi = Xm = Xs = mont1;
577	201		a = mont_geta(a,n);
578
579	1516	50	while (rounds > 0) {
580	1516		rleft = (r > rounds) ? rounds : r;
581	1516		Xm = Xi;
582			/* Do rleft rounds, inner at a time */
583	3700	100	while (rleft > 0) {
584	2385		irounds = (rleft > (UV)inner) ? inner : rleft;
585	2385		rleft -= irounds;
586	2385		rounds -= irounds;
587	2385		Xs = Xi;
588	2385	100	if (n < (1ULL << 63)) {
589	2031		Xi = mont_mulmod63(Xi,Xi+a,n);
590	2031	100	m = ABSDIFF(Xi,Xm);
591	229554	100	while (--irounds > 0) {
592	227523		Xi = mont_mulmod63(Xi,Xi+a,n);
593	227523	100	f = ABSDIFF(Xi,Xm);
594	227523		m = mont_mulmod63(m, f, n);
595			}
596	354	50	} else if (a == mont1) {
597	354		Xi = mont_mulmod64(Xi,Xi+a,n);
598	354	100	m = ABSDIFF(Xi,Xm);
599	104189	100	while (--irounds > 0) {
600	103835		Xi = mont_mulmod64(Xi,Xi+a,n);
601	103835	100	f = ABSDIFF(Xi,Xm);
602	103835		m = mont_mulmod64(m, f, n);
603			}
604			} else {
605	0		Xi = addmod(mont_mulmod64(Xi,Xi,n), a, n);
606	0	0	m = ABSDIFF(Xi,Xm);
607	0	0	while (--irounds > 0) {
608	0		Xi = addmod(mont_mulmod64(Xi,Xi,n), a, n);
609	0	0	f = ABSDIFF(Xi,Xm);
610	0		m = mont_mulmod64(m, f, n);
611			}
612			}
613	2385		f = gcd_ui(m, n);
614	2385	100	if (f != 1)
615	201		break;
616			}
617			/* If f == 1, then we didn't find a factor. Move on. */
618	1516	100	if (f == 1) {
619	1315		r *= 2;
620	1315		continue;
621			}
622	201	50	if (f == n) { /* back up, with safety */
623	0		Xi = Xs;
624			do {
625	0	0	if (n < (1ULL << 63) \|\| a == mont1)
		0
626	0	0	Xi = mont_mulmod(Xi,Xi+a,n);
627			else
628	0	0	Xi = addmod(mont_mulmod(Xi,Xi,n),a,n);
629	0	0	m = ABSDIFF(Xi,Xm);
630	0		f = gcd_ui(m, n);
631	0	0	} while (f == 1 && r-- != 0);
		0
632			}
633	201	50	if (f == 0 \|\| f == n) {
		50
634	0	0	if (fails-- <= 0) break;
635	0		Xi = Xm = mont1;
636	0		a = addmod(a, mont_geta(11,n), n);
637	0		continue;
638			}
639	201		return found_factor(n, f, factors);
640			}
641	0		factors[0] = n;
642	0		return 1;
643			}
644			#else
645			/* Pollard Rho with Brent's updates. */
646			int pbrent_factor(UV n, UV *factors, UV rounds, UV a)
647			{
648			UV f, m, r, Xi, Xm;
649			const UV inner = (n <= 4000000000UL) ? 32 : 160;
650			int fails = 6;
651
652			MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pbrent_factor");
653
654			r = f = Xi = Xm = 1;
655			while (rounds > 0) {
656			UV rleft = (r > rounds) ? rounds : r;
657			UV saveXi = Xi;
658			/* Do rleft rounds, inner at a time */
659			while (rleft > 0) {
660			UV dorounds = (rleft > inner) ? inner : rleft;
661			saveXi = Xi;
662			rleft -= dorounds;
663			rounds -= dorounds;
664			Xi = sqraddmod(Xi, a, n); /* First iteration, no mulmod needed */
665			m = ABSDIFF(Xi,Xm);
666			while (--dorounds > 0) { /* Now do inner-1=63 more iterations */
667			Xi = sqraddmod(Xi, a, n);
668			f = ABSDIFF(Xi,Xm);
669			m = mulmod(m, f, n);
670			}
671			f = gcd_ui(m, n);
672			if (f != 1)
673			break;
674			}
675			/* If f == 1, then we didn't find a factor. Move on. */
676			if (f == 1) {
677			r *= 2;
678			Xm = Xi;
679			continue;
680			}
681			if (f == n) { /* back up, with safety */
682			Xi = saveXi;
683			do {
684			Xi = sqraddmod(Xi, a, n);
685			f = gcd_ui( ABSDIFF(Xi,Xm), n);
686			} while (f == 1 && r-- != 0);
687			}
688			if (f == 0 \|\| f == n) {
689			if (fails-- <= 0) break;
690			Xm = addmod(Xm, 11, n);
691			Xi = Xm;
692			a++;
693			continue;
694			}
695			return found_factor(n, f, factors);
696			}
697			factors[0] = n;
698			return 1;
699			}
700			#endif
701
702			/* Pollard's Rho. */
703	2		int prho_factor(UV n, UV *factors, UV rounds)
704			{
705			UV a, f, i, m, oldU, oldV;
706	2		const UV inner = 64;
707	2		UV U = 7;
708	2		UV V = 7;
709	2		int fails = 3;
710
711	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in prho_factor");
		50
712
713			/* We could just as well say a = 1 */
714	2		switch (n%8) {
715	0		case 1: a = 1; break;
716	2		case 3: a = 2; break;
717	0		case 5: a = 3; break;
718	0		case 7: a = 5; break;
719	0		default: a = 7; break;
720			}
721
722	2		rounds = (rounds + inner - 1) / inner;
723
724	2	50	while (rounds-- > 0) {
725	2		m = 1; oldU = U; oldV = V;
726	130	100	for (i = 0; i < inner; i++) {
727	128		U = sqraddmod(U, a, n);
728	128		V = sqraddmod(V, a, n);
729	128		V = sqraddmod(V, a, n);
730	128	100	f = (U > V) ? U-V : V-U;
731	128		m = mulmod(m, f, n);
732			}
733	2		f = gcd_ui(m, n);
734	2	50	if (f == 1)
735	0		continue;
736	2	50	if (f == n) { /* back up to find a factor*/
737	2		U = oldU; V = oldV;
738	2		i = inner;
739			do {
740	6		U = sqraddmod(U, a, n);
741	6		V = sqraddmod(V, a, n);
742	6		V = sqraddmod(V, a, n);
743	6	100	f = gcd_ui( (U > V) ? U-V : V-U, n);
744	6	100	} while (f == 1 && i-- != 0);
		50
745			}
746	2	50	if (f == 0 \|\| f == n) {
		50
747	0	0	if (fails-- <= 0) break;
748	0		U = addmod(U,2,n);
749	0		V = U;
750	0		a++;
751	0		continue;
752			}
753	2		return found_factor(n, f, factors);
754			}
755	0		factors[0] = n;
756	0		return 1;
757			}
758
759			/* Pollard's P-1 */
760	2		int pminus1_factor(UV n, UV *factors, UV B1, UV B2)
761			{
762			UV f, k, kmin;
763	2		UV a = 2, q = 2;
764	2		UV savea = 2, saveq = 2;
765	2		UV j = 1;
766	2		UV sqrtB1 = isqrt(B1);
767			#if USE_MONTMATH
768	2		const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
769	2		UV ma = mont_geta(a,n);
770			#define PMINUS1_APPLY_POWER ma = mont_powmod(ma, k, n)
771			#define PMINUS1_RECOVER_A a = mont_recover(ma,n)
772			#else
773			#define PMINUS1_APPLY_POWER a = powmod(a, k, n)
774			#define PMINUS1_RECOVER_A
775			#endif
776	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pminus1_factor");
		50
777
778	2	50	if (B1 <= primes_small[NPRIMES_SMALL-2]) {
779			UV i;
780	0	0	for (i = 1; primes_small[i] <= B1; i++) {
781	0		q = k = primes_small[i];
782	0	0	if (q <= sqrtB1) {
783	0		k = q*q; kmin = B1/q;
784	0	0	while (k <= kmin) k *= q;
785			}
786	0		PMINUS1_APPLY_POWER;
787	0	0	if ( (j++ % 32) == 0) {
788	0	0	PMINUS1_RECOVER_A;
789	0	0	if (a == 0 \|\| gcd_ui(a-1, n) != 1)
		0
790			break;
791	0		savea = a; saveq = q;
792			}
793			}
794	0	0	PMINUS1_RECOVER_A;
795			} else {
796	64	50	START_DO_FOR_EACH_PRIME(2, B1) {
		100
		100
		100
		100
		50
		50
		50
		50
		50
797	64		q = k = p;
798	64	50	if (q <= sqrtB1) {
799	64		k = q*q; kmin = B1/q;
800	200	100	while (k <= kmin) k *= q;
801			}
802	64		PMINUS1_APPLY_POWER;
803	64	100	if ( (j++ % 32) == 0) {
804	2	50	PMINUS1_RECOVER_A;
805	2	50	if (a == 0 \|\| gcd_ui(a-1, n) != 1)
		50
806			break;
807	0		savea = a; saveq = q;
808			}
809	62		} END_DO_FOR_EACH_PRIME
810	2	50	PMINUS1_RECOVER_A;
811			}
812	2	50	if (a == 0) { factors[0] = n; return 1; }
813	2		f = gcd_ui(a-1, n);
814
815			/* If we found more than one factor in stage 1, backup and single step */
816	2	50	if (f == n) {
817	2		a = savea;
818	4	50	START_DO_FOR_EACH_PRIME(saveq, B1) {
		50
		100
		50
		0
		0
		0
		0
		0
		50
819	4		k = p; kmin = B1/p;
820	62	100	while (k <= kmin) k *= p;
821	4		a = powmod(a, k, n);
822	4		f = gcd_ui(a-1, n);
823	4		q = p;
824	4	100	if (f != 1)
825	2		break;
826	4		} END_DO_FOR_EACH_PRIME
827			/* If f == n again, we could do:
828			* for (savea = 3; f == n && savea < 100; savea = next_prime(savea)) {
829			* a = savea;
830			* for (q = 2; q <= B1; q = next_prime(q)) {
831			* ...
832			* }
833			* }
834			* but this could be a huge time sink if B1 is large, so just fail.
835			*/
836			}
837
838			/* STAGE 2 */
839	2	50	if (f == 1 && B2 > B1) {
		0
840	0		UV bm = a;
841	0		UV b = 1;
842			UV bmdiff;
843	0		UV precomp_bm[111] = {0}; /* Enough for B2 = 189M */
844
845			/* calculate (a^q)^2, (a^q)^4, etc. */
846	0		bmdiff = sqrmod(bm, n);
847	0		precomp_bm[0] = bmdiff;
848	0	0	for (j = 1; j < 20; j++) {
849	0		bmdiff = mulmod(bmdiff,bm,n);
850	0		bmdiff = mulmod(bmdiff,bm,n);
851	0		precomp_bm[j] = bmdiff;
852			}
853
854	0		a = powmod(a, q, n);
855	0		j = 1;
856	0	0	START_DO_FOR_EACH_PRIME( q+1, B2 ) {
		0
		0
		0
		0
		0
		0
		0
		0
		0
857	0		UV lastq = q;
858			UV qdiff;
859	0		q = p;
860			/* compute a^q = a^lastq * a^(q-lastq) */
861	0		qdiff = (q - lastq) / 2 - 1;
862	0	0	if (qdiff >= 111) {
863	0		bmdiff = powmod(bm, q-lastq, n); /* Big gap */
864			} else {
865	0		bmdiff = precomp_bm[qdiff];
866	0	0	if (bmdiff == 0) {
867	0	0	if (precomp_bm[qdiff-1] != 0)
868	0		bmdiff = mulmod(mulmod(precomp_bm[qdiff-1],bm,n),bm,n);
869			else
870	0		bmdiff = powmod(bm, q-lastq, n);
871	0		precomp_bm[qdiff] = bmdiff;
872			}
873			}
874	0		a = mulmod(a, bmdiff, n);
875	0	0	if (a == 0) break;
876	0		b = mulmod(b, a-1, n); /* if b == 0, we found multiple factors */
877	0	0	if ( (j++ % 64) == 0 ) {
878	0		f = gcd_ui(b, n);
879	0	0	if (f != 1)
880	0		break;
881			}
882	0		} END_DO_FOR_EACH_PRIME
883	0		f = gcd_ui(b, n);
884			}
885	2		return found_factor(n, f, factors);
886			}
887
888			/* Simple Williams p+1 */
889	6		static void pp1_pow(UV *cX, UV exp, UV n)
890			{
891	6		UV X0 = *cX;
892	6		UV X = *cX;
893	6		UV Y = mulsubmod(X, X, 2, n);
894	6	50	UV bit = UVCONST(1) << (clz(exp)-1);
895	345	100	while (bit) {
896	339		UV T = mulsubmod(X, Y, X0, n);
897	339	100	if ( exp & bit ) {
898	15		X = T;
899	15		Y = mulsubmod(Y, Y, 2, n);
900			} else {
901	324		Y = T;
902	324		X = mulsubmod(X, X, 2, n);
903			}
904	339		bit >>= 1;
905			}
906	6		*cX = X;
907	6		}
908	2		int pplus1_factor(UV n, UV *factors, UV B1)
909			{
910			UV X1, X2, f;
911	2		UV sqrtB1 = isqrt(B1);
912	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pplus1_factor");
		50
913
914	2		X1 = 7 % n;
915	2		X2 = 11 % n;
916	2		f = 1;
917	4	50	START_DO_FOR_EACH_PRIME(2, B1) {
		50
		100
		100
		0
		0
		0
		0
		0
		50
918	4		UV k = p;
919	4	50	if (p < sqrtB1) {
920	4		UV kmin = B1/p;
921	21	100	while (k <= kmin)
922	17		k *= p;
923			}
924	4		pp1_pow(&X1, k, n);
925	4	50	if (X1 != 2) {
926	4		f = gcd_ui( submod(X1, 2, n) , n);
927	4	100	if (f != 1 && f != n) break;
		50
928			}
929	2		pp1_pow(&X2, k, n);
930	2	50	if (X2 != 2) {
931	0		f = gcd_ui( submod(X2, 2, n) , n);
932	0	0	if (f != 1 && f != n) break;
		0
933			}
934	2		} END_DO_FOR_EACH_PRIME
935
936	2		return found_factor(n, f, factors);
937			}
938
939
940			/* SQUFOF, based on Ben Buhrow's racing version. */
941			#if 1
942			/* limit to 62-bit inputs, use 32-bit types, faster */
943			#define SQUFOF_TYPE uint32_t
944			#define SQUFOF_MAX (UV_MAX >> 2)
945			#else
946			/* All 64-bit inputs possible, though we severely limit multipliers */
947			#define SQUFOF_TYPE UV
948			#define SQUFOF_MAX UV_MAX
949			#endif
950			typedef struct
951			{
952			int valid;
953			SQUFOF_TYPE P;
954			SQUFOF_TYPE bn;
955			SQUFOF_TYPE Qn;
956			SQUFOF_TYPE Q0;
957			SQUFOF_TYPE b0;
958			SQUFOF_TYPE it;
959			SQUFOF_TYPE imax;
960			SQUFOF_TYPE mult;
961			} mult_t;
962
963			/* N < 2^63 (or 2^31). Returns 0 or a factor */
964	3		static UV squfof_unit(UV n, mult_t* mult_save)
965			{
966			SQUFOF_TYPE imax,i,Q0,Qn,bn,b0,P,bbn,Ro,S,So,t1,t2;
967
968	3		P = mult_save->P;
969	3		bn = mult_save->bn;
970	3		Qn = mult_save->Qn;
971	3		Q0 = mult_save->Q0;
972	3		b0 = mult_save->b0;
973	3		i = mult_save->it;
974	3		imax = i + mult_save->imax;
975
976			#define SQUARE_SEARCH_ITERATION \
977			t1 = P; \
978			P = bn*Qn - P; \
979			t2 = Qn; \
980			Qn = Q0 + bn*(t1-P); \
981			Q0 = t2; \
982			bn = (b0 + P) / Qn; \
983			i++;
984
985			while (1) {
986	3		int j = 0;
987	3	50	if (i & 0x1) {
988	0		SQUARE_SEARCH_ITERATION;
989			}
990			/* i is now even */
991			while (1) {
992			/* We need to know P, bn, Qn, Q0, iteration count, i from prev */
993	4	50	if (i >= imax) {
994			/* save state and try another multiplier. */
995	0		mult_save->P = P;
996	0		mult_save->bn = bn;
997	0		mult_save->Qn = Qn;
998	0		mult_save->Q0 = Q0;
999	0		mult_save->it = i;
1000	0		return 0;
1001			}
1002
1003	4		SQUARE_SEARCH_ITERATION;
1004
1005			/* Even iteration. Check for square: Qn = SS /
1006	4		t2 = Qn & 127;
1007	4	100	if (!((t20x8bc40d7d) & (t20xa1e2f5d1) & 0x14020a)) {
1008	3		t1 = (uint32_t) sqrt(Qn);
1009	3	50	if (Qn == t1*t1)
1010	3		break;
1011			}
1012
1013			/* Odd iteration. */
1014	1		SQUARE_SEARCH_ITERATION;
1015	1		}
1016	3		S = t1; /* isqrt(Qn); */
1017	3		mult_save->it = i;
1018
1019			/* Reduce to G0 */
1020	3		Ro = P + S*((b0 - P)/S);
1021	3		So = (n - (UV)Ro*(UV)Ro)/(UV)S;
1022	3		bbn = (b0+Ro)/So;
1023
1024			/* Search for symmetry point */
1025			#define SYMMETRY_POINT_ITERATION \
1026			t1 = Ro; \
1027			Ro = bbn*So - Ro; \
1028			t2 = So; \
1029			So = S + bbn*(t1-Ro); \
1030			S = t2; \
1031			bbn = (b0+Ro)/So; \
1032			if (Ro == t1) break;
1033
1034	3		j = 0;
1035			while (1) {
1036	3	100	SYMMETRY_POINT_ITERATION;
1037	1	50	SYMMETRY_POINT_ITERATION;
1038	0	0	SYMMETRY_POINT_ITERATION;
1039	0	0	SYMMETRY_POINT_ITERATION;
1040	0	0	if (j++ > 2000000) {
1041	0		mult_save->valid = 0;
1042	0		return 0;
1043			}
1044	0		}
1045
1046	3		t1 = gcd_ui(Ro, n);
1047	3	50	if (t1 > 1)
1048	3		return t1;
1049	0		}
1050			}
1051
1052			/* Gower and Wagstaff 2008:
1053			* http://www.ams.org/journals/mcom/2008-77-261/S0025-5718-07-02010-8/
1054			* Section 5.3. I've added some with 13,17,19. Sorted by F(). */
1055			static const UV squfof_multipliers[] =
1056			/* { 35711, 357, 3511, 35, 3711, 37, 5711, 57,
1057			311, 3, 511, 5, 711, 7, 11, 1 }; /
1058			{ 35711, 357, 3571113, 35713, 3571117, 3511,
1059			35717, 35, 3571119, 351113,35719, 3571317,
1060			3513, 3711, 37, 5711, 3713, 57,
1061			3517, 5713, 3519, 311, 37*17, 3,
1062			31113, 511, 3719, 313, 5, 51113,
1063			5719, 513, 711, 7, 317, 713,
1064			11, 1 };
1065			#define NSQUFOF_MULT (sizeof(squfof_multipliers)/sizeof(squfof_multipliers[0]))
1066
1067	2		int squfof_factor(UV n, UV *factors, UV rounds)
1068			{
1069			mult_t mult_save[NSQUFOF_MULT];
1070	2		UV i, nn64, sqrtnn64, mult, f64,rounds_done = 0;
1071	2		int mults_racing = NSQUFOF_MULT;
1072
1073			/* Caller should have handled these trivial cases */
1074	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in squfof_factor");
		50
1075
1076			/* Too big */
1077	2	50	if (n > SQUFOF_MAX) {
1078	0		factors[0] = n; return 1;
1079			}
1080
1081	78	100	for (i = 0; i < NSQUFOF_MULT; i++) {
1082	76		mult_save[i].valid = -1;
1083	76		mult_save[i].it = 0;
1084			}
1085
1086			/* Race each multiplier for a bit (20-20k rounds) */
1087	2	50	while (mults_racing > 0 && rounds_done < rounds) {
		50
1088	3	50	for (i = 0; i < NSQUFOF_MULT && rounds_done < rounds; i++) {
		50
1089	3	50	if (mult_save[i].valid == 0) continue;
1090	3		mult = squfof_multipliers[i];
1091	3		nn64 = n * mult;
1092	3	50	if (mult_save[i].valid == -1) {
1093	3	50	if ((SQUFOF_MAX / mult) < n) {
1094	0		mult_save[i].valid = 0; /* This multiplier would overflow 64-bit */
1095	0		mults_racing--;
1096	0		continue;
1097			}
1098	3		sqrtnn64 = isqrt(nn64);
1099	3		mult_save[i].valid = 1;
1100	3		mult_save[i].Q0 = 1;
1101	3		mult_save[i].b0 = sqrtnn64;
1102	3		mult_save[i].P = sqrtnn64;
1103	3		mult_save[i].Qn = (SQUFOF_TYPE)(nn64 - sqrtnn64 * sqrtnn64);
1104	3	50	if (mult_save[i].Qn == 0)
1105	0		return found_factor(n, sqrtnn64, factors);
1106	3		mult_save[i].bn = (2 * sqrtnn64) / (UV)mult_save[i].Qn;
1107	3		mult_save[i].it = 0;
1108	3		mult_save[i].mult = mult;
1109	3		mult_save[i].imax = (UV) (sqrt(sqrtnn64) / 16);
1110	3	50	if (mult_save[i].imax < 20) mult_save[i].imax = 20;
1111	3	50	if (mult_save[i].imax > rounds) mult_save[i].imax = rounds;
1112			}
1113	3	50	if (mults_racing == 1) /* Do all rounds if only one multiplier left */
1114	0		mult_save[i].imax = (rounds - rounds_done);
1115	3		f64 = squfof_unit(nn64, &mult_save[i]);
1116	3	50	if (f64 > 1) {
1117	3		UV f64red = f64 / gcd_ui(f64, mult);
1118	3	100	if (f64red > 1) {
1119			/* unsigned long totiter = 0;
1120			{int K; for (K = 0; K < NSQUFOF_MULT; K++) totiter += mult_save[K].it; }
1121			printf(" n %lu mult %lu it %lu (%lu)\n",n,mult,totiter,(UV)mult_save[i].it); */
1122	2		return found_factor(n, f64red, factors);
1123			}
1124			/* Found trivial factor. Quit working with this multiplier. */
1125	1		mult_save[i].valid = 0;
1126			}
1127	1	50	if (mult_save[i].valid == 0)
1128	1		mults_racing--;
1129	1		rounds_done += mult_save[i].imax; /* Assume we did all rounds */
1130			}
1131			}
1132
1133			/* No factors found */
1134	0		factors[0] = n;
1135	2		return 1;
1136			}
1137
1138			#define SQR_TAB_SIZE 512
1139			static int sqr_tab_init = 0;
1140			static double sqr_tab[SQR_TAB_SIZE];
1141	0		static void make_sqr_tab(void) {
1142			int i;
1143	0	0	for (i = 0; i < SQR_TAB_SIZE; i++)
1144	0		sqr_tab[i] = sqrt((double)i);
1145	0		sqr_tab_init = 1;
1146	0		}
1147
1148			/* Lehman written and tuned by Warren D. Smith.
1149			* Revised by Ben Buhrow and Dana Jacobsen. */
1150	0		int lehman_factor(UV n, UV *factors, int do_trial) {
1151	0	0	const double Tune = ((n >> 31) >> 5) ? 3.5 : 5.0;
1152			double x, sqrtn;
1153			UV a,c,kN,kN4,B2;
1154			uint32_t b,p,k,r,B,U,Bred,inc,ip;
1155
1156	0	0	if (!(n&1)) return found_factor(n, 2, factors);
1157
1158	0		B = Tune * (1+rootof(n,3));
1159
1160	0	0	if (do_trial) {
1161	0		uint32_t FirstCut = 0.1 * B;
1162	0	0	if (FirstCut < 84) FirstCut = 84;
1163	0	0	if (FirstCut > 65535) FirstCut = 65535;
1164	0	0	for (ip = 2; ip < NPRIMES_SMALL; ip++) {
1165	0		p = primes_small[ip];
1166	0	0	if (p >= FirstCut)
1167	0		break;
1168	0	0	if (n % p == 0)
1169	0		return found_factor(n, p, factors);
1170			}
1171			}
1172
1173	0	0	if (n >= UVCONST(8796393022207)) { factors[0] = n; return 1; }
1174	0		Bred = B / (Tune * Tune * Tune);
1175	0		B2 = B*B;
1176	0		kN = 0;
1177
1178	0	0	if (!sqr_tab_init) make_sqr_tab();
1179	0		sqrtn = sqrt(n);
1180
1181	0	0	for (k = 1; k <= Bred; k++) {
1182	0	0	if (k&1) { inc = 4; r = (k+n) % 4; }
1183	0		else { inc = 2; r = 1; }
1184	0		kN += n;
1185	0	0	if (kN >= UVCONST(1152921504606846976)) { factors[0] = n; return 1; }
1186	0		kN4 = kN*4;
1187
1188	0	0	x = (k < SQR_TAB_SIZE) ? sqrtn * sqr_tab[k] : sqrt((double)kN);
1189	0		a = x;
1190	0	0	if ((UV)a * (UV)a == kN)
1191	0		return found_factor(n, gcd_ui(a,n), factors);
1192	0		x *= 2;
1193	0		a = x + 0.9999999665; /* Magic constant */
1194	0		b = a % inc;
1195	0		b = a + (inc+r-b) % inc;
1196	0		c = (UV)b*(UV)b - kN4;
1197	0		U = x + B2/(2*x);
1198	0	0	for (a = b; a <= U; c += inc*(a+a+inc), a += inc) {
1199			/* Check for perfect square */
1200	0		b = c & 127;
1201	0	0	if (!((b0x8bc40d7d) & (b0xa1e2f5d1) & 0x14020a)) {
1202	0		b = (uint32_t) sqrt(c);
1203	0	0	if (c == b*b) {
1204	0		B2 = gcd_ui(a+b, n);
1205	0		return found_factor(n, B2, factors);
1206			}
1207			}
1208			}
1209			}
1210	0	0	if (do_trial) {
1211	0	0	if (B > 65535) B = 65535;
1212			/* trial divide from primes[ip] to B. We could:
1213			* 1) use table of 6542 shorts for the primes.
1214			* 2) use a wheel
1215			* 3) let trial_factor handle it
1216			*/
1217	0	0	if (ip >= NPRIMES_SMALL) ip = NPRIMES_SMALL-1;
1218	0		return trial_factor(n, factors, primes_small[ip], B);
1219			}
1220	0		factors[0] = n;
1221	0		return 1;
1222			}
1223
1224	23		static UV dlp_trial(UV a, UV g, UV p, UV maxrounds) {
1225			UV k, t;
1226	23	50	if (maxrounds > p) maxrounds = p;
1227
1228			#if USE_MONTMATH
1229	23	100	if (p&1) {
1230	18		const uint64_t npi = mont_inverse(p), mont1 = mont_get1(p);
1231	18		g = mont_geta(g, p);
1232	18		a = mont_geta(a, p);
1233	13930	50	for (t = g, k = 1; k < maxrounds; k++) {
1234	13930	100	if (t == a)
1235	18		return k;
1236	13912	50	t = mont_mulmod(t, g, p);
1237	13912	50	if (t == g) break; /* Stop at cycle */
1238			}
1239			} else
1240			#endif
1241			{
1242	9	50	for (t = g, k = 1; k < maxrounds; k++) {
1243	9	100	if (t == a)
1244	4		return k;
1245	5		t = mulmod(t, g, p);
1246	5	100	if (t == g) break; /* Stop at cycle */
1247			}
1248			}
1249	1		return 0;
1250			}
1251
1252			/******************************************************************************/
1253			/* DLP - Pollard Rho */
1254			/******************************************************************************/
1255
1256			/* Compare with Pomerance paper (dartmouth dtalk4):
1257			* Type I/II/III = our case 1, 0, 2.
1258			* x_i = u, a_i = v, b_i = w
1259			*
1260			* Also see Bai/Brent 2008 for many ideas to speed this up.
1261			* https://maths-people.anu.edu.au/~brent/pd/rpb231.pdf
1262			* E.g. Teske adding-walk, Brent's cycle algo, Teske modified cycle
1263			*/
1264			#define pollard_rho_cycle(u,v,w,p,n,a,g) \
1265			switch (u % 3) { \
1266			case 0: u = mulmod(u,u,p); v = mulmod(v,2,n); w = mulmod(w,2,n); break;\
1267			case 1: u = mulmod(u,a,p); v = addmod(v,1,n); break;\
1268			case 2: u = mulmod(u,g,p); w = addmod(w,1,n); break;\
1269			}
1270
1271			typedef struct prho_state_t {
1272			UV u;
1273			UV v;
1274			UV w;
1275			UV U;
1276			UV V;
1277			UV W;
1278			UV round;
1279			int failed;
1280			int verbose;
1281			} prho_state_t;
1282
1283	4		static UV dlp_prho_uvw(UV a, UV g, UV p, UV n, UV rounds, prho_state_t *s) {
1284	4		UV i, k = 0;
1285	4		UV u=s->u, v=s->v, w=s->w;
1286	4		UV U=s->U, V=s->V, W=s->W;
1287	4		int const verbose = s->verbose;
1288
1289	4	50	if (s->failed) return 0;
1290	4	50	if (s->round + rounds > n) rounds = n - s->round;
1291
1292	26787	100	for (i = 1; i <= rounds; i++) {
1293	26785		pollard_rho_cycle(u,v,w,p,n,a,g); /* xi, ai, bi */
1294	26785		pollard_rho_cycle(U,V,W,p,n,a,g);
1295	26785		pollard_rho_cycle(U,V,W,p,n,a,g); /* x2i, a2i, b2i */
1296	26785	50	if (verbose > 3) printf( "%3"UVuf" %4"UVuf" %3"UVuf" %3"UVuf" %4"UVuf" %3"UVuf" %3"UVuf"\n", i, u, v, w, U, V, W );
1297	26785	100	if (u == U) {
1298			UV r1, r2, G, G2;
1299	2		r1 = submod(v, V, n);
1300	2	50	if (r1 == 0) {
1301	0	0	if (verbose) printf("DLP Rho failure, r=0\n");
1302	0		s->failed = 1;
1303	0		k = 0;
1304	0		break;
1305			}
1306	2		r2 = submod(W, w, n);
1307
1308	2		G = gcd_ui(r1,n);
1309	2		G2 = gcd_ui(G,r2);
1310	2		k = divmod(r2/G2, r1/G2, n/G2);
1311	2	50	if (G > 1) {
1312	0	0	if (powmod(g,k,p) == a) {
1313	0	0	if (verbose > 2) printf(" common GCD %"UVuf"\n", G2);
1314			} else {
1315	0		UV m, l = divmod(r2, r1, n/G);
1316	0	0	for (m = 0; m < G; m++) {
1317	0		k = addmod(l, mulmod(m,(n/G),n), n);
1318	0	0	if (powmod(g,k,p) == a) break;
1319			}
1320	0	0	if (m 2) printf(" GCD %"UVuf", found with m=%"UVuf"\n", G, m);
		0
1321			}
1322			}
1323
1324	2	50	if (powmod(g,k,p) != a) {
1325	0	0	if (verbose > 2) printf("r1 = %"UVuf" r2 = %"UVuf" k = %"UVuf"\n", r1, r2, k);
1326	0	0	if (verbose) printf("Incorrect DLP Rho solution: %"UVuf"\n", k);
1327	0		s->failed = 1;
1328	0		k = 0;
1329			}
1330	2		break;
1331			}
1332			}
1333	4		s->round += i-1;
1334	4	50	if (verbose && k) printf("DLP Rho solution found after %"UVuf" steps\n", s->round + 1);
		0
1335	4		s->u = u; s->v = v; s->w = w; s->U = U; s->V = V; s->W = W;
1336	4		return k;
1337			}
1338
1339			#if 0
1340			static UV dlp_prho(UV a, UV g, UV p, UV n, UV maxrounds) {
1341			#ifdef DEBUG
1342			int const verbose = _XS_get_verbose()
1343			#else
1344			int const verbose = 0;
1345			#endif
1346			prho_state_t s = {1, 0, 0, 1, 0, 0, 0, 0, verbose};
1347			return dlp_prho_uvw(a, g, p, n, maxrounds, &s);
1348			}
1349			#endif
1350
1351
1352			/******************************************************************************/
1353			/* DLP - BSGS */
1354			/******************************************************************************/
1355
1356			typedef struct bsgs_hash_t {
1357			UV M; /* The baby step index */
1358			UV V; /* The powmod value */
1359			struct bsgs_hash_t* next;
1360			} bsgs_hash_t;
1361
1362			/****************************************/
1363			/* Simple and limited pool allocation */
1364			#define BSGS_ENTRIES_PER_PAGE 8000
1365			typedef struct bsgs_page_top_t {
1366			struct bsgs_page_t* first;
1367			bsgs_hash_t** table;
1368			UV size;
1369			int nused;
1370			int npages;
1371			} bsgs_page_top_t;
1372
1373			typedef struct bsgs_page_t {
1374			bsgs_hash_t entries[BSGS_ENTRIES_PER_PAGE];
1375			struct bsgs_page_t* next;
1376			} bsgs_page_t;
1377
1378	4139		static bsgs_hash_t* get_entry(bsgs_page_top_t* top) {
1379	4139	100	if (top->nused == 0 \|\| top->nused >= BSGS_ENTRIES_PER_PAGE) {
		50
1380			bsgs_page_t* newpage;
1381	2		Newz(0, newpage, 1, bsgs_page_t);
1382	2		newpage->next = top->first;
1383	2		top->first = newpage;
1384	2		top->nused = 0;
1385	2		top->npages++;
1386			}
1387	4139		return top->first->entries + top->nused++;
1388			}
1389	2		static void destroy_pages(bsgs_page_top_t* top) {
1390	2		bsgs_page_t* head = top->first;
1391	4	100	while (head != 0) {
1392	2		bsgs_page_t* next = head->next;
1393	2		Safefree(head);
1394	2		head = next;
1395			}
1396	2		top->first = 0;
1397	2		}
1398			/****************************************/
1399
1400	2		static void bsgs_hash_put(bsgs_page_top_t* pagetop, UV v, UV i) {
1401	2		UV idx = v % pagetop->size;
1402	2		bsgs_hash_t** table = pagetop->table;
1403	2		bsgs_hash_t* entry = table[idx];
1404
1405	2	50	while (entry && entry->V != v)
		0
1406	0		entry = entry->next;
1407
1408	2	50	if (!entry) {
1409	2		entry = get_entry(pagetop);
1410	2		entry->M = i;
1411	2		entry->V = v;
1412	2		entry->next = table[idx];
1413	2		table[idx] = entry;
1414			}
1415	2		}
1416
1417	0		static UV bsgs_hash_get(bsgs_page_top_t* pagetop, UV v) {
1418	0		bsgs_hash_t* entry = pagetop->table[v % pagetop->size];
1419	0	0	while (entry && entry->V != v)
		0
1420	0		entry = entry->next;
1421	0	0	return (entry) ? entry->M : 0;
1422			}
1423
1424	4137		static UV bsgs_hash_put_get(bsgs_page_top_t* pagetop, UV v, UV i) {
1425	4137		UV idx = v % pagetop->size;
1426	4137		bsgs_hash_t** table = pagetop->table;
1427	4137		bsgs_hash_t* entry = table[idx];
1428
1429	4260	100	while (entry && entry->V != v)
		50
1430	123		entry = entry->next;
1431
1432	4137	50	if (entry)
1433	0		return entry->M;
1434
1435	4137		entry = get_entry(pagetop);
1436	4137		entry->M = i;
1437	4137		entry->V = v;
1438	4137		entry->next = table[idx];
1439	4137		table[idx] = entry;
1440	4137		return 0;
1441			}
1442
1443	3		static UV dlp_bsgs(UV a, UV g, UV p, UV n, UV maxent, int race_rho) {
1444			bsgs_page_top_t PAGES;
1445			UV i, m, maxm, hashmap_count;
1446			UV aa, S, gm, T, gs_i, bs_i;
1447	3		UV result = 0;
1448			#ifdef DEBUG
1449			int const verbose = _XS_get_verbose();
1450			#else
1451	3		int const verbose = 0;
1452			#endif
1453	3		prho_state_t rho_state = {1, 0, 0, 1, 0, 0, 0, 0, verbose};
1454
1455	3	50	if (n <= 2) return 0; /* Shouldn't be here with gorder this low */
1456
1457	3	50	if (race_rho) {
1458	3		result = dlp_prho_uvw(a, g, p, n, 10000, &rho_state);
1459	3	100	if (result) {
1460	1	50	if (verbose) printf("rho found solution in BSGS step 0\n");
1461	1		return result;
1462			}
1463			}
1464
1465	2	50	if (a == 0) return 0; /* We don't handle this case */
1466
1467	2		maxm = isqrt(n);
1468	2		m = (maxent > maxm) ? maxm : maxent;
1469
1470	2	50	hashmap_count = (m < 65537) ? 65537 :
1471	0	0	(m > 40000000) ? 40000003 :
1472			next_prime(m); /* Ave depth around 2 */
1473
1474			/* Create table. Size: 8hashmap_count bytes. /
1475	2		PAGES.size = hashmap_count;
1476	2		PAGES.first = 0;
1477	2		PAGES.nused = 0;
1478	2		PAGES.npages = 0;
1479	2	50	Newz(0, PAGES.table, hashmap_count, bsgs_hash_t*);
1480
1481	2		aa = mulmod(a,a,p);
1482	2		S = a;
1483	2		gm = powmod(g, m, p);
1484	2		T = gm;
1485	2		gs_i = 0;
1486	2		bs_i = 0;
1487
1488	2		bsgs_hash_put(&PAGES, S, 0); /* First baby step */
1489	2		S = mulmod(S, g, p);
1490			/* Interleaved Baby Step Giant Step */
1491	2069	50	for (i = 1; i <= m; i++) {
1492	2069		gs_i = bsgs_hash_put_get(&PAGES, S, i);
1493	2069	50	if (gs_i) { bs_i = i; break; }
1494	2069		S = mulmod(S, g, p);
1495	2069	100	if (S == aa) { /* We discovered the solution! */
1496	1	50	if (verbose) printf(" dlp bsgs: solution at BS step %"UVuf"\n", i+1);
1497	1		result = i+1;
1498	1		break;
1499			}
1500	2068		bs_i = bsgs_hash_put_get(&PAGES, T, i);
1501	2068	50	if (bs_i) { gs_i = i; break; }
1502	2068		T = mulmod(T, gm, p);
1503	2068	50	if (race_rho && (i % 2048) == 0) {
		100
1504	1		result = dlp_prho_uvw(a, g, p, n, 100000, &rho_state);
1505	1	50	if (result) {
1506	1	50	if (verbose) printf("rho found solution in BSGS step %"UVuf"\n", i);
1507	1		break;
1508			}
1509			}
1510			}
1511
1512	2	50	if (!result) {
1513			/* Extend Giant Step search */
1514	0	0	if (!(gs_i \|\| bs_i)) {
		0
1515	0		UV b = (p+m-1)/m;
1516	0	0	if (m < maxm && b > 8m) b = 8m;
		0
1517	0	0	for (i = m+1; i < b; i++) {
1518	0		bs_i = bsgs_hash_get(&PAGES, T);
1519	0	0	if (bs_i) { gs_i = i; break; }
1520	0		T = mulmod(T, gm, p);
1521	0	0	if (race_rho && (i % 2048) == 0) {
		0
1522	0		result = dlp_prho_uvw(a, g, p, n, 100000, &rho_state);
1523	0	0	if (result) {
1524	0	0	if (verbose) printf("rho found solution in BSGS step %"UVuf"\n", i);
1525	0		break;
1526			}
1527			}
1528			}
1529			}
1530
1531	0	0	if (gs_i \|\| bs_i) {
		0
1532	0		result = submod(mulmod(gs_i, m, p), bs_i, p);
1533			}
1534			}
1535	2	50	if (verbose) printf(" dlp bsgs using %d pages (%.1fMB+%.1fMB) for hash\n", PAGES.npages, ((double)PAGES.npages * sizeof(bsgs_page_t)) / (10241024), ((double)hashmap_count sizeof(bsgs_hash_t)) / (10241024));
1536
1537	2		destroy_pages(&PAGES);
1538	2		Safefree(PAGES.table);
1539	2	50	if (result != 0 && powmod(g,result,p) != a) {
		50
1540	0	0	if (verbose) printf("Incorrect DLP BSGS solution: %"UVuf"\n", result);
1541	0		result = 0;
1542			}
1543	2	50	if (race_rho && result == 0) {
		50
1544	0		result = dlp_prho_uvw(a, g, p, n, 2000000000U, &rho_state);
1545			}
1546	3		return result;
1547			}
1548
1549			/* Find smallest k where a = g^k mod p */
1550			#define DLP_TRIAL_NUM 10000
1551	16		static UV znlog_solve(UV a, UV g, UV p, UV n) {
1552			UV k, sqrtn;
1553	16		const int verbose = _XS_get_verbose();
1554
1555	16	50	if (a >= p) a %= p;
1556	16	50	if (g >= p) g %= p;
1557
1558	16	100	if (a == 1 \|\| g == 0 \|\| p <= 2)
		50
		50
1559	1		return 0;
1560
1561	15	50	if (verbose > 1 && n != p-1) printf(" g=%"UVuf" p=%"UVuf", order %"UVuf"\n", g, p, n);
		0
1562
1563			/* printf(" solving znlog(%"UVuf",%"UVuf",%"UVuf") n=%"UVuf"\n", a, g, p, n); */
1564
1565	15	50	if (n == 0 \|\| n <= DLP_TRIAL_NUM) {
		100
1566	12		k = dlp_trial(a, g, p, DLP_TRIAL_NUM);
1567	12	50	if (verbose) printf(" dlp trial 10k %s\n", (k!=0 \|\| p <= DLP_TRIAL_NUM) ? "success" : "failure");
		0
		0
1568	12	50	if (k != 0 \|\| (n > 0 && n <= DLP_TRIAL_NUM)) return k;
		0
		0
1569			}
1570
1571			{ /* Existence checks */
1572	3		UV aorder, gorder = n;
1573	3	50	if (gorder != 0 && powmod(a, gorder, p) != 1) return 0;
		50
1574	3		aorder = znorder(a,p);
1575	3	50	if (aorder == 0 && gorder != 0) return 0;
		0
1576	3	50	if (aorder != 0 && gorder % aorder != 0) return 0;
		50
1577			}
1578
1579	3	50	sqrtn = (n == 0) ? 0 : isqrt(n);
1580	3	50	if (n == 0) n = p-1;
1581
1582			{
1583	3	50	UV maxent = (sqrtn > 0) ? sqrtn+1 : 100000;
1584	3		k = dlp_bsgs(a, g, p, n, maxent/2, /* race rho */ 1);
1585	3	50	if (verbose) printf(" dlp bsgs %"UVuf"k %s\n", maxent/1000, k!=0 ? "success" : "failure");
		0
1586	3	50	if (k != 0) return k;
1587	0	0	if (sqrtn > 0 && sqrtn < maxent) return 0;
		0
1588			}
1589
1590	0	0	if (verbose) printf(" dlp doing exhaustive trial\n");
1591	0		k = dlp_trial(a, g, p, p);
1592	0		return k;
1593			}
1594
1595			/* Silver-Pohlig-Hellman */
1596	5		static UV znlog_ph(UV a, UV g, UV p, UV p1) {
1597			UV fac[MPU_MAX_FACTORS+1];
1598			UV exp[MPU_MAX_FACTORS+1];
1599			int i, nfactors;
1600			UV x, j;
1601
1602	5	50	if (p1 == 0) return 0; /* TODO: Should we plow on with p1=p-1? */
1603	5		nfactors = factor_exp(p1, fac, exp);
1604	5	50	if (nfactors == 1)
1605	0		return znlog_solve(a, g, p, p1);
1606	21	100	for (i = 0; i < nfactors; i++) {
1607			UV pi, delta, gamma;
1608	17	100	pi = fac[i]; for (j = 1; j < exp[i]; j++) pi *= fac[i];
1609	16		delta = powmod(a,p1/pi,p);
1610	16		gamma = powmod(g,p1/pi,p);
1611			/* printf(" solving znlog(%"UVuf",%"UVuf",%"UVuf")\n", delta, gamma, p); */
1612	16		fac[i] = znlog_solve( delta, gamma, p, znorder(gamma,p) );
1613	16		exp[i] = pi;
1614			}
1615	5		x = chinese(fac, exp, nfactors, &i);
1616	5	50	if (i == 1 && powmod(g, x, p) == a)
		50
1617	5		return x;
1618	5		return 0;
1619			}
1620
1621			/* Find smallest k where a = g^k mod p */
1622	20		UV znlog(UV a, UV g, UV p) {
1623			UV k, gorder, aorder;
1624	20		const int verbose = _XS_get_verbose();
1625
1626	20	50	if (a >= p) a %= p;
1627	20	50	if (g >= p) g %= p;
1628
1629	20	100	if (a == 1 \|\| g == 0 \|\| p <= 2)
		50
		50
1630	2		return 0;
1631
1632			/* TODO: We call znorder with the same p many times. We should have a
1633			* method for znorder given {phi,nfactors,fac,exp} */
1634
1635	18		gorder = znorder(g,p);
1636	18	100	if (gorder != 0 && powmod(a, gorder, p) != 1) return 0;
		100
1637	16		aorder = znorder(a,p);
1638	16	100	if (aorder == 0 && gorder != 0) return 0;
		50
1639	16	100	if (aorder != 0 && gorder % aorder != 0) return 0;
		50
1640
1641			/* TODO: Come up with a better solution for a=0 */
1642	16	100	if (a == 0 \|\| p < DLP_TRIAL_NUM \|\| (gorder > 0 && gorder < DLP_TRIAL_NUM)) {
		100
		50
		50
1643	11	50	if (verbose > 1) printf(" dlp trial znlog(%"UVuf",%"UVuf",%"UVuf")\n",a,g,p);
1644	11		k = dlp_trial(a, g, p, p);
1645	11		return k;
1646			}
1647
1648	5	50	if (!is_prob_prime(gorder)) {
1649	5		k = znlog_ph(a, g, p, gorder);
1650	5	50	if (verbose) printf(" dlp PH %s\n", k!=0 ? "success" : "failure");
		0
1651	5	50	if (k != 0) return k;
1652			}
1653
1654	0		return znlog_solve(a, g, p, gorder);
1655			}
1656
1657
1658			/* Compile with:
1659			* gcc -O3 -fomit-frame-pointer -march=native -Wall -DFACTOR_STANDALONE -DSTANDALONE factor.c util.c sieve.c cache.c primality.c lmo.c -lm
1660			*/
1661			#ifdef FACTOR_STANDALONE
1662			#include
1663			int main(int argc, char *argv[])
1664			{
1665			UV n;
1666			UV factors[MPU_MAX_FACTORS+1];
1667			int nfactors, i, a;
1668
1669			if (argc <= 1) { printf("usage: %s \n", argv[0]); return(1); }
1670
1671			for (a = 1; a < argc; a++) {
1672			n = strtoul(argv[a], 0, 10);
1673			if (n == ULONG_MAX && errno == ERANGE) { printf("Argument larger than ULONG_MAX\n"); return(-1); }
1674			nfactors = factor(n, factors);
1675			printf("%"UVuf":", n);
1676			for (i = 0; i < nfactors; i++)
1677			printf(" %"UVuf"", factors[i]);
1678			printf("\n");
1679			}
1680
1681			return(0);
1682			}
1683			#endif