File Coverage

lmo.c

Criterion	Covered	Total	%
statement	314	346	90.7
branch	222	306	72.5
condition			n/a
subroutine			n/a
pod			n/a
total	536	652	82.2

line	stmt	bran	code
1			#include
2			#include
3			#include
4			#include
5
6			/*****************************************************************************
7			*
8			* Prime counts using the extended Lagarias-Miller-Odlyzko combinatorial method.
9			*
10			* Copyright (c) 2013-2014 Dana Jacobsen (dana@acm.org)
11			* This is free software; you can redistribute it and/or modify it under
12			* the same terms as the Perl 5 programming language system itself.
13			*
14			* This file is part of the Math::Prime::Util Perl module, but it should
15			* not be difficult to turn it into standalone code.
16			*
17			* The structure of the main routine is based on Christian Bau's earlier work.
18			*
19			* References:
20			* - Christian Bau's paper and example implementation, 2003, Christian Bau
21			* This was of immense help. References to "step #" refer to this preprint.
22			* - "Computing Pi(x): the combinatorial method", 2006, Tomás Oliveira e Silva
23			* - "Computing Pi(x): The Meissel, Lehmer, Lagarias, Miller, Odlyzko Method"
24			* 1996, Deléglise and Rivat.
25			*
26			* Comparisons to the other prime counting implementations in this package:
27			*
28			* Sieve: Segmented, single threaded, thread-safe. Small table enhanced,
29			* fastest for n < 60M. Bad growth rate (like all sieves will have).
30			* Legendre:Simple. Recursive caching phi.
31			* Meissel: Simple. Non-recursive phi, lots of memory.
32			* Lehmer: Non-recursive phi, tries to restrict memory.
33			* LMOS: Simple. Non-recursive phi, less memory than Lehmer above.
34			* LMO: Sieve phi. Much faster and less memory than the others.
35			*
36			* Timing below is single core Haswell 4770K using Math::Prime::Util.
37			*
38			* | n | Legendre | Meissel | Lehmer | LMOS | LMO |
39			* +-------+----------+----------+----------+----------+-----------+
40			* | 10^19 | | | | | 2493.4 |
41			* | 10^18 | | | | | 498.16 |
42			* | 10^17 |10459.3 | 4348.3 | 6109.7 | 3478.0 | 103.03 |
43			* | 10^16 | 1354.6 | 510.8 | 758.6 | 458.4 | 21.64 |
44			* | 10^15 | 171.2 | 97.1 | 106.4 | 68.11 | 4.707 |
45			* | 10^14 | 23.56 | 18.59 | 16.51 | 10.44 | 1.032 |
46			* | 10^13 | 3.783 | 3.552 | 2.803 | 1.845 | 0.237 |
47			* | 10^12 | 0.755 | 0.697 | 0.505 | 0.378 | 54.9ms |
48			* | 10^11 | 0.165 | 0.144 | 93.7ms| 81.6ms| 13.80ms|
49			* | 10^10 | 35.9ms| 29.9ms| 19.9ms| 17.8ms| 3.64ms|
50			*
51			* Run with high memory limits: Meissel uses 1GB for 10^16, ~3GB for 10^17.
52			* Lehmer is limited at high n values by sieving speed. It is much faster
53			* using parallel primesieve, though cannot come close to LMO.
54			*/
55
56			/* Adjust to get best performance. Alpha from TOS paper. */
57			#define M_FACTOR(n) (UV) ((double)n * (log(n)/log(5.2)) * (log(log(n))-1.4))
58			/* Size of segment used for previous primes, must be >= 21 */
59			#define PREV_SIEVE_SIZE 512
60			/* Phi sieve multiplier, adjust for best performance and memory use. */
61			#define PHI_SIEVE_MULT 13
62
63			#define FUNC_isqrt 1
64			#define FUNC_icbrt 1
65			#include "lmo.h"
66			#include "util.h"
67			#include "constants.h"
68			#include "prime_nth_count.h"
69			#include "cache.h"
70			#include "sieve.h"
71
72			#ifdef _MSC_VER
73			typedef unsigned __int8 uint8;
74			typedef unsigned __int16 uint16;
75			typedef unsigned __int32 uint32;
76			#else
77			typedef unsigned char uint8;
78			typedef unsigned short uint16;
79			typedef uint32_t uint32;
80			#endif
81
82			/* UV is either uint32 or uint64 depending on Perl. We use this native size
83			* for the basic unit of the phi sieve. It can be easily overridden here. */
84			typedef UV sword_t;
85			#define SWORD_BITS BITS_PER_WORD
86			#define SWORD_ONES UV_MAX
87			#define SWORD_MASKBIT(bits) (UVCONST(1) << ((bits) % SWORD_BITS))
88			#define SWORD_CLEAR(s,bits) s[bits/SWORD_BITS] &= ~SWORD_MASKBIT(bits)
89
90			/* GCC 3.4 - 4.1 has broken 64-bit popcount.
91			* GCC 4.2+ can generate awful code when it doesn't have asm (GCC bug 36041).
92			* When the asm is present (e.g. compile with -march=native on a platform that
93			* has them, like Nehalem+), then it is almost as fast as the direct asm. */
94			#if SWORD_BITS == 64
95			#if defined(__POPCNT__) && defined(__GNUC__) && (__GNUC__> 4 || (__GNUC__== 4 && __GNUC_MINOR__> 1))
96			#define bitcount(b) __builtin_popcountll(b)
97			#else
98	23671812		static sword_t bitcount(sword_t b) {
99	23671812		b -= (b >> 1) & 0x5555555555555555;
100	23671812		b = (b & 0x3333333333333333) + ((b >> 2) & 0x3333333333333333);
101	23671812		b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0f;
102	23671812		return (b * 0x0101010101010101) >> 56;
103			}
104			#endif
105			#else
106			/* An 8-bit table version is usually a little faster, but this is simpler. */
107			static sword_t bitcount(sword_t b) {
108			b -= (b >> 1) & 0x55555555;
109			b = (b & 0x33333333) + ((b >> 2) & 0x33333333);
110			b = (b + (b >> 4)) & 0x0f0f0f0f;
111			return (b * 0x01010101) >> 24;
112			}
113			#endif
114
115
116			/* Create array of small primes: 0,2,3,5,...,prev_prime(n+1) */
117	847		static uint32_t* make_primelist(uint32 n, uint32* number_of_primes)
118			{
119	847		uint32 i = 0;
120			uint32_t* plist;
121	847		double logn = log(n);
122	847	50	uint32 max_index = (n < 67) ? 18
		50
123	847		: (n < 355991) ? 15+(n/(logn-1.09))
124	0		: (n/logn) * (1.0+1.0/logn+2.51/(logn*logn));
125	847		*number_of_primes = 0;
126	847	50	New(0, plist, max_index+1, uint32_t);
127	847		plist[0] = 0;
128			/* We could do a simple SoE here. This is not time critical. */
129	237891	50	START_DO_FOR_EACH_PRIME(2, n) {
		100
		100
		100
		100
		100
		100
		100
		50
		100
130	237030		plist[++i] = p;
131	237030		} END_DO_FOR_EACH_PRIME;
132	847		*number_of_primes = i;
133	847		return plist;
134			}
135			#if 0 /* primesieve 5.0 example */
136			#include
137			static uint32_t* make_primelist(uint32 n, uint32* number_of_primes) {
138			uint32_t plist;
139			uint32_t* psprimes = generate_primes(2, n, number_of_primes, UINT_PRIMES);
140			New(0, plist, *number_of_primes + 1, uint32_t);
141			plist[0] = 0;
142			memcpy(plist+1, psprimes, *number_of_primes * sizeof(uint32_t));
143			primesieve_free(psprimes);
144			return plist;
145			}
146			#endif
147
148			/* Given a max prime in small prime list, return max prev prime input */
149	847		static uint32 prev_sieve_max(UV maxprime) {
150	847		UV limit = maxprime*maxprime - (maxprime*maxprime % (16*PREV_SIEVE_SIZE)) - 1;
151	847		return (limit > U32_CONST(4294967295)) ? U32_CONST(4294967295) : limit;
152			}
153
154			/* Simple SoE filling a segment */
155	2571		static void _prev_sieve_fill(UV start, uint8* sieve, const uint32_t* primes) {
156			UV i, j, p;
157	2571		memset( sieve, 0xFF, PREV_SIEVE_SIZE );
158	94946	100	for (i = 2, p = 3; p*p < start + (16*PREV_SIEVE_SIZE); p = primes[++i])
159	14178014	100	for (j = (start == 0) ? p*p/2 : (p-1) - ((start+(p-1))/2) % p;
		100
160	14085639		j < (8*PREV_SIEVE_SIZE); j += p)
161	14085639		sieve[j/8] &= ~(1U << (j%8));
162	2571		}
163
164			/* Calculate previous prime using small segment */
165	1671328		static uint32 prev_sieve_prime(uint32 n, uint8* sieve, uint32* segment_start, uint32 sieve_max, const uint32_t* primes)
166			{
167			uint32 sieve_start, bit_offset;
168	1671328	50	if (n <= 3) return (n == 3) ? 2 : 0;
		0
169	1671328	50	if (n > sieve_max) croak("ps overflow\n");
170
171			/* If n > 3 && n <= sieve_max, then there is an odd prime we can find. */
172	1671328		n -= 2;
173	1671328		bit_offset = n % (16*PREV_SIEVE_SIZE);
174	1671328		sieve_start = n - bit_offset;
175	1671328		bit_offset >>= 1;
176
177			while (1) {
178	1672926	100	if (sieve_start != *segment_start) { /* Fill sieve if necessary */
179	2571		_prev_sieve_fill(sieve_start, sieve, primes);
180	2571		*segment_start = sieve_start;
181			}
182			do { /* Look for a set bit in sieve */
183	7894821	100	if (sieve[bit_offset / 8] & (1u << (bit_offset % 8)))
184	1671328		return sieve_start + 2*bit_offset + 1;
185	6223493	100	} while (bit_offset-- > 0);
186	1598		sieve_start -= (16 * PREV_SIEVE_SIZE);
187	1598		bit_offset = ((16 * PREV_SIEVE_SIZE) - 1) / 2;
188	1598		}
189			}
190
191			/* Create factor table.
192			* In lehmer.c we create mu and lpf arrays. Here we use Christian Bau's
193			* method, which is slightly more memory efficient and also a bit faster than
194			* the code there (which does not use our fast ranged moebius). It makes
195			* very little difference -- mainly using this table is more convenient.
196			*
197			* In a uint16 we have stored:
198			* 0 moebius(n) = 0
199			* even moebius(n) = 1
200			* odd moebius(n) = -1 (last bit indicates even/odd number of factors)
201			* v smallest odd prime factor of n is v&1
202			* 65535 large prime
203			*/
204	847		static uint16* ft_create(uint32 max)
205			{
206			uint16* factor_table;
207			uint32 i;
208	847		uint32 tableLimit = max + 338 + 1; /* At least one more prime */
209	847		uint32 tableSize = tableLimit/2;
210	847		uint32 max_prime = (tableLimit - 1) / 3 + 1;
211
212	847		New(0, factor_table, tableSize, uint16);
213
214			/* Set all values to 65535 (a large prime), set 0 to 65534. */
215	847		factor_table[0] = 65534;
216	720378	100	for (i = 1; i < tableSize; ++i)
217	719531		factor_table[i] = 65535;
218
219			/* Process each odd. */
220	720378	100	for (i = 1; i < tableSize; ++i) {
221			uint32 factor, max_factor;
222	719531		uint32 p = i*2+1;
223	719531	100	if (factor_table[i] != 65535) /* Already marked. */
224	502159		continue;
225	217372	50	if (p < 65535) /* p is a small prime, so set the number. */
226	217372		factor_table[i] = p;
227	217372	100	if (p >= max_prime) /* No multiples will be in the table */
228	132060		continue;
229
230	85312		max_factor = (tableLimit - 1) / p + 1;
231			/* Look for odd multiples of the prime p. */
232	1189737	100	for (factor = 3; factor < max_factor; factor += 2) {
233	1104425		uint32 index = (p*factor)/2;
234	1104425	100	if (factor_table[index] == 65535) /* p is smallest factor */
235	502159		factor_table[index] = p;
236	602266	100	else if (factor_table[index] > 0) /* Change number of factors */
237	476142		factor_table[index] ^= 0x01;
238			}
239
240			/* Change all odd multiples of p*p to 0 to indicate non-square-free. */
241	228229	100	for (factor = p; factor < max_factor; factor += 2*p)
242	142917		factor_table[ (p*factor) / 2] = 0;
243			}
244	847		return factor_table;
245			}
246
247			#define PHIC 6
248
249			/* static const uint8_t _s0[ 1] = {0};
250			static const uint8_t _s1[ 2] = {0,1};
251			static const uint8_t _s2[ 6] = {0,1,1,1,1,2}; */
252			static const uint8_t _s3[30] = {0,1,1,1,1,1,1,2,2,2,2,3,3,4,4,4,4,5,5,6,6,6,6,7,7,7,7,7,7,8};
253			static const uint8_t _s4[210]= {0,1,1,1,1,1,1,1,1,1,1,2,2,3,3,3,3,4,4,5,5,5,5,6,6,6,6,6,6,7,7,8,8,8,8,8,8,9,9,9,9,10,10,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,15,15,15,15,15,15,16,16,16,16,17,17,18,18,18,18,18,18,19,19,19,19,20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,23,23,24,24,24,24,25,25,26,26,26,26,27,27,27,27,27,27,27,27,28,28,28,28,28,28,29,29,29,29,30,30,30,30,30,30,31,31,32,32,32,32,33,33,33,33,33,33,34,34,35,35,35,35,35,35,36,36,36,36,36,36,37,37,37,37,38,38,39,39,39,39,40,40,40,40,40,40,41,41,42,42,42,42,42,42,43,43,43,43,44,44,45,45,45,45,46,46,47,47,47,47,47,47,47,47,47,47,48};
254	411299		static UV tablephi(UV x, uint32 a)
255			{
256	411299		switch (a) {
257	0		case 0: return x;
258	0		case 1: return x-x/2;
259	0		case 2: return x-x/2-x/3+x/6;
260	0		case 3: return (x/ 30U) * 8U + _s3[x % 30U];
261	6		case 4: return (x/ 210U) * 48U + _s4[x % 210U];
262			case 5: {
263	2		UV xp = x / 11U;
264	2		return ((x /210) * 48 + _s4[x % 210]) -
265	2		((xp/210) * 48 + _s4[xp % 210]);
266			}
267			case 6:
268			default:{
269	411291		UV xp = x / 11U;
270	411291		UV x2 = x / 13U;
271	411291		UV x2p = x2 / 11U;
272	411291		return ((x /210) * 48 + _s4[x % 210]) -
273	822582		((xp /210) * 48 + _s4[xp % 210]) -
274	411291		((x2 /210) * 48 + _s4[x2 % 210]) +
275	411291		((x2p/210) * 48 + _s4[x2p% 210]);
276			}
277			/* case 7: return tablephi(x,a-1)-tablephi(x/17,a-1); */ /* Hack hack */
278			}
279			}
280
281			/****************************************************************************/
282			/* Legendre Phi. Not used by LMO, but exported. */
283			/****************************************************************************/
284
285			/*
286			* Choices include:
287			* 1) recursive, memory-less. We use this for small values.
288			* 2) recursive, caching. We use this for larger values w/ 32MB cache.
289			* 3) a-walker sorted list. lehmer.c has this implementation. It is
290			* faster for some values, but big and memory intensive.
291			*/
292	3299		static UV _phi_recurse(UV x, UV a) {
293	3299		UV i, c = (a > PHIC) ? PHIC : a;
294	3299		UV sum = tablephi(x, c);
295	3299	100	if (a > c) {
296	2745		UV p = nth_prime(c);
297	2745		UV pa = nth_prime(a);
298	6029	100	for (i = c+1; i <= a; i++) {
299			UV xp;
300	5877		p = next_prime(p);
301	5877		xp = x/p;
302	5877	100	if (xp < p) {
303	2593	50	while (x < pa) {
304	0		a--;
305	0		pa = prev_prime(pa);
306			}
307	2593		return (sum - a + i - 1);
308			}
309	3284		sum -= legendre_phi(xp, i-1);
310			}
311			}
312	706		return sum;
313			}
314
315			#define PHICACHEA 256
316			#define PHICACHEX 65536
317			#define PHICACHE_EXISTS(x,a) \
318			((x < PHICACHEX && a < PHICACHEA) ? cache[a*PHICACHEX+x] : 0)
319	0		static IV _phi(UV x, UV a, int sign, const uint32_t* const primes, const uint32_t lastidx, uint16_t* cache)
320			{
321			IV sum;
322	0	0	if (PHICACHE_EXISTS(x,a)) return sign * cache[a*PHICACHEX+x];
		0
		0
323	0	0	else if (a <= PHIC) return sign * tablephi(x, a);
324	0	0	else if (x < primes[a+1]) sum = sign;
325			else {
326			/* sum = _phi(x, a-1, sign, primes, lastidx, cache) + */
327			/* _phi(x/primes[a], a-1, -sign, primes, lastidx, cache); */
328	0	0	UV a2, iters = (a*a > x) ? segment_prime_count(2,isqrt(x)) : a;
329	0		UV c = (iters > PHIC) ? PHIC : iters;
330	0	0	IV phixc = PHICACHE_EXISTS(x,c) ? cache[a*PHICACHEX+x] : tablephi(x,c);
		0
		0
331	0		sum = sign * (iters - a + phixc);
332	0	0	for (a2 = c+1; a2 <= iters; a2++)
333	0		sum += _phi(x/primes[a2], a2-1, -sign, primes, lastidx, cache);
334			}
335	0	0	if (x < PHICACHEX && a < PHICACHEA && sign*sum <= SHRT_MAX)
		0
		0
336	0		cache[a*PHICACHEX+x] = sign * sum;
337	0		return sum;
338			}
339	3301		UV legendre_phi(UV x, UV a)
340			{
341			/* If 'x' is very small, give a quick answer with any 'a' */
342	3301	100	if (x <= PHIC)
343	2		return tablephi(x, (a > PHIC) ? PHIC : a);
344
345			/* Shortcuts for large values, from R. Andrew Ohana */
346	3299	50	if (a > (x >> 1)) return 1;
347			/* If a > prime_count(2^32), then we need not be concerned with composite
348			* x values with all factors > 2^32, as x is limited to 64-bit. */
349	3299	50	if (a > 203280221) { /* prime_count(2**32) */
350	0		UV pc = LMO_prime_count(x);
351	0	0	return (a > pc) ? 1 : pc - a + 1;
352			}
353			/* If a is large enough, check the ratios */
354	3299	50	if (a > 1000000 && x < a*21) { /* x always less than 2^32 */
		0
355	0	0	if ( LMO_prime_count(x) < a) return 1;
356			}
357
358			/* TODO: R. Andrew Ohana's 2011 SAGE code is faster as the a value
359			* increases. It uses a primelist as in the caching code below, as
360			* well as a binary search prime count on it (like in our lehmer). */
361
362	3299	50	if ( a > 254 || (x > 1000000000 && a > 30) ) {
		50
		0
363			uint16_t* cache;
364			uint32_t* primes;
365			uint32_t lastidx;
366	0	0	UV res, max_cache_a = (a >= PHICACHEA) ? PHICACHEA : a+1;
367	0	0	Newz(0, cache, PHICACHEX * max_cache_a, uint16_t);
368	0		primes = make_primelist(nth_prime(a+1), &lastidx);
369	0		res = (UV) _phi(x, a, 1, primes, lastidx, cache);
370	0		Safefree(primes);
371	0		Safefree(cache);
372	0		return res;
373			}
374
375	3299		return _phi_recurse(x, a);
376			}
377			/****************************************************************************/
378
379
380			typedef struct {
381			sword_t *sieve; /* segment bit mask */
382			uint8 *word_count; /* bit count in each 64-bit word */
383			uint32 *word_count_sum; /* cumulative sum of word_count */
384			UV *totals; /* total bit count for all phis at index */
385			uint32 *prime_index; /* index of prime where phi(n/p/p(k+1))=1 */
386			uint32 *first_bit_index; /* offset relative to start for this prime */
387			uint8 *multiplier; /* mod-30 wheel of each prime */
388			UV start; /* x value of first bit of segment */
389			UV phi_total; /* cumulative bit count before removal */
390			uint32 size; /* segment size in bits */
391			uint32 first_prime; /* index of first prime in segment */
392			uint32 last_prime; /* index of last prime in segment */
393			uint32 last_prime_to_remove; /* index of last prime p, p^2 in segment */
394			} sieve_t;
395
396			/* Size of phi sieve in words. Multiple of 3*5*7*11 words. */
397			#define PHI_SIEVE_WORDS (1155 * PHI_SIEVE_MULT)
398
399			/* Bit counting using cumulative sums. A bit slower than using a running sum,
400			* but a little simpler and can be run in parallel. */
401	97443		static uint32 make_sieve_sums(uint32 sieve_size, const uint8* sieve_word_count, uint32* sieve_word_count_sum) {
402	97443		uint32 i, bc, words = (sieve_size + 2*SWORD_BITS-1) / (2*SWORD_BITS);
403	97443		sieve_word_count_sum[0] = 0;
404	40468260	100	for (i = 0, bc = 0; i+7 < words; i += 8) {
405	40370817		const uint8* cntptr = sieve_word_count + i;
406	40370817		uint32* sumptr = sieve_word_count_sum + i;
407	40370817		sumptr[1] = bc += cntptr[0];
408	40370817		sumptr[2] = bc += cntptr[1];
409	40370817		sumptr[3] = bc += cntptr[2];
410	40370817		sumptr[4] = bc += cntptr[3];
411	40370817		sumptr[5] = bc += cntptr[4];
412	40370817		sumptr[6] = bc += cntptr[5];
413	40370817		sumptr[7] = bc += cntptr[6];
414	40370817		sumptr[8] = bc += cntptr[7];
415			}
416	465234	100	for (; i < words; i++)
417	367791		sieve_word_count_sum[i+1] = sieve_word_count_sum[i] + sieve_word_count[i];
418	97443		return sieve_word_count_sum[words];
419			}
420
421	21715908		static UV _sieve_phi(UV segment_x, const sword_t* sieve, const uint32* sieve_word_count_sum) {
422	21715908		uint32 bits = (segment_x + 1) / 2;
423	21715908		uint32 words = bits / SWORD_BITS;
424	21715908		uint32 sieve_sum = sieve_word_count_sum[words];
425	21715908		sieve_sum += bitcount( sieve[words] & ~(SWORD_ONES << (bits % SWORD_BITS)) );
426	21715908		return sieve_sum;
427			}
428
429			/* Erasing primes from the sieve is done using Christian Bau's
430			* case statement walker. It's not pretty, but it is short, fast,
431			* clever, and does the job. */
432
433			#define sieve_zero(sieve, si, wordcount) \
434			{ uint32 index_ = si/SWORD_BITS; \
435			sword_t mask_ = SWORD_MASKBIT(si); \
436			if (sieve[index_] & mask_) { \
437			sieve[index_] &= ~mask_; \
438			wordcount[index_]--; \
439			} }
440
441			#define sieve_case_zero(casenum, skip, si, p, size, mult, sieve, wordcount) \
442			case casenum: sieve_zero(sieve, si, wordcount); \
443			si += skip * p; \
444			mult = (casenum+1) % 8; \
445			if (si >= size) break;
446
447	97443		static void remove_primes(uint32 index, uint32 last_index, sieve_t* s, const uint32_t* primes)
448			{
449	97443		uint32 size = (s->size + 1) / 2;
450	97443		sword_t *sieve = s->sieve;
451	97443		uint8 *word_count = s->word_count;
452
453	97443		s->phi_total = s->totals[last_index];
454	207857	100	for ( ;index <= last_index; index++) {
455	110414	100	if (index >= s->first_prime && index <= s->last_prime) {
		50
456	96696		uint32 b = (primes[index] - (uint32) s->start - 1) / 2;
457	96696	50	sieve_zero(sieve, b, word_count);
458			}
459	110414	100	if (index <= s->last_prime_to_remove) {
460	78980		uint32 b = s->first_bit_index[index];
461	78980	50	if (b < size) {
462	78980		uint32 p = primes[index];
463	78980		uint32 mult = s->multiplier[index];
464	78980		switch (mult) {
465			reloop: ;
466	6677343	100	sieve_case_zero(0, 3, b, p, size, mult, sieve, word_count);
		100
467	6672453	100	sieve_case_zero(1, 2, b, p, size, mult, sieve, word_count);
		100
468	6671341	100	sieve_case_zero(2, 1, b, p, size, mult, sieve, word_count);
		100
469	6676432	100	sieve_case_zero(3, 2, b, p, size, mult, sieve, word_count);
		100
470	6676293	100	sieve_case_zero(4, 1, b, p, size, mult, sieve, word_count);
		100
471	6679526	100	sieve_case_zero(5, 2, b, p, size, mult, sieve, word_count);
		100
472	6679326	100	sieve_case_zero(6, 3, b, p, size, mult, sieve, word_count);
		100
473	6673916	100	sieve_case_zero(7, 1, b, p, size, mult, sieve, word_count);
		100
474	6668725		goto reloop;
475			}
476	78980		s->multiplier[index] = mult;
477			}
478	78980		s->first_bit_index[index] = b - size;
479			}
480			}
481	97443		s->totals[last_index] += make_sieve_sums(s->size, s->word_count, s->word_count_sum);
482	97443		}
483
484	3468		static void word_tile (sword_t* source, uint32 from, uint32 to) {
485	13030	100	while (from < to) {
486	9562	100	uint32 words = (2*from > to) ? to-from : from;
487	9562		memcpy(source+from, source, sizeof(sword_t)*words);
488	9562		from += words;
489			}
490	3468		}
491
492	867		static void init_segment(sieve_t* s, UV segment_start, uint32 size, uint32 start_prime_index, uint32 sieve_last, const uint32_t* primes)
493			{
494			uint32 i, words;
495	867		sword_t* sieve = s->sieve;
496	867		uint8* word_count = s->word_count;
497
498	867		s->start = segment_start;
499	867		s->size = size;
500
501	867	100	if (segment_start == 0) {
502	847		s->last_prime = 0;
503	847		s->last_prime_to_remove = 0;
504			}
505	867		s->first_prime = s->last_prime + 1;
506	101798	100	while (s->last_prime < sieve_last) {
507	100931		uint32 p = primes[s->last_prime + 1];
508	100931	50	if (p >= segment_start + size)
509	0		break;
510	100931		s->last_prime++;
511			}
512	78126	50	while (s->last_prime_to_remove < sieve_last) {
513	78126		UV p = primes[s->last_prime_to_remove + 1];
514	78126		UV p2 = p*p;
515	78126	100	if (p2 >= segment_start + size)
516	867		break;
517	77259		s->last_prime_to_remove++;
518	77259		s->first_bit_index[s->last_prime_to_remove] = (p2 - segment_start - 1) / 2;
519	77259		s->multiplier[s->last_prime_to_remove] = (uint8) ((p % 30) * 8 / 30);
520			}
521
522	867		memset(sieve, 0xFF, 3*sizeof(sword_t)); /* Set first 3 words to all 1 bits */
523	867	50	if (start_prime_index >= 3) /* Remove multiples of 3. */
524	56355	100	for (i = 3/2; i < 3 * SWORD_BITS; i += 3)
525	55488		SWORD_CLEAR(sieve, i);
526
527	867		word_tile(sieve, 3, 15); /* Copy to first 15 = 3*5 words */
528	867	50	if (start_prime_index >= 3) /* Remove multiples of 5. */
529	167331	100	for (i = 5/2; i < 15 * SWORD_BITS; i += 5)
530	166464		SWORD_CLEAR(sieve, i);
531
532	867		word_tile(sieve, 15, 105); /* Copy to first 105 = 3*5*7 words */
533	867	50	if (start_prime_index >= 4) /* Remove multiples of 7. */
534	833187	100	for (i = 7/2; i < 105 * SWORD_BITS; i += 7)
535	832320		SWORD_CLEAR(sieve, i);
536
537	867		word_tile(sieve, 105, 1155); /* Copy to first 1155 = 3*5*7*11 words */
538	867	50	if (start_prime_index >= 5) /* Remove multiples of 11. */
539	5827107	100	for (i = 11/2; i < 1155 * SWORD_BITS; i += 11)
540	5826240		SWORD_CLEAR(sieve, i);
541
542	867		size = (size+1) / 2; /* size to odds */
543	867		words = (size + SWORD_BITS-1) / SWORD_BITS; /* sieve size in words */
544	867		word_tile(sieve, 1155, words); /* Copy first 1155 words to rest */
545			/* Zero all unused bits and words */
546	867	100	if (size % SWORD_BITS)
547	829		sieve[words-1] &= ~(SWORD_ONES << (size % SWORD_BITS));
548	867		memset(sieve + words, 0x00, sizeof(sword_t)*(PHI_SIEVE_WORDS+2 - words));
549
550			/* Create counts, remove primes (updating counts and sums). */
551	1956771	100	for (i = 0; i < words; i++)
552	1955904		word_count[i] = (uint8) bitcount(sieve[i]);
553	867		remove_primes(6, start_prime_index, s, primes);
554	867		}
555
556			/* However we want to handle reduced prime counts */
557			#define simple_pi(n) LMO_prime_count(n)
558			/* Macros to hide all the variables being passed */
559			#define prev_sieve_prime(n) \
560			prev_sieve_prime(n, &prev_sieve[0], &ps_start, ps_max, primes)
561			#define sieve_phi(x) \
562			ss.phi_total + _sieve_phi((x) - ss.start, ss.sieve, ss.word_count_sum)
563
564
565	52460		UV LMO_prime_count(UV n)
566			{
567			UV N2, N3, K2, K3, M, sum1, sum2, phi_value;
568			UV sieve_start, sieve_end, least_divisor, step7_max, last_phi_sieve;
569			uint32 j, k, piM, KM, end, prime, prime_index;
570			uint32 ps_start, ps_max, smallest_divisor, nprimes;
571			uint8 prev_sieve[PREV_SIEVE_SIZE];
572			uint32_t *primes;
573			uint16 *factor_table;
574			sieve_t ss;
575
576	52460		const uint32 c = PHIC; /* We can use our fast function for this */
577
578			/* For "small" n, use our table+segment sieve. */
579	52460	100	if (n < _MPU_LMO_CROSSOVER || n < 10000) return segment_prime_count(2, n);
		50
580			/* n should now be reasonably sized (not tiny). */
581
582			#ifdef USE_PRIMECOUNT_FOR_LARGE_LMO
583			if (n > 110000000000UL) {
584			FILE *f;
585			char cmd[100];
586			sprintf(cmd, "primecount %lu", n);
587			f = popen(cmd, "r");
588			fscanf(f, "%lu", &sum1);
589			pclose(f);
590			return sum1;
591			}
592			#endif
593
594	847		N2 = isqrt(n); /* floor(N^1/2) */
595	847		N3 = icbrt(n); /* floor(N^1/3) */
596	847		K2 = simple_pi(N2); /* Pi(N2) */
597	847		K3 = simple_pi(N3); /* Pi(N3) */
598
599			/* M is N^1/3 times a tunable performance factor. */
600	847	100	M = (N3 > 500) ? M_FACTOR(N3) : N3+N3/2;
601	847	50	if (M >= N2) M = N2 - 1; /* M must be smaller than N^1/2 */
602	847	50	if (M < N3) M = N3; /* M must be at least N^1/3 */
603
604			/* Create the array of small primes, and least-prime-factor/moebius table */
605	847		primes = make_primelist( M + 500, &nprimes );
606	847		factor_table = ft_create( M );
607
608			/* Create other arrays */
609	847		New(0, ss.sieve, PHI_SIEVE_WORDS + 2, sword_t);
610	847		New(0, ss.word_count, PHI_SIEVE_WORDS + 2, uint8);
611	847		New(0, ss.word_count_sum, PHI_SIEVE_WORDS + 2, uint32);
612	847	50	New(0, ss.totals, K3+2, UV);
613	847	50	New(0, ss.prime_index, K3+2, uint32);
614	847	50	New(0, ss.first_bit_index, K3+2, uint32);
615	847		New(0, ss.multiplier, K3+2, uint8);
616
617	847	50	if (ss.sieve == 0 || ss.word_count == 0 || ss.word_count_sum == 0 ||
		50
		50
		50
618	847	50	ss.totals == 0 || ss.prime_index == 0 || ss.first_bit_index == 0 ||
		50
		50
619	847		ss.multiplier == 0)
620	0		croak("Allocation failure in LMO Pi\n");
621
622			/* Variables for fast prev_prime using small segment sieves (up to M^2) */
623	847		ps_max = prev_sieve_max( primes[nprimes] );
624	847		ps_start = U32_CONST(0xFFFFFFFF);
625
626			/* Look for the smallest divisor: the smallest number > M which is
627			* square-free and not divisible by any prime covered by our Mapes
628			* small-phi case. The largest value we will look up in the phi
629			* sieve is n/smallest_divisor. */
630	1792	100	for (j = (M+1)/2; factor_table[j] <= primes[c]; j++) /* */;
631	847		smallest_divisor = 2*j+1;
632			/* largest_divisor = (N2 > (UV)M * (UV)M) ? N2 : (UV)M * (UV)M; */
633
634	847		M = smallest_divisor - 1; /* Increase M if possible */
635	847		piM = simple_pi(M);
636	847	50	if (piM < c) croak("N too small for LMO\n");
637	847		last_phi_sieve = n / smallest_divisor + 1;
638
639			/* KM = smallest k, c <= k <= piM, s.t. primes[k+1] * primes[k+2] > M. */
640	4645	100	for (KM = c; primes[KM+1] * primes[KM+2] <= M && KM < piM; KM++) /* */;
		50
641	847	50	if (K3 < KM) K3 = KM; /* Ensure K3 >= KM */
642
643			/* Start calculating Pi(n). Steps 4-10 from Bau. */
644	847		sum1 = (K2 - 1) + (UV) (piM - K3 - 1) * (UV) (piM - K3) / 2;
645	847		sum2 = 0;
646	847		end = (M+1)/2;
647
648			/* Start at index K2, which is the prime preceding N^1/2 */
649	847	50	prime = prev_sieve_prime( (N2 >= ps_start) ? ps_start : N2+1 );
650	847		prime_index = K2 - 1;
651	847		step7_max = K3;
652
653			/* Step 4: For 1 <= x <= M where x is square-free and has no
654			* factor <= primes[c], sum phi(n / x, c). */
655	579027	100	for (j = 0; j < end; j++) {
656	578180		uint32 lpf = factor_table[j];
657	578180	100	if (lpf > primes[c]) {
658	216352		phi_value = tablephi(n / (2*j+1), c); /* x = 2j+1 */
659	216352	100	if (lpf & 0x01) sum2 += phi_value; else sum1 += phi_value;
660			}
661			}
662
663			/* Step 5: For 1+M/primes[c+1] <= x <= M, x square-free and
664			* has no factor <= primes[c+1], sum phi(n / (x*primes[c+1]), c). */
665	847	50	if (c < piM) {
666	847		UV pc_1 = primes[c+1];
667	545039	100	for (j = (1+M/pc_1)/2; j < end; j++) {
668	544192		uint32 lpf = factor_table[j];
669	544192	100	if (lpf > pc_1) {
670	191646		phi_value = tablephi(n / (pc_1 * (2*j+1)), c); /* x = 2j+1 */
671	191646	100	if (lpf & 0x01) sum1 += phi_value; else sum2 += phi_value;
672			}
673			}
674			}
675
676	102625	100	for (k = 0; k <= K3; k++) ss.totals[k] = 0;
677	9727	100	for (k = 0; k < KM; k++) ss.prime_index[k] = end;
678
679			/* Instead of dividing by all primes up to pi(M), once a divisor is large
680			* enough then phi(n / (p*primes[k+1]), k) = 1. */
681			{
682	847		uint32 last_prime = piM;
683	92898	100	for (k = KM; k < K3; k++) {
684	92051		UV pk = primes[k+1];
685	168721	100	while (last_prime > k+1 && pk * pk * primes[last_prime] > n)
		100
686	76670		last_prime--;
687	92051		ss.prime_index[k] = last_prime;
688	92051		sum1 += piM - last_prime;
689			}
690			}
691
692	1714	100	for (sieve_start = 0; sieve_start < last_phi_sieve; sieve_start = sieve_end) {
693			/* This phi segment goes from sieve_start to sieve_end. */
694	867		sieve_end = ((sieve_start + 2*SWORD_BITS*PHI_SIEVE_WORDS) < last_phi_sieve)
695	867		? sieve_start + 2*SWORD_BITS*PHI_SIEVE_WORDS : last_phi_sieve;
696			/* Only divisors s.t. sieve_start <= N / divisor < sieve_end considered. */
697	867		least_divisor = n / sieve_end;
698			/* Initialize the sieve segment and all associated variables. */
699	867		init_segment(&ss, sieve_start, sieve_end - sieve_start, c, K3, primes);
700
701			/* Step 6: For c < k < KM: For 1+M/primes[k+1] <= x <= M, x square-free
702			* and has no factor <= primes[k+1], sum phi(n / (x*primes[k+1]), k). */
703	4330	100	for (k = c+1; k < KM; k++) {
704	3463		UV pk = primes[k+1];
705	3463	50	uint32 start = (least_divisor >= pk * U32_CONST(0xFFFFFFFE))
706			? U32_CONST(0xFFFFFFFF)
707	3463		: (least_divisor / pk + 1)/2;
708	3463		remove_primes(k, k, &ss, primes);
709	4143648	100	for (j = ss.prime_index[k] - 1; j >= start; j--) {
710	4140185		uint32 lpf = factor_table[j];
711	4140185	100	if (lpf > pk) {
712	1186073		phi_value = sieve_phi(n / (pk * (2*j+1)));
713	1186073	100	if (lpf & 0x01) sum1 += phi_value; else sum2 += phi_value;
714			}
715			}
716	3463	100	if (start < ss.prime_index[k])
717	3446		ss.prime_index[k] = start;
718			}
719			/* Step 7: For KM <= K < Pi_M: For primes[k+2] <= x <= M, sum
720			* phi(n / (x*primes[k+1]), k). The inner for loop can be parallelized. */
721	93113	100	for (; k < step7_max; k++) {
722	92246		remove_primes(k, k, &ss, primes);
723	92246		j = ss.prime_index[k];
724	92246	100	if (j >= k+2) {
725	91971		UV pk = primes[k+1];
726	91971		UV endj = j;
727	2409155	50	while (endj > 7 && endj-7 >= k+2 && pk*primes[endj-7] > least_divisor) endj -= 8;
		100
		100
728	413853	100	while ( endj >= k+2 && pk*primes[endj ] > least_divisor) endj--;
		100
729			/* Now that we know how far to go, do the summations */
730	18951325	100	for ( ; j > endj; j--)
731	18859354		sum1 += sieve_phi(n / (pk*primes[j]));
732	91971		ss.prime_index[k] = endj;
733			}
734			}
735			/* Restrict work for the above loop when we know it will be empty. */
736	92918	100	while (step7_max > KM && ss.prime_index[step7_max-1] < (step7_max-1)+2)
		100
737	92051		step7_max--;
738
739			/* Step 8: For KM <= K < K3, sum -phi(n / primes[k+1], k) */
740	867		remove_primes(k, K3, &ss, primes);
741			/* Step 9: For K3 <= k < K2, sum -phi(n / primes[k+1], k) + (k-K3). */
742	1671348	100	while (prime > least_divisor && prime_index >= piM) {
		50
743	1670481		sum1 += prime_index - K3;
744	1670481		sum2 += sieve_phi(n / prime);
745	1670481		prime_index--;
746	1670481		prime = prev_sieve_prime(prime);
747			}
748			}
749
750	847		Safefree(ss.sieve);
751	847		Safefree(ss.word_count);
752	847		Safefree(ss.word_count_sum);
753	847		Safefree(ss.totals);
754	847		Safefree(ss.prime_index);
755	847		Safefree(ss.first_bit_index);
756	847		Safefree(ss.multiplier);
757	847		Safefree(factor_table);
758	847		Safefree(primes);
759
760	52460		return sum1 - sum2;
761			}