File Coverage

lmo.c

Criterion	Covered	Total	%
statement	314	346	90.7
branch	222	306	72.5
condition			n/a
subroutine			n/a
pod			n/a
total	536	652	82.2

line	stmt	bran	code
1			#include <stdio.h>
2			#include <stdlib.h>
3			#include <string.h>
4			#include <math.h>
5
6			/*****************************************************************************
7			*
8			* Prime counts using the extended Lagarias-Miller-Odlyzko combinatorial method.
9			*
10			* Copyright (c) 2013-2014 Dana Jacobsen (dana@acm.org)
11			* This is free software; you can redistribute it and/or modify it under
12			* the same terms as the Perl 5 programming language system itself.
13			*
14			* This file is part of the Math::Prime::Util Perl module, but it should
15			* not be difficult to turn it into standalone code.
16			*
17			* The structure of the main routine is based on Christian Bau's earlier work.
18			*
19			* References:
20			* - Christian Bau's paper and example implementation, 2003, Christian Bau
21			* This was of immense help. References to "step #" refer to this preprint.
22			* - "Computing Pi(x): the combinatorial method", 2006, Tomás Oliveira e Silva
23			* - "Computing Pi(x): The Meissel, Lehmer, Lagarias, Miller, Odlyzko Method"
24			* 1996, Deléglise and Rivat.
25			*
26			* Comparisons to the other prime counting implementations in this package:
27			*
28			* Sieve: Segmented, single threaded, thread-safe. Small table enhanced,
29			* fastest for n < 60M. Bad growth rate (like all sieves will have).
30			* Legendre:Simple. Recursive caching phi.
31			* Meissel: Simple. Non-recursive phi, lots of memory.
32			* Lehmer: Non-recursive phi, tries to restrict memory.
33			* LMOS: Simple. Non-recursive phi, less memory than Lehmer above.
34			* LMO: Sieve phi. Much faster and less memory than the others.
35			*
36			* Timing below is single core Haswell 4770K using Math::Prime::Util.
37			*
38			* | n | Legendre | Meissel | Lehmer | LMOS | LMO |
39			* +-------+----------+----------+----------+----------+-----------+
40			* | 10^19 | | | | | 2493.4 |
41			* | 10^18 | | | | | 498.16 |
42			* | 10^17 |10459.3 | 4348.3 | 6109.7 | 3478.0 | 103.03 |
43			* | 10^16 | 1354.6 | 510.8 | 758.6 | 458.4 | 21.64 |
44			* | 10^15 | 171.2 | 97.1 | 106.4 | 68.11 | 4.707 |
45			* | 10^14 | 23.56 | 18.59 | 16.51 | 10.44 | 1.032 |
46			* | 10^13 | 3.783 | 3.552 | 2.803 | 1.845 | 0.237 |
47			* | 10^12 | 0.755 | 0.697 | 0.505 | 0.378 | 54.9ms |
48			* | 10^11 | 0.165 | 0.144 | 93.7ms| 81.6ms| 13.80ms|
49			* | 10^10 | 35.9ms| 29.9ms| 19.9ms| 17.8ms| 3.64ms|
50			*
51			* Run with high memory limits: Meissel uses 1GB for 10^16, ~3GB for 10^17.
52			* Lehmer is limited at high n values by sieving speed. It is much faster
53			* using parallel primesieve, though cannot come close to LMO.
54			*/
55
56			/* Below this size, just sieve (with table speedup). */
57			#define SIEVE_LIMIT 66000000
58			/* Adjust to get best performance. Alpha from TOS paper. */
59			#define M_FACTOR(n) (UV) ((double)n * (log(n)/log(5.2)) * (log(log(n))-1.4))
60			/* Size of segment used for previous primes, must be >= 21 */
61			#define PREV_SIEVE_SIZE 512
62			/* Phi sieve multiplier, adjust for best performance and memory use. */
63			#define PHI_SIEVE_MULT 13
64
65			#define FUNC_isqrt 1
66			#define FUNC_icbrt 1
67			#include "lmo.h"
68			#include "util.h"
69			#include "prime_nth_count.h"
70			#include "cache.h"
71			#include "sieve.h"
72
73			#ifdef _MSC_VER
74			typedef unsigned __int8 uint8;
75			typedef unsigned __int16 uint16;
76			typedef unsigned __int32 uint32;
77			#else
78			typedef unsigned char uint8;
79			typedef unsigned short uint16;
80			typedef uint32_t uint32;
81			#endif
82
83			/* UV is either uint32 or uint64 depending on Perl. We use this native size
84			* for the basic unit of the phi sieve. It can be easily overridden here. */
85			typedef UV sword_t;
86			#define SWORD_BITS BITS_PER_WORD
87			#define SWORD_ONES UV_MAX
88			#define SWORD_MASKBIT(bits) (UVCONST(1) << ((bits) % SWORD_BITS))
89			#define SWORD_CLEAR(s,bits) s[bits/SWORD_BITS] &= ~SWORD_MASKBIT(bits)
90
91			/* GCC 3.4 - 4.1 has broken 64-bit popcount.
92			* GCC 4.2+ can generate awful code when it doesn't have asm (GCC bug 36041).
93			* When the asm is present (e.g. compile with -march=native on a platform that
94			* has them, like Nehalem+), then it is almost as fast as the direct asm. */
95			#if SWORD_BITS == 64
96			#if defined(__POPCNT__) && defined(__GNUC__) && (__GNUC__> 4 || (__GNUC__== 4 && __GNUC_MINOR__> 1))
97			#define bitcount(b) __builtin_popcountll(b)
98			#else
99	3084671		static sword_t bitcount(sword_t b) {
100	3084671		b -= (b >> 1) & 0x5555555555555555;
101	3084671		b = (b & 0x3333333333333333) + ((b >> 2) & 0x3333333333333333);
102	3084671		b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0f;
103	3084671		return (b * 0x0101010101010101) >> 56;
104			}
105			#endif
106			#else
107			/* An 8-bit table version is usually a little faster, but this is simpler. */
108			static sword_t bitcount(sword_t b) {
109			b -= (b >> 1) & 0x55555555;
110			b = (b & 0x33333333) + ((b >> 2) & 0x33333333);
111			b = (b + (b >> 4)) & 0x0f0f0f0f;
112			return (b * 0x01010101) >> 24;
113			}
114			#endif
115
116
117			/* Create array of small primes: 0,2,3,5,...,prev_prime(n+1) */
118	20		static uint32_t* make_primelist(uint32 n, uint32* number_of_primes)
119			{
120	20		uint32 i = 0;
121			uint32_t* plist;
122	20		double logn = log(n);
123	20	50	uint32 max_index = (n < 67) ? 18
		50
124	20		: (n < 355991) ? 15+(n/(logn-1.09))
125	0		: (n/logn) * (1.0+1.0/logn+2.51/(logn*logn));
126	20		*number_of_primes = 0;
127	20	50	New(0, plist, max_index+1, uint32_t);
128	20		plist[0] = 0;
129			/* We could do a simple SoE here. This is not time critical. */
130	8956	50	START_DO_FOR_EACH_PRIME(2, n) {
		100
		100
		100
		100
		100
		100
		100
		50
		100
131	8927		plist[++i] = p;
132	8927		} END_DO_FOR_EACH_PRIME;
133	20		*number_of_primes = i;
134	20		return plist;
135			}
136			#if 0 /* primesieve 5.0 example */
137			#include <primesieve.h>
138			static uint32_t* make_primelist(uint32 n, uint32* number_of_primes) {
139			uint32_t plist;
140			uint32_t* psprimes = generate_primes(2, n, number_of_primes, UINT_PRIMES);
141			New(0, plist, *number_of_primes + 1, uint32_t);
142			plist[0] = 0;
143			memcpy(plist+1, psprimes, *number_of_primes * sizeof(uint32_t));
144			primesieve_free(psprimes);
145			return plist;
146			}
147			#endif
148
149			/* Given a max prime in small prime list, return max prev prime input */
150	20		static uint32 prev_sieve_max(UV maxprime) {
151	20		UV limit = maxprime*maxprime - (maxprime*maxprime % (16*PREV_SIEVE_SIZE)) - 1;
152	20		return (limit > U32_CONST(4294967295)) ? U32_CONST(4294967295) : limit;
153			}
154
155			/* Simple SoE filling a segment */
156	140		static void _prev_sieve_fill(UV start, uint8* sieve, const uint32_t* primes) {
157			UV i, j, p;
158	140		memset( sieve, 0xFF, PREV_SIEVE_SIZE );
159	11212	100	for (i = 2, p = 3; p*p < start + (16*PREV_SIEVE_SIZE); p = primes[++i])
160	871881	100	for (j = (start == 0) ? p*p/2 : (p-1) - ((start+(p-1))/2) % p;
		100
161	860809		j < (8*PREV_SIEVE_SIZE); j += p)
162	860809		sieve[j/8] &= ~(1U << (j%8));
163	140		}
164
165			/* Calculate previous prime using small segment */
166	90739		static uint32 prev_sieve_prime(uint32 n, uint8* sieve, uint32* segment_start, uint32 sieve_max, const uint32_t* primes)
167			{
168			uint32 sieve_start, bit_offset;
169	90739	50	if (n <= 3) return (n == 3) ? 2 : 0;
		0
170	90739	50	if (n > sieve_max) croak("ps overflow\n");
171
172			/* If n > 3 && n <= sieve_max, then there is an odd prime we can find. */
173	90739		n -= 2;
174	90739		bit_offset = n % (16*PREV_SIEVE_SIZE);
175	90739		sieve_start = n - bit_offset;
176	90739		bit_offset >>= 1;
177
178			while (1) {
179	90843	100	if (sieve_start != *segment_start) { /* Fill sieve if necessary */
180	140		_prev_sieve_fill(sieve_start, sieve, primes);
181	140		*segment_start = sieve_start;
182			}
183			do { /* Look for a set bit in sieve */
184	514244	100	if (sieve[bit_offset / 8] & (1u << (bit_offset % 8)))
185	90739		return sieve_start + 2*bit_offset + 1;
186	423505	100	} while (bit_offset-- > 0);
187	104		sieve_start -= (16 * PREV_SIEVE_SIZE);
188	104		bit_offset = ((16 * PREV_SIEVE_SIZE) - 1) / 2;
189	104		}
190			}
191
192			/* Create factor table.
193			* In lehmer.c we create mu and lpf arrays. Here we use Christian Bau's
194			* method, which is slightly more memory efficient and also a bit faster than
195			* the code there (which does not use our fast ranged moebius). It makes
196			* very little difference -- mainly using this table is more convenient.
197			*
198			* In a uint16 we have stored:
199			* 0 moebius(n) = 0
200			* even moebius(n) = 1
201			* odd moebius(n) = -1 (last bit indicates even/odd number of factors)
202			* v smallest odd prime factor of n is v|1
203			* 65535 large prime
204			*/
205	20		static uint16* ft_create(uint32 max)
206			{
207			uint16* factor_table;
208			uint32 i;
209	20		uint32 tableLimit = max + 338 + 1; /* At least one more prime */
210	20		uint32 tableSize = tableLimit/2;
211	20		uint32 max_prime = (tableLimit - 1) / 3 + 1;
212
213	20		New(0, factor_table, tableSize, uint16);
214
215			/* Set all values to 65535 (a large prime), set 0 to 65534. */
216	20		factor_table[0] = 65534;
217	33185	100	for (i = 1; i < tableSize; ++i)
218	33165		factor_table[i] = 65535;
219
220			/* Process each odd. */
221	33185	100	for (i = 1; i < tableSize; ++i) {
222			uint32 factor, max_factor;
223	33165		uint32 p = i*2+1;
224	33165	100	if (factor_table[i] != 65535) /* Already marked. */
225	24694		continue;
226	8471	50	if (p < 65535) /* p is a small prime, so set the number. */
227	8471		factor_table[i] = p;
228	8471	100	if (p >= max_prime) /* No multiples will be in the table */
229	5215		continue;
230
231	3256		max_factor = (tableLimit - 1) / p + 1;
232			/* Look for odd multiples of the prime p. */
233	60933	100	for (factor = 3; factor < max_factor; factor += 2) {
234	57677		uint32 index = (p*factor)/2;
235	57677	100	if (factor_table[index] == 65535) /* p is smallest factor */
236	24694		factor_table[index] = p;
237	32983	100	else if (factor_table[index] > 0) /* Change number of factors */
238	25806		factor_table[index] ^= 0x01;
239			}
240
241			/* Change all odd multiples of p*p to 0 to indicate non-square-free. */
242	9892	100	for (factor = p; factor < max_factor; factor += 2*p)
243	6636		factor_table[ (p*factor) / 2] = 0;
244			}
245	20		return factor_table;
246			}
247
248			#define PHIC 6
249
250			/* static const uint8_t _s0[ 1] = {0};
251			static const uint8_t _s1[ 2] = {0,1};
252			static const uint8_t _s2[ 6] = {0,1,1,1,1,2}; */
253			static const uint8_t _s3[30] = {0,1,1,1,1,1,1,2,2,2,2,3,3,4,4,4,4,5,5,6,6,6,6,7,7,7,7,7,7,8};
254			static const uint8_t _s4[210]= {0,1,1,1,1,1,1,1,1,1,1,2,2,3,3,3,3,4,4,5,5,5,5,6,6,6,6,6,6,7,7,8,8,8,8,8,8,9,9,9,9,10,10,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,15,15,15,15,15,15,16,16,16,16,17,17,18,18,18,18,18,18,19,19,19,19,20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,23,23,24,24,24,24,25,25,26,26,26,26,27,27,27,27,27,27,27,27,28,28,28,28,28,28,29,29,29,29,30,30,30,30,30,30,31,31,32,32,32,32,33,33,33,33,33,33,34,34,35,35,35,35,35,35,36,36,36,36,36,36,37,37,37,37,38,38,39,39,39,39,40,40,40,40,40,40,41,41,42,42,42,42,42,42,43,43,43,43,44,44,45,45,45,45,46,46,47,47,47,47,47,47,47,47,47,47,48};
255	24487		static UV tablephi(UV x, uint32 a)
256			{
257	24487		switch (a) {
258	0		case 0: return x;
259	0		case 1: return x-x/2;
260	0		case 2: return x-x/2-x/3+x/6;
261	0		case 3: return (x/ 30U) * 8U + _s3[x % 30U];
262	6		case 4: return (x/ 210U) * 48U + _s4[x % 210U];
263			case 5: {
264	2		UV xp = x / 11U;
265	2		return ((x /210) * 48 + _s4[x % 210]) -
266	2		((xp/210) * 48 + _s4[xp % 210]);
267			}
268			case 6:
269			default:{
270	24479		UV xp = x / 11U;
271	24479		UV x2 = x / 13U;
272	24479		UV x2p = x2 / 11U;
273	24479		return ((x /210) * 48 + _s4[x % 210]) -
274	48958		((xp /210) * 48 + _s4[xp % 210]) -
275	24479		((x2 /210) * 48 + _s4[x2 % 210]) +
276	24479		((x2p/210) * 48 + _s4[x2p% 210]);
277			}
278			/* case 7: return tablephi(x,a-1)-tablephi(x/17,a-1); */ /* Hack hack */
279			}
280			}
281
282			/****************************************************************************/
283			/* Legendre Phi. Not used by LMO, but exported. */
284			/****************************************************************************/
285
286			/*
287			* Choices include:
288			* 1) recursive, memory-less. We use this for small values.
289			* 2) recursive, caching. We use this for larger values w/ 32MB cache.
290			* 3) a-walker sorted list. lehmer.c has this implementation. It is
291			* faster for some values, but big and memory intensive.
292			*/
293	3299		static UV _phi_recurse(UV x, UV a) {
294	3299		UV i, c = (a > PHIC) ? PHIC : a;
295	3299		UV sum = tablephi(x, c);
296	3299	100	if (a > c) {
297	2745		UV p = nth_prime(c);
298	2745		UV pa = nth_prime(a);
299	6029	100	for (i = c+1; i <= a; i++) {
300			UV xp;
301	5877		p = next_prime(p);
302	5877		xp = x/p;
303	5877	100	if (xp < p) {
304	2593	50	while (x < pa) {
305	0		a--;
306	0		pa = prev_prime(pa);
307			}
308	2593		return (sum - a + i - 1);
309			}
310	3284		sum -= legendre_phi(xp, i-1);
311			}
312			}
313	706		return sum;
314			}
315
316			#define PHICACHEA 256
317			#define PHICACHEX 65536
318			#define PHICACHE_EXISTS(x,a) \
319			((x < PHICACHEX && a < PHICACHEA) ? cache[a*PHICACHEX+x] : 0)
320	0		static IV _phi(UV x, UV a, int sign, const uint32_t* const primes, const uint32_t lastidx, uint16_t* cache)
321			{
322			IV sum;
323	0	0	if (PHICACHE_EXISTS(x,a)) return sign * cache[a*PHICACHEX+x];
		0
		0
324	0	0	else if (a <= PHIC) return sign * tablephi(x, a);
325	0	0	else if (x < primes[a+1]) sum = sign;
326			else {
327			/* sum = _phi(x, a-1, sign, primes, lastidx, cache) + */
328			/* _phi(x/primes[a], a-1, -sign, primes, lastidx, cache); */
329	0	0	UV a2, iters = (a*a > x) ? segment_prime_count(2,isqrt(x)) : a;
330	0		UV c = (iters > PHIC) ? PHIC : iters;
331	0	0	IV phixc = PHICACHE_EXISTS(x,c) ? cache[a*PHICACHEX+x] : tablephi(x,c);
		0
		0
332	0		sum = sign * (iters - a + phixc);
333	0	0	for (a2 = c+1; a2 <= iters; a2++)
334	0		sum += _phi(x/primes[a2], a2-1, -sign, primes, lastidx, cache);
335			}
336	0	0	if (x < PHICACHEX && a < PHICACHEA && sign*sum <= SHRT_MAX)
		0
		0
337	0		cache[a*PHICACHEX+x] = sign * sum;
338	0		return sum;
339			}
340	3301		UV legendre_phi(UV x, UV a)
341			{
342			/* If 'x' is very small, give a quick answer with any 'a' */
343	3301	100	if (x <= PHIC)
344	2		return tablephi(x, (a > PHIC) ? PHIC : a);
345
346			/* Shortcuts for large values, from R. Andrew Ohana */
347	3299	50	if (a > (x >> 1)) return 1;
348			/* If a > prime_count(2^32), then we need not be concerned with composite
349			* x values with all factors > 2^32, as x is limited to 64-bit. */
350	3299	50	if (a > 203280221) { /* prime_count(2**32) */
351	0		UV pc = LMO_prime_count(x);
352	0	0	return (a > pc) ? 1 : pc - a + 1;
353			}
354			/* If a is large enough, check the ratios */
355	3299	50	if (a > 1000000 && x < a*21) { /* x always less than 2^32 */
		0
356	0	0	if ( LMO_prime_count(x) < a) return 1;
357			}
358
359			/* TODO: R. Andrew Ohana's 2011 SAGE code is faster as the a value
360			* increases. It uses a primelist as in the caching code below, as
361			* well as a binary search prime count on it (like in our lehmer). */
362
363	3299	50	if ( a > 254 || (x > 1000000000 && a > 30) ) {
		50
		0
364			uint16_t* cache;
365			uint32_t* primes;
366			uint32_t lastidx;
367	0	0	UV res, max_cache_a = (a >= PHICACHEA) ? PHICACHEA : a+1;
368	0	0	Newz(0, cache, PHICACHEX * max_cache_a, uint16_t);
369	0		primes = make_primelist(nth_prime(a+1), &lastidx);
370	0		res = (UV) _phi(x, a, 1, primes, lastidx, cache);
371	0		Safefree(primes);
372	0		Safefree(cache);
373	0		return res;
374			}
375
376	3299		return _phi_recurse(x, a);
377			}
378			/****************************************************************************/
379
380
381			typedef struct {
382			sword_t *sieve; /* segment bit mask */
383			uint8 *word_count; /* bit count in each 64-bit word */
384			uint32 *word_count_sum; /* cumulative sum of word_count */
385			UV *totals; /* total bit count for all phis at index */
386			uint32 *prime_index; /* index of prime where phi(n/p/p(k+1))=1 */
387			uint32 *first_bit_index; /* offset relative to start for this prime */
388			uint8 *multiplier; /* mod-30 wheel of each prime */
389			UV start; /* x value of first bit of segment */
390			UV phi_total; /* cumulative bit count before removal */
391			uint32 size; /* segment size in bits */
392			uint32 first_prime; /* index of first prime in segment */
393			uint32 last_prime; /* index of last prime in segment */
394			uint32 last_prime_to_remove; /* index of last prime p, p^2 in segment */
395			} sieve_t;
396
397			/* Size of phi sieve in words. Multiple of 3*5*7*11 words. */
398			#define PHI_SIEVE_WORDS (1155 * PHI_SIEVE_MULT)
399
400			/* Bit counting using cumulative sums. A bit slower than using a running sum,
401			* but a little simpler and can be run in parallel. */
402	3576		static uint32 make_sieve_sums(uint32 sieve_size, const uint8* sieve_word_count, uint32* sieve_word_count_sum) {
403	3576		uint32 i, bc, words = (sieve_size + 2*SWORD_BITS-1) / (2*SWORD_BITS);
404	3576		sieve_word_count_sum[0] = 0;
405	3438190	100	for (i = 0, bc = 0; i+7 < words; i += 8) {
406	3434614		const uint8* cntptr = sieve_word_count + i;
407	3434614		uint32* sumptr = sieve_word_count_sum + i;
408	3434614		sumptr[1] = bc += cntptr[0];
409	3434614		sumptr[2] = bc += cntptr[1];
410	3434614		sumptr[3] = bc += cntptr[2];
411	3434614		sumptr[4] = bc += cntptr[3];
412	3434614		sumptr[5] = bc += cntptr[4];
413	3434614		sumptr[6] = bc += cntptr[5];
414	3434614		sumptr[7] = bc += cntptr[6];
415	3434614		sumptr[8] = bc += cntptr[7];
416			}
417	20538	100	for (; i < words; i++)
418	16962		sieve_word_count_sum[i+1] = sieve_word_count_sum[i] + sieve_word_count[i];
419	3576		return sieve_word_count_sum[words];
420			}
421
422	2925220		static UV _sieve_phi(UV segment_x, const sword_t* sieve, const uint32* sieve_word_count_sum) {
423	2925220		uint32 bits = (segment_x + 1) / 2;
424	2925220		uint32 words = bits / SWORD_BITS;
425	2925220		uint32 sieve_sum = sieve_word_count_sum[words];
426	2925220		sieve_sum += bitcount( sieve[words] & ~(SWORD_ONES << (bits % SWORD_BITS)) );
427	2925220		return sieve_sum;
428			}
429
430			/* Erasing primes from the sieve is done using Christian Bau's
431			* case statement walker. It's not pretty, but it is short, fast,
432			* clever, and does the job. */
433
434			#define sieve_zero(sieve, si, wordcount) \
435			{ uint32 index_ = si/SWORD_BITS; \
436			sword_t mask_ = SWORD_MASKBIT(si); \
437			if (sieve[index_] & mask_) { \
438			sieve[index_] &= ~mask_; \
439			wordcount[index_]--; \
440			} }
441
442			#define sieve_case_zero(casenum, skip, si, p, size, mult, sieve, wordcount) \
443			case casenum: sieve_zero(sieve, si, wordcount); \
444			si += skip * p; \
445			mult = (casenum+1) % 8; \
446			if (si >= size) break;
447
448	3576		static void remove_primes(uint32 index, uint32 last_index, sieve_t* s, const uint32_t* primes)
449			{
450	3576		uint32 size = (s->size + 1) / 2;
451	3576		sword_t *sieve = s->sieve;
452	3576		uint8 *word_count = s->word_count;
453
454	3576		s->phi_total = s->totals[last_index];
455	14899	100	for ( ;index <= last_index; index++) {
456	11323	100	if (index >= s->first_prime && index <= s->last_prime) {
		50
457	3139		uint32 b = (primes[index] - (uint32) s->start - 1) / 2;
458	3139	50	sieve_zero(sieve, b, word_count);
459			}
460	11323	100	if (index <= s->last_prime_to_remove) {
461	5358		uint32 b = s->first_bit_index[index];
462	5358	50	if (b < size) {
463	5358		uint32 p = primes[index];
464	5358		uint32 mult = s->multiplier[index];
465	5358		switch (mult) {
466			reloop: ;
467	665836	100	sieve_case_zero(0, 3, b, p, size, mult, sieve, word_count);
		100
468	665720	100	sieve_case_zero(1, 2, b, p, size, mult, sieve, word_count);
		100
469	665645	100	sieve_case_zero(2, 1, b, p, size, mult, sieve, word_count);
		100
470	665766	100	sieve_case_zero(3, 2, b, p, size, mult, sieve, word_count);
		100
471	665738	100	sieve_case_zero(4, 1, b, p, size, mult, sieve, word_count);
		100
472	665822	100	sieve_case_zero(5, 2, b, p, size, mult, sieve, word_count);
		100
473	665830	100	sieve_case_zero(6, 3, b, p, size, mult, sieve, word_count);
		100
474	665700	100	sieve_case_zero(7, 1, b, p, size, mult, sieve, word_count);
		100
475	665405		goto reloop;
476			}
477	5358		s->multiplier[index] = mult;
478			}
479	5358		s->first_bit_index[index] = b - size;
480			}
481			}
482	3576		s->totals[last_index] += make_sieve_sums(s->size, s->word_count, s->word_count_sum);
483	3576		}
484
485	112		static void word_tile (sword_t* source, uint32 from, uint32 to) {
486	444	100	while (from < to) {
487	332	100	uint32 words = (2*from > to) ? to-from : from;
488	332		memcpy(source+from, source, sizeof(sword_t)*words);
489	332		from += words;
490			}
491	112		}
492
493	28		static void init_segment(sieve_t* s, UV segment_start, uint32 size, uint32 start_prime_index, uint32 sieve_last, const uint32_t* primes)
494			{
495			uint32 i, words;
496	28		sword_t* sieve = s->sieve;
497	28		uint8* word_count = s->word_count;
498
499	28		s->start = segment_start;
500	28		s->size = size;
501
502	28	100	if (segment_start == 0) {
503	20		s->last_prime = 0;
504	20		s->last_prime_to_remove = 0;
505			}
506	28		s->first_prime = s->last_prime + 1;
507	3267	100	while (s->last_prime < sieve_last) {
508	3239		uint32 p = primes[s->last_prime + 1];
509	3239	50	if (p >= segment_start + size)
510	0		break;
511	3239		s->last_prime++;
512			}
513	2274	50	while (s->last_prime_to_remove < sieve_last) {
514	2274		UV p = primes[s->last_prime_to_remove + 1];
515	2274		UV p2 = p*p;
516	2274	100	if (p2 >= segment_start + size)
517	28		break;
518	2246		s->last_prime_to_remove++;
519	2246		s->first_bit_index[s->last_prime_to_remove] = (p2 - segment_start - 1) / 2;
520	2246		s->multiplier[s->last_prime_to_remove] = (uint8) ((p % 30) * 8 / 30);
521			}
522
523	28		memset(sieve, 0xFF, 3*sizeof(sword_t)); /* Set first 3 words to all 1 bits */
524	28	50	if (start_prime_index >= 3) /* Remove multiples of 3. */
525	1820	100	for (i = 3/2; i < 3 * SWORD_BITS; i += 3)
526	1792		SWORD_CLEAR(sieve, i);
527
528	28		word_tile(sieve, 3, 15); /* Copy to first 15 = 3*5 words */
529	28	50	if (start_prime_index >= 3) /* Remove multiples of 5. */
530	5404	100	for (i = 5/2; i < 15 * SWORD_BITS; i += 5)
531	5376		SWORD_CLEAR(sieve, i);
532
533	28		word_tile(sieve, 15, 105); /* Copy to first 105 = 3*5*7 words */
534	28	50	if (start_prime_index >= 4) /* Remove multiples of 7. */
535	26908	100	for (i = 7/2; i < 105 * SWORD_BITS; i += 7)
536	26880		SWORD_CLEAR(sieve, i);
537
538	28		word_tile(sieve, 105, 1155); /* Copy to first 1155 = 3*5*7*11 words */
539	28	50	if (start_prime_index >= 5) /* Remove multiples of 11. */
540	188188	100	for (i = 11/2; i < 1155 * SWORD_BITS; i += 11)
541	188160		SWORD_CLEAR(sieve, i);
542
543	28		size = (size+1) / 2; /* size to odds */
544	28		words = (size + SWORD_BITS-1) / SWORD_BITS; /* sieve size in words */
545	28		word_tile(sieve, 1155, words); /* Copy first 1155 words to rest */
546			/* Zero all unused bits and words */
547	28	100	if (size % SWORD_BITS)
548	20		sieve[words-1] &= ~(SWORD_ONES << (size % SWORD_BITS));
549	28		memset(sieve + words, 0x00, sizeof(sword_t)*(PHI_SIEVE_WORDS+2 - words));
550
551			/* Create counts, remove primes (updating counts and sums). */
552	159479	100	for (i = 0; i < words; i++)
553	159451		word_count[i] = (uint8) bitcount(sieve[i]);
554	28		remove_primes(6, start_prime_index, s, primes);
555	28		}
556
557			/* However we want to handle reduced prime counts */
558			#define simple_pi(n) LMO_prime_count(n)
559			/* Macros to hide all the variables being passed */
560			#define prev_sieve_prime(n) \
561			prev_sieve_prime(n, &prev_sieve[0], &ps_start, ps_max, primes)
562			#define sieve_phi(x) \
563			ss.phi_total + _sieve_phi((x) - ss.start, ss.sieve, ss.word_count_sum)
564
565
566	99		UV LMO_prime_count(UV n)
567			{
568			UV N2, N3, K2, K3, M, sum1, sum2, phi_value;
569			UV sieve_start, sieve_end, least_divisor, step7_max, last_phi_sieve;
570			uint32 j, k, piM, KM, end, prime, prime_index;
571			uint32 ps_start, ps_max, smallest_divisor, nprimes;
572			uint8 prev_sieve[PREV_SIEVE_SIZE];
573			uint32_t *primes;
574			uint16 *factor_table;
575			sieve_t ss;
576
577	99		const uint32 c = PHIC; /* We can use our fast function for this */
578
579			/* For "small" n, use our table+segment sieve. */
580	99	100	if (n < SIEVE_LIMIT || n < 10000) return segment_prime_count(2, n);
		50
581			/* n should now be reasonably sized (not tiny). */
582
583	20		N2 = isqrt(n); /* floor(N^1/2) */
584	20		N3 = icbrt(n); /* floor(N^1/3) */
585	20		K2 = simple_pi(N2); /* Pi(N2) */
586	20		K3 = simple_pi(N3); /* Pi(N3) */
587
588			/* M is N^1/3 times a tunable performance factor. */
589	20	100	M = (N3 > 500) ? M_FACTOR(N3) : N3+N3/2;
590	20	50	if (M >= N2) M = N2 - 1; /* M must be smaller than N^1/2 */
591	20	50	if (M < N3) M = N3; /* M must be at least N^1/3 */
592
593			/* Create the array of small primes, and least-prime-factor/moebius table */
594	20		primes = make_primelist( M + 500, &nprimes );
595	20		factor_table = ft_create( M );
596
597			/* Create other arrays */
598	20		New(0, ss.sieve, PHI_SIEVE_WORDS + 2, sword_t);
599	20		New(0, ss.word_count, PHI_SIEVE_WORDS + 2, uint8);
600	20		New(0, ss.word_count_sum, PHI_SIEVE_WORDS + 2, uint32);
601	20	50	New(0, ss.totals, K3+2, UV);
602	20	50	New(0, ss.prime_index, K3+2, uint32);
603	20	50	New(0, ss.first_bit_index, K3+2, uint32);
604	20		New(0, ss.multiplier, K3+2, uint8);
605
606	20	50	if (ss.sieve == 0 || ss.word_count == 0 || ss.word_count_sum == 0 ||
		50
		50
		50
607	20	50	ss.totals == 0 || ss.prime_index == 0 || ss.first_bit_index == 0 ||
		50
		50
608	20		ss.multiplier == 0)
609	0		croak("Allocation failure in LMO Pi\n");
610
611			/* Variables for fast prev_prime using small segment sieves (up to M^2) */
612	20		ps_max = prev_sieve_max( primes[nprimes] );
613	20		ps_start = U32_CONST(0xFFFFFFFF);
614
615			/* Look for the smallest divisor: the smallest number > M which is
616			* square-free and not divisible by any prime covered by our Mapes
617			* small-phi case. The largest value we will look up in the phi
618			* sieve is n/smallest_divisor. */
619	31	100	for (j = (M+1)/2; factor_table[j] <= primes[c]; j++) /* */;
620	20		smallest_divisor = 2*j+1;
621			/* largest_divisor = (N2 > (UV)M * (UV)M) ? N2 : (UV)M * (UV)M; */
622
623	20		M = smallest_divisor - 1; /* Increase M if possible */
624	20		piM = simple_pi(M);
625	20	50	if (piM < c) croak("N too small for LMO\n");
626	20		last_phi_sieve = n / smallest_divisor + 1;
627
628			/* KM = smallest k, c <= k <= piM, s.t. primes[k+1] * primes[k+2] > M. */
629	140	100	for (KM = c; primes[KM+1] * primes[KM+2] <= M && KM < piM; KM++) /* */;
		50
630	20	50	if (K3 < KM) K3 = KM; /* Ensure K3 >= KM */
631
632			/* Start calculating Pi(n). Steps 4-10 from Bau. */
633	20		sum1 = (K2 - 1) + (UV) (piM - K3 - 1) * (UV) (piM - K3) / 2;
634	20		sum2 = 0;
635	20		end = (M+1)/2;
636
637			/* Start at index K2, which is the prime preceding N^1/2 */
638	20	50	prime = prev_sieve_prime( (N2 >= ps_start) ? ps_start : N2+1 );
639	20		prime_index = K2 - 1;
640	20		step7_max = K3;
641
642			/* Step 4: For 1 <= x <= M where x is square-free and has no
643			* factor <= primes[c], sum phi(n / x, c). */
644	29836	100	for (j = 0; j < end; j++) {
645	29816		uint32 lpf = factor_table[j];
646	29816	100	if (lpf > primes[c]) {
647	11215		phi_value = tablephi(n / (2*j+1), c); /* x = 2j+1 */
648	11215	100	if (lpf & 0x01) sum2 += phi_value; else sum1 += phi_value;
649			}
650			}
651
652			/* Step 5: For 1+M/primes[c+1] <= x <= M, x square-free and
653			* has no factor <= primes[c+1], sum phi(n / (x*primes[c+1]), c). */
654	20	50	if (c < piM) {
655	20		UV pc_1 = primes[c+1];
656	28085	100	for (j = (1+M/pc_1)/2; j < end; j++) {
657	28065		uint32 lpf = factor_table[j];
658	28065	100	if (lpf > pc_1) {
659	9971		phi_value = tablephi(n / (pc_1 * (2*j+1)), c); /* x = 2j+1 */
660	9971	100	if (lpf & 0x01) sum1 += phi_value; else sum2 += phi_value;
661			}
662			}
663			}
664
665	3279	100	for (k = 0; k <= K3; k++) ss.totals[k] = 0;
666	260	100	for (k = 0; k < KM; k++) ss.prime_index[k] = end;
667
668			/* Instead of dividing by all primes up to pi(M), once a divisor is large
669			* enough then phi(n / (p*primes[k+1]), k) = 1. */
670			{
671	20		uint32 last_prime = piM;
672	3019	100	for (k = KM; k < K3; k++) {
673	2999		UV pk = primes[k+1];
674	7304	100	while (last_prime > k+1 && pk * pk * primes[last_prime] > n)
		100
675	4305		last_prime--;
676	2999		ss.prime_index[k] = last_prime;
677	2999		sum1 += piM - last_prime;
678			}
679			}
680
681	48	100	for (sieve_start = 0; sieve_start < last_phi_sieve; sieve_start = sieve_end) {
682			/* This phi segment goes from sieve_start to sieve_end. */
683	28		sieve_end = ((sieve_start + 2*SWORD_BITS*PHI_SIEVE_WORDS) < last_phi_sieve)
684	28		? sieve_start + 2*SWORD_BITS*PHI_SIEVE_WORDS : last_phi_sieve;
685			/* Only divisors s.t. sieve_start <= N / divisor < sieve_end considered. */
686	28		least_divisor = n / sieve_end;
687			/* Initialize the sieve segment and all associated variables. */
688	28		init_segment(&ss, sieve_start, sieve_end - sieve_start, c, K3, primes);
689
690			/* Step 6: For c < k < KM: For 1+M/primes[k+1] <= x <= M, x square-free
691			* and has no factor <= primes[k+1], sum phi(n / (x*primes[k+1]), k). */
692	408	100	for (k = c+1; k < KM; k++) {
693	380		UV pk = primes[k+1];
694	380	50	uint32 start = (least_divisor >= pk * U32_CONST(0xFFFFFFFE))
695			? U32_CONST(0xFFFFFFFF)
696	380		: (least_divisor / pk + 1)/2;
697	380		remove_primes(k, k, &ss, primes);
698	685167	100	for (j = ss.prime_index[k] - 1; j >= start; j--) {
699	684787		uint32 lpf = factor_table[j];
700	684787	100	if (lpf > pk) {
701	171458		phi_value = sieve_phi(n / (pk * (2*j+1)));
702	171458	100	if (lpf & 0x01) sum1 += phi_value; else sum2 += phi_value;
703			}
704			}
705	380	100	if (start < ss.prime_index[k])
706	363		ss.prime_index[k] = start;
707			}
708			/* Step 7: For KM <= K < Pi_M: For primes[k+2] <= x <= M, sum
709			* phi(n / (x*primes[k+1]), k). The inner for loop can be parallelized. */
710	3168	100	for (; k < step7_max; k++) {
711	3140		remove_primes(k, k, &ss, primes);
712	3140		j = ss.prime_index[k];
713	3140	100	if (j >= k+2) {
714	3126		UV pk = primes[k+1];
715	3126		UV endj = j;
716	334635	50	while (endj > 7 && endj-7 >= k+2 && pk*primes[endj-7] > least_divisor) endj -= 8;
		100
		100
717	14097	100	while ( endj >= k+2 && pk*primes[endj ] > least_divisor) endj--;
		100
718			/* Now that we know how far to go, do the summations */
719	2666169	100	for ( ; j > endj; j--)
720	2663043		sum1 += sieve_phi(n / (pk*primes[j]));
721	3126		ss.prime_index[k] = endj;
722			}
723			}
724			/* Restrict work for the above loop when we know it will be empty. */
725	3027	100	while (step7_max > KM && ss.prime_index[step7_max-1] < (step7_max-1)+2)
		100
726	2999		step7_max--;
727
728			/* Step 8: For KM <= K < K3, sum -phi(n / primes[k+1], k) */
729	28		remove_primes(k, K3, &ss, primes);
730			/* Step 9: For K3 <= k < K2, sum -phi(n / primes[k+1], k) + (k-K3). */
731	90747	100	while (prime > least_divisor && prime_index >= piM) {
		50
732	90719		sum1 += prime_index - K3;
733	90719		sum2 += sieve_phi(n / prime);
734	90719		prime_index--;
735	90719		prime = prev_sieve_prime(prime);
736			}
737			}
738
739	20		Safefree(ss.sieve);
740	20		Safefree(ss.word_count);
741	20		Safefree(ss.word_count_sum);
742	20		Safefree(ss.totals);
743	20		Safefree(ss.prime_index);
744	20		Safefree(ss.first_bit_index);
745	20		Safefree(ss.multiplier);
746	20		Safefree(factor_table);
747	20		Safefree(primes);
748
749	99		return sum1 - sum2;
750			}