File Coverage

lmo.c

Criterion	Covered	Total	%
statement	268	270	99.2
branch	170	204	83.3
condition			n/a
subroutine			n/a
pod			n/a
total	438	474	92.4

line	stmt	bran	code
1			#include
2			#include
3			#include
4			#include
5
6			/*****************************************************************************
7			*
8			* Prime counts using the extended Lagarias-Miller-Odlyzko combinatorial method.
9			*
10			* Copyright (c) 2013-2014 Dana Jacobsen (dana@acm.org)
11			* This is free software; you can redistribute it and/or modify it under
12			* the same terms as the Perl 5 programming language system itself.
13			*
14			* This file is part of the Math::Prime::Util Perl module, but it should
15			* not be difficult to turn it into standalone code.
16			*
17			* The structure of the main routine is based on Christian Bau's earlier work.
18			*
19			* References:
20			* - Christian Bau's paper and example implementation, 2003, Christian Bau
21			* This was of immense help. References to "step #" refer to this preprint.
22			* - "Computing Pi(x): the combinatorial method", 2006, Tomás Oliveira e Silva
23			* - "Computing Pi(x): The Meissel, Lehmer, Lagarias, Miller, Odlyzko Method"
24			* 1996, Deléglise and Rivat.
25			*
26			* Comparisons to the other prime counting implementations in this package:
27			*
28			* Sieve: Segmented, single threaded, thread-safe. Small table enhanced,
29			* fastest for n < 60M. Bad growth rate (like all sieves will have).
30			* Legendre:Simple. Recursive caching phi.
31			* Meissel: Simple. Non-recursive phi, lots of memory.
32			* Lehmer: Non-recursive phi, tries to restrict memory.
33			* LMOS: Simple. Non-recursive phi, less memory than Lehmer above.
34			* LMO: Sieve phi. Much faster and less memory than the others.
35			*
36			* Timing below is single core Haswell 4770K using Math::Prime::Util.
37			*
38			* \| n \| Legendre \| Meissel \| Lehmer \| LMOS \| LMO \|
39			* +-------+----------+----------+----------+----------+-----------+
40			* \| 10^19 \| \| \| \| \| 2493.4 \|
41			* \| 10^18 \| \| \| \| \| 498.16 \|
42			* \| 10^17 \|10459.3 \| 4348.3 \| 6109.7 \| 3478.0 \| 103.03 \|
43			* \| 10^16 \| 1354.6 \| 510.8 \| 758.6 \| 458.4 \| 21.64 \|
44			* \| 10^15 \| 171.2 \| 97.1 \| 106.4 \| 68.11 \| 4.707 \|
45			* \| 10^14 \| 23.56 \| 18.59 \| 16.51 \| 10.44 \| 1.032 \|
46			* \| 10^13 \| 3.783 \| 3.552 \| 2.803 \| 1.845 \| 0.237 \|
47			* \| 10^12 \| 0.755 \| 0.697 \| 0.505 \| 0.378 \| 54.9ms \|
48			* \| 10^11 \| 0.165 \| 0.144 \| 93.7ms\| 81.6ms\| 13.80ms\|
49			* \| 10^10 \| 35.9ms\| 29.9ms\| 19.9ms\| 17.8ms\| 3.64ms\|
50			*
51			* Run with high memory limits: Meissel uses 1GB for 10^16, ~3GB for 10^17.
52			* Lehmer is limited at high n values by sieving speed. It is much faster
53			* using parallel primesieve, though cannot come close to LMO.
54			*/
55
56			/* Adjust to get best performance. Alpha from TOS paper. */
57			#define M_FACTOR(n) (UV) ((double)n * (log(n)/log(5.2)) * (log(log(n))-1.4))
58			/* Size of segment used for previous primes, must be >= 21 */
59			#define PREV_SIEVE_SIZE 512
60			/* Phi sieve multiplier, adjust for best performance and memory use. */
61			#define PHI_SIEVE_MULT 13
62
63			#define FUNC_isqrt 1
64			#include "util.h"
65			#include "constants.h"
66			#include "prime_counts.h"
67			#include "cache.h"
68			#include "sieve.h"
69			#include "legendre_phi.h"
70			#include "lmo.h"
71
72			#ifdef _MSC_VER
73			typedef unsigned __int8 uint8;
74			typedef unsigned __int16 uint16;
75			typedef unsigned __int32 uint32;
76			#else
77			typedef unsigned char uint8;
78			typedef unsigned short uint16;
79			typedef uint32_t uint32;
80			#endif
81
82			/* UV is either uint32 or uint64 depending on Perl. We use this native size
83			* for the basic unit of the phi sieve. It can be easily overridden here. */
84			typedef UV sword_t;
85			#define SWORD_BITS BITS_PER_WORD
86			#define SWORD_ONES UV_MAX
87			#define SWORD_MASKBIT(bits) (UVCONST(1) << ((bits) % SWORD_BITS))
88			#define SWORD_CLEAR(s,bits) s[bits/SWORD_BITS] &= ~SWORD_MASKBIT(bits)
89
90			/* GCC 3.4 - 4.1 has broken 64-bit popcount.
91			* GCC 4.2+ can generate awful code when it doesn't have asm (GCC bug 36041).
92			* When the asm is present (e.g. compile with -march=native on a platform that
93			* has them, like Nahelem+), then it is almost as fast as the direct asm. */
94			#if SWORD_BITS == 64
95			#if defined(__POPCNT__) && defined(__GNUC__) && (__GNUC__> 4 \|\| (__GNUC__== 4 && __GNUC_MINOR__> 1))
96			#define bitcount(b) __builtin_popcountll(b)
97			#else
98	21954541		static sword_t bitcount(sword_t b) {
99	21954541		b -= (b >> 1) & 0x5555555555555555;
100	21954541		b = (b & 0x3333333333333333) + ((b >> 2) & 0x3333333333333333);
101	21954541		b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0f;
102	21954541		return (b * 0x0101010101010101) >> 56;
103			}
104			#endif
105			#else
106			/* An 8-bit table version is usually a little faster, but this is simpler. */
107			static sword_t bitcount(sword_t b) {
108			b -= (b >> 1) & 0x55555555;
109			b = (b & 0x33333333) + ((b >> 2) & 0x33333333);
110			b = (b + (b >> 4)) & 0x0f0f0f0f;
111			return (b * 0x01010101) >> 24;
112			}
113			#endif
114
115
116			/* Create array of small primes: 0,2,3,5,...,prev_prime(n+1) */
117	788		static uint32_t* make_primelist(uint32 n, uint32* number_of_primes)
118			{
119			uint32_t* plist;
120			#if 1
121	788		*number_of_primes = range_prime_sieve_32(&plist, n, 1);
122			#else
123			uint32 i = 0;
124			uint32 max_index = max_nprimes(n);
125			*number_of_primes = 0;
126			New(0, plist, max_index+1, uint32_t);
127			plist[0] = 0;
128			/* We could do a simple SoE here. This is not time critical. */
129			START_DO_FOR_EACH_PRIME(2, n) {
130			plist[++i] = p;
131			} END_DO_FOR_EACH_PRIME;
132			*number_of_primes = i;
133			#endif
134	788		return plist;
135			}
136			#if 0 /* primesieve 5.0 example */
137			#include
138			static uint32_t* make_primelist(uint32 n, uint32* number_of_primes) {
139			uint32_t plist;
140			uint32_t* psprimes = generate_primes(2, n, number_of_primes, UINT_PRIMES);
141			New(0, plist, *number_of_primes + 1, uint32_t);
142			plist[0] = 0;
143			memcpy(plist+1, psprimes, number_of_primes sizeof(uint32_t));
144			primesieve_free(psprimes);
145			return plist;
146			}
147			#endif
148
149			/* Given a max prime in small prime list, return max prev prime input */
150	788		static uint32 prev_sieve_max(UV maxprime) {
151	788		UV limit = maxprimemaxprime - (maxprimemaxprime % (16*PREV_SIEVE_SIZE)) - 1;
152	788		return (limit > U32_CONST(4294967295)) ? U32_CONST(4294967295) : limit;
153			}
154
155			/* Simple SoE filling a segment */
156	2387		static void _prev_sieve_fill(UV start, uint8* sieve, const uint32_t* primes) {
157			UV i, j, p;
158	2387		memset( sieve, 0xFF, PREV_SIEVE_SIZE );
159	88090	100	for (i = 2, p = 3; pp < start + (16PREV_SIEVE_SIZE); p = primes[++i])
160	85703	100	for (j = (start == 0) ? p*p/2 : (p-1) - ((start+(p-1))/2) % p;
161	13158954	100	j < (8*PREV_SIEVE_SIZE); j += p)
162	13073251		sieve[j/8] &= ~(1U << (j%8));
163	2387		}
164
165			/* Calculate previous prime using small segment */
166	1552313		static uint32 prev_sieve_prime(uint32 n, uint8* sieve, uint32* segment_start, uint32 sieve_max, const uint32_t* primes)
167			{
168			uint32 sieve_start, bit_offset;
169	1552313	50	if (n <= 3) return (n == 3) ? 2 : 0;
		0
170	1552313	50	if (n > sieve_max) croak("ps overflow\n");
171
172			/* If n > 3 && n <= sieve_max, then there is an odd prime we can find. */
173	1552313		n -= 2;
174	1552313		bit_offset = n % (16*PREV_SIEVE_SIZE);
175	1552313		sieve_start = n - bit_offset;
176	1552313		bit_offset >>= 1;
177
178			while (1) {
179	1553796	100	if (sieve_start != segment_start) { / Fill sieve if necessary */
180	2387		_prev_sieve_fill(sieve_start, sieve, primes);
181	2387		*segment_start = sieve_start;
182			}
183			do { /* Look for a set bit in sieve */
184	7329516	100	if (sieve[bit_offset / 8] & (1u << (bit_offset % 8)))
185	1552313		return sieve_start + 2*bit_offset + 1;
186	5777203	100	} while (bit_offset-- > 0);
187	1483		sieve_start -= (16 * PREV_SIEVE_SIZE);
188	1483		bit_offset = ((16 * PREV_SIEVE_SIZE) - 1) / 2;
189			}
190			}
191
192			/* Create factor table.
193			* In lehmer.c we create mu and lpf arrays. Here we use Christian Bau's
194			* method, which is slightly more memory efficient and also a bit faster than
195			* the code there (which does not use our fast ranged moebius). It makes
196			* very little difference -- mainly using this table is more convenient.
197			*
198			* In a uint16 we have stored:
199			* 0 moebius(n) = 0
200			* even moebius(n) = 1
201			* odd moebius(n) = -1 (last bit indicates even/odd number of factors)
202			* v smallest odd prime factor of n is v&1
203			* 65535 large prime
204			*/
205	788		static uint16* ft_create(uint32 max)
206			{
207			uint16* factor_table;
208			uint32 i;
209	788		uint32 tableLimit = max + 338 + 1; /* At least one more prime */
210	788		uint32 tableSize = tableLimit/2;
211	788		uint32 max_prime = (tableLimit - 1) / 3 + 1;
212
213	788		New(0, factor_table, tableSize, uint16);
214
215			/* Set all values to 65535 (a large prime), set 0 to 65534. */
216	788		factor_table[0] = 65534;
217	669325	100	for (i = 1; i < tableSize; ++i)
218	668537		factor_table[i] = 65535;
219
220			/* Process each odd. */
221	669325	100	for (i = 1; i < tableSize; ++i) {
222			uint32 factor, max_factor;
223	668537		uint32 p = i*2+1;
224	668537	100	if (factor_table[i] != 65535) /* Already marked. */
225	466476		continue;
226	202061	50	if (p < 65535) /* p is a small prime, so set the number. */
227	202061		factor_table[i] = (uint16) p;
228	202061	100	if (p >= max_prime) /* No multiples will be in the table */
229	122743		continue;
230
231	79318		max_factor = (tableLimit - 1) / p + 1;
232			/* Look for odd multiples of the prime p. */
233	1105097	100	for (factor = 3; factor < max_factor; factor += 2) {
234	1025779		uint32 index = (p*factor)/2;
235	1025779	100	if (factor_table[index] == 65535) /* p is smallest factor */
236	466476		factor_table[index] = p; /* TODO p > 65535? */
237	559303	100	else if (factor_table[index] > 0) /* Change number of factors */
238	442181		factor_table[index] ^= 0x01;
239			}
240
241			/* Change all odd multiples of pp to 0 to indicate non-square-free. /
242	212112	100	for (factor = p; factor < max_factor; factor += 2*p)
243	132794		factor_table[ (p*factor) / 2] = 0;
244			}
245	788		return factor_table;
246			}
247
248
249			/****************************************************************************/
250
251
252			typedef struct {
253			sword_t sieve; / segment bit mask */
254			uint8 word_count; / bit count in each 64-bit word */
255			uint32 word_count_sum; / cumulative sum of word_count */
256			UV totals; / total bit count for all phis at index */
257			uint32 prime_index; / index of prime where phi(n/p/p(k+1))=1 */
258			uint32 first_bit_index; / offset relative to start for this prime */
259			uint8 multiplier; / mod-30 wheel of each prime */
260			UV start; /* x value of first bit of segment */
261			UV phi_total; /* cumulative bit count before removal */
262			uint32 size; /* segment size in bits */
263			uint32 first_prime; /* index of first prime in segment */
264			uint32 last_prime; /* index of last prime in segment */
265			uint32 last_prime_to_remove; /* index of last prime p, p^2 in segment */
266			} sieve_t;
267
268			/* Size of phi sieve in words. Multiple of 35711 words. /
269			#define PHI_SIEVE_WORDS (1155 * PHI_SIEVE_MULT)
270
271			/* Bit counting using cumulative sums. A bit slower than using a running sum,
272			* but a little simpler and can be run in parallel. */
273	90613		static uint32 make_sieve_sums(uint32 sieve_size, const uint8* sieve_word_count, uint32* sieve_word_count_sum) {
274	90613		uint32 i, bc, words = (sieve_size + 2SWORD_BITS-1) / (2SWORD_BITS);
275	90613		sieve_word_count_sum[0] = 0;
276	37464317	100	for (i = 0, bc = 0; i+7 < words; i += 8) {
277	37373704		const uint8* cntptr = sieve_word_count + i;
278	37373704		uint32* sumptr = sieve_word_count_sum + i;
279	37373704		sumptr[1] = bc += cntptr[0];
280	37373704		sumptr[2] = bc += cntptr[1];
281	37373704		sumptr[3] = bc += cntptr[2];
282	37373704		sumptr[4] = bc += cntptr[3];
283	37373704		sumptr[5] = bc += cntptr[4];
284	37373704		sumptr[6] = bc += cntptr[5];
285	37373704		sumptr[7] = bc += cntptr[6];
286	37373704		sumptr[8] = bc += cntptr[7];
287			}
288	433847	100	for (; i < words; i++)
289	343234		sieve_word_count_sum[i+1] = sieve_word_count_sum[i] + sieve_word_count[i];
290	90613		return sieve_word_count_sum[words];
291			}
292
293	20139102		static UV _sieve_phi(UV segment_x, const sword_t* sieve, const uint32* sieve_word_count_sum) {
294	20139102		uint32 bits = (segment_x + 1) / 2;
295	20139102		uint32 words = bits / SWORD_BITS;
296	20139102		uint32 sieve_sum = sieve_word_count_sum[words];
297	20139102		sieve_sum += bitcount( sieve[words] & ~(SWORD_ONES << (bits % SWORD_BITS)) );
298	20139102		return sieve_sum;
299			}
300
301			/* Erasing primes from the sieve is done using Christian Bau's
302			* case statement walker. It's not pretty, but it is short, fast,
303			* clever, and does the job.
304			* Kim W. gave a nice branchless speedup for sieve_zero */
305
306			#define sieve_zero(sieve, si, wordcount) \
307			{ uint32 index_ = si/SWORD_BITS; \
308			sword_t mask_ = SWORD_MASKBIT(si); \
309			sword_t is_bit = (sieve[index_] >> (si % SWORD_BITS)) & 1; \
310			sieve[index_] &= ~mask_; \
311			wordcount[index_] -= is_bit; \
312			}
313
314			#define sieve_case_zero(casenum, skip, si, p, size, mult, sieve, wordcount) \
315			case casenum: sieve_zero(sieve, si, wordcount); \
316			si += skip * p; \
317			mult = (casenum+1) % 8; \
318			if (si >= size) break;
319
320	90613		static void remove_primes(uint32 index, uint32 last_index, sieve_t* s, const uint32_t* primes)
321			{
322	90613		uint32 size = (s->size + 1) / 2;
323	90613		sword_t *sieve = s->sieve;
324	90613		uint8 *word_count = s->word_count;
325
326	90613		s->phi_total = s->totals[last_index];
327	193205	100	for ( ;index <= last_index; index++) {
328	102592	100	if (index >= s->first_prime && index <= s->last_prime) {
		50
329	89924		uint32 b = (primes[index] - (uint32) s->start - 1) / 2;
330	89924		sieve_zero(sieve, b, word_count);
331			}
332	102592	100	if (index <= s->last_prime_to_remove) {
333	73357		uint32 b = s->first_bit_index[index];
334	73357	50	if (b < size) {
335	73357		uint32 p = primes[index];
336	73357		uint32 mult = s->multiplier[index];
337	73357		switch (mult) {
338	6187821		reloop: ;
339	6195828	100	sieve_case_zero(0, 3, b, p, size, mult, sieve, word_count);
340	6191135	100	sieve_case_zero(1, 2, b, p, size, mult, sieve, word_count);
341	6190320	100	sieve_case_zero(2, 1, b, p, size, mult, sieve, word_count);
342	6194952	100	sieve_case_zero(3, 2, b, p, size, mult, sieve, word_count);
343	6194870	100	sieve_case_zero(4, 1, b, p, size, mult, sieve, word_count);
344	6197830	100	sieve_case_zero(5, 2, b, p, size, mult, sieve, word_count);
345	6197636	100	sieve_case_zero(6, 3, b, p, size, mult, sieve, word_count);
346	6192580	100	sieve_case_zero(7, 1, b, p, size, mult, sieve, word_count);
347	6187821		goto reloop;
348			}
349	73357		s->multiplier[index] = (uint8) mult;
350			}
351	73357		s->first_bit_index[index] = b - size;
352			}
353			}
354	90613		s->totals[last_index] += make_sieve_sums(s->size, s->word_count, s->word_count_sum);
355	90613		}
356
357	3224		static void word_tile (sword_t* source, uint32 from, uint32 to) {
358	12107	100	while (from < to) {
359	8883	100	uint32 words = (2*from > to) ? to-from : from;
360	8883		memcpy(source+from, source, sizeof(sword_t)*words);
361	8883		from += words;
362			}
363	3224		}
364
365	806		static void init_segment(sieve_t* s, UV segment_start, uint32 size, uint32 start_prime_index, uint32 sieve_last, const uint32_t* primes)
366			{
367			uint32 i, words;
368	806		sword_t* sieve = s->sieve;
369	806		uint8* word_count = s->word_count;
370
371	806		s->start = segment_start;
372	806		s->size = size;
373
374	806	100	if (segment_start == 0) {
375	788		s->last_prime = 0;
376	788		s->last_prime_to_remove = 0;
377			}
378	806		s->first_prime = s->last_prime + 1;
379	94670	100	while (s->last_prime < sieve_last) {
380	93864		uint32 p = primes[s->last_prime + 1];
381	93864	50	if (p >= segment_start + size)
382	0		break;
383	93864		s->last_prime++;
384			}
385	72655	50	while (s->last_prime_to_remove < sieve_last) {
386	72655		UV p = primes[s->last_prime_to_remove + 1];
387	72655		UV p2 = p*p;
388	72655	100	if (p2 >= segment_start + size)
389	806		break;
390	71849		s->last_prime_to_remove++;
391	71849		s->first_bit_index[s->last_prime_to_remove] = (p2 - segment_start - 1) / 2;
392	71849		s->multiplier[s->last_prime_to_remove] = (uint8) ((p % 30) * 8 / 30);
393			}
394
395	806		memset(sieve, 0xFF, 3sizeof(sword_t)); / Set first 3 words to all 1 bits */
396	806	50	if (start_prime_index >= 3) /* Remove multiples of 3. */
397	52390	100	for (i = 3/2; i < 3 * SWORD_BITS; i += 3)
398	51584		SWORD_CLEAR(sieve, i);
399
400	806		word_tile(sieve, 3, 15); /* Copy to first 15 = 35 words /
401	806	50	if (start_prime_index >= 3) /* Remove multiples of 5. */
402	155558	100	for (i = 5/2; i < 15 * SWORD_BITS; i += 5)
403	154752		SWORD_CLEAR(sieve, i);
404
405	806		word_tile(sieve, 15, 105); /* Copy to first 105 = 357 words */
406	806	50	if (start_prime_index >= 4) /* Remove multiples of 7. */
407	774566	100	for (i = 7/2; i < 105 * SWORD_BITS; i += 7)
408	773760		SWORD_CLEAR(sieve, i);
409
410	806		word_tile(sieve, 105, 1155); /* Copy to first 1155 = 35711 words /
411	806	50	if (start_prime_index >= 5) /* Remove multiples of 11. */
412	5417126	100	for (i = 11/2; i < 1155 * SWORD_BITS; i += 11)
413	5416320		SWORD_CLEAR(sieve, i);
414
415	806		size = (size+1) / 2; /* size to odds */
416	806		words = (size + SWORD_BITS-1) / SWORD_BITS; /* sieve size in words */
417	806		word_tile(sieve, 1155, words); /* Copy first 1155 words to rest */
418			/* Zero all unused bits and words */
419	806	100	if (size % SWORD_BITS)
420	774		sieve[words-1] &= ~(SWORD_ONES << (size % SWORD_BITS));
421	806		memset(sieve + words, 0x00, sizeof(sword_t)*(PHI_SIEVE_WORDS+2 - words));
422
423			/* Create counts, remove primes (updating counts and sums). */
424	1816245	100	for (i = 0; i < words; i++)
425	1815439		word_count[i] = (uint8) bitcount(sieve[i]);
426	806		remove_primes(6, start_prime_index, s, primes);
427	806		}
428
429			/* However we want to handle reduced prime counts */
430			#define simple_pi(n) LMO_prime_count(n)
431			/* Macros to hide all the variables being passed */
432			#define prev_sieve_prime(n) \
433			prev_sieve_prime(n, &prev_sieve[0], &ps_start, ps_max, primes)
434			#define sieve_phi(x) \
435			ss.phi_total + _sieve_phi((x) - ss.start, ss.sieve, ss.word_count_sum)
436
437
438	3341		UV LMO_prime_count(UV n)
439			{
440			UV N2, N3, K2, K3, M, sum1, sum2, phi_value;
441			UV sieve_start, sieve_end, least_divisor, step7_max, last_phi_sieve;
442			uint32 j, k, piM, KM, end, prime, prime_index;
443			uint32 ps_start, ps_max, smallest_divisor, nprimes;
444			uint8 prev_sieve[PREV_SIEVE_SIZE];
445			uint32_t *primes;
446			uint16 *factor_table;
447			sieve_t ss;
448
449	3341		const uint32 c = tiny_phi_max_a(); /* We can use our fast function for this */
450
451			/* For "small" n, use our table+segment sieve. */
452	3341	100	if (n < _MPU_LMO_CROSSOVER \|\| n < 10000) return segment_prime_count(2, n);
		50
453			/* n should now be reasonably sized (not tiny). */
454
455			#ifdef USE_PRIMECOUNT_FOR_LARGE_LMO
456			if (n > 500000000000UL) { /* Crossover on 2020 Macbook M1 (with parallel!) */
457			FILE *f;
458			char cmd[100];
459			snprintf(cmd, 100, "primecount %lu", n);
460			f = popen(cmd, "r");
461			fscanf(f, "%lu", &sum1);
462			pclose(f);
463			return sum1;
464			}
465			#endif
466
467	788		N2 = isqrt(n); /* floor(N^1/2) */
468	788		N3 = icbrt(n); /* floor(N^1/3) */
469	788		K2 = simple_pi(N2); /* Pi(N2) */
470	788		K3 = simple_pi(N3); /* Pi(N3) */
471
472			/* M is N^1/3 times a tunable performance factor. */
473	788	100	M = (N3 > 500) ? M_FACTOR(N3) : N3+N3/2;
474	788	50	if (M >= N2) M = N2 - 1; /* M must be smaller than N^1/2 */
475	788	50	if (M < N3) M = N3; /* M must be at least N^1/3 */
476
477			/* Create the array of small primes, and least-prime-factor/moebius table */
478	788		primes = make_primelist( M + 500, &nprimes );
479	788		factor_table = ft_create( M );
480
481			/* Create other arrays */
482	788		New(0, ss.sieve, PHI_SIEVE_WORDS + 2, sword_t);
483	788		New(0, ss.word_count, PHI_SIEVE_WORDS + 2, uint8);
484	788		New(0, ss.word_count_sum, PHI_SIEVE_WORDS + 2, uint32);
485	788	50	New(0, ss.totals, K3+2, UV);
486	788	50	New(0, ss.prime_index, K3+2, uint32);
487	788	50	New(0, ss.first_bit_index, K3+2, uint32);
488	788		New(0, ss.multiplier, K3+2, uint8);
489
490	788	50	if (ss.sieve == 0 \|\| ss.word_count == 0 \|\| ss.word_count_sum == 0 \|\|
		50
		50
491	788	50	ss.totals == 0 \|\| ss.prime_index == 0 \|\| ss.first_bit_index == 0 \|\|
		50
		50
492	788	50	ss.multiplier == 0)
493	0		croak("Allocation failure in LMO Pi\n");
494
495			/* Variables for fast prev_prime using small segment sieves (up to M^2) */
496	788		ps_max = prev_sieve_max( primes[nprimes] );
497	788		ps_start = U32_CONST(0xFFFFFFFF);
498
499			/* Look for the smallest divisor: the smallest number > M which is
500			* square-free and not divisible by any prime covered by our Mapes
501			* small-phi case. The largest value we will look up in the phi
502			* sieve is n/smallest_divisor. */
503	1672	100	for (j = (M+1)/2; factor_table[j] <= primes[c]; j++) /* */;
504	788		smallest_divisor = 2*j+1;
505			/* largest_divisor = (N2 > (UV)M * (UV)M) ? N2 : (UV)M * (UV)M; */
506
507	788		M = smallest_divisor - 1; /* Increase M if possible */
508	788		piM = simple_pi(M);
509	788	50	if (piM < c) croak("N too small for LMO\n");
510	788		last_phi_sieve = n / smallest_divisor + 1;
511
512			/* KM = smallest k, c <= k <= piM, s.t. primes[k+1] * primes[k+2] > M. */
513	4325	100	for (KM = c; primes[KM+1] * primes[KM+2] <= M && KM < piM; KM++) /* */;
		50
514	788	50	if (K3 < KM) K3 = KM; /* Ensure K3 >= KM */
515
516			/* Start calculating Pi(n). Steps 4-10 from Bau. */
517	788		sum1 = (K2 - 1) + (UV) (piM - K3 - 1) * (UV) (piM - K3) / 2;
518	788		sum2 = 0;
519	788		end = (M+1)/2;
520
521			/* Start at index K2, which is the prime preceeding N^1/2 */
522	788	50	prime = prev_sieve_prime( (N2 >= ps_start) ? ps_start : N2+1 );
523	788		prime_index = K2 - 1;
524	788		step7_max = K3;
525
526			/* Step 4: For 1 <= x <= M where x is square-free and has no
527			* factor <= primes[c], sum phi(n / x, c). */
528	537825	100	for (j = 0; j < end; j++) {
529	537037		uint32 lpf = factor_table[j];
530	537037	100	if (lpf > primes[c]) {
531	200965		phi_value = tiny_phi(n / (2j+1), c); / x = 2j+1 */
532	200965	100	if (lpf & 0x01) sum2 += phi_value; else sum1 += phi_value;
533			}
534			}
535
536			/* Step 5: For 1+M/primes[c+1] <= x <= M, x square-free and
537			* has no factor <= primes[c+1], sum phi(n / (xprimes[c+1]), c). /
538	788	50	if (c < piM) {
539	788		UV pc_1 = primes[c+1];
540	506254	100	for (j = (1+M/pc_1)/2; j < end; j++) {
541	505466		uint32 lpf = factor_table[j];
542	505466	100	if (lpf > pc_1) {
543	178019		phi_value = tiny_phi(n / (pc_1 * (2j+1)), c); / x = 2j+1 */
544	178019	100	if (lpf & 0x01) sum1 += phi_value; else sum2 += phi_value;
545			}
546			}
547			}
548
549	95440	100	for (k = 0; k <= K3; k++) ss.totals[k] = 0;
550	9053	100	for (k = 0; k < KM; k++) ss.prime_index[k] = end;
551
552			/* Instead of dividing by all primes up to pi(M), once a divisor is large
553			* enough then phi(n / (pprimes[k+1]), k) = 1. /
554			{
555	788		uint32 last_prime = piM;
556	86387	100	for (k = KM; k < K3; k++) {
557	85599		UV pk = primes[k+1];
558	156790	100	while (last_prime > k+1 && pk * pk * primes[last_prime] > n)
		100
559	71191		last_prime--;
560	85599		ss.prime_index[k] = last_prime;
561	85599		sum1 += piM - last_prime;
562			}
563			}
564
565	1594	100	for (sieve_start = 0; sieve_start < last_phi_sieve; sieve_start = sieve_end) {
566			/* This phi segment goes from sieve_start to sieve_end. */
567	806		sieve_end = ((sieve_start + 2SWORD_BITSPHI_SIEVE_WORDS) < last_phi_sieve)
568			? sieve_start + 2SWORD_BITSPHI_SIEVE_WORDS : last_phi_sieve;
569			/* Only divisors s.t. sieve_start <= N / divisor < sieve_end considered. */
570	806		least_divisor = n / sieve_end;
571			/* Initialize the sieve segment and all associated variables. */
572	806		init_segment(&ss, sieve_start, sieve_end - sieve_start, c, K3, primes);
573
574			/* Step 6: For c < k < KM: For 1+M/primes[k+1] <= x <= M, x square-free
575			* and has no factor <= primes[k+1], sum phi(n / (xprimes[k+1]), k). /
576	4025	100	for (k = c+1; k < KM; k++) {
577	3219		UV pk = primes[k+1];
578	3219	50	uint32 start = (least_divisor >= pk * U32_CONST(0xFFFFFFFE))
579			? U32_CONST(0xFFFFFFFF)
580	3219		: (least_divisor / pk + 1)/2;
581	3219		remove_primes(k, k, &ss, primes);
582	3843498	100	for (j = ss.prime_index[k] - 1; j >= start; j--) {
583	3840279		uint32 lpf = factor_table[j];
584	3840279	100	if (lpf > pk) {
585	1099957		phi_value = sieve_phi(n / (pk * (2*j+1)));
586	1099957	100	if (lpf & 0x01) sum1 += phi_value; else sum2 += phi_value;
587			}
588			}
589	3219	100	if (start < ss.prime_index[k])
590	3202		ss.prime_index[k] = start;
591			}
592			/* Step 7: For KM <= K < Pi_M: For primes[k+2] <= x <= M, sum
593			* phi(n / (xprimes[k+1]), k). The inner for loop can be parallelized. /
594	86588	100	for (; k < step7_max; k++) {
595	85782		remove_primes(k, k, &ss, primes);
596	85782		j = ss.prime_index[k];
597	85782	100	if (j >= k+2) {
598	85517		UV pk = primes[k+1];
599	85517		UV endj = j;
600	2234064	50	while (endj > 7 && endj-7 >= k+2 && pk*primes[endj-7] > least_divisor) endj -= 8;
		100
		100
601	384761	100	while ( endj >= k+2 && pk*primes[endj ] > least_divisor) endj--;
		100
602			/* Now that we know how far to go, do the summations */
603	17573137	100	for ( ; j > endj; j--)
604	17487620		sum1 += sieve_phi(n / (pk*primes[j]));
605	85517		ss.prime_index[k] = endj;
606			}
607			}
608			/* Restrict work for the above loop when we know it will be empty. */
609	86405	100	while (step7_max > KM && ss.prime_index[step7_max-1] < (step7_max-1)+2)
		100
610	85599		step7_max--;
611
612			/* Step 8: For KM <= K < K3, sum -phi(n / primes[k+1], k) */
613	806		remove_primes(k, K3, &ss, primes);
614			/* Step 9: For K3 <= k < K2, sum -phi(n / primes[k+1], k) + (k-K3). */
615	1552331	100	while (prime > least_divisor && prime_index >= piM) {
		50
616	1551525		sum1 += prime_index - K3;
617	1551525		sum2 += sieve_phi(n / prime);
618	1551525		prime_index--;
619	1551525		prime = prev_sieve_prime(prime);
620			}
621			}
622
623	788		Safefree(ss.sieve);
624	788		Safefree(ss.word_count);
625	788		Safefree(ss.word_count_sum);
626	788		Safefree(ss.totals);
627	788		Safefree(ss.prime_index);
628	788		Safefree(ss.first_bit_index);
629	788		Safefree(ss.multiplier);
630	788		Safefree(factor_table);
631	788		Safefree(primes);
632
633	788		return sum1 - sum2;
634			}