line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#if defined(LEHMER) || defined(PRIMESIEVE_STANDALONE) |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
#include |
4
|
|
|
|
|
|
|
#include |
5
|
|
|
|
|
|
|
#include |
6
|
|
|
|
|
|
|
#include |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
/***************************************************************************** |
9
|
|
|
|
|
|
|
* |
10
|
|
|
|
|
|
|
* Lehmer prime counting utility. Calculates pi(x), count of primes <= x. |
11
|
|
|
|
|
|
|
* |
12
|
|
|
|
|
|
|
* Copyright (c) 2012-2013 Dana Jacobsen (dana@acm.org). |
13
|
|
|
|
|
|
|
* This is free software; you can redistribute it and/or modify it under |
14
|
|
|
|
|
|
|
* the same terms as the Perl 5 programming language system itself. |
15
|
|
|
|
|
|
|
* |
16
|
|
|
|
|
|
|
* This file is part of the Math::Prime::Util Perl module, but also can be |
17
|
|
|
|
|
|
|
* compiled as a standalone UNIX program using primesieve 5.x. |
18
|
|
|
|
|
|
|
* |
19
|
|
|
|
|
|
|
* g++ -O3 -DPRIMESIEVE_STANDALONE lehmer.c -o prime_count -lprimesieve |
20
|
|
|
|
|
|
|
* |
21
|
|
|
|
|
|
|
* The phi(x,a) calculation is unique, to the best of my knowledge. It uses |
22
|
|
|
|
|
|
|
* two lists of all x values + signed counts for the given 'a' value, and walks |
23
|
|
|
|
|
|
|
* 'a' down until it is small enough to calculate directly using a table. |
24
|
|
|
|
|
|
|
* This is relatively fast and low memory compared to many other solutions. |
25
|
|
|
|
|
|
|
* As with all Lehmer-Meissel-Legendre algorithms, memory use will be a |
26
|
|
|
|
|
|
|
* constraint with large values of x. |
27
|
|
|
|
|
|
|
* |
28
|
|
|
|
|
|
|
* Math::Prime::Util now includes an extended LMO implementation, which will |
29
|
|
|
|
|
|
|
* be quite a bit faster and much less memory than this code. It is the |
30
|
|
|
|
|
|
|
* default method for large counts. Timing comparisons are in that file. |
31
|
|
|
|
|
|
|
* |
32
|
|
|
|
|
|
|
* Times and memory use for prime_count(10^15) on a Haswell 4770K, asterisk |
33
|
|
|
|
|
|
|
* indicates parallel operation. The standalone versions of my code use |
34
|
|
|
|
|
|
|
* Kim Walisch's excellent primesieve, which is faster than my sieve. |
35
|
|
|
|
|
|
|
* His Lehmer/Meissel/Legendre seem a bit slower in serial, but |
36
|
|
|
|
|
|
|
* parallelize much better. |
37
|
|
|
|
|
|
|
* |
38
|
|
|
|
|
|
|
* 4.74s 1.3MB LMO |
39
|
|
|
|
|
|
|
* 24.53s* 137.9MB Lehmer Walisch primecount v0.9, 8 threads |
40
|
|
|
|
|
|
|
* 38.74s* 150.3MB LMOS Walisch primecount v0.9, 8 threads |
41
|
|
|
|
|
|
|
* 42.52s* 159.4MB Lehmer standalone, 8 threads |
42
|
|
|
|
|
|
|
* 42.82s* 137.9MB Meissel Walisch primecount v0.9, 8 threads |
43
|
|
|
|
|
|
|
* 51.88s 153.9MB LMOS standalone, 1 thread |
44
|
|
|
|
|
|
|
* 52.01s* 145.5MB Legendre Walisch primecount v0.9, 8 threads |
45
|
|
|
|
|
|
|
* 64.96s 160.3MB Lehmer standalone, 1 thread |
46
|
|
|
|
|
|
|
* 67.16s 67.0MB LMOS |
47
|
|
|
|
|
|
|
* 80.42s 286.6MB Meissel |
48
|
|
|
|
|
|
|
* 99.70s 159.6MB Lehmer |
49
|
|
|
|
|
|
|
* 107.43s 28.5MB Lehmer Walisch primecount v0.9, 1 thread |
50
|
|
|
|
|
|
|
* 174.51s 83.5MB Legendre |
51
|
|
|
|
|
|
|
* 185.11s 25.6MB LMOS Walisch primecount v0.9, 1 thread |
52
|
|
|
|
|
|
|
* 191.19s 24.8MB Meissel Walisch primecount v0.9, 1 thread |
53
|
|
|
|
|
|
|
* 868.96s 1668.1MB Lehmer pix4 by T.R. Nicely |
54
|
|
|
|
|
|
|
* |
55
|
|
|
|
|
|
|
* Reference: Hans Riesel, "Prime Numbers and Computer Methods for |
56
|
|
|
|
|
|
|
* Factorization", 2nd edition, 1994. |
57
|
|
|
|
|
|
|
*/ |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
/* Below this size, just sieve (with table speedup). */ |
60
|
|
|
|
|
|
|
#define SIEVE_LIMIT 60000000 |
61
|
|
|
|
|
|
|
#define MAX_PHI_MEM (896*1024*1024) |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
static int const verbose = 0; |
64
|
|
|
|
|
|
|
#define STAGE_TIMING 0 |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
#if STAGE_TIMING |
67
|
|
|
|
|
|
|
#include |
68
|
|
|
|
|
|
|
#define DECLARE_TIMING_VARIABLES struct timeval t0, t1; |
69
|
|
|
|
|
|
|
#define TIMING_START gettimeofday(&t0, 0); |
70
|
|
|
|
|
|
|
#define TIMING_END_PRINT(text) \ |
71
|
|
|
|
|
|
|
{ unsigned long long t; \ |
72
|
|
|
|
|
|
|
gettimeofday(&t1, 0); \ |
73
|
|
|
|
|
|
|
t = (t1.tv_sec-t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec); \ |
74
|
|
|
|
|
|
|
printf("%s: %10.5f\n", text, ((double)t) / 1000000); } |
75
|
|
|
|
|
|
|
#else |
76
|
|
|
|
|
|
|
#define DECLARE_TIMING_VARIABLES |
77
|
|
|
|
|
|
|
#define TIMING_START |
78
|
|
|
|
|
|
|
#define TIMING_END_PRINT(text) |
79
|
|
|
|
|
|
|
#endif |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
#ifdef PRIMESIEVE_STANDALONE |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
/* countPrimes can be pretty slow for small ranges, so sieve more small primes |
85
|
|
|
|
|
|
|
* and count using binary search. Uses a lot of memory though. For big |
86
|
|
|
|
|
|
|
* ranges, countPrimes is really fast. If you use primesieve 4.2, the |
87
|
|
|
|
|
|
|
* crossover point is lower (better). */ |
88
|
|
|
|
|
|
|
#define SIEVE_MULT 10 |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
/* Translations from Perl + Math::Prime::Util to C/C++ + primesieve */ |
91
|
|
|
|
|
|
|
typedef unsigned long UV; |
92
|
|
|
|
|
|
|
typedef signed long IV; |
93
|
|
|
|
|
|
|
#define UV_MAX ULONG_MAX |
94
|
|
|
|
|
|
|
#define UVCONST(x) ((unsigned long)x##UL) |
95
|
|
|
|
|
|
|
#define New(id, mem, size, type) mem = (type*) malloc((size)*sizeof(type)) |
96
|
|
|
|
|
|
|
#define Newz(id, mem, size, type) mem = (type*) calloc(size, sizeof(type)) |
97
|
|
|
|
|
|
|
#define Renew(mem, size, type) mem = (type*) realloc(mem,(size)*sizeof(type)) |
98
|
|
|
|
|
|
|
#define Safefree(mem) free((void*)mem) |
99
|
|
|
|
|
|
|
#define croak(fmt,...) { printf(fmt,##__VA_ARGS__); exit(1); } |
100
|
|
|
|
|
|
|
#define prime_precalc(n) /* */ |
101
|
|
|
|
|
|
|
#define BITS_PER_WORD ((ULONG_MAX <= 4294967295UL) ? 32 : 64) |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
static UV isqrt(UV n) { |
104
|
|
|
|
|
|
|
UV root; |
105
|
|
|
|
|
|
|
if (sizeof(UV)==8 && n >= UVCONST(18446744065119617025)) return 4294967295UL; |
106
|
|
|
|
|
|
|
if (sizeof(UV)==4 && n >= 4294836225UL) return 65535UL; |
107
|
|
|
|
|
|
|
root = (UV) sqrt((double)n); |
108
|
|
|
|
|
|
|
while (root*root > n) root--; |
109
|
|
|
|
|
|
|
while ((root+1)*(root+1) <= n) root++; |
110
|
|
|
|
|
|
|
return root; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
static UV icbrt(UV n) { |
113
|
|
|
|
|
|
|
UV b, root = 0; |
114
|
|
|
|
|
|
|
int s; |
115
|
|
|
|
|
|
|
if (sizeof(UV) == 8) { |
116
|
|
|
|
|
|
|
s = 63; if (n >= UVCONST(18446724184312856125)) return 2642245UL; |
117
|
|
|
|
|
|
|
} else { |
118
|
|
|
|
|
|
|
s = 30; if (n >= 4291015625UL) return 1625UL; |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
for ( ; s >= 0; s -= 3) { |
121
|
|
|
|
|
|
|
root += root; |
122
|
|
|
|
|
|
|
b = 3*root*(root+1)+1; |
123
|
|
|
|
|
|
|
if ((n >> s) >= b) { |
124
|
|
|
|
|
|
|
n -= b << s; |
125
|
|
|
|
|
|
|
root++; |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
return root; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
/* Use version 5.x of PrimeSieve */ |
132
|
|
|
|
|
|
|
#include |
133
|
|
|
|
|
|
|
#include |
134
|
|
|
|
|
|
|
#include |
135
|
|
|
|
|
|
|
#include |
136
|
|
|
|
|
|
|
#ifdef _OPENMP |
137
|
|
|
|
|
|
|
#include |
138
|
|
|
|
|
|
|
#endif |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
#define segment_prime_count(a, b) primesieve::parallel_count_primes(a, b) |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
/* Generate an array of n small primes, where the kth prime is element p[k]. |
143
|
|
|
|
|
|
|
* Remember to free when done. */ |
144
|
|
|
|
|
|
|
#define TINY_PRIME_SIZE 20000 |
145
|
|
|
|
|
|
|
static uint32_t* tiny_primes = 0; |
146
|
|
|
|
|
|
|
static uint32_t* generate_small_primes(UV n) |
147
|
|
|
|
|
|
|
{ |
148
|
|
|
|
|
|
|
uint32_t* primes; |
149
|
|
|
|
|
|
|
New(0, primes, n+1, uint32_t); |
150
|
|
|
|
|
|
|
if (n < TINY_PRIME_SIZE) { |
151
|
|
|
|
|
|
|
if (tiny_primes == 0) |
152
|
|
|
|
|
|
|
tiny_primes = generate_small_primes(TINY_PRIME_SIZE+1); |
153
|
|
|
|
|
|
|
memcpy(primes, tiny_primes, (n+1) * sizeof(uint32_t)); |
154
|
|
|
|
|
|
|
return primes; |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
primes[0] = 0; |
157
|
|
|
|
|
|
|
{ |
158
|
|
|
|
|
|
|
std::vector v; |
159
|
|
|
|
|
|
|
primesieve::generate_n_primes(n, &v); |
160
|
|
|
|
|
|
|
memcpy(primes+1, &v[0], n * sizeof(uint32_t)); |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
return primes; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
#else |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
/* We will use pre-sieving to speed up counting for small ranges */ |
168
|
|
|
|
|
|
|
#define SIEVE_MULT 1 |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
#define FUNC_isqrt 1 |
171
|
|
|
|
|
|
|
#define FUNC_icbrt 1 |
172
|
|
|
|
|
|
|
#include "lehmer.h" |
173
|
|
|
|
|
|
|
#include "util.h" |
174
|
|
|
|
|
|
|
#include "cache.h" |
175
|
|
|
|
|
|
|
#include "sieve.h" |
176
|
|
|
|
|
|
|
#include "prime_nth_count.h" |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
/* Generate an array of n small primes, where the kth prime is element p[k]. |
179
|
|
|
|
|
|
|
* Remember to free when done. */ |
180
|
|
|
|
|
|
|
static uint32_t* generate_small_primes(UV n) |
181
|
|
|
|
|
|
|
{ |
182
|
|
|
|
|
|
|
uint32_t* primes; |
183
|
|
|
|
|
|
|
UV i = 0; |
184
|
|
|
|
|
|
|
double fn = (double)n; |
185
|
|
|
|
|
|
|
double flogn = log(fn); |
186
|
|
|
|
|
|
|
double flog2n = log(flogn); |
187
|
|
|
|
|
|
|
UV nth_prime = /* Dusart 2010 for > 179k, custom for 18-179k */ |
188
|
|
|
|
|
|
|
(n >= 688383) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n-2.00)/flogn))) : |
189
|
|
|
|
|
|
|
(n >= 178974) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n-1.95)/flogn))) : |
190
|
|
|
|
|
|
|
(n >= 18) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n+0.30)/flogn))) |
191
|
|
|
|
|
|
|
: 59; |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
if (n > 203280221) |
194
|
|
|
|
|
|
|
croak("generate small primes with argument too large: %lu\n", (unsigned long)n); |
195
|
|
|
|
|
|
|
New(0, primes, n+1, uint32_t); |
196
|
|
|
|
|
|
|
primes[0] = 0; |
197
|
|
|
|
|
|
|
START_DO_FOR_EACH_PRIME(2, nth_prime) { |
198
|
|
|
|
|
|
|
if (i >= n) break; |
199
|
|
|
|
|
|
|
primes[++i] = p; |
200
|
|
|
|
|
|
|
} END_DO_FOR_EACH_PRIME |
201
|
|
|
|
|
|
|
if (i < n) |
202
|
|
|
|
|
|
|
croak("Did not generate enough small primes.\n"); |
203
|
|
|
|
|
|
|
if (verbose > 1) printf("generated %lu small primes, from 2 to %lu\n", i, (unsigned long)primes[i]); |
204
|
|
|
|
|
|
|
return primes; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
#endif |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
/* Given an array of primes[1..lastprime], return Pi(n) where n <= lastprime. |
210
|
|
|
|
|
|
|
* This is actually quite fast, and definitely faster than sieving. By using |
211
|
|
|
|
|
|
|
* this we can avoid caching prime counts and also skip most calls to the |
212
|
|
|
|
|
|
|
* segment siever. |
213
|
|
|
|
|
|
|
*/ |
214
|
|
|
|
|
|
|
static UV bs_prime_count(uint32_t n, uint32_t const* const primes, uint32_t lastidx) |
215
|
|
|
|
|
|
|
{ |
216
|
|
|
|
|
|
|
UV i, j; |
217
|
|
|
|
|
|
|
if (n <= 2) return (n == 2); |
218
|
|
|
|
|
|
|
/* If n is out of range, we could: |
219
|
|
|
|
|
|
|
* 1. return segment_prime_count(2, n); |
220
|
|
|
|
|
|
|
* 2. if (n == primes[lastidx]) return lastidx else croak("bspc range"); |
221
|
|
|
|
|
|
|
* 3. if (n >= primes[lastidx]) return lastidx; |
222
|
|
|
|
|
|
|
*/ |
223
|
|
|
|
|
|
|
if (n >= primes[lastidx]) return lastidx; |
224
|
|
|
|
|
|
|
j = lastidx; |
225
|
|
|
|
|
|
|
if (n < 8480) { |
226
|
|
|
|
|
|
|
i = 1 + (n>>4); |
227
|
|
|
|
|
|
|
if (j > 1060) j = 1060; |
228
|
|
|
|
|
|
|
} else if (n < 25875000) { |
229
|
|
|
|
|
|
|
i = 793 + (n>>5); |
230
|
|
|
|
|
|
|
if (j > (n>>3)) j = n>>3; |
231
|
|
|
|
|
|
|
} else { |
232
|
|
|
|
|
|
|
i = 1617183; |
233
|
|
|
|
|
|
|
if (j > (n>>4)) j = n>>4; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
while (i < j) { |
236
|
|
|
|
|
|
|
UV mid = i + (j-i)/2; |
237
|
|
|
|
|
|
|
if (primes[mid] <= n) i = mid+1; |
238
|
|
|
|
|
|
|
else j = mid; |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
/* if (i-1 != segment_prime_count(2, n)) croak("wrong count for %lu: %lu vs. %lu\n", n, i-1, segment_prime_count(2, n)); */ |
241
|
|
|
|
|
|
|
return i-1; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
#define FAST_DIV(x,y) \ |
245
|
|
|
|
|
|
|
( ((x) <= 4294967295U) ? (uint32_t)(x)/(uint32_t)(y) : (x)/(y) ) |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
/* static uint32_t sprime[] = {0,2, 3, 5, 7, 11, 13, 17, 19, 23}; */ |
248
|
|
|
|
|
|
|
/* static uint32_t sprimorial[] = {1,2,6,30,210,2310,30030,510510}; */ |
249
|
|
|
|
|
|
|
/* static uint32_t stotient[] = {1,1,2, 8, 48, 480, 5760, 92160}; */ |
250
|
|
|
|
|
|
|
static const uint16_t _s0[ 1] = {0}; |
251
|
|
|
|
|
|
|
static const uint16_t _s1[ 2] = {0,1}; |
252
|
|
|
|
|
|
|
static const uint16_t _s2[ 6] = {0,1,1,1,1,2}; |
253
|
|
|
|
|
|
|
static const uint16_t _s3[30] = {0,1,1,1,1,1,1,2,2,2,2,3,3,4,4,4,4,5,5,6,6,6,6,7,7,7,7,7,7,8}; |
254
|
|
|
|
|
|
|
static uint16_t _s4[210]; |
255
|
|
|
|
|
|
|
static uint16_t _s5[2310]; |
256
|
|
|
|
|
|
|
static uint16_t _s6[30030]; |
257
|
|
|
|
|
|
|
static const uint16_t* sphicache[7] = { _s0,_s1,_s2,_s3,_s4,_s5,_s6 }; |
258
|
|
|
|
|
|
|
static int sphi_init = 0; |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
#define PHIC 7 |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
static UV tablephi(UV x, uint32_t a) { |
263
|
|
|
|
|
|
|
switch (a) { |
264
|
|
|
|
|
|
|
case 0: return x; |
265
|
|
|
|
|
|
|
case 1: return x-x/2; |
266
|
|
|
|
|
|
|
case 2: return x-x/2-x/3+x/6; |
267
|
|
|
|
|
|
|
case 3: return (x/ 30U) * 8U + sphicache[3][x % 30U]; |
268
|
|
|
|
|
|
|
case 4: return (x/ 210U) * 48U + sphicache[4][x % 210U]; |
269
|
|
|
|
|
|
|
case 5: return (x/ 2310U) * 480U + sphicache[5][x % 2310U]; |
270
|
|
|
|
|
|
|
case 6: return (x/ 30030U) * 5760U + sphicache[6][x % 30030U]; |
271
|
|
|
|
|
|
|
#if PHIC >= 7 |
272
|
|
|
|
|
|
|
case 7: { |
273
|
|
|
|
|
|
|
UV xp = x / 17U; |
274
|
|
|
|
|
|
|
return ((x /30030U) * 5760U + sphicache[6][x % 30030U]) - |
275
|
|
|
|
|
|
|
((xp/30030U) * 5760U + sphicache[6][xp % 30030U]); |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
#endif |
278
|
|
|
|
|
|
|
#if PHIC >= 8 |
279
|
|
|
|
|
|
|
case 8: { |
280
|
|
|
|
|
|
|
UV xp = x / 17U; |
281
|
|
|
|
|
|
|
UV x2 = x / 19U; |
282
|
|
|
|
|
|
|
UV x2p = x2 / 17U; |
283
|
|
|
|
|
|
|
return ((x /30030U) * 5760U + sphicache[6][x % 30030U]) - |
284
|
|
|
|
|
|
|
((xp /30030U) * 5760U + sphicache[6][xp % 30030U]) - |
285
|
|
|
|
|
|
|
((x2 /30030U) * 5760U + sphicache[6][x2 % 30030U]) + |
286
|
|
|
|
|
|
|
((x2p/30030U) * 5760U + sphicache[6][x2p% 30030U]); |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
#endif |
289
|
|
|
|
|
|
|
default: croak("a %u too large for tablephi\n", a); |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
static void phitableinit(void) { |
293
|
|
|
|
|
|
|
if (sphi_init == 0) { |
294
|
|
|
|
|
|
|
int x; |
295
|
|
|
|
|
|
|
for (x = 0; x < 210; x++) |
296
|
|
|
|
|
|
|
_s4[x] = ((x/ 30)* 8+_s3[x% 30])-(((x/ 7)/ 30)* 8+_s3[(x/ 7)% 30]); |
297
|
|
|
|
|
|
|
for (x = 0; x < 2310; x++) |
298
|
|
|
|
|
|
|
_s5[x] = ((x/ 210)* 48+_s4[x% 210])-(((x/11)/ 210)* 48+_s4[(x/11)% 210]); |
299
|
|
|
|
|
|
|
for (x = 0; x < 30030; x++) |
300
|
|
|
|
|
|
|
_s6[x] = ((x/2310)*480+_s5[x%2310])-(((x/13)/2310)*480+_s5[(x/13)%2310]); |
301
|
|
|
|
|
|
|
sphi_init = 1; |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
/* Max memory = 2*X*A bytes, e.g. 2*65536*256 = 32 MB */ |
307
|
|
|
|
|
|
|
#define PHICACHEA 512 |
308
|
|
|
|
|
|
|
#define PHICACHEX 65536 |
309
|
|
|
|
|
|
|
typedef struct |
310
|
|
|
|
|
|
|
{ |
311
|
|
|
|
|
|
|
uint32_t max[PHICACHEA]; |
312
|
|
|
|
|
|
|
int16_t* val[PHICACHEA]; |
313
|
|
|
|
|
|
|
} cache_t; |
314
|
|
|
|
|
|
|
static void phicache_init(cache_t* cache) { |
315
|
|
|
|
|
|
|
int a; |
316
|
|
|
|
|
|
|
for (a = 0; a < PHICACHEA; a++) { |
317
|
|
|
|
|
|
|
cache->val[a] = 0; |
318
|
|
|
|
|
|
|
cache->max[a] = 0; |
319
|
|
|
|
|
|
|
} |
320
|
|
|
|
|
|
|
phitableinit(); |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
static void phicache_free(cache_t* cache) { |
323
|
|
|
|
|
|
|
int a; |
324
|
|
|
|
|
|
|
for (a = 0; a < PHICACHEA; a++) { |
325
|
|
|
|
|
|
|
if (cache->val[a] != 0) |
326
|
|
|
|
|
|
|
Safefree(cache->val[a]); |
327
|
|
|
|
|
|
|
cache->val[a] = 0; |
328
|
|
|
|
|
|
|
cache->max[a] = 0; |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
#define PHI_CACHE_POPULATED(x, a) \ |
333
|
|
|
|
|
|
|
((a) < PHICACHEA && (UV) cache->max[a] > (x) && cache->val[a][x] != 0) |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
static void phi_cache_insert(uint32_t x, uint32_t a, IV sum, cache_t* cache) { |
336
|
|
|
|
|
|
|
uint32_t cap = ( (x+32) >> 5) << 5; |
337
|
|
|
|
|
|
|
/* If sum is too large for the cache, just ignore it. */ |
338
|
|
|
|
|
|
|
if (sum < SHRT_MIN || sum > SHRT_MAX) return; |
339
|
|
|
|
|
|
|
if (cache->val[a] == 0) { |
340
|
|
|
|
|
|
|
Newz(0, cache->val[a], cap, int16_t); |
341
|
|
|
|
|
|
|
cache->max[a] = cap; |
342
|
|
|
|
|
|
|
} else if (cache->max[a] < cap) { |
343
|
|
|
|
|
|
|
uint32_t i; |
344
|
|
|
|
|
|
|
Renew(cache->val[a], cap, int16_t); |
345
|
|
|
|
|
|
|
for (i = cache->max[a]; i < cap; i++) |
346
|
|
|
|
|
|
|
cache->val[a][i] = 0; |
347
|
|
|
|
|
|
|
cache->max[a] = cap; |
348
|
|
|
|
|
|
|
} |
349
|
|
|
|
|
|
|
cache->val[a][x] = (int16_t) sum; |
350
|
|
|
|
|
|
|
} |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
static IV _phi3(UV x, UV a, int sign, const uint32_t* const primes, const uint32_t lastidx, cache_t* cache) |
353
|
|
|
|
|
|
|
{ |
354
|
|
|
|
|
|
|
IV sum; |
355
|
|
|
|
|
|
|
if (a <= 1) |
356
|
|
|
|
|
|
|
return sign * ((a == 0) ? x : x-x/2); |
357
|
|
|
|
|
|
|
else if (PHI_CACHE_POPULATED(x, a)) |
358
|
|
|
|
|
|
|
return sign * cache->val[a][x]; |
359
|
|
|
|
|
|
|
else if (a <= PHIC) |
360
|
|
|
|
|
|
|
sum = sign * tablephi(x,a); |
361
|
|
|
|
|
|
|
else if (x < primes[a+1]) |
362
|
|
|
|
|
|
|
sum = sign; |
363
|
|
|
|
|
|
|
else if (x <= primes[lastidx] && x < primes[a+1]*primes[a+1]) |
364
|
|
|
|
|
|
|
sum = sign * (bs_prime_count(x, primes, lastidx) - a + 1); |
365
|
|
|
|
|
|
|
else { |
366
|
|
|
|
|
|
|
UV a2, iters = (a*a > x) ? bs_prime_count( isqrt(x), primes, a) : a; |
367
|
|
|
|
|
|
|
UV c = (iters > PHIC) ? PHIC : iters; |
368
|
|
|
|
|
|
|
IV phixc = PHI_CACHE_POPULATED(x, c) ? cache->val[c][x] : (IV)tablephi(x,c); |
369
|
|
|
|
|
|
|
sum = sign * (iters - a + phixc); |
370
|
|
|
|
|
|
|
for (a2 = c+1; a2 <= iters; a2++) |
371
|
|
|
|
|
|
|
sum += _phi3(FAST_DIV(x,primes[a2]), a2-1, -sign, primes, lastidx, cache); |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
if (a < PHICACHEA && x < PHICACHEX) |
374
|
|
|
|
|
|
|
phi_cache_insert(x, a, sign * sum, cache); |
375
|
|
|
|
|
|
|
return sum; |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
#define phi_small(x, a, primes, lastidx, cache) _phi3(x, a, 1, primes, lastidx, cache) |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
/******************************************************************************/ |
380
|
|
|
|
|
|
|
/* In-order lists for manipulating our UV value / IV count pairs */ |
381
|
|
|
|
|
|
|
/******************************************************************************/ |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
typedef struct { |
384
|
|
|
|
|
|
|
UV v; |
385
|
|
|
|
|
|
|
IV c; |
386
|
|
|
|
|
|
|
} vc_t; |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
typedef struct { |
389
|
|
|
|
|
|
|
vc_t* a; |
390
|
|
|
|
|
|
|
UV size; |
391
|
|
|
|
|
|
|
UV n; |
392
|
|
|
|
|
|
|
} vcarray_t; |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
static vcarray_t vcarray_create(void) |
395
|
|
|
|
|
|
|
{ |
396
|
|
|
|
|
|
|
vcarray_t l; |
397
|
|
|
|
|
|
|
l.a = 0; |
398
|
|
|
|
|
|
|
l.size = 0; |
399
|
|
|
|
|
|
|
l.n = 0; |
400
|
|
|
|
|
|
|
return l; |
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
static void vcarray_destroy(vcarray_t* l) |
403
|
|
|
|
|
|
|
{ |
404
|
|
|
|
|
|
|
if (l->a != 0) { |
405
|
|
|
|
|
|
|
if (verbose > 2) printf("FREE list %p\n", l->a); |
406
|
|
|
|
|
|
|
Safefree(l->a); |
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
l->size = 0; |
409
|
|
|
|
|
|
|
l->n = 0; |
410
|
|
|
|
|
|
|
} |
411
|
|
|
|
|
|
|
/* Insert a value/count pair. We do this indirection because about 80% of |
412
|
|
|
|
|
|
|
* the calls result in a merge with the previous entry. */ |
413
|
|
|
|
|
|
|
static void vcarray_insert(vcarray_t* l, UV val, IV count) |
414
|
|
|
|
|
|
|
{ |
415
|
|
|
|
|
|
|
UV n = l->n; |
416
|
|
|
|
|
|
|
if (n > 0 && l->a[n-1].v < val) |
417
|
|
|
|
|
|
|
croak("Previous value was %lu, inserting %lu out of order\n", l->a[n-1].v, val); |
418
|
|
|
|
|
|
|
if (n >= l->size) { |
419
|
|
|
|
|
|
|
UV new_size; |
420
|
|
|
|
|
|
|
if (l->size == 0) { |
421
|
|
|
|
|
|
|
new_size = 20000; |
422
|
|
|
|
|
|
|
if (verbose>2) printf("ALLOCing list, size %lu (%luk)\n", new_size, new_size*sizeof(vc_t)/1024); |
423
|
|
|
|
|
|
|
New(0, l->a, new_size, vc_t); |
424
|
|
|
|
|
|
|
} else { |
425
|
|
|
|
|
|
|
new_size = (UV) (1.5 * l->size); |
426
|
|
|
|
|
|
|
if (verbose>2) printf("REALLOCing list %p, new size %lu (%luk)\n",l->a,new_size, new_size*sizeof(vc_t)/1024); |
427
|
|
|
|
|
|
|
Renew( l->a, new_size, vc_t ); |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
l->size = new_size; |
430
|
|
|
|
|
|
|
} |
431
|
|
|
|
|
|
|
/* printf(" inserting %lu %ld\n", val, count); */ |
432
|
|
|
|
|
|
|
l->a[n].v = val; |
433
|
|
|
|
|
|
|
l->a[n].c = count; |
434
|
|
|
|
|
|
|
l->n++; |
435
|
|
|
|
|
|
|
} |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
/* Merge the two sorted lists A and B into A. Each list has no duplicates, |
438
|
|
|
|
|
|
|
* but they may have duplications between the two. We're quite interested |
439
|
|
|
|
|
|
|
* in saving memory, so first remove all the duplicates, then do an in-place |
440
|
|
|
|
|
|
|
* merge. */ |
441
|
|
|
|
|
|
|
static void vcarray_merge(vcarray_t* a, vcarray_t* b) |
442
|
|
|
|
|
|
|
{ |
443
|
|
|
|
|
|
|
long ai, bi, bj, k, kn; |
444
|
|
|
|
|
|
|
long an = a->n; |
445
|
|
|
|
|
|
|
long bn = b->n; |
446
|
|
|
|
|
|
|
vc_t* aa = a->a; |
447
|
|
|
|
|
|
|
vc_t* ba = b->a; |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
/* Merge anything in B that appears in A. */ |
450
|
|
|
|
|
|
|
for (ai = 0, bi = 0, bj = 0; bi < bn; bi++) { |
451
|
|
|
|
|
|
|
UV bval = ba[bi].v; |
452
|
|
|
|
|
|
|
/* Skip forward in A until empty or aa[ai].v <= ba[bi].v */ |
453
|
|
|
|
|
|
|
while (ai+8 < an && aa[ai+8].v > bval) ai += 8; |
454
|
|
|
|
|
|
|
while (ai < an && aa[ai ].v > bval) ai++; |
455
|
|
|
|
|
|
|
/* if A empty then copy the remaining elements */ |
456
|
|
|
|
|
|
|
if (ai >= an) { |
457
|
|
|
|
|
|
|
if (bi == bj) |
458
|
|
|
|
|
|
|
bj = bn; |
459
|
|
|
|
|
|
|
else |
460
|
|
|
|
|
|
|
while (bi < bn) |
461
|
|
|
|
|
|
|
ba[bj++] = ba[bi++]; |
462
|
|
|
|
|
|
|
break; |
463
|
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
if (aa[ai].v == bval) |
465
|
|
|
|
|
|
|
aa[ai].c += ba[bi].c; |
466
|
|
|
|
|
|
|
else |
467
|
|
|
|
|
|
|
ba[bj++] = ba[bi]; |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
if (verbose>3) printf(" removed %lu duplicates from b\n", bn - bj); |
470
|
|
|
|
|
|
|
bn = bj; |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
if (bn == 0) { /* In case they were all duplicates */ |
473
|
|
|
|
|
|
|
b->n = 0; |
474
|
|
|
|
|
|
|
return; |
475
|
|
|
|
|
|
|
} |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
/* kn = the final merged size. All duplicates are gone, so this is exact. */ |
478
|
|
|
|
|
|
|
kn = an+bn; |
479
|
|
|
|
|
|
|
if ((long)a->size < kn) { /* Make A big enough to hold kn elements */ |
480
|
|
|
|
|
|
|
UV new_size = (UV) (1.2 * kn); |
481
|
|
|
|
|
|
|
if (verbose>2) printf("REALLOCing list %p, new size %lu (%luk)\n", a->a, new_size, new_size*sizeof(vc_t)/1024); |
482
|
|
|
|
|
|
|
Renew( a->a, new_size, vc_t ); |
483
|
|
|
|
|
|
|
aa = a->a; /* this could have been changed by the realloc */ |
484
|
|
|
|
|
|
|
a->size = new_size; |
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
/* merge A and B. Very simple using reverse merge. */ |
488
|
|
|
|
|
|
|
ai = an-1; |
489
|
|
|
|
|
|
|
bi = bn-1; |
490
|
|
|
|
|
|
|
for (k = kn-1; k >= 0 && bi >= 0; k--) { |
491
|
|
|
|
|
|
|
UV bval = ba[bi].v; |
492
|
|
|
|
|
|
|
long startai = ai; |
493
|
|
|
|
|
|
|
while (ai >= 15 && aa[ai-15].v < bval) ai -= 16; |
494
|
|
|
|
|
|
|
while (ai >= 3 && aa[ai- 3].v < bval) ai -= 4; |
495
|
|
|
|
|
|
|
while (ai >= 0 && aa[ai ].v < bval) ai--; |
496
|
|
|
|
|
|
|
if (startai > ai) { |
497
|
|
|
|
|
|
|
k = k - (startai - ai) + 1; |
498
|
|
|
|
|
|
|
memmove(aa+k, aa+ai+1, (startai-ai) * sizeof(vc_t)); |
499
|
|
|
|
|
|
|
} else { |
500
|
|
|
|
|
|
|
if (ai >= 0 && aa[ai].v == bval) croak("deduplication error"); |
501
|
|
|
|
|
|
|
aa[k] = ba[bi--]; |
502
|
|
|
|
|
|
|
} |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
a->n = kn; /* A now has this many items */ |
505
|
|
|
|
|
|
|
b->n = 0; /* B is marked empty */ |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
static void vcarray_remove_zeros(vcarray_t* a) |
509
|
|
|
|
|
|
|
{ |
510
|
|
|
|
|
|
|
long ai = 0; |
511
|
|
|
|
|
|
|
long aj = 0; |
512
|
|
|
|
|
|
|
long an = a->n; |
513
|
|
|
|
|
|
|
vc_t* aa = a->a; |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
while (aj < an) { |
516
|
|
|
|
|
|
|
if (aa[aj].c != 0) { |
517
|
|
|
|
|
|
|
if (ai != aj) |
518
|
|
|
|
|
|
|
aa[ai] = aa[aj]; |
519
|
|
|
|
|
|
|
ai++; |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
aj++; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
a->n = ai; |
524
|
|
|
|
|
|
|
} |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
/* |
528
|
|
|
|
|
|
|
* The main phi(x,a) algorithm. In this implementation, it takes under 10% |
529
|
|
|
|
|
|
|
* of the total time for the Lehmer algorithm, but is a big memory consumer. |
530
|
|
|
|
|
|
|
*/ |
531
|
|
|
|
|
|
|
#define NTHRESH (MAX_PHI_MEM/16) |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
static UV phi(UV x, UV a) |
534
|
|
|
|
|
|
|
{ |
535
|
|
|
|
|
|
|
UV i, val, sval, lastidx, lastprime; |
536
|
|
|
|
|
|
|
UV sum = 0; |
537
|
|
|
|
|
|
|
IV count; |
538
|
|
|
|
|
|
|
const uint32_t* primes; |
539
|
|
|
|
|
|
|
vcarray_t a1, a2; |
540
|
|
|
|
|
|
|
vc_t* arr; |
541
|
|
|
|
|
|
|
cache_t pcache; /* Cache for recursive phi */ |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
phitableinit(); |
544
|
|
|
|
|
|
|
if (a == 1) return ((x+1)/2); |
545
|
|
|
|
|
|
|
if (a <= PHIC) return tablephi(x, a); |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
lastidx = a+1; |
548
|
|
|
|
|
|
|
primes = generate_small_primes(lastidx); |
549
|
|
|
|
|
|
|
lastprime = primes[lastidx]; |
550
|
|
|
|
|
|
|
if (x < lastprime) { Safefree(primes); return (x > 0) ? 1 : 0; } |
551
|
|
|
|
|
|
|
phicache_init(&pcache); |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
a1 = vcarray_create(); |
554
|
|
|
|
|
|
|
a2 = vcarray_create(); |
555
|
|
|
|
|
|
|
vcarray_insert(&a1, x, 1); |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
while (a > PHIC) { |
558
|
|
|
|
|
|
|
UV primea = primes[a]; |
559
|
|
|
|
|
|
|
UV sval_last = 0; |
560
|
|
|
|
|
|
|
IV sval_count = 0; |
561
|
|
|
|
|
|
|
arr = a1.a; |
562
|
|
|
|
|
|
|
for (i = 0; i < a1.n; i++) { |
563
|
|
|
|
|
|
|
count = arr[i].c; |
564
|
|
|
|
|
|
|
val = arr[i].v; |
565
|
|
|
|
|
|
|
sval = FAST_DIV(val, primea); |
566
|
|
|
|
|
|
|
if (sval < primea) break; /* stop inserting into a2 if small */ |
567
|
|
|
|
|
|
|
if (sval != sval_last) { /* non-merged value. Insert into a2 */ |
568
|
|
|
|
|
|
|
if (sval_last != 0) { |
569
|
|
|
|
|
|
|
if (sval_last <= lastprime && sval_last < primes[a-1]*primes[a-1]) |
570
|
|
|
|
|
|
|
sum += sval_count*(bs_prime_count(sval_last,primes,lastidx)-a+2); |
571
|
|
|
|
|
|
|
else |
572
|
|
|
|
|
|
|
vcarray_insert(&a2, sval_last, sval_count); |
573
|
|
|
|
|
|
|
} |
574
|
|
|
|
|
|
|
sval_last = sval; |
575
|
|
|
|
|
|
|
sval_count = 0; |
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
sval_count -= count; /* Accumulate count for this sval */ |
578
|
|
|
|
|
|
|
} |
579
|
|
|
|
|
|
|
if (sval_last != 0) { /* Insert the last sval */ |
580
|
|
|
|
|
|
|
if (sval_last <= lastprime && sval_last < primes[a-1]*primes[a-1]) |
581
|
|
|
|
|
|
|
sum += sval_count*(bs_prime_count(sval_last,primes,lastidx)-a+2); |
582
|
|
|
|
|
|
|
else |
583
|
|
|
|
|
|
|
vcarray_insert(&a2, sval_last, sval_count); |
584
|
|
|
|
|
|
|
} |
585
|
|
|
|
|
|
|
/* For each small sval, add up the counts */ |
586
|
|
|
|
|
|
|
for ( ; i < a1.n; i++) |
587
|
|
|
|
|
|
|
sum -= arr[i].c; |
588
|
|
|
|
|
|
|
/* Merge a1 and a2 into a1. a2 will be emptied. */ |
589
|
|
|
|
|
|
|
vcarray_merge(&a1, &a2); |
590
|
|
|
|
|
|
|
/* If we've grown too large, use recursive phi to clip. */ |
591
|
|
|
|
|
|
|
if ( a1.n > NTHRESH ) { |
592
|
|
|
|
|
|
|
arr = a1.a; |
593
|
|
|
|
|
|
|
if (verbose > 0) printf("clipping small values at a=%lu a1.n=%lu \n", a, a1.n); |
594
|
|
|
|
|
|
|
#ifdef _OPENMP |
595
|
|
|
|
|
|
|
/* #pragma omp parallel for reduction(+: sum) firstprivate(pcache) schedule(dynamic, 16) */ |
596
|
|
|
|
|
|
|
#endif |
597
|
|
|
|
|
|
|
for (i = 0; i < a1.n-NTHRESH+NTHRESH/50; i++) { |
598
|
|
|
|
|
|
|
UV j = a1.n - 1 - i; |
599
|
|
|
|
|
|
|
IV count = arr[j].c; |
600
|
|
|
|
|
|
|
if (count != 0) { |
601
|
|
|
|
|
|
|
sum += count * phi_small( arr[j].v, a-1, primes, lastidx, &pcache ); |
602
|
|
|
|
|
|
|
arr[j].c = 0; |
603
|
|
|
|
|
|
|
} |
604
|
|
|
|
|
|
|
} |
605
|
|
|
|
|
|
|
} |
606
|
|
|
|
|
|
|
vcarray_remove_zeros(&a1); |
607
|
|
|
|
|
|
|
a--; |
608
|
|
|
|
|
|
|
} |
609
|
|
|
|
|
|
|
phicache_free(&pcache); |
610
|
|
|
|
|
|
|
vcarray_destroy(&a2); |
611
|
|
|
|
|
|
|
arr = a1.a; |
612
|
|
|
|
|
|
|
#ifdef _OPENMP |
613
|
|
|
|
|
|
|
#pragma omp parallel for reduction(+: sum) schedule(dynamic, 16) |
614
|
|
|
|
|
|
|
#endif |
615
|
|
|
|
|
|
|
for (i = 0; i < a1.n; i++) |
616
|
|
|
|
|
|
|
sum += arr[i].c * tablephi( arr[i].v, PHIC ); |
617
|
|
|
|
|
|
|
vcarray_destroy(&a1); |
618
|
|
|
|
|
|
|
Safefree(primes); |
619
|
|
|
|
|
|
|
return (UV) sum; |
620
|
|
|
|
|
|
|
} |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
extern UV meissel_prime_count(UV n); |
624
|
|
|
|
|
|
|
/* b = prime_count(isqrt(n)) */ |
625
|
|
|
|
|
|
|
static UV Pk_2_p(UV n, UV a, UV b, const uint32_t* primes, uint32_t lastidx) |
626
|
|
|
|
|
|
|
{ |
627
|
|
|
|
|
|
|
UV lastw, lastwpc, i, P2; |
628
|
|
|
|
|
|
|
UV lastpc = primes[lastidx]; |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
/* Ensure we have a large enough base sieve */ |
631
|
|
|
|
|
|
|
prime_precalc(isqrt(n / primes[a+1])); |
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
P2 = lastw = lastwpc = 0; |
634
|
|
|
|
|
|
|
for (i = b; i > a; i--) { |
635
|
|
|
|
|
|
|
UV w = n / primes[i]; |
636
|
|
|
|
|
|
|
lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastidx) |
637
|
|
|
|
|
|
|
: lastwpc + segment_prime_count(lastw+1, w); |
638
|
|
|
|
|
|
|
lastw = w; |
639
|
|
|
|
|
|
|
P2 += lastwpc; |
640
|
|
|
|
|
|
|
} |
641
|
|
|
|
|
|
|
P2 -= ((b+a-2) * (b-a+1) / 2) - a + 1; |
642
|
|
|
|
|
|
|
return P2; |
643
|
|
|
|
|
|
|
} |
644
|
|
|
|
|
|
|
static UV Pk_2(UV n, UV a, UV b) |
645
|
|
|
|
|
|
|
{ |
646
|
|
|
|
|
|
|
UV lastprime = ((b*3+1) > 203280221) ? 203280221 : b*3+1; |
647
|
|
|
|
|
|
|
const uint32_t* primes = generate_small_primes(lastprime); |
648
|
|
|
|
|
|
|
UV P2 = Pk_2_p(n, a, b, primes, lastprime); |
649
|
|
|
|
|
|
|
Safefree(primes); |
650
|
|
|
|
|
|
|
return P2; |
651
|
|
|
|
|
|
|
} |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
/* Legendre's method. Interesting and a good test for phi(x,a), but Lehmer's |
655
|
|
|
|
|
|
|
* method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */ |
656
|
|
|
|
|
|
|
UV legendre_prime_count(UV n) |
657
|
|
|
|
|
|
|
{ |
658
|
|
|
|
|
|
|
UV a, phina; |
659
|
|
|
|
|
|
|
if (n < SIEVE_LIMIT) |
660
|
|
|
|
|
|
|
return segment_prime_count(2, n); |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
a = legendre_prime_count(isqrt(n)); |
663
|
|
|
|
|
|
|
/* phina = phi(n, a); */ |
664
|
|
|
|
|
|
|
{ /* The small phi routine is faster for large a */ |
665
|
|
|
|
|
|
|
cache_t pcache; |
666
|
|
|
|
|
|
|
const uint32_t* primes = 0; |
667
|
|
|
|
|
|
|
primes = generate_small_primes(a+1); |
668
|
|
|
|
|
|
|
phicache_init(&pcache); |
669
|
|
|
|
|
|
|
phina = phi_small(n, a, primes, a+1, &pcache); |
670
|
|
|
|
|
|
|
phicache_free(&pcache); |
671
|
|
|
|
|
|
|
Safefree(primes); |
672
|
|
|
|
|
|
|
} |
673
|
|
|
|
|
|
|
return phina + a - 1; |
674
|
|
|
|
|
|
|
} |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
/* Meissel's method. */ |
678
|
|
|
|
|
|
|
UV meissel_prime_count(UV n) |
679
|
|
|
|
|
|
|
{ |
680
|
|
|
|
|
|
|
UV a, b, sum; |
681
|
|
|
|
|
|
|
if (n < SIEVE_LIMIT) |
682
|
|
|
|
|
|
|
return segment_prime_count(2, n); |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
a = meissel_prime_count(icbrt(n)); /* a = Pi(floor(n^1/3)) [max 192725] */ |
685
|
|
|
|
|
|
|
b = meissel_prime_count(isqrt(n)); /* b = Pi(floor(n^1/2)) [max 203280221] */ |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
sum = phi(n, a) + a - 1 - Pk_2(n, a, b); |
688
|
|
|
|
|
|
|
return sum; |
689
|
|
|
|
|
|
|
} |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
/* Lehmer's method. This is basically Riesel's Lehmer function (page 22), |
692
|
|
|
|
|
|
|
* with some additional code to help optimize it. */ |
693
|
|
|
|
|
|
|
UV lehmer_prime_count(UV n) |
694
|
|
|
|
|
|
|
{ |
695
|
|
|
|
|
|
|
UV z, a, b, c, sum, i, j, lastprime, lastpc, lastw, lastwpc; |
696
|
|
|
|
|
|
|
const uint32_t* primes = 0; /* small prime cache, first b=pi(z)=pi(sqrt(n)) */ |
697
|
|
|
|
|
|
|
DECLARE_TIMING_VARIABLES; |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
if (n < SIEVE_LIMIT) |
700
|
|
|
|
|
|
|
return segment_prime_count(2, n); |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
/* Protect against overflow. 2^32-1 and 2^64-1 are both divisible by 3. */ |
703
|
|
|
|
|
|
|
if (n == UV_MAX) { |
704
|
|
|
|
|
|
|
if ( (n%3) == 0 || (n%5) == 0 || (n%7) == 0 || (n%31) == 0 ) |
705
|
|
|
|
|
|
|
n--; |
706
|
|
|
|
|
|
|
else |
707
|
|
|
|
|
|
|
return segment_prime_count(2,n); |
708
|
|
|
|
|
|
|
} |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
if (verbose > 0) printf("lehmer %lu stage 1: calculate a,b,c \n", n); |
711
|
|
|
|
|
|
|
TIMING_START; |
712
|
|
|
|
|
|
|
z = isqrt(n); |
713
|
|
|
|
|
|
|
a = lehmer_prime_count(isqrt(z)); /* a = Pi(floor(n^1/4)) [max 6542] */ |
714
|
|
|
|
|
|
|
b = lehmer_prime_count(z); /* b = Pi(floor(n^1/2)) [max 203280221] */ |
715
|
|
|
|
|
|
|
c = lehmer_prime_count(icbrt(n)); /* c = Pi(floor(n^1/3)) [max 192725] */ |
716
|
|
|
|
|
|
|
TIMING_END_PRINT("stage 1") |
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
if (verbose > 0) printf("lehmer %lu stage 2: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c); |
719
|
|
|
|
|
|
|
TIMING_START; |
720
|
|
|
|
|
|
|
sum = phi(n, a) + ((b+a-2) * (b-a+1) / 2); |
721
|
|
|
|
|
|
|
TIMING_END_PRINT("phi(x,a)") |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
/* We get an array of the first b primes. This is used in stage 4. If we |
724
|
|
|
|
|
|
|
* get more than necessary, we can use them to speed up some. |
725
|
|
|
|
|
|
|
*/ |
726
|
|
|
|
|
|
|
lastprime = b*SIEVE_MULT+1; |
727
|
|
|
|
|
|
|
if (lastprime > 203280221) lastprime = 203280221; |
728
|
|
|
|
|
|
|
if (verbose > 0) printf("lehmer %lu stage 3: %lu small primes\n", n, lastprime); |
729
|
|
|
|
|
|
|
TIMING_START; |
730
|
|
|
|
|
|
|
primes = generate_small_primes(lastprime); |
731
|
|
|
|
|
|
|
lastpc = primes[lastprime]; |
732
|
|
|
|
|
|
|
TIMING_END_PRINT("small primes") |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
TIMING_START; |
735
|
|
|
|
|
|
|
/* Speed up all the prime counts by doing a big base sieve */ |
736
|
|
|
|
|
|
|
prime_precalc( (UV) pow(n, 3.0/5.0) ); |
737
|
|
|
|
|
|
|
/* Ensure we have the base sieve for big prime_count ( n/primes[i] ). */ |
738
|
|
|
|
|
|
|
/* This is about 75k for n=10^13, 421k for n=10^15, 2.4M for n=10^17 */ |
739
|
|
|
|
|
|
|
prime_precalc(isqrt(n / primes[a+1])); |
740
|
|
|
|
|
|
|
TIMING_END_PRINT("sieve precalc") |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
if (verbose > 0) printf("lehmer %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]); |
743
|
|
|
|
|
|
|
TIMING_START; |
744
|
|
|
|
|
|
|
/* Reverse the i loop so w increases. Count w in segments. */ |
745
|
|
|
|
|
|
|
lastw = 0; |
746
|
|
|
|
|
|
|
lastwpc = 0; |
747
|
|
|
|
|
|
|
for (i = b; i >= a+1; i--) { |
748
|
|
|
|
|
|
|
UV w = n / primes[i]; |
749
|
|
|
|
|
|
|
lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastprime) |
750
|
|
|
|
|
|
|
: lastwpc + segment_prime_count(lastw+1, w); |
751
|
|
|
|
|
|
|
lastw = w; |
752
|
|
|
|
|
|
|
sum = sum - lastwpc; |
753
|
|
|
|
|
|
|
if (i <= c) { |
754
|
|
|
|
|
|
|
UV bi = bs_prime_count( isqrt(w), primes, lastprime ); |
755
|
|
|
|
|
|
|
for (j = i; j <= bi; j++) { |
756
|
|
|
|
|
|
|
sum = sum - bs_prime_count(w / primes[j], primes, lastprime) + j - 1; |
757
|
|
|
|
|
|
|
} |
758
|
|
|
|
|
|
|
/* We could wrap the +j-1 in: sum += ((bi+1-i)*(bi+i))/2 - (bi-i+1); */ |
759
|
|
|
|
|
|
|
} |
760
|
|
|
|
|
|
|
} |
761
|
|
|
|
|
|
|
TIMING_END_PRINT("stage 4") |
762
|
|
|
|
|
|
|
Safefree(primes); |
763
|
|
|
|
|
|
|
return sum; |
764
|
|
|
|
|
|
|
} |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
/* The Lagarias-Miller-Odlyzko method. |
768
|
|
|
|
|
|
|
* Naive implementation without optimizations. |
769
|
|
|
|
|
|
|
* About the same speed as Lehmer, a bit less memory. |
770
|
|
|
|
|
|
|
* A better implementation can be 10-50x faster and much less memory. |
771
|
|
|
|
|
|
|
*/ |
772
|
|
|
|
|
|
|
UV LMOS_prime_count(UV n) |
773
|
|
|
|
|
|
|
{ |
774
|
|
|
|
|
|
|
UV n13, a, b, sum, i, j, k, lastprime, P2, S1, S2; |
775
|
|
|
|
|
|
|
const uint32_t* primes = 0; /* small prime cache */ |
776
|
|
|
|
|
|
|
signed char* mu = 0; /* moebius to n^1/3 */ |
777
|
|
|
|
|
|
|
uint32_t* lpf = 0; /* least prime factor to n^1/3 */ |
778
|
|
|
|
|
|
|
cache_t pcache; /* Cache for recursive phi */ |
779
|
|
|
|
|
|
|
DECLARE_TIMING_VARIABLES; |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
if (n < SIEVE_LIMIT) |
782
|
|
|
|
|
|
|
return segment_prime_count(2, n); |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
n13 = icbrt(n); /* n13 = floor(n^1/3) [max 2642245] */ |
785
|
|
|
|
|
|
|
a = lehmer_prime_count(n13); /* a = Pi(floor(n^1/3)) [max 192725] */ |
786
|
|
|
|
|
|
|
b = lehmer_prime_count(isqrt(n)); /* b = Pi(floor(n^1/2)) [max 203280221] */ |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
lastprime = b*SIEVE_MULT+1; |
789
|
|
|
|
|
|
|
if (lastprime > 203280221) lastprime = 203280221; |
790
|
|
|
|
|
|
|
if (lastprime < n13) lastprime = n13; |
791
|
|
|
|
|
|
|
primes = generate_small_primes(lastprime); |
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
New(0, mu, n13+1, signed char); |
794
|
|
|
|
|
|
|
memset(mu, 1, sizeof(signed char) * (n13+1)); |
795
|
|
|
|
|
|
|
Newz(0, lpf, n13+1, uint32_t); |
796
|
|
|
|
|
|
|
mu[0] = 0; |
797
|
|
|
|
|
|
|
for (i = 1; i <= n13; i++) { |
798
|
|
|
|
|
|
|
UV primei = primes[i]; |
799
|
|
|
|
|
|
|
for (j = primei; j <= n13; j += primei) { |
800
|
|
|
|
|
|
|
mu[j] = -mu[j]; |
801
|
|
|
|
|
|
|
if (lpf[j] == 0) lpf[j] = primei; |
802
|
|
|
|
|
|
|
} |
803
|
|
|
|
|
|
|
k = primei * primei; |
804
|
|
|
|
|
|
|
for (j = k; j <= n13; j += k) |
805
|
|
|
|
|
|
|
mu[j] = 0; |
806
|
|
|
|
|
|
|
} |
807
|
|
|
|
|
|
|
lpf[1] = UVCONST(4294967295); /* Set lpf[1] to max */ |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
/* Remove mu[i] == 0 using lpf */ |
810
|
|
|
|
|
|
|
for (i = 1; i <= n13; i++) |
811
|
|
|
|
|
|
|
if (mu[i] == 0) |
812
|
|
|
|
|
|
|
lpf[i] = 0; |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
/* Thanks to Kim Walisch for help with the S1+S2 calculations. */ |
815
|
|
|
|
|
|
|
k = (a < 7) ? a : 7; |
816
|
|
|
|
|
|
|
S1 = 0; |
817
|
|
|
|
|
|
|
S2 = 0; |
818
|
|
|
|
|
|
|
phicache_init(&pcache); |
819
|
|
|
|
|
|
|
TIMING_START; |
820
|
|
|
|
|
|
|
for (i = 1; i <= n13; i++) |
821
|
|
|
|
|
|
|
if (lpf[i] > primes[k]) |
822
|
|
|
|
|
|
|
/* S1 += mu[i] * phi_small(n/i, k, primes, lastprime, &pcache); */ |
823
|
|
|
|
|
|
|
S1 += mu[i] * phi(n/i, k); |
824
|
|
|
|
|
|
|
TIMING_END_PRINT("S1") |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
TIMING_START; |
827
|
|
|
|
|
|
|
for (i = k; i+1 < a; i++) { |
828
|
|
|
|
|
|
|
uint32_t p = primes[i+1]; |
829
|
|
|
|
|
|
|
/* TODO: #pragma omp parallel for reduction(+: S2) firstprivate(pcache) schedule(dynamic, 16) */ |
830
|
|
|
|
|
|
|
for (j = (n13/p)+1; j <= n13; j++) |
831
|
|
|
|
|
|
|
if (lpf[j] > p) |
832
|
|
|
|
|
|
|
S2 += -mu[j] * phi_small(n / (j*p), i, primes, lastprime, &pcache); |
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
TIMING_END_PRINT("S2") |
835
|
|
|
|
|
|
|
phicache_free(&pcache); |
836
|
|
|
|
|
|
|
Safefree(lpf); |
837
|
|
|
|
|
|
|
Safefree(mu); |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
TIMING_START; |
840
|
|
|
|
|
|
|
prime_precalc( (UV) pow(n, 2.9/5.0) ); |
841
|
|
|
|
|
|
|
P2 = Pk_2_p(n, a, b, primes, lastprime); |
842
|
|
|
|
|
|
|
TIMING_END_PRINT("P2") |
843
|
|
|
|
|
|
|
Safefree(primes); |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
/* printf("S1 = %lu\nS2 = %lu\na = %lu\nP2 = %lu\n", S1, S2, a, P2); */ |
846
|
|
|
|
|
|
|
sum = (S1 + S2) + a - 1 - P2; |
847
|
|
|
|
|
|
|
return sum; |
848
|
|
|
|
|
|
|
} |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
#ifdef PRIMESIEVE_STANDALONE |
851
|
|
|
|
|
|
|
int main(int argc, char *argv[]) |
852
|
|
|
|
|
|
|
{ |
853
|
|
|
|
|
|
|
UV n, pi; |
854
|
|
|
|
|
|
|
double t; |
855
|
|
|
|
|
|
|
const char* method; |
856
|
|
|
|
|
|
|
struct timeval t0, t1; |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
if (argc <= 1) { printf("usage: %s []\n", argv[0]); return(1); } |
859
|
|
|
|
|
|
|
n = strtoul(argv[1], 0, 10); |
860
|
|
|
|
|
|
|
if (n < 2) { printf("Pi(%lu) = 0\n", n); return(0); } |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
if (argc > 2) |
863
|
|
|
|
|
|
|
method = argv[2]; |
864
|
|
|
|
|
|
|
else |
865
|
|
|
|
|
|
|
method = "lehmer"; |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
gettimeofday(&t0, 0); |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
if (!strcasecmp(method, "lehmer")) { pi = lehmer_prime_count(n); } |
870
|
|
|
|
|
|
|
else if (!strcasecmp(method, "meissel")) { pi = meissel_prime_count(n); } |
871
|
|
|
|
|
|
|
else if (!strcasecmp(method, "legendre")) { pi = legendre_prime_count(n); } |
872
|
|
|
|
|
|
|
else if (!strcasecmp(method, "lmo")) { pi = LMOS_prime_count(n); } |
873
|
|
|
|
|
|
|
else if (!strcasecmp(method, "sieve")) { pi = segment_prime_count(2, n); } |
874
|
|
|
|
|
|
|
else { |
875
|
|
|
|
|
|
|
printf("method must be one of: lehmer, meissel, legendre, lmo, or sieve\n"); |
876
|
|
|
|
|
|
|
return(2); |
877
|
|
|
|
|
|
|
} |
878
|
|
|
|
|
|
|
gettimeofday(&t1, 0); |
879
|
|
|
|
|
|
|
t = (t1.tv_sec-t0.tv_sec); t *= 1000000.0; t += (t1.tv_usec - t0.tv_usec); |
880
|
|
|
|
|
|
|
printf("%8s Pi(%lu) = %lu in %10.5fs\n", method, n, pi, t / 1000000.0); |
881
|
|
|
|
|
|
|
return(0); |
882
|
|
|
|
|
|
|
} |
883
|
|
|
|
|
|
|
#endif |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
#else |
886
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
#include "lehmer.h" |
888
|
0
|
0
|
|
|
|
|
UV LMOS_prime_count(UV n) { if (n!=0) croak("Not compiled with Lehmer support"); return 0;} |
889
|
0
|
0
|
|
|
|
|
UV lehmer_prime_count(UV n) { if (n!=0) croak("Not compiled with Lehmer support"); return 0;} |
890
|
0
|
0
|
|
|
|
|
UV meissel_prime_count(UV n) { if (n!=0) croak("Not compiled with Lehmer support"); return 0;} |
891
|
0
|
0
|
|
|
|
|
UV legendre_prime_count(UV n) { if (n!=0) croak("Not compiled with Lehmer support"); return 0;} |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
#endif |