File Coverage

prime_sums.c
Criterion Covered Total %
statement 92 92 100.0
branch 97 116 83.6
condition n/a
subroutine n/a
pod n/a
total 189 208 90.8


line stmt bran cond sub pod time code
1             #include
2             #include
3             #include
4              
5             #define FUNC_isqrt 1
6             #include "ptypes.h"
7             #include "constants.h"
8             #include "util.h"
9             #include "cache.h"
10             #include "sieve.h"
11             #include "prime_sums.h"
12              
13             /******************************************************************************/
14             /* SUMS */
15             /******************************************************************************/
16              
17             /* As an aside, good information about bounds and approximations can be
18             * found in Axler (2019) "On the sum of the first n prime numbers"
19             * https://jtnb.centre-mersenne.org/item/10.5802/jtnb.1081.pdf
20             */
21              
22             static const unsigned char byte_zeros[256] =
23             {8,7,7,6,7,6,6,5,7,6,6,5,6,5,5,4,7,6,6,5,6,5,5,4,6,5,5,4,5,4,4,3,
24             7,6,6,5,6,5,5,4,6,5,5,4,5,4,4,3,6,5,5,4,5,4,4,3,5,4,4,3,4,3,3,2,
25             7,6,6,5,6,5,5,4,6,5,5,4,5,4,4,3,6,5,5,4,5,4,4,3,5,4,4,3,4,3,3,2,
26             6,5,5,4,5,4,4,3,5,4,4,3,4,3,3,2,5,4,4,3,4,3,3,2,4,3,3,2,3,2,2,1,
27             7,6,6,5,6,5,5,4,6,5,5,4,5,4,4,3,6,5,5,4,5,4,4,3,5,4,4,3,4,3,3,2,
28             6,5,5,4,5,4,4,3,5,4,4,3,4,3,3,2,5,4,4,3,4,3,3,2,4,3,3,2,3,2,2,1,
29             6,5,5,4,5,4,4,3,5,4,4,3,4,3,3,2,5,4,4,3,4,3,3,2,4,3,3,2,3,2,2,1,
30             5,4,4,3,4,3,3,2,4,3,3,2,3,2,2,1,4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0};
31              
32             /* The fastest way to compute the sum of primes is using a combinatorial
33             * algorithm such as Deleglise-Rivat or Gourdon. This is what Kim Walisch's
34             * primesum program does. Note that one quickly needs 128-bit or larger
35             * storage, as the sums grow rapidly.
36             *
37             * We are using much simpler methods. Performance at small sizes is also a
38             * consideration. Using tables combined with summing over sieved primes can
39             * work well with small input sizes.
40             */
41              
42             /* Simplified Legendre method giving pisum(n) for n <= 65535 or 4294967295. */
43              
44 19           UV sum_primes64(UV n) {
45             uint32_t *V, j, k, r, r2, p;
46             UV *S, sum;
47              
48 19 50         if (n < 2 || (n >> (BITS_PER_WORD/2)) > 0) /* S[] will overflow */
    100          
49 7           return 0;
50 12           r = isqrt(n);
51 12           r2 = r + n/(r+1);
52              
53 12           New(0, V, r2+1, uint32_t);
54 12           New(0, S, r2+1, UV);
55 444200 100         for (k = 1; k <= r2; k++) {
56 444188 100         UV v = (k <= r) ? k : n/(r2-k+1);
57 444188           V[k] = v;
58 444188           S[k] = ((v*(v-1))>>1) + (v-1);
59             }
60              
61 222096 100         for (p = 2; p <= r; p++) {
62 222084 100         if (S[p] > S[p-1]) { /* For each prime p from 2 to r */
63 23284           UV sp = S[p-1], p2 = p*p;
64 25830282 50         for (j = r2; j > 1 && V[j] >= p2; j--) {
    100          
65 25806998           uint32_t a = V[j], b = a/p;
66 25806998 100         if (a > r) a = r2 - n/a + 1;
67 25806998 100         if (b > r) b = r2 - n/b + 1;
68 25806998           S[a] -= (UV)p * (S[b] - sp); /* sp = sum of primes less than p */
69             }
70             }
71             }
72 12           sum = S[r2];
73 12           Safefree(V);
74 12           Safefree(S);
75 12           return sum;
76             }
77              
78             /* Simplified Legendre method giving pisum(n) for any 64-bit input n,
79             * assuming the uint128_t type is available. The result is returned as
80             * two 64-bit results. */
81              
82 1           bool sum_primes128(UV n, UV *hi_sum, UV *lo_sum) {
83             #if HAVE_SUM_PRIMES128
84             uint128_t *S;
85             UV *V, j, k, r, r2, p;
86              
87             /* pisum(2^64-1) < 2^128-1, so no overflow issues */
88 1           r = isqrt(n);
89 1           r2 = r + n/(r+1);
90              
91 1 50         New(0, V, r2+1, UV);
92 1 50         New(0, S, r2+1, uint128_t);
93 343543 100         for (k = 0; k <= r2; k++) {
94 343542 100         UV v = (k <= r) ? k : n/(r2-k+1);
95 343542           V[k] = v;
96 343542           S[k] = ((uint128_t)v+1)/2 * (v|1) - 1; /* (v*(v+1))/2-1 */
97             }
98              
99 171771 100         for (p = 2; p <= r; p++) {
100 171770 100         if (S[p] > S[p-1]) { /* For each prime p from 2 to r */
101 15648           uint128_t sp = S[p-1], p2 = ((uint128_t)p) * p;
102 33779919 50         for (j = r2; j > 1 && V[j] >= p2; j--) {
    100          
103 33764271           UV a = V[j], b = a/p;
104 33764271 100         if (a > r) a = r2 - n/a + 1;
105 33764271 100         if (b > r) b = r2 - n/b + 1;
106 33764271           S[a] -= p * (S[b] - sp); /* sp = sum of primes less than p */
107             }
108             }
109             }
110 1           *hi_sum = (S[r2] >> 64) & UV_MAX;
111 1           *lo_sum = (S[r2] ) & UV_MAX;
112 1           Safefree(V);
113 1           Safefree(S);
114 1           return 1;
115             #else
116             return 0;
117             #endif
118             }
119              
120              
121             /* sum primes in a 64-bit range using a sieving with table acceleration */
122              
123             static const unsigned char byte_sum[256] =
124             {120,119,113,112,109,108,102,101,107,106,100,99,96,95,89,88,103,102,96,95,92,
125             91,85,84,90,89,83,82,79,78,72,71,101,100,94,93,90,89,83,82,88,87,81,80,77,
126             76,70,69,84,83,77,76,73,72,66,65,71,70,64,63,60,59,53,52,97,96,90,89,86,85,
127             79,78,84,83,77,76,73,72,66,65,80,79,73,72,69,68,62,61,67,66,60,59,56,55,49,
128             48,78,77,71,70,67,66,60,59,65,64,58,57,54,53,47,46,61,60,54,53,50,49,43,42,
129             48,47,41,40,37,36,30,29,91,90,84,83,80,79,73,72,78,77,71,70,67,66,60,59,74,
130             73,67,66,63,62,56,55,61,60,54,53,50,49,43,42,72,71,65,64,61,60,54,53,59,58,
131             52,51,48,47,41,40,55,54,48,47,44,43,37,36,42,41,35,34,31,30,24,23,68,67,61,
132             60,57,56,50,49,55,54,48,47,44,43,37,36,51,50,44,43,40,39,33,32,38,37,31,30,
133             27,26,20,19,49,48,42,41,38,37,31,30,36,35,29,28,25,24,18,17,32,31,25,24,21,
134             20,14,13,19,18,12,11,8,7,1,0};
135              
136             #if BITS_PER_WORD == 64
137             /* We have a much more limited range, so use a fixed interval. We should be
138             * able to get any 64-bit sum in under a half-second. */
139             static const UV sum_table_2e8[] =
140             {1075207199997324,3071230303170813,4990865886639877,6872723092050268,8729485610396243,10566436676784677,12388862798895708,14198556341669206,15997206121881531,17783028661796383,19566685687136351,21339485298848693,23108856419719148,
141             24861364231151903,26619321031799321,28368484289421890,30110050320271201,31856321671656548,33592089385327108,35316546074029522,37040262208390735,38774260466286299,40490125006181147,42207686658844380,43915802985817228,45635106002281013,
142             47337822860157465,49047713696453759,50750666660265584,52449748364487290,54152689180758005,55832433395290183,57540651847418233,59224867245128289,60907462954737468,62597192477315868,64283665223856098,65961576139329367,67641982565760928,
143             69339211720915217,71006044680007261,72690896543747616,74358564592509127,76016548794894677,77694517638354266,79351385193517953,81053240048141953,82698120948724835,84380724263091726,86028655116421543,87679091888973563,89348007111430334,
144             90995902774878695,92678527127292212,94313220293410120,95988730932107432,97603162494502485,99310622699836698,100935243057337310,102572075478649557,104236362884241550,105885045921116836,107546170993472638,109163445284201278,
145             110835950755374921,112461991135144669,114116351921245042,115740770232532531,117408250788520189,119007914428335965,120652479429703269,122317415246500401,123951466213858688,125596789655927842,127204379051939418,128867944265073217,
146             130480037123800711,132121840147764197,133752985360747726,135365954823762234,137014594650995101,138614165689305879,140269121741383097,141915099618762647,143529289083557618,145146413750649432,146751434858695468,148397902396643807,
147             149990139346918801,151661665434334577,153236861034424304,154885985064643097,156500983286383741,158120868946747299,159735201435796748,161399264792716319,162999489977602579,164566400448130092,166219688860475191,167836981098849796,
148             169447127305804401,171078187147848898,172678849082290997,174284436375728242,175918609754056455,177525046501311788,179125593738290153,180765176633753371,182338473848291683,183966529541155489,185585792988238475,187131988176321434,
149             188797837140841381,190397649440649965,191981841583560122,193609739194967419,195166830650558070,196865965063113041,198400070713177440,200057161591648721,201621899486413406,203238279253414934,204790684829891896,206407676204061001,
150             208061050481364659,209641606658938873,211192088300183855,212855420483750498,214394145510853736,216036806225784861,217628995137940563,219277567478725189,220833877268454872,222430818525363309,224007307616922530,225640739533952807,
151             227213096159236934,228853318075566255,230401824696558125,231961445347821085,233593317860593895,235124654760954338,236777716068869769,238431514923528303,239965003913481640,241515977959535845,243129874530821395};
152             #define N_SUM_TABLE (sizeof(sum_table_2e8)/sizeof(sum_table_2e8[0]))
153             #endif
154              
155 1030           bool sum_primes(UV low, UV high, UV *return_sum) {
156 1030           UV sum = 0;
157 1030           bool overflow = 0;
158              
159 1030 100         if (low <= 2 && high >= 100000) {
    100          
160 19           *return_sum = sum_primes64(high);
161 19 100         if (*return_sum != 0)
162 12           return 1;
163             }
164             /* TODO: performance: more cases where using sum_primes64 is faster. */
165              
166 1018 100         if ((low <= 2) && (high >= 2)) sum += 2;
    100          
167 1018 100         if ((low <= 3) && (high >= 3)) sum += 3;
    100          
168 1018 100         if ((low <= 5) && (high >= 5)) sum += 5;
    100          
169 1018 100         if (low < 7) low = 7;
170              
171             /* If we know the range will overflow, return now */
172             #if BITS_PER_WORD == 64
173 1018 100         if (low == 7 && high >= 29505444491) return 0;
    100          
174 1017 50         if (low >= 1e10 && (high-low) >= 32e9) return 0;
    0          
175 1017 50         if (low >= 1e13 && (high-low) >= 5e7) return 0;
    0          
176             #else
177             if (low == 7 && high >= 323381) return 0;
178             #endif
179              
180             #if 1 && BITS_PER_WORD == 64 /* Tables */
181 1017 100         if (low == 7 && high >= 2e8) {
    100          
182             UV step;
183 448 100         for (step = 1; high >= (step * 2e8) && step < N_SUM_TABLE; step++) {
    100          
184 442           sum += sum_table_2e8[step-1];
185 442           low = step * 2e8;
186             }
187             }
188             #endif
189              
190 1017 100         if (low <= high) {
191             unsigned char* segment;
192             UV seg_base, seg_low, seg_high;
193 1010           void* ctx = start_segment_primes(low, high, &segment);
194 2245 50         while (!overflow && next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
    100          
195 1235           UV bytes = seg_high/30 - seg_low/30 + 1;
196             unsigned char s;
197 1235           unsigned char* sp = segment;
198 1235           unsigned char* const spend = segment + bytes - 1;
199 1235           UV i, p, pbase = 30*(seg_low/30);
200              
201             /* Clear primes before and after our range */
202 1235           p = pbase;
203 2271 50         for (i = 0; i < 8 && p+wheel30[i] < low; i++)
    100          
204 1036 100         if ( (*sp & (1<
205 5           *sp |= (1 << i);
206              
207 1235           p = 30*(seg_high/30);
208 11115 100         for (i = 0; i < 8; i++)
209 9880 100         if ( (*spend & (1< high )
    100          
210 2480           *spend |= (1 << i);
211              
212 20390020 100         while (sp <= spend) {
213 20388785           s = *sp++;
214 20388785 100         if (sum < (UV_MAX >> 3) && pbase < (UV_MAX >> 5)) {
    50          
215             /* sum block of 8 all at once */
216 3192482           sum += pbase * byte_zeros[s] + byte_sum[s];
217             } else {
218             /* sum block of 8, checking for overflow at each step */
219 38741186 100         for (i = 0; i < byte_zeros[s]; i++) {
220 21544883 50         if (sum+pbase < sum) overflow = 1;
221 21544883           sum += pbase;
222             }
223 17196303 50         if (sum+byte_sum[s] < sum) overflow = 1;
224 17196303           sum += byte_sum[s];
225 17196303 50         if (overflow) break;
226             }
227 20388785           pbase += 30;
228             }
229             }
230 1010           end_segment_primes(ctx);
231             }
232 1017 50         if (!overflow && return_sum != 0) *return_sum = sum;
    50          
233 1017           return !overflow;
234             }