File Coverage

sort.c
Criterion Covered Total %
statement 148 203 72.9
branch 92 154 59.7
condition n/a
subroutine n/a
pod n/a
total 240 357 67.2


line stmt bran cond sub pod time code
/* NOTE(review): the three system header names were lost in extraction;
 * reconstructed from usage (malloc/free, memset/memcpy) -- confirm. */
#include <stdio.h>
#include <stdlib.h>    /* malloc, free */
#include <string.h>    /* memset, memcpy */
#define FUNC_log2floor 1
#include "util.h"
#include "sort.h"
7              
8             #define USE_QUADSORT 0
9              
10             /*
11             * Sorting arrays of integers.
12             *
13             * We really have two quite different use cases. The first is for internal
14             * use, where it's very common to have a small number of inputs.
15             * E.g. sorting roots, factors, divisors, totients, prime powers, etc. Most
16             * of these will have small arrays and the overall time is dominated by the
17             * real work that created the data.
18             * There are some degenerate cases that generate many inputs, but these are
19             * exceptional. Most sorts from our test suite are 32 or fewer items, with
20             * the largest being 576 items.
21             *
22             * The second use is from vecsort, where the user or our PP code has given
23             * us a possibly large array to sort. Here we have the additional challenge
24             * of making sure the overhead of Perl->C->Perl is as small as possible.
25             *
26             * We have a number of possible choices.
27             *
28             * 1) Perl's sort. A cache-aware merge sort, which makes a lot of sense for
29             * its use with arbitrary and complicated data structures, possibly
30             * expensive comparisons, and where a stable sort is highly desirable.
31             * Most of this is irrelevant for sorting simple integers.
32             * Problem 1: We can sort SV's but there isn't a simple UV interface.
33             * Problem 2: It's slow for shuffled inputs, like most stable merge sorts.
34             *
35             * 2) qsort. Easy and works, but system dependent.
36             * Can be quite fast -- MacOS/clang is 3x faster than merge sort for
37             * shuffled inputs, and has fast behavior with sorted/reversed data.
38             *
39             * 3) fluxsort/quadsort/timsort/powersort/glidesort/etc.
40             * fluxsort is extremely fast and has excellent behavior with ordered data.
41             * The main reason it isn't being used here is the code size.
42             *
43             * 4) insertion / Shell. Fastest on tiny arrays and very compact. We use
44             * insertion sort for small chunks.
45             *
46             * 5) heapsort. lobby99's implementation here is surprisingly fast and very
47             * consistent across a variety of inputs. It is used as a fallback if
48             * quicksort is choosing bad partitions.
49             *
50             * 6) quicksort. Yes, yet another quicksort implementation. Fast for small
51             * inputs, competitive for larger. This uses true median of 9
52             * partitioning, insertion sort for small partitions, and will switch to
53             * heapsort after enough bad partitions, so there is no O(n^2) disaster.
54             *
* 7) radix sort. With enough integers, radix sort beats everything else on
56             * shuffled data. Performance on ordered data is decent though not like
57             * fluxsort. Uses auxiliary data equal to the input size.
58             *
59             * We use our quicksort for small arrays, radixsort for larger.
60             *
61             */
62              
63              
64             /******************************************************************************/
65              
66 4561           static void insertionsort_uv(UV *array, size_t len) {
67             size_t i,j;
68 34398 100         for (i = 1; i < len; i++) {
69 29837           UV t = array[i];
70 47255 100         for (j = i; j > 0 && array[j-1] > t; j--)
    100          
71 17418           array[j] = array[j-1];
72 29837           array[j] = t;
73             }
74 4561           }
75 49           static void insertionsort_iv(IV *array, size_t len) {
76             size_t i,j;
77 265 100         for (i = 1; i < len; i++) {
78 216           IV t = array[i];
79 652 100         for (j = i; j > 0 && array[j-1] > t; j--)
    100          
80 436           array[j] = array[j-1];
81 216           array[j] = t;
82             }
83 49           }
84              
#if 0
/* Shellsort with Sedgewick's 1986 gap sequence.  Kept here (disabled)
 * as an alternative small-array sort; plain insertion sort won out for
 * the array sizes we actually see. */
static void shellsort_uv(UV *array, size_t len) {
  static unsigned short sgaps[] = {209,109,41,19,5,1}; /* Sedgewick 1986 */
  size_t i, j, gap, gi = 0;
  do {
    gap = sgaps[gi++];
    for (i = gap; i < len; i++) {
      UV t = array[i];
      for (j = i; j >= gap && array[j-gap] > t; j -= gap)
        array[j] = array[j-gap];
      array[j] = t;
    }
  } while (gap > 1);
}
static void shellsort_iv(IV *array, size_t len) {
  static unsigned short sgaps[] = {209,109,41,19,5,1}; /* Sedgewick 1986 */
  size_t i, j, gap, gi = 0;
  do {
    gap = sgaps[gi++];
    for (i = gap; i < len; i++) {
      IV t = array[i];
      for (j = i; j >= gap && array[j-gap] > t; j -= gap)
        array[j] = array[j-gap];
      array[j] = t;
    }
  } while (gap > 1);
}
#endif
113              
114             /******************************************************************************/
115             /* RADIX SORT */
116             /******************************************************************************/
#define RADIX_BIT 8
/* Reconstructed: the expansion was truncated to "(1u<" in the listing.
 * 2^RADIX_BIT = 256 buckets per pass, matching the count[RADIX] byte
 * histograms and the "% RADIX" bucket extraction below. */
#define RADIX (1u << RADIX_BIT)
/* LSD radix sort of array[0..n-1], RADIX_BIT bits per pass, ping-ponging
 * between the input and one malloc'd buffer of equal size.  When is_iv
 * is set the data are signed IVs stored in UV storage: only the final
 * (top byte) pass differs, remapping buckets so negatives come first.
 * Returns 0 only if the auxiliary buffer cannot be allocated (callers
 * then fall back to an in-place sort), else 1. */
static bool _radixsort(UV *array, size_t n, bool is_iv)
{
  size_t i, count[RADIX];
  unsigned r;
  UV *a, *b, *ptr[RADIX];
  UV passmask = 0;
  int sh;

  /* First counting pass also builds passmask = OR of d ^ (d >> 8) over
   * all inputs.  Byte k of passmask zero means byte k equals byte k+1
   * in every input, so the pass for byte k can be skipped: keys equal
   * in all higher bytes are then automatically equal in byte k.
   * The top byte of d is XORed against zero, so passmask < RADIX
   * implies every top/chained byte is 0, i.e. all values < RADIX. */
  memset(count, 0, sizeof count);
  for (i = 0; i < n; i++) {
    UV d = array[i];
    passmask |= d ^ (d >> RADIX_BIT);
    count[d % RADIX]++;
  }
  if (passmask < RADIX) { /* If all values < RADIX, Use *fast* counting sort */
    if (passmask) {       /* passmask == 0 means every value is 0: nothing to do */
      size_t j = 0, lim = 0;
      for (r = 0; r < RADIX; r++)
        for (lim += count[r]; j < lim; j++)
          array[j] = r;
    }
    return 1;
  }
  /* Allocate second ping-pong buffer */
  a = array;
  b = (UV*)malloc(n * sizeof(UV));
  if (b == 0) return 0;
  /* Each pass radix-sorts and counts for next pass */
  for (sh = 0; UV_MAX >> sh >= RADIX; sh += RADIX_BIT) {
    UV *p = b;
    if ((passmask >> sh) % RADIX == 0)   /* this byte needs no pass (see above) */
      continue;
    for (r = 0; r < RADIX; r++) {        /* prefix sums -> bucket write pointers */
      ptr[r] = p;
      p += count[r];
    }
    /* assert(p == b + n); */
    memset(count, 0, sizeof count);
    for (i = 0; i < n; i++) {
      UV d = a[i];
      *(ptr[(d>>sh) % RADIX]++) = d;
      count[(d >> (sh + RADIX_BIT)) % RADIX]++;  /* histogram for the next pass */
    }
    p = b; b = a; a = p;                 /* swap source and destination */
  }
  /* Last pass does no more counting */
  if (passmask >> sh) {
    UV *p = b;
    /* For signed data, XOR the bucket index with the sign bit of the
     * top byte (1 << 7) so buckets 0x80..0xFF -- the negative values --
     * are laid out before buckets 0x00..0x7F. */
    unsigned signbit = is_iv ? 1 << (BITS_PER_WORD-1)%RADIX_BIT : 0;
    for (r = 0; r < RADIX; r++) {
      ptr[r^signbit] = p;
      p += count[r^signbit];
    }
    /* assert(p == b + n); */
    for (i = 0; i < n; i++) {
      UV d = a[i];
      *(ptr[(d>>sh) % RADIX]++) = d;
    }
    p = b; b = a; a = p;
  }
  /* Move back to input array if necessary */
  if (a != array) {
    memcpy(array, a, n * sizeof *array);
    b = a;
  }
  free(b);
  return 1;
}
187             #undef RADIX_BIT
188             #undef RADIX
189              
190             /******************************************************************************/
191             /* HEAP SORT */
192             /******************************************************************************/
/* Bottom-up, in-place heapsort of array[0..len-1]; when is_iv is set
 * the elements are compared as signed IVs (same storage).  No recursion
 * and no auxiliary memory.  Used as quicksort's fallback after too many
 * bad partitions, capping the overall sort at O(n log n). */
static void _heapsort(UV *array, size_t len, bool is_iv)
{
  size_t a = len/2;

  if (!a) /* Trivial cases: len < 2 */
    return;

  for (len--;;) {
    UV r;        /* Value from array[a] being sifted down */
    size_t b, c; /* Current descendent and its child */

    /*
     * Elements [0,a) are unsorted.
     * Elements [a,n] are in the heap.
     * Elements (n,...) are sorted.
     */
    if (a > 0) /* Building heap: sift down array[--a] */
      r = array[--a];
    else if (len > 0) { /* Extracting: Swap root<->array[n--] */
      r = array[len]; array[len--] = array[0];
    } else /* Extraction complete */
      return;

    /* Sift element r (at "a") down into heap. */
    /* Loop guard c < len implies c+1 <= len, so the sibling read
     * array[c+1] stays inside the heap region [a,len]. */
    if (!is_iv) {
      for (b = a; (c = 2*b + 1) < len; b = c) {
        UV s = array[c];
        if (array[c+1] >= s)
          s = array[++c];
        if (r >= s)
          goto sift_done;
        array[b] = s;
      }
    } else {
      for (b = a; (c = 2*b + 1) < len; b = c) {
        IV s = array[c];
        if ((IV)array[c+1] >= s)
          s = array[++c];
        if ((IV)r >= s)
          goto sift_done;
        array[b] = s;
      }
    }
    if (c == len) { /* Corner case: last leaf with no sibling */
      if ( (!is_iv && r < array[c]) || (is_iv && (IV)r < (IV)array[c]) ) {
        array[b] = array[c];
        b = c;
      }
    }
    sift_done:
    array[b] = r;
  }
}
246              
247             #define radixsort_uv(L,len) _radixsort(L, len, 0)
248             #define radixsort_iv(L,len) _radixsort((UV*)L, len, 1)
249             #define heapsort_uv(L,len) _heapsort(L, len, 0)
250             #define heapsort_iv(L,len) _heapsort((UV*)L, len, 1)
251              
252             /******************************************************************************/
253             /* QUICK SORT */
254             /******************************************************************************/
255              
256 629           static size_t _mid3_uv_val(UV* L, size_t a, size_t b, size_t c) {
257 629           const UV s[3] = {L[a],L[b],L[c]}; /* Scandum's branchless method */
258 629           int x = s[0] > s[1];
259 629           int y = s[0] > s[2];
260 629           int z = s[1] > s[2];
261 629           return s[(x == y) + (y ^ z)];
262             }
263 8           static size_t _mid3_iv_val(IV* L, size_t a, size_t b, size_t c) {
264 8           const IV s[3] = {L[a],L[b],L[c]}; /* Scandum's branchless method */
265 8           int x = s[0] > s[1];
266 8           int y = s[0] > s[2];
267 8           int z = s[1] > s[2];
268 8           return s[(x == y) + (y ^ z)];
269             }
270              
271 336           static void _mid2_of_4_uv(UV* L) {
272             UV swap; /* 1) put first two and last two in order */
273             size_t x; /* 2) L[2] = max(L[0],L[2]); L[1] = min(L[1],L[3]); */
274 336 100         x = L[0] > L[1]; swap = L[!x]; L[0]=L[x]; L[1]=swap;
275 336 100         L += 2; x = L[0] > L[1]; swap = L[!x]; L[0]=L[x]; L[1]=swap;
276 336 100         L -= 2; x = (L[0] <= L[2]) * 2; L[2] = L[x];
277 336 100         L += 1; x = (L[0] > L[2]) * 2; L[0] = L[x];
278 336           }
279 6           static void _mid2_of_4_iv(IV* L) {
280             IV swap; /* 1) put first two and last two in order */
281             size_t x; /* 2) L[2] = max(L[0],L[2]); L[1] = min(L[1],L[3]); */
282 6 100         x = L[0] > L[1]; swap = L[!x]; L[0]=L[x]; L[1]=swap;
283 6 100         L += 2; x = L[0] > L[1]; swap = L[!x]; L[0]=L[x]; L[1]=swap;
284 6 100         L -= 2; x = (L[0] <= L[2]) * 2; L[2] = L[x];
285 6 100         L += 1; x = (L[0] > L[2]) * 2; L[0] = L[x];
286 6           }
287              
288             /* Using scandum's median of 9 gives better partitions than the median of
289             * three medians, and gives better actual run times for large inputs.
290             */
291 629           static size_t _partition_uv(UV* L, size_t lo, size_t hi) {
292 629           size_t i = lo-1, j = hi+1, len = hi-lo+1;
293             UV pivot;
294 629 50         if (len <= 7) {
295 0           pivot = L[len/2];
296 629 100         } else if (len <= 40) {
297 517           pivot = _mid3_uv_val(L, lo, lo+(hi-lo)/2, hi);
298             } else { /* Fluxsort's median_of_nine */
299 112           UV swap[9], *X = L+lo;
300 112           size_t x, step = (len-1)/8;
301 1120 100         for (x = 0; x < 9; x++) { swap[x] = *X; X += step; }
302 112           _mid2_of_4_uv(swap); /* [X v v X v v v v v] */
303 112           _mid2_of_4_uv(swap+4); /* [X v v X X v v X v] */
304 112           swap[0] = swap[5]; swap[3] = swap[8];
305 112           _mid2_of_4_uv(swap); /* [X v v X X X v X X] */
306 112           pivot = _mid3_uv_val(swap, 6, 1, 2);
307             }
308 3942           while (1) {
309 14332 100         do { i++; } while (L[i] < pivot);
310 11871 100         do { j--; } while (L[j] > pivot);
311 4571 100         if (i >= j) return j;
312 3942           { UV t = L[i]; L[i] = L[j]; L[j] = t; }
313             }
314             }
315 8           static size_t _partition_iv(IV* L, size_t lo, size_t hi) {
316 8           size_t i = lo-1, j = hi+1, len = hi-lo+1;
317             IV pivot;
318 8 50         if (len <= 7) {
319 0           pivot = L[len/2];
320 8 100         } else if (len <= 40) {
321 6           pivot = _mid3_iv_val(L, lo, lo+(hi-lo)/2, hi);
322             } else { /* Fluxsort's median_of_nine */
323 2           IV swap[9], *X = L+lo;
324 2           size_t x, step = (len-1)/8;
325 20 100         for (x = 0; x < 9; x++) { swap[x] = *X; X += step; }
326 2           _mid2_of_4_iv(swap); /* [X v v X v v v v v] */
327 2           _mid2_of_4_iv(swap+4); /* [X v v X X v v X v] */
328 2           swap[0] = swap[5]; swap[3] = swap[8];
329 2           _mid2_of_4_iv(swap); /* [X v v X X X v X X] */
330 2           pivot = _mid3_iv_val(swap, 6, 1, 2);
331             }
332 53           while (1) {
333 129 100         do { i++; } while (L[i] < pivot);
334 119 100         do { j--; } while (L[j] > pivot);
335 61 100         if (i >= j) return j;
336 53           { IV t = L[i]; L[i] = L[j]; L[j] = t; }
337             }
338             }
339              
340 5190           static void _qs_uv(UV* L, size_t lo, size_t hi, int badpartsleft) {
341 5190           size_t p, size = hi-lo+1;
342              
343 5190 100         if (size <= 16)
344 4561           { insertionsort_uv(L+lo, size); return; }
345              
346 629           p = _partition_uv(L, lo, hi);
347              
348             { /* check for unbalanced partitions, same as pdqsort */
349 629           size_t l_size = p - lo + 1;
350 629           size_t r_size = hi - (p+1) + 1;
351 629 100         bool highly_unbalanced = l_size < size / 8 || r_size < size / 8;
    100          
352 629 100         if (highly_unbalanced && --badpartsleft <= 0)
    50          
353 0           { heapsort_uv(L+lo, size); return; }
354             }
355              
356 629           _qs_uv(L, lo, p, badpartsleft);
357 629           _qs_uv(L, p+1, hi, badpartsleft);
358             }
359 57           static void _qs_iv(IV* L, size_t lo, size_t hi, int badpartsleft) {
360 57           size_t p, size = hi-lo+1;
361              
362 57 100         if (size <= 16)
363 49           { insertionsort_iv(L+lo, size); return; }
364              
365 8           p = _partition_iv(L, lo, hi);
366              
367             { /* check for unbalanced partitions, same as pdqsort */
368 8           size_t l_size = p - lo + 1;
369 8           size_t r_size = hi - (p+1) + 1;
370 8 50         bool highly_unbalanced = l_size < size / 8 || r_size < size / 8;
    50          
371 8 50         if (highly_unbalanced && --badpartsleft <= 0)
    0          
372 0           { heapsort_iv(L+lo, size); return; }
373             }
374              
375 8           _qs_iv(L, lo, p, badpartsleft);
376 8           _qs_iv(L, p+1, hi, badpartsleft);
377             }
378              
379 4001           static void quicksort_uv(UV *L, size_t len) {
380 4001 100         if (len > 1) _qs_uv(L, 0, len-1, log2floor(len));
    50          
381 4001           }
382 41           static void quicksort_iv(IV *L, size_t len) {
383 41 50         if (len > 1) _qs_iv(L, 0, len-1, log2floor(len));
    50          
384 41           }
385              
386              
387             #if USE_QUADSORT
388              
389             #include "quadsortuv.h"
390             void sort_uv_array(UV* L, size_t len) { quadsort_uv(L, len, 0); }
391             void sort_iv_array(IV* L, size_t len) { quadsort_iv(L, len, 0); }
392              
393             #else
394              
395 4002           void sort_uv_array(UV* L, size_t len)
396             {
397 4002 100         if (len < 800) {
398 4001           quicksort_uv(L, len);
399             } else {
400             /* We could use an in-place radix sort like Ska Sort. Our radix sort
401             * is traditional and uses O(n) extra memory. If we cannot get the
402             * extra memory, we fall back to an in-place sort. */
403 1 50         if (!radixsort_uv(L, len))
404 0           quicksort_uv(L, len);
405             }
406 4002           }
407              
408 41           void sort_iv_array(IV* L, size_t len)
409             {
410 41 50         if (len < 800) {
411 41           quicksort_iv(L, len);
412             } else {
413 0 0         if (!radixsort_iv(L, len)) /* radixsort could fail aux allocation */
414 0           quicksort_iv(L, len);
415             }
416 41           }
417              
418             #endif
419              
420             /******************************************************************************/
421              
422 68           void sort_dedup_uv_array(UV* L, bool data_is_signed, size_t *len)
423             {
424 68 50         if (*len > 1) {
425             size_t i, j;
426 68 100         if (data_is_signed) sort_iv_array((IV *)L, *len);
427 41           else sort_uv_array(L, *len);
428 397 100         for (i=0, j=1; j < *len; j++) {
429 329           i += (L[i] != L[j]);
430 329           L[i] = L[j];
431             }
432 68           *len = i+1;
433             }
434 68           }
435