File Coverage

sphash.h

Criterion	Covered	Total	%
statement	637	765	83.2
branch	470	708	66.3
condition			n/a
subroutine			n/a
pod			n/a
total	1107	1473	75.1

line	stmt	bran	code
1			/*
2			* sphash.h -- Shared-memory spatial hash index for Linux
3			*
4			* Unbounded sparse spatial hash. Points are bucketed by integer cell
5			* coordinates (floor(pos / cell_size)); cells map to hash buckets via XXH3.
6			* Entries live in a fixed pool allocated from a free-list, chained into
7			* per-bucket singly/doubly linked lists. A write-preferring futex rwlock
8			* with reader-slot dead-process recovery guards mutations.
9			*
10			* Entry: double pos[3]; int64_t value; uint32_t next, prev
11			* Layout: Header -> reader_slots[1024] -> buckets -> bitmap -> entries
12			*/
13
14			#ifndef SPHASH_H
15			#define SPHASH_H
16
17			#include
18			#include
19			#include
20			#include
21			#include
22			#include
23			#include
24			#include
25			#include
26			#include
27			#include
28			#include
29			#include
30			#include
31			#include
32			#include
33			#include
34			#include
35
36			#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
37			#error "sphash.h: requires little-endian architecture"
38			#endif
39
40			#define XXH_INLINE_ALL
41			#include "xxhash.h"
42
43			/* ================================================================
44			* Constants
45			* ================================================================ */
46
47			#define SPH_MAGIC 0x53504831U /* "SPH1" */
48			#define SPH_VERSION 1
49			#define SPH_NONE UINT32_MAX
50			#define SPH_ERR_BUFLEN 256
51			#define SPH_READER_SLOTS 1024 /* max concurrent reader processes for dead-process recovery */
52
53			#define SPH_MAX_QUERY_CELLS (1u << 26) /* ~67M cell ceiling per query; raise + rebuild if you genuinely need larger regions */
54			/* query function return codes */
55			#define SPH_Q_OK 1
56			#define SPH_Q_OOM 0
57			#define SPH_Q_TOOBIG (-1)
58
59			#define SPH_ERR(fmt, ...) do { if (errbuf) snprintf(errbuf, SPH_ERR_BUFLEN, fmt, ##__VA_ARGS__); } while (0)
60
61			/* ================================================================
62			* Structs
63			* ================================================================ */
64
65			typedef struct {
66			double pos[3]; /* 0 */
67			int64_t value; /* 24 */
68			double radius; /* 32 per-entry interaction radius (0 = point) */
69			uint32_t next; /* 40 */
70			uint32_t prev; /* 44 */
71			} SpatialEntry; /* 48 */
72
73			/* Per-process slot for dead-process recovery. Each shared rwlock counter
74			* (the main rwlock-reader count, rwlock_waiters, rwlock_writers_waiting)
75			* is mirrored here so a wrlock timeout can attribute and reverse a dead
76			* process's contribution instead of waiting for the slow per-op timeout
77			* drain. */
78			typedef struct {
79			uint32_t pid; /* 0 = unclaimed */
80			uint32_t subcount; /* in-flight rdlock acquisitions for this process */
81			uint32_t waiters_parked; /* contribution to hdr->rwlock_waiters */
82			uint32_t writers_parked; /* contribution to hdr->rwlock_writers_waiting */
83			} SphReaderSlot;
84
85			struct SphHeader {
86			uint32_t magic, version; /* 0,4 */
87			uint32_t max_entries, num_buckets;/* 8,12 */
88			double cell_size; /* 16 */
89			uint32_t count; /* 24 */
90			uint32_t free_head; /* 28 */
91			uint64_t total_size; /* 32 */
92			uint64_t reader_slots_off; /* 40 */
93			uint64_t buckets_off; /* 48 */
94			uint64_t bitmap_off; /* 56 */
95			uint64_t entries_off; /* 64 */
96			uint32_t rwlock; /* 72 */
97			uint32_t rwlock_waiters; /* 76 */
98			uint32_t rwlock_writers_waiting; /* 80 */
99			uint32_t _pad0; /* 84 */
100			uint64_t stat_ops; /* 88 */
101			double world[3]; /* 96 toroidal wrap extents per axis (0 = no wrap) */
102			uint32_t flags; /* 120 bit0 = SPH_FLAG_WRAP */
103			uint32_t _pad0b; /* 124 */
104			double sphere_radius; /* 128 body radius for geo methods (0 = geo disabled) */
105			uint8_t _pad1[120]; /* 136..255 */
106			};
107			typedef struct SphHeader SphHeader;
108
109			#define SPH_FLAG_WRAP 1u
110
111			_Static_assert(sizeof(SpatialEntry) == 48, "SpatialEntry must be 48 bytes");
112			_Static_assert(sizeof(SphHeader) == 256, "SphHeader must be 256 bytes");
113
114			/* ---- Process-local handle ---- */
115
116			typedef struct SpatialHandle {
117			SphHeader *hdr;
118			SphReaderSlot reader_slots; / SPH_READER_SLOTS entries */
119			uint32_t *buckets;
120			uint64_t *bitmap;
121			SpatialEntry *entries;
122			uint32_t bitmap_words;
123			size_t mmap_size;
124			char path; / backing file path (strdup'd) */
125			int notify_fd;
126			int backing_fd; /* memfd fd to close on destroy, -1 otherwise */
127			uint32_t my_slot_idx; /* UINT32_MAX if all slots taken (no recovery for this handle) */
128			uint32_t cached_pid; /* getpid() cached at last slot claim */
129			uint32_t cached_fork_gen; /* sph_fork_gen value at last slot claim */
130			double world[3]; /* cached wrap extents (0 = no wrap on axis) */
131			int64_t wrap_cells[3]; /* ceil(world/cell) per axis, 0 = no wrap */
132			int wrap; /* nonzero if any axis wraps */
133			} SpatialHandle;
134
135			/* ================================================================
136			* Hashing + cell helpers
137			* ================================================================ */
138
139	83		static inline uint32_t sph_next_pow2(uint32_t v) {
140	83	100	if (v < 2) return 1;
141	82		return 1u << (32 - __builtin_clz(v - 1));
142			}
143			/* Largest magnitude a cell index may take. Clamping to +/-2^62 keeps cells far
144			* inside int64 range, so the cell-box and knn-shell loops -- which expand from
145			* center +/- g over a span already capped at SPH_MAX_QUERY_CELLS -- can never
146			* overflow int64. 2^62 is far beyond any coordinate a double represents exactly
147			* (2^53), so nothing reachable is lost. */
148			#define SPH_CELL_LIMIT ((int64_t)1 << 62)
149
150			/* Floor one coordinate to its integer cell. Defined for every double: NaN/Inf
151			* map to cell 0, and the result is clamped to +/-SPH_CELL_LIMIT, both to avoid
152			* the UB of converting an out-of-range double to int64_t and to bound the
153			* query loops (see SPH_CELL_LIMIT). */
154	2753844		static inline int64_t sph_floor_cell(double v, double cs) {
155	2753844		double d = floor(v / cs);
156	2753844	100	if (!isfinite(d)) return 0;
157	2753840	100	if (d >= (double)SPH_CELL_LIMIT) return SPH_CELL_LIMIT;
158	2753832	100	if (d <= -(double)SPH_CELL_LIMIT) return -SPH_CELL_LIMIT;
159	2753831		return (int64_t)d;
160			}
161			/* positive modulo of a cell index into [0, n); n <= 0 means "no wrap on this axis" */
162	34033		static inline int64_t sph_wrap_cell(int64_t c, int64_t n) {
163	34033	100	if (n <= 0) return c;
164	30707		int64_t m = c % n;
165	30707	100	return m < 0 ? m + n : m;
166			}
167			/* shortest per-axis separation; with wrap, the minimum-image (toroidal) distance */
168	56105		static inline double sph_axis_delta(double a, double b, double world) {
169	56105		double d = fabs(a - b);
170	56105	100	return (world > 0.0 && d > world * 0.5) ? world - d : d;
		100
171			}
172			/* raw integer cell of a position (no wrap) -- used for box-corner span math */
173	917948		static inline void sph_cell_raw(const SpatialHandle *h, const double pos[3], int64_t cell[3]) {
174	917948		double cs = h->hdr->cell_size;
175	917948		cell[0] = sph_floor_cell(pos[0], cs);
176	917948		cell[1] = sph_floor_cell(pos[1], cs);
177	917948		cell[2] = sph_floor_cell(pos[2], cs);
178	917948		}
179			/* storage cell: the raw cell wrapped into [0,nx) per axis when toroidal, so a
180			point near x=0 and one near x=world land in adjacent cells (0 and nx-1) */
181	915874		static inline void sph_cell_of(const SpatialHandle *h, const double pos[3], int64_t cell[3]) {
182	915874		sph_cell_raw(h, pos, cell);
183	915874	100	if (h->wrap) {
184	5287		cell[0] = sph_wrap_cell(cell[0], h->wrap_cells[0]);
185	5287		cell[1] = sph_wrap_cell(cell[1], h->wrap_cells[1]);
186	5287		cell[2] = sph_wrap_cell(cell[2], h->wrap_cells[2]);
187			}
188	915874		}
189	2014540		static inline uint32_t sph_bucket_of_cell(const SpatialHandle *h, const int64_t cell[3]) {
190	2014540		return (uint32_t)(XXH3_64bits(cell, 3 * sizeof(int64_t)) & (h->hdr->num_buckets - 1));
191			}
192	901830		static inline int sph_cell_eq(const int64_t a[3], const int64_t b[3]) {
193	901830	100	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2];
		100
		100
194			}
195
196			/* ================================================================
197			* Futex-based write-preferring read-write lock
198			* with reader-slot dead-process recovery
199			* ================================================================ */
200
201			#define SPH_RWLOCK_SPIN_LIMIT 32
202			#define SPH_LOCK_TIMEOUT_SEC 2 /* FUTEX_WAIT timeout for stale lock detection */
203
204	0		static inline void sph_rwlock_spin_pause(void) {
205			#if defined(__x86_64__) \|\| defined(__i386__)
206	0		__asm__ volatile("pause" ::: "memory");
207			#elif defined(__aarch64__)
208			__asm__ volatile("yield" ::: "memory");
209			#else
210			__asm__ volatile("" ::: "memory");
211			#endif
212	0		}
213
214			/* Extract writer PID from rwlock value (lower 31 bits when write-locked). */
215			#define SPH_RWLOCK_WRITER_BIT 0x80000000U
216			#define SPH_RWLOCK_PID_MASK 0x7FFFFFFFU
217			#define SPH_RWLOCK_WR(pid) (SPH_RWLOCK_WRITER_BIT \| ((uint32_t)(pid) & SPH_RWLOCK_PID_MASK))
218
219			/* Check if a PID is alive. Returns 1 if alive or unknown, 0 if definitely dead. */
220			/* Liveness via kill(pid,0). NOTE: cannot detect PID reuse -- if a dead
221			* lock-holder's PID is recycled to an unrelated live process before recovery
222			* runs, this reports "alive" and that slot's orphaned contribution is not
223			* reclaimed until the recycled process exits. Robust detection would require
224			* a per-slot process-start-time epoch (a header-layout/version change).
225			* Documented under "Crash Safety" in the POD. */
226	0		static inline int sph_pid_alive(uint32_t pid) {
227	0	0	if (pid == 0) return 1; /* no owner recorded, assume alive */
228	0	0	return !(kill((pid_t)pid, 0) == -1 && errno == ESRCH);
		0
229			}
230
231			/* Force-recover a stale write lock left by a dead process.
232			* CAS to OUR pid to hold the lock while fixing shared state, then release.
233			* Using our pid (not a bare WRITER_BIT sentinel) means a subsequent
234			* recovering process can detect and re-recover if we crash mid-recovery. */
235	0		static inline void sph_recover_stale_lock(SpatialHandle *h, uint32_t observed_rwlock) {
236	0		SphHeader *hdr = h->hdr;
237	0		uint32_t mypid = SPH_RWLOCK_WR((uint32_t)getpid());
238	0	0	if (!__atomic_compare_exchange_n(&hdr->rwlock, &observed_rwlock,
239			mypid, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
240	0		return;
241			/* We now hold the write lock as mypid. No additional shared state needs
242			* repair here (this module has no seqlock); just release the lock. */
243	0		__atomic_store_n(&hdr->rwlock, 0, __ATOMIC_RELEASE);
244	0	0	if (__atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
245	0		syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
246			}
247
248			static const struct timespec sph_lock_timeout = { SPH_LOCK_TIMEOUT_SEC, 0 };
249
250			/* Process-global fork-generation counter. Incremented in the pthread_atfork
251			* child callback so every open handle detects a fork transition on the next
252			* lock call without paying a getpid() syscall on the hot path. */
253			static uint32_t sph_fork_gen = 1;
254			static pthread_once_t sph_atfork_once = PTHREAD_ONCE_INIT;
255	0		static void sph_on_fork_child(void) {
256	0		__atomic_add_fetch(&sph_fork_gen, 1, __ATOMIC_RELAXED);
257	0		}
258	17		static void sph_atfork_init(void) {
259	17		pthread_atfork(NULL, NULL, sph_on_fork_child);
260	17		}
261
262			/* Ensure this process owns a reader slot. Called from the lock helpers so
263			* that fork()'d children pick up their own slot lazily instead of sharing
264			* the parent's. Hot-path is a single relaxed load + compare; only on a
265			* fork-generation mismatch do we touch getpid() and scan slots. */
266	12173		static inline void sph_claim_reader_slot(SpatialHandle *h) {
267	12173		uint32_t cur_gen = __atomic_load_n(&sph_fork_gen, __ATOMIC_RELAXED);
268	12173	100	if (__builtin_expect(cur_gen == h->cached_fork_gen && h->my_slot_idx != UINT32_MAX, 1))
		50
269	12114		return;
270			/* Cold path -- register the atfork hook once per process, then claim. */
271	59		pthread_once(&sph_atfork_once, sph_atfork_init);
272			/* Re-read after pthread_once: sph_on_fork_child may have bumped it. */
273	59		cur_gen = __atomic_load_n(&sph_fork_gen, __ATOMIC_RELAXED);
274	59		uint32_t now_pid = (uint32_t)getpid();
275	59		h->cached_pid = now_pid;
276	59		h->cached_fork_gen = cur_gen;
277	59		h->my_slot_idx = UINT32_MAX;
278	59		uint32_t start = now_pid % SPH_READER_SLOTS;
279	67	50	for (uint32_t i = 0; i < SPH_READER_SLOTS; i++) {
280	67		uint32_t s = (start + i) % SPH_READER_SLOTS;
281	67		uint32_t expected = 0;
282	67	100	if (__atomic_compare_exchange_n(&h->reader_slots[s].pid,
283			&expected, now_pid, 0,
284			__ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
285			/* Zero all mirror fields, not just subcount: a SIGKILL'd
286			* predecessor may have left waiters_parked/writers_parked
287			* non-zero, and sph_recover_dead_readers won't drain them
288			* once we own the slot (the CAS expects the dead PID). */
289	59		__atomic_store_n(&h->reader_slots[s].subcount, 0, __ATOMIC_RELAXED);
290	59		__atomic_store_n(&h->reader_slots[s].waiters_parked, 0, __ATOMIC_RELAXED);
291	59		__atomic_store_n(&h->reader_slots[s].writers_parked, 0, __ATOMIC_RELAXED);
292	59		h->my_slot_idx = s;
293	59		return;
294			}
295			}
296			/* Table full -- leave my_slot_idx = UINT32_MAX so we silently skip
297			* tracking for this handle (lock still works; just no recovery). */
298			}
299
300			/* Atomically subtract `sub` from a counter, capped at 0 (never underflows). */
301	0		static inline void sph_atomic_sub_cap(uint32_t *p, uint32_t sub) {
302	0	0	if (!sub) return;
303	0		uint32_t cur = __atomic_load_n(p, __ATOMIC_RELAXED);
304	0		for (;;) {
305	0	0	uint32_t want = (cur > sub) ? cur - sub : 0;
306	0	0	if (__atomic_compare_exchange_n(p, &cur, want,
307			1, __ATOMIC_RELAXED, __ATOMIC_RELAXED))
308	0		return;
309			}
310			}
311
312			/* Try to claim a dead slot (CAS pid -> 0) and drain its parked-waiter
313			* contributions back to the global counters. A no-op if the slot was stolen
314			* by another recoverer or had no waiter contribution to drain.
315			*
316			* Note: subcount/waiters_parked/writers_parked are NOT zeroed here.
317			* Between our CAS and a follow-up store, a new process could claim the
318			* slot and start populating these fields -- our stores would clobber its
319			* state. sph_claim_reader_slot zeros all three on every claim, so
320			* leaving stale values is harmless. */
321	0		static inline void sph_drain_dead_slot(SpatialHandle *h, uint32_t i, uint32_t pid) {
322	0		SphHeader *hdr = h->hdr;
323	0		uint32_t expected = pid;
324			/* ACQ_REL on success: RELEASE publishes pid=0 to other observers;
325			* ACQUIRE syncs us with prior writes from the dead process to
326			* waiters_parked/writers_parked. On weakly-ordered archs (aarch64)
327			* a plain RELAXED load before the CAS could miss those writes;
328			* loading them after the CAS keeps them inside the acquire window. */
329	0	0	if (!__atomic_compare_exchange_n(&h->reader_slots[i].pid, &expected, 0,
330			0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
331	0		return;
332	0		uint32_t wp = __atomic_load_n(&h->reader_slots[i].waiters_parked, __ATOMIC_RELAXED);
333	0		uint32_t writp = __atomic_load_n(&h->reader_slots[i].writers_parked, __ATOMIC_RELAXED);
334	0	0	if (wp) sph_atomic_sub_cap(&hdr->rwlock_waiters, wp);
335	0	0	if (writp) sph_atomic_sub_cap(&hdr->rwlock_writers_waiting, writp);
336			}
337
338			/* Scan reader slots for dead-process recovery.
339			*
340			* For each dead PID with non-zero contributions to the shared rwlock,
341			* rwlock_waiters, or rwlock_writers_waiting counters, drain its share back
342			* out so live processes don't have to wait for the slow per-op timeout
343			* decrement to drain it for them.
344			*
345			* For the main rwlock counter we use the "no live reader holds -> force-
346			* reset to 0" trick (precise) because per-process attribution of the
347			* subcount is racy across the inc-counter-then-inc-subcount window. */
348	0		static inline void sph_recover_dead_readers(SpatialHandle *h) {
349	0	0	if (!h->reader_slots) return;
350	0		SphHeader *hdr = h->hdr;
351	0		int any_live_reader = 0;
352	0		int found_dead_reader = 0;
353
354			/* Pass 1: classify slots. Slots with dead pid and sc == 0 (no rwlock
355			* contribution to lose) are wiped immediately to free the slot for
356			* future claimants and drain any orphan parked-waiter counters. Slots
357			* with dead pid and sc > 0 are left intact in this pass: if force-
358			* reset cannot fire (because a live reader is concurrently present),
359			* wiping the dead slot would lose the only record of its orphan
360			* rwlock contribution and strand writers permanently once the live
361			* reader releases. */
362	0	0	for (uint32_t i = 0; i < SPH_READER_SLOTS; i++) {
363	0		uint32_t pid = __atomic_load_n(&h->reader_slots[i].pid, __ATOMIC_ACQUIRE);
364	0	0	if (pid == 0) continue;
365	0		uint32_t sc = __atomic_load_n(&h->reader_slots[i].subcount, __ATOMIC_RELAXED);
366	0	0	if (sph_pid_alive(pid)) {
367	0	0	if (sc > 0) any_live_reader = 1;
368	0		continue;
369			}
370	0	0	if (sc > 0) { found_dead_reader = 1; continue; }
371	0		sph_drain_dead_slot(h, i, pid);
372			}
373
374			/* Pass 2: only if force-reset will fire. Issue the rwlock force-
375			* reset CAS FIRST, while the window since pass 1's last scan is
376			* still narrow (a handful of instructions, as in the original
377			* single-pass code). A new reader that started rdlock between
378			* pass 1's scan and the CAS will either:
379			* (a) have already CAS'd rwlock from cur to cur+1 -- our CAS then
380			* fails (cur mismatched), recovery yields and a future
381			* cycle retries; or
382			* (b) be still in the subcount-bump phase -- our CAS sees the
383			* stale cur and resets to 0; the new reader's subsequent CAS
384			* rwlock(0 -> 1) succeeds cleanly.
385			* Only after the CAS resolves do we wipe the deferred dead slots,
386			* keeping that work outside the race-sensitive window. */
387	0	0	if (found_dead_reader && !any_live_reader) {
		0
388	0		uint32_t cur = __atomic_load_n(&hdr->rwlock, __ATOMIC_RELAXED);
389	0	0	if (cur > 0 && cur < SPH_RWLOCK_WRITER_BIT) {
		0
390	0	0	if (__atomic_compare_exchange_n(&hdr->rwlock, &cur, 0,
391			0, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
392	0	0	if (__atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
393	0		syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
394			}
395			}
396	0	0	for (uint32_t i = 0; i < SPH_READER_SLOTS; i++) {
397	0		uint32_t pid = __atomic_load_n(&h->reader_slots[i].pid, __ATOMIC_ACQUIRE);
398	0	0	if (pid == 0 \|\| sph_pid_alive(pid)) continue;
		0
399	0		sph_drain_dead_slot(h, i, pid);
400			}
401			}
402			}
403
404			/* Inspect the lock word after a futex-wait timeout. If a dead writer
405			* holds it, force-recover the lock. Otherwise drain dead readers' shares
406			* of the rwlock/waiter counters. Called from rdlock and wrlock ETIMEDOUT
407			* branches -- identical recovery logic in both. */
408	0		static inline void sph_recover_after_timeout(SpatialHandle *h) {
409	0		SphHeader *hdr = h->hdr;
410	0		uint32_t val = __atomic_load_n(&hdr->rwlock, __ATOMIC_RELAXED);
411	0	0	if (val >= SPH_RWLOCK_WRITER_BIT) {
412	0		uint32_t pid = val & SPH_RWLOCK_PID_MASK;
413	0	0	if (!sph_pid_alive(pid))
414	0		sph_recover_stale_lock(h, val);
415			} else {
416	0		sph_recover_dead_readers(h);
417			}
418	0		}
419
420			/* Park/unpark helpers: bump the global waiter counters together with this
421			* process's mirrored slot counters so a wrlock-timeout recovery scan can
422			* attribute and reverse a dead PID's contribution. Kept paired to make
423			* accidental drift between global and per-slot counts impossible. */
424	0		static inline void sph_park_reader(SpatialHandle *h) {
425	0	0	if (h->my_slot_idx != UINT32_MAX)
426	0		__atomic_add_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
427	0		__atomic_add_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
428	0		}
429	0		static inline void sph_unpark_reader(SpatialHandle *h) {
430	0		__atomic_sub_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
431	0	0	if (h->my_slot_idx != UINT32_MAX)
432	0		__atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
433	0		}
434	0		static inline void sph_park_writer(SpatialHandle *h) {
435	0	0	if (h->my_slot_idx != UINT32_MAX) {
436	0		__atomic_add_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
437	0		__atomic_add_fetch(&h->reader_slots[h->my_slot_idx].writers_parked, 1, __ATOMIC_RELAXED);
438			}
439	0		__atomic_add_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
440	0		__atomic_add_fetch(&h->hdr->rwlock_writers_waiting, 1, __ATOMIC_RELAXED);
441	0		}
442	0		static inline void sph_unpark_writer(SpatialHandle *h) {
443	0		__atomic_sub_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
444	0		__atomic_sub_fetch(&h->hdr->rwlock_writers_waiting, 1, __ATOMIC_RELAXED);
445	0	0	if (h->my_slot_idx != UINT32_MAX) {
446	0		__atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
447	0		__atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].writers_parked, 1, __ATOMIC_RELAXED);
448			}
449	0		}
450
451	685		static inline void sph_rwlock_rdlock(SpatialHandle *h) {
452	685		sph_claim_reader_slot(h);
453	685		SphHeader *hdr = h->hdr;
454	685		uint32_t *lock = &hdr->rwlock;
455	685		uint32_t *writers_waiting = &hdr->rwlock_writers_waiting;
456			/* Claim subcount BEFORE bumping the shared rwlock counter. This way
457			* a concurrent writer-side recovery scan that sees our PID alive with
458			* subcount > 0 will (correctly) defer force-reset, even while we are
459			* still spinning trying to win the rwlock CAS. Without this, a reader
460			* killed between rwlock CAS-success and subcount++ would let recovery
461			* force-reset rwlock to 0 underneath us, causing a UINT32_MAX wrap on
462			* our eventual rdunlock dec. */
463	685	50	if (h->my_slot_idx != UINT32_MAX)
464	685		__atomic_add_fetch(&h->reader_slots[h->my_slot_idx].subcount, 1, __ATOMIC_RELAXED);
465	685		for (int spin = 0; ; spin++) {
466	685		uint32_t cur = __atomic_load_n(lock, __ATOMIC_RELAXED);
467			/* Write-preferring: when lock is free (cur==0) and writers are
468			* waiting, yield to let the writer acquire. When readers are
469			* already active (cur>=1), new readers may join freely. */
470	685	50	if (cur > 0 && cur < SPH_RWLOCK_WRITER_BIT) {
		0
471	0	0	if (__atomic_compare_exchange_n(lock, &cur, cur + 1,
472			1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
473	685		return;
474	685	50	} else if (cur == 0 && !__atomic_load_n(writers_waiting, __ATOMIC_RELAXED)) {
		50
475	685	50	if (__atomic_compare_exchange_n(lock, &cur, 1,
476			1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
477	685		return;
478			}
479	0	0	if (__builtin_expect(spin < SPH_RWLOCK_SPIN_LIMIT, 1)) {
480	0		sph_rwlock_spin_pause();
481	0		continue;
482			}
483	0		sph_park_reader(h);
484	0		cur = __atomic_load_n(lock, __ATOMIC_RELAXED);
485			/* Sleep when write-locked OR when yielding to waiting writers */
486	0	0	if (cur >= SPH_RWLOCK_WRITER_BIT \|\| cur == 0) {
		0
487	0		long rc = syscall(SYS_futex, lock, FUTEX_WAIT, cur,
488			&sph_lock_timeout, NULL, 0);
489	0	0	if (rc == -1 && errno == ETIMEDOUT) {
		0
490	0		sph_unpark_reader(h);
491	0		sph_recover_after_timeout(h);
492	0		spin = 0;
493	0		continue;
494			}
495			}
496	0		sph_unpark_reader(h);
497	0		spin = 0;
498			}
499			}
500
501	685		static inline void sph_rwlock_rdunlock(SpatialHandle *h) {
502	685		SphHeader *hdr = h->hdr;
503			/* Release the shared counter BEFORE dropping our subcount so that
504			* "any live PID with subcount > 0" is a reliable in-flight indicator
505			* for the writer-side recovery scan. Inverting these would create a
506			* window where we still own a unit of rwlock but our slot subcount is
507			* 0, letting recovery force-reset rwlock underneath us. */
508	685		uint32_t after = __atomic_sub_fetch(&hdr->rwlock, 1, __ATOMIC_RELEASE);
509	685	50	if (h->my_slot_idx != UINT32_MAX)
510	685		__atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].subcount, 1, __ATOMIC_RELAXED);
511	685	50	if (after == 0 && __atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
		50
512	0		syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
513	685		}
514
515	11488		static inline void sph_rwlock_wrlock(SpatialHandle *h) {
516	11488		sph_claim_reader_slot(h); /* refresh cached_pid across fork */
517	11488		SphHeader *hdr = h->hdr;
518	11488		uint32_t *lock = &hdr->rwlock;
519			/* Encode PID in the rwlock word itself (0x80000000 \| pid) to eliminate
520			* any crash window between acquiring the lock and storing the owner. */
521	11488		uint32_t mypid = SPH_RWLOCK_WR(h->cached_pid);
522	11488		for (int spin = 0; ; spin++) {
523	11488		uint32_t expected = 0;
524	11488	50	if (__atomic_compare_exchange_n(lock, &expected, mypid,
525			1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
526	11488		return;
527	0	0	if (__builtin_expect(spin < SPH_RWLOCK_SPIN_LIMIT, 1)) {
528	0		sph_rwlock_spin_pause();
529	0		continue;
530			}
531	0		sph_park_writer(h);
532	0		uint32_t cur = __atomic_load_n(lock, __ATOMIC_RELAXED);
533	0	0	if (cur != 0) {
534	0		long rc = syscall(SYS_futex, lock, FUTEX_WAIT, cur,
535			&sph_lock_timeout, NULL, 0);
536	0	0	if (rc == -1 && errno == ETIMEDOUT) {
		0
537	0		sph_unpark_writer(h);
538	0		sph_recover_after_timeout(h);
539	0		spin = 0;
540	0		continue;
541			}
542			}
543	0		sph_unpark_writer(h);
544	0		spin = 0;
545			}
546			}
547
548	11488		static inline void sph_rwlock_wrunlock(SpatialHandle *h) {
549	11488		SphHeader *hdr = h->hdr;
550	11488		__atomic_store_n(&hdr->rwlock, 0, __ATOMIC_RELEASE);
551	11488	50	if (__atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
552	0		syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
553	11488		}
554
555			/* ================================================================
556			* Layout math + create / open / destroy
557			*
558			* Layout: Header -> reader_slots[1024] -> buckets -> bitmap -> entries
559			* ================================================================ */
560
561			/* Largest max_entries / num_buckets accepted at create time. 2^30 keeps the
562			* power-of-two rounding (sph_next_pow2) and every byte offset well within
563			* range, and is far beyond any realistic shared-memory map. */
564			#define SPH_MAX_CAPACITY 0x40000000u
565
566			/* Single source of truth for the mmap region layout offsets. */
567			typedef struct { uint64_t reader_slots, buckets, bitmap, entries; } SphLayout;
568
569	173		static inline SphLayout sph_layout(uint32_t max_entries, uint32_t num_buckets) {
570	173		uint32_t bw = (max_entries + 63) / 64;
571			SphLayout L;
572	173		L.reader_slots = sizeof(SphHeader);
573	173		L.buckets = L.reader_slots + (uint64_t)SPH_READER_SLOTS * sizeof(SphReaderSlot);
574	173		L.bitmap = L.buckets + (uint64_t)num_buckets * sizeof(uint32_t);
575	173		L.bitmap = (L.bitmap + 7) & ~(uint64_t)7; /* 8-byte align: bitmap is uint64[] and entries (doubles) follow it */
576	173		L.entries = L.bitmap + (uint64_t)bw * sizeof(uint64_t);
577	173		return L;
578			}
579
580	93		static inline uint64_t sph_total_size(uint32_t max_entries, uint32_t num_buckets) {
581	93		SphLayout L = sph_layout(max_entries, num_buckets);
582	93		return L.entries + (uint64_t)max_entries * sizeof(SpatialEntry);
583			}
584
585	70		static inline void sph_init_header(void *base, uint32_t max_entries, uint32_t num_buckets,
586			double cell_size, const double world[3], double sphere_radius, uint64_t total) {
587	70		SphLayout L = sph_layout(max_entries, num_buckets);
588
589	70		SphHeader hdr = (SphHeader )base;
590	70		memset(base, 0, (size_t)total);
591	70		hdr->magic = SPH_MAGIC;
592	70		hdr->version = SPH_VERSION;
593	70		hdr->max_entries = max_entries;
594	70		hdr->num_buckets = num_buckets;
595	70		hdr->cell_size = cell_size;
596			/* toroidal wrap extents: any axis > 0 enables wrap */
597	70		int wrap = 0;
598	280	100	for (int i = 0; i < 3; i++) {
599	210	100	double w = world ? world[i] : 0.0;
600	210		hdr->world[i] = w;
601	210	100	if (w > 0.0) wrap = 1;
602			}
603	70	100	hdr->flags = wrap ? SPH_FLAG_WRAP : 0;
604	70		hdr->sphere_radius = sphere_radius; /* caller validated finite and >= 0 */
605	70		hdr->count = 0;
606	70		hdr->total_size = total;
607	70		hdr->reader_slots_off = L.reader_slots;
608	70		hdr->buckets_off = L.buckets;
609	70		hdr->bitmap_off = L.bitmap;
610	70		hdr->entries_off = L.entries;
611
612			/* Set every bucket head to SPH_NONE (empty). */
613	70		uint32_t buckets = (uint32_t )((uint8_t *)base + L.buckets);
614	118047	100	for (uint32_t i = 0; i < num_buckets; i++) buckets[i] = SPH_NONE;
615
616			/* Thread the free-list through the entries pool. */
617	70		SpatialEntry entries = (SpatialEntry )((uint8_t *)base + L.entries);
618	89764	100	for (uint32_t i = 0; i < max_entries; i++)
619	89694	100	entries[i].next = (i + 1 < max_entries) ? (i + 1) : SPH_NONE;
620	70		hdr->free_head = 0; /* max_entries >= 1 (validated at create) */
621
622			/* Alloc bitmap left zeroed (all entries free). */
623	70		__atomic_thread_fence(__ATOMIC_SEQ_CST);
624	70		}
625
626	79		static inline SpatialHandle sph_setup(void base, size_t map_size,
627			const char *path, int backing_fd) {
628	79		SphHeader hdr = (SphHeader )base;
629	79		SpatialHandle h = (SpatialHandle )calloc(1, sizeof(SpatialHandle));
630	79	50	if (!h) {
631	0		munmap(base, map_size);
632	0	0	if (backing_fd >= 0) close(backing_fd);
633	0		return NULL;
634			}
635	79		h->hdr = hdr;
636	79		h->reader_slots = (SphReaderSlot )((uint8_t )base + hdr->reader_slots_off);
637	79		h->buckets = (uint32_t )((uint8_t )base + hdr->buckets_off);
638	79		h->bitmap = (uint64_t )((uint8_t )base + hdr->bitmap_off);
639	79		h->entries = (SpatialEntry )((uint8_t )base + hdr->entries_off);
640	79		h->bitmap_words = (hdr->max_entries + 63) / 64;
641	79		h->mmap_size = map_size;
642	79	100	h->path = path ? strdup(path) : NULL;
643	79		h->notify_fd = -1;
644	79		h->backing_fd = backing_fd;
645	79		h->my_slot_idx = UINT32_MAX;
646	79		h->wrap = (hdr->flags & SPH_FLAG_WRAP) ? 1 : 0;
647	316	100	for (int i = 0; i < 3; i++) {
648	237	100	h->world[i] = h->wrap ? hdr->world[i] : 0.0;
649	264	100	h->wrap_cells[i] = (h->wrap && hdr->world[i] > 0.0)
650	264	100	? (int64_t)floor(hdr->world[i] / hdr->cell_size + 0.5) : 0; /* exact: extent is a multiple */
651			}
652	79		return h;
653			}
654
655			/* Validate a mapped header (shared by sph_create reopen and sph_open_fd). */
656	14		static inline int sph_validate_header(const SphHeader *hdr, uint64_t file_size) {
657	14	100	if (hdr->magic != SPH_MAGIC) return 0;
658	13	100	if (hdr->version != SPH_VERSION) return 0;
659	12	50	if (hdr->max_entries == 0 \|\| hdr->num_buckets == 0) return 0;
		50
660	12	50	if (hdr->max_entries > SPH_MAX_CAPACITY \|\| hdr->num_buckets > SPH_MAX_CAPACITY) return 0;
		50
661	12	50	if ((hdr->num_buckets & (hdr->num_buckets - 1)) != 0) return 0; /* power of two */
662	12	50	if (!(hdr->cell_size > 0) \|\| !isfinite(hdr->cell_size)) return 0;
		50
663	12	100	if (hdr->total_size != file_size) return 0;
664	10	50	if (hdr->total_size != sph_total_size(hdr->max_entries, hdr->num_buckets)) return 0;
665
666	10		SphLayout L = sph_layout(hdr->max_entries, hdr->num_buckets);
667	10	50	if (hdr->reader_slots_off != L.reader_slots) return 0;
668	10	50	if (hdr->buckets_off != L.buckets) return 0;
669	10	50	if (hdr->bitmap_off != L.bitmap) return 0;
670	10	50	if (hdr->entries_off != L.entries) return 0;
671	10	50	if (hdr->flags & ~SPH_FLAG_WRAP) return 0; /* no unknown flags */
672	10		{ int w = 0;
673	40	100	for (int i = 0; i < 3; i++) {
674	30	50	if (!isfinite(hdr->world[i]) \|\| hdr->world[i] < 0.0) return 0;
		50
675	30	100	if (hdr->world[i] > 0.0) { w = 1;
676	4		double nx = floor(hdr->world[i] / hdr->cell_size + 0.5);
677	4	50	if (nx < 1.0 \|\| fabs(nx * hdr->cell_size - hdr->world[i]) > 1e-9 * hdr->world[i]) return 0;
		50
678			}
679			}
680	10	50	if (((hdr->flags & SPH_FLAG_WRAP) ? 1 : 0) != w) return 0; /* flag must match extents */
681			}
682	10	50	if (!isfinite(hdr->sphere_radius) \|\| hdr->sphere_radius < 0.0) return 0;
		50
683	10	100	if (hdr->sphere_radius > 0.0 && (hdr->flags & SPH_FLAG_WRAP)) return 0; /* sphere and wrap are mutually exclusive */
		100
684	9		return 1;
685			}
686
687			/* Validate optional wrap extents against cell_size (world may be NULL); each
688			wrapped axis must be a positive multiple of cell_size so cells tile exactly. */
689	87		static inline int sph_validate_world(const double world, double cell_size, char errbuf) {
690	87	100	if (!world) return 1;
691	37	100	for (int i = 0; i < 3; i++) {
692	28	50	if (!(world[i] >= 0.0) \|\| !isfinite(world[i])) { SPH_ERR("world extent must be finite and >= 0"); return 0; }
		50
		0
693	28	100	if (world[i] > 0.0) {
694	21		double nx = floor(world[i] / cell_size + 0.5);
695	21	50	if (nx < 1.0 \|\| fabs(nx * cell_size - world[i]) > 1e-9 * world[i]) {
		100
696	1	50	SPH_ERR("wrap extent must be a positive multiple of cell_size"); return 0;
697			}
698			}
699			}
700	9		return 1;
701			}
702
703			/* ================================================================
704			* Geo helpers (lat/lon/alt <-> xyz); R = body radius (hdr->sphere_radius).
705			* Angles in radians: lat in [-pi/2, pi/2], lon in (-pi, pi].
706			* ================================================================ */
707	1134		static inline void sph_geo_to_xyz(double R, double lat, double lon, double alt, double out[3]) {
708	1134		double r = R + alt;
709	1134		double cl = cos(lat);
710	1134		out[0] = r * cl * cos(lon);
711	1134		out[1] = r * cl * sin(lon);
712	1134		out[2] = r * sin(lat);
713	1134		}
714	306		static inline void sph_geo_of_xyz(double R, const double in[3], double lat, double lon, double *alt) {
715	306		double r = sqrt(in[0]in[0] + in[1]in[1] + in[2]*in[2]);
716	306	50	double s = (r > 0.0) ? in[2] / r : 0.0;
717	306	50	if (s > 1.0) s = 1.0; else if (s < -1.0) s = -1.0; /* guard asin domain vs rounding */
		50
718	306		*lat = asin(s);
719	306		lon = atan2(in[1], in[0]); / 0 at the poles */
720	306		*alt = r - R;
721	306		}
722
723			/* ================================================================
724			* Cube-sphere cell IDs (stateless; a tiny S2-like scheme).
725			* Equal-angle (tangent-warp) projection of a direction onto one of 6 cube
726			* faces, row-major i,j at a level. Packed id: level(5) face(3) i(24) j(24).
727			* Faces: 0=+X 1=-X 2=+Y 3=-Y 4=+Z 5=-Z. Per face the major axis carries the
728			* sign and the other two coords are (s,t); reconstruction is d=(face basis).
729			* ================================================================ */
730			#define SPH_CUBE_MAX_LEVEL 24
731			#define SPH_PI_4 0.78539816339744830961 /* pi/4 */
732
733	198253		static inline int sph_cube_level(uint64_t id) { return (int)((id >> 51) & 0x1Fu); }
734	178249		static inline int sph_cube_face (uint64_t id) { return (int)((id >> 48) & 0x7u); }
735
736			/* well-formed id: no stray high bits, level <= MAX, face < 6, i,j < 2^level */
737	99132		static inline int sph_cube_valid(uint64_t id) {
738	99132	100	if (id >> 56) return 0;
739	99127		int level = sph_cube_level(id);
740	99127	100	if (level > SPH_CUBE_MAX_LEVEL) return 0;
741	99126	50	if (sph_cube_face(id) > 5) return 0;
742	99126		uint64_t N = (uint64_t)1 << level;
743	99126		uint64_t i = (id >> 24) & 0xFFFFFFu, j = id & 0xFFFFFFu;
744	99126	50	return (i < N && j < N);
		50
745			}
746
747			/* direction (need not be unit) -> cell id at level [0, SPH_CUBE_MAX_LEVEL] */
748	162087		static inline uint64_t sph_cube_cell(const double dir[3], int level) {
749	162087		double x = dir[0], y = dir[1], z = dir[2];
750	162087		double ax = fabs(x), ay = fabs(y), az = fabs(z);
751			int face; double mag, s, t;
752	162087	100	if (ax >= ay && ax >= az) { mag = ax; face = (x >= 0) ? 0 : 1; s = y / mag; t = z / mag; }
		100
753	108374	100	else if (ay >= az) { mag = ay; face = (y >= 0) ? 2 : 3; s = x / mag; t = z / mag; }
		100
754	54118	100	else { mag = az; face = (z >= 0) ? 4 : 5; s = x / mag; t = y / mag; }
755	162087		double u = atan(s) / SPH_PI_4; /* equal-angle warp to [-1,1] */
756	162087		double v = atan(t) / SPH_PI_4;
757	162087	100	if (!isfinite(u)) u = 0.0; /* dir==0 -> s,t NaN; keep defined */
758	162087	100	if (!isfinite(v)) v = 0.0;
759	162087		int64_t N = (int64_t)1 << level;
760	162087		int64_t i = (int64_t)floor((u + 1.0) * 0.5 * (double)N);
761	162087		int64_t j = (int64_t)floor((v + 1.0) * 0.5 * (double)N);
762	162087	50	if (i < 0) i = 0; else if (i >= N) i = N - 1;
		50
763	162087	50	if (j < 0) j = 0; else if (j >= N) j = N - 1;
		50
764	162087		return ((uint64_t)level << 51) \| ((uint64_t)face << 48)
765	162087		\| ((uint64_t)i << 24) \| (uint64_t)j;
766			}
767
768			/* reconstruct a face-basis direction from (face, s, t): the major axis carries
769			the face sign, the two minor coords are (s, t). Inverse of the face/s/t split
770			in sph_cube_cell; shared by sph_cube_center and sph_cube_neighbors. */
771	159483		static inline void sph_face_dir(int face, double s, double t, double d[3]) {
772	159483		switch (face) {
773	26550		case 0: d[0]= 1; d[1]= s; d[2]= t; break;
774	26123		case 1: d[0]=-1; d[1]= s; d[2]= t; break;
775	27206		case 2: d[0]= s; d[1]= 1; d[2]= t; break;
776	26193		case 3: d[0]= s; d[1]=-1; d[2]= t; break;
777	26642		case 4: d[0]= s; d[1]= t; d[2]= 1; break;
778	26769		default: d[0]= s; d[1]= t; d[2]=-1; break;
779			}
780	159483		}
781
782			/* cell id -> its centre as a unit direction */
783	39003		static inline void sph_cube_center(uint64_t id, double out[3]) {
784	39003		int level = sph_cube_level(id), face = sph_cube_face(id);
785	39003		int64_t N = (int64_t)1 << level;
786	39003		int64_t i = (int64_t)((id >> 24) & 0xFFFFFFu), j = (int64_t)(id & 0xFFFFFFu);
787	39003		double u = ((double)i + 0.5) / (double)N * 2.0 - 1.0;
788	39003		double v = ((double)j + 0.5) / (double)N * 2.0 - 1.0;
789	39003		double s = tan(u * SPH_PI_4), t = tan(v * SPH_PI_4);
790			double d[3];
791	39003		sph_face_dir(face, s, t, d);
792	39003		double n = sqrt(d[0]d[0] + d[1]d[1] + d[2]*d[2]);
793	39003		out[0] = d[0]/n; out[1] = d[1]/n; out[2] = d[2]/n;
794	39003		}
795
796			/* parent at level-1 (returns 0 at level 0); children at level+1 (0 at MAX) */
797	8001		static inline int sph_cube_parent(uint64_t id, uint64_t *out) {
798	8001		int level = sph_cube_level(id);
799	8001	100	if (level == 0) return 0;
800	8000		int64_t i = (int64_t)((id >> 24) & 0xFFFFFFu), j = (int64_t)(id & 0xFFFFFFu);
801	8000		*out = ((uint64_t)(level - 1) << 51) \| ((uint64_t)sph_cube_face(id) << 48)
802	8000		\| ((uint64_t)(i >> 1) << 24) \| (uint64_t)(j >> 1);
803	8000		return 1;
804			}
805	2001		static inline int sph_cube_children(uint64_t id, uint64_t out[4]) {
806	2001		int level = sph_cube_level(id);
807	2001	100	if (level >= SPH_CUBE_MAX_LEVEL) return 0;
808	2000		uint64_t face = (uint64_t)sph_cube_face(id);
809	2000		int64_t i = (int64_t)((id >> 24) & 0xFFFFFFu), j = (int64_t)(id & 0xFFFFFFu);
810	2000		int k = 0;
811	14000	100	for (int di = 0; di < 2; di++) for (int dj = 0; dj < 2; dj++)
		100
812	8000		out[k++] = ((uint64_t)(level + 1) << 51) \| (face << 48)
813	8000		\| ((uint64_t)(2i + di) << 24) \| (uint64_t)(2j + dj);
814	2000		return 1;
815			}
816
817			/* 4 edge-adjacent neighbors (seam-aware) by perturb-and-reproject: step one
818			cell width from the centre along +-u / +-v in THIS face's basis, rebuild the
819			direction (\|coord\| may exceed 1, crossing to the adjacent face), and re-run
820			sph_cube_cell. The major-axis flip handles the seam + orientation; a full
821			cell-width step lands at the neighbour's centre, so it is rounding-robust. */
822	30120		static inline void sph_cube_neighbors(uint64_t id, uint64_t out[4]) {
823	30120		int level = sph_cube_level(id), face = sph_cube_face(id);
824	30120		int64_t N = (int64_t)1 << level;
825	30120		int64_t i = (int64_t)((id >> 24) & 0xFFFFFFu), j = (int64_t)(id & 0xFFFFFFu);
826	30120		double uc = ((double)i + 0.5) / (double)N * 2.0 - 1.0;
827	30120		double vc = ((double)j + 0.5) / (double)N * 2.0 - 1.0;
828	30120		double step = 2.0 / (double)N;
829			static const double du[4] = { +1.0, -1.0, 0.0, 0.0 };
830			static const double dv[4] = { 0.0, 0.0, +1.0, -1.0 };
831	150600	100	for (int k = 0; k < 4; k++) {
832	120480		double s = tan((uc + du[k]step) SPH_PI_4);
833	120480		double t = tan((vc + dv[k]step) SPH_PI_4);
834			double d[3];
835	120480		sph_face_dir(face, s, t, d);
836	120480		out[k] = sph_cube_cell(d, level);
837			}
838	30120		}
839
840			/* validate + normalize create() args (shared by sph_create + sph_create_memfd);
841			on success rounds num_buckets up to a power of two /
842	89		static int sph_validate_create_args(uint32_t max_entries, uint32_t *num_buckets,
843			double cell_size, const double *world,
844			double sphere_radius, char *errbuf) {
845	89	50	if (errbuf) errbuf[0] = '\0';
846	89	100	if (!(cell_size > 0) \|\| !isfinite(cell_size)) { SPH_ERR("cell_size must be a finite number > 0"); return 0; }
		50
		50
847	88	100	if (max_entries == 0) { SPH_ERR("max_entries must be > 0"); return 0; }
		50
848	87	100	if (!sph_validate_world(world, cell_size, errbuf)) return 0;
849	86	50	if (!(sphere_radius >= 0.0) \|\| !isfinite(sphere_radius)) { SPH_ERR("sphere radius must be finite and >= 0"); return 0; }
		50
		0
850	86	100	if (sphere_radius > 0.0 && world && (world[0] > 0.0 \|\| world[1] > 0.0 \|\| world[2] > 0.0)) { SPH_ERR("sphere and wrap are mutually exclusive"); return 0; }
		100
		50
		0
		0
		50
851	85	100	if (max_entries > SPH_MAX_CAPACITY) { SPH_ERR("max_entries too large (max %u)", SPH_MAX_CAPACITY); return 0; }
		50
852	84	100	if (*num_buckets > SPH_MAX_CAPACITY) { SPH_ERR("num_buckets too large (max %u)", SPH_MAX_CAPACITY); return 0; }
		50
853	83	100	num_buckets = sph_next_pow2(num_buckets == 0 ? max_entries : *num_buckets);
854	83		return 1;
855			}
856
857	87		static SpatialHandle sph_create(const char path, uint32_t max_entries,
858			uint32_t num_buckets, double cell_size,
859			const double world, double sphere_radius, char errbuf) {
860	87	100	if (!sph_validate_create_args(max_entries, &num_buckets, cell_size, world, sphere_radius, errbuf)) return NULL;
861
862	81		uint64_t total = sph_total_size(max_entries, num_buckets);
863	81		int anonymous = (path == NULL);
864	81		int fd = -1;
865			size_t map_size;
866			void *base;
867
868	81	100	if (anonymous) {
869	57		map_size = (size_t)total;
870	57		base = mmap(NULL, map_size, PROT_READ\|PROT_WRITE, MAP_SHARED\|MAP_ANONYMOUS, -1, 0);
871	57	50	if (base == MAP_FAILED) { SPH_ERR("mmap: %s", strerror(errno)); return NULL; }
		0
872			} else {
873	24		fd = open(path, O_RDWR\|O_CREAT, 0666);
874	37	50	if (fd < 0) { SPH_ERR("open: %s", strerror(errno)); return NULL; }
		0
875	24	50	if (flock(fd, LOCK_EX) < 0) { SPH_ERR("flock: %s", strerror(errno)); close(fd); return NULL; }
		0
876			struct stat st;
877	24	50	if (fstat(fd, &st) < 0) { SPH_ERR("fstat: %s", strerror(errno)); flock(fd, LOCK_UN); close(fd); return NULL; }
		0
878	24		int is_new = (st.st_size == 0);
879	24	100	if (!is_new && (uint64_t)st.st_size < sizeof(SphHeader)) {
		100
880	1	50	SPH_ERR("%s: file too small (%lld)", path, (long long)st.st_size);
881	1		flock(fd, LOCK_UN); close(fd); return NULL;
882			}
883	23	100	if (is_new && ftruncate(fd, (off_t)total) < 0) {
		50
884	0	0	SPH_ERR("ftruncate: %s", strerror(errno)); flock(fd, LOCK_UN); close(fd); return NULL;
885			}
886	23	100	map_size = is_new ? (size_t)total : (size_t)st.st_size;
887	23		base = mmap(NULL, map_size, PROT_READ\|PROT_WRITE, MAP_SHARED, fd, 0);
888	23	50	if (base == MAP_FAILED) { SPH_ERR("mmap: %s", strerror(errno)); flock(fd, LOCK_UN); close(fd); return NULL; }
		0
889	23	100	if (!is_new) {
890	12	100	if (!sph_validate_header((SphHeader *)base, (uint64_t)st.st_size)) {
891	5	50	SPH_ERR("invalid spatial hash file"); munmap(base, map_size); flock(fd, LOCK_UN); close(fd); return NULL;
892			}
893	7		flock(fd, LOCK_UN); close(fd);
894	7		return sph_setup(base, map_size, path, -1);
895			}
896			}
897	68		sph_init_header(base, max_entries, num_buckets, cell_size, world, sphere_radius, total);
898	68	100	if (fd >= 0) { flock(fd, LOCK_UN); close(fd); }
899	68		return sph_setup(base, map_size, path, -1);
900			}
901
902	2		static SpatialHandle sph_create_memfd(const char name, uint32_t max_entries,
903			uint32_t num_buckets, double cell_size,
904			const double world, double sphere_radius, char errbuf) {
905	2	50	if (!sph_validate_create_args(max_entries, &num_buckets, cell_size, world, sphere_radius, errbuf)) return NULL;
906
907	2		uint64_t total = sph_total_size(max_entries, num_buckets);
908	2	50	int fd = memfd_create(name ? name : "sphash", MFD_CLOEXEC \| MFD_ALLOW_SEALING);
909	2	50	if (fd < 0) { SPH_ERR("memfd_create: %s", strerror(errno)); return NULL; }
		0
910	2	50	if (ftruncate(fd, (off_t)total) < 0) {
911	0	0	SPH_ERR("ftruncate: %s", strerror(errno)); close(fd); return NULL;
912			}
913	2		(void)fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK \| F_SEAL_GROW);
914	2		void *base = mmap(NULL, (size_t)total, PROT_READ\|PROT_WRITE, MAP_SHARED, fd, 0);
915	2	50	if (base == MAP_FAILED) { SPH_ERR("mmap: %s", strerror(errno)); close(fd); return NULL; }
		0
916	2		sph_init_header(base, max_entries, num_buckets, cell_size, world, sphere_radius, total);
917	2		return sph_setup(base, (size_t)total, NULL, fd);
918			}
919
920	2		static SpatialHandle sph_open_fd(int fd, char errbuf) {
921	2	50	if (errbuf) errbuf[0] = '\0';
922			struct stat st;
923	2	50	if (fstat(fd, &st) < 0) { SPH_ERR("fstat: %s", strerror(errno)); return NULL; }
		0
924	2	50	if ((uint64_t)st.st_size < sizeof(SphHeader)) { SPH_ERR("too small"); return NULL; }
		0
925	2		size_t ms = (size_t)st.st_size;
926	2		void *base = mmap(NULL, ms, PROT_READ\|PROT_WRITE, MAP_SHARED, fd, 0);
927	2	50	if (base == MAP_FAILED) { SPH_ERR("mmap: %s", strerror(errno)); return NULL; }
		0
928	2	50	if (!sph_validate_header((SphHeader *)base, (uint64_t)st.st_size)) {
929	0	0	SPH_ERR("invalid spatial hash"); munmap(base, ms); return NULL;
930			}
931	2		int myfd = fcntl(fd, F_DUPFD_CLOEXEC, 0);
932	2	50	if (myfd < 0) { SPH_ERR("fcntl: %s", strerror(errno)); munmap(base, ms); return NULL; }
		0
933	2		return sph_setup(base, ms, NULL, myfd);
934			}
935
936	79		static void sph_destroy(SpatialHandle *h) {
937	79	50	if (!h) return;
938	79	100	if (h->notify_fd >= 0) close(h->notify_fd);
939	79	100	if (h->backing_fd >= 0) close(h->backing_fd);
940	79	50	if (h->hdr) munmap(h->hdr, h->mmap_size);
941	79		free(h->path);
942	79		free(h);
943			}
944
945	9		static inline int sph_msync(SpatialHandle *h) {
946	9	50	if (!h \|\| !h->hdr) return 0;
		50
947	9		return msync(h->hdr, h->mmap_size, MS_SYNC);
948			}
949
950	2		static int sph_create_eventfd(SpatialHandle *h) {
951	2	100	if (h->notify_fd >= 0) return h->notify_fd;
952	1		int efd = eventfd(0, EFD_NONBLOCK\|EFD_CLOEXEC);
953	1	50	if (efd < 0) return -1;
954	1		h->notify_fd = efd;
955	1		return efd;
956			}
957
958	2		static int sph_notify(SpatialHandle *h) {
959	2	50	if (h->notify_fd < 0) return 0;
960	2		uint64_t v = 1;
961	2		return write(h->notify_fd, &v, sizeof(v)) == sizeof(v);
962			}
963
964	3		static int64_t sph_eventfd_consume(SpatialHandle *h) {
965	3	100	if (h->notify_fd < 0) return -1;
966	2		uint64_t v = 0;
967	2	100	if (read(h->notify_fd, &v, sizeof(v)) != sizeof(v)) return -1;
968	1		return (int64_t)v;
969			}
970
971			/* ================================================================
972			* Slot pool: alloc / free / liveness
973			* ================================================================ */
974
975	34439		static inline int sph_is_live(const SpatialHandle *h, uint32_t idx) {
976	34439	100	if (idx >= h->hdr->max_entries) return 0;
977	34435		uint64_t w = h->bitmap[idx / 64];
978	34435		return (w >> (idx % 64)) & 1;
979			}
980
981	9906		static inline uint32_t sph_alloc_slot(SpatialHandle *h) {
982	9906		uint32_t idx = h->hdr->free_head;
983	9906	100	if (idx == SPH_NONE) return SPH_NONE;
984	9902		h->hdr->free_head = h->entries[idx].next; /* pop free-list */
985	9902		h->bitmap[idx / 64] \|= (uint64_t)1 << (idx % 64);
986	9902		h->entries[idx].next = SPH_NONE;
987	9902		h->entries[idx].prev = SPH_NONE;
988	9902		return idx;
989			}
990
991	462		static inline void sph_free_slot(SpatialHandle *h, uint32_t idx) {
992	462		h->bitmap[idx / 64] &= ~((uint64_t)1 << (idx % 64));
993	462		h->entries[idx].next = h->hdr->free_head; /* push free-list */
994	462		h->hdr->free_head = idx;
995	462		}
996
997			/* ================================================================
998			* Bucket chain: link / unlink (callers hold write lock)
999			* ================================================================ */
1000
1001	10565		static inline void sph_bucket_link(SpatialHandle *h, uint32_t idx) {
1002	10565		int64_t cell[3]; sph_cell_of(h, h->entries[idx].pos, cell);
1003	10565		uint32_t b = sph_bucket_of_cell(h, cell);
1004	10565		uint32_t head = h->buckets[b];
1005	10565		h->entries[idx].prev = SPH_NONE;
1006	10565		h->entries[idx].next = head;
1007	10565	100	if (head != SPH_NONE) h->entries[head].prev = idx;
1008	10565		h->buckets[b] = idx;
1009	10565		}
1010
1011	1125		static inline void sph_bucket_unlink(SpatialHandle *h, uint32_t idx) {
1012	1125		int64_t cell[3]; sph_cell_of(h, h->entries[idx].pos, cell);
1013	1125		uint32_t b = sph_bucket_of_cell(h, cell);
1014	1125		uint32_t p = h->entries[idx].prev, n = h->entries[idx].next;
1015	1125	100	if (p != SPH_NONE) h->entries[p].next = n; else h->buckets[b] = n;
1016	1125	100	if (n != SPH_NONE) h->entries[n].prev = p;
1017	1125		}
1018
1019			/* ================================================================
1020			* Mutation ops (callers hold write lock)
1021			* ================================================================ */
1022
1023	9906		static inline uint32_t sph_insert_locked(SpatialHandle *h, double x, double y, double z,
1024			int64_t value, double radius) {
1025	9906		uint32_t idx = sph_alloc_slot(h);
1026	9906	100	if (idx == SPH_NONE) return SPH_NONE;
1027	9902		h->entries[idx].pos[0] = x; h->entries[idx].pos[1] = y; h->entries[idx].pos[2] = z;
1028	9902		h->entries[idx].value = value;
1029	9902		h->entries[idx].radius = radius;
1030	9902		sph_bucket_link(h, idx);
1031	9902		h->hdr->count++;
1032	9902		return idx;
1033			}
1034
1035	463		static inline int sph_remove_locked(SpatialHandle *h, uint32_t idx) {
1036	463	100	if (!sph_is_live(h, idx)) return 0;
1037	462		sph_bucket_unlink(h, idx);
1038	462		sph_free_slot(h, idx);
1039	462		h->hdr->count--;
1040	462		return 1;
1041			}
1042
1043	667		static inline int sph_move_locked(SpatialHandle *h, uint32_t idx, double x, double y, double z) {
1044	667	100	if (!sph_is_live(h, idx)) return 0;
1045			int64_t oc[3], nc[3];
1046	664		sph_cell_of(h, h->entries[idx].pos, oc);
1047	664		double np[3] = { x, y, z }; sph_cell_of(h, np, nc);
1048	664	100	if (sph_cell_eq(oc, nc)) { /* same cell: just rewrite pos */
1049	1		h->entries[idx].pos[0] = x; h->entries[idx].pos[1] = y; h->entries[idx].pos[2] = z;
1050	1		return 1;
1051			}
1052	663		sph_bucket_unlink(h, idx); /* unlink uses OLD pos */
1053	663		h->entries[idx].pos[0] = x; h->entries[idx].pos[1] = y; h->entries[idx].pos[2] = z;
1054	663		sph_bucket_link(h, idx); /* link uses NEW pos */
1055	663		return 1;
1056			}
1057
1058			/* ================================================================
1059			* Region query: collector + walk helpers
1060			* ================================================================ */
1061
1062			typedef struct { int64_t *vals; size_t n, cap; } sph_collect_t;
1063
1064	13780		static int sph_collect_push(sph_collect_t *c, int64_t v) {
1065	13780	100	if (c->n == c->cap) {
1066	341	100	size_t nc = c->cap ? c->cap * 2 : 64;
1067	341		int64_t nv = (int64_t )realloc(c->vals, nc * sizeof(int64_t));
1068	341	50	if (!nv) return 0;
1069	341		c->vals = nv; c->cap = nc;
1070			}
1071	13780		c->vals[c->n++] = v;
1072	13780		return 1;
1073			}
1074
1075			/* squared distance from a to b over `dims` axes; minimum-image when toroidal */
1076	24546		static inline double sph_dist2_w(const double world[3], const double a[3], const double b[3], int dims) {
1077	24546		double dx = sph_axis_delta(a[0], b[0], world[0]);
1078	24546		double dy = sph_axis_delta(a[1], b[1], world[1]);
1079	24546		double d = dxdx + dydy;
1080	24546	100	if (dims == 3) { double dz = sph_axis_delta(a[2], b[2], world[2]); d += dz*dz; }
1081	24546		return d;
1082			}
1083	18789		static inline double sph_dist2(const SpatialHandle *h, const double a[3], const double b[3], int dims) {
1084	18789		return sph_dist2_w(h->world, a, b, dims);
1085			}
1086
1087			/* Walk one cell C's bucket chain; for each entry whose recomputed cell == C
1088			and which passes `accept`, push its value. Returns 1 ok, 0 OOM. */
1089	587347		static int sph_walk_cell(SpatialHandle h, const int64_t C[3], sph_collect_t out,
1090			int (accept)(const double pos[3], void ctx), void *ctx) {
1091	587347		uint32_t b = sph_bucket_of_cell(h, C);
1092	775932	100	for (uint32_t idx = h->buckets[b]; idx != SPH_NONE; idx = h->entries[idx].next) {
1093	188585		int64_t ec[3]; sph_cell_of(h, h->entries[idx].pos, ec);
1094	190376	100	if (!sph_cell_eq(ec, C)) continue; /* collision / dedup guard */
1095	5982	100	if (accept && !accept(h->entries[idx].pos, ctx)) continue;
		100
1096	4191	50	if (!sph_collect_push(out, h->entries[idx].value)) return 0;
1097			}
1098	587347		return 1;
1099			}
1100
1101	17		static int sph_query_cell(SpatialHandle h, const double p[3], int dims, sph_collect_t out) {
1102	17	100	double pp[3] = { p[0], p[1], dims == 3 ? p[2] : 0.0 };
1103	17		int64_t C[3]; sph_cell_of(h, pp, C);
1104	17		return sph_walk_cell(h, C, out, NULL, NULL);
1105			}
1106
1107			struct sph_box { double lo[3], hi[3]; int dims; };
1108			struct sph_disk { double c[3]; double r2; int dims; const double *world; };
1109
1110	201		static int sph_accept_box(const double pos[3], void *ctx) {
1111	201		struct sph_box q = (struct sph_box )ctx;
1112	201	50	if (pos[0] < q->lo[0] \|\| pos[0] > q->hi[0]) return 0;
		100
1113	172	50	if (pos[1] < q->lo[1] \|\| pos[1] > q->hi[1]) return 0;
		100
1114	149	100	if (q->dims == 3 && (pos[2] < q->lo[2] \|\| pos[2] > q->hi[2])) return 0;
		50
		100
1115	133		return 1;
1116			}
1117	5757		static int sph_accept_disk(const double pos[3], void *ctx) {
1118	5757		struct sph_disk q = (struct sph_disk )ctx;
1119	5757		return sph_dist2_w(q->world, pos, q->c, q->dims) <= q->r2;
1120			}
1121
1122			/* enumerate the inclusive cell box [lo..hi] (per axis), z fixed to its single
1123			cell when dims==2, walking each cell with `accept`. */
1124	335		static int sph_enum_cellbox(SpatialHandle *h, const double lo[3], const double hi[3],
1125			int dims, sph_collect_t *out,
1126			int (accept)(const double[3], void ), void *ctx) {
1127			int64_t cl[3], ch[3];
1128	335		sph_cell_raw(h, lo, cl); sph_cell_raw(h, hi, ch); /* raw corners for span math */
1129	335		int64_t cnt[3] = { 1, 1, 1 };
1130	335		uint64_t cells = 1;
1131	1144	100	for (int i = 0; i < dims; i++) {
1132	812	50	if (ch[i] < cl[i]) return SPH_Q_OK; /* empty box */
1133	812		uint64_t span = (uint64_t)ch[i] - (uint64_t)cl[i] + 1; /* exact, since ch>=cl */
1134	812	100	if (h->wrap_cells[i] > 0 && span > (uint64_t)h->wrap_cells[i])
		50
1135	0		span = (uint64_t)h->wrap_cells[i]; /* whole ring, each cell once */
1136	812	100	if (span >= (uint64_t)SPH_MAX_QUERY_CELLS) return SPH_Q_TOOBIG;
1137	809	50	if (cells > (uint64_t)SPH_MAX_QUERY_CELLS / span) return SPH_Q_TOOBIG;
1138	809		cells *= span; cnt[i] = (int64_t)span;
1139			}
1140			int64_t C[3];
1141	6585	100	for (int64_t i0 = 0; i0 < cnt[0]; i0++) {
1142	6253	100	C[0] = h->wrap ? sph_wrap_cell(cl[0] + i0, h->wrap_cells[0]) : cl[0] + i0;
1143	342630	100	for (int64_t i1 = 0; i1 < cnt[1]; i1++) {
1144	336377	100	C[1] = h->wrap ? sph_wrap_cell(cl[1] + i1, h->wrap_cells[1]) : cl[1] + i1;
1145	336377	100	if (dims == 3) {
1146	288653	100	for (int64_t i2 = 0; i2 < cnt[2]; i2++) {
1147	269803	100	C[2] = h->wrap ? sph_wrap_cell(cl[2] + i2, h->wrap_cells[2]) : cl[2] + i2;
1148	269803	50	if (!sph_walk_cell(h, C, out, accept, ctx)) return SPH_Q_OOM;
1149			}
1150			} else {
1151	317527		C[2] = 0;
1152	317527	50	if (!sph_walk_cell(h, C, out, accept, ctx)) return SPH_Q_OOM;
1153			}
1154			}
1155			}
1156	332		return SPH_Q_OK;
1157			}
1158
1159	12		static int sph_query_aabb(SpatialHandle *h, const double lo[3], const double hi[3],
1160			int dims, sph_collect_t *out) {
1161	12	100	struct sph_box q = { { lo[0], lo[1], dims==3?lo[2]:0 }, { hi[0], hi[1], dims==3?hi[2]:0 }, dims };
		100
1162	12		return sph_enum_cellbox(h, q.lo, q.hi, dims, out, sph_accept_box, &q);
1163			}
1164
1165	323		static int sph_query_radius(SpatialHandle h, const double c[3], double r, int dims, sph_collect_t out) {
1166	323	100	struct sph_disk q = { { c[0], c[1], dims==3?c[2]:0 }, r*r, dims, h->world };
1167	323	100	double lo[3] = { c[0]-r, c[1]-r, (dims==3?c[2]-r:0) };
1168	323	100	double hi[3] = { c[0]+r, c[1]+r, (dims==3?c[2]+r:0) };
1169	323		return sph_enum_cellbox(h, lo, hi, dims, out, sph_accept_disk, &q);
1170			}
1171
1172			/* ================================================================
1173			* k-nearest-neighbour query
1174			* ================================================================ */
1175
1176			/* Candidate kept during the search: squared distance + value. */
1177			typedef struct { double d2; int64_t val; } sph_cand_t;
1178
1179			/* Bounded keep-k: maintain a max-heap of size k keyed by d2 so the worst is
1180			at [0]. Insert only if better than current worst once full. */
1181	2371		static void sph_heap_offer(sph_cand_t heap, uint32_t n, uint32_t k, double d2, int64_t val) {
1182	2371	100	if (*n < k) {
1183	293		uint32_t i = (n)++; / sift up */
1184	293		heap[i].d2 = d2; heap[i].val = val;
1185	527	100	while (i) { uint32_t p = (i-1)/2; if (heap[p].d2 >= heap[i].d2) break;
		100
1186	234		sph_cand_t t = heap[p]; heap[p] = heap[i]; heap[i] = t; i = p; }
1187	2078	50	} else if (k && d2 < heap[0].d2) {
		100
1188	177		heap[0].d2 = d2; heap[0].val = val; /* replace worst, sift down */
1189	177		uint32_t i = 0;
1190	581		for (;;) { uint32_t l=2i+1, r=2i+2, m=i;
1191	379	100	if (lheap[m].d2) m=l;
		100
1192	379	100	if (rheap[m].d2) m=r;
		100
1193	379	100	if (m==i) break; sph_cand_t t=heap[m]; heap[m]=heap[i]; heap[i]=t; i=m; }
1194			}
1195	2371		}
1196	439		static int sph_cmp_d2(const void a, const void b) {
1197	439		double x = ((const sph_cand_t)a)->d2, y = ((const sph_cand_t)b)->d2;
1198	439	100	return (x < y) ? -1 : (x > y) ? 1 : 0;
1199			}
1200			/* Walk a single cell, offering live entries (cell-guarded) to the heap.
1201			Returns the number of live, cell-matched entries examined in this cell, so
1202			the caller can terminate once every live entry has been seen (not by a cell
1203			count, which is unbounded since coords are continuous). */
1204	1086357		static uint32_t sph_knn_walk(SpatialHandle *h, const int64_t C[3], const double c[3],
1205			int dims, sph_cand_t heap, uint32_t n, uint32_t k) {
1206	1086357		uint32_t b = sph_bucket_of_cell(h, C);
1207	1086357		uint32_t walked = 0;
1208	1271465	100	for (uint32_t idx = h->buckets[b]; idx != SPH_NONE; idx = h->entries[idx].next) {
1209	185108		int64_t ec[3]; sph_cell_of(h, h->entries[idx].pos, ec);
1210	185108	100	if (!sph_cell_eq(ec, C)) continue;
1211	871		walked++;
1212	871		sph_heap_offer(heap, n, k, sph_dist2(h, h->entries[idx].pos, c, dims), h->entries[idx].value);
1213			}
1214	1086357		return walked;
1215			}
1216	56		static int sph_query_knn(SpatialHandle h, const double c[3], uint32_t k, int dims, sph_collect_t out) {
1217	56	100	double cc[3] = { c[0], c[1], dims==3 ? c[2] : 0 };
1218	56		uint32_t total = h->hdr->count;
1219	56	100	if (k > total) k = total; /* can never return more than exist */
1220	56	100	if (k == 0) return 1; /* empty hash (or k clamped to 0): empty result */
1221	55		sph_cand_t heap = (sph_cand_t )malloc((size_t)k * sizeof(sph_cand_t));
1222	55	50	if (!heap) return 0;
1223	55		uint32_t n = 0; /* candidates currently in the heap */
1224
1225	55	100	if (h->wrap) {
1226			/* Toroidal: expanding shells would re-visit wrapped cells and double-offer
1227			the same entry to the heap. The world is bounded, so scan every live
1228			entry once with the min-image distance -- O(max_entries), fine for a torus. */
1229	4		uint32_t me = h->hdr->max_entries;
1230	15004	100	for (uint32_t idx = 0; idx < me; idx++)
1231	15000	100	if (sph_is_live(h, idx))
1232	1500		sph_heap_offer(heap, &n, k, sph_dist2(h, h->entries[idx].pos, cc, dims),
1233	1500		h->entries[idx].value);
1234			} else {
1235	51		int64_t center[3]; sph_cell_of(h, cc, center);
1236	51		double cs = h->hdr->cell_size;
1237			/* Termination is by how many live entries we have EXAMINED, not by a cell
1238			count: cell distance is unbounded (continuous coords), so a far point can
1239			sit arbitrarily many shells out. Snapshot the population; once seen==total
1240			every live entry has been visited and nothing remains to find. */
1241	51		uint32_t seen = 0; /* live, cell-matched entries examined */
1242	51		uint64_t cells_visited = 0;
1243			/* g < INT32_MAX is only a corruption guard against a bogus count; the real
1244			terminators below (have-k bound, or seen>=total) end the loop normally. */
1245	522	50	for (int64_t g = 0; g < INT32_MAX; g++) {
1246			/* Enumerate the Chebyshev shell at distance g -- its SURFACE only, not
1247			the full (2g+1)^d box, so per-shell work is O(g) in 2D / O(g^2) in 3D.
1248			cells_visited counts every cell walked, so the cap bounds real work. */
1249			#define SPH_KNN_PROCESS(CX, CY, CZ) do { \
1250			if (++cells_visited > (uint64_t)SPH_MAX_QUERY_CELLS) { \
1251			free(heap); return SPH_Q_TOOBIG; \
1252			} \
1253			int64_t C_[3] = { (CX), (CY), (CZ) }; \
1254			seen += sph_knn_walk(h, C_, cc, dims, heap, &n, k); \
1255			} while (0)
1256	522		int64_t cx = center[0], cy = center[1], cz = center[2];
1257	522	100	if (g == 0) {
1258	51	50	SPH_KNN_PROCESS(cx, cy, cz);
1259	471	100	} else if (dims == 2) {
1260	3146	100	for (int64_t dx = -g; dx <= g; dx++) { /* top + bottom rows */
1261	3024	50	SPH_KNN_PROCESS(cx + dx, cy - g, cz);
1262	3024	50	SPH_KNN_PROCESS(cx + dx, cy + g, cz);
1263			}
1264	2902	100	for (int64_t dy = -g + 1; dy <= g - 1; dy++) { /* left + right cols */
1265	2780	50	SPH_KNN_PROCESS(cx - g, cy + dy, cz);
1266	2780	50	SPH_KNN_PROCESS(cx + g, cy + dy, cz);
1267			}
1268			} else {
1269	1047	100	for (int64_t dz = -g; dz <= g; dz += 2 * g) { /* two z caps (full faces) */
1270	14796	100	for (int64_t dx = -g; dx <= g; dx++)
1271	399596	100	for (int64_t dy = -g; dy <= g; dy++)
1272	385498	50	SPH_KNN_PROCESS(cx + dx, cy + dy, cz + dz);
1273			}
1274	6700	100	for (int64_t dz = -g + 1; dz <= g - 1; dz++) { /* middle layers: xy perimeter */
1275	185002	100	for (int64_t dx = -g; dx <= g; dx++) {
1276	178651	50	SPH_KNN_PROCESS(cx + dx, cy - g, cz + dz);
1277	178651	50	SPH_KNN_PROCESS(cx + dx, cy + g, cz + dz);
1278			}
1279	172300	100	for (int64_t dy = -g + 1; dy <= g - 1; dy++) {
1280	165949	50	SPH_KNN_PROCESS(cx - g, cy + dy, cz + dz);
1281	165949	50	SPH_KNN_PROCESS(cx + g, cy + dy, cz + dz);
1282			}
1283			}
1284			}
1285			#undef SPH_KNN_PROCESS
1286			/* stop once full and the next shell cannot contain anything closer:
1287			min distance from the query point to shell g+1 is gcell_size. /
1288	522	100	if (n >= k) { double bound = (double)g * cs; if (bound*bound >= heap[0].d2) break; }
		100
1289			/* stop once every live entry has been examined: nothing left to find. */
1290	473	100	if (seen >= total) break;
1291			}
1292			} /* end !wrap */
1293	55		qsort(heap, n, sizeof(sph_cand_t), sph_cmp_d2); /* nearest-first */
1294	348	100	for (uint32_t i = 0; i < n; i++)
1295	293	50	if (!sph_collect_push(out, heap[i].val)) { free(heap); return 0; }
1296	55		free(heap);
1297	55		return 1;
1298			}
1299
1300			/* ================================================================
1301			* Collision-pair emitters (callers hold the read lock)
1302			* ================================================================ */
1303
1304			/* emit returns 1 to continue, 0 to abort (collector OOM). */
1305			typedef int (sph_pair_cb)(int64_t va, int64_t vb, void ctx);
1306
1307	4648		static int sph_pair_to_collect(int64_t va, int64_t vb, void *ctx) {
1308	4648		sph_collect_t c = (sph_collect_t )ctx;
1309	4648	50	return sph_collect_push(c, va) && sph_collect_push(c, vb);
		50
1310			}
1311
1312			/* Emit every unordered pair once. fixed_r >= 0: pairs with min-image centre
1313			distance < fixed_r. fixed_r < 0: collision mode -- distance < radius_a+radius_b.
1314			Enumeration/distance are 3D when any entry has a non-zero z (entries are
1315			bucketed by their full 3D cell) or the world wraps in z, else 2D.
1316			Returns SPH_Q_OK / OOM / TOOBIG. */
1317	10		static int sph_pairs(SpatialHandle h, double fixed_r, sph_pair_cb emit, void ctx) {
1318	10		double cs = h->hdr->cell_size;
1319	10		uint32_t me = h->hdr->max_entries;
1320	10		int collide = (fixed_r < 0.0);
1321			/* one pass: largest radius (collision reach) + whether any entry is 3D, so the
1322			enumeration covers the real z-cell range rather than only z-cell 0 */
1323	10		double maxr = 0.0; int has3d = 0;
1324	9310	100	for (uint32_t i = 0; i < me; i++) {
1325	9300	100	if (!sph_is_live(h, i)) continue;
1326	1656	100	if (collide && h->entries[i].radius > maxr) maxr = h->entries[i].radius;
		100
1327	1656	100	if (h->entries[i].pos[2] != 0.0) has3d = 1;
1328			}
1329	10	100	if (collide) { if (maxr <= 0.0) return SPH_Q_OK; } /* no radii -> points never collide */
		100
1330	6	100	else if (!(fixed_r > 0.0)) return SPH_Q_OK; /* non-positive radius -> nothing */
1331	8	100	int dims = (h->world[2] > 0.0 \|\| has3d) ? 3 : 2;
		100
1332	8208	100	for (uint32_t a = 0; a < me; a++) {
1333	8200	100	if (!sph_is_live(h, a)) continue;
1334	1404		const double *pa = h->entries[a].pos;
1335	1404		double ra = h->entries[a].radius;
1336	1404	100	double reach_d = ceil((collide ? (ra + maxr) : fixed_r) / cs);
1337			/* defensive: the API validates radii, but a corrupt stored radius could make this
1338			NaN, which would slip past the cap check below (NaN >= X is false) into the cast */
1339	1404	50	if (!(reach_d >= 0)) reach_d = 0;
1340	1404	50	if (reach_d >= (double)SPH_MAX_QUERY_CELLS) return SPH_Q_TOOBIG; /* avoid int64 overflow */
1341	1404		int64_t reach = (int64_t)reach_d;
1342	1404		int64_t ac[3]; sph_cell_raw(h, pa, ac);
1343	1404		int64_t cnt[3] = { 1, 1, 1 };
1344	1404		uint64_t cells = 1;
1345	4912	100	for (int i = 0; i < dims; i++) {
1346	3508		uint64_t span = (uint64_t)(2 * reach + 1);
1347	3508	100	if (h->wrap_cells[i] > 0 && span > (uint64_t)h->wrap_cells[i]) span = (uint64_t)h->wrap_cells[i];
		50
1348	3508	50	if (span >= (uint64_t)SPH_MAX_QUERY_CELLS) return SPH_Q_TOOBIG;
1349	3508	50	if (cells > (uint64_t)SPH_MAX_QUERY_CELLS / span) return SPH_Q_TOOBIG;
1350	3508		cells *= span; cnt[i] = (int64_t)span;
1351			}
1352	1404		int64_t b0 = ac[0]-reach, b1 = ac[1]-reach, b2 = ac[2]-reach;
1353	11204	100	for (int64_t i0 = 0; i0 < cnt[0]; i0++) {
1354	9800	100	int64_t C0 = h->wrap ? sph_wrap_cell(b0+i0, h->wrap_cells[0]) : b0+i0;
1355	94924	100	for (int64_t i1 = 0; i1 < cnt[1]; i1++) {
1356	85124	100	int64_t C1 = h->wrap ? sph_wrap_cell(b1+i1, h->wrap_cells[1]) : b1+i1;
1357	414270	100	for (int64_t i2 = 0; i2 < cnt[2]; i2++) { /* cnt[2]==1 when dims==2 */
1358	329146	100	int64_t C2 = (dims == 3) ? (h->wrap ? sph_wrap_cell(b2+i2, h->wrap_cells[2]) : b2+i2) : 0;
		100
1359	329146		int64_t C[3] = { C0, C1, C2 };
1360	329146		uint32_t bkt = sph_bucket_of_cell(h, C);
1361	425090	100	for (uint32_t idx = h->buckets[bkt]; idx != SPH_NONE; idx = h->entries[idx].next) {
1362	126547	100	if (idx <= a) continue; /* unordered: emit each pair once */
1363	47021		int64_t ec[3]; sph_cell_of(h, h->entries[idx].pos, ec);
1364	47021	100	if (!sph_cell_eq(ec, C)) continue; /* hash-collision guard */
1365	16418	100	double thr = collide ? (ra + h->entries[idx].radius) : fixed_r;
1366	16418	100	if (sph_dist2(h, pa, h->entries[idx].pos, dims) < thr*thr)
1367	4648	50	if (!emit(h->entries[a].value, h->entries[idx].value, ctx)) return SPH_Q_OOM;
1368			}
1369			}
1370			}
1371			}
1372			}
1373	8		return SPH_Q_OK;
1374			}
1375
1376			/* ================================================================
1377			* Lifecycle helpers: clear, chain stats
1378			* ================================================================ */
1379
1380	1		static void sph_clear_locked(SpatialHandle *h) {
1381	1		uint32_t me = h->hdr->max_entries, nb = h->hdr->num_buckets;
1382	1025	100	for (uint32_t b = 0; b < nb; b++) h->buckets[b] = SPH_NONE;
1383	1		memset(h->bitmap, 0, (size_t)h->bitmap_words * sizeof(uint64_t));
1384	1001	100	for (uint32_t i = 0; i < me; i++) h->entries[i].next = (i+1 < me) ? i+1 : SPH_NONE;
		100
1385	1		h->hdr->free_head = 0; /* max_entries >= 1 (validated at create) */
1386	1		h->hdr->count = 0;
1387	1		}
1388	7		static void sph_chain_stats(SpatialHandle h, uint32_t occupied, uint32_t *max_chain,
1389			uint32_t *max_cell) {
1390	7		uint32_t occ = 0, mx = 0, mxc = 0, nb = h->hdr->num_buckets;
1391	1293	100	for (uint32_t b = 0; b < nb; b++) {
1392	1286		uint32_t len = 0;
1393	2908	100	for (uint32_t idx = h->buckets[b]; idx != SPH_NONE; idx = h->entries[idx].next) {
1394	1622		len++;
1395			/* per-cell occupancy: count chain entries sharing idx's cell (entries of
1396			one cell always hash to one bucket, so a cell is a subset of a chain) */
1397	1622		int64_t ci[3]; sph_cell_of(h, h->entries[idx].pos, ci);
1398	1622		uint32_t cc = 0;
1399	482074	100	for (uint32_t j = h->buckets[b]; j != SPH_NONE; j = h->entries[j].next) {
1400	480452		int64_t cj[3]; sph_cell_of(h, h->entries[j].pos, cj);
1401	480452	100	if (sph_cell_eq(ci, cj)) cc++;
1402			}
1403	1622	100	if (cc > mxc) mxc = cc;
1404			}
1405	1286	100	if (len) { occ++; if (len > mx) mx = len; }
		100
1406			}
1407	7		occupied = occ; max_chain = mx; *max_cell = mxc;
1408	7		}
1409
1410			#endif /* SPHASH_H */