File Coverage

radix.h
Criterion Covered Total %
statement 317 446 71.0
branch 145 320 45.3
condition n/a
subroutine n/a
pod n/a
total 462 766 60.3


line stmt bran cond sub pod time code
1             /*
2             * radix.h -- Shared-memory compressed radix tree (PATRICIA-style trie) for Linux
3             *
4             * Maps arbitrary byte-string keys to uint64 values. A radix-256 trie with
5             * path/edge compression: each node carries a label (a run of bytes shared by
6             * the whole subtree) so chains of single-child nodes collapse into one edge.
7             * Insert and lookup are O(key length). Beyond exact lookup the tree answers
8             * longest-prefix queries -- the longest stored key that is a prefix of a query
9             * string -- which is what routing tables want. The node pool and label arena
10             * live in a shared mapping so several processes share one tree; a
11             * write-preferring futex rwlock with reader-slot dead-process recovery guards
12             * mutation.
13             *
14             * lookup / exists / longest_prefix are pure reads (no path compression) and run
15             * under the READ lock; insert / delete / clear take the WRITE lock.
16             *
17             * delete is LAZY in v1: it unmarks the key's value but does not free node-pool
18             * or arena space. Size the capacities for the working set, or clear() to reset.
19             *
20             * Layout: Header -> reader_slots[1024] -> node_pool[node_cap] -> label_arena[arena_cap]
21             */
22              
23             #ifndef RADIX_H
24             #define RADIX_H
25              
26             #include
27             #include
28             #include
29             #include
30             #include
31             #include
32             #include
33             #include
34             #include
35             #include
36             #include
37             #include
38             #include
39             #include
40             #include
41             #include
42              
43             #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
44             #error "radix.h: requires little-endian architecture"
45             #endif
46              
47              
48             /* ================================================================
49             * Constants
50             * ================================================================ */
51              
52             #define RDX_MAGIC 0x58444152U /* "RADX" (little-endian) */
53             #define RDX_VERSION 1
54             #define RDX_ERR_BUFLEN 256
55             #define RDX_READER_SLOTS 1024 /* max concurrent reader processes for dead-process recovery */
56             #define RDX_MAX_NODES (1u << 24) /* 16.7M nodes: node index 0 is the reserved NIL sentinel */
57             #define RDX_MAX_ARENA 0xF0000000u /* ~3.75 GiB label arena; offsets/lengths are uint32 */
58              
59             #define RDX_ERR(fmt, ...) do { if (errbuf) snprintf(errbuf, RDX_ERR_BUFLEN, fmt, ##__VA_ARGS__); } while (0)
60              
61             /* ================================================================
62             * Structs
63             * ================================================================ */
64              
65             /* Radix-tree node (fixed size). children[b] == 0 means "no child on byte b",
66             * so node index 0 is the reserved NIL sentinel and is never a real node. The
67             * label (children-shared prefix of this edge) lives in the arena at
68             * [label_off, label_off+label_len). */
69             typedef struct {
70             uint32_t children[256]; /* child node index per next byte; 0 == none */
71             uint32_t label_off; /* offset of this edge's label in the arena */
72             uint32_t label_len; /* length of the label in bytes */
73             uint64_t value; /* stored value (valid only when has_value) */
74             uint8_t has_value; /* 1 if a key ends exactly at this node */
75             uint8_t _pad[7]; /* pad to 8-byte alignment (1048 bytes total) */
76             } RdxNode;
77              
78             _Static_assert(sizeof(RdxNode) == 256u * 4u + 4u + 4u + 8u + 8u, "RdxNode layout");
79             _Static_assert(sizeof(RdxNode) % 8 == 0, "RdxNode must be 8-byte aligned");
80              
81             /* Per-process slot for dead-process recovery. Each shared rwlock counter
82             * (the main rwlock-reader count, rwlock_waiters, rwlock_writers_waiting)
83             * is mirrored here so a wrlock timeout can attribute and reverse a dead
84             * process's contribution instead of waiting for the slow per-op timeout
85             * drain. */
86             typedef struct {
87             uint32_t pid; /* 0 = unclaimed */
88             uint32_t subcount; /* in-flight rdlock acquisitions for this process */
89             uint32_t waiters_parked; /* contribution to hdr->rwlock_waiters */
90             uint32_t writers_parked; /* contribution to hdr->rwlock_writers_waiting */
91             } RdxReaderSlot;
92              
93             struct RdxHeader {
94             uint32_t magic, version; /* 0,4 */
95             uint32_t node_cap; /* 8 node-pool capacity (slots, incl. NIL) */
96             uint32_t node_used; /* 12 high-water of ever-allocated nodes (incl. NIL+root) */
97             uint32_t root; /* 16 root node index (allocated at create) */
98             uint32_t arena_cap; /* 20 label-arena capacity in bytes */
99             uint32_t arena_used; /* 24 bytes used in the arena */
100             uint32_t _pad1; /* 28 (was free_head; lazy delete never freed nodes) */
101             uint64_t keys; /* 32 count of stored keys */
102             uint64_t total_size; /* 40 */
103             uint64_t reader_slots_off; /* 48 */
104             uint64_t node_pool_off; /* 56 */
105             uint64_t arena_off; /* 64 */
106             uint32_t rwlock; /* 72 */
107             uint32_t rwlock_waiters; /* 76 */
108             uint32_t rwlock_writers_waiting; /* 80 */
109             uint32_t _pad0; /* 84 */
110             uint64_t stat_ops; /* 88 */
111             uint8_t _pad[160]; /* 96..255 */
112             };
113             typedef struct RdxHeader RdxHeader;
114              
115             _Static_assert(sizeof(RdxHeader) == 256, "RdxHeader must be 256 bytes");
116              
117             /* ---- Process-local handle ---- */
118              
119             typedef struct RdxHandle {
120             RdxHeader *hdr;
121             RdxReaderSlot *reader_slots; /* RDX_READER_SLOTS entries */
122             void *base; /* mmap base */
123             size_t mmap_size;
124             char *path; /* backing file path (strdup'd) */
125             int backing_fd; /* memfd or reopened-fd to close on destroy, -1 for file/anon */
126             uint32_t my_slot_idx; /* UINT32_MAX if all slots taken (no recovery for this handle) */
127             uint32_t cached_pid; /* getpid() cached at last slot claim */
128             uint32_t cached_fork_gen; /* rdx_fork_gen value at last slot claim */
129             } RdxHandle;
130              
131             /* ================================================================
132             * Futex-based write-preferring read-write lock
133             * with reader-slot dead-process recovery
134             * ================================================================ */
135              
136             #define RDX_RWLOCK_SPIN_LIMIT 32
137             #define RDX_LOCK_TIMEOUT_SEC 2 /* FUTEX_WAIT timeout for stale lock detection */
138              
139 0           static inline void rdx_rwlock_spin_pause(void) {
140             #if defined(__x86_64__) || defined(__i386__)
141 0           __asm__ volatile("pause" ::: "memory");
142             #elif defined(__aarch64__)
143             __asm__ volatile("yield" ::: "memory");
144             #else
145             __asm__ volatile("" ::: "memory");
146             #endif
147 0           }
148              
149             /* Extract writer PID from rwlock value (lower 31 bits when write-locked). */
150             #define RDX_RWLOCK_WRITER_BIT 0x80000000U
151             #define RDX_RWLOCK_PID_MASK 0x7FFFFFFFU
152             #define RDX_RWLOCK_WR(pid) (RDX_RWLOCK_WRITER_BIT | ((uint32_t)(pid) & RDX_RWLOCK_PID_MASK))
153              
154             /* Check if a PID is alive. Returns 1 if alive or unknown, 0 if definitely dead. */
155             /* Liveness via kill(pid,0). NOTE: cannot detect PID reuse -- if a dead
156             * lock-holder's PID is recycled to an unrelated live process before recovery
157             * runs, this reports "alive" and that slot's orphaned contribution is not
158             * reclaimed until the recycled process exits. Robust detection would require
159             * a per-slot process-start-time epoch (a header-layout/version change).
160             * Documented under "Crash Safety" in the POD. */
161 0           static inline int rdx_pid_alive(uint32_t pid) {
162 0 0         if (pid == 0) return 1; /* no owner recorded, assume alive */
163 0 0         return !(kill((pid_t)pid, 0) == -1 && errno == ESRCH);
    0          
164             }
165              
166             /* Force-recover a stale write lock left by a dead process.
167             * CAS to OUR pid to hold the lock while fixing shared state, then release.
168             * Using our pid (not a bare WRITER_BIT sentinel) means a subsequent
169             * recovering process can detect and re-recover if we crash mid-recovery. */
170 0           static inline void rdx_recover_stale_lock(RdxHandle *h, uint32_t observed_rwlock) {
171 0           RdxHeader *hdr = h->hdr;
172 0           uint32_t mypid = RDX_RWLOCK_WR((uint32_t)getpid());
173 0 0         if (!__atomic_compare_exchange_n(&hdr->rwlock, &observed_rwlock,
174             mypid, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
175 0           return;
176             /* We now hold the write lock as mypid. No additional shared state needs
177             * repair here (this module has no seqlock); just release the lock. */
178 0           __atomic_store_n(&hdr->rwlock, 0, __ATOMIC_RELEASE);
179 0 0         if (__atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
180 0           syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
181             }
182              
183             static const struct timespec rdx_lock_timeout = { RDX_LOCK_TIMEOUT_SEC, 0 };
184              
185             /* Process-global fork-generation counter. Incremented in the pthread_atfork
186             * child callback so every open handle detects a fork transition on the next
187             * lock call without paying a getpid() syscall on the hot path. */
188             static uint32_t rdx_fork_gen = 1;
189             static pthread_once_t rdx_atfork_once = PTHREAD_ONCE_INIT;
190 0           static void rdx_on_fork_child(void) {
191 0           __atomic_add_fetch(&rdx_fork_gen, 1, __ATOMIC_RELAXED);
192 0           }
193 2           static void rdx_atfork_init(void) {
194 2           pthread_atfork(NULL, NULL, rdx_on_fork_child);
195 2           }
196              
197             /* Ensure this process owns a reader slot. Called from the lock helpers so
198             * that fork()'d children pick up their own slot lazily instead of sharing
199             * the parent's. Hot-path is a single relaxed load + compare; only on a
200             * fork-generation mismatch do we touch getpid() and scan slots. */
201 4880           static inline void rdx_claim_reader_slot(RdxHandle *h) {
202 4880           uint32_t cur_gen = __atomic_load_n(&rdx_fork_gen, __ATOMIC_RELAXED);
203 4880 100         if (__builtin_expect(cur_gen == h->cached_fork_gen && h->my_slot_idx != UINT32_MAX, 1))
    50          
204 4860           return;
205             /* Cold path -- register the atfork hook once per process, then claim. */
206 20           pthread_once(&rdx_atfork_once, rdx_atfork_init);
207             /* Re-read after pthread_once: rdx_on_fork_child may have bumped it. */
208 20           cur_gen = __atomic_load_n(&rdx_fork_gen, __ATOMIC_RELAXED);
209 20           uint32_t now_pid = (uint32_t)getpid();
210 20           h->cached_pid = now_pid;
211 20           h->cached_fork_gen = cur_gen;
212 20           h->my_slot_idx = UINT32_MAX;
213 20           uint32_t start = now_pid % RDX_READER_SLOTS;
214 22 50         for (uint32_t i = 0; i < RDX_READER_SLOTS; i++) {
215 22           uint32_t s = (start + i) % RDX_READER_SLOTS;
216 22           uint32_t expected = 0;
217 22 100         if (__atomic_compare_exchange_n(&h->reader_slots[s].pid,
218             &expected, now_pid, 0,
219             __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
220             /* Zero all mirror fields, not just subcount: a SIGKILL'd
221             * predecessor may have left waiters_parked/writers_parked
222             * non-zero, and rdx_recover_dead_readers won't drain them
223             * once we own the slot (the CAS expects the dead PID). */
224 20           __atomic_store_n(&h->reader_slots[s].subcount, 0, __ATOMIC_RELAXED);
225 20           __atomic_store_n(&h->reader_slots[s].waiters_parked, 0, __ATOMIC_RELAXED);
226 20           __atomic_store_n(&h->reader_slots[s].writers_parked, 0, __ATOMIC_RELAXED);
227 20           h->my_slot_idx = s;
228 20           return;
229             }
230             }
231             /* Table full -- leave my_slot_idx = UINT32_MAX so we silently skip
232             * tracking for this handle (lock still works; just no recovery). */
233             }
234              
235             /* Atomically subtract `sub` from a counter, capped at 0 (never underflows). */
236 0           static inline void rdx_atomic_sub_cap(uint32_t *p, uint32_t sub) {
237 0 0         if (!sub) return;
238 0           uint32_t cur = __atomic_load_n(p, __ATOMIC_RELAXED);
239 0           for (;;) {
240 0 0         uint32_t want = (cur > sub) ? cur - sub : 0;
241 0 0         if (__atomic_compare_exchange_n(p, &cur, want,
242             1, __ATOMIC_RELAXED, __ATOMIC_RELAXED))
243 0           return;
244             }
245             }
246              
247             /* Try to claim a dead slot (CAS pid -> 0) and drain its parked-waiter
248             * contributions back to the global counters. A no-op if the slot was stolen
249             * by another recoverer or had no waiter contribution to drain.
250             *
251             * Note: subcount/waiters_parked/writers_parked are NOT zeroed here.
252             * Between our CAS and a follow-up store, a new process could claim the
253             * slot and start populating these fields -- our stores would clobber its
254             * state. rdx_claim_reader_slot zeros all three on every claim, so
255             * leaving stale values is harmless. */
256 0           static inline void rdx_drain_dead_slot(RdxHandle *h, uint32_t i, uint32_t pid) {
257 0           RdxHeader *hdr = h->hdr;
258 0           uint32_t expected = pid;
259             /* ACQ_REL on success: RELEASE publishes pid=0 to other observers;
260             * ACQUIRE syncs us with prior writes from the dead process to
261             * waiters_parked/writers_parked. On weakly-ordered archs (aarch64)
262             * a plain RELAXED load before the CAS could miss those writes;
263             * loading them after the CAS keeps them inside the acquire window. */
264 0 0         if (!__atomic_compare_exchange_n(&h->reader_slots[i].pid, &expected, 0,
265             0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
266 0           return;
267 0           uint32_t wp = __atomic_load_n(&h->reader_slots[i].waiters_parked, __ATOMIC_RELAXED);
268 0           uint32_t writp = __atomic_load_n(&h->reader_slots[i].writers_parked, __ATOMIC_RELAXED);
269 0 0         if (wp) rdx_atomic_sub_cap(&hdr->rwlock_waiters, wp);
270 0 0         if (writp) rdx_atomic_sub_cap(&hdr->rwlock_writers_waiting, writp);
271             }
272              
273             /* Scan reader slots for dead-process recovery.
274             *
275             * For each dead PID with non-zero contributions to the shared rwlock,
276             * rwlock_waiters, or rwlock_writers_waiting counters, drain its share back
277             * out so live processes don't have to wait for the slow per-op timeout
278             * decrement to drain it for them.
279             *
280             * For the main rwlock counter we use the "no live reader holds -> force-
281             * reset to 0" trick (precise) because per-process attribution of the
282             * subcount is racy across the inc-counter-then-inc-subcount window. */
283 0           static inline void rdx_recover_dead_readers(RdxHandle *h) {
284 0 0         if (!h->reader_slots) return;
285 0           RdxHeader *hdr = h->hdr;
286 0           int any_live_reader = 0;
287 0           int found_dead_reader = 0;
288              
289             /* Pass 1: classify slots. Slots with dead pid and sc == 0 (no rwlock
290             * contribution to lose) are wiped immediately to free the slot for
291             * future claimants and drain any orphan parked-waiter counters. Slots
292             * with dead pid and sc > 0 are left intact in this pass: if force-
293             * reset cannot fire (because a live reader is concurrently present),
294             * wiping the dead slot would lose the only record of its orphan
295             * rwlock contribution and strand writers permanently once the live
296             * reader releases. */
297 0 0         for (uint32_t i = 0; i < RDX_READER_SLOTS; i++) {
298 0           uint32_t pid = __atomic_load_n(&h->reader_slots[i].pid, __ATOMIC_ACQUIRE);
299 0 0         if (pid == 0) continue;
300 0           uint32_t sc = __atomic_load_n(&h->reader_slots[i].subcount, __ATOMIC_RELAXED);
301 0 0         if (rdx_pid_alive(pid)) {
302 0 0         if (sc > 0) any_live_reader = 1;
303 0           continue;
304             }
305 0 0         if (sc > 0) { found_dead_reader = 1; continue; }
306 0           rdx_drain_dead_slot(h, i, pid);
307             }
308              
309             /* Pass 2: only if force-reset will fire. Issue the rwlock force-
310             * reset CAS FIRST, while the window since pass 1's last scan is
311             * still narrow (a handful of instructions, as in the original
312             * single-pass code). A new reader that started rdlock between
313             * pass 1's scan and the CAS will either:
314             * (a) have already CAS'd rwlock from cur to cur+1 -- our CAS then
315             * fails (cur mismatched), recovery yields and a future
316             * cycle retries; or
317             * (b) be still in the subcount-bump phase -- our CAS sees the
318             * stale cur and resets to 0; the new reader's subsequent CAS
319             * rwlock(0 -> 1) succeeds cleanly.
320             * Only after the CAS resolves do we wipe the deferred dead slots,
321             * keeping that work outside the race-sensitive window. */
322 0 0         if (found_dead_reader && !any_live_reader) {
    0          
323 0           uint32_t cur = __atomic_load_n(&hdr->rwlock, __ATOMIC_RELAXED);
324 0 0         if (cur > 0 && cur < RDX_RWLOCK_WRITER_BIT) {
    0          
325 0 0         if (__atomic_compare_exchange_n(&hdr->rwlock, &cur, 0,
326             0, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
327 0 0         if (__atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
328 0           syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
329             }
330             }
331 0 0         for (uint32_t i = 0; i < RDX_READER_SLOTS; i++) {
332 0           uint32_t pid = __atomic_load_n(&h->reader_slots[i].pid, __ATOMIC_ACQUIRE);
333 0 0         if (pid == 0 || rdx_pid_alive(pid)) continue;
    0          
334 0           rdx_drain_dead_slot(h, i, pid);
335             }
336             }
337             }
338              
339             /* Inspect the lock word after a futex-wait timeout. If a dead writer
340             * holds it, force-recover the lock. Otherwise drain dead readers' shares
341             * of the rwlock/waiter counters. Called from rdlock and wrlock ETIMEDOUT
342             * branches -- identical recovery logic in both. */
343 0           static inline void rdx_recover_after_timeout(RdxHandle *h) {
344 0           RdxHeader *hdr = h->hdr;
345 0           uint32_t val = __atomic_load_n(&hdr->rwlock, __ATOMIC_RELAXED);
346 0 0         if (val >= RDX_RWLOCK_WRITER_BIT) {
347 0           uint32_t pid = val & RDX_RWLOCK_PID_MASK;
348 0 0         if (!rdx_pid_alive(pid))
349 0           rdx_recover_stale_lock(h, val);
350             } else {
351 0           rdx_recover_dead_readers(h);
352             }
353 0           }
354              
355             /* Park/unpark helpers: bump the global waiter counters together with this
356             * process's mirrored slot counters so a wrlock-timeout recovery scan can
357             * attribute and reverse a dead PID's contribution. Kept paired to make
358             * accidental drift between global and per-slot counts impossible. */
359 0           static inline void rdx_park_reader(RdxHandle *h) {
360 0 0         if (h->my_slot_idx != UINT32_MAX)
361 0           __atomic_add_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
362 0           __atomic_add_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
363 0           }
364 0           static inline void rdx_unpark_reader(RdxHandle *h) {
365 0           __atomic_sub_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
366 0 0         if (h->my_slot_idx != UINT32_MAX)
367 0           __atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
368 0           }
369 0           static inline void rdx_park_writer(RdxHandle *h) {
370 0 0         if (h->my_slot_idx != UINT32_MAX) {
371 0           __atomic_add_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
372 0           __atomic_add_fetch(&h->reader_slots[h->my_slot_idx].writers_parked, 1, __ATOMIC_RELAXED);
373             }
374 0           __atomic_add_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
375 0           __atomic_add_fetch(&h->hdr->rwlock_writers_waiting, 1, __ATOMIC_RELAXED);
376 0           }
377 0           static inline void rdx_unpark_writer(RdxHandle *h) {
378 0           __atomic_sub_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
379 0           __atomic_sub_fetch(&h->hdr->rwlock_writers_waiting, 1, __ATOMIC_RELAXED);
380 0 0         if (h->my_slot_idx != UINT32_MAX) {
381 0           __atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
382 0           __atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].writers_parked, 1, __ATOMIC_RELAXED);
383             }
384 0           }
385              
386 3264           static inline void rdx_rwlock_rdlock(RdxHandle *h) {
387 3264           rdx_claim_reader_slot(h);
388 3264           RdxHeader *hdr = h->hdr;
389 3264           uint32_t *lock = &hdr->rwlock;
390 3264           uint32_t *writers_waiting = &hdr->rwlock_writers_waiting;
391             /* Claim subcount BEFORE bumping the shared rwlock counter. This way
392             * a concurrent writer-side recovery scan that sees our PID alive with
393             * subcount > 0 will (correctly) defer force-reset, even while we are
394             * still spinning trying to win the rwlock CAS. Without this, a reader
395             * killed between rwlock CAS-success and subcount++ would let recovery
396             * force-reset rwlock to 0 underneath us, causing a UINT32_MAX wrap on
397             * our eventual rdunlock dec. */
398 3264 50         if (h->my_slot_idx != UINT32_MAX)
399 3264           __atomic_add_fetch(&h->reader_slots[h->my_slot_idx].subcount, 1, __ATOMIC_RELAXED);
400 3264           for (int spin = 0; ; spin++) {
401 3264           uint32_t cur = __atomic_load_n(lock, __ATOMIC_RELAXED);
402             /* Write-preferring: when lock is free (cur==0) and writers are
403             * waiting, yield to let the writer acquire. When readers are
404             * already active (cur>=1), new readers may join freely. */
405 3264 50         if (cur > 0 && cur < RDX_RWLOCK_WRITER_BIT) {
    0          
406 0 0         if (__atomic_compare_exchange_n(lock, &cur, cur + 1,
407             1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
408 3264           return;
409 3264 50         } else if (cur == 0 && !__atomic_load_n(writers_waiting, __ATOMIC_RELAXED)) {
    50          
410 3264 50         if (__atomic_compare_exchange_n(lock, &cur, 1,
411             1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
412 3264           return;
413             }
414 0 0         if (__builtin_expect(spin < RDX_RWLOCK_SPIN_LIMIT, 1)) {
415 0           rdx_rwlock_spin_pause();
416 0           continue;
417             }
418 0           rdx_park_reader(h);
419 0           cur = __atomic_load_n(lock, __ATOMIC_RELAXED);
420             /* Sleep when write-locked OR when yielding to waiting writers */
421 0 0         if (cur >= RDX_RWLOCK_WRITER_BIT || cur == 0) {
    0          
422 0           long rc = syscall(SYS_futex, lock, FUTEX_WAIT, cur,
423             &rdx_lock_timeout, NULL, 0);
424 0 0         if (rc == -1 && errno == ETIMEDOUT) {
    0          
425 0           rdx_unpark_reader(h);
426 0           rdx_recover_after_timeout(h);
427 0           spin = 0;
428 0           continue;
429             }
430             }
431 0           rdx_unpark_reader(h);
432 0           spin = 0;
433             }
434             }
435              
436 3264           static inline void rdx_rwlock_rdunlock(RdxHandle *h) {
437 3264           RdxHeader *hdr = h->hdr;
438             /* Release the shared counter BEFORE dropping our subcount so that
439             * "any live PID with subcount > 0" is a reliable in-flight indicator
440             * for the writer-side recovery scan. Inverting these would create a
441             * window where we still own a unit of rwlock but our slot subcount is
442             * 0, letting recovery force-reset rwlock underneath us. */
443 3264           uint32_t after = __atomic_sub_fetch(&hdr->rwlock, 1, __ATOMIC_RELEASE);
444 3264 50         if (h->my_slot_idx != UINT32_MAX)
445 3264           __atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].subcount, 1, __ATOMIC_RELAXED);
446 3264 50         if (after == 0 && __atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
    50          
447 0           syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
448 3264           }
449              
450 1616           static inline void rdx_rwlock_wrlock(RdxHandle *h) {
451 1616           rdx_claim_reader_slot(h); /* refresh cached_pid across fork */
452 1616           RdxHeader *hdr = h->hdr;
453 1616           uint32_t *lock = &hdr->rwlock;
454             /* Encode PID in the rwlock word itself (0x80000000 | pid) to eliminate
455             * any crash window between acquiring the lock and storing the owner. */
456 1616           uint32_t mypid = RDX_RWLOCK_WR(h->cached_pid);
457 1616           for (int spin = 0; ; spin++) {
458 1616           uint32_t expected = 0;
459 1616 50         if (__atomic_compare_exchange_n(lock, &expected, mypid,
460             1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
461 1616           return;
462 0 0         if (__builtin_expect(spin < RDX_RWLOCK_SPIN_LIMIT, 1)) {
463 0           rdx_rwlock_spin_pause();
464 0           continue;
465             }
466 0           rdx_park_writer(h);
467 0           uint32_t cur = __atomic_load_n(lock, __ATOMIC_RELAXED);
468 0 0         if (cur != 0) {
469 0           long rc = syscall(SYS_futex, lock, FUTEX_WAIT, cur,
470             &rdx_lock_timeout, NULL, 0);
471 0 0         if (rc == -1 && errno == ETIMEDOUT) {
    0          
472 0           rdx_unpark_writer(h);
473 0           rdx_recover_after_timeout(h);
474 0           spin = 0;
475 0           continue;
476             }
477             }
478 0           rdx_unpark_writer(h);
479 0           spin = 0;
480             }
481             }
482              
483 1616           static inline void rdx_rwlock_wrunlock(RdxHandle *h) {
484 1616           RdxHeader *hdr = h->hdr;
485 1616           __atomic_store_n(&hdr->rwlock, 0, __ATOMIC_RELEASE);
486 1616 50         if (__atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
487 0           syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
488 1616           }
489              
490             /* ================================================================
491             * Layout math + node-pool / arena accessors
492             *
493             * Layout: Header -> reader_slots[1024] -> node_pool[node_cap] -> arena[arena_cap]
494             * RdxNode is 8-byte aligned (sizeof %8 == 0) and RdxReaderSlot is 16 bytes,
495             * so node_pool_off is 8-byte aligned. The arena is raw bytes (no alignment
496             * requirement) and follows the node pool.
497             * ================================================================ */
498              
499             typedef struct { uint64_t reader_slots, node_pool, arena; } RdxLayout;
500              
501 74           static inline RdxLayout rdx_layout(uint32_t node_cap) {
502             RdxLayout L;
503 74           L.reader_slots = sizeof(RdxHeader);
504 74           L.node_pool = L.reader_slots + (uint64_t)RDX_READER_SLOTS * sizeof(RdxReaderSlot);
505 74           L.arena = L.node_pool + (uint64_t)node_cap * sizeof(RdxNode);
506 74           return L;
507             }
508              
509 50           static inline uint64_t rdx_total_size(uint32_t node_cap, uint32_t arena_cap) {
510 50           RdxLayout L = rdx_layout(node_cap);
511 50           return L.arena + (uint64_t)arena_cap;
512             }
513              
514 10973           static inline RdxNode *rdx_nodes(RdxHandle *h) {
515 10973           return (RdxNode *)((char *)h->base + h->hdr->node_pool_off);
516             }
517 5928           static inline uint8_t *rdx_arena(RdxHandle *h) {
518 5928           return (uint8_t *)((char *)h->base + h->hdr->arena_off);
519             }
520              
521             /* ================================================================
522             * Node allocation + arena append. Callers hold the WRITE lock.
523             * ================================================================ */
524              
525             /* Allocate a node: bump node_used, else 0 (pool exhausted). Returns a zeroed
526             * node index. v1 has no freelist (delete is lazy and never frees nodes), so a
527             * node always comes off the high-water mark. The caller pre-checks capacity
528             * before any mutation, so a 0 return must not happen mid-insert. */
529 1222           static inline uint32_t rdx_alloc_node(RdxHandle *h) {
530 1222           RdxHeader *hdr = h->hdr;
531 1222           RdxNode *nodes = rdx_nodes(h);
532 1222 50         if (hdr->node_used < hdr->node_cap) {
533 1222           uint32_t idx = hdr->node_used++;
534 1222           memset(&nodes[idx], 0, sizeof(RdxNode));
535 1222           return idx;
536             }
537 0           return 0;
538             }
539              
540             /* Append `len` bytes to the arena, returning the offset of the first byte.
541             * Append-only: existing bytes never move, so pointers into the arena stay
542             * valid across appends. The caller pre-checked that len bytes fit. */
543 1080           static inline uint32_t rdx_arena_append(RdxHandle *h, const uint8_t *bytes, uint32_t len) {
544 1080           RdxHeader *hdr = h->hdr;
545 1080           uint32_t off = hdr->arena_used;
546 1080 50         if (len) memcpy(rdx_arena(h) + off, bytes, len);
547 1080           hdr->arena_used += len;
548 1080           return off;
549             }
550              
551             /* Worst case any single insert consumes: up to 2 new nodes (a split makes a
552             * mid node + a leaf node) and up to klen arena bytes (the leaf's label).
553             * v1 has no freelist, so the 2 nodes must come fresh from the high-water mark.
554             * Returns 1 if both fit, 0 otherwise. Caller holds the write lock. */
555 1091           static inline int rdx_insert_has_room(RdxHandle *h, uint32_t klen) {
556 1091           RdxHeader *hdr = h->hdr;
557 1091 100         if (hdr->node_cap - hdr->node_used < 2) return 0;
558 1089 100         if (hdr->arena_cap - hdr->arena_used < klen) return 0;
559 1088           return 1;
560             }
561              
562             /* ================================================================
563             * Radix-tree core
564             * ================================================================ */
565              
566             #ifndef RDX_MIN
567             #define RDX_MIN(a, b) ((a) < (b) ? (a) : (b))
568             #endif
569              
570             /* Common-prefix length: number of leading bytes where a[i]==b[i], up to max. */
571 2840           static inline uint32_t rdx_cpl(const uint8_t *a, const uint8_t *b, uint32_t max) {
572 2840           uint32_t i = 0;
573 8982 100         while (i < max && a[i] == b[i]) i++;
    100          
574 2840           return i;
575             }
576              
577             /* Insert key -> value. Returns 1 if a new key was added, 0 if an existing key
578             * was updated. Caller holds the write lock AND has verified rdx_insert_has_room
579             * (so every rdx_alloc_node / rdx_arena_append below is guaranteed to succeed,
580             * keeping the tree consistent -- no partial-split-on-OOM possibility). */
581 1088           static inline int rdx_insert_locked(RdxHandle *h, const uint8_t *key, uint32_t klen, uint64_t value) {
582 1088           RdxHeader *hdr = h->hdr;
583 1088           RdxNode *nodes = rdx_nodes(h);
584 1088           uint8_t *arena = rdx_arena(h);
585 1088           uint32_t cur = hdr->root, kpos = 0;
586 2698           for (;;) {
587 3786 100         if (kpos == klen) { /* key ends here -> mark this node */
588 5           int isnew = !nodes[cur].has_value;
589 5           nodes[cur].has_value = 1;
590 5           nodes[cur].value = value;
591 5 100         if (isnew) hdr->keys++;
592 5           return isnew;
593             }
594 3781           uint8_t b = key[kpos];
595 3781           uint32_t ch = nodes[cur].children[b];
596 3781 100         if (ch == 0) { /* no child on b -> new leaf with the rest as its label */
597 941           uint32_t leaf = rdx_alloc_node(h);
598 941           nodes = rdx_nodes(h); /* base is stable, but re-fetch defensively after alloc */
599 941           nodes[leaf].label_off = rdx_arena_append(h, key + kpos, klen - kpos);
600 941           nodes[leaf].label_len = klen - kpos;
601 941           nodes[leaf].has_value = 1;
602 941           nodes[leaf].value = value;
603 941           nodes[cur].children[b] = leaf;
604 941           hdr->keys++;
605 941           return 1;
606             }
607             /* match the child's label against key[kpos..] */
608 2840           const uint8_t *L = arena + nodes[ch].label_off;
609 2840           uint32_t llen = nodes[ch].label_len;
610 2840           uint32_t m = rdx_cpl(L, key + kpos, RDX_MIN(llen, klen - kpos));
611 2840 100         if (m == llen) { /* whole label matched -> descend */
612 2698           cur = ch;
613 2698           kpos += llen;
614 2698           continue;
615             }
616             /* partial match -> split the child's edge at m.
617             * mid takes L[0..m-1]; child keeps L[m..] (sharing the same arena region).
618             * Capture mid_first = L[m] BEFORE mutating ch's label_off (L is a pointer
619             * into the arena and is unaffected by the label_off change, but be explicit). */
620 142           uint8_t mid_first = L[m];
621 142           uint32_t mid = rdx_alloc_node(h);
622 142           nodes = rdx_nodes(h);
623 142           nodes[mid].label_off = nodes[ch].label_off; /* first m bytes */
624 142           nodes[mid].label_len = m;
625 142           nodes[ch].label_off += m; /* child keeps the remainder, same region */
626 142           nodes[ch].label_len -= m;
627 142           nodes[mid].children[mid_first] = ch;
628 142           nodes[cur].children[b] = mid;
629 142 100         if (kpos + m == klen) { /* the key ends exactly at the split point */
630 3           nodes[mid].has_value = 1;
631 3           nodes[mid].value = value;
632 3           hdr->keys++;
633 3           return 1;
634             }
635 139           uint32_t leaf = rdx_alloc_node(h);
636 139           nodes = rdx_nodes(h);
637 139           nodes[leaf].label_off = rdx_arena_append(h, key + kpos + m, klen - kpos - m);
638 139           nodes[leaf].label_len = klen - kpos - m;
639 139           nodes[leaf].has_value = 1;
640 139           nodes[leaf].value = value;
641 139           nodes[mid].children[key[kpos + m]] = leaf;
642 139           hdr->keys++;
643 139           return 1;
644             }
645             }
646              
647             /* Navigate `key` to its terminal node. Returns the node index once the full
648             * key is consumed, or 0 (the NIL sentinel) if any step diverges. Read-only, so
649             * the caller may hold the READ or write lock. root is always >= 1, so a 0
650             * return is an unambiguous "not found". */
651 3717           static inline uint32_t rdx_find_locked(RdxHandle *h, const uint8_t *key, uint32_t klen) {
652 3717           RdxNode *nodes = rdx_nodes(h);
653 3717           uint8_t *arena = rdx_arena(h);
654 3717           uint32_t cur = h->hdr->root, kpos = 0;
655 14890           for (;;) {
656 18607 100         if (kpos == klen) return cur;
657 14927           uint32_t ch = nodes[cur].children[key[kpos]];
658 14927 100         if (!ch) return 0;
659 14900           uint32_t llen = nodes[ch].label_len;
660 14900 100         if (klen - kpos < llen) return 0;
661 14892 100         if (memcmp(arena + nodes[ch].label_off, key + kpos, llen) != 0) return 0;
662 14890           cur = ch;
663 14890           kpos += llen;
664             }
665             }
666              
667             /* Exact lookup. Returns 1 and sets *out if found, else 0. Read-only (no path
668             * compression) so the caller may hold the READ lock. */
669 3193           static inline int rdx_lookup_locked(RdxHandle *h, const uint8_t *key, uint32_t klen, uint64_t *out) {
670 3193           uint32_t n = rdx_find_locked(h, key, klen);
671 3193 100         if (!n) return 0;
672 3157           RdxNode *nodes = rdx_nodes(h);
673 3157 100         if (nodes[n].has_value) { if (out) *out = nodes[n].value; return 1; }
    100          
674 525           return 0;
675             }
676              
677             /* Longest-prefix match: is some stored key a prefix of `key`? Returns 1 and
678             * sets *out to the value of the LONGEST such stored key, else 0. Read-only. */
679 43           static inline int rdx_longest_prefix_locked(RdxHandle *h, const uint8_t *key, uint32_t klen, uint64_t *out) {
680 43           RdxNode *nodes = rdx_nodes(h);
681 43           uint8_t *arena = rdx_arena(h);
682 43           uint32_t cur = h->hdr->root, kpos = 0;
683 43           int found = 0;
684 43 100         if (nodes[cur].has_value) { if (out) *out = nodes[cur].value; found = 1; } /* empty key stored */
    50          
685 101           for (;;) {
686 144 100         if (kpos == klen) break;
687 129           uint32_t ch = nodes[cur].children[key[kpos]];
688 129 100         if (!ch) break;
689 112           uint32_t llen = nodes[ch].label_len;
690 112 100         if (klen - kpos < llen || memcmp(arena + nodes[ch].label_off, key + kpos, llen) != 0) break;
    100          
691 101           cur = ch;
692 101           kpos += llen;
693 101 100         if (nodes[cur].has_value) { if (out) *out = nodes[cur].value; found = 1; }
    50          
694             }
695 43           return found;
696             }
697              
698             /* Lazy delete: walk to the node; if found and has_value, clear it. Returns
699             * 1 if a key was removed, 0 if absent. Does NOT free nodes or compact the
700             * arena in v1. Caller holds the write lock. */
701 524           static inline int rdx_delete_locked(RdxHandle *h, const uint8_t *key, uint32_t klen) {
702 524           uint32_t n = rdx_find_locked(h, key, klen);
703 524 100         if (!n) return 0;
704 523           RdxNode *nodes = rdx_nodes(h);
705 523 100         if (!nodes[n].has_value) return 0;
706 522           nodes[n].has_value = 0;
707 522           nodes[n].value = 0;
708 522           h->hdr->keys--;
709 522           return 1;
710             }
711              
712             /* Reset to a single empty root: node_used=2, arena_used=0, keys=0, and a fresh
713             * zeroed root. Caller holds the write lock. */
714 1           static inline void rdx_clear_locked(RdxHandle *h) {
715 1           RdxHeader *hdr = h->hdr;
716 1           RdxNode *nodes = rdx_nodes(h);
717 1           hdr->node_used = 2;
718 1           hdr->arena_used = 0;
719 1           hdr->keys = 0;
720 1           memset(&nodes[hdr->root], 0, sizeof(RdxNode)); /* zero children + has_value + label */
721 1           }
722              
723             /* ================================================================
724             * Validate args + header init / setup / open / destroy
725             * ================================================================ */
726              
727             /* Validate create args. Single source of truth: the XS layer does NOT
728             * duplicate these range checks. */
729 29           static int rdx_validate_create_args(uint64_t node_cap, uint64_t arena_cap, char *errbuf) {
730 29 50         if (errbuf) errbuf[0] = '\0';
731 29 100         if (node_cap < 2) { RDX_ERR("node_capacity must be >= 2 (NIL + root)"); return 0; }
    50          
732 27 100         if (node_cap > RDX_MAX_NODES) { RDX_ERR("node_capacity must be <= %u", (unsigned)RDX_MAX_NODES); return 0; }
    50          
733 26 100         if (arena_cap < 1) { RDX_ERR("arena_capacity must be >= 1"); return 0; }
    50          
734 25 100         if (arena_cap > RDX_MAX_ARENA) { RDX_ERR("arena_capacity must be <= %u", (unsigned)RDX_MAX_ARENA); return 0; }
    50          
735             /* Keep the whole mapping within size_t (matters on 32-bit, but we already
736             * require 64-bit Perl; still, guard against absurd products). */
737             {
738 24           uint64_t total = rdx_total_size((uint32_t)node_cap, (uint32_t)arena_cap);
739             if (total > (uint64_t)SIZE_MAX) { RDX_ERR("requested mapping too large"); return 0; }
740             }
741 24           return 1;
742             }
743              
744 22           static inline void rdx_init_header(void *base, uint32_t node_cap, uint32_t arena_cap, uint64_t total_size) {
745 22           RdxLayout L = rdx_layout(node_cap);
746 22           RdxHeader *hdr = (RdxHeader *)base;
747             /* Zero the header + reader-slot region (lock-recovery state). The node
748             * pool and arena are read only within [0,node_used)/[0,arena_used); a
749             * fresh mapping is OS-zeroed, but we explicitly zero node 0 (NIL) and the
750             * root below for clarity / for the reopen-of-anon path. */
751 22           memset(base, 0, (size_t)L.node_pool);
752 22           hdr->magic = RDX_MAGIC;
753 22           hdr->version = RDX_VERSION;
754 22           hdr->node_cap = node_cap;
755 22           hdr->arena_cap = arena_cap;
756 22           hdr->total_size = total_size;
757 22           hdr->reader_slots_off = L.reader_slots;
758 22           hdr->node_pool_off = L.node_pool;
759 22           hdr->arena_off = L.arena;
760             {
761 22           RdxNode *nodes = (RdxNode *)((char *)base + L.node_pool);
762 22           memset(&nodes[0], 0, sizeof(RdxNode)); /* NIL sentinel */
763 22           memset(&nodes[1], 0, sizeof(RdxNode)); /* root: empty label, no value, no children */
764 22           hdr->root = 1;
765 22           hdr->node_used = 2; /* NIL + root */
766 22           hdr->arena_used = 0;
767 22           hdr->keys = 0;
768             }
769 22           __atomic_thread_fence(__ATOMIC_SEQ_CST);
770 22           }
771              
772 24           static inline RdxHandle *rdx_setup(void *base, size_t map_size,
773             const char *path, int backing_fd) {
774 24           RdxHeader *hdr = (RdxHeader *)base;
775 24           RdxHandle *h = (RdxHandle *)calloc(1, sizeof(RdxHandle));
776 24 50         if (!h) {
777 0           munmap(base, map_size);
778 0 0         if (backing_fd >= 0) close(backing_fd);
779 0           return NULL;
780             }
781 24           h->hdr = hdr;
782 24           h->base = base;
783 24           h->reader_slots = (RdxReaderSlot *)((uint8_t *)base + hdr->reader_slots_off);
784 24           h->mmap_size = map_size;
785 24 100         h->path = path ? strdup(path) : NULL;
786 24           h->backing_fd = backing_fd;
787 24           h->my_slot_idx = UINT32_MAX;
788 24           return h;
789             }
790              
791             /* Validate a mapped header (shared by rdx_create reopen and rdx_open_fd).
792             * Stored geometry wins on reopen; require total_size to equal both the size
793             * the stored caps imply AND the actual file size, and all offsets to match
794             * the canonical layout. */
795 2           static inline int rdx_validate_header(const RdxHeader *hdr, uint64_t file_size) {
796 2 50         if (hdr->magic != RDX_MAGIC) return 0;
797 2 50         if (hdr->version != RDX_VERSION) return 0;
798 2 50         if (hdr->node_cap < 2 || hdr->node_cap > RDX_MAX_NODES) return 0;
    50          
799 2 50         if (hdr->arena_cap < 1 || hdr->arena_cap > RDX_MAX_ARENA) return 0;
    50          
800 2 50         if (hdr->total_size != file_size) return 0;
801 2 50         if (hdr->total_size != rdx_total_size(hdr->node_cap, hdr->arena_cap)) return 0;
802 2           RdxLayout L = rdx_layout(hdr->node_cap);
803 2 50         if (hdr->reader_slots_off != L.reader_slots) return 0;
804 2 50         if (hdr->node_pool_off != L.node_pool) return 0;
805 2 50         if (hdr->arena_off != L.arena) return 0;
806 2 50         if (hdr->root == 0 || hdr->root >= hdr->node_cap) return 0;
    50          
807 2 50         if (hdr->node_used < 2 || hdr->node_used > hdr->node_cap) return 0;
    50          
808 2 50         if (hdr->arena_used > hdr->arena_cap) return 0;
809 2           return 1;
810             }
811              
812 26           static RdxHandle *rdx_create(const char *path, uint64_t node_cap_in, uint64_t arena_cap_in, char *errbuf) {
813 26 100         if (!rdx_validate_create_args(node_cap_in, arena_cap_in, errbuf)) return NULL;
814 22           uint32_t node_cap = (uint32_t)node_cap_in;
815 22           uint32_t arena_cap = (uint32_t)arena_cap_in;
816              
817 22           uint64_t total = rdx_total_size(node_cap, arena_cap);
818 22           int anonymous = (path == NULL);
819 22           int fd = -1;
820             size_t map_size;
821             void *base;
822              
823 22 100         if (anonymous) {
824 17           map_size = (size_t)total;
825 17           base = mmap(NULL, map_size, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
826 17 50         if (base == MAP_FAILED) { RDX_ERR("mmap: %s", strerror(errno)); return NULL; }
    0          
827             } else {
828 5           fd = open(path, O_RDWR|O_CREAT, 0666);
829 7 50         if (fd < 0) { RDX_ERR("open: %s", strerror(errno)); return NULL; }
    0          
830 5 50         if (flock(fd, LOCK_EX) < 0) { RDX_ERR("flock: %s", strerror(errno)); close(fd); return NULL; }
    0          
831             struct stat st;
832 5 50         if (fstat(fd, &st) < 0) { RDX_ERR("fstat: %s", strerror(errno)); flock(fd, LOCK_UN); close(fd); return NULL; }
    0          
833 5           int is_new = (st.st_size == 0);
834 5 100         if (!is_new && (uint64_t)st.st_size < sizeof(RdxHeader)) {
    100          
835 1 50         RDX_ERR("%s: file too small (%lld)", path, (long long)st.st_size);
836 1           flock(fd, LOCK_UN); close(fd); return NULL;
837             }
838 4 100         if (is_new && ftruncate(fd, (off_t)total) < 0) {
    50          
839 0 0         RDX_ERR("ftruncate: %s", strerror(errno)); flock(fd, LOCK_UN); close(fd); return NULL;
840             }
841 4 100         map_size = is_new ? (size_t)total : (size_t)st.st_size;
842 4           base = mmap(NULL, map_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
843 4 50         if (base == MAP_FAILED) { RDX_ERR("mmap: %s", strerror(errno)); flock(fd, LOCK_UN); close(fd); return NULL; }
    0          
844 4 100         if (!is_new) {
845 1 50         if (!rdx_validate_header((RdxHeader *)base, (uint64_t)st.st_size)) {
846 0 0         RDX_ERR("invalid radix-tree file"); munmap(base, map_size); flock(fd, LOCK_UN); close(fd); return NULL;
847             }
848 1           flock(fd, LOCK_UN); close(fd);
849 1           return rdx_setup(base, map_size, path, -1);
850             }
851             }
852 20           rdx_init_header(base, node_cap, arena_cap, total);
853 20 100         if (fd >= 0) { flock(fd, LOCK_UN); close(fd); }
854 20           return rdx_setup(base, map_size, path, -1);
855             }
856              
857 3           static RdxHandle *rdx_create_memfd(const char *name, uint64_t node_cap_in, uint64_t arena_cap_in, char *errbuf) {
858 3 100         if (!rdx_validate_create_args(node_cap_in, arena_cap_in, errbuf)) return NULL;
859 2           uint32_t node_cap = (uint32_t)node_cap_in;
860 2           uint32_t arena_cap = (uint32_t)arena_cap_in;
861              
862 2           uint64_t total = rdx_total_size(node_cap, arena_cap);
863 2 100         int fd = memfd_create(name ? name : "radix", MFD_CLOEXEC | MFD_ALLOW_SEALING);
864 2 50         if (fd < 0) { RDX_ERR("memfd_create: %s", strerror(errno)); return NULL; }
    0          
865 2 50         if (ftruncate(fd, (off_t)total) < 0) {
866 0 0         RDX_ERR("ftruncate: %s", strerror(errno)); close(fd); return NULL;
867             }
868 2           (void)fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW);
869 2           void *base = mmap(NULL, (size_t)total, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
870 2 50         if (base == MAP_FAILED) { RDX_ERR("mmap: %s", strerror(errno)); close(fd); return NULL; }
    0          
871 2           rdx_init_header(base, node_cap, arena_cap, total);
872 2           return rdx_setup(base, (size_t)total, NULL, fd);
873             }
874              
875 2           static RdxHandle *rdx_open_fd(int fd, char *errbuf) {
876 2 50         if (errbuf) errbuf[0] = '\0';
877             struct stat st;
878 2 50         if (fstat(fd, &st) < 0) { RDX_ERR("fstat: %s", strerror(errno)); return NULL; }
    0          
879 2 100         if ((uint64_t)st.st_size < sizeof(RdxHeader)) { RDX_ERR("too small"); return NULL; }
    50          
880 1           size_t ms = (size_t)st.st_size;
881 1           void *base = mmap(NULL, ms, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
882 1 50         if (base == MAP_FAILED) { RDX_ERR("mmap: %s", strerror(errno)); return NULL; }
    0          
883 1 50         if (!rdx_validate_header((RdxHeader *)base, (uint64_t)st.st_size)) {
884 0 0         RDX_ERR("invalid radix-tree table"); munmap(base, ms); return NULL;
885             }
886 1           int myfd = fcntl(fd, F_DUPFD_CLOEXEC, 0);
887 1 50         if (myfd < 0) { RDX_ERR("fcntl: %s", strerror(errno)); munmap(base, ms); return NULL; }
    0          
888 1           return rdx_setup(base, ms, NULL, myfd);
889             }
890              
891 24           static void rdx_destroy(RdxHandle *h) {
892 24 50         if (!h) return;
893 24 100         if (h->backing_fd >= 0) close(h->backing_fd);
894 24 50         if (h->base) munmap(h->base, h->mmap_size);
895 24           free(h->path);
896 24           free(h);
897             }
898              
899 2           static inline int rdx_msync(RdxHandle *h) {
900 2 50         if (!h || !h->base) return 0;
    50          
901 2           return msync(h->base, h->mmap_size, MS_SYNC);
902             }
903              
904             #endif /* RADIX_H */