File Coverage

intern.h
Criterion Covered Total %
statement 238 366 65.0
branch 105 282 37.2
condition n/a
subroutine n/a
pod n/a
total 343 648 52.9


line stmt bran cond sub pod time code
1             /*
2             * intern.h -- Shared-memory string interning table for Linux
3             *
4             * Maps arbitrary byte strings to dense uint32 ids and back. Each string is
5             * stored once in an append-only arena ([uint32 len][bytes]); an open-addressed
6             * forward hash maps string -> id; reverse[id] -> arena offset is the one
7             * authoritative id->offset map. Several processes share the mapping; a
8             * write-preferring futex rwlock with reader-slot dead-process recovery guards
9             * mutation.
10             *
11             * Layout: Header -> reader_slots[1024] -> forward_hash -> reverse_array -> arena
12             */
13              
14             #ifndef INTERN_H
15             #define INTERN_H
16              
17             #include
18             #include
19             #include
20             #include
21             #include
22             #include
23             #include
24             #include
25             #include
26             #include
27             #include
28             #include
29             #include
30             #include
31             #include
32             #include
33              
34             #define XXH_INLINE_ALL
35             #include "xxhash.h"
36              
37             #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
38             #error "intern.h: requires little-endian architecture"
39             #endif
40              
41              
42             /* ================================================================
43             * Constants
44             * ================================================================ */
45              
46             #define SI_MAGIC 0x544E4953U /* "SINT" (little-endian) */
47             #define SI_VERSION 1
48             #define SI_ERR_BUFLEN 256
49             #define SI_READER_SLOTS 1024 /* max concurrent reader processes for dead-process recovery */
50             #define SI_MAX_STRINGS 0x40000000u /* id-space cap (2^30) */
51             #define SI_MAX_ARENA 0xFFFFFFFFu /* arena cap (offsets are uint32) */
52              
53             #define SI_ERR(fmt, ...) do { if (errbuf) snprintf(errbuf, SI_ERR_BUFLEN, fmt, ##__VA_ARGS__); } while (0)
54              
55             /* ================================================================
56             * Structs
57             * ================================================================ */
58              
59             /* forward-hash slot (open addressing): string -> id. Stores only the id; the
60             string bytes are reached via reverse[id] -> arena, so there is one
61             authoritative id->offset map. `fp` is the low 8 hash bits, a cheap
62             fingerprint to skip most full compares on a probe collision. */
63             typedef struct {
64             uint32_t id; /* interned id */
65             uint8_t fp; /* low 8 bits of the hash */
66             uint8_t state; /* 0 empty, 1 occupied */
67             uint16_t _pad;
68             } SiSlot;
69              
70             /* Per-process slot for dead-process recovery. Each shared rwlock counter
71             * (the main rwlock-reader count, rwlock_waiters, rwlock_writers_waiting)
72             * is mirrored here so a wrlock timeout can attribute and reverse a dead
73             * process's contribution instead of waiting for the slow per-op timeout
74             * drain. */
75             typedef struct {
76             uint32_t pid; /* 0 = unclaimed */
77             uint32_t subcount; /* in-flight rdlock acquisitions for this process */
78             uint32_t waiters_parked; /* contribution to hdr->rwlock_waiters */
79             uint32_t writers_parked; /* contribution to hdr->rwlock_writers_waiting */
80             } SiReaderSlot;
81              
82             struct SiHeader {
83             uint32_t magic, version; /* 0,4 */
84             uint32_t max_strings; /* 8 id capacity */
85             uint32_t hash_slots; /* 12 forward-hash slots (power of two) */
86             uint32_t arena_bytes; /* 16 arena capacity */
87             uint32_t count; /* 20 interned strings; also the next id to assign */
88             uint32_t arena_used; /* 24 bytes used in the arena */
89             uint32_t _pad0; /* 28 */
90             uint64_t total_size; /* 32 */
91             uint64_t reader_slots_off; /* 40 */
92             uint64_t hash_off; /* 48 */
93             uint64_t reverse_off; /* 56 */
94             uint64_t arena_off; /* 64 */
95             uint32_t rwlock; /* 72 */
96             uint32_t rwlock_waiters; /* 76 */
97             uint32_t rwlock_writers_waiting; /* 80 */
98             uint32_t _pad1; /* 84 */
99             uint64_t stat_ops; /* 88 */
100             uint8_t _pad[160]; /* 96..255 */
101             };
102             typedef struct SiHeader SiHeader;
103              
104             _Static_assert(sizeof(SiHeader) == 256, "SiHeader must be 256 bytes");
105              
106             /* ---- Process-local handle ---- */
107              
108             typedef struct SiHandle {
109             SiHeader *hdr;
110             SiReaderSlot *reader_slots; /* SI_READER_SLOTS entries */
111             SiSlot *slots; /* forward hash: string -> id */
112             uint32_t *reverse; /* id -> arena offset */
113             uint8_t *arena; /* string store ([uint32 len][bytes] records) */
114             size_t mmap_size;
115             char *path; /* backing file path (strdup'd) */
116             int backing_fd; /* memfd or reopened-fd to close on destroy, -1 for file/anon */
117             uint32_t my_slot_idx; /* UINT32_MAX if all slots taken (no recovery for this handle) */
118             uint32_t cached_pid; /* getpid() cached at last slot claim */
119             uint32_t cached_fork_gen; /* si_fork_gen value at last slot claim */
120             } SiHandle;
121              
122             /* ================================================================
123             * Helpers
124             * ================================================================ */
125              
126 19           static inline uint32_t si_next_pow2(uint32_t v) {
127 19 50         if (v < 2) return 1;
128 19           return 1u << (32 - __builtin_clz(v - 1));
129             }
130              
131             /* string hash (XXH3): deterministic across processes on this LE platform */
132 114163           static inline uint64_t si_hash(const void *s, size_t n) {
133 114163           return XXH3_64bits(s, n);
134             }
135              
136             /* ================================================================
137             * Futex-based write-preferring read-write lock
138             * with reader-slot dead-process recovery
139             * ================================================================ */
140              
141             #define SI_RWLOCK_SPIN_LIMIT 32
142             #define SI_LOCK_TIMEOUT_SEC 2 /* FUTEX_WAIT timeout for stale lock detection */
143              
144 0           static inline void si_rwlock_spin_pause(void) {
145             #if defined(__x86_64__) || defined(__i386__)
146 0           __asm__ volatile("pause" ::: "memory");
147             #elif defined(__aarch64__)
148             __asm__ volatile("yield" ::: "memory");
149             #else
150             __asm__ volatile("" ::: "memory");
151             #endif
152 0           }
153              
154             /* Extract writer PID from rwlock value (lower 31 bits when write-locked). */
155             #define SI_RWLOCK_WRITER_BIT 0x80000000U
156             #define SI_RWLOCK_PID_MASK 0x7FFFFFFFU
157             #define SI_RWLOCK_WR(pid) (SI_RWLOCK_WRITER_BIT | ((uint32_t)(pid) & SI_RWLOCK_PID_MASK))
158              
159             /* Check if a PID is alive. Returns 1 if alive or unknown, 0 if definitely dead. */
160             /* Liveness via kill(pid,0). NOTE: cannot detect PID reuse -- if a dead
161             * lock-holder's PID is recycled to an unrelated live process before recovery
162             * runs, this reports "alive" and that slot's orphaned contribution is not
163             * reclaimed until the recycled process exits. Robust detection would require
164             * a per-slot process-start-time epoch (a header-layout/version change).
165             * Documented under "Crash Safety" in the POD. */
166 0           static inline int si_pid_alive(uint32_t pid) {
167 0 0         if (pid == 0) return 1; /* no owner recorded, assume alive */
168 0 0         return !(kill((pid_t)pid, 0) == -1 && errno == ESRCH);
    0          
169             }
170              
171             /* Force-recover a stale write lock left by a dead process.
172             * CAS to OUR pid to hold the lock while fixing shared state, then release.
173             * Using our pid (not a bare WRITER_BIT sentinel) means a subsequent
174             * recovering process can detect and re-recover if we crash mid-recovery. */
175 0           static inline void si_recover_stale_lock(SiHandle *h, uint32_t observed_rwlock) {
176 0           SiHeader *hdr = h->hdr;
177 0           uint32_t mypid = SI_RWLOCK_WR((uint32_t)getpid());
178 0 0         if (!__atomic_compare_exchange_n(&hdr->rwlock, &observed_rwlock,
179             mypid, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
180 0           return;
181             /* We now hold the write lock as mypid. No additional shared state needs
182             * repair here (this module has no seqlock); just release the lock. */
183 0           __atomic_store_n(&hdr->rwlock, 0, __ATOMIC_RELEASE);
184 0 0         if (__atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
185 0           syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
186             }
187              
188             static const struct timespec si_lock_timeout = { SI_LOCK_TIMEOUT_SEC, 0 };
189              
190             /* Process-global fork-generation counter. Incremented in the pthread_atfork
191             * child callback so every open handle detects a fork transition on the next
192             * lock call without paying a getpid() syscall on the hot path. */
193             static uint32_t si_fork_gen = 1;
194             static pthread_once_t si_atfork_once = PTHREAD_ONCE_INIT;
195 0           static void si_on_fork_child(void) {
196 0           __atomic_add_fetch(&si_fork_gen, 1, __ATOMIC_RELAXED);
197 0           }
198 3           static void si_atfork_init(void) {
199 3           pthread_atfork(NULL, NULL, si_on_fork_child);
200 3           }
201              
202             /* Ensure this process owns a reader slot. Called from the lock helpers so
203             * that fork()'d children pick up their own slot lazily instead of sharing
204             * the parent's. Hot-path is a single relaxed load + compare; only on a
205             * fork-generation mismatch do we touch getpid() and scan slots. */
206 151201           static inline void si_claim_reader_slot(SiHandle *h) {
207 151201           uint32_t cur_gen = __atomic_load_n(&si_fork_gen, __ATOMIC_RELAXED);
208 151201 100         if (__builtin_expect(cur_gen == h->cached_fork_gen && h->my_slot_idx != UINT32_MAX, 1))
    50          
209 151188           return;
210             /* Cold path -- register the atfork hook once per process, then claim. */
211 13           pthread_once(&si_atfork_once, si_atfork_init);
212             /* Re-read after pthread_once: si_on_fork_child may have bumped it. */
213 13           cur_gen = __atomic_load_n(&si_fork_gen, __ATOMIC_RELAXED);
214 13           uint32_t now_pid = (uint32_t)getpid();
215 13           h->cached_pid = now_pid;
216 13           h->cached_fork_gen = cur_gen;
217 13           h->my_slot_idx = UINT32_MAX;
218 13           uint32_t start = now_pid % SI_READER_SLOTS;
219 14 50         for (uint32_t i = 0; i < SI_READER_SLOTS; i++) {
220 14           uint32_t s = (start + i) % SI_READER_SLOTS;
221 14           uint32_t expected = 0;
222 14 100         if (__atomic_compare_exchange_n(&h->reader_slots[s].pid,
223             &expected, now_pid, 0,
224             __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
225             /* Zero all mirror fields, not just subcount: a SIGKILL'd
226             * predecessor may have left waiters_parked/writers_parked
227             * non-zero, and si_recover_dead_readers won't drain them
228             * once we own the slot (the CAS expects the dead PID). */
229 13           __atomic_store_n(&h->reader_slots[s].subcount, 0, __ATOMIC_RELAXED);
230 13           __atomic_store_n(&h->reader_slots[s].waiters_parked, 0, __ATOMIC_RELAXED);
231 13           __atomic_store_n(&h->reader_slots[s].writers_parked, 0, __ATOMIC_RELAXED);
232 13           h->my_slot_idx = s;
233 13           return;
234             }
235             }
236             /* Table full -- leave my_slot_idx = UINT32_MAX so we silently skip
237             * tracking for this handle (lock still works; just no recovery). */
238             }
239              
240             /* Atomically subtract `sub` from a counter, capped at 0 (never underflows). */
241 0           static inline void si_atomic_sub_cap(uint32_t *p, uint32_t sub) {
242 0 0         if (!sub) return;
243 0           uint32_t cur = __atomic_load_n(p, __ATOMIC_RELAXED);
244 0           for (;;) {
245 0 0         uint32_t want = (cur > sub) ? cur - sub : 0;
246 0 0         if (__atomic_compare_exchange_n(p, &cur, want,
247             1, __ATOMIC_RELAXED, __ATOMIC_RELAXED))
248 0           return;
249             }
250             }
251              
252             /* Try to claim a dead slot (CAS pid -> 0) and drain its parked-waiter
253             * contributions back to the global counters. A no-op if the slot was stolen
254             * by another recoverer or had no waiter contribution to drain.
255             *
256             * Note: subcount/waiters_parked/writers_parked are NOT zeroed here.
257             * Between our CAS and a follow-up store, a new process could claim the
258             * slot and start populating these fields -- our stores would clobber its
259             * state. si_claim_reader_slot zeros all three on every claim, so
260             * leaving stale values is harmless. */
261 0           static inline void si_drain_dead_slot(SiHandle *h, uint32_t i, uint32_t pid) {
262 0           SiHeader *hdr = h->hdr;
263 0           uint32_t expected = pid;
264             /* ACQ_REL on success: RELEASE publishes pid=0 to other observers;
265             * ACQUIRE syncs us with prior writes from the dead process to
266             * waiters_parked/writers_parked. On weakly-ordered archs (aarch64)
267             * a plain RELAXED load before the CAS could miss those writes;
268             * loading them after the CAS keeps them inside the acquire window. */
269 0 0         if (!__atomic_compare_exchange_n(&h->reader_slots[i].pid, &expected, 0,
270             0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
271 0           return;
272 0           uint32_t wp = __atomic_load_n(&h->reader_slots[i].waiters_parked, __ATOMIC_RELAXED);
273 0           uint32_t writp = __atomic_load_n(&h->reader_slots[i].writers_parked, __ATOMIC_RELAXED);
274 0 0         if (wp) si_atomic_sub_cap(&hdr->rwlock_waiters, wp);
275 0 0         if (writp) si_atomic_sub_cap(&hdr->rwlock_writers_waiting, writp);
276             }
277              
278             /* Scan reader slots for dead-process recovery.
279             *
280             * For each dead PID with non-zero contributions to the shared rwlock,
281             * rwlock_waiters, or rwlock_writers_waiting counters, drain its share back
282             * out so live processes don't have to wait for the slow per-op timeout
283             * decrement to drain it for them.
284             *
285             * For the main rwlock counter we use the "no live reader holds -> force-
286             * reset to 0" trick (precise) because per-process attribution of the
287             * subcount is racy across the inc-counter-then-inc-subcount window. */
288 0           static inline void si_recover_dead_readers(SiHandle *h) {
289 0 0         if (!h->reader_slots) return;
290 0           SiHeader *hdr = h->hdr;
291 0           int any_live_reader = 0;
292 0           int found_dead_reader = 0;
293              
294             /* Pass 1: classify slots. Slots with dead pid and sc == 0 (no rwlock
295             * contribution to lose) are wiped immediately to free the slot for
296             * future claimants and drain any orphan parked-waiter counters. Slots
297             * with dead pid and sc > 0 are left intact in this pass: if force-
298             * reset cannot fire (because a live reader is concurrently present),
299             * wiping the dead slot would lose the only record of its orphan
300             * rwlock contribution and strand writers permanently once the live
301             * reader releases. */
302 0 0         for (uint32_t i = 0; i < SI_READER_SLOTS; i++) {
303 0           uint32_t pid = __atomic_load_n(&h->reader_slots[i].pid, __ATOMIC_ACQUIRE);
304 0 0         if (pid == 0) continue;
305 0           uint32_t sc = __atomic_load_n(&h->reader_slots[i].subcount, __ATOMIC_RELAXED);
306 0 0         if (si_pid_alive(pid)) {
307 0 0         if (sc > 0) any_live_reader = 1;
308 0           continue;
309             }
310 0 0         if (sc > 0) { found_dead_reader = 1; continue; }
311 0           si_drain_dead_slot(h, i, pid);
312             }
313              
314             /* Pass 2: only if force-reset will fire. Issue the rwlock force-
315             * reset CAS FIRST, while the window since pass 1's last scan is
316             * still narrow (a handful of instructions, as in the original
317             * single-pass code). A new reader that started rdlock between
318             * pass 1's scan and the CAS will either:
319             * (a) have already CAS'd rwlock from cur to cur+1 -- our CAS then
320             * fails (cur mismatched), recovery yields and a future
321             * cycle retries; or
322             * (b) be still in the subcount-bump phase -- our CAS sees the
323             * stale cur and resets to 0; the new reader's subsequent CAS
324             * rwlock(0 -> 1) succeeds cleanly.
325             * Only after the CAS resolves do we wipe the deferred dead slots,
326             * keeping that work outside the race-sensitive window. */
327 0 0         if (found_dead_reader && !any_live_reader) {
    0          
328 0           uint32_t cur = __atomic_load_n(&hdr->rwlock, __ATOMIC_RELAXED);
329 0 0         if (cur > 0 && cur < SI_RWLOCK_WRITER_BIT) {
    0          
330 0 0         if (__atomic_compare_exchange_n(&hdr->rwlock, &cur, 0,
331             0, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
332 0 0         if (__atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
333 0           syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
334             }
335             }
336 0 0         for (uint32_t i = 0; i < SI_READER_SLOTS; i++) {
337 0           uint32_t pid = __atomic_load_n(&h->reader_slots[i].pid, __ATOMIC_ACQUIRE);
338 0 0         if (pid == 0 || si_pid_alive(pid)) continue;
    0          
339 0           si_drain_dead_slot(h, i, pid);
340             }
341             }
342             }
343              
344             /* Inspect the lock word after a futex-wait timeout. If a dead writer
345             * holds it, force-recover the lock. Otherwise drain dead readers' shares
346             * of the rwlock/waiter counters. Called from rdlock and wrlock ETIMEDOUT
347             * branches -- identical recovery logic in both. */
348 0           static inline void si_recover_after_timeout(SiHandle *h) {
349 0           SiHeader *hdr = h->hdr;
350 0           uint32_t val = __atomic_load_n(&hdr->rwlock, __ATOMIC_RELAXED);
351 0 0         if (val >= SI_RWLOCK_WRITER_BIT) {
352 0           uint32_t pid = val & SI_RWLOCK_PID_MASK;
353 0 0         if (!si_pid_alive(pid))
354 0           si_recover_stale_lock(h, val);
355             } else {
356 0           si_recover_dead_readers(h);
357             }
358 0           }
359              
360             /* Park/unpark helpers: bump the global waiter counters together with this
361             * process's mirrored slot counters so a wrlock-timeout recovery scan can
362             * attribute and reverse a dead PID's contribution. Kept paired to make
363             * accidental drift between global and per-slot counts impossible. */
364 0           static inline void si_park_reader(SiHandle *h) {
365 0 0         if (h->my_slot_idx != UINT32_MAX)
366 0           __atomic_add_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
367 0           __atomic_add_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
368 0           }
369 0           static inline void si_unpark_reader(SiHandle *h) {
370 0           __atomic_sub_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
371 0 0         if (h->my_slot_idx != UINT32_MAX)
372 0           __atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
373 0           }
374 0           static inline void si_park_writer(SiHandle *h) {
375 0 0         if (h->my_slot_idx != UINT32_MAX) {
376 0           __atomic_add_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
377 0           __atomic_add_fetch(&h->reader_slots[h->my_slot_idx].writers_parked, 1, __ATOMIC_RELAXED);
378             }
379 0           __atomic_add_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
380 0           __atomic_add_fetch(&h->hdr->rwlock_writers_waiting, 1, __ATOMIC_RELAXED);
381 0           }
382 0           static inline void si_unpark_writer(SiHandle *h) {
383 0           __atomic_sub_fetch(&h->hdr->rwlock_waiters, 1, __ATOMIC_RELAXED);
384 0           __atomic_sub_fetch(&h->hdr->rwlock_writers_waiting, 1, __ATOMIC_RELAXED);
385 0 0         if (h->my_slot_idx != UINT32_MAX) {
386 0           __atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].waiters_parked, 1, __ATOMIC_RELAXED);
387 0           __atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].writers_parked, 1, __ATOMIC_RELAXED);
388             }
389 0           }
390              
391 109660           static inline void si_rwlock_rdlock(SiHandle *h) {
392 109660           si_claim_reader_slot(h);
393 109660           SiHeader *hdr = h->hdr;
394 109660           uint32_t *lock = &hdr->rwlock;
395 109660           uint32_t *writers_waiting = &hdr->rwlock_writers_waiting;
396             /* Claim subcount BEFORE bumping the shared rwlock counter. This way
397             * a concurrent writer-side recovery scan that sees our PID alive with
398             * subcount > 0 will (correctly) defer force-reset, even while we are
399             * still spinning trying to win the rwlock CAS. Without this, a reader
400             * killed between rwlock CAS-success and subcount++ would let recovery
401             * force-reset rwlock to 0 underneath us, causing a UINT32_MAX wrap on
402             * our eventual rdunlock dec. */
403 109660 50         if (h->my_slot_idx != UINT32_MAX)
404 109660           __atomic_add_fetch(&h->reader_slots[h->my_slot_idx].subcount, 1, __ATOMIC_RELAXED);
405 109660           for (int spin = 0; ; spin++) {
406 109660           uint32_t cur = __atomic_load_n(lock, __ATOMIC_RELAXED);
407             /* Write-preferring: when lock is free (cur==0) and writers are
408             * waiting, yield to let the writer acquire. When readers are
409             * already active (cur>=1), new readers may join freely. */
410 109660 50         if (cur > 0 && cur < SI_RWLOCK_WRITER_BIT) {
    0          
411 0 0         if (__atomic_compare_exchange_n(lock, &cur, cur + 1,
412             1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
413 109660           return;
414 109660 50         } else if (cur == 0 && !__atomic_load_n(writers_waiting, __ATOMIC_RELAXED)) {
    50          
415 109660 50         if (__atomic_compare_exchange_n(lock, &cur, 1,
416             1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
417 109660           return;
418             }
419 0 0         if (__builtin_expect(spin < SI_RWLOCK_SPIN_LIMIT, 1)) {
420 0           si_rwlock_spin_pause();
421 0           continue;
422             }
423 0           si_park_reader(h);
424 0           cur = __atomic_load_n(lock, __ATOMIC_RELAXED);
425             /* Sleep when write-locked OR when yielding to waiting writers */
426 0 0         if (cur >= SI_RWLOCK_WRITER_BIT || cur == 0) {
    0          
427 0           long rc = syscall(SYS_futex, lock, FUTEX_WAIT, cur,
428             &si_lock_timeout, NULL, 0);
429 0 0         if (rc == -1 && errno == ETIMEDOUT) {
    0          
430 0           si_unpark_reader(h);
431 0           si_recover_after_timeout(h);
432 0           spin = 0;
433 0           continue;
434             }
435             }
436 0           si_unpark_reader(h);
437 0           spin = 0;
438             }
439             }
440              
441 109660           static inline void si_rwlock_rdunlock(SiHandle *h) {
442 109660           SiHeader *hdr = h->hdr;
443             /* Release the shared counter BEFORE dropping our subcount so that
444             * "any live PID with subcount > 0" is a reliable in-flight indicator
445             * for the writer-side recovery scan. Inverting these would create a
446             * window where we still own a unit of rwlock but our slot subcount is
447             * 0, letting recovery force-reset rwlock underneath us. */
448 109660           uint32_t after = __atomic_sub_fetch(&hdr->rwlock, 1, __ATOMIC_RELEASE);
449 109660 50         if (h->my_slot_idx != UINT32_MAX)
450 109660           __atomic_sub_fetch(&h->reader_slots[h->my_slot_idx].subcount, 1, __ATOMIC_RELAXED);
451 109660 50         if (after == 0 && __atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
    50          
452 0           syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
453 109660           }
454              
455 41541           static inline void si_rwlock_wrlock(SiHandle *h) {
456 41541           si_claim_reader_slot(h); /* refresh cached_pid across fork */
457 41541           SiHeader *hdr = h->hdr;
458 41541           uint32_t *lock = &hdr->rwlock;
459             /* Encode PID in the rwlock word itself (0x80000000 | pid) to eliminate
460             * any crash window between acquiring the lock and storing the owner. */
461 41541           uint32_t mypid = SI_RWLOCK_WR(h->cached_pid);
462 41541           for (int spin = 0; ; spin++) {
463 41541           uint32_t expected = 0;
464 41541 50         if (__atomic_compare_exchange_n(lock, &expected, mypid,
465             1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
466 41541           return;
467 0 0         if (__builtin_expect(spin < SI_RWLOCK_SPIN_LIMIT, 1)) {
468 0           si_rwlock_spin_pause();
469 0           continue;
470             }
471 0           si_park_writer(h);
472 0           uint32_t cur = __atomic_load_n(lock, __ATOMIC_RELAXED);
473 0 0         if (cur != 0) {
474 0           long rc = syscall(SYS_futex, lock, FUTEX_WAIT, cur,
475             &si_lock_timeout, NULL, 0);
476 0 0         if (rc == -1 && errno == ETIMEDOUT) {
    0          
477 0           si_unpark_writer(h);
478 0           si_recover_after_timeout(h);
479 0           spin = 0;
480 0           continue;
481             }
482             }
483 0           si_unpark_writer(h);
484 0           spin = 0;
485             }
486             }
487              
488 41541           static inline void si_rwlock_wrunlock(SiHandle *h) {
489 41541           SiHeader *hdr = h->hdr;
490 41541           __atomic_store_n(&hdr->rwlock, 0, __ATOMIC_RELEASE);
491 41541 50         if (__atomic_load_n(&hdr->rwlock_waiters, __ATOMIC_RELAXED) > 0)
492 0           syscall(SYS_futex, &hdr->rwlock, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
493 41541           }
494              
495             /* ================================================================
496             * Layout math + create / open / destroy
497             *
498             * Layout: Header -> reader_slots[1024] -> forward_hash -> reverse_array -> arena
499             * ================================================================ */
500              
501             /* Single source of truth for the mmap region layout offsets. */
502             typedef struct { uint64_t reader_slots, hash, reverse, arena; } SiLayout;
503              
504 41           static inline SiLayout si_layout(uint32_t hash_slots, uint32_t max_strings) {
505             SiLayout L;
506 41           L.reader_slots = sizeof(SiHeader);
507 41           L.hash = L.reader_slots + (uint64_t)SI_READER_SLOTS * sizeof(SiReaderSlot);
508 41           L.reverse = L.hash + (uint64_t)hash_slots * sizeof(SiSlot);
509 41           L.arena = L.reverse + (uint64_t)max_strings * sizeof(uint32_t);
510 41           L.arena = (L.arena + 7) & ~(uint64_t)7; /* 8-byte align the arena */
511 41           return L;
512             }
513              
514 22           static inline uint64_t si_total_size(uint32_t hash_slots, uint32_t max_strings, uint32_t arena_bytes) {
515 22           SiLayout L = si_layout(hash_slots, max_strings);
516 22           return L.arena + (uint64_t)arena_bytes;
517             }
518              
519 16           static inline void si_init_header(void *base, uint32_t max_strings, uint32_t hash_slots,
520             uint32_t arena_bytes, uint64_t total) {
521 16           SiLayout L = si_layout(hash_slots, max_strings);
522 16           SiHeader *hdr = (SiHeader *)base;
523             /* zero the header + reader slots + hash region only; the reverse array and
524             arena are read solely within [0,count)/[0,arena_used), both starting at 0,
525             and the fresh mapping is already zero-filled by the OS. */
526 16           memset(base, 0, (size_t)L.reverse);
527 16           hdr->magic = SI_MAGIC;
528 16           hdr->version = SI_VERSION;
529 16           hdr->max_strings = max_strings;
530 16           hdr->hash_slots = hash_slots;
531 16           hdr->arena_bytes = arena_bytes;
532 16           hdr->count = 0;
533 16           hdr->arena_used = 0;
534 16           hdr->total_size = total;
535 16           hdr->reader_slots_off = L.reader_slots;
536 16           hdr->hash_off = L.hash;
537 16           hdr->reverse_off = L.reverse;
538 16           hdr->arena_off = L.arena;
539 16           __atomic_thread_fence(__ATOMIC_SEQ_CST);
540 16           }
541              
542 19           static inline SiHandle *si_setup(void *base, size_t map_size,
543             const char *path, int backing_fd) {
544 19           SiHeader *hdr = (SiHeader *)base;
545 19           SiHandle *h = (SiHandle *)calloc(1, sizeof(SiHandle));
546 19 50         if (!h) {
547 0           munmap(base, map_size);
548 0 0         if (backing_fd >= 0) close(backing_fd);
549 0           return NULL;
550             }
551 19           h->hdr = hdr;
552 19           h->reader_slots = (SiReaderSlot *)((uint8_t *)base + hdr->reader_slots_off);
553 19           h->slots = (SiSlot *)((uint8_t *)base + hdr->hash_off);
554 19           h->reverse = (uint32_t *)((uint8_t *)base + hdr->reverse_off);
555 19           h->arena = (uint8_t *)base + hdr->arena_off;
556 19           h->mmap_size = map_size;
557 19 100         h->path = path ? strdup(path) : NULL;
558 19           h->backing_fd = backing_fd;
559 19           h->my_slot_idx = UINT32_MAX;
560 19           return h;
561             }
562              
563             /* Validate a mapped header (shared by si_create reopen and si_open_fd). */
564 3           static inline int si_validate_header(const SiHeader *hdr, uint64_t file_size) {
565 3 50         if (hdr->magic != SI_MAGIC) return 0;
566 3 50         if (hdr->version != SI_VERSION) return 0;
567 3 50         if (hdr->max_strings == 0 || hdr->max_strings > SI_MAX_STRINGS) return 0;
    50          
568 3 50         if (hdr->hash_slots == 0 || (hdr->hash_slots & (hdr->hash_slots - 1)) != 0) return 0; /* pow2 */
    50          
569 3 50         if (hdr->hash_slots <= hdr->max_strings) return 0; /* probe termination: an empty slot always exists */
570 3 50         if (hdr->arena_bytes == 0) return 0;
571 3 50         if (hdr->total_size != file_size) return 0;
572 3 50         if (hdr->total_size != si_total_size(hdr->hash_slots, hdr->max_strings, hdr->arena_bytes)) return 0;
573 3           SiLayout L = si_layout(hdr->hash_slots, hdr->max_strings);
574 3 50         if (hdr->reader_slots_off != L.reader_slots) return 0;
575 3 50         if (hdr->hash_off != L.hash) return 0;
576 3 50         if (hdr->reverse_off != L.reverse) return 0;
577 3 50         if (hdr->arena_off != L.arena) return 0;
578 3 50         if (hdr->count > hdr->max_strings) return 0;
579 3 50         if (hdr->arena_used > hdr->arena_bytes) return 0;
580 3           return 1;
581             }
582              
583             /* validate args + compute the hash-slot count and (if 0) a default arena size */
584 20           static int si_validate_create_args(uint32_t max_strings, uint32_t *arena_bytes_io,
585             uint32_t *hash_slots, char *errbuf) {
586 20 50         if (errbuf) errbuf[0] = '\0';
587 20 100         if (max_strings == 0) { SI_ERR("max_strings must be > 0"); return 0; }
    50          
588 19 50         if (max_strings > SI_MAX_STRINGS) { SI_ERR("max_strings too large (max %u)", SI_MAX_STRINGS); return 0; }
    0          
589 19           uint64_t want = (uint64_t)max_strings * 10 / 7 + 1; /* hash load factor ~0.7 */
590             /* next_pow2(want) is always strictly > max_strings, so a probe always finds
591             an empty slot (lookup misses cannot loop forever). With max_strings capped
592             at 2^30, want <= ~1.43*2^30 < 2^31, whose next_pow2 is 2^31 -- fits uint32. */
593 19           *hash_slots = si_next_pow2((uint32_t)want);
594 19 100         if (*arena_bytes_io == 0) { /* default arena: 32 bytes/string */
595 7           uint64_t a = (uint64_t)max_strings * 32;
596 7 50         if (a > SI_MAX_ARENA) a = SI_MAX_ARENA;
597 7 50         if (a < 64) a = 64;
598 7           *arena_bytes_io = (uint32_t)a;
599             }
600 19           return 1;
601             }
602              
603 18           static SiHandle *si_create(const char *path, uint32_t max_strings, uint32_t arena_bytes, char *errbuf) {
604             uint32_t hash_slots;
605 18 100         if (!si_validate_create_args(max_strings, &arena_bytes, &hash_slots, errbuf)) return NULL;
606              
607 17           uint64_t total = si_total_size(hash_slots, max_strings, arena_bytes);
608 17           int anonymous = (path == NULL);
609 17           int fd = -1;
610             size_t map_size;
611             void *base;
612              
613 17 100         if (anonymous) {
614 10           map_size = (size_t)total;
615 10           base = mmap(NULL, map_size, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
616 10 50         if (base == MAP_FAILED) { SI_ERR("mmap: %s", strerror(errno)); return NULL; }
    0          
617             } else {
618 7           fd = open(path, O_RDWR|O_CREAT, 0666);
619 10 50         if (fd < 0) { SI_ERR("open: %s", strerror(errno)); return NULL; }
    0          
620 7 50         if (flock(fd, LOCK_EX) < 0) { SI_ERR("flock: %s", strerror(errno)); close(fd); return NULL; }
    0          
621             struct stat st;
622 7 50         if (fstat(fd, &st) < 0) { SI_ERR("fstat: %s", strerror(errno)); flock(fd, LOCK_UN); close(fd); return NULL; }
    0          
623 7           int is_new = (st.st_size == 0);
624 7 100         if (!is_new && (uint64_t)st.st_size < sizeof(SiHeader)) {
    100          
625 1 50         SI_ERR("%s: file too small (%lld)", path, (long long)st.st_size);
626 1           flock(fd, LOCK_UN); close(fd); return NULL;
627             }
628 6 100         if (is_new && ftruncate(fd, (off_t)total) < 0) {
    50          
629 0 0         SI_ERR("ftruncate: %s", strerror(errno)); flock(fd, LOCK_UN); close(fd); return NULL;
630             }
631 6 100         map_size = is_new ? (size_t)total : (size_t)st.st_size;
632 6           base = mmap(NULL, map_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
633 6 50         if (base == MAP_FAILED) { SI_ERR("mmap: %s", strerror(errno)); flock(fd, LOCK_UN); close(fd); return NULL; }
    0          
634 6 100         if (!is_new) {
635 2 50         if (!si_validate_header((SiHeader *)base, (uint64_t)st.st_size)) {
636 0 0         SI_ERR("invalid intern file"); munmap(base, map_size); flock(fd, LOCK_UN); close(fd); return NULL;
637             }
638 2           flock(fd, LOCK_UN); close(fd);
639 2           return si_setup(base, map_size, path, -1);
640             }
641             }
642 14           si_init_header(base, max_strings, hash_slots, arena_bytes, total);
643 14 100         if (fd >= 0) { flock(fd, LOCK_UN); close(fd); }
644 14           return si_setup(base, map_size, path, -1);
645             }
646              
647 2           static SiHandle *si_create_memfd(const char *name, uint32_t max_strings, uint32_t arena_bytes, char *errbuf) {
648             uint32_t hash_slots;
649 2 50         if (!si_validate_create_args(max_strings, &arena_bytes, &hash_slots, errbuf)) return NULL;
650              
651 2           uint64_t total = si_total_size(hash_slots, max_strings, arena_bytes);
652 2 100         int fd = memfd_create(name ? name : "intern", MFD_CLOEXEC | MFD_ALLOW_SEALING);
653 2 50         if (fd < 0) { SI_ERR("memfd_create: %s", strerror(errno)); return NULL; }
    0          
654 2 50         if (ftruncate(fd, (off_t)total) < 0) {
655 0 0         SI_ERR("ftruncate: %s", strerror(errno)); close(fd); return NULL;
656             }
657 2           (void)fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW);
658 2           void *base = mmap(NULL, (size_t)total, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
659 2 50         if (base == MAP_FAILED) { SI_ERR("mmap: %s", strerror(errno)); close(fd); return NULL; }
    0          
660 2           si_init_header(base, max_strings, hash_slots, arena_bytes, total);
661 2           return si_setup(base, (size_t)total, NULL, fd);
662             }
663              
664 1           static SiHandle *si_open_fd(int fd, char *errbuf) {
665 1 50         if (errbuf) errbuf[0] = '\0';
666             struct stat st;
667 1 50         if (fstat(fd, &st) < 0) { SI_ERR("fstat: %s", strerror(errno)); return NULL; }
    0          
668 1 50         if ((uint64_t)st.st_size < sizeof(SiHeader)) { SI_ERR("too small"); return NULL; }
    0          
669 1           size_t ms = (size_t)st.st_size;
670 1           void *base = mmap(NULL, ms, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
671 1 50         if (base == MAP_FAILED) { SI_ERR("mmap: %s", strerror(errno)); return NULL; }
    0          
672 1 50         if (!si_validate_header((SiHeader *)base, (uint64_t)st.st_size)) {
673 0 0         SI_ERR("invalid intern table"); munmap(base, ms); return NULL;
674             }
675 1           int myfd = fcntl(fd, F_DUPFD_CLOEXEC, 0);
676 1 50         if (myfd < 0) { SI_ERR("fcntl: %s", strerror(errno)); munmap(base, ms); return NULL; }
    0          
677 1           return si_setup(base, ms, NULL, myfd);
678             }
679              
680 19           static void si_destroy(SiHandle *h) {
681 19 50         if (!h) return;
682 19 100         if (h->backing_fd >= 0) close(h->backing_fd);
683 19 50         if (h->hdr) munmap(h->hdr, h->mmap_size);
684 19           free(h->path);
685 19           free(h);
686             }
687              
688 4           static inline int si_msync(SiHandle *h) {
689 4 50         if (!h || !h->hdr) return 0;
    50          
690 4           return msync(h->hdr, h->mmap_size, MS_SYNC);
691             }
692              
693             /* ================================================================
694             * Interning (callers hold the lock)
695             * ================================================================ */
696              
697             /* reset to empty (caller holds the write lock) */
698 2           static inline void si_clear_locked(SiHandle *h) {
699 2           SiHeader *hdr = h->hdr;
700 2           hdr->count = 0;
701 2           hdr->arena_used = 0;
702 2           memset(h->slots, 0, (size_t)hdr->hash_slots * sizeof(SiSlot));
703 2           }
704              
705             /* the string record at arena offset `off`: sets *len, returns a pointer to the
706             bytes (the uint32 length prefix is read unaligned-safely) */
707 122068           static inline const char *si_arena_str(SiHandle *h, uint32_t off, uint32_t *len) {
708             uint32_t l;
709 122068           memcpy(&l, h->arena + off, sizeof(l));
710 122068           *len = l;
711 122068           return (const char *)(h->arena + off + sizeof(uint32_t));
712             }
713              
714             /* slot for (s,n): if *found, an occupied matching slot; else the first empty
715             slot for insertion. A probe always terminates (hash_slots > max_strings >= count). */
716 114163           static inline uint32_t si_idx_find(SiHandle *h, const char *s, size_t n, uint64_t hash, int *found) {
717 114163           uint32_t mask = h->hdr->hash_slots - 1;
718 114163           uint32_t i = (uint32_t)(hash & mask);
719 114163           uint8_t want_fp = (uint8_t)(hash & 0xff);
720 123629 100         while (h->slots[i].state) {
721 86483 100         if (h->slots[i].fp == want_fp) {
722             uint32_t l;
723 85053           const char *cand = si_arena_str(h, h->reverse[h->slots[i].id], &l);
724 85053 100         if (l == n && memcmp(cand, s, n) == 0) { *found = 1; return i; }
    100          
725             }
726 9466           i = (i + 1) & mask;
727             }
728 37146           *found = 0;
729 37146           return i;
730             }
731              
732             /* id of (s,n) if present: returns 1 and sets *id, else 0 */
733 72624           static inline int si_id_of_locked(SiHandle *h, const char *s, size_t n, uint32_t *id) {
734             int f;
735 72624           uint32_t i = si_idx_find(h, s, n, si_hash(s, n), &f);
736 72624 100         if (f) { *id = h->slots[i].id; return 1; }
737 3           return 0;
738             }
739              
740             /* intern (s,n): returns the id (>=0, existing or new), or -1 if the id space or
741             the arena is exhausted */
742 41539           static int64_t si_intern_locked(SiHandle *h, const char *s, size_t n) {
743 41539           SiHeader *hdr = h->hdr;
744 41539           uint64_t hash = si_hash(s, n);
745             int f;
746 41539           uint32_t slot = si_idx_find(h, s, n, hash, &f);
747 41539 100         if (f) return h->slots[slot].id;
748 37143 100         if (hdr->count >= hdr->max_strings) return -1;
749 37142           uint64_t need = (uint64_t)sizeof(uint32_t) + n; /* arena cap (<= UINT32_MAX) also bounds n */
750 37142 100         if ((uint64_t)hdr->arena_used + need > hdr->arena_bytes) return -1;
751 37139           uint32_t off = hdr->arena_used;
752 37139           uint32_t l = (uint32_t)n;
753 37139           memcpy(h->arena + off, &l, sizeof(l));
754 37139 100         if (n) memcpy(h->arena + off + sizeof(uint32_t), s, n);
755 37139           hdr->arena_used += (uint32_t)need;
756 37139           uint32_t id = hdr->count;
757 37139           h->reverse[id] = off;
758 37139           h->slots[slot].id = id;
759 37139           h->slots[slot].fp = (uint8_t)(hash & 0xff);
760 37139           h->slots[slot].state = 1;
761 37139           hdr->count++;
762 37139           return id;
763             }
764              
765             #endif /* INTERN_H */