File Coverage

blib/lib/Hypersonic/Event/IOUring.pm
Criterion Covered Total %
statement 18 52 34.6
branch 2 16 12.5
condition 4 14 28.5
subroutine 6 22 27.2
pod 13 18 72.2
total 43 122 35.2


line stmt bran cond sub pod time code
1             package Hypersonic::Event::IOUring;
2              
3 21     21   202189 use strict;
  21         32  
  21         755  
4 21     21   86 use warnings;
  21         26  
  21         2780  
5 21     21   330 use 5.010;
  21         68  
6              
7 21     21   99 use parent 'Hypersonic::Event::Role';
  21         38  
  21         295  
8              
9             our $VERSION = '0.19';
10              
# Short identifier string for this event backend.
sub name {
    return 'io_uring';
}
12              
# Decide whether the io_uring backend can be used on this host.
#
# Returns 0 as soon as any probe fails; otherwise returns whatever
# Hypersonic::JIT::Util->can_run reports for the compile-link-and-run
# probe. Probes run cheapest first: OS, kernel version, liburing header,
# kernel.io_uring_disabled, then the compiled test program.
sub available {
    return 0 unless $^O eq 'linux';

    # Check kernel version >= 5.13.
    #
    # We need kernel 5.13+ (not just 5.1+) because the readiness-only
    # backend in 0.19+ uses io_uring_prep_poll_multishot() which was
    # added in Linux 5.13 / liburing 2.1 (Aug 2021). Multi-shot poll
    # is essential: with one-shot poll_add the userspace re-arm in
    # gen_get_fd races against the main loop's recv() in a way that
    # makes the freshly re-armed (level-triggered) poll fire while
    # the buffer still has unread data, then fire AGAIN with an
    # empty buffer after recv() drains, causing the next iteration
    # to recv() and get EAGAIN, which the main loop treats as a
    # disconnect. This bug killed all sequential-keep-alive tests
    # (t/2100..t/2102, t/0035 WebSocket echo) the first time we
    # tried readiness-only mode. Multi-shot lets the kernel manage
    # the re-arm atomically with the readiness check, avoiding the
    # race entirely.
    #
    # Kernels < 5.13 fall back to epoll automatically via
    # Hypersonic::Event::best_backend's priority list. cpansmoker
    # hosts on Debian 12 (kernel 6.1+) and Fedora 43 (kernel 6.x)
    # all satisfy this; Debian 11 (5.10) and older fall back.
    my $ver = `uname -r 2>/dev/null` || '';
    my ($major, $minor) = $ver =~ /^(\d+)\.(\d+)/;
    return 0 unless $major && ($major > 5 || ($major == 5 && $minor >= 13));

    # Check for liburing headers. Debian-style multiarch installs place
    # them under /usr/include/<triplet>/, so glob for any *-linux-gnu*
    # directory instead of hard-coding the x86_64 one. This is only a
    # cheap pre-filter; the compile probe below is the real test.
    my $has_header = grep { -f $_ } (
        '/usr/include/liburing.h',
        '/usr/local/include/liburing.h',
        glob('/usr/include/*-linux-gnu*/liburing.h'),
    );
    return 0 unless $has_header;

    # io_uring may be disabled at the kernel level. RHEL9 ships with
    # kernel.io_uring_disabled=2 by default; a value of 1 or 2 means
    # the syscall returns EINVAL/EPERM regardless of liburing being
    # linkable. Bail before we sink time into a compile+link probe.
    # A missing sysctl file (open fails) is treated as "not disabled".
    if (open my $fh, '<', '/proc/sys/kernel/io_uring_disabled') {
        my $disabled = <$fh>;
        close $fh;
        chomp $disabled if defined $disabled;
        return 0 if defined $disabled && $disabled ne '0';
    }

    # Compile-link-and-RUN probe. A pure link check passes on systems
    # that have liburing installed but where io_uring_setup() will
    # nevertheless fail at runtime (kernel disabled, sandboxing, missing
    # liburing.so at exec time). Also probe for io_uring_prep_poll_multishot
    # symbol availability - the symbol was added in liburing 2.1, and
    # the actual multishot poll operation requires kernel 5.13+. If
    # either is missing we want to fall back to epoll silently.
    #
    # The last argument is the C prelude for the probe: liburing.h for
    # the io_uring_* calls and poll.h for POLLIN.
    require Hypersonic::JIT::Util;
    return Hypersonic::JIT::Util->can_run(
        '',
        '-luring',
        'struct io_uring ring; int rc = io_uring_queue_init(8, &ring, 0); '
        . 'if (rc < 0) return 1; '
        . 'struct io_uring_sqe* sqe = io_uring_get_sqe(&ring); '
        . 'if (!sqe) { io_uring_queue_exit(&ring); return 2; } '
        . 'io_uring_prep_poll_multishot(sqe, 0, POLLIN); '
        . 'io_uring_sqe_set_data(sqe, (void*)0); '
        . 'io_uring_queue_exit(&ring); return 0;',
        "#include <liburing.h>\n#include <poll.h>",
    );
}
79              
# C #include lines this backend needs in the generated source.
#
# Returns a newline-joined string (no trailing newline):
#   <liburing.h>   for the server loop;
#   <poll.h>       for POLLIN, the readiness mask passed to
#                  io_uring_prep_poll_multishot;
#   <sys/epoll.h>  for the UA::Async slot-tracking helpers
#                  (gen_create_loop / _add_with_slot / _get_slot) -
#                  io_uring is Linux 5.1+ which always has epoll.
sub includes {
    return join "\n",
        '#include <liburing.h>',
        '#include <poll.h>',
        '#include <sys/epoll.h>';
}
88              
# C preamble (macros, the value-copied event struct, and the per-fd
# generation table) shared by the gen_* snippets of this backend.
# Returns the text verbatim; nothing is interpolated.
sub defines {
    # 0.19+ uses io_uring purely as a readiness notifier via
    # io_uring_prep_poll_multishot. Two subtle bugs would otherwise
    # bite us; both are addressed by the definitions below.
    #
    # BUG 1 - CQE pointer staleness. The previous gen_wait cached
    # `struct io_uring_cqe*` pointers into a static array and called
    # io_uring_cqe_seen() per-event from gen_get_fd. Each cqe_seen
    # advances the user-side consumer cursor by one slot, freeing
    # that slot for the kernel to overwrite with a new CQE. So by
    # the time gen_get_fd dereferences cqes[i+1] for i+1 > 0, the
    # kernel may have rewritten cqes[1..n-1]'s targets. The current
    # design copies the (user_data, res) VALUES out of each CQE inside
    # the for_each_cqe loop, then calls io_uring_cq_advance(&ring,
    # count) once to release all consumed slots together. gen_get_fd
    # then reads only from our private value array, never from the
    # ring buffer.
    #
    # BUG 2 - fd reuse race. Suppose connection A is on fd 7 with a
    # multi-shot poll registration whose CQEs carry user_data=7. The
    # kernel may queue a CQE for fd 7 just before A is closed. The
    # main loop closes fd 7. The next accept() returns fd 7 for a
    # new connection B; the main loop calls gen_add(7) which arms a
    # new multi-shot poll. The queued CQE from A arrives -- it
    # carries user_data=7, looks valid, gen_get_fd returns fd=7, and
    # the main loop calls recv(7) thinking it's data for B. B hasn't
    # sent anything yet, so recv returns EAGAIN and the main loop
    # closes B as if it had disconnected. The fix is a per-fd
    # generation counter, bumped on every gen_add and gen_del, with
    # the current generation packed into the high 32 bits of
    # user_data. gen_get_fd compares the CQE's generation to the
    # current one and discards stale CQEs.
    #
    # CQEs from cancel SQEs (gen_del) carry user_data=0 by design
    # so they can be cheaply distinguished from real poll completions
    # in gen_get_fd.
    my $c_preamble = <<'C';
#define EV_BACKEND_IO_URING 1
#ifndef URING_ENTRIES
#define URING_ENTRIES 256
#endif
#ifndef MAX_EVENTS
#define MAX_EVENTS 1024
#endif

/* MAX_FD is set by Hypersonic core to 65536, but its #define is
* emitted AFTER the backend's defines() block. Guard ours so the
* array declaration below has a valid size; the later core #define
* will be a redefinition to the same value, which gcc accepts. */
#ifndef MAX_FD
#define MAX_FD 65536
#endif

/* Value-copied CQE for the readiness-only design. See BUG 1 above. */
typedef struct {
uint64_t ud;
int32_t res;
} hs_iouring_event_t;

/* Per-fd generation counter. See BUG 2 above. Starts at 0; the very
* first gen_add() bumps to 1, so a user_data of 0 unambiguously means
* "this CQE is from a cancel SQE, not a real poll completion". */
static uint32_t g_iouring_fd_gen[MAX_FD];

/* Pack (generation, fd) into a uintptr_t for io_uring_sqe_set_data.
* Generation is in the high 32 bits so that incrementing it cannot
* collide with any valid fd value (which is at most MAX_FD-1).
* Cast to void* at the call site because that's what
* io_uring_sqe_set_data expects. */
#define HS_IOURING_UD(fd) ( ((uint64_t)g_iouring_fd_gen[(fd)] << 32) | (uint32_t)(fd) )
C
    return $c_preamble;
}
161              
# Name of the C struct (without the `struct` keyword) that this backend
# reports as its event type.
sub event_struct {
    return 'io_uring_cqe';
}
163              
164             # UA::Async slot-tracking helpers below use a private epoll instance
165             # (io_uring's own slot tracking would mean weaving user_data through
166             # the shared submission ring, which is a lot more invasive). So when
167             # UA::Async asks for a buffer to pass to gen_wait_once it must be
168             # struct epoll_event[], NOT io_uring_cqe[]. See the Fedora 43 / perl
169             # 5.38.5 smoker report (5ce1e632) for what happens when the wrong
170             # type is declared - epoll_wait() argument-type mismatch + a missing
171             # `data` member access.
# Struct name for the UA::Async slot helpers, which run on a private
# epoll instance and therefore need epoll_event, not io_uring_cqe.
sub slot_event_struct {
    return 'epoll_event';
}
173              
# Additional compiler flags for this backend: none.
sub extra_cflags {
    return '';
}
# Additional linker flags for this backend: link against liburing.
sub extra_ldflags {
    return '-luring';
}
176              
177             # io_uring is used here as a *readiness* notifier rather than for
178             # completion-based I/O. The main event loop's accept() and recv()
179             # calls do the actual I/O - identical to the epoll/kqueue path.
180             # See the comment on `defines` above for the two subtle bugs
181             # (CQE pointer staleness, fd reuse race) that this design fixes.
# Emit the C that initialises the static ring (once) and arms a
# multi-shot POLLIN poll on the listen socket.
#
#   $builder        code builder; every call is chained on its return value
#   $listen_fd_var  C expression naming the listening fd
#
# Returns the value of the last builder call in the chain.
sub gen_create {
    my ($class, $builder, $listen_fd_var) = @_;

    return $builder->comment('io_uring backend - readiness-only multi-shot poll')
        ->line('static struct io_uring ring;')
        ->line('static int ring_initialized = 0;')
        ->blank
        ->if('!ring_initialized')
            # liburing reports failure as a negative errno in the return
            # value and does not promise to set errno, so keep the return
            # code and format -rc instead of strerror(errno).
            ->line('int _qrc = io_uring_queue_init(URING_ENTRIES, &ring, 0);')
            ->if('_qrc < 0')
                # gen_create is inlined into hypersonic_run_event_loop, which
                # is a void XS function. `return ;` triggers GCC 14+
                # -Wreturn-mismatch (now an error). croak from XS instead -
                # it longjmps out cleanly and surfaces a Perl-level error.
                ->line('croak("io_uring_queue_init() failed: %s", strerror(-_qrc));')
            ->endif
            ->line('ring_initialized = 1;')
        ->endif
        ->blank
        ->comment('Arm MULTI-SHOT poll for listen socket. The kernel')
        ->comment('re-arms automatically after each event (no userspace')
        ->comment('re-arm race). Generation counter is bumped first so')
        ->comment('any stale CQEs from a previous lifetime of this fd are')
        ->comment('discarded as stale by gen_get_fd.')
        ->line('{')
        # Same bounds guard gen_add applies before touching the
        # generation table; an out-of-range listen fd is fatal here
        # because the server cannot run without its accept poll.
        ->line(qq{ if ($listen_fd_var < 0 || $listen_fd_var >= MAX_FD) croak("listen fd %d out of range for io_uring backend", (int)$listen_fd_var);})
        ->line(" g_iouring_fd_gen[$listen_fd_var]++;")
        ->line(' struct io_uring_sqe* _csqe = io_uring_get_sqe(&ring);')
        ->line(' if (!_csqe) croak("io_uring_get_sqe() returned NULL during gen_create");')
        ->line(" io_uring_prep_poll_multishot(_csqe, $listen_fd_var, POLLIN);")
        ->line(" io_uring_sqe_set_data(_csqe, (void*)(uintptr_t)HS_IOURING_UD($listen_fd_var));")
        ->line(' int _src = io_uring_submit(&ring);')
        ->line(' if (_src < 0) croak("io_uring_submit() failed: %s", strerror(-_src));')
        ->line('}')
        ->line('int ev_fd = 0;')    # Dummy - io_uring uses the static ring
        ->blank;
}
216              
217             # Arm a MULTI-SHOT POLLIN poll for $fd_var. Kernel re-arms after each
218             # completion automatically (kernel 5.13+ / liburing 2.1+, validated by
219             # available()). Generation counter is bumped first so the new poll's
220             # CQEs cannot be confused with stale CQEs from an earlier registration
221             # on the same fd value (see BUG 2 in defines() comment).
# Emit the C that arms a multi-shot POLLIN poll for $fd_var.
#
# The emitted block does nothing for an fd outside [0, MAX_FD). For an
# in-range fd it bumps the generation counter first and then tags the
# SQE with HS_IOURING_UD(fd). If no SQE is available the poll is
# silently not armed. $loop_var is accepted but unused.
#
# Returns the value of the last builder call.
sub gen_add {
    my ($class, $builder, $loop_var, $fd_var) = @_;

    my @c_lines = (
        '{',
        " if ($fd_var >= 0 && $fd_var < MAX_FD) {",
        " g_iouring_fd_gen[$fd_var]++;",
        ' struct io_uring_sqe* _asqe = io_uring_get_sqe(&ring);',
        ' if (_asqe) {',
        " io_uring_prep_poll_multishot(_asqe, $fd_var, POLLIN);",
        " io_uring_sqe_set_data(_asqe, (void*)(uintptr_t)HS_IOURING_UD($fd_var));",
        ' io_uring_submit(&ring);',
        ' }',
        ' }',
        '}',
    );

    # Chain each call on the previous call's return value, exactly as a
    # fluent ->line(...)->line(...) expression would.
    my $emitter = $builder;
    $emitter = $emitter->line($_) for @c_lines;
    return $emitter;
}
237              
238             # Cancel any pending poll for $fd_var. MUST be called BEFORE close()
239             # by the main loop.
240             #
241             # THE CRITICAL BIT: io_uring's multishot poll registration holds a
242             # kernel-level `struct file` reference to the fd. The caller's
243             # subsequent close(fd) only removes the fd from the process's fd
244             # table; the underlying socket stays open (and the peer never sees a
245             # TCP FIN) until io_uring drops its file reference. Since
246             # io_uring_prep_cancel is async, that drop happens at an unspecified
247             # later time -- the client can wait indefinitely for the EOF that
248             # tells it "the response is complete".
249             #
250             # Symptom: short-lived `Connection: close` HTTP requests appear to
251             # succeed on the server (response is fully sent) but the client's
252             # blocking recv() loop never returns 0. Tests like t/2100 hang at
253             # the first POST.
254             #
255             # Fix: call shutdown(fd, SHUT_RDWR) here BEFORE submitting the
256             # cancel. shutdown operates on the socket directly and unconditionally
257             # sends FIN to the peer regardless of any reference counts.
258             # io_uring's struct file ref is irrelevant to whether TCP FIN goes
259             # out. The caller's close() afterwards still does the right thing
260             # (marks the fd unused in our process); io_uring will eventually
261             # release its own ref when the cancel completes async.
262             #
263             # Bumping the generation counter (still done) is what closes the
264             # fd-reuse race: from this moment on, any pending CQE that still
265             # carries the old generation is silently discarded by gen_get_fd,
266             # even if accept() reuses the fd number before the cancel completes.
267             #
268             # We use io_uring_prep_cancel (not io_uring_prep_poll_remove) because
269             # poll_remove's signature flipped from void* to __u64 around liburing
270             # 2.0 whereas prep_cancel's void* user_data is stable from 0.7+.
271             #
272             # The cancel SQE carries user_data=0 so its own CQE is cheaply
273             # distinguishable from real poll CQEs (which always have non-zero
274             # user_data thanks to the generation in the high 32 bits).
# Emit the C that retires the poll registration for $fd_var: bump the
# generation, shutdown() the socket, and submit an async cancel whose
# own CQE carries user_data = NULL.
#
# The cancel must name the user_data the poll was armed with. gen_add
# and gen_create tag their SQEs with HS_IOURING_UD(fd), i.e. the
# generation in the high 32 bits, so cancelling on the bare fd can never
# match. The target is therefore read BEFORE the generation bump, while
# HS_IOURING_UD(fd) still yields the armed value. For an fd outside
# [0, MAX_FD) the generation table is not touched and the bare fd is
# used as the target.
#
# $loop_var is accepted but unused. Returns the value of the last
# builder call.
sub gen_del {
    my ($class, $builder, $loop_var, $fd_var) = @_;

    return $builder->line('{')
        ->comment('Cancel target = user_data of the armed poll; capture it')
        ->comment('before the generation bump changes HS_IOURING_UD().')
        ->line(" uint64_t _dud = (uint64_t)(uintptr_t)$fd_var;")
        ->line(" if ($fd_var >= 0 && $fd_var < MAX_FD) {")
        ->line(" _dud = HS_IOURING_UD($fd_var);")
        ->line(" g_iouring_fd_gen[$fd_var]++;")
        ->line(' }')
        ->comment('Force-send TCP FIN regardless of io_uring file refs')
        ->line(" shutdown($fd_var, SHUT_RDWR);")
        ->line(' struct io_uring_sqe* _dsqe = io_uring_get_sqe(&ring);')
        ->line(' if (_dsqe) {')
        ->line(' io_uring_prep_cancel(_dsqe, (void*)(uintptr_t)_dud, 0);')
        ->line(' io_uring_sqe_set_data(_dsqe, NULL);')
        ->line(' io_uring_submit(&ring);')
        ->line(' }')
        ->line('}');
}
292              
293             # Copy CQEs out of the ring buffer into our private value array, then
294             # release all consumed ring slots at once with io_uring_cq_advance.
295             # This avoids BUG 1 (pointer staleness) - we never reference ring
296             # slots after they've been released. We also lose nothing functionally
297             # because the only fields we ever need from a CQE are user_data and
298             # res.
299             #
300             # CRUCIAL: do NOT `continue;` on -ETIME or -EINTR. The main loop's
301             # shutdown-drain branch (which force-closes all connections when
302             # g_shutdown is set) lives AFTER gen_wait but BEFORE the event-
303             # processing loop. If we `continue;` here, we never reach that branch,
304             # and a server with idle keep-alive connections that gets SIGTERMed
305             # will spin in gen_wait forever (no CQEs arriving means perpetual
306             # -ETIME). Instead, set count=0 and fall through so the shutdown
307             # branch runs and the cleanup pass drains the connections. This is
308             # the same shape as epoll_wait()=0 in the Epoll backend.
# Emit the C for one blocking wait on the ring followed by a drain of
# every available CQE into the private events_buf array.
#
#   $events_var   C lvalue that is pointed at events_buf
#   $count_var    C identifier declared here as the int number of events copied
#   $timeout_var  C expression, milliseconds; split into sec / nsec
#   $loop_var     accepted but unused
#
# On -ETIME / -EINTR the emitted code falls through with the count left
# at 0; any other wait error emits `break;`.
#
# NOTE(review): cqe->flags is not inspected, so a multi-shot poll that
# the kernel terminated (IORING_CQE_F_MORE clear) is not re-armed here -
# confirm whether that is handled elsewhere.
#
# Returns the value of the last builder call.
sub gen_wait {
    my ($class, $builder, $loop_var, $events_var, $count_var, $timeout_var) = @_;

    # Every builder call is made on the previous call's return value.
    my $out  = $builder;
    my $emit = sub { $out = $out->line($_) for @_; return };

    $emit->(
        'struct io_uring_cqe* cqe;',
        'struct __kernel_timespec ts;',
        "ts.tv_sec = $timeout_var / 1000;",
        "ts.tv_nsec = ($timeout_var % 1000) * 1000000;",
    );
    $out = $out->blank;

    $emit->(
        "int $count_var = 0;",
        'static hs_iouring_event_t events_buf[MAX_EVENTS];',
        "$events_var = events_buf;",
    );
    $out = $out->blank
               ->comment('Block until at least one completion is ready');
    $emit->('int wait_result = io_uring_wait_cqe_timeout(&ring, &cqe, &ts);');

    # Success branch: copy (user_data, res) by value, then release the
    # copied ring slots with a single cq_advance.
    $out = $out->if('wait_result == 0')
               ->comment('Drain all currently available CQEs (BUG 1 fix: copy values)');
    $emit->(
        'unsigned head;',
        'io_uring_for_each_cqe(&ring, head, cqe) {',
        " if ($count_var < MAX_EVENTS) {",
        " events_buf[$count_var].ud = (uint64_t)(uintptr_t)io_uring_cqe_get_data(cqe);",
        " events_buf[$count_var].res = cqe->res;",
        " $count_var++;",
        ' }',
        '}',
        "io_uring_cq_advance(&ring, (unsigned)$count_var);",
    );

    # Timeout / signal branch is intentionally empty apart from comments.
    $out = $out->elsif('wait_result == -ETIME || wait_result == -EINTR')
               ->comment('Timeout / signal: fall through with count=0 so the')
               ->comment('cleanup-on-shutdown branch can run. Do NOT continue;')
               ->else
               ->line('break;')
               ->endif;

    return $out;
}
341              
342             # Extract fd from our private value-array CQE. NO io_uring_cqe_seen
343             # call here - gen_wait already advanced the ring cursor once for the
344             # whole batch.
345             #
346             # Filters applied (any failure -> continue, skip this event):
347             # * ud == 0 -> CQE is from a cancel SQE
348             # * res < 0 -> poll cancelled/errored (-ECANCELED, -EBADF, ...)
349             # * fd out of range -> defensive guard against corruption
350             # * stale generation -> fd was closed and reused, this CQE is for the
351             # old lifetime (see BUG 2 in defines() comment)
# Emit the C that decodes one entry of the private event array into
# $fd_var, `continue;`-ing past entries that must be ignored.
#
# Skip conditions, in emitted order:
#   _ud == 0            cancel-SQE completion
#   _res < 0            poll cancelled / errored
#   fd out of range     defensive: corrupted user_data
#   generation mismatch stale CQE from a previous fd lifetime
#
# Returns the value of the last builder call.
sub gen_get_fd {
    my ($class, $builder, $events_var, $index_var, $fd_var) = @_;

    # One guard: if (<cond>) { continue; }
    my $skip_when = sub {
        my ($out, $cond) = @_;
        return $out->if($cond)->line('continue;')->endif;
    };

    my $out = $builder
        ->line("uint64_t _ud = ${events_var}[$index_var].ud;")
        ->line("int _res = ${events_var}[$index_var].res;");

    $out = $skip_when->($out, '_ud == 0');
    $out = $skip_when->($out, '_res < 0');

    # High 32 bits: generation; low 32 bits: fd.
    $out = $out->line('uint32_t _ud_gen = (uint32_t)(_ud >> 32);')
               ->line("int $fd_var = (int)(_ud & 0xFFFFFFFFu);");

    # The range check must be emitted before the table lookup it protects.
    $out = $skip_when->($out, "$fd_var < 0 || $fd_var >= MAX_FD");
    return $skip_when->($out, "_ud_gen != g_iouring_fd_gen[$fd_var]");
}
372              
373             # Cleanup io_uring resources
# Emit the C that tears the ring down, guarded by ring_initialized so it
# is a no-op when the ring was never set up (or was already torn down).
# Returns the value of the last builder call.
sub gen_cleanup {
    my ($class, $builder) = @_;

    my $guarded = $builder->if('ring_initialized');
    $guarded = $guarded->line($_)
        for ('io_uring_queue_exit(&ring);', 'ring_initialized = 0;');
    return $guarded->endif;
}
382              
383             # ============================================================
384             # Async Slot Integration Methods (UA Async)
385             #
386             # UA::Async tracks per-slot fd readiness independently of the server
387             # loop's io_uring ring. We use epoll under the hood since io_uring
388             # requires Linux 5.1+ which always has epoll, and a separate epoll
389             # instance is much simpler than weaving slot tracking through the
390             # server's submission ring.
391             # ============================================================
392              
# Emit the C for a single epoll_wait() on the slot-tracking loop.
# The result is assigned to $count_var; a negative result caused by
# EINTR is normalised to 0 events. Returns the value of the last builder
# call.
sub gen_wait_once {
    my ($class, $builder, $loop_var, $events_var, $count_var, $timeout_ms) = @_;

    my $wait_stmt = sprintf '%s = epoll_wait(%s, %s, MAX_EVENTS, %s);',
        $count_var, $loop_var, $events_var, $timeout_ms;
    my $eintr_stmt = sprintf 'if (%s < 0 && errno == EINTR) %s = 0;',
        $count_var, $count_var;

    return $builder->line($wait_stmt)->line($eintr_stmt);
}
399              
# Emit the C that creates the private epoll instance used for slot
# tracking and croaks if epoll_create1() returns a negative fd.
# Returns the value of the last builder call.
sub gen_create_loop {
    my ($class, $builder, $loop_var) = @_;

    my $opened = $builder->line($loop_var . ' = epoll_create1(0);');

    return $opened->if($loop_var . ' < 0')
                  ->line('croak("epoll_create1() failed");')
                  ->endif;
}
408              
# Emit the C that registers $fd_var on the slot-tracking epoll instance
# with $slot_var stored in the event's data.u32.
#
#   $events  'write' selects EPOLLOUT; anything else (including undef)
#            selects EPOLLIN. EPOLLET | EPOLLONESHOT are always added.
#
# A one-shot registration stays in the epoll set (disabled) after it
# fires, so a later EPOLL_CTL_ADD for the same fd fails with EEXIST.
# The emitted code falls back to EPOLL_CTL_MOD in that case so the fd
# is re-armed with the new slot instead of being silently dropped.
#
# Returns the value of the last builder call.
sub gen_add_with_slot {
    my ($class, $builder, $loop_var, $fd_var, $slot_var, $events) = @_;

    # `//` keeps an omitted $events from triggering an "uninitialized"
    # warning; the read and fallback cases share the same mask.
    my $interest = ($events // 'read') eq 'write' ? 'EPOLLOUT' : 'EPOLLIN';
    my $ev_flags = "$interest | EPOLLET | EPOLLONESHOT";

    return $builder->line('{')
        ->line(' struct epoll_event _ev;')
        ->line(" _ev.events = $ev_flags;")
        ->line(" _ev.data.u32 = (uint32_t)$slot_var;")
        ->line(" if (epoll_ctl($loop_var, EPOLL_CTL_ADD, $fd_var, &_ev) < 0 && errno == EEXIST) {")
        ->line(" epoll_ctl($loop_var, EPOLL_CTL_MOD, $fd_var, &_ev);")
        ->line(' }')
        ->line('}');
}
423              
# Emit the C that declares int $slot_var and loads it from the data.u32
# field of entry $index_var in $events_var. Returns the value of the
# last builder call.
sub gen_get_slot {
    my ($class, $builder, $events_var, $index_var, $slot_var) = @_;

    my $declare = 'int ' . $slot_var . ';';
    my $assign  = $slot_var . ' = (int)' . $events_var . '[' . $index_var . '].data.u32;';

    return $builder->line($declare)->line($assign);
}
430              
431             # When async-slot helpers above are emitted, the io_uring includes
432             # already pull in <sys/epoll.h> via the generated Hypersonic includes.
433             # We don't need to add it here.
434              
435             # Future/Pool integration: pool_notify_fd is added via the same
436             # gen_add path as any client fd (just a multi-shot poll armed on the
437             # fd, with user_data = generation << 32 | fd). We rely on
438             # Hypersonic::Event::Role's default gen_add_pool_notify, which
439             # delegates to gen_add. The pre-0.19 override here used a UD_READ|fd
440             # encoding which broke alongside the main accept-handoff bug.
441              
442             1;
443              
444             __END__