| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Hypersonic::Event::IOUring; |
|
2
|
|
|
|
|
|
|
|
|
3
|
21
|
|
|
21
|
|
202189
|
use strict; |
|
|
21
|
|
|
|
|
32
|
|
|
|
21
|
|
|
|
|
755
|
|
|
4
|
21
|
|
|
21
|
|
86
|
use warnings; |
|
|
21
|
|
|
|
|
26
|
|
|
|
21
|
|
|
|
|
2780
|
|
|
5
|
21
|
|
|
21
|
|
330
|
use 5.010; |
|
|
21
|
|
|
|
|
68
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
21
|
|
|
21
|
|
99
|
use parent 'Hypersonic::Event::Role'; |
|
|
21
|
|
|
|
|
38
|
|
|
|
21
|
|
|
|
|
295
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
our $VERSION = '0.19'; |
|
10
|
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
1
|
1379
|
# Identifier string for this event backend.
sub name {
    return 'io_uring';
}
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# Report whether the io_uring backend is usable on this host.
#
# Returns 0 as soon as a cheap precondition fails (non-Linux OS, kernel
# older than 5.13, liburing header not found, io_uring disabled via
# sysctl). Otherwise returns whatever Hypersonic::JIT::Util->can_run()
# reports for a compile + link + RUN probe against liburing.
sub available {
    return 0 unless $^O eq 'linux';

    # Kernel must be >= 5.13 (not merely 5.1): the readiness-only backend
    # in 0.19+ relies on io_uring_prep_poll_multishot(), added in
    # Linux 5.13 / liburing 2.1. Multi-shot poll is essential: with
    # one-shot poll_add the userspace re-arm in gen_get_fd races against
    # the main loop's recv(), so the freshly re-armed (level-triggered)
    # poll fires while unread data is still buffered and then fires AGAIN
    # once recv() has drained it. The next recv() gets EAGAIN, which the
    # main loop treats as a disconnect (this broke the sequential
    # keep-alive tests t/2100..t/2102 and the t/0035 WebSocket echo).
    # Multi-shot lets the kernel re-arm atomically with the readiness
    # check. Older kernels fall back to epoll through
    # Hypersonic::Event::best_backend's priority list.
    my $ver = `uname -r 2>/dev/null` || '';
    my ($major, $minor) = $ver =~ /^(\d+)\.(\d+)/;
    return 0 unless $major && ($major > 5 || ($major == 5 && $minor >= 13));

    # Cheap pre-check for the liburing development header.
    my $has_header = -f '/usr/include/liburing.h'
        || -f '/usr/local/include/liburing.h'
        || -f '/usr/include/x86_64-linux-gnu/liburing.h';
    return 0 unless $has_header;

    # io_uring may be switched off at the kernel level (RHEL9 ships with
    # kernel.io_uring_disabled=2). Any value other than 0 makes us bail
    # before sinking time into the compile+link probe. A missing sysctl
    # file is not an error: the check is simply skipped.
    if (open my $fh, '<', '/proc/sys/kernel/io_uring_disabled') {
        my $disabled = <$fh>;
        close $fh;
        chomp $disabled if defined $disabled;
        return 0 if defined $disabled && $disabled ne '0';
    }

    # Compile-link-and-RUN probe. A pure link check passes on systems
    # where liburing is installed but io_uring_setup() still fails at
    # runtime (kernel disabled, sandboxing, liburing.so missing at exec
    # time). The probe also references io_uring_prep_poll_multishot so
    # that a liburing older than 2.1 fails the probe and we fall back to
    # epoll silently.
    my $probe_body = join ' ',
        'struct io_uring ring; int rc = io_uring_queue_init(8, &ring, 0);',
        'if (rc < 0) return 1;',
        'struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);',
        'if (!sqe) { io_uring_queue_exit(&ring); return 2; }',
        'io_uring_prep_poll_multishot(sqe, 0, POLLIN);',
        'io_uring_sqe_set_data(sqe, (void*)0);',
        'io_uring_queue_exit(&ring); return 0;';

    # The probe needs <liburing.h> for the io_uring_* API and <poll.h>
    # for POLLIN; without the header names the probe cannot compile.
    my $probe_includes = "#include <liburing.h>\n#include <poll.h>";

    require Hypersonic::JIT::Util;
    return Hypersonic::JIT::Util->can_run(
        '',
        '-luring',
        $probe_body,
        $probe_includes,
    );
}
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# C #include lines required by the code this backend generates.
#
#   <liburing.h>   - io_uring API used by the server loop.
#   <poll.h>       - POLLIN, the readiness mask passed to
#                    io_uring_prep_poll_multishot.
#   <sys/epoll.h>  - needed by the UA::Async slot-tracking helpers
#                    (gen_create_loop / gen_add_with_slot / gen_get_slot),
#                    which use a private epoll instance. io_uring implies
#                    Linux 5.1+, which always has epoll.
sub includes {
    return join "\n",
        '#include <liburing.h>',
        '#include <poll.h>',
        '#include <sys/epoll.h>';
}
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# C preamble (macros, the value-copied event struct, the per-fd generation
# table and the user_data packing macro) emitted ahead of the generated
# event loop.
#
# 0.19+ uses io_uring purely as a readiness notifier via
# io_uring_prep_poll_multishot. The declarations below exist to avoid two
# subtle bugs:
#
# BUG 1 - CQE pointer staleness. The previous gen_wait cached
# `struct io_uring_cqe*` pointers in a static array and gen_get_fd called
# io_uring_cqe_seen() per event. Every cqe_seen moves the consumer cursor
# forward by one slot, which frees that slot for the kernel to overwrite,
# so later cached pointers could refer to rewritten entries. The current
# design copies the (user_data, res) VALUES out of each CQE inside the
# for_each_cqe loop and then releases all consumed slots with a single
# io_uring_cq_advance(&ring, count). gen_get_fd only ever reads the
# private value array, never the ring.
#
# BUG 2 - fd reuse race. Connection A lives on fd 7 with a multi-shot poll
# whose CQEs carry user_data=7. The kernel queues a CQE for fd 7 just
# before A is closed; accept() then hands fd 7 to a new connection B and
# gen_add(7) arms a new poll. The queued CQE from A looks valid, the main
# loop calls recv(7), gets EAGAIN because B has sent nothing, and closes B
# as if it had disconnected. The fix is a per-fd generation counter that
# is bumped on every gen_add and gen_del and packed into the high 32 bits
# of user_data; gen_get_fd discards CQEs whose generation is not current.
#
# CQEs produced by cancel SQEs (gen_del) carry user_data=0 by design, so
# gen_get_fd can tell them apart from real poll completions cheaply.
sub defines {
    my $c_prelude = <<'C';
#define EV_BACKEND_IO_URING 1
#ifndef URING_ENTRIES
#define URING_ENTRIES 256
#endif
#ifndef MAX_EVENTS
#define MAX_EVENTS 1024
#endif

/* MAX_FD is set by Hypersonic core to 65536, but its #define is
* emitted AFTER the backend's defines() block. Guard ours so the
* array declaration below has a valid size; the later core #define
* will be a redefinition to the same value, which gcc accepts. */
#ifndef MAX_FD
#define MAX_FD 65536
#endif

/* Value-copied CQE for the readiness-only design. See BUG 1 above. */
typedef struct {
uint64_t ud;
int32_t res;
} hs_iouring_event_t;

/* Per-fd generation counter. See BUG 2 above. Starts at 0; the very
* first gen_add() bumps to 1, so a user_data of 0 unambiguously means
* "this CQE is from a cancel SQE, not a real poll completion". */
static uint32_t g_iouring_fd_gen[MAX_FD];

/* Pack (generation, fd) into a uintptr_t for io_uring_sqe_set_data.
* Generation is in the high 32 bits so that incrementing it cannot
* collide with any valid fd value (which is at most MAX_FD-1).
* Cast to void* at the call site because that's what
* io_uring_sqe_set_data expects. */
#define HS_IOURING_UD(fd) ( ((uint64_t)g_iouring_fd_gen[(fd)] << 32) | (uint32_t)(fd) )
C
    return $c_prelude;
}
|
161
|
|
|
|
|
|
|
|
|
162
|
0
|
|
|
0
|
1
|
|
# Name of the C struct type associated with the server loop's events.
sub event_struct {
    return 'io_uring_cqe';
}
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
# UA::Async slot-tracking helpers below use a private epoll instance |
|
165
|
|
|
|
|
|
|
# (io_uring's own slot tracking would mean weaving user_data through |
|
166
|
|
|
|
|
|
|
# the shared submission ring, which is a lot more invasive). So when |
|
167
|
|
|
|
|
|
|
# UA::Async asks for a buffer to pass to gen_wait_once it must be |
|
168
|
|
|
|
|
|
|
# struct epoll_event[], NOT io_uring_cqe[]. See the Fedora 43 / perl |
|
169
|
|
|
|
|
|
|
# 5.38.5 smoker report (5ce1e632) for what happens when the wrong |
|
170
|
|
|
|
|
|
|
# type is declared - epoll_wait() argument-type mismatch + a missing |
|
171
|
|
|
|
|
|
|
# `data` member access. |
|
172
|
0
|
|
|
0
|
0
|
|
# Name of the C struct type for the buffer handed to gen_wait_once. The
# slot helpers call epoll_wait(), so this must be epoll_event rather than
# io_uring_cqe.
sub slot_event_struct {
    return 'epoll_event';
}
|
173
|
|
|
|
|
|
|
|
|
174
|
0
|
|
|
0
|
1
|
|
# Additional compiler flags for this backend: none.
sub extra_cflags {
    return '';
}
|
175
|
0
|
|
|
0
|
1
|
|
# Additional linker flags for this backend: link against liburing.
sub extra_ldflags {
    return '-luring';
}
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# io_uring is used here as a *readiness* notifier rather than for |
|
178
|
|
|
|
|
|
|
# completion-based I/O. The main event loop's accept() and recv() |
|
179
|
|
|
|
|
|
|
# calls do the actual I/O - identical to the epoll/kqueue path. |
|
180
|
|
|
|
|
|
|
# See the comment on `defines` above for the two subtle bugs |
|
181
|
|
|
|
|
|
|
# (CQE pointer staleness, fd reuse race) that this design fixes. |
|
182
|
|
|
|
|
|
|
# Emit the C that initialises the static io_uring ring (once) and arms a
# multi-shot POLLIN poll on the listen socket.
#
#   $builder        - code builder; each step is invoked on the value
#                     returned by the previous step, exactly like a
#                     method chain.
#   $listen_fd_var  - name of the C variable holding the listening fd.
#
# Returns the value of the final builder call.
sub gen_create {
    my ($class, $builder, $listen_fd_var) = @_;

    my @steps = (
        [ 'comment', 'io_uring backend - readiness-only multi-shot poll' ],
        [ 'line',    'static struct io_uring ring;' ],
        [ 'line',    'static int ring_initialized = 0;' ],
        [ 'blank' ],
        [ 'if', '!ring_initialized' ],
        [ 'if', 'io_uring_queue_init(URING_ENTRIES, &ring, 0) < 0' ],
        # gen_create is inlined into hypersonic_run_event_loop, a void XS
        # function; `return ;` trips GCC 14+ -Wreturn-mismatch (an error
        # there), so failure is reported with croak from XS instead.
        [ 'line', 'croak("io_uring_queue_init() failed: %s", strerror(errno));' ],
        [ 'endif' ],
        [ 'line', 'ring_initialized = 1;' ],
        [ 'endif' ],
        [ 'blank' ],
        [ 'comment', 'Arm MULTI-SHOT poll for listen socket. The kernel' ],
        [ 'comment', 're-arms automatically after each event (no userspace' ],
        [ 'comment', 're-arm race). Generation counter is bumped first so' ],
        [ 'comment', 'any stale CQEs from a previous lifetime of this fd are' ],
        [ 'comment', 'discarded as stale by gen_get_fd.' ],
        [ 'line', '{' ],
        [ 'line', " g_iouring_fd_gen[$listen_fd_var]++;" ],
        [ 'line', ' struct io_uring_sqe* _csqe = io_uring_get_sqe(&ring);' ],
        [ 'line', ' if (!_csqe) croak("io_uring_get_sqe() returned NULL during gen_create");' ],
        [ 'line', " io_uring_prep_poll_multishot(_csqe, $listen_fd_var, POLLIN);" ],
        [ 'line', " io_uring_sqe_set_data(_csqe, (void*)(uintptr_t)HS_IOURING_UD($listen_fd_var));" ],
        [ 'line', ' if (io_uring_submit(&ring) < 0) croak("io_uring_submit() failed: %s", strerror(errno));' ],
        [ 'line', '}' ],
        # Dummy - io_uring uses the static ring.
        [ 'line', 'int ev_fd = 0;' ],
        [ 'blank' ],
    );

    # Replay the steps as a chain: each call is made on the previous
    # call's return value, and the last call's value is returned.
    my $final  = pop @steps;
    my $cursor = $builder;
    for my $step (@steps) {
        my ($method, @args) = @{$step};
        $cursor = $cursor->$method(@args);
    }
    my ($method, @args) = @{$final};
    return $cursor->$method(@args);
}
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# Arm a MULTI-SHOT POLLIN poll for $fd_var. Kernel re-arms after each |
|
218
|
|
|
|
|
|
|
# completion automatically (kernel 5.13+ / liburing 2.1+, validated by |
|
219
|
|
|
|
|
|
|
# available()). Generation counter is bumped first so the new poll's |
|
220
|
|
|
|
|
|
|
# CQEs cannot be confused with stale CQEs from an earlier registration |
|
221
|
|
|
|
|
|
|
# on the same fd value (see BUG 2 in defines() comment). |
|
222
|
|
|
|
|
|
|
# Emit the C that arms a multi-shot POLLIN poll for the fd named by
# $fd_var. The generated code bumps the fd's generation counter first, so
# the new registration's user_data differs from that of any earlier
# registration on the same fd number. Fds outside [0, MAX_FD) are ignored
# by the generated code, and a NULL SQE is skipped silently.
#
# $loop_var is accepted for interface compatibility and is not used.
# Returns the value of the final builder call.
sub gen_add {
    my ($class, $builder, $loop_var, $fd_var) = @_;

    my @c_lines = (
        '{',
        " if ($fd_var >= 0 && $fd_var < MAX_FD) {",
        " g_iouring_fd_gen[$fd_var]++;",
        ' struct io_uring_sqe* _asqe = io_uring_get_sqe(&ring);',
        ' if (_asqe) {',
        " io_uring_prep_poll_multishot(_asqe, $fd_var, POLLIN);",
        " io_uring_sqe_set_data(_asqe, (void*)(uintptr_t)HS_IOURING_UD($fd_var));",
        ' io_uring_submit(&ring);',
        ' }',
        ' }',
        '}',
    );

    # Same call sequence as a ->line(...)->line(...) chain.
    my $closing = pop @c_lines;
    my $cursor  = $builder;
    $cursor = $cursor->line($_) for @c_lines;
    return $cursor->line($closing);
}
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
# Cancel any pending poll for $fd_var. MUST be called BEFORE close() |
|
239
|
|
|
|
|
|
|
# by the main loop. |
|
240
|
|
|
|
|
|
|
# |
|
241
|
|
|
|
|
|
|
# THE CRITICAL BIT: io_uring's multishot poll registration holds a |
|
242
|
|
|
|
|
|
|
# kernel-level `struct file` reference to the fd. The caller's |
|
243
|
|
|
|
|
|
|
# subsequent close(fd) only removes the fd from the process's fd |
|
244
|
|
|
|
|
|
|
# table; the underlying socket stays open (and the peer never sees a |
|
245
|
|
|
|
|
|
|
# TCP FIN) until io_uring drops its file reference. Since |
|
246
|
|
|
|
|
|
|
# io_uring_prep_cancel is async, that drop happens at an unspecified |
|
247
|
|
|
|
|
|
|
# later time -- the client can wait indefinitely for the EOF that |
|
248
|
|
|
|
|
|
|
# tells it "the response is complete". |
|
249
|
|
|
|
|
|
|
# |
|
250
|
|
|
|
|
|
|
# Symptom: short-lived `Connection: close` HTTP requests appear to |
|
251
|
|
|
|
|
|
|
# succeed on the server (response is fully sent) but the client's |
|
252
|
|
|
|
|
|
|
# blocking recv() loop never returns 0. Tests like t/2100 hang at |
|
253
|
|
|
|
|
|
|
# the first POST. |
|
254
|
|
|
|
|
|
|
# |
|
255
|
|
|
|
|
|
|
# Fix: call shutdown(fd, SHUT_RDWR) here BEFORE submitting the |
|
256
|
|
|
|
|
|
|
# cancel. shutdown operates on the socket directly and unconditionally |
|
257
|
|
|
|
|
|
|
# sends FIN to the peer regardless of any reference counts. |
|
258
|
|
|
|
|
|
|
# io_uring's struct file ref is irrelevant to whether TCP FIN goes |
|
259
|
|
|
|
|
|
|
# out. The caller's close() afterwards still does the right thing |
|
260
|
|
|
|
|
|
|
# (marks the fd unused in our process); io_uring will eventually |
|
261
|
|
|
|
|
|
|
# release its own ref when the cancel completes async. |
|
262
|
|
|
|
|
|
|
# |
|
263
|
|
|
|
|
|
|
# Bumping the generation counter (still done) is what closes the |
|
264
|
|
|
|
|
|
|
# fd-reuse race: from this moment on, any pending CQE that still |
|
265
|
|
|
|
|
|
|
# carries the old generation is silently discarded by gen_get_fd, |
|
266
|
|
|
|
|
|
|
# even if accept() reuses the fd number before the cancel completes. |
|
267
|
|
|
|
|
|
|
# |
|
268
|
|
|
|
|
|
|
# We use io_uring_prep_cancel (not io_uring_prep_poll_remove) because |
|
269
|
|
|
|
|
|
|
# poll_remove's signature flipped from void* to __u64 around liburing |
|
270
|
|
|
|
|
|
|
# 2.0 whereas prep_cancel's void* user_data is stable from 0.7+. |
|
271
|
|
|
|
|
|
|
# |
|
272
|
|
|
|
|
|
|
# The cancel SQE carries user_data=0 so its own CQE is cheaply |
|
273
|
|
|
|
|
|
|
# distinguishable from real poll CQEs (which always have non-zero |
|
274
|
|
|
|
|
|
|
# user_data thanks to the generation in the high 32 bits). |
|
275
|
|
|
|
|
|
|
# Emit the C that retires the poll registration for the fd named by
# $fd_var. Intended to run BEFORE the main loop close()s the fd.
#
# Generated steps, in order:
#   1. Snapshot the user_data the live poll was armed with
#      (HS_IOURING_UD(fd), i.e. current generation << 32 | fd).
#   2. Bump the generation so CQEs still queued for the old registration
#      are discarded by gen_get_fd even if the fd number is reused.
#   3. shutdown(fd, SHUT_RDWR) so the peer sees FIN regardless of any
#      file reference io_uring still holds.
#   4. Submit an async cancel that targets the snapshot from step 1.
#
# Step 1 matters: polls are armed with the generation packed into the
# high 32 bits, so a cancel aimed at the bare fd (or at the post-bump
# value) can never match the live registration and would leave the
# multi-shot poll armed. The cancel SQE itself carries user_data=0 so its
# own CQE is recognisable in gen_get_fd. Fds outside [0, MAX_FD) are never
# armed by gen_add, so no cancel is issued for them.
#
# $loop_var is accepted for interface compatibility and is not used.
# Returns the value of the final builder call.
sub gen_del {
    my ($class, $builder, $loop_var, $fd_var) = @_;

    $builder->line('{')
        ->comment('Snapshot the armed user_data BEFORE bumping the generation')
        ->line(' uint64_t _dud = 0;')
        ->line(" if ($fd_var >= 0 && $fd_var < MAX_FD) {")
        ->line(" _dud = HS_IOURING_UD($fd_var);")
        ->line(" g_iouring_fd_gen[$fd_var]++;")
        ->line(' }')
        ->comment('Force-send TCP FIN regardless of io_uring file refs')
        ->line(" shutdown($fd_var, SHUT_RDWR);")
        ->comment('Cancel the poll that was armed with the snapshotted user_data')
        ->line(' if (_dud != 0) {')
        ->line(' struct io_uring_sqe* _dsqe = io_uring_get_sqe(&ring);')
        ->line(' if (_dsqe) {')
        ->line(' io_uring_prep_cancel(_dsqe, (void*)(uintptr_t)_dud, 0);')
        ->line(' io_uring_sqe_set_data(_dsqe, NULL);')
        ->line(' io_uring_submit(&ring);')
        ->line(' }')
        ->line(' }')
        ->line('}');
}
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# Copy CQEs out of the ring buffer into our private value array, then |
|
294
|
|
|
|
|
|
|
# release all consumed ring slots at once with io_uring_cq_advance. |
|
295
|
|
|
|
|
|
|
# This avoids BUG 1 (pointer staleness) - we never reference ring |
|
296
|
|
|
|
|
|
|
# slots after they've been released. We also lose nothing functionally |
|
297
|
|
|
|
|
|
|
# because the only fields we ever need from a CQE are user_data and |
|
298
|
|
|
|
|
|
|
# res. |
|
299
|
|
|
|
|
|
|
# |
|
300
|
|
|
|
|
|
|
# CRUCIAL: do NOT `continue;` on -ETIME or -EINTR. The main loop's |
|
301
|
|
|
|
|
|
|
# shutdown-drain branch (which force-closes all connections when |
|
302
|
|
|
|
|
|
|
# g_shutdown is set) lives AFTER gen_wait but BEFORE the event- |
|
303
|
|
|
|
|
|
|
# processing loop. If we `continue;` here, we never reach that branch, |
|
304
|
|
|
|
|
|
|
# and a server with idle keep-alive connections that gets SIGTERMed |
|
305
|
|
|
|
|
|
|
# will spin in gen_wait forever (no CQEs arriving means perpetual |
|
306
|
|
|
|
|
|
|
# -ETIME). Instead, set count=0 and fall through so the shutdown |
|
307
|
|
|
|
|
|
|
# branch runs and the cleanup pass drains the connections. This is |
|
308
|
|
|
|
|
|
|
# the same shape as epoll_wait()=0 in the Epoll backend. |
|
309
|
|
|
|
|
|
|
# Emit the C that waits (with a millisecond timeout) for completions and
# copies them by value into a private static array.
#
#   $events_var  - C lvalue that is pointed at the private events_buf.
#   $count_var   - name of the C int declared here to hold the event count.
#   $timeout_var - C expression giving the timeout in milliseconds.
#
# On success every available CQE (up to MAX_EVENTS) has its user_data and
# res copied out, and the consumed ring slots are released with a single
# io_uring_cq_advance. On -ETIME / -EINTR the generated code deliberately
# falls through with the count left at 0 (no `continue;`) so that code
# following gen_wait in the main loop still runs. Any other error emits
# `break;`.
#
# $loop_var is accepted for interface compatibility and is not used.
# Returns the value of the final builder call.
sub gen_wait {
    my ($class, $builder, $loop_var, $events_var, $count_var, $timeout_var) = @_;

    my @steps = (
        [ 'line', 'struct io_uring_cqe* cqe;' ],
        [ 'line', 'struct __kernel_timespec ts;' ],
        [ 'line', "ts.tv_sec = $timeout_var / 1000;" ],
        [ 'line', "ts.tv_nsec = ($timeout_var % 1000) * 1000000;" ],
        [ 'blank' ],
        [ 'line', "int $count_var = 0;" ],
        [ 'line', 'static hs_iouring_event_t events_buf[MAX_EVENTS];' ],
        [ 'line', "$events_var = events_buf;" ],
        [ 'blank' ],
        [ 'comment', 'Block until at least one completion is ready' ],
        [ 'line', 'int wait_result = io_uring_wait_cqe_timeout(&ring, &cqe, &ts);' ],
        [ 'if', 'wait_result == 0' ],
        [ 'comment', 'Drain all currently available CQEs (BUG 1 fix: copy values)' ],
        [ 'line', 'unsigned head;' ],
        [ 'line', 'io_uring_for_each_cqe(&ring, head, cqe) {' ],
        [ 'line', " if ($count_var < MAX_EVENTS) {" ],
        [ 'line', " events_buf[$count_var].ud = (uint64_t)(uintptr_t)io_uring_cqe_get_data(cqe);" ],
        [ 'line', " events_buf[$count_var].res = cqe->res;" ],
        [ 'line', " $count_var++;" ],
        [ 'line', ' }' ],
        [ 'line', '}' ],
        [ 'line', "io_uring_cq_advance(&ring, (unsigned)$count_var);" ],
        [ 'elsif', 'wait_result == -ETIME || wait_result == -EINTR' ],
        [ 'comment', 'Timeout / signal: fall through with count=0 so the' ],
        [ 'comment', 'cleanup-on-shutdown branch can run. Do NOT continue;' ],
        [ 'else' ],
        [ 'line', 'break;' ],
        [ 'endif' ],
    );

    # Replay the steps as a chain: each call is made on the previous
    # call's return value, and the last call's value is returned.
    my $final  = pop @steps;
    my $cursor = $builder;
    for my $step (@steps) {
        my ($method, @args) = @{$step};
        $cursor = $cursor->$method(@args);
    }
    my ($method, @args) = @{$final};
    return $cursor->$method(@args);
}
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
# Extract fd from our private value-array CQE. NO io_uring_cqe_seen |
|
343
|
|
|
|
|
|
|
# call here - gen_wait already advanced the ring cursor once for the |
|
344
|
|
|
|
|
|
|
# whole batch. |
|
345
|
|
|
|
|
|
|
# |
|
346
|
|
|
|
|
|
|
# Filters applied (any failure -> continue, skip this event): |
|
347
|
|
|
|
|
|
|
# * ud == 0 -> CQE is from a cancel SQE |
|
348
|
|
|
|
|
|
|
# * res < 0 -> poll cancelled/errored (-ECANCELED, -EBADF, ...) |
|
349
|
|
|
|
|
|
|
# * fd out of range -> defensive guard against corruption |
|
350
|
|
|
|
|
|
|
# * stale generation -> fd was closed and reused, this CQE is for the |
|
351
|
|
|
|
|
|
|
# old lifetime (see BUG 2 in defines() comment) |
|
352
|
|
|
|
|
|
|
# Emit the C that decodes one entry of the private value array into an fd,
# skipping (via `continue;`) entries that must not be processed.
#
#   $events_var - C expression for the value array filled by gen_wait.
#   $index_var  - C expression for the index of the entry to decode.
#   $fd_var     - name of the C int declared here to receive the fd.
#
# Filters, in the order they are emitted:
#   * _ud == 0           - completion of a cancel SQE
#   * _res < 0           - poll was cancelled or errored
#   * fd out of range    - defensive guard against corrupted user_data
#   * generation differs - CQE belongs to an earlier registration of
#                          this fd number
#
# No io_uring_cqe_seen is emitted here. Returns the value of the final
# builder call.
sub gen_get_fd {
    my ($class, $builder, $events_var, $index_var, $fd_var) = @_;

    my $entry = "${events_var}[$index_var]";
    my @skip  = ( [ 'line', 'continue;' ], [ 'endif' ] );

    my @steps = (
        [ 'line', "uint64_t _ud = $entry.ud;" ],
        [ 'line', "int _res = $entry.res;" ],
        [ 'if', '_ud == 0' ], @skip,                                # cancel-SQE completion
        [ 'if', '_res < 0' ], @skip,                                # poll cancelled / errored
        [ 'line', 'uint32_t _ud_gen = (uint32_t)(_ud >> 32);' ],
        [ 'line', "int $fd_var = (int)(_ud & 0xFFFFFFFFu);" ],
        [ 'if', "$fd_var < 0 || $fd_var >= MAX_FD" ], @skip,        # corrupted user_data
        [ 'if', "_ud_gen != g_iouring_fd_gen[$fd_var]" ], @skip,    # stale generation
    );

    # Replay the steps as a chain: each call is made on the previous
    # call's return value, and the last call's value is returned.
    my $final  = pop @steps;
    my $cursor = $builder;
    for my $step (@steps) {
        my ($method, @args) = @{$step};
        $cursor = $cursor->$method(@args);
    }
    my ($method, @args) = @{$final};
    return $cursor->$method(@args);
}
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# Cleanup io_uring resources |
|
374
|
|
|
|
|
|
|
# Emit the C that tears the ring down: when ring_initialized is set, call
# io_uring_queue_exit and clear the flag. Returns the value of the final
# builder call.
sub gen_cleanup {
    my ($class, $builder) = @_;

    my $guarded = $builder->if('ring_initialized');
    my $body    = $guarded->line('io_uring_queue_exit(&ring);')
                          ->line('ring_initialized = 0;');
    return $body->endif;
}
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
# ============================================================ |
|
384
|
|
|
|
|
|
|
# Async Slot Integration Methods (UA Async) |
|
385
|
|
|
|
|
|
|
# |
|
386
|
|
|
|
|
|
|
# UA::Async tracks per-slot fd readiness independently of the server |
|
387
|
|
|
|
|
|
|
# loop's io_uring ring. We use epoll under the hood since io_uring |
|
388
|
|
|
|
|
|
|
# requires Linux 5.1+ which always has epoll, and a separate epoll |
|
389
|
|
|
|
|
|
|
# instance is much simpler than weaving slot tracking through the |
|
390
|
|
|
|
|
|
|
# server's submission ring. |
|
391
|
|
|
|
|
|
|
# ============================================================ |
|
392
|
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
# Emit a single epoll_wait() on the slot-tracking epoll fd, storing the
# result in $count_var. An EINTR failure is normalised to a count of 0 by
# the generated code. Returns the value of the final builder call.
sub gen_wait_once {
    my ($class, $builder, $loop_var, $events_var, $count_var, $timeout_ms) = @_;

    my $wait_stmt  = "$count_var = epoll_wait($loop_var, $events_var, MAX_EVENTS, $timeout_ms);";
    my $eintr_stmt = "if ($count_var < 0 && errno == EINTR) $count_var = 0;";

    return $builder->line($wait_stmt)->line($eintr_stmt);
}
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# Emit the C that creates the private epoll instance used for slot
# tracking, assigning its fd to $loop_var and croaking when
# epoll_create1() fails. Returns the value of the final builder call.
sub gen_create_loop {
    my ($class, $builder, $loop_var) = @_;

    my $create_stmt = "$loop_var = epoll_create1(0);";
    my $failed_cond = "$loop_var < 0";

    my $guard = $builder->line($create_stmt)->if($failed_cond);
    return $guard->line('croak("epoll_create1() failed");')->endif;
}
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
# Emit the C that registers $fd_var with the slot-tracking epoll instance,
# storing the slot number in the event's data.u32.
#
#   $events - 'write' selects EPOLLOUT; 'read' and any other value select
#             EPOLLIN. Both variants are edge-triggered and one-shot.
#
# The result of epoll_ctl() is not checked by the generated code.
# Returns the value of the final builder call.
sub gen_add_with_slot {
    my ($class, $builder, $loop_var, $fd_var, $slot_var, $events) = @_;

    # Default covers both 'read' and unrecognised values.
    my $ev_flags = 'EPOLLIN | EPOLLET | EPOLLONESHOT';
    unless ($events eq 'read') {
        $ev_flags = 'EPOLLOUT | EPOLLET | EPOLLONESHOT' if $events eq 'write';
    }

    my @c_lines = (
        '{',
        ' struct epoll_event _ev;',
        " _ev.events = $ev_flags;",
        " _ev.data.u32 = (uint32_t)$slot_var;",
        " epoll_ctl($loop_var, EPOLL_CTL_ADD, $fd_var, &_ev);",
        '}',
    );

    # Same call sequence as a ->line(...)->line(...) chain.
    my $closing = pop @c_lines;
    my $cursor  = $builder;
    $cursor = $cursor->line($_) for @c_lines;
    return $cursor->line($closing);
}
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
# Emit the C that declares the int named by $slot_var and loads it from
# data.u32 of entry $index_var in the epoll_event array $events_var.
# Returns the value of the final builder call.
sub gen_get_slot {
    my ($class, $builder, $events_var, $index_var, $slot_var) = @_;

    my $declare_stmt = "int $slot_var;";
    my $assign_stmt  = "$slot_var = (int)${events_var}[$index_var].data.u32;";

    return $builder->line($declare_stmt)->line($assign_stmt);
}
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
# When async-slot helpers above are emitted, the io_uring includes |
|
432
|
|
|
|
|
|
|
# already pull in <sys/epoll.h> via the generated Hypersonic includes. |
|
433
|
|
|
|
|
|
|
# We don't need to add it here. |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# Future/Pool integration: pool_notify_fd is added via the same |
|
436
|
|
|
|
|
|
|
# gen_add path as any client fd (just an arm-poll-add on the fd, with |
|
437
|
|
|
|
|
|
|
# user_data = generation and fd packed together). We rely on Hypersonic::Event::Role's default |
|
438
|
|
|
|
|
|
|
# gen_add_pool_notify which delegates to gen_add. The pre-0.19 |
|
439
|
|
|
|
|
|
|
# override here used a UD_READ|fd encoding which broke alongside the |
|
440
|
|
|
|
|
|
|
# main accept-handoff bug. |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
1; |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
__END__ |