File Coverage

libev/ev_iouring.c

Criterion	Covered	Total	%
statement	0	191	0.0
branch	0	70	0.0
condition			n/a
subroutine			n/a
pod			n/a
total	0	261	0.0

line	stmt	bran	code
1			/*
2			* libev linux io_uring fd activity backend
3			*
4			* Copyright (c) 2019-2020 Marc Alexander Lehmann
5			* All rights reserved.
6			*
7			* Redistribution and use in source and binary forms, with or without modifica-
8			* tion, are permitted provided that the following conditions are met:
9			*
10			* 1. Redistributions of source code must retain the above copyright notice,
11			* this list of conditions and the following disclaimer.
12			*
13			* 2. Redistributions in binary form must reproduce the above copyright
14			* notice, this list of conditions and the following disclaimer in the
15			* documentation and/or other materials provided with the distribution.
16			*
17			* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
18			* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
19			* CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
20			* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
21			* CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22			* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
23			* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
24			* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
25			* ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
26			* OF THE POSSIBILITY OF SUCH DAMAGE.
27			*
28			* Alternatively, the contents of this file may be used under the terms of
29			* the GNU General Public License ("GPL") version 2 or any later version,
30			* in which case the provisions of the GPL are applicable instead of
31			* the above. If you wish to allow the use of your version of this file
32			* only under the terms of the GPL and not to allow others to use your
33			* version of this file under the BSD license, indicate your decision
34			* by deleting the provisions above and replace them with the notice
35			* and other provisions required by the GPL. If you do not delete the
36			* provisions above, a recipient may use your version of this file under
37			* either the BSD or the GPL.
38			*/
39
40			/*
41			* general notes about linux io_uring:
42			*
43			* a) it's the best interface I have seen so far. on linux.
44			* b) best is not necessarily very good.
45			* c) it's better than the aio mess, doesn't suffer from the fork problems
46			* of linux aio or epoll and so on and so on. and you could do event stuff
47			* without any syscalls. what's not to like?
48			* d) ok, it's vastly more complex, but that's ok, really.
49			* e) why two mmaps instead of one? one would be more space-efficient,
50			* and I can't see what benefit two would have (other than being
51			* somehow resizable/relocatable, but that's apparently not possible).
52			* f) hmm, it's practically undebuggable (gdb can't access the memory, and
53			* the bizarre way structure offsets are communicated makes it hard to
54			* just print the ring buffer heads, even if the memory were visible
55			* in gdb). but then, that's also ok, really.
56			* g) well, you cannot specify a timeout when waiting for events. no,
57			* seriously, the interface doesn't support a timeout. never seen _that_
58			* before. sure, you can use a timerfd, but that's another syscall
59			* you could have avoided. overall, this bizarre omission smells
60			* like a µ-optimisation by the io_uring author for his personal
61			* applications, to the detriment of everybody else who just wants
62			* an event loop. but, umm, ok, if that's all, it could be worse.
63			* (from what I gather from the author Jens Axboe, it simply didn't
64			* occur to him, and he made good on it by adding an unlimited number
65			* of timeouts later :).
66			* h) initially there was a hardcoded limit of 4096 outstanding events.
67			* later versions not only bump this to 32k, but also can handle
68			* an unlimited amount of events, so this only affects the batch size.
69			* i) unlike linux aio, you can register more than the limit
70			* of fd events. while early versions of io_uring signalled an overflow
71			* and you ended up getting wet. 5.5+ does not do this anymore.
72			* j) but, oh my! it had exactly the same bugs as the linux aio backend,
73			* where some undocumented poll combinations just fail. fortunately,
74			* after finally reaching the author, he was more than willing to fix
75			* this probably in 5.6+.
76			* k) overall, the API itself is, I dare to say, not a total trainwreck.
77			* once the bugs are fixed (probably in 5.6+), it will be without
78			* competition.
79			*/
80
81			/* TODO: use internal TIMEOUT */
82			/* TODO: take advantage of single mmap, NODROP etc. */
83			/* TODO: resize cq/sq size independently */
84
85			#include <sys/timerfd.h>
86			#include <sys/mman.h>
87			#include <poll.h>
88			#include <stdint.h>
89
90			#define IOURING_INIT_ENTRIES 32
91
92			/*****************************************************************************/
93			/* syscall wrapdadoop - this section has the raw api/abi definitions */
94
95			#include <linux/fs.h>
96			#include <linux/types.h>
97
98			/* mostly directly taken from the kernel or documentation */
99
100			struct io_uring_sqe
101			{
102			__u8 opcode;
103			__u8 flags;
104			__u16 ioprio;
105			__s32 fd;
106			union {
107			__u64 off;
108			__u64 addr2;
109			};
110			__u64 addr;
111			__u32 len;
112			union {
113			__kernel_rwf_t rw_flags;
114			__u32 fsync_flags;
115			__u16 poll_events;
116			__u32 sync_range_flags;
117			__u32 msg_flags;
118			__u32 timeout_flags;
119			__u32 accept_flags;
120			__u32 cancel_flags;
121			__u32 open_flags;
122			__u32 statx_flags;
123			__u32 fadvise_advice;
124			};
125			__u64 user_data;
126			union {
127			__u16 buf_index;
128			__u16 personality;
129			__u64 __pad2[3];
130			};
131			};
132
133			struct io_uring_cqe
134			{
135			__u64 user_data;
136			__s32 res;
137			__u32 flags;
138			};
139
140			struct io_sqring_offsets
141			{
142			__u32 head;
143			__u32 tail;
144			__u32 ring_mask;
145			__u32 ring_entries;
146			__u32 flags;
147			__u32 dropped;
148			__u32 array;
149			__u32 resv1;
150			__u64 resv2;
151			};
152
153			struct io_cqring_offsets
154			{
155			__u32 head;
156			__u32 tail;
157			__u32 ring_mask;
158			__u32 ring_entries;
159			__u32 overflow;
160			__u32 cqes;
161			__u64 resv[2];
162			};
163
164			struct io_uring_params
165			{
166			__u32 sq_entries;
167			__u32 cq_entries;
168			__u32 flags;
169			__u32 sq_thread_cpu;
170			__u32 sq_thread_idle;
171			__u32 features;
172			__u32 resv[4];
173			struct io_sqring_offsets sq_off;
174			struct io_cqring_offsets cq_off;
175			};
176
177			#define IORING_FEAT_SINGLE_MMAP 0x00000001
178			#define IORING_FEAT_NODROP 0x00000002
179			#define IORING_FEAT_SUBMIT_STABLE 0x00000004
180
181			#define IORING_SETUP_CQSIZE 0x00000008
182			#define IORING_SETUP_CLAMP 0x00000010
183
184			#define IORING_OP_POLL_ADD 6
185			#define IORING_OP_POLL_REMOVE 7
186			#define IORING_OP_TIMEOUT 11
187			#define IORING_OP_TIMEOUT_REMOVE 12
188
189			#define IORING_REGISTER_EVENTFD 4
190			#define IORING_REGISTER_EVENTFD_ASYNC 7
191			#define IORING_REGISTER_PROBE 8
192
193			#define IO_URING_OP_SUPPORTED 1
194
195			struct io_uring_probe_op {
196			__u8 op;
197			__u8 resv;
198			__u16 flags;
199			__u32 resv2;
200			};
201
202			struct io_uring_probe
203			{
204			__u8 last_op;
205			__u8 ops_len;
206			__u16 resv;
207			__u32 resv2[3];
208			struct io_uring_probe_op ops[0];
209			};
210
211			/* relative or absolute, reference clock is CLOCK_MONOTONIC */
212			struct iouring_kernel_timespec
213			{
214			int64_t tv_sec;
215			long long tv_nsec;
216			};
217
218			#define IORING_TIMEOUT_ABS 0x00000001
219
220			#define IORING_ENTER_GETEVENTS 0x01
221
222			#define IORING_OFF_SQ_RING 0x00000000ULL
223			#define IORING_OFF_SQES 0x10000000ULL
224
225			#define IORING_FEAT_SINGLE_MMAP 0x00000001
226			#define IORING_FEAT_NODROP 0x00000002
227			#define IORING_FEAT_SUBMIT_STABLE 0x00000004
228
229			inline_size
230			int
231	0		evsys_io_uring_setup (unsigned entries, struct io_uring_params *params)
232			{
233	0		return ev_syscall2 (SYS_io_uring_setup, entries, params);
234			}
235
236			inline_size
237			int
238	0		evsys_io_uring_enter (int fd, unsigned to_submit, unsigned min_complete, unsigned flags, const sigset_t *sig, size_t sigsz)
239			{
240	0		return ev_syscall6 (SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigsz);
241			}
242
243			inline_size
244			int
245			evsys_io_uring_register (unsigned int fd, unsigned int opcode, void *arg, unsigned int nr_args)
246			{
247			return ev_syscall4 (SYS_io_uring_register, fd, opcode, arg, nr_args);
248			}
249
250			/*****************************************************************************/
251			/* actual backend implementation */
252
253			/* we hope that volatile will make the compiler access these variables only once */
254			#define EV_SQ_VAR(name) *(volatile unsigned *)((char *)iouring_ring + iouring_sq_ ## name)
255			#define EV_CQ_VAR(name) *(volatile unsigned *)((char *)iouring_ring + iouring_cq_ ## name)
256
257			/* the index array */
258			#define EV_SQ_ARRAY ((unsigned *)((char *)iouring_ring + iouring_sq_array))
259
260			/* the submit/completion queue entries */
261			#define EV_SQES ((struct io_uring_sqe *) iouring_sqes)
262			#define EV_CQES ((struct io_uring_cqe *)((char *)iouring_ring + iouring_cq_cqes))
263
264			inline_speed
265			int
266	0		iouring_enter (EV_P_ ev_tstamp timeout)
267			{
268			int res;
269
270	0	0	EV_RELEASE_CB;
271
272	0		res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1,
273			timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0);
274
275			assert (("libev: io_uring_enter did not consume all sqes", (res < 0 \|\| res == iouring_to_submit)));
276
277	0		iouring_to_submit = 0;
278
279	0	0	EV_ACQUIRE_CB;
280
281	0		return res;
282			}
283
284			/* TODO: can we move things around so we don't need this forward-reference? */
285			static void
286			iouring_poll (EV_P_ ev_tstamp timeout);
287
288			static
289			struct io_uring_sqe *
290	0		iouring_sqe_get (EV_P)
291			{
292			unsigned tail;
293
294			for (;;)
295	0		{
296	0		tail = EV_SQ_VAR (tail);
297
298	0	0	if (ecb_expect_true (tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)))
299	0		break; /* whats the problem, we have free sqes */
300
301			/* queue full, need to flush and possibly handle some events */
302
303			#if EV_FEATURE_CODE
304			/* first we ask the kernel nicely, most often this frees up some sqes */
305	0		int res = iouring_enter (EV_A_ EV_TS_CONST (0.));
306
307	0		ECB_MEMORY_FENCE_ACQUIRE; /* better safe than sorry */
308
309	0	0	if (res >= 0)
310	0		continue; /* yes, it worked, try again */
311			#endif
312
313			/* some problem, possibly EBUSY - do the full poll and let it handle any issues */
314
315	0		iouring_poll (EV_A_ EV_TS_CONST (0.));
316			/* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE for us */
317			}
318
319			/*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/
320
321	0		return EV_SQES + (tail & EV_SQ_VAR (ring_mask));
322			}
323
324			inline_size
325			void
326	0		iouring_sqe_submit (EV_P_ struct io_uring_sqe *sqe)
327			{
328	0		unsigned idx = sqe - EV_SQES;
329
330	0		EV_SQ_ARRAY [idx] = idx;
331	0		ECB_MEMORY_FENCE_RELEASE;
332	0		++EV_SQ_VAR (tail);
333			/*ECB_MEMORY_FENCE_RELEASE; /* for the time being we assume this is not needed */
334	0		++iouring_to_submit;
335	0		}
336
337			/*****************************************************************************/
338
339			/* when the timerfd expires we simply note the fact,
340			* as the purpose of the timerfd is to wake us up, nothing else.
341			* the next iteration should re-set it.
342			*/
343			static void
344	0		iouring_tfd_cb (EV_P_ struct ev_io *w, int revents)
345			{
346	0		iouring_tfd_to = EV_TSTAMP_HUGE;
347	0		}
348
349			/* called for full and partial cleanup */
350			ecb_cold
351			static void
352	0		iouring_internal_destroy (EV_P)
353			{
354	0		close (iouring_tfd);
355	0		close (iouring_fd);
356
357	0	0	if (iouring_ring != MAP_FAILED) munmap (iouring_ring, iouring_ring_size);
358	0	0	if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes, iouring_sqes_size);
359
360	0	0	if (ev_is_active (&iouring_tfd_w))
361			{
362	0		ev_ref (EV_A);
363	0		ev_io_stop (EV_A_ &iouring_tfd_w);
364			}
365	0		}
366
367			ecb_cold
368			static int
369	0		iouring_internal_init (EV_P)
370			{
371	0		struct io_uring_params params = { 0 };
372			uint32_t sq_size, cq_size;
373
374	0		params.flags = IORING_SETUP_CLAMP;
375
376	0		iouring_to_submit = 0;
377
378	0		iouring_tfd = -1;
379	0		iouring_ring = MAP_FAILED;
380	0		iouring_sqes = MAP_FAILED;
381
382	0	0	if (!have_monotonic) /* cannot really happen, but what if11 */
383	0		return -1;
384
385	0		iouring_fd = evsys_io_uring_setup (iouring_entries, &params);
386
387	0	0	if (iouring_fd < 0)
388	0		return -1;
389
390	0	0	if ((~params.features) & (IORING_FEAT_NODROP \| IORING_FEAT_SINGLE_MMAP \| IORING_FEAT_SUBMIT_STABLE))
391	0		return -1; /* we require the above features */
392
393			/* TODO: remember somehow whether our queue size has been clamped */
394
395	0		sq_size = params.sq_off.array + params.sq_entries * sizeof (unsigned);
396	0		cq_size = params.cq_off.cqes + params.cq_entries * sizeof (struct io_uring_cqe);
397
398	0		iouring_ring_size = sq_size > cq_size ? sq_size : cq_size;
399	0		iouring_sqes_size = params.sq_entries * sizeof (struct io_uring_sqe);
400
401	0		iouring_ring = mmap (0, iouring_ring_size, PROT_READ \| PROT_WRITE,
402			MAP_SHARED \| MAP_POPULATE, iouring_fd, IORING_OFF_SQ_RING);
403	0		iouring_sqes = mmap (0, iouring_sqes_size, PROT_READ \| PROT_WRITE,
404			MAP_SHARED \| MAP_POPULATE, iouring_fd, IORING_OFF_SQES);
405
406	0	0	if (iouring_ring == MAP_FAILED \|\| iouring_sqes == MAP_FAILED)
		0
407	0		return -1;
408
409	0		iouring_sq_head = params.sq_off.head;
410	0		iouring_sq_tail = params.sq_off.tail;
411	0		iouring_sq_ring_mask = params.sq_off.ring_mask;
412	0		iouring_sq_ring_entries = params.sq_off.ring_entries;
413	0		iouring_sq_flags = params.sq_off.flags;
414	0		iouring_sq_dropped = params.sq_off.dropped;
415	0		iouring_sq_array = params.sq_off.array;
416
417	0		iouring_cq_head = params.cq_off.head;
418	0		iouring_cq_tail = params.cq_off.tail;
419	0		iouring_cq_ring_mask = params.cq_off.ring_mask;
420	0		iouring_cq_ring_entries = params.cq_off.ring_entries;
421	0		iouring_cq_overflow = params.cq_off.overflow;
422	0		iouring_cq_cqes = params.cq_off.cqes;
423
424	0		iouring_tfd_to = EV_TSTAMP_HUGE;
425
426	0		iouring_tfd = timerfd_create (CLOCK_MONOTONIC, TFD_CLOEXEC);
427
428	0	0	if (iouring_tfd < 0)
429	0		return -1;
430
431	0		return 0;
432			}
433
434			ecb_cold
435			static void
436	0		iouring_fork (EV_P)
437			{
438	0		iouring_internal_destroy (EV_A);
439
440	0	0	while (iouring_internal_init (EV_A) < 0)
441	0		ev_syserr ("(libev) io_uring_setup");
442
443	0		fd_rearm_all (EV_A);
444
445	0		ev_io_stop (EV_A_ &iouring_tfd_w);
446	0		ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ);
447	0		ev_io_start (EV_A_ &iouring_tfd_w);
448	0		}
449
450			/*****************************************************************************/
451
452			static void
453	0		iouring_modify (EV_P_ int fd, int oev, int nev)
454			{
455	0	0	if (oev)
456			{
457			/* we assume the sqe's are all "properly" initialised */
458	0		struct io_uring_sqe *sqe = iouring_sqe_get (EV_A);
459	0		sqe->opcode = IORING_OP_POLL_REMOVE;
460	0		sqe->fd = fd;
461			/* Jens Axboe notified me that user_data is not what is documented, but is
462			* some kind of unique ID that has to match, otherwise the request cannot
463			* be removed. Since we don't really have that, we pass in the old
464			* generation counter - if that fails, too bad, it will hopefully be removed
465			* at close time and then be ignored. */
466	0		sqe->addr = (uint32_t)fd \| ((__u64)(uint32_t)anfds [fd].egen << 32);
467	0		sqe->user_data = (uint64_t)-1;
468	0		iouring_sqe_submit (EV_A_ sqe);
469
470			/* increment generation counter to avoid handling old events */
471	0		++anfds [fd].egen;
472			}
473
474	0	0	if (nev)
475			{
476	0		struct io_uring_sqe *sqe = iouring_sqe_get (EV_A);
477	0		sqe->opcode = IORING_OP_POLL_ADD;
478	0		sqe->fd = fd;
479	0		sqe->addr = 0;
480	0		sqe->user_data = (uint32_t)fd \| ((__u64)(uint32_t)anfds [fd].egen << 32);
481	0		sqe->poll_events =
482	0		(nev & EV_READ ? POLLIN : 0)
483	0		\| (nev & EV_WRITE ? POLLOUT : 0);
484	0		iouring_sqe_submit (EV_A_ sqe);
485			}
486	0		}
487
488			inline_size
489			void
490	0		iouring_tfd_update (EV_P_ ev_tstamp timeout)
491			{
492	0		ev_tstamp tfd_to = mn_now + timeout;
493
494			/* we assume there will be many iterations per timer change, so
495			* we only re-set the timerfd when we have to because its expiry
496			* is too late.
497			*/
498	0	0	if (ecb_expect_false (tfd_to < iouring_tfd_to))
499			{
500			struct itimerspec its;
501
502	0		iouring_tfd_to = tfd_to;
503	0		EV_TS_SET (its.it_interval, 0.);
504	0		EV_TS_SET (its.it_value, tfd_to);
505
506	0		if (timerfd_settime (iouring_tfd, TFD_TIMER_ABSTIME, &its, 0) < 0)
507			assert (("libev: iouring timerfd_settime failed", 0));
508			}
509	0		}
510
511			inline_size
512			void
513	0		iouring_process_cqe (EV_P_ struct io_uring_cqe *cqe)
514			{
515	0		int fd = cqe->user_data & 0xffffffffU;
516	0		uint32_t gen = cqe->user_data >> 32;
517	0		int res = cqe->res;
518
519			/* user_data -1 is a remove that we are not atm. interested in */
520	0	0	if (cqe->user_data == (uint64_t)-1)
521	0		return;
522
523			assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax));
524
525			/* documentation lies, of course. the result value is NOT like
526			* normal syscalls, but like linux raw syscalls, i.e. negative
527			* error numbers. fortunate, as otherwise there would be no way
528			* to get error codes at all. still, why not document this?
529			*/
530
531			/* ignore event if generation doesn't match */
532			/* other than skipping removal events, */
533			/* this should actually be very rare */
534	0	0	if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen))
535	0		return;
536
537	0	0	if (ecb_expect_false (res < 0))
538			{
539			/*TODO: EINVAL handling (was something failed with this fd)*/
540
541	0	0	if (res == -EBADF)
542			{
543			assert (("libev: event loop rejected bad fd", res != -EBADF));
544	0		fd_kill (EV_A_ fd);
545			}
546			else
547			{
548	0		errno = -res;
549	0		ev_syserr ("(libev) IORING_OP_POLL_ADD");
550			}
551
552	0		return;
553			}
554
555			/* feed events, we do not expect or handle POLLNVAL */
556	0		fd_event (
557			EV_A_
558			fd,
559	0	0	(res & (POLLOUT \| POLLERR \| POLLHUP) ? EV_WRITE : 0)
560	0		\| (res & (POLLIN \| POLLERR \| POLLHUP) ? EV_READ : 0)
561			);
562
563			/* io_uring is oneshot, so we need to re-arm the fd next iteration */
564			/* this also means we usually have to do at least one syscall per iteration */
565	0		anfds [fd].events = 0;
566	0		fd_change (EV_A_ fd, EV_ANFD_REIFY);
567			}
568
569			/* called when the event queue overflows */
570			ecb_cold
571			static void
572	0		iouring_overflow (EV_P)
573			{
574			/* we have two options, resize the queue (by tearing down
575			* everything and recreating it), or living with it
576			* and polling.
577			* we implement this by resizing the queue, and, if that fails,
578			* we just recreate the state on every failure, which
579			* kind of is a very inefficient poll.
580			* one danger is, due to the bias toward lower fds,
581			* we will only really get events for those, so
582			* maybe we need a poll() fallback, after all.
583			*/
584			/*EV_CQ_VAR (overflow) = 0;*/ /* need to do this if we keep the state and poll manually */
585
586	0		fd_rearm_all (EV_A);
587
588			/* we double the size until we hit the hard-to-probe maximum */
589	0	0	if (!iouring_max_entries)
590			{
591	0		iouring_entries <<= 1;
592	0		iouring_fork (EV_A);
593			}
594			else
595			{
596			/* we hit the kernel limit, we should fall back to something else.
597			* we can either poll() a few times and hope for the best,
598			* poll always, or switch to epoll.
599			* TODO: is this necessary with newer kernels?
600			*/
601
602	0		iouring_internal_destroy (EV_A);
603
604			/* this should make it so that on return, we don't call any uring functions */
605	0		iouring_to_submit = 0;
606
607			for (;;)
608			{
609	0		backend = epoll_init (EV_A_ 0);
610
611	0	0	if (backend)
612	0		break;
613
614	0		ev_syserr ("(libev) iouring switch to epoll");
615			}
616			}
617	0		}
618
619			/* handle any events in the completion queue, return true if there were any */
620			static int
621	0		iouring_handle_cq (EV_P)
622			{
623			unsigned head, tail, mask;
624
625	0		head = EV_CQ_VAR (head);
626	0		ECB_MEMORY_FENCE_ACQUIRE;
627	0		tail = EV_CQ_VAR (tail);
628
629	0	0	if (head == tail)
630	0		return 0;
631
632			/* it can only overflow if we have events, yes, yes? */
633	0	0	if (ecb_expect_false (EV_CQ_VAR (overflow)))
634			{
635	0		iouring_overflow (EV_A);
636	0		return 1;
637			}
638
639	0		mask = EV_CQ_VAR (ring_mask);
640
641			do
642	0		iouring_process_cqe (EV_A_ &EV_CQES [head++ & mask]);
643	0	0	while (head != tail);
644
645	0		EV_CQ_VAR (head) = head;
646	0		ECB_MEMORY_FENCE_RELEASE;
647
648	0		return 1;
649			}
650
651			static void
652	0		iouring_poll (EV_P_ ev_tstamp timeout)
653			{
654			/* if we have events, no need for extra syscalls, but we might have to queue events */
655			/* we also clear the timeout if there are outstanding fdchanges */
656			/* the latter should only happen if both the sq and cq are full, most likely */
657			/* because we have a lot of event sources that immediately complete */
658			/* TODO: fdchangecnt is always 0 because fd_reify does not have two buffers yet */
659	0	0	if (iouring_handle_cq (EV_A) \|\| fdchangecnt)
		0
660	0		timeout = EV_TS_CONST (0.);
661			else
662			/* no events, so maybe wait for some */
663	0		iouring_tfd_update (EV_A_ timeout);
664
665			/* only enter the kernel if we have something to submit, or we need to wait */
666	0	0	if (timeout \|\| iouring_to_submit)
		0
667			{
668	0		int res = iouring_enter (EV_A_ timeout);
669
670	0	0	if (ecb_expect_false (res < 0))
671	0	0	if (errno == EINTR)
672			/* ignore */;
673	0	0	else if (errno == EBUSY)
674			/* cq full, cannot submit - should be rare because we flush the cq first, so simply ignore */;
675			else
676	0		ev_syserr ("(libev) iouring setup");
677			else
678	0		iouring_handle_cq (EV_A);
679			}
680	0		}
681
682			inline_size
683			int
684	0		iouring_init (EV_P_ int flags)
685			{
686	0		iouring_entries = IOURING_INIT_ENTRIES;
687	0		iouring_max_entries = 0;
688
689	0	0	if (iouring_internal_init (EV_A) < 0)
690			{
691	0		iouring_internal_destroy (EV_A);
692	0		return 0;
693			}
694
695	0		ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ);
696	0		ev_set_priority (&iouring_tfd_w, EV_MINPRI);
697	0		ev_io_start (EV_A_ &iouring_tfd_w);
698	0		ev_unref (EV_A); /* watcher should not keep loop alive */
699
700	0		backend_modify = iouring_modify;
701	0		backend_poll = iouring_poll;
702
703	0		return EVBACKEND_IOURING;
704			}
705
706			inline_size
707			void
708	0		iouring_destroy (EV_P)
709			{
710	0		iouring_internal_destroy (EV_A);
711	0		}
712