From fdae2327bed6b99f516f7fc6fc680a242165879c Mon Sep 17 00:00:00 2001 From: Marc Alexander Lehmann Date: Sun, 23 Jun 2019 02:02:24 +0000 Subject: [PATCH] *** empty log message *** --- ev.pod | 48 +++++++++++---------- ev_linuxaio.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++-- ev_vars.h | 1 + ev_wrap.h | 2 + 4 files changed, 142 insertions(+), 25 deletions(-) diff --git a/ev.pod b/ev.pod index 731daaf..0acb270 100644 --- a/ev.pod +++ b/ev.pod @@ -576,33 +576,37 @@ C. =item C<EVBACKEND_LINUXAIO> (value 64, Linux) -Use the linux-specific linux aio (I<not> C<< aio(7) >>) event interface -available in post-4.18 kernels. +Use the linux-specific linux aio (I<not> C<< aio(7) >> but C<< +io_submit(2) >>) event interface available in post-4.18 kernels. If this backend works for you (as of this writing, it was very -experimental and only supports a subset of file types), it is the best -event interface available on linux and might be well worth it enabling it -- if it isn't available in your kernel this will be detected and another -backend will be chosen. +experimental), it is the best event interface available on linux and might +be well worth enabling it - if it isn't available in your kernel this will +be detected and this backend will be skipped. -This backend can batch oneshot requests and uses a user-space ring buffer -to receive events. It also doesn't suffer from most of the design problems -of epoll (such as not being able to remove event sources from the epoll -set), and generally sounds too good to be true. Because, this being the -linux kernel, of course it suffers from a whole new set of limitations. +This backend can batch oneshot requests and supports a user-space ring +buffer to receive events. It also doesn't suffer from most of the design +problems of epoll (such as not being able to remove event sources from +the epoll set), and generally sounds too good to be true. Because, this +being the linux kernel, of course it suffers from a whole new set of +limitations. 
For one, it is not easily embeddable (but probably could be done using -an event fd at some extra overhead). It also is subject to various -arbitrary limits that can be configured in F</proc/sys/fs/aio-max-nr> -and F</proc/sys/fs/aio-nr>), which could lead to it being skipped during -initialisation. - -Most problematic in practise, however, is that, like kqueue, it requires -special support from drivers, and, not surprisingly, not all drivers -implement it. For example, in linux 4.19, tcp sockets, pipes, event fds, -files, F</dev/null> and a few others are supported, but ttys are not, so -this is not (yet?) a generic event polling interface but is probably still -be very useful in a web server or similar program. +an event fd at some extra overhead). It also is subject to a system wide +limit that can be configured in F</proc/sys/fs/aio-max-nr> - each loop +currently requires C<61> of this number. If no aio requests are left, this +backend will be skipped during initialisation. + +Most problematic in practise, however, is that not all file descriptors +work with it. For example, in linux 5.1, tcp sockets, pipes, event fds, +files, F</dev/null> and a few others are supported, but ttys do not work +(probably because of a bug), so this is not (yet?) a generic event polling +interface. + +To work around this latter problem, the current version of libev uses +epoll as a fallback for file descriptor types that do not work. Epoll +is used in a kind of slow mode that hopefully avoids most of its design +problems. This backend maps C<EV_READ> and C<EV_WRITE> in the same way as C<EVBACKEND_POLL>. diff --git a/ev_linuxaio.c b/ev_linuxaio.c index 9d33e83..3cf8b5e 100644 --- a/ev_linuxaio.c +++ b/ev_linuxaio.c @@ -37,10 +37,16 @@ * either the BSD or the GPL. */ +#define EPOLL_FALLBACK 1 + #include /* actually linux/time.h, but we must assume they are compatible */ #include #include +#if EPOLL_FALLBACK +# include +#endif + /* we try to fill 4kB pages exactly. * the ring buffer header is 32 bytes, every io event is 32 bytes. 
* the kernel takes the io event number, doubles it, adds 2, adds the ring buffer. @@ -151,6 +157,14 @@ linuxaio_modify (EV_P_ int fd, int oev, int nev) array_needsize (ANIOCBP, linuxaio_iocbps, linuxaio_iocbpmax, fd + 1, linuxaio_array_needsize_iocbp); struct aniocb *iocb = linuxaio_iocbps [fd]; +#if EPOLL_FALLBACK + if (iocb->io.aio_reqprio < 0) + { + epoll_ctl (backend_fd, EPOLL_CTL_DEL, fd, 0); + iocb->io.aio_reqprio = 0; + } +#endif + if (iocb->io.aio_buf) ev_io_cancel (linuxaio_ctx, &iocb->io, (struct io_event *)0); /* always returns an error relevant kernels */ @@ -257,7 +271,10 @@ linuxaio_get_events (EV_P_ ev_tstamp timeout) res = ev_io_getevents (linuxaio_ctx, 1, 1, &ioev, &ts); if (res < 0) - ev_syserr ("(libev) linuxaio io_getevents"); + if (errno == EINTR) + /* ignored */; + else + ev_syserr ("(libev) linuxaio io_getevents"); else if (res) { /* at least one event received, handle it and any remaining ones in the ring buffer */ @@ -266,6 +283,22 @@ linuxaio_get_events (EV_P_ ev_tstamp timeout) } } +#if EPOLL_FALLBACK +static void +linuxaio_rearm_epoll (EV_P_ struct iocb *iocb, int op) +{ + struct epoll_event eev; + + eev.events = EPOLLONESHOT; + if (iocb->aio_buf & POLLIN ) eev.events |= EPOLLIN ; + if (iocb->aio_buf & POLLOUT) eev.events |= EPOLLOUT; + eev.data.fd = iocb->aio_fildes; + + if (epoll_ctl (backend_fd, op, iocb->aio_fildes, &eev) < 0) + ev_syserr ("(libeio) linuxaio epoll_ctl"); +} +#endif + static void linuxaio_poll (EV_P_ ev_tstamp timeout) { @@ -285,8 +318,8 @@ linuxaio_poll (EV_P_ ev_tstamp timeout) { /* This happens when the ring buffer is full, at least. I assume this means * that the event was queued synchronously during io_submit, and thus - * the buffer overflowd. - * In this case, we just try next loop iteration. + * the buffer overflowed. + * In this case, we just try in next loop iteration. * This should not result in a few fds taking priority, as the interface * is one-shot, and we submit iocb's in a round-robin fashion. 
*/ @@ -295,6 +328,20 @@ linuxaio_poll (EV_P_ ev_tstamp timeout) timeout = 0; break; } +#if EPOLL_FALLBACK + else if (errno == EINVAL) + { + /* This happens for unsupported fds, officially, but in my testing, + * also randomly happens for supported fds. We fall back to good old + * poll() here, under the assumption that this is a very rare case. + */ + struct iocb *iocb = linuxaio_submits [submitted]; + res = 1; /* skip this iocb */ + + linuxaio_rearm_epoll (EV_A_ iocb, EPOLL_CTL_ADD); + iocb->aio_reqprio = -1; /* mark iocb as epoll */ + } +#endif else ev_syserr ("(libev) linuxaio io_submit"); @@ -308,6 +355,44 @@ linuxaio_poll (EV_P_ ev_tstamp timeout) linuxaio_get_events (EV_A_ timeout); } +#if EPOLL_FALLBACK + +static void +linuxaio_epoll_cb (EV_P_ struct ev_io *w, int revents) +{ + struct epoll_event events[16]; + + for (;;) + { + int idx; + int res = epoll_wait (backend_fd, events, sizeof (events) / sizeof (events [0]), 0); + + if (ecb_expect_false (res < 0)) + ev_syserr ("(libev) linuxaio epoll_wait"); + else if (!res) + break; + + for (idx = res; idx--; ) + { + int fd = events [idx].data.fd; + uint32_t ev = events [idx].events; + + assert (("libev: iocb fd must be in-bounds", fd >= 0 && fd < anfdmax)); + + linuxaio_rearm_epoll (EV_A_ &linuxaio_iocbps [fd]->io, EPOLL_CTL_MOD); + + fd_event (EV_A_ fd, + (ev & (EPOLLOUT | EPOLLERR | EPOLLHUP) ? EV_WRITE : 0) + | (ev & (EPOLLIN | EPOLLERR | EPOLLHUP) ? 
EV_READ : 0)); + } + + if (res < sizeof (events) / sizeof (events [0])) + break; + } +} + +#endif + inline_size int linuxaio_init (EV_P_ int flags) @@ -321,6 +406,18 @@ linuxaio_init (EV_P_ int flags) if (ev_io_setup (EV_LINUXAIO_DEPTH, &linuxaio_ctx) < 0) return 0; +#if EPOLL_FALLBACK + backend_fd = ev_epoll_create (); + if (backend_fd < 0) + { + ev_io_destroy (linuxaio_ctx); + return 0; + } + + ev_io_init (EV_A_ &linuxaio_epoll_w, linuxaio_epoll_cb, backend_fd, EV_READ); + ev_io_start (EV_A_ &linuxaio_epoll_w); +#endif + backend_modify = linuxaio_modify; backend_poll = linuxaio_poll; @@ -338,6 +435,9 @@ inline_size void linuxaio_destroy (EV_P) { +#if EPOLL_FALLBACK + close (backend_fd); +#endif linuxaio_free_iocbp (EV_A); ev_io_destroy (linuxaio_ctx); } @@ -354,6 +454,16 @@ linuxaio_fork (EV_P) while (ev_io_setup (EV_LINUXAIO_DEPTH, &linuxaio_ctx) < 0) ev_syserr ("(libev) linuxaio io_setup"); +#if EPOLL_FALLBACK + while ((backend_fd = ev_epoll_create ()) < 0) + ev_syserr ("(libev) linuxaio epoll_create"); + + ev_io_stop (EV_A_ &linuxaio_epoll_w); + ev_io_init (EV_A_ &linuxaio_epoll_w, linuxaio_epoll_cb, backend_fd, EV_READ); + ev_io_start (EV_A_ &linuxaio_epoll_w); + ev_unref (EV_A); /* watcher should not keep loop alive */ +#endif + fd_rearm_all (EV_A); } diff --git a/ev_vars.h b/ev_vars.h index 2582b46..b877a67 100644 --- a/ev_vars.h +++ b/ev_vars.h @@ -114,6 +114,7 @@ VARx(int, linuxaio_iocbpmax) VARx(struct iocb **, linuxaio_submits) VARx(int, linuxaio_submitcnt) VARx(int, linuxaio_submitmax) +VARx(ev_io, linuxaio_epoll_w) #endif #if EV_USE_KQUEUE || EV_GENWRAP diff --git a/ev_wrap.h b/ev_wrap.h index 8f8173b..84c583e 100644 --- a/ev_wrap.h +++ b/ev_wrap.h @@ -51,6 +51,7 @@ #define kqueue_events ((loop)->kqueue_events) #define kqueue_fd_pid ((loop)->kqueue_fd_pid) #define linuxaio_ctx ((loop)->linuxaio_ctx) +#define linuxaio_epoll_w ((loop)->linuxaio_epoll_w) #define linuxaio_iocbpmax ((loop)->linuxaio_iocbpmax) #define linuxaio_iocbps 
((loop)->linuxaio_iocbps) #define linuxaio_submitcnt ((loop)->linuxaio_submitcnt) @@ -156,6 +157,7 @@ #undef kqueue_events #undef kqueue_fd_pid #undef linuxaio_ctx +#undef linuxaio_epoll_w #undef linuxaio_iocbpmax #undef linuxaio_iocbps #undef linuxaio_submitcnt