[libvirt] [RFC] Substitute poll by epoll in virEventPollRunOnce
Dmitry Derbyshev
dderbyshev at virtuozzo.com
Fri Feb 10 13:34:09 UTC 2017
FlameGraphs show that on small machine that can run 35 VMs version with
epoll has 20% less perf counters.
2/10/2017 4:29 PM, Derbyshev Dmitriy пишет:
> From: Derbyshev Dmitry <dderbyshev at virtuozzo.com>
>
> This makes it possible to avoid allocations in
> virEventPollMakePollFDs, as well as generally reduces time spent in
> kernel if handles remain the same, which should generally be true.
>
> __________________________________________
>
> Questions:
> 1) What to do with probe in virEventPollRunOnce?
> 2) Are ifdef an acceptable solution to epoll avaliability issues?
> 3) Is using MAX_POLL_EVENTS_AT_ONCE constant a valid solution?
> 4) some fds are sometimes added twice, but only once with events!=0? This complicates virEventPoll*Handle fuctions. At the very least, it is called twice in _dbus_watch_list_set_functions.
>
> Signed-off-by: Derbyshev Dmitry <dderbyshev at virtuozzo.com>
> ---
> configure.ac | 26 +++++
> src/util/vireventpoll.c | 205 ++++++++++++++++++++++++++++++++++++---
> src/util/vireventpoll.h | 5 +
> tests/commanddata/test3epoll.log | 15 +++
> tests/commandtest.c | 4 +
> 5 files changed, 242 insertions(+), 13 deletions(-)
> create mode 100644 tests/commanddata/test3epoll.log
>
> diff --git a/configure.ac b/configure.ac
> index dfc536f..c985ccc 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -2695,6 +2695,32 @@ AC_DEFINE_UNQUOTED([isbase64],[libvirt_gl_isbase64],[Hack to avoid symbol clash]
> AC_DEFINE_UNQUOTED([base64_encode],[libvirt_gl_base64_encode],[Hack to avoid symbol clash])
> AC_DEFINE_UNQUOTED([base64_encode_alloc],[libvirt_gl_base64_encode_alloc],[Hack to avoid symbol clash])
>
> +AC_ARG_ENABLE([epoll],
> + [AS_HELP_STRING([--enable-epoll],[use epoll(4) on Linux])],
> + [enable_epoll=$enableval], [enable_epoll=auto])
> +if test x$enable_epoll = xno; then
> + have_linux_epoll=no
> +else
> + AC_MSG_CHECKING([for Linux epoll(4)])
> + AC_LINK_IFELSE([AC_LANG_PROGRAM(
> + [
> + #ifndef __linux__
> + #error This is not Linux
> + #endif
> + #include <sys/epoll.h>
> + ],
> + [epoll_create1 (EPOLL_CLOEXEC);])],
> + [have_linux_epoll=yes],
> + [have_linux_epoll=no])
> + AC_MSG_RESULT([$have_linux_epoll])
> +fi
> +if test x$enable_epoll,$have_linux_epoll = xyes,no; then
> + AC_MSG_ERROR([epoll support explicitly enabled but not available])
> +fi
> +if test "x$have_linux_epoll" = xyes; then
> + AC_DEFINE_UNQUOTED([HAVE_LINUX_EPOLL], 1, [whether epoll is supported])
> +fi
> +
> AC_CONFIG_FILES([run],
> [chmod +x,-w run])
> AC_CONFIG_FILES([\
> diff --git a/src/util/vireventpoll.c b/src/util/vireventpoll.c
> index 81ecab4..1541706 100644
> --- a/src/util/vireventpoll.c
> +++ b/src/util/vireventpoll.c
> @@ -25,7 +25,11 @@
>
> #include <stdlib.h>
> #include <string.h>
> -#include <poll.h>
> +#ifdef HAVE_LINUX_EPOLL
> +# include <sys/epoll.h>
> +#else
> +# include <poll.h>
> +#endif
> #include <sys/time.h>
> #include <errno.h>
> #include <unistd.h>
> @@ -87,6 +91,9 @@ struct virEventPollLoop {
> size_t timeoutsCount;
> size_t timeoutsAlloc;
> struct virEventPollTimeout *timeouts;
> +#ifdef HAVE_LINUX_EPOLL
> + int epollfd;
> +#endif
> };
>
> /* Only have one event loop */
> @@ -109,6 +116,11 @@ int virEventPollAddHandle(int fd, int events,
> virFreeCallback ff)
> {
> int watch;
> +#ifdef HAVE_LINUX_EPOLL
> + size_t i;
> + struct epoll_event ev;
> +#endif
> + int native = virEventPollToNativeEvents(events);
> virMutexLock(&eventLoop.lock);
> if (eventLoop.handlesCount == eventLoop.handlesAlloc) {
> EVENT_DEBUG("Used %zu handle slots, adding at least %d more",
> @@ -124,8 +136,7 @@ int virEventPollAddHandle(int fd, int events,
>
> eventLoop.handles[eventLoop.handlesCount].watch = watch;
> eventLoop.handles[eventLoop.handlesCount].fd = fd;
> - eventLoop.handles[eventLoop.handlesCount].events =
> - virEventPollToNativeEvents(events);
> + eventLoop.handles[eventLoop.handlesCount].events = native;
> eventLoop.handles[eventLoop.handlesCount].cb = cb;
> eventLoop.handles[eventLoop.handlesCount].ff = ff;
> eventLoop.handles[eventLoop.handlesCount].opaque = opaque;
> @@ -133,7 +144,30 @@ int virEventPollAddHandle(int fd, int events,
>
> eventLoop.handlesCount++;
>
> +#ifdef HAVE_LINUX_EPOLL
> + ev.events = native;
> + ev.data.fd = fd;
> + if (epoll_ctl(eventLoop.epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
> + if (errno == EEXIST) {
> + for (i = 0; i < eventLoop.handlesCount; i++) {
> + if (eventLoop.handles[i].fd == fd &&
> + !eventLoop.handles[i].deleted) {
> + ev.events |= eventLoop.handles[i].events;
> + }
> + }
> + if (epoll_ctl(eventLoop.epollfd, EPOLL_CTL_MOD, fd, &ev) < 0) {
> + virMutexUnlock(&eventLoop.lock);
> + return -1;
> + }
> + }
> + else {
> + virMutexUnlock(&eventLoop.lock);
> + return -1;
> + }
> + }
> +#else
> virEventPollInterruptLocked();
> +#endif
>
> PROBE(EVENT_POLL_ADD_HANDLE,
> "watch=%d fd=%d events=%d cb=%p opaque=%p ff=%p",
> @@ -145,8 +179,14 @@ int virEventPollAddHandle(int fd, int events,
>
> void virEventPollUpdateHandle(int watch, int events)
> {
> +#ifdef HAVE_LINUX_EPOLL
> + struct epoll_event ev;
> + size_t i, j;
> +#else
> size_t i;
> +#endif
> bool found = false;
> + int native = virEventPollToNativeEvents(events);
> PROBE(EVENT_POLL_UPDATE_HANDLE,
> "watch=%d events=%d",
> watch, events);
> @@ -159,13 +199,34 @@ void virEventPollUpdateHandle(int watch, int events)
> virMutexLock(&eventLoop.lock);
> for (i = 0; i < eventLoop.handlesCount; i++) {
> if (eventLoop.handles[i].watch == watch) {
> - eventLoop.handles[i].events =
> - virEventPollToNativeEvents(events);
> + eventLoop.handles[i].events = native;
> +#ifndef HAVE_LINUX_EPOLL
> virEventPollInterruptLocked();
> +#endif
> found = true;
> break;
> }
> }
> +
> +#ifdef HAVE_LINUX_EPOLL
> + ev.events = native;
> + for (j = 0; j < eventLoop.handlesCount; j++) {
> + if (eventLoop.handles[i].fd == eventLoop.handles[i].fd &&
> + !eventLoop.handles[i].deleted &&
> + i != j) {
> + ev.events |= eventLoop.handles[i].events;
> + }
> + }
> + ev.data.fd = eventLoop.handles[i].fd;
> +
> + if (found && epoll_ctl(eventLoop.epollfd, EPOLL_CTL_MOD,
> + eventLoop.handles[i].fd, &ev) < 0) {
> + VIR_WARN("Update for existing handle watch %d failed", watch);
> + virMutexUnlock(&eventLoop.lock);
> + return;
> + }
> +#endif
> +
> virMutexUnlock(&eventLoop.lock);
>
> if (!found)
> @@ -180,7 +241,12 @@ void virEventPollUpdateHandle(int watch, int events)
> */
> int virEventPollRemoveHandle(int watch)
> {
> +#ifdef HAVE_LINUX_EPOLL
> + struct epoll_event ev;
> + size_t i, j;
> +#else
> size_t i;
> +#endif
> PROBE(EVENT_POLL_REMOVE_HANDLE,
> "watch=%d",
> watch);
> @@ -196,15 +262,47 @@ int virEventPollRemoveHandle(int watch)
> continue;
>
> if (eventLoop.handles[i].watch == watch) {
> - EVENT_DEBUG("mark delete %zu %d", i, eventLoop.handles[i].fd);
> - eventLoop.handles[i].deleted = 1;
> - virEventPollInterruptLocked();
> + break;
> + }
> + }
> +
> + if (i == eventLoop.handlesCount) {
> + virMutexUnlock(&eventLoop.lock);
> + return -1;
> + }
> +
> +#ifdef HAVE_LINUX_EPOLL
> + ev.events = 0;
> + for (j = 0; j < eventLoop.handlesCount; j++) {
> + if (eventLoop.handles[j].fd == eventLoop.handles[i].fd &&
> + !eventLoop.handles[j].deleted &&
> + i != j) {
> + ev.events |= eventLoop.handles[i].events;
> + }
> + }
> +
> + ev.data.fd = eventLoop.handles[i].fd;
> + if (ev.events) {
> + if (epoll_ctl(eventLoop.epollfd, EPOLL_CTL_MOD, eventLoop.handles[i].fd,
> + &ev) < 0) {
> virMutexUnlock(&eventLoop.lock);
> - return 0;
> + return -1;
> }
> }
> + else {
> + if (epoll_ctl(eventLoop.epollfd, EPOLL_CTL_DEL, eventLoop.handles[i].fd,
> + &ev) < 0) {
> + virMutexUnlock(&eventLoop.lock);
> + return -1;
> + }
> + }
> +#endif
> +
> + EVENT_DEBUG("mark delete %zu %d", i, eventLoop.handles[i].fd);
> + eventLoop.handles[i].deleted = 1;
> + virEventPollInterruptLocked();
> virMutexUnlock(&eventLoop.lock);
> - return -1;
> + return 0;
> }
>
>
> @@ -373,6 +471,7 @@ static int virEventPollCalculateTimeout(int *timeout)
> return 0;
> }
>
> +#ifndef HAVE_LINUX_EPOLL
> /*
> * Allocate a pollfd array containing data for all registered
> * file handles. The caller must free the returned data struct
> @@ -409,6 +508,7 @@ static struct pollfd *virEventPollMakePollFDs(int *nfds) {
>
> return fds;
> }
> +#endif
>
>
> /*
> @@ -472,7 +572,11 @@ static int virEventPollDispatchTimeouts(void)
> *
> * Returns 0 upon success, -1 if an error occurred
> */
> +#ifdef HAVE_LINUX_EPOLL
> +static int virEventPollDispatchHandles(int nfds, struct epoll_event *events)
> +#else
> static int virEventPollDispatchHandles(int nfds, struct pollfd *fds)
> +#endif
> {
> size_t i, n;
> VIR_DEBUG("Dispatch %d", nfds);
> @@ -482,7 +586,11 @@ static int virEventPollDispatchHandles(int nfds, struct pollfd *fds)
> * in the fds array we've got */
> for (i = 0, n = 0; n < nfds && i < eventLoop.handlesCount; n++) {
> while (i < eventLoop.handlesCount &&
> +#ifdef HAVE_LINUX_EPOLL
> + (eventLoop.handles[i].fd != events[n].data.fd ||
> +#else
> (eventLoop.handles[i].fd != fds[n].fd ||
> +#endif
> eventLoop.handles[i].events == 0)) {
> i++;
> }
> @@ -496,16 +604,25 @@ static int virEventPollDispatchHandles(int nfds, struct pollfd *fds)
> continue;
> }
>
> +#ifdef HAVE_LINUX_EPOLL
> + if (events[n].events) {
> + int hEvents = virEventPollFromNativeEvents(events[n].events);
> +#else
> if (fds[n].revents) {
> + int hEvents = virEventPollFromNativeEvents(fds[n].revents);
> +#endif
> virEventHandleCallback cb = eventLoop.handles[i].cb;
> int watch = eventLoop.handles[i].watch;
> void *opaque = eventLoop.handles[i].opaque;
> - int hEvents = virEventPollFromNativeEvents(fds[n].revents);
> PROBE(EVENT_POLL_DISPATCH_HANDLE,
> "watch=%d events=%d",
> watch, hEvents);
> virMutexUnlock(&eventLoop.lock);
> +#ifdef HAVE_LINUX_EPOLL
> + (cb)(watch, events[n].data.fd, hEvents, opaque);
> +#else
> (cb)(watch, fds[n].fd, hEvents, opaque);
> +#endif
> virMutexLock(&eventLoop.lock);
> }
> }
> @@ -618,7 +735,11 @@ static void virEventPollCleanupHandles(void)
> */
> int virEventPollRunOnce(void)
> {
> +#ifdef HAVE_LINUX_EPOLL
> + struct epoll_event events[MAX_POLL_EVENTS_AT_ONCE];
> +#else
> struct pollfd *fds = NULL;
> +#endif
> int ret, timeout, nfds;
>
> virMutexLock(&eventLoop.lock);
> @@ -628,17 +749,29 @@ int virEventPollRunOnce(void)
> virEventPollCleanupTimeouts();
> virEventPollCleanupHandles();
>
> - if (!(fds = virEventPollMakePollFDs(&nfds)) ||
> - virEventPollCalculateTimeout(&timeout) < 0)
> +#ifndef HAVE_LINUX_EPOLL
> + if (!(fds = virEventPollMakePollFDs(&nfds)))
> + goto error;
> +#endif
> + if (virEventPollCalculateTimeout(&timeout) < 0)
> goto error;
>
> virMutexUnlock(&eventLoop.lock);
>
> retry:
> +#ifdef HAVE_LINUX_EPOLL
> +//FIXME: PROBE handles
> + PROBE(EVENT_POLL_RUN,
> + "nhandles=%d timeout=%d",
> + 42, timeout);
> + nfds = ret = epoll_wait(eventLoop.epollfd, events,
> + MAX_POLL_EVENTS_AT_ONCE, timeout);
> +#else
> PROBE(EVENT_POLL_RUN,
> "nhandles=%d timeout=%d",
> nfds, timeout);
> ret = poll(fds, nfds, timeout);
> +#endif
> if (ret < 0) {
> EVENT_DEBUG("Poll got error event %d", errno);
> if (errno == EINTR || errno == EAGAIN)
> @@ -653,22 +786,32 @@ int virEventPollRunOnce(void)
> if (virEventPollDispatchTimeouts() < 0)
> goto error;
>
> +#ifdef HAVE_LINUX_EPOLL
> + if (ret > 0 &&
> + virEventPollDispatchHandles(nfds, events) < 0)
> + goto error;
> +#else
> if (ret > 0 &&
> virEventPollDispatchHandles(nfds, fds) < 0)
> goto error;
> +#endif
>
> virEventPollCleanupTimeouts();
> virEventPollCleanupHandles();
>
> eventLoop.running = 0;
> virMutexUnlock(&eventLoop.lock);
> +#ifndef HAVE_LINUX_EPOLL
> VIR_FREE(fds);
> +#endif
> return 0;
>
> error:
> virMutexUnlock(&eventLoop.lock);
> error_unlocked:
> +#ifndef HAVE_LINUX_EPOLL
> VIR_FREE(fds);
> +#endif
> return -1;
> }
>
> @@ -698,6 +841,17 @@ int virEventPollInit(void)
> return -1;
> }
>
> +#ifdef HAVE_LINUX_EPOLL
> + eventLoop.epollfd = epoll_create1(0);
> + if (eventLoop.epollfd < 0) {
> + virReportError(VIR_ERR_INTERNAL_ERROR,
> + _("Unable to initialize epoll"));
> + VIR_FORCE_CLOSE(eventLoop.wakeupfd[0]);
> + VIR_FORCE_CLOSE(eventLoop.wakeupfd[1]);
> + return -1;
> + }
> +#endif
> +
> if (virEventPollAddHandle(eventLoop.wakeupfd[0],
> VIR_EVENT_HANDLE_READABLE,
> virEventPollHandleWakeup, NULL, NULL) < 0) {
> @@ -706,6 +860,9 @@ int virEventPollInit(void)
> eventLoop.wakeupfd[0]);
> VIR_FORCE_CLOSE(eventLoop.wakeupfd[0]);
> VIR_FORCE_CLOSE(eventLoop.wakeupfd[1]);
> +#ifdef HAVE_LINUX_EPOLL
> + VIR_FORCE_CLOSE(eventLoop.epollfd);
> +#endif
> return -1;
> }
>
> @@ -742,6 +899,16 @@ int
> virEventPollToNativeEvents(int events)
> {
> int ret = 0;
> +#ifdef HAVE_LINUX_EPOLL
> + if (events & VIR_EVENT_HANDLE_READABLE)
> + ret |= EPOLLIN;
> + if (events & VIR_EVENT_HANDLE_WRITABLE)
> + ret |= EPOLLOUT;
> + if (events & VIR_EVENT_HANDLE_ERROR)
> + ret |= EPOLLERR;
> + if (events & VIR_EVENT_HANDLE_HANGUP)
> + ret |= EPOLLHUP;
> +#else
> if (events & VIR_EVENT_HANDLE_READABLE)
> ret |= POLLIN;
> if (events & VIR_EVENT_HANDLE_WRITABLE)
> @@ -750,6 +917,7 @@ virEventPollToNativeEvents(int events)
> ret |= POLLERR;
> if (events & VIR_EVENT_HANDLE_HANGUP)
> ret |= POLLHUP;
> +#endif
> return ret;
> }
>
> @@ -757,6 +925,16 @@ int
> virEventPollFromNativeEvents(int events)
> {
> int ret = 0;
> +#ifdef HAVE_LINUX_EPOLL
> + if (events & EPOLLIN)
> + ret |= VIR_EVENT_HANDLE_READABLE;
> + if (events & EPOLLOUT)
> + ret |= VIR_EVENT_HANDLE_WRITABLE;
> + if (events & EPOLLERR)
> + ret |= VIR_EVENT_HANDLE_ERROR;
> + if (events & EPOLLHUP)
> + ret |= VIR_EVENT_HANDLE_HANGUP;
> +#else
> if (events & POLLIN)
> ret |= VIR_EVENT_HANDLE_READABLE;
> if (events & POLLOUT)
> @@ -767,5 +945,6 @@ virEventPollFromNativeEvents(int events)
> ret |= VIR_EVENT_HANDLE_ERROR;
> if (events & POLLHUP)
> ret |= VIR_EVENT_HANDLE_HANGUP;
> +#endif
> return ret;
> }
> diff --git a/src/util/vireventpoll.h b/src/util/vireventpoll.h
> index 8844c9c..172009a 100644
> --- a/src/util/vireventpoll.h
> +++ b/src/util/vireventpoll.h
> @@ -26,6 +26,11 @@
>
> # include "internal.h"
>
> +# ifdef HAVE_LINUX_EPOLL
> +/* Maximum number of events that are returned by epoll in virEventPollRunOnce */
> +# define MAX_POLL_EVENTS_AT_ONCE 10
> +# endif
> +
> /**
> * virEventPollAddHandle: register a callback for monitoring file handle events
> *
> diff --git a/tests/commanddata/test3epoll.log b/tests/commanddata/test3epoll.log
> new file mode 100644
> index 0000000..e4619b3
> --- /dev/null
> +++ b/tests/commanddata/test3epoll.log
> @@ -0,0 +1,15 @@
> +ENV:DISPLAY=:0.0
> +ENV:HOME=/home/test
> +ENV:HOSTNAME=test
> +ENV:LANG=C
> +ENV:LOGNAME=testTMPDIR=/tmp
> +ENV:PATH=/usr/bin:/bin
> +ENV:USER=test
> +FD:0
> +FD:1
> +FD:2
> +FD:6
> +FD:8
> +DAEMON:no
> +CWD:/tmp
> +UMASK:0022
> diff --git a/tests/commandtest.c b/tests/commandtest.c
> index 7bf5447..6b23659 100644
> --- a/tests/commandtest.c
> +++ b/tests/commandtest.c
> @@ -227,7 +227,11 @@ static int test3(const void *unused ATTRIBUTE_UNUSED)
> goto cleanup;
> }
>
> +#ifdef HAVE_LINUX_EPOLL
> + ret = checkoutput("test3epoll", NULL);
> +#else
> ret = checkoutput("test3", NULL);
> +#endif
>
> cleanup:
> virCommandFree(cmd);
-------------- next part --------------
A non-text attachment was scrubbed...
Name: local_epoll_wo_interrupt_disp_on_3600s_35vms.svg
Type: image/svg+xml
Size: 1011457 bytes
Desc: not available
URL: <http://listman.redhat.com/archives/libvir-list/attachments/20170210/d6d86f5f/attachment-0002.svg>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: local_disp_on_3600s_35vms.svg
Type: image/svg+xml
Size: 828107 bytes
Desc: not available
URL: <http://listman.redhat.com/archives/libvir-list/attachments/20170210/d6d86f5f/attachment-0003.svg>
More information about the libvir-list
mailing list