lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <3350e8b6-e9d5-ff37-cc0a-aaf84a6270d2@akamai.com>
Date:   Wed, 17 Jan 2018 12:21:54 -0500
From:   Jason Baron <jbaron@...mai.com>
To:     Nick Murphy <ncmurphy@...gle.com>, linux-kernel@...r.kernel.org
Subject: Re: PROBLEM: epoll_wait does not obey edge triggering semantics for
 hierarchically constructed epoll sets



On 01/12/2018 07:06 PM, Nick Murphy wrote:
> [1.] One line summary of the problem:
> epoll_wait does not obey edge triggering semantics for file
> descriptors which are themselves epoll file descriptors (i.e., epoll
> fd's added to an epoll set with the EPOLLET flag)
> 
> [2.] Full description of the problem/report:
> When executing the following sequence:
> 1) create and add an event fd (for example) to an inner epoll set
> 2) add the inner epoll fd to an outer epoll set (with EPOLLET flag set)
> 3) write to (increase the value of) the event fd
> 4) epoll_wait on outer fd
> 5) epoll_wait on outer fd again
> 
> Edge triggering semantics imply that the epoll_wait in step 5 should
> block (nothing has changed).  It does not.  It returns immediately.
> 
> If epoll_wait is called on the inner fd between steps 4 and 5, the
> epoll_wait in step 5 will then block as expected.
> 
> Does not seem to matter if the event is added to the inner epoll set
> with EPOLLET set or not.
> 
> [3.] Keywords (i.e., modules, networking, kernel): epoll, epoll_wait,
> edge triggering
> 
> [4.] Kernel version (from /proc/version): 4.4.0-103-generic (gcc version 4.8.4)
> 
> [6.] A small shell script or example program which triggers the
>      problem (if possible)
> 

Interesting - it seems that epoll can excessively queue wakeup events
when not desired. Here's a small patch which cures this case, if you
want to try it out. The semantics around nested edge trigger, though do
seem unexpected. For example, in the test case you presented. If one
does epoll_wait() on the outer first and then the inner both
epoll_wait() will return, however if one does the inner first and then
the outer, only the inner will return an event. This has to do with how
epoll implements its polling, it seems odd as well and trickier to fix.
Afaict its always acted like this.

Thanks,

-Jason

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index afd548e..6bd1f46 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -713,6 +713,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
                if (!ep_is_linked(&epi->rdllink)) {
                        list_add_tail(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);
+                       pwake = 1;
                }
        }
        /*
@@ -728,7 +729,8 @@ static int ep_scan_ready_list(struct eventpoll *ep,
        list_splice(&txlist, &ep->rdllist);
        __pm_relax(ep->ws);

-       if (!list_empty(&ep->rdllist)) {
+       if (unlikely(pwake)) {
+               pwake = 0;
                /*
                 * Wake up (if active) both the eventpoll wait list and
                 * the ->poll() wait list (delayed after we release the
lock).
@@ -744,7 +746,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
                mutex_unlock(&ep->mtx);

        /* We have to call this outside the lock */
-       if (pwake)
+       if (unlikely(pwake))
                ep_poll_safewake(&ep->poll_wait);

        return error;


> #include <fcntl.h>
> #include <stdio.h>
> #include <sys/epoll.h>
> #include <sys/eventfd.h>
> #include <unistd.h>
> 
> int main(int argc, char** argv) {
>   struct epoll_event ev, events[1];
>   int inner_ep, outer_ep, sem, flags, ret;
>   long long val = 1;
> 
>   if ((sem = eventfd(0, 0)) < 0) {
>     fprintf(stderr, "eventfd failed");
>     return -1;
>   }
> 
>   if ((inner_ep = epoll_create(1)) < 0) {
>     fprintf(stderr, "inner epoll_create failed");
>     return -1;
>   }
> 
>   // Set inner to be non-blocking (probably irrelevant, but...)
>   if ((flags = fcntl(inner_ep, F_GETFL, 0)) < 0) {
>     fprintf(stderr, "fcntl get failed");
>     return -1;
>   }
>   flags |= O_NONBLOCK;
>   if (fcntl(inner_ep, F_SETFL, flags) < 0) {
>     fprintf(stderr, "fcntl set failed");
>     return -1;
>   }
> 
>   // Add the event to the inner epoll instance.
>   ev.events = EPOLLIN | EPOLLET;
>   ev.data.fd = sem;
>   if (epoll_ctl(inner_ep, EPOLL_CTL_ADD, sem, &ev) < 0) {
>     fprintf(stderr, "inner add failed");
>     return -1;
>   }
> 
>   if ((outer_ep = epoll_create(1)) < 0) {
>     fprintf(stderr, "outer epoll_create failed");
>     return -1;
>   }
> 
>   // Add the inner epoll instance to the outer.  Note the EPOLLET!
>   ev.events = EPOLLIN | EPOLLET;
>   ev.data.fd = inner_ep;
>   if (epoll_ctl(outer_ep, EPOLL_CTL_ADD, inner_ep, &ev) < 0) {
>     fprintf(stderr, "outer add failed");
>     return -1;
>   }
> 
>   // Write to the event.
>   if (write(sem, &val, sizeof(val)) != sizeof(val)) {
>     fprintf(stderr, "write failed");
>     return -1;
>   }
> 
>   // This should return immediately.
>   printf("First wait.\n");
>   if ((ret = epoll_wait(outer_ep, events, 1, 10000)) < 0) {
>     fprintf(stderr, "epoll_wait failed");
>     return -1;
>   }
>   printf("...returned %d.\n", ret);
> 
>   // This should _not_ return immediately if edge triggered!
>   printf("Second wait.");
>   if ((ret = epoll_wait(outer_ep, events, 1, 10000)) < 0) {
>     fprintf(stderr, "epoll_wait failed");
>     return -1;
>   }
>   printf("...returned %d.\n", ret);
> 
>   printf("Done.\n");
> 
>   return 0;
> }
> 
> [7.] Environment
> (Should not be sensitive to hardware or kernel version, I don't
> think...a basic flaw with epoll_wait logic)
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ