Re: Unexpected cascaded epoll behavior - my mistake or kernel bug

From: Davide Libenzi
Date: Sun Jan 25 2009 - 16:02:40 EST


On Sat, 24 Jan 2009, Davide Libenzi wrote:

> On Fri, 23 Jan 2009, Pavel Pisa wrote:
>
> > Hello Davide and all others,
> >
> > I have got to implementing yet another event library and I experience
> > something strange when level triggered epoll_wait() monitors another
> > level triggered epoll set. Top level epoll_wait() return information
> > about one event pending. The event points correctly to the monitored
> > second level epoll fd, but epoll_wait() on this fd with 0 or small timeout
> > returns zero/no events and top level epoll reporting is not reset at this
> > point so program enters busy loop. If there are events, they are processed
> > correctly. More interresting is, that the ill behavior is prevented if I
> > change number of debug writes into unrelated fd or if the application
> > is slowed down by strace to standard output. The strace to the
> > file does not (fortunatelly) hide ill behavior to the strange behavior
> > of my code or kernel can be documented in attached straces. The both
> > traces documents the busy loop cause for case, where there is created
> > epoll set with fd #3, that set monitors fd #0. When character arrives
> > at fd #0, the new epoll fd set with fd #4 is created ad fd #3 is added
> > into this new top level fd set.
> >
> > epoll_create(8) = 3
> > epoll_ctl(3, EPOLL_CTL_ADD, 0, {EPOLLIN, {u32=6353264, u64=6353264}}) = 0
> > epoll_wait(3, {{EPOLLIN, {u32=6353264, u64=6353264}}}, 8, 20000) = 1
> > read(0, "E"..., 1) = 1
> > ...
> > epoll_create(8) = 4
> > epoll_ctl(4, EPOLL_CTL_ADD, 3, {EPOLLIN, {u32=6353664, u64=6353664}}) = 0
> > epoll_wait(4, {{EPOLLIN, {u32=6353664, u64=6353664}}}, 8, 19998) = 1
> > epoll_wait(3, {{EPOLLIN, {u32=6353264, u64=6353264}}}, 8, 0) = 1
> > read(0, "\n"..., 1) = 1
> > epoll_wait(4, {{EPOLLIN, {u32=6353664, u64=6353664}}}, 8, 19998) = 1
> > epoll_wait(3, {}, 8, 0) = 0
> > epoll_wait(4, {{EPOLLIN, {u32=6353664, u64=6353664}}}, 8, 19998) = 1
> > epoll_wait(3, {}, 8, 0) = 0
> >
> > You may ask, why that terrible clam starts to writting new event library code
> > and why he needs to cascade event handling mechanisms. I try to defend myself
> > a little. If you do not like beat me, you can skip next section
>
> Pavel, can you give the patch below a spin? It's working fine in my two
> boxes, but it'd be great if you could confirm it too.
> Attached there's also a simple test program to check epoll behavior,
> especially the one related to recursion.

Please use the attached epoll_test.c since the one I sent you before
wasn't actually triggering the loop condition it wanted to check.



- Davide

/*
* epoll_test by Davide Libenzi (Simple code to test epoll internals)
* Copyright (C) 2008 Davide Libenzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Davide Libenzi <davidel@xxxxxxxxxxxxxxx>
*
*/

#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <limits.h>
#include <poll.h>
#include <sys/epoll.h>
#include <sys/wait.h>

#define EPWAIT_TIMEO (1 * 1000)
#ifndef POLLRDHUP
#define POLLRDHUP 0x2000
#endif

#define EPOLL_MAX_CHAIN 100L

#define EPOLL_TF_LOOP (1 << 0)

struct epoll_test_cfg {
long size;
long flags;
};

static int xepoll_create(int n) {
int epfd;

if ((epfd = epoll_create(n)) == -1) {
perror("epoll_create");
exit(2);
}

return epfd;
}

static void xepoll_ctl(int epfd, int cmd, int fd, struct epoll_event *evt) {
if (epoll_ctl(epfd, cmd, fd, evt) < 0) {
perror("epoll_ctl");
exit(3);
}
}

static void xpipe(int *fds) {
if (pipe(fds)) {
perror("pipe");
exit(4);
}
}

static pid_t xfork(void) {
pid_t pid;

if ((pid = fork()) == (pid_t) -1) {
perror("pipe");
exit(5);
}

return pid;
}

static int run_forked_proc(int (*proc)(void *), void *data) {
int status;
pid_t pid;

if ((pid = xfork()) == 0)
exit((*proc)(data));
if (waitpid(pid, &status, 0) != pid) {
perror("waitpid");
return -1;
}

return WIFEXITED(status) ? WEXITSTATUS(status): -2;
}

static int check_events(int fd, int timeo) {
struct pollfd pfd;

fprintf(stdout, "Checking events for fd %d\n", fd);
memset(&pfd, 0, sizeof(pfd));
pfd.fd = fd;
pfd.events = POLLIN | POLLOUT;
if (poll(&pfd, 1, timeo) < 0) {
perror("poll()");
return 0;
}
if (pfd.revents & POLLIN)
fprintf(stdout, "\tPOLLIN\n");
if (pfd.revents & POLLOUT)
fprintf(stdout, "\tPOLLOUT\n");
if (pfd.revents & POLLERR)
fprintf(stdout, "\tPOLLERR\n");
if (pfd.revents & POLLHUP)
fprintf(stdout, "\tPOLLHUP\n");
if (pfd.revents & POLLRDHUP)
fprintf(stdout, "\tPOLLRDHUP\n");

return pfd.revents;
}

static int epoll_test_tty(void *data) {
int epfd, ifd = fileno(stdin), res;
struct epoll_event evt;

if (check_events(ifd, 0) != POLLOUT) {
fprintf(stderr, "Something is cooking on STDIN (%d)\n", ifd);
return 1;
}
epfd = xepoll_create(1);
fprintf(stdout, "Created epoll fd (%d)\n", epfd);
memset(&evt, 0, sizeof(evt));
evt.events = EPOLLIN;
xepoll_ctl(epfd, EPOLL_CTL_ADD, ifd, &evt);
if (check_events(epfd, 0) & POLLIN) {
res = epoll_wait(epfd, &evt, 1, 0);
if (res == 0) {
fprintf(stderr, "Epoll fd (%d) is ready when it shouldn't!\n",
epfd);
return 2;
}
}

return 0;
}

static int epoll_wakeup_chain(void *data) {
struct epoll_test_cfg *tcfg = data;
int i, res, epfd, bfd, nfd, pfds[2];
pid_t pid;
struct epoll_event evt;

memset(&evt, 0, sizeof(evt));
evt.events = EPOLLIN;

epfd = bfd = xepoll_create(1);

for (i = 0; i < tcfg->size; i++) {
nfd = xepoll_create(1);
xepoll_ctl(bfd, EPOLL_CTL_ADD, nfd, &evt);
bfd = nfd;
}
xpipe(pfds);
if (tcfg->flags & EPOLL_TF_LOOP)
{
xepoll_ctl(bfd, EPOLL_CTL_ADD, epfd, &evt);
/*
* If we're testing for loop, we want that the wakeup
* triggered by the write to the pipe done in the child
* process, triggers a fake event. So we add the pipe
* read size with EPOLLOUT events. This will trigger
* an addition to the ready-list, but no real events
* will be there. The the epoll kernel code will proceed
* in calling f_op->poll() of the epfd, triggering the
* loop we want to test.
*/
evt.events = EPOLLOUT;
}
xepoll_ctl(bfd, EPOLL_CTL_ADD, pfds[0], &evt);

/*
* The pipe write must come after the poll(2) call inside
* check_events(). This tests the nested wakeup code in
* fs/eventpoll.c:ep_poll_safewake()
* By having the check_events() (hence poll(2)) happens first,
* we have poll wait queue filled up, and the write(2) in the
* child will trigger the wakeup chain.
*/
if ((pid = xfork()) == 0) {
sleep(1);
write(pfds[1], "w", 1);
exit(0);
}

res = check_events(epfd, 2000) & POLLIN;

if (waitpid(pid, NULL, 0) != pid) {
perror("waitpid");
return -1;
}

return res;
}

static int epoll_poll_chain(void *data) {
struct epoll_test_cfg *tcfg = data;
int i, res, epfd, bfd, nfd, pfds[2];
pid_t pid;
struct epoll_event evt;

memset(&evt, 0, sizeof(evt));
evt.events = EPOLLIN;

epfd = bfd = xepoll_create(1);

for (i = 0; i < tcfg->size; i++) {
nfd = xepoll_create(1);
xepoll_ctl(bfd, EPOLL_CTL_ADD, nfd, &evt);
bfd = nfd;
}
xpipe(pfds);
if (tcfg->flags & EPOLL_TF_LOOP)
{
xepoll_ctl(bfd, EPOLL_CTL_ADD, epfd, &evt);
/*
* If we're testing for loop, we want that the wakeup
* triggered by the write to the pipe done in the child
* process, triggers a fake event. So we add the pipe
* read size with EPOLLOUT events. This will trigger
* an addition to the ready-list, but no real events
* will be there. The the epoll kernel code will proceed
* in calling f_op->poll() of the epfd, triggering the
* loop we want to test.
*/
evt.events = EPOLLOUT;
}
xepoll_ctl(bfd, EPOLL_CTL_ADD, pfds[0], &evt);

/*
* The pipe write mush come before the poll(2) call inside
* check_events(). This tests the nested f_op->poll calls code in
* fs/eventpoll.c:ep_eventpoll_poll()
* By having the pipe write(2) happen first, we make the kernel
* epoll code to load the ready lists, and the following poll(2)
* done inside check_events() will test nested poll code in
* ep_eventpoll_poll().
*/
if ((pid = xfork()) == 0) {
write(pfds[1], "w", 1);
exit(0);
}
sleep(1);
res = check_events(epfd, 1000) & POLLIN;

if (waitpid(pid, NULL, 0) != pid) {
perror("waitpid");
return -1;
}

return res;
}

int main(int ac, char **av) {
int error;
struct epoll_test_cfg tcfg;

fprintf(stdout, "\n********** Testing TTY events\n");
error = run_forked_proc(epoll_test_tty, NULL);
fprintf(stdout, error == 0 ?
"********** OK\n": "********** FAIL (%d)\n", error);

tcfg.size = 3;
tcfg.flags = 0;
fprintf(stdout, "\n********** Testing short wakeup chain\n");
error = run_forked_proc(epoll_wakeup_chain, &tcfg);
fprintf(stdout, error == POLLIN ?
"********** OK\n": "********** FAIL (%d)\n", error);

tcfg.size = EPOLL_MAX_CHAIN;
tcfg.flags = 0;
fprintf(stdout, "\n********** Testing long wakeup chain (HOLD ON)\n");
error = run_forked_proc(epoll_wakeup_chain, &tcfg);
fprintf(stdout, error == 0 ?
"********** OK\n": "********** FAIL (%d)\n", error);

tcfg.size = 3;
tcfg.flags = 0;
fprintf(stdout, "\n********** Testing short poll chain\n");
error = run_forked_proc(epoll_poll_chain, &tcfg);
fprintf(stdout, error == POLLIN ?
"********** OK\n": "********** FAIL (%d)\n", error);

tcfg.size = EPOLL_MAX_CHAIN;
tcfg.flags = 0;
fprintf(stdout, "\n********** Testing long poll chain (HOLD ON)\n");
error = run_forked_proc(epoll_poll_chain, &tcfg);
fprintf(stdout, error == 0 ?
"********** OK\n": "********** FAIL (%d)\n", error);

tcfg.size = 3;
tcfg.flags = EPOLL_TF_LOOP;
fprintf(stdout, "\n********** Testing loopy wakeup chain (HOLD ON)\n");
error = run_forked_proc(epoll_wakeup_chain, &tcfg);
fprintf(stdout, error == 0 ?
"********** OK\n": "********** FAIL (%d)\n", error);

tcfg.size = 3;
tcfg.flags = EPOLL_TF_LOOP;
fprintf(stdout, "\n********** Testing loopy poll chain (HOLD ON)\n");
error = run_forked_proc(epoll_poll_chain, &tcfg);
fprintf(stdout, error == 0 ?
"********** OK\n": "********** FAIL (%d)\n", error);


return 0;
}