[patch] epoll with a per-fd selectable LT/ET behaviour ...

From: Davide Libenzi (davidel@xmailserver.org)
Date: Tue Mar 11 2003 - 14:50:18 EST


This is a slight ( 2 lines of code ) modification of the patch posted
yesterday, that implements a per-fd selectable LT/ET behaviour. A new
event flag EPOLLET has been defined to let the epoll user to select the
interface behaviour on a per-fd basis. If we agree on this I'll go ahead
in editing man pages and update glibc and man-pages folks ...

PS: The patch is against 2.5.64 vanilla ...

- Davide

fs/eventpoll.c | 227 +++++++++++++++++++++++++++++++---------------
include/linux/eventpoll.h | 5 -
2 files changed, 158 insertions, 74 deletions

diff -Nru linux-2.5.64.vanilla/fs/eventpoll.c linux-2.5.64.epoll/fs/eventpoll.c
--- linux-2.5.64.vanilla/fs/eventpoll.c Tue Mar 4 19:29:16 2003
+++ linux-2.5.64.epoll/fs/eventpoll.c Tue Mar 11 11:29:21 2003
@@ -1,6 +1,6 @@
 /*
  * fs/eventpoll.c ( Efficent event polling implementation )
- * Copyright (C) 2001,...,2002 Davide Libenzi
+ * Copyright (C) 2001,...,2003 Davide Libenzi
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -117,11 +117,6 @@
  */
 #define EP_MAX_BUF_EVENTS 32

-/*
- * Used to optimize ready items collection by reducing the irqlock/irqunlock
- * switching rate. This is kept in stack too, so do not go wild with this number.
- */
-#define EP_MAX_COLLECT_ITEMS 64

 /*
@@ -223,6 +218,15 @@

         /* List header used to link this item to the "struct file" items list */
         struct list_head fllink;
+
+ /* List header used to link the item to the transfer list */
+ struct list_head txlink;
+
+ /*
+ * This is used during the collection/transfer of events to userspace
+ * to pin items empty events set.
+ */
+ unsigned int revents;
 };

 /* Wrapper struct used by poll queueing */
@@ -256,9 +260,10 @@
 static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync);
 static int ep_eventpoll_close(struct inode *inode, struct file *file);
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
-static int ep_collect_ready_items(struct eventpoll *ep, struct epitem **aepi, int maxepi);
-static int ep_send_events(struct eventpoll *ep, struct epitem **aepi, int nepi,
+static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents);
+static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
                           struct epoll_event *events);
+static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist);
 static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events, int maxevents);
 static int ep_poll(struct eventpoll *ep, struct epoll_event *events, int maxevents,
                    long timeout);
@@ -340,13 +345,14 @@
         unsigned long flags;
         task_t *this_task = current;
         struct list_head *lsthead = &psw->wake_task_list, *lnk;
+ struct wake_task_node *tncur;
         struct wake_task_node tnode;

         spin_lock_irqsave(&psw->lock, flags);

         /* Try to see if the current task is already inside this wakeup call */
         list_for_each(lnk, lsthead) {
- struct wake_task_node *tncur = list_entry(lnk, struct wake_task_node, llink);
+ tncur = list_entry(lnk, struct wake_task_node, llink);

                 if (tncur->task == this_task) {
                         if (tncur->wq == wq || ++wake_nests > EP_MAX_POLLWAKE_NESTS) {
@@ -386,7 +392,7 @@
         unsigned int i, val;

         for (i = 0, val = 1; val < hintsize && i < EP_MAX_HASH_BITS; i++, val <<= 1);
- return i < EP_MIN_HASH_BITS ? EP_MIN_HASH_BITS: i;
+ return i < EP_MIN_HASH_BITS ? EP_MIN_HASH_BITS: i;
 }

@@ -830,6 +836,7 @@
 {
         unsigned int i, hsize;
         struct list_head *lsthead, *lnk;
+ struct epitem *epi;

         /*
          * We need to lock this because we could be hit by
@@ -844,7 +851,7 @@
                 lsthead = ep_hash_entry(ep, i);

                 list_for_each(lnk, lsthead) {
- struct epitem *epi = list_entry(lnk, struct epitem, llink);
+ epi = list_entry(lnk, struct epitem, llink);

                         ep_unregister_pollwait(ep, epi);
                 }
@@ -860,7 +867,7 @@
                 lsthead = ep_hash_entry(ep, i);

                 while (!list_empty(lsthead)) {
- struct epitem *epi = list_entry(lsthead->next, struct epitem, llink);
+ epi = list_entry(lsthead->next, struct epitem, llink);

                         ep_remove(ep, epi);
                 }
@@ -939,17 +946,14 @@
         struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt);
         struct eppoll_entry *pwq;

- if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC()))
- {
+ if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) {
                 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
                 pwq->whead = whead;
                 pwq->base = epi;
                 add_wait_queue(whead, &pwq->wait);
                 list_add_tail(&pwq->llink, &epi->pwqlist);
                 epi->nwait++;
- }
- else
- {
+ } else {
                 /* We have to signal that an error occurred */
                 epi->nwait = -1;
         }
@@ -971,6 +975,7 @@
         INIT_LIST_HEAD(&epi->llink);
         INIT_LIST_HEAD(&epi->rdllink);
         INIT_LIST_HEAD(&epi->fllink);
+ INIT_LIST_HEAD(&epi->txlink);
         INIT_LIST_HEAD(&epi->pwqlist);
         epi->ep = ep;
         epi->file = tfile;
@@ -1077,16 +1082,28 @@
         /* Copy the data member from inside the lock */
         epi->event.data = event->data;

- /* If the file is already "ready" we drop it inside the ready list */
- if ((revents & event->events) && EP_IS_LINKED(&epi->llink) &&
- !EP_IS_LINKED(&epi->rdllink)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
-
- /* Notify waiting tasks that events are available */
- if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
- if (waitqueue_active(&ep->poll_wait))
- pwake++;
+ /*
+ * If the item is not linked to the hash it means that it's on its
+ * way toward the removal. Do nothing in this case.
+ */
+ if (EP_IS_LINKED(&epi->llink)) {
+ /*
+ * If the item is "hot" and it is not registered inside the ready
+ * list, push it inside. If the item is not "hot" and it is currently
+ * registered inside the ready list, unlink it.
+ */
+ if (revents & event->events) {
+ if (!EP_IS_LINKED(&epi->rdllink)) {
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+
+ /* Notify waiting tasks that events are available */
+ if (waitqueue_active(&ep->wq))
+ wake_up(&ep->wq);
+ if (waitqueue_active(&ep->poll_wait))
+ pwake++;
+ }
+ } else if (EP_IS_LINKED(&epi->rdllink))
+ EP_LIST_DEL(&epi->rdllink);
         }

         write_unlock_irqrestore(&ep->lock, flags);
@@ -1113,8 +1130,7 @@
         /* This is called without locks, so we need the atomic exchange */
         nwait = xchg(&epi->nwait, 0);

- if (nwait)
- {
+ if (nwait) {
                 while (!list_empty(lsthead)) {
                         pwq = list_entry(lsthead->next, struct eppoll_entry, llink);

@@ -1295,28 +1311,45 @@
  * during the f_op->poll() call, we try to collect the maximum number of items
  * by reducing the irqlock/irqunlock switching rate.
  */
-static int ep_collect_ready_items(struct eventpoll *ep, struct epitem **aepi, int maxepi)
+static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
 {
         int nepi;
         unsigned long flags;
- struct list_head *lsthead = &ep->rdllist;
+ struct list_head *lsthead = &ep->rdllist, *lnk;
+ struct epitem *epi;

         write_lock_irqsave(&ep->lock, flags);

- for (nepi = 0; nepi < maxepi && !list_empty(lsthead);) {
- struct epitem *epi = list_entry(lsthead->next, struct epitem, rdllink);
+ for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
+ epi = list_entry(lnk, struct epitem, rdllink);

- /* Remove the item from the ready list */
- EP_LIST_DEL(&epi->rdllink);
+ lnk = lnk->next;

- /*
- * We need to increase the usage count of the "struct epitem" because
- * another thread might call EPOLL_CTL_DEL on this target and make the
- * object to vanish underneath our nose.
- */
- ep_use_epitem(epi);
+ /* If this file is already in the ready list we exit soon */
+ if (!EP_IS_LINKED(&epi->txlink)) {
+ /*
+ * We need to increase the usage count of the "struct epitem" because
+ * another thread might call EPOLL_CTL_DEL on this target and make the
+ * object to vanish underneath our nose.
+ */
+ ep_use_epitem(epi);

- aepi[nepi++] = epi;
+ /*
+ * This is initialized in this way so that the default
+ * behaviour of the reinjecting code will be to push back
+ * the item inside the ready list.
+ */
+ epi->revents = epi->event.events;
+
+ /* Link the ready item into the transfer list */
+ list_add(&epi->txlink, txlist);
+ nepi++;
+
+ /*
+ * Unlink the item from the ready list.
+ */
+ EP_LIST_DEL(&epi->rdllink);
+ }
         }

         write_unlock_irqrestore(&ep->lock, flags);
@@ -1330,36 +1363,40 @@
  * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
  * because of the way poll() is traditionally implemented in Linux.
  */
-static int ep_send_events(struct eventpoll *ep, struct epitem **aepi, int nepi,
+static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
                           struct epoll_event *events)
 {
- int i, eventcnt, eventbuf, revents;
+ int eventcnt = 0, eventbuf = 0;
+ unsigned int revents;
+ struct list_head *lnk;
         struct epitem *epi;
         struct epoll_event event[EP_MAX_BUF_EVENTS];

- for (i = 0, eventcnt = 0, eventbuf = 0; i < nepi; i++, aepi++) {
- epi = *aepi;
+ list_for_each(lnk, txlist) {
+ epi = list_entry(lnk, struct epitem, txlink);

                 /* Get the ready file event set */
                 revents = epi->file->f_op->poll(epi->file, NULL);

- if (revents & epi->event.events) {
+ /*
+ * Set the return event set for the current file descriptor.
+ * Note that only the task task was successfully able to link
+ * the item to its "txlist" will write this field.
+ */
+ epi->revents = revents & epi->event.events;
+
+ if (epi->revents) {
                         event[eventbuf] = epi->event;
                         event[eventbuf].events &= revents;
                         eventbuf++;
                         if (eventbuf == EP_MAX_BUF_EVENTS) {
                                 if (__copy_to_user(&events[eventcnt], event,
- eventbuf * sizeof(struct epoll_event))) {
- for (; i < nepi; i++, aepi++)
- ep_release_epitem(*aepi);
+ eventbuf * sizeof(struct epoll_event)))
                                         return -EFAULT;
- }
                                 eventcnt += eventbuf;
                                 eventbuf = 0;
                         }
                 }
-
- ep_release_epitem(epi);
         }

         if (eventbuf) {
@@ -1374,12 +1411,68 @@

 /*
+ * Walk through the transfer list we collected with ep_collect_ready_items()
+ * and, if 1) the item is still "alive" 2) its event set is not empty 3) it's
+ * not already linked, links it to the ready list.
+ */
+static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
+{
+ int ricnt = 0, pwake = 0;
+ unsigned long flags;
+ struct epitem *epi;
+
+ write_lock_irqsave(&ep->lock, flags);
+
+ while (!list_empty(txlist)) {
+ epi = list_entry(txlist->next, struct epitem, txlink);
+
+ /* Unlink the current item from the transfer list */
+ EP_LIST_DEL(&epi->txlink);
+
+ /*
+ * If the item is no more linked to the interest set, we don't
+ * have to push it inside the ready list because the following
+ * ep_release_epitem() is going to drop it. Also, if the current
+ * item is set to have an Edge Triggered behaviour, we don't have
+ * to push it back either.
+ */
+ if (EP_IS_LINKED(&epi->llink) && !(epi->event.events & EPOLLET) &&
+ (epi->revents & epi->event.events) && !EP_IS_LINKED(&epi->rdllink)) {
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ricnt++;
+ }
+
+ ep_release_epitem(epi);
+ }
+
+ if (ricnt) {
+ /*
+ * Wake up ( if active ) both the eventpoll wait list and the ->poll()
+ * wait list.
+ */
+ if (waitqueue_active(&ep->wq))
+ wake_up(&ep->wq);
+ if (waitqueue_active(&ep->poll_wait))
+ pwake++;
+ }
+
+ write_unlock_irqrestore(&ep->lock, flags);
+
+ /* We have to call this outside the lock */
+ if (pwake)
+ ep_poll_safewake(&psw, &ep->poll_wait);
+}
+
+
+/*
  * Perform the transfer of events to user space.
  */
 static int ep_events_transfer(struct eventpoll *ep, struct epoll_event *events, int maxevents)
 {
- int eventcnt, nepi, sepi, maxepi;
- struct epitem *aepi[EP_MAX_COLLECT_ITEMS];
+ int eventcnt = 0;
+ struct list_head txlist;
+
+ INIT_LIST_HEAD(&txlist);

         /*
          * We need to lock this because we could be hit by
@@ -1392,25 +1485,13 @@
          */
         down_read(&epsem);

- for (eventcnt = 0; eventcnt < maxevents;) {
- /* Maximum items we can extract this time */
- maxepi = min(EP_MAX_COLLECT_ITEMS, maxevents - eventcnt);
-
- /* Collect/extract ready items */
- nepi = ep_collect_ready_items(ep, aepi, maxepi);
-
- if (nepi) {
- /* Send events to userspace */
- sepi = ep_send_events(ep, aepi, nepi, &events[eventcnt]);
- if (sepi < 0) {
- up_read(&epsem);
- return sepi;
- }
- eventcnt += sepi;
- }
+ /* Collect/extract ready items */
+ if (ep_collect_ready_items(ep, &txlist, maxevents)) {
+ /* Build result set in userspace */
+ eventcnt = ep_send_events(ep, &txlist, events);

- if (nepi < maxepi)
- break;
+ /* Reinject ready items into the ready list */
+ ep_reinject_items(ep, &txlist);
         }

         up_read(&epsem);
diff -Nru linux-2.5.64.vanilla/include/linux/eventpoll.h linux-2.5.64.epoll/include/linux/eventpoll.h
--- linux-2.5.64.vanilla/include/linux/eventpoll.h Tue Mar 4 19:29:17 2003
+++ linux-2.5.64.epoll/include/linux/eventpoll.h Tue Mar 11 10:53:17 2003
@@ -1,6 +1,6 @@
 /*
  * include/linux/eventpoll.h ( Efficent event polling implementation )
- * Copyright (C) 2001,...,2002 Davide Libenzi
+ * Copyright (C) 2001,...,2003 Davide Libenzi
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -19,6 +19,9 @@
 #define EPOLL_CTL_ADD 1
 #define EPOLL_CTL_DEL 2
 #define EPOLL_CTL_MOD 3
+
+/* Set the Edge Triggered behaviour for the target file descriptor */
+#define EPOLLET (1 << 31)

 struct epoll_event {
         __u32 events;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Sat Mar 15 2003 - 22:00:27 EST