Re: [RFC v3 1/2] epoll: avoid spinlock contention with wfcqueue

From: Eric Wong
Date: Sat Mar 23 2013 - 06:16:57 EST


Eric Wong <normalperson@xxxxxxxx> wrote:
> Arve Hjønnevåg <arve@xxxxxxxxxxx> wrote:
> >
> > At some point the level triggered event has to get cleared. As far as
> > I can tell, your new code will drop new events that occur between
> > "revents = ep_item_poll(epi, &pt);" and "epi->state = EP_STATE_IDLE;"
> > in that case.
>
> Thanks for catching that, I'll need to fix that. Maybe reintroduce
> EP_STATE_DEQUEUE, but just for the (LT && !revents) case.

I reintroduced ovflist (with less locking) instead. I think the
problem with !revents you pointed out affects non-LT as well.
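
To spell out the interleaving Arve described (a sketch using the
names from this series, not actual code):

	/*
	 * CPU0: ep_send_events()        CPU1: ep_poll_callback()
	 *
	 * revents = ep_item_poll(epi, &pt);
	 *                                new event fires, sees
	 *                                epi->state == EP_STATE_READY,
	 *                                returns without enqueueing
	 * epi->state = EP_STATE_IDLE;
	 *
	 * The event was neither reported nor requeued: a lost wakeup.
	 * With ovflist active during the transfer, CPU1 instead chains
	 * epi onto ep->ovflist and ep_ovf_disable() requeues it.
	 */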

Will post an updated series (including wfcq changes and some other
cleanups/fixes) this weekend. Here's a work-in-progress on top of
my original [RFC v3 2/2]. Comments greatly appreciated, thanks.
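
The lifecycle is the same as the ovflist that existed before the
wfcqueue conversion, except a dedicated ovflock guards it. Here is a
condensed sketch of how the transfer path uses it (error handling
omitted; ep->mtx is held for the whole sequence):

	mutex_lock(&ep->mtx);
	ep_ovf_enable(ep);	/* ovflist = NULL; callbacks may chain */

	/* transfer loop: ep_item_poll() + __put_user(), no spinlocks */

	ep_ovf_disable(ep);	/* ovflist = EP_UNACTIVE_PTR; requeue
				 * whatever the callbacks chained */
	mutex_unlock(&ep->mtx);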

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1e04175..c3b2ad8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -152,6 +152,12 @@ struct epitem {
/* List header used to link this structure to the eventpoll ready list */
struct wfcq_node rdllink;

+ /*
+ * Works together with "struct eventpoll"->ovflist in keeping the
+ * singly linked chain of items.
+ */
+ struct epitem *next;
+
/* The file descriptor information this item refers to */
struct epoll_filefd ffd;

@@ -226,6 +232,16 @@ struct eventpoll {
int visited;
struct list_head visited_list_link;

+ /* this only protects ovflist */
+ spinlock_t ovflock;
+
+ /*
+ * This is a singly linked list that chains all the "struct epitem"
+ * that had events fire while we were transferring ready events to
+ * userspace without holding a lock.
+ */
+ struct epitem *ovflist;
+
struct wfcq_tail rdltail ____cacheline_aligned_in_smp;
};

@@ -890,12 +906,14 @@ static int ep_alloc(struct eventpoll **pep)
goto free_uid;

spin_lock_init(&ep->wqlock);
+ spin_lock_init(&ep->ovflock);
mutex_init(&ep->mtx);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
wfcq_init(&ep->rdlhead, &ep->rdltail);
wfcq_init(&ep->txlhead, &ep->txltail);
ep->rbr = RB_ROOT;
+ ep->ovflist = EP_UNACTIVE_PTR;
ep->user = user;

*pep = ep;
@@ -953,6 +971,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
{
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
+ unsigned long flags;

if ((unsigned long)key & POLLFREE) {
RCU_INIT_POINTER(ep_pwq_from_wait(wait)->whead, NULL);
@@ -990,7 +1009,31 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
wfcq_enqueue(&ep->rdlhead, &ep->rdltail, &epi->rdllink);
ep_pm_stay_awake_rcu(epi);
ep_notify_waiters(ep);
+ return 1;
+ }
+
+ /*
+ * If we are transferring events to userspace, we can hold no locks
+ * (because we're accessing user memory, and because of Linux f_op->poll()
+ * semantics). All the events that happen during that period of time are
+ * chained in ep->ovflist and requeued later on.
+ */
+ spin_lock_irqsave(&ep->ovflock, flags);
+ if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
+ if (epi->next == EP_UNACTIVE_PTR) {
+ epi->next = ep->ovflist;
+ ep->ovflist = epi;
+ if (epi->ws) {
+ /*
+ * Activate ep->ws since epi->ws may get
+ * deactivated at any time.
+ */
+ __pm_stay_awake(ep->ws);
+ }
+ }
+ /* no need to wake up waiters; ep_send_events is already running */
}
+ spin_unlock_irqrestore(&ep->ovflock, flags);

return 1;
}
@@ -1204,6 +1247,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
epi->state = EP_STATE_IDLE;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->nwait = 0;
+ epi->next = EP_UNACTIVE_PTR;
epi->event = *event;
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
@@ -1356,6 +1400,61 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
return 0;
}

+static void ep_ovf_enable(struct eventpoll *ep)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ep->ovflock, flags);
+ ep->ovflist = NULL;
+ spin_unlock_irqrestore(&ep->ovflock, flags);
+}
+
+/*
+ * Disables the ovf queue and reinjects events that went into the ovf
+ * queue back into the txlist.
+ */
+static void ep_ovf_disable(struct eventpoll *ep)
+{
+ unsigned long flags;
+ struct epitem *epi, *nepi;
+
+ spin_lock_irqsave(&ep->ovflock, flags);
+ nepi = ep->ovflist;
+ ep->ovflist = EP_UNACTIVE_PTR;
+ spin_unlock_irqrestore(&ep->ovflock, flags);
+
+ /*
+ * We can work on ovflist without the ovflock since we are certain
+ * ep_poll_callback will not append to ovflist anymore.
+ *
+ * We still need ep->mtx to be held during this, though.
+ *
+ * While we were inside ep_send_events, some other events may
+ * have been queued by the poll callback. We re-insert them
+ * into the transfer list (txlhead/txltail) here.
+ */
+ for (; (epi = nepi) != NULL;
+ nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+ /*
+ * We need to check if the item is already in the list.
+ * While the ep_send_events loop is running, items are
+ * queued into ->ovflist, but "txlhead/txltail" might already
+ * contain them; ep_mark_ready detects that case.
+ */
+ if (!ep_mark_ready(epi))
+ continue;
+
+ wfcq_enqueue_local(&ep->txlhead, &ep->txltail, &epi->rdllink);
+ ep_pm_stay_awake(epi);
+ }
+
+ /*
+ * ep->ws was activated in ep_poll_callback for each epi->ws we could
+ * not activate while ovflist was active; each requeued item now has
+ * its own epi->ws active, so we may relax ep->ws here.
+ */
+ __pm_relax(ep->ws);
+}
+
static int ep_send_events(struct eventpoll *ep, bool *eavail,
struct epoll_event __user *uevent, int maxevents)
{
@@ -1372,6 +1471,8 @@ static int ep_send_events(struct eventpoll *ep, bool *eavail,
wfcq_init(&lthead, &lttail);
init_poll_funcptr(&pt, NULL);

+ ep_ovf_enable(ep);
+
/*
* Items cannot vanish during the loop because we are holding
* "mtx" during this call.
@@ -1390,22 +1491,6 @@ static int ep_send_events(struct eventpoll *ep, bool *eavail,
WARN_ON(state != EP_STATE_READY);
wfcq_node_init(&epi->rdllink);

- /*
- * Activate ep->ws before deactivating epi->ws to prevent
- * triggering auto-suspend here (in case we reactive epi->ws
- * below).
- *
- * This could be rearranged to delay the deactivation of epi->ws
- * instead, but then epi->ws would temporarily be out of sync
- * with epi->state.
- */
- ws = ep_wakeup_source(epi);
- if (ws) {
- if (ws->active)
- __pm_stay_awake(ep->ws);
- __pm_relax(ws);
- }
-
revents = ep_item_poll(epi, &pt);

/*
@@ -1419,7 +1504,6 @@ static int ep_send_events(struct eventpoll *ep, bool *eavail,
__put_user(epi->event.data, &uevent->data)) {
wfcq_enqueue_local(&ep->txlhead, &ep->txltail,
&epi->rdllink);
- ep_pm_stay_awake(epi);
if (!eventcnt)
eventcnt = -EFAULT;
break;
@@ -1441,19 +1525,28 @@ static int ep_send_events(struct eventpoll *ep, bool *eavail,
*/
wfcq_enqueue_local(&lthead, &lttail,
&epi->rdllink);
- ep_pm_stay_awake(epi);
continue;
}
}

/*
- * reset item state for EPOLLONESHOT and EPOLLET
- * no barrier here, rely on ep->mtx release for write barrier
+ * Deactivate the wakeup source before marking the item idle.
+ * The barrier implied by the spinlock in __pm_relax ensures
+ * any ep_poll_callback caller running afterwards sees the
+ * deactivated ws before it sees epi->state == EP_STATE_IDLE.
*/
+ ws = ep_wakeup_source(epi);
+ if (ws)
+ __pm_relax(ws);
+
+ /* no barrier needed here; ep_ovf_disable provides ordering later */
epi->state = EP_STATE_IDLE;
}

- /* grab any events we got while copying */
+ /* grab any (possibly redundant) events we got while copying */
+ ep_ovf_disable(ep);
+
+ /* see if we have any more items left (or very new ones) */
*eavail = ep_events_available(ep);

/* requeue level-triggered items */
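
One part I would especially like review on is the wakeup source
handling across the ovflist window. The intended protocol, summarized
(pseudo-C, condensed from the diff above):

	/* ep_poll_callback, ovflist active: the transfer loop may
	 * __pm_relax(epi->ws) at any moment, so pin ep->ws instead */
	if (epi->ws)
		__pm_stay_awake(ep->ws);

	/* ep_ovf_disable: each requeued epi got ep_pm_stay_awake(epi),
	 * so the ep-wide pin can be dropped */
	__pm_relax(ep->ws);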
--