[RFC] eventpoll: Move a kmem_cache_alloc and kmem_cache_free

From: Nathan Zimmer
Date: Fri Sep 13 2013 - 11:54:17 EST


We noticed a scaling issue in the SPECjbb benchmark. Running perf,
we found that it was spending a lot of time in SYS_epoll_ctl.
In particular, it is holding the epmutex.
This patch helps by moving the kmem_cache_alloc and kmem_cache_free calls out
from under the lock. It improves throughput by around 15% on 16 sockets.

While this patch should be fine as it is, there are probably more things
that can be done outside the lock, like wakeup_source_unregister, but I am
not familiar with the area and I don't know of many tests. I did find the
one posted by Jason Baron at https://lkml.org/lkml/2011/2/25/297.

Any thoughts?

Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: Jason Baron <jbaron@xxxxxxxxxx>
Reported-by: Jerry Lohr <glohr@xxxxxxx>
Signed-off-by: Nathan Zimmer <nzimmer@xxxxxxx>
---
fs/eventpoll.c | 27 ++++++++++++++++++---------
1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9ad17b15..752e5ff 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -707,7 +707,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
wakeup_source_unregister(ep_wakeup_source(epi));

/* At this point it is safe to free the eventpoll item */
- kmem_cache_free(epi_cache, epi);

atomic_long_dec(&ep->user->epoll_watches);

@@ -754,6 +753,7 @@ static void ep_free(struct eventpoll *ep)
while ((rbp = rb_first(&ep->rbr)) != NULL) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_remove(ep, epi);
+ kmem_cache_free(epi_cache, epi);
}
mutex_unlock(&ep->mtx);

@@ -1230,18 +1230,17 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
* Must be called with "mtx" held.
*/
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
- struct file *tfile, int fd)
+ struct file *tfile, int fd, struct epitem *epi)
{
int error, revents, pwake = 0;
unsigned long flags;
long user_watches;
- struct epitem *epi;
struct ep_pqueue epq;

user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
- if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
+ if (!epi)
return -ENOMEM;

/* Item initialization follow here ... */
@@ -1349,7 +1348,6 @@ error_unregister:
wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
- kmem_cache_free(epi_cache, epi);

return error;
}
@@ -1795,6 +1793,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
+ struct epitem *epi_prepped = NULL;
+ struct epitem *epi_dropped = NULL;
struct epoll_event epds;

error = -EFAULT;
@@ -1849,6 +1849,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* b/c we want to make sure we are looking at a coherent view of
* epoll network.
*/
+ if (op == EPOLL_CTL_ADD)
+ epi_prepped = kmem_cache_alloc(epi_cache, GFP_KERNEL);
+
if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
mutex_lock(&epmutex);
did_lock_epmutex = 1;
@@ -1878,15 +1881,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= POLLERR | POLLHUP;
- error = ep_insert(ep, &epds, tfile, fd);
- } else
+ error = ep_insert(ep, &epds, tfile, fd, epi_prepped);
+ if (error)
+ epi_dropped = epi_prepped;
+ } else {
error = -EEXIST;
+ }
clear_tfile_check_list();
break;
case EPOLL_CTL_DEL:
- if (epi)
+ if (epi) {
error = ep_remove(ep, epi);
- else
+ epi_dropped = epi;
+ } else
error = -ENOENT;
break;
case EPOLL_CTL_MOD:
@@ -1902,6 +1909,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
error_tgt_fput:
if (did_lock_epmutex)
mutex_unlock(&epmutex);
+ if (epi_dropped)
+ kmem_cache_free(epi_cache, epi_dropped);

fput(tfile);
error_fput:
--
1.8.2.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/