Re: [PATCH] epoll: optimize epmutex in ep_free and eventpoll_release_file

From: Xie XiuQi
Date: Thu Aug 08 2019 - 07:52:38 EST


Hi,
Zhou Xiong test it on 2 sockets 128 cores ARM64 box, and get the following numbers:

seconds, smaller is better:
concurrency 5.3-rc2 patched diff (smaller is better)
------------------------------------------------------------------------
10 6.983 4.658 -33.30%
20 14.479 6.8 -53.04%
40 47.689 15.303 -67.91%
80 103.336 22.402 -78.32%

There is a significant improvement in high concurrent and short connection scenario.
Any comments is welcome.

The test program:

#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <unistd.h>
#include <errno.h>
#include <error.h>
#include <sys/epoll.h>

#define TRUE 1
#define FALSE 0

int main(int argc, char * argv[])
{
int sk = 0, ret = 0;
struct epoll_event ev, events[10];
int ep = epoll_create(10);

int time = 1000000;

while (time--) {
sk = socket(PF_INET, SOCK_STREAM, 0);
if (sk < 0)
perror("socket failed.\n");

ev.events = EPOLLIN;
ev.data.u64 = 43; // Some other number
if (epoll_ctl(ep, EPOLL_CTL_ADD, sk, &ev) < 0) {
error(-1, errno, "epoll_ctl");
}

close(sk);
}
}



On 2019/7/27 19:35, Cheng Jian wrote:
> We are optimizing the Request-Per-Second of nginx http server,
> and we found that acquiring epmutex in eventpoll_release_file()
> will become a bottleneck under the one-request-per-connection
> scenario.
>
> Optimize the epmutex with a smaller granularity. Introduce
> an ref-counter to eventpoll and free eventpoll by rcu, using rcu
> and list_first_or_null_rcu() to iterate file->f_ep_links instead
> of epmutex.
>
> The following are some details of the scenario:
>
> HTTP server (nginx):
> * under ARM64 with 64 cores
> * 64 worker processes, each worker is binded to a specific CPU
> * keepalive_requests = 1 in nginx.conf: nginx will close the
> connection fd after a reply is send
> HTTP client[benchmark] (wrk):
> * under x86-64 with 48 cores
> * 16 threads, 64 connections per-thread
>
> Before the patch, the RPS measured by wrk is ~220K, after applying
> the patch the RPS is ~240K. We also measure the overhead of
> eventpoll_release_file() and its children by perf: 29% before and
> 2% after.
>
> Link : https://lkml.org/lkml/2017/10/28/81
>
> Signed-off-by: Cheng Jian <cj.chengjian@xxxxxxxxxx>
> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
> ---
> fs/eventpoll.c | 106 ++++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 92 insertions(+), 14 deletions(-)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index d7f1f5011fac..dc81f1c4fbaa 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -38,6 +38,7 @@
> #include <linux/compat.h>
> #include <linux/rculist.h>
> #include <net/busy_poll.h>
> +#include <linux/refcount.h>
>
> /*
> * LOCKING:
> @@ -225,6 +226,11 @@ struct eventpoll {
> /* used to track busy poll napi_id */
> unsigned int napi_id;
> #endif
> +
> + /* used to ensure the validity of eventpoll when release file */
> + refcount_t ref;
> + /* used to free itself */
> + struct rcu_head rcu;
> };
>
> /* Wait structure used by the poll hooks */
> @@ -809,6 +815,32 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
> return 0;
> }
>
> +static void ep_rcu_free(struct rcu_head *head)
> +{
> + struct eventpoll *ep = container_of(head, struct eventpoll, rcu);
> +
> + kfree(ep);
> +}
> +
> +static void eventpoll_put_ep(struct eventpoll *ep)
> +{
> + if (refcount_dec_and_test(&ep->ref)) {
> + mutex_destroy(&ep->mtx);
> + free_uid(ep->user);
> + wakeup_source_unregister(ep->ws);
> +
> + call_rcu(&ep->rcu, ep_rcu_free);
> + }
> +}
> +
> +static struct eventpoll *eventpoll_get_ep(struct eventpoll *ep)
> +{
> + if (refcount_inc_not_zero(&ep->ref))
> + return ep;
> + else
> + return NULL;
> +}
> +
> static void ep_free(struct eventpoll *ep)
> {
> struct rb_node *rbp;
> @@ -826,11 +858,11 @@ static void ep_free(struct eventpoll *ep)
> * anymore. The only hit might come from eventpoll_release_file() but
> * holding "epmutex" is sufficient here.
> */
> - mutex_lock(&epmutex);
>
> /*
> * Walks through the whole tree by unregistering poll callbacks.
> */
> + mutex_lock(&ep->mtx);
> for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> epi = rb_entry(rbp, struct epitem, rbn);
>
> @@ -846,7 +878,6 @@ static void ep_free(struct eventpoll *ep)
> * We do not need to lock ep->mtx, either, we only do it to prevent
> * a lockdep warning.
> */
> - mutex_lock(&ep->mtx);
> while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
> epi = rb_entry(rbp, struct epitem, rbn);
> ep_remove(ep, epi);
> @@ -854,11 +885,19 @@ static void ep_free(struct eventpoll *ep)
> }
> mutex_unlock(&ep->mtx);
>
> - mutex_unlock(&epmutex);
> - mutex_destroy(&ep->mtx);
> - free_uid(ep->user);
> - wakeup_source_unregister(ep->ws);
> - kfree(ep);
> + /*
> + * ep will not been added to visited_list, because ep_ctrl()
> + * can not get its reference and can not reference it by the
> + * corresponding epitem. The only possible operation is list_del_init,
> + * so it's OK to use list_empty_careful() here.
> + */
> + if (!list_empty_careful(&ep->visited_list_link)) {
> + mutex_lock(&epmutex);
> + list_del_init(&ep->visited_list_link);
> + mutex_unlock(&epmutex);
> + }
> +
> + eventpoll_put_ep(ep);
> }
>
> static int ep_eventpoll_release(struct inode *inode, struct file *file)
> @@ -985,7 +1024,7 @@ static const struct file_operations eventpoll_fops = {
> void eventpoll_release_file(struct file *file)
> {
> struct eventpoll *ep;
> - struct epitem *epi, *next;
> + struct epitem *epi;
>
> /*
> * We don't want to get "file->f_lock" because it is not
> @@ -1000,14 +1039,51 @@ void eventpoll_release_file(struct file *file)
> *
> * Besides, ep_remove() acquires the lock, so we can't hold it here.
> */
> - mutex_lock(&epmutex);
> - list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
> - ep = epi->ep;
> + rcu_read_lock();
> + while (true) {
> + epi = list_first_or_null_rcu(&file->f_ep_links,
> + struct epitem, fllink);
> + if (!epi)
> + break;
> +
> + ep = eventpoll_get_ep(epi->ep);
> + /* Current epi had been removed by ep_free() */
> + if (!ep)
> + continue;
> + rcu_read_unlock();
> +
> mutex_lock_nested(&ep->mtx, 0);
> - ep_remove(ep, epi);
> + /*
> + * If rb_first_cached() returns NULL, it means that
> + * the current epi had been removed by ep_free().
> + * To prevent epi from double-freeing, check the
> + * condition before invoking ep_remove().
> + * If eventpoll_release_file() frees epi firstly,
> + * the epi will not be freed again because the epi
> + * must have been removed from ep->rbr when ep_free()
> + * is invoked.
> + */
> + if (rb_first_cached(&ep->rbr))
> + ep_remove(ep, epi);
> mutex_unlock(&ep->mtx);
> +
> + eventpoll_put_ep(ep);
> +
> + rcu_read_lock();
> + }
> + rcu_read_unlock();
> +
> + /*
> + * The file can not been added to tfile_check_list again, because
> + * (1) refcnt has been zero, ep_ctrl() can no longer get its reference
> + * (2) related ep items have been removed, ep_loop_check_proc() can not
> + * get the file by ep->rbr.
> + */
> + if (!list_empty_careful(&file->f_tfile_llink)) {
> + mutex_lock(&epmutex);
> + list_del_init(&file->f_tfile_llink);
> + mutex_unlock(&epmutex);
> }
> - mutex_unlock(&epmutex);
> }
>
> static int ep_alloc(struct eventpoll **pep)
> @@ -1030,6 +1106,8 @@ static int ep_alloc(struct eventpoll **pep)
> ep->rbr = RB_ROOT_CACHED;
> ep->ovflist = EP_UNACTIVE_PTR;
> ep->user = user;
> + INIT_LIST_HEAD(&ep->visited_list_link);
> + refcount_set(&ep->ref, 1);
>
> *pep = ep;
>
> @@ -2018,7 +2096,7 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
> list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
> visited_list_link) {
> ep_cur->visited = 0;
> - list_del(&ep_cur->visited_list_link);
> + list_del_init(&ep_cur->visited_list_link);
> }
> return ret;
> }
>

--
Thanks,
Xie XiuQi