[PATCH 2/2] eventfd: support delayed wakeup for non-semaphore eventfd to reduce cpu utilization

From: wenyang . linux
Date: Sun Jan 29 2023 - 12:49:15 EST


From: Wen Yang <wenyang.linux@xxxxxxxxxxx>

For a non-semaphore eventfd, if its counter has a nonzero value,
then a read(2) returns 8 bytes containing that value, and the counter's
value is reset to zero. Therefore, in the non-semaphore case, a single
read can consume the accumulated result of N preceding writes.

However, the current implementation wakes up the reader immediately on
every eventfd_write, which increases CPU utilization unnecessarily.

By adding a configurable delay after eventfd_write, these unnecessary
wakeup operations are avoided, thereby reducing cpu utilization.

We used the following test code:
https://github.com/w-simon/tests/blob/master/src/test.c
./test_zmq > /dev/null

The cpu usage is as follows:
12:14:22 CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
12:14:24 all 55.46 0.00 4.78 0.00 0.00 0.96 0.00 0.00 0.00 38.80
12:14:26 all 56.29 0.00 4.70 0.00 0.00 1.24 0.00 0.00 0.00 37.76
12:14:28 all 54.97 0.00 5.25 0.00 0.00 0.97 0.00 0.00 0.00 38.81
12:14:30 all 56.02 0.00 5.26 0.00 0.00 1.24 0.00 0.00 0.00 37.48
12:14:32 all 55.31 0.00 5.03 0.00 0.00 1.40 0.00 0.00 0.00 38.27
12:14:34 all 55.46 0.00 5.26 0.00 0.00 1.24 0.00 0.00 0.00 38.04

Then adjust the new control parameter, as follows:
echo 5 > /proc/sys/fs/eventfd_write_wake_delay_ms

The CPU usage was observed to decrease by more than 30%, as follows:
12:14:36 all 28.17 0.00 0.93 0.00 0.00 0.00 0.00 0.00 0.00 70.90
12:14:38 all 24.00 0.00 0.80 0.00 0.00 0.13 0.00 0.00 0.00 75.07
12:14:40 all 23.57 0.00 0.53 0.00 0.00 0.13 0.00 0.00 0.00 75.77
12:14:42 all 23.59 0.00 0.40 0.00 0.00 0.00 0.00 0.00 0.00 76.01
12:14:44 all 23.69 0.00 0.27 0.00 0.00 0.00 0.00 0.00 0.00 76.04
12:14:46 all 23.20 0.00 0.67 0.00 0.00 0.13 0.00 0.00 0.00 76.00
12:14:48 all 24.87 0.00 0.66 0.00 0.00 0.00 0.00 0.00 0.00 74.47
12:14:50 all 24.27 0.00 0.66 0.00 0.00 0.00 0.00 0.00 0.00 75.07

Signed-off-by: Wen Yang <wenyang.linux@xxxxxxxxxxx>
Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Dylan Yudaken <dylany@xxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: David Woodhouse <dwmw@xxxxxxxxxxxx>
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: linux-fsdevel@xxxxxxxxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
---
fs/eventfd.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index c5bda3df4a28..e45436737f9d 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -41,6 +41,9 @@ struct eventfd_ctx {
__u64 count;
unsigned int flags;
int id;
+#ifdef CONFIG_SYSCTL
+ struct delayed_work dwork;
+#endif
};

__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
@@ -95,6 +98,9 @@ static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
if (ctx->id >= 0)
ida_simple_remove(&eventfd_ida, ctx->id);
+#ifdef CONFIG_SYSCTL
+ flush_delayed_work(&ctx->dwork);
+#endif
kfree(ctx);
}

@@ -256,6 +262,28 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
return sizeof(ucnt);
}

+#ifdef CONFIG_SYSCTL
+
+static unsigned long sysctl_eventfd_write_wake_delay_ms;
+
+/* Deferred wakeup: notify EPOLLIN waiters if the counter is still nonzero. */
+static void eventfd_delayed_workfn(struct work_struct *work)
+{
+	struct eventfd_ctx *ctx = container_of(to_delayed_work(work),
+					       struct eventfd_ctx, dwork);
+
+	spin_lock_irq(&ctx->wqh.lock);
+	current->in_eventfd = 1;
+
+	/* waitqueue_active() is safe here because ctx->wqh.lock is held. */
+	if (ctx->count && waitqueue_active(&ctx->wqh))
+		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+	current->in_eventfd = 0;
+	spin_unlock_irq(&ctx->wqh.lock);
+}
+
+#endif
+
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
loff_t *ppos)
{
@@ -282,8 +310,26 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
if (likely(res > 0)) {
ctx->count += ucnt;
current->in_eventfd = 1;
- if (waitqueue_active(&ctx->wqh))
+
+ /* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+ if (waitqueue_active(&ctx->wqh)) {
+#ifdef CONFIG_SYSCTL
+ if (ctx->flags & EFD_SEMAPHORE)
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ else {
+ unsigned long delay = sysctl_eventfd_write_wake_delay_ms;
+
+ if (delay) {
+ if (!delayed_work_pending(&ctx->dwork))
+ queue_delayed_work(system_unbound_wq,
+ &ctx->dwork, delay);
+ } else
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ }
+#else
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+#endif
+ }
current->in_eventfd = 0;
}
spin_unlock_irq(&ctx->wqh.lock);
@@ -406,6 +452,9 @@ static int do_eventfd(unsigned int count, int flags)
ctx->count = count;
ctx->flags = flags;
ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
+#ifdef CONFIG_SYSCTL
+ INIT_DELAYED_WORK(&ctx->dwork, eventfd_delayed_workfn);
+#endif

flags &= EFD_SHARED_FCNTL_FLAGS;
flags |= O_RDWR;
@@ -438,3 +487,31 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
return do_eventfd(count, 0);
}

+#ifdef CONFIG_SYSCTL
+
+/* Bounds handed to the sysctl handler through extra1/extra2. */
+static unsigned long min_wake_delay;
+static unsigned long max_wake_delay = HZ / 10;
+
+static struct ctl_table fs_eventfd_ctl[] = {
+	{
+		.procname	= "eventfd_write_wake_delay_ms",
+		.data		= &sysctl_eventfd_write_wake_delay_ms,
+		.maxlen		= sizeof(sysctl_eventfd_write_wake_delay_ms),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= &min_wake_delay,
+		.extra2		= &max_wake_delay,
+	},
+	{ }
+};
+
+/* Register the eventfd sysctl table under "fs" at fs_initcall time. */
+static int __init init_fs_eventfd_sysctls(void)
+{
+	register_sysctl_init("fs", fs_eventfd_ctl);
+	return 0;
+}
+fs_initcall(init_fs_eventfd_sysctls);
+
+#endif /* CONFIG_SYSCTL */
--
2.37.2