[PATCH 2/3] kernel: decouple TASK_WORK TWA_SIGNAL handling from signals

From: Jens Axboe
Date: Thu Oct 01 2020 - 15:42:22 EST


Users of TWA_SIGNAL need to break out of kernel waits loops, or force
re-entry into the kernel, to ensure that the queued task_work is run.
TWA_SIGNAL currently works like signal delivery in that sense, and uses
the same delivery mechanism. This currently works well from a functional
standpoint, but it is very heavy handed on a multithreaded application
where sighand is shared between all threads and main process. Adding
TWA_SIGNAL task_work on such setups need to grab the sighand->lock, which
creates a hot spot for otherwise unrelated task_work. This lock grabbing
is necessary on both the queue-work and run-work side of things,
exacerbating the problem/contention.

This adds TIF_TASKWORK for x86, which if set, will return true on
checking for pending signals. That in turn causes tasks to restart the
system call, which will run the added task_work. If TIF_TASKWORK is
available, we'll use that for notification when TWA_SIGNAL is specified.
If it isn't available, the existing TIF_SIGPENDING path is used.

Once all archs have added support for TIF_TASKWORK, we can kill the
old code completely. That will also allow removal of JOBCTL_TASK_WORK
and related code.

On my test box, even just using 16 threads shows a nice improvement
running an io_uring based echo server.

stock kernel:
0.01% <= 0.1 milliseconds
95.86% <= 0.2 milliseconds
98.27% <= 0.3 milliseconds
99.71% <= 0.4 milliseconds
100.00% <= 0.5 milliseconds
100.00% <= 0.6 milliseconds
100.00% <= 0.7 milliseconds
100.00% <= 0.8 milliseconds
100.00% <= 0.9 milliseconds
100.00% <= 1.0 milliseconds
100.00% <= 1.1 milliseconds
100.00% <= 2 milliseconds
100.00% <= 3 milliseconds
100.00% <= 3 milliseconds
1378930.00 requests per second
~1600% CPU

1.38M requests/second, and all 16 CPUs are maxed out.

patched kernel:
0.01% <= 0.1 milliseconds
98.24% <= 0.2 milliseconds
99.47% <= 0.3 milliseconds
99.99% <= 0.4 milliseconds
100.00% <= 0.5 milliseconds
100.00% <= 0.6 milliseconds
100.00% <= 0.7 milliseconds
100.00% <= 0.8 milliseconds
100.00% <= 0.9 milliseconds
100.00% <= 1.2 milliseconds
1666111.38 requests per second
~1450% CPU

1.67M requests/second, and we're no longer just hammering on the sighand
lock. The original reporter states:

"For 5.7.15 my benchmark achieves 1.6M qps and system cpu is at ~80%.
for 5.7.16 or later it achieves only 1M qps and the system cpu is is
at ~100%"

with the only difference there being that TWA_SIGNAL is used
unconditionally in 5.7.16, since we need it to be able to solve an
inability to run task_work if the application is waiting in the kernel
already on an event that needs task_work run to be satisfied. Also
see commit 0ba9c9edcd15.

Reported-by: Roman Gershman <romger@xxxxxxxxxx>
Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
---
arch/x86/include/asm/thread_info.h | 2 ++
arch/x86/kernel/signal.c | 32 +++++++++++++++++-------------
include/linux/entry-common.h | 20 ++++++++++++++++---
include/linux/sched/signal.h | 19 ++++++++++++++++--
kernel/entry/common.c | 14 ++++++++++---
5 files changed, 65 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 267701ae3d86..79fe7db3208c 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -93,6 +93,7 @@ struct thread_info {
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* IA32 compatibility process */
#define TIF_SLD 18 /* Restore split lock detection on context switch */
+#define TIF_TASKWORK 19 /* task_work pending */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
@@ -123,6 +124,7 @@ struct thread_info {
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_SLD (1 << TIF_SLD)
+#define _TIF_TASKWORK (1 << TIF_TASKWORK)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index be0d7d4152ec..5dc1eeaf0866 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -799,21 +799,8 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
#endif
}

-/*
- * Note that 'init' is a special process: it doesn't get signals it doesn't
- * want to handle. Thus you cannot kill init even with a SIGKILL even by
- * mistake.
- */
-void arch_do_signal(struct pt_regs *regs)
+void arch_restart_syscall(struct pt_regs *regs)
{
- struct ksignal ksig;
-
- if (get_signal(&ksig)) {
- /* Whee! Actually deliver the signal. */
- handle_signal(&ksig, regs);
- return;
- }
-
/* Did we come from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* Restart the system call - no handlers present */
@@ -831,12 +818,29 @@ void arch_do_signal(struct pt_regs *regs)
break;
}
}
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ */
+bool arch_do_signal(struct pt_regs *regs)
+{
+ struct ksignal ksig;
+
+ if (get_signal(&ksig)) {
+ /* Whee! Actually deliver the signal. */
+ handle_signal(&ksig, regs);
+ return true;
+ }

/*
* If there's no signal to deliver, we just put the saved sigmask
* back.
*/
restore_saved_sigmask();
+ return false;
}

void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 159c7476b11b..03cab8b9ddab 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -37,6 +37,10 @@
# define _TIF_UPROBE (0)
#endif

+#ifndef _TIF_TASKWORK
+# define _TIF_TASKWORK (0)
+#endif
+
/*
* TIF flags handled in syscall_enter_from_usermode()
*/
@@ -69,7 +73,7 @@

#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
+ _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_TASKWORK| \
ARCH_EXIT_TO_USER_MODE_WORK)

/**
@@ -262,9 +266,19 @@ static __always_inline void arch_exit_to_user_mode(void) { }
* arch_do_signal - Architecture specific signal delivery function
* @regs: Pointer to currents pt_regs
*
- * Invoked from exit_to_user_mode_loop().
+ * Invoked from exit_to_user_mode_loop(). Returns true if a signal was
+ * handled.
+ */
+bool arch_do_signal(struct pt_regs *regs);
+
+/**
+ * arch_restart_syscall - Architecture specific syscall restarting
+ * @regs: Pointer to currents pt_regs
+ *
+ * Invoked from exit_to_user_mode_loop(), if we need to restart the current
+ * system call.
*/
-void arch_do_signal(struct pt_regs *regs);
+void arch_restart_syscall(struct pt_regs *regs);

/**
* arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit()
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e6f34d8fbf4d..3093a7d30a24 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -360,6 +360,15 @@ static inline int task_sigpending(struct task_struct *p)

static inline int signal_pending(struct task_struct *p)
{
+#ifdef TIF_TASKWORK
+ /*
+ * IF_TASKWORK isn't really a signal, but it requires the same
+ * behavior in terms of ensuring that we break out of wait loops
+ * so that task_work can be processed.
+ */
+ if (unlikely(test_tsk_thread_flag(p, TIF_TASKWORK)))
+ return 1;
+#endif
return task_sigpending(p);
}

@@ -506,10 +515,16 @@ extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);

static inline void restore_saved_sigmask_unless(bool interrupted)
{
- if (interrupted)
+ if (interrupted) {
+#ifdef TIF_TASKWORK
+ WARN_ON(!test_thread_flag(TIF_SIGPENDING) &&
+ !test_thread_flag(TIF_TASKWORK));
+#else
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
- else
+#endif
+ } else {
restore_saved_sigmask();
+ }
}

static inline sigset_t *sigmask_to_save(void)
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 6fdb6105e6d6..d25ee8f7f071 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -135,11 +135,13 @@ static __always_inline void exit_to_user_mode(void)
}

/* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal(struct pt_regs *regs) { }
+bool __weak arch_do_signal(struct pt_regs *regs) { return true; }

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
+ bool restart_sys = (ti_work & (_TIF_SIGPENDING|_TIF_TASKWORK)) != 0;
+
/*
* Before returning to user space ensure that all pending work
* items have been completed.
@@ -157,8 +159,11 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
if (ti_work & _TIF_PATCH_PENDING)
klp_update_patch_state(current);

- if (ti_work & _TIF_SIGPENDING)
- arch_do_signal(regs);
+ if (ti_work & _TIF_TASKWORK)
+ task_work_run();
+
+ if ((ti_work & _TIF_SIGPENDING) && arch_do_signal(regs))
+ restart_sys = false;

if (ti_work & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
@@ -178,6 +183,9 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
ti_work = READ_ONCE(current_thread_info()->flags);
}

+ if (restart_sys)
+ arch_restart_syscall(regs);
+
/* Return the latest work state for arch_exit_to_user_mode() */
return ti_work;
}
--
2.28.0