Re: [PATCH 3/3] task_work: use TIF_TASKWORK if available

From: Jens Axboe
Date: Fri Oct 02 2020 - 16:14:29 EST


On 10/2/20 1:10 PM, Thomas Gleixner wrote:
> On Fri, Oct 02 2020 at 09:52, Jens Axboe wrote:
>> On 10/2/20 9:31 AM, Thomas Gleixner wrote:
>>>> This way task_work_run() doesn't need to clear TIF_NOTIFY_SIGNAL and it can
>>>> have more users.
>>>
>>> I think it's fundamentally wrong that we have several places and several
>>> flags which handle task_work_run() instead of having exactly one place
>>> and one flag.
>>
>> I don't disagree with that. I know it's not happening in this series, but
>> if we go the TIF_NOTIFY_SIGNAL route and get all archs supporting that,
>> then we can kill the signal and notify-resume parts of running task_work.
>> And that leaves us with exactly one place that runs it.
>>
>> So we can potentially improve the current situation in that regard.
>
> I'll think about it over the weekend.

Thanks, I appreciate it!

Just to drive the point home, we'd end up with something like the patch
below, which also enables me to remove a nasty sighand->lock deadlock
workaround in io_uring.

Not in this patch, but the task_work_run() call in io_uring's cqring_wait()
can also be removed. Outside of the core calling it from
tracehook_notify_signal(), the only remaining callers are then the cases
where kthreads are used with task_work.
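
Just for illustration, and not part of the patch below: with TIF_NOTIFY_SIGNAL
as the only notification mechanism, that single run site could end up looking
roughly like the sketch here. The helper name and its exact placement in the
exit-to-user path follow the discussion above and are an assumption on my end
until the series is finalized.

static inline void tracehook_notify_signal(void)
{
	clear_thread_flag(TIF_NOTIFY_SIGNAL);
	/* pairs with the cmpxchg() in task_work_add() that queues the work */
	smp_mb__after_atomic();
	if (unlikely(current->task_works))
		task_work_run();
}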


fs/io_uring.c | 41 ++++++++++++----------------------
include/linux/sched/jobctl.h | 4 +---
include/linux/task_work.h | 4 +---
include/linux/tracehook.h | 9 --------
kernel/signal.c | 22 ------------------
kernel/task_work.c | 40 +++------------------------------
kernel/time/posix-cpu-timers.c | 2 +-
7 files changed, 20 insertions(+), 102 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2a67552a9c2f..3a5f4a7bd369 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1597,12 +1597,12 @@ static void __io_free_req(struct io_kiocb *req)
int ret;

init_task_work(&req->task_work, io_req_task_file_table_put);
- ret = task_work_add(req->task, &req->task_work, TWA_RESUME);
+ ret = task_work_add(req->task, &req->task_work, true);
if (unlikely(ret)) {
struct task_struct *tsk;

tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, 0);
+ task_work_add(tsk, &req->task_work, false);
}
}
}
@@ -1746,25 +1746,21 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
return __io_req_find_next(req);
}

-static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb,
- bool twa_signal_ok)
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
- int ret, notify;
+ bool notify = false;
+ int ret;

if (tsk->flags & PF_EXITING)
return -ESRCH;

/*
- * SQPOLL kernel thread doesn't need notification, just a wakeup. For
- * all other cases, use TWA_SIGNAL unconditionally to ensure we're
- * processing task_work. There's no reliable way to tell if TWA_RESUME
- * will do the job.
+ * SQPOLL kernel thread doesn't need notification, just a wakeup.
*/
- notify = 0;
- if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
- notify = TWA_SIGNAL;
+ if (!(ctx->flags & IORING_SETUP_SQPOLL))
+ notify = true;

ret = task_work_add(tsk, cb, notify);
if (!ret)
@@ -1825,13 +1821,13 @@ static void io_req_task_queue(struct io_kiocb *req)
init_task_work(&req->task_work, io_req_task_submit);
percpu_ref_get(&req->ctx->refs);

- ret = io_req_task_work_add(req, &req->task_work, true);
+ ret = io_req_task_work_add(req, &req->task_work);
if (unlikely(ret)) {
struct task_struct *tsk;

init_task_work(&req->task_work, io_req_task_cancel);
tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, 0);
+ task_work_add(tsk, &req->task_work, false);
wake_up_process(tsk);
}
}
@@ -3056,14 +3052,14 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,

/* submit ref gets dropped, acquire a new one */
refcount_inc(&req->refs);
- ret = io_req_task_work_add(req, &req->task_work, true);
+ ret = io_req_task_work_add(req, &req->task_work);
if (unlikely(ret)) {
struct task_struct *tsk;

/* queue just for cancelation */
init_task_work(&req->task_work, io_req_task_cancel);
tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, 0);
+ task_work_add(tsk, &req->task_work, false);
wake_up_process(tsk);
}
return 1;
@@ -4598,7 +4594,6 @@ struct io_poll_table {
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
- bool twa_signal_ok;
int ret;

/* for instances that support it check for an event match first: */
@@ -4613,27 +4608,19 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
init_task_work(&req->task_work, func);
percpu_ref_get(&req->ctx->refs);

- /*
- * If we using the signalfd wait_queue_head for this wakeup, then
- * it's not safe to use TWA_SIGNAL as we could be recursing on the
- * tsk->sighand->siglock on doing the wakeup. Should not be needed
- * either, as the normal wakeup will suffice.
- */
- twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
-
/*
* If this fails, then the task is exiting. When a task exits, the
* work gets canceled, so just cancel this request as well instead
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok);
+ ret = io_req_task_work_add(req, &req->task_work);
if (unlikely(ret)) {
struct task_struct *tsk;

WRITE_ONCE(poll->canceled, true);
tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, 0);
+ task_work_add(tsk, &req->task_work, false);
wake_up_process(tsk);
}
return 1;
diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
index d2b4204ba4d3..fa067de9f1a9 100644
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -19,7 +19,6 @@ struct task_struct;
#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */
-#define JOBCTL_TASK_WORK_BIT 24 /* set by TWA_SIGNAL */

#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
@@ -29,10 +28,9 @@ struct task_struct;
#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT)
#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT)
-#define JOBCTL_TASK_WORK (1UL << JOBCTL_TASK_WORK_BIT)

#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
-#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK | JOBCTL_TASK_WORK)
+#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)

extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask);
extern void task_clear_jobctl_trapping(struct task_struct *task);
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 0fb93aafa478..a221bd5f746c 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -13,9 +13,7 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
twork->func = func;
}

-#define TWA_RESUME 1
-#define TWA_SIGNAL 2
-int task_work_add(struct task_struct *task, struct callback_head *twork, int);
+int task_work_add(struct task_struct *task, struct callback_head *twork, bool);

struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
void task_work_run(void);
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 7ec0e94c5250..3a4a35ae87d1 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -178,15 +178,6 @@ static inline void set_notify_resume(struct task_struct *task)
*/
static inline void tracehook_notify_resume(struct pt_regs *regs)
{
- /*
- * The caller just cleared TIF_NOTIFY_RESUME. This barrier
- * pairs with task_work_add()->set_notify_resume() after
- * hlist_add_head(task->task_works);
- */
- smp_mb__after_atomic();
- if (unlikely(current->task_works))
- task_work_run();
-
#ifdef CONFIG_KEYS_REQUEST_CACHE
if (unlikely(current->cached_requested_key)) {
key_put(current->cached_requested_key);
diff --git a/kernel/signal.c b/kernel/signal.c
index ad52141ab0d2..d44fa9141cef 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2271,8 +2271,6 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
void ptrace_notify(int exit_code)
{
BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
- if (unlikely(current->task_works))
- task_work_run();

spin_lock_irq(&current->sighand->siglock);
ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2541,26 +2539,6 @@ bool get_signal(struct ksignal *ksig)

relock:
spin_lock_irq(&sighand->siglock);
- /*
- * Make sure we can safely read ->jobctl() in task_work add. As Oleg
- * states:
- *
- * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we
- * roughly have
- *
- * task_work_add: get_signal:
- * STORE(task->task_works, new_work); STORE(task->jobctl);
- * mb(); mb();
- * LOAD(task->jobctl); LOAD(task->task_works);
- *
- * and we can rely on STORE-MB-LOAD [ in task_work_add].
- */
- smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK);
- if (unlikely(current->task_works)) {
- spin_unlock_irq(&sighand->siglock);
- task_work_run();
- goto relock;
- }

/*
* Every stopped thread goes here after wakeup. Check to see if
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 95604e57af46..e68f5831a078 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -5,34 +5,6 @@

static struct callback_head work_exited; /* all we need is ->next == NULL */

-/*
- * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster
- * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is
- * shared for threads, and can cause contention on sighand->lock. Even for
- * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking
- * or IRQ disabling is involved for notification (or running) purposes.
- */
-static void task_work_notify_signal(struct task_struct *task)
-{
-#ifdef TIF_NOTIFY_SIGNAL
- set_notify_signal(task);
-#else
- unsigned long flags;
-
- /*
- * Only grab the sighand lock if we don't already have some
- * task_work pending. This pairs with the smp_store_mb()
- * in get_signal(), see comment there.
- */
- if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
- lock_task_sighand(task, &flags)) {
- task->jobctl |= JOBCTL_TASK_WORK;
- signal_wake_up(task, 0);
- unlock_task_sighand(task, &flags);
- }
-#endif
-}
-
/**
* task_work_add - ask the @task to execute @work->func()
* @task: the task which should run the callback
@@ -53,7 +25,7 @@ static void task_work_notify_signal(struct task_struct *task)
* 0 if succeeds or -ESRCH.
*/
int
-task_work_add(struct task_struct *task, struct callback_head *work, int notify)
+task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
{
struct callback_head *head;

@@ -64,14 +36,8 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify)
work->next = head;
} while (cmpxchg(&task->task_works, head, work) != head);

- switch (notify) {
- case TWA_RESUME:
- set_notify_resume(task);
- break;
- case TWA_SIGNAL:
- task_work_notify_signal(task);
- break;
- }
+ if (notify)
+ set_notify_signal(task);

return 0;
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a71758e34e45..51080a1ed11f 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1128,7 +1128,7 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)

/* Schedule task work to actually expire the timers */
tsk->posix_cputimers_work.scheduled = true;
- task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
+ task_work_add(tsk, &tsk->posix_cputimers_work.work, true);
}

static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
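
Just to spell out the resulting task_work_add() usage for callers outside this
diff, it boils down to something like the below (illustration only, the names
are placeholders): 'true' asks for TIF_NOTIFY_SIGNAL based notification, while
'false' merely queues the work for a target that gets woken up by other means,
like the SQPOLL kernel thread above.

	init_task_work(&work, my_callback);	/* my_callback is a placeholder */
	ret = task_work_add(task, &work, true);
	if (unlikely(ret)) {
		/* task is exiting; the work was not queued, handle it here */
		my_cancel(&work);		/* placeholder for a caller-specific fallback */
	}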

--
Jens Axboe