Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.

From: Peter Zijlstra
Date: Thu Apr 07 2022 - 08:14:12 EST


On Tue, Apr 05, 2022 at 01:28:16PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 05, 2022 at 12:29:03PM +0200, Oleg Nesterov wrote:
> > On 04/05, Peter Zijlstra wrote:
> > >
> > > As is, I think we can write task_is_stopped() like:
> > >
> > > #define task_is_stopped(task) ((task)->jobctl & JOBCTL_STOP_PENDING)
> > >
> > > Because jobctl is in fact the canonical state.
> >
> > Not really. JOBCTL_STOP_PENDING means that the task should stop.
> >
> > And this flag is cleared right before set_special_state(TASK_STOPPED)
> > in do_signal_stop(), see task_participate_group_stop().
>
> Argh! More work then :-( Still, I really want to not have p->__state be
> actual unique state.

How insane is this?

In addition to solving the whole saved_state issue, it also allows my
freezer rewrite to reconstruct __state. If you don't find the below
fundamentally broken I can respin that freezer rewrite on top of it.

---
include/linux/sched.h | 8 +++-----
include/linux/sched/jobctl.h | 8 ++++++++
include/linux/sched/signal.h | 15 ++++++++++++++-
kernel/ptrace.c | 18 ++++++++++++++----
kernel/signal.c | 9 ++++++---
5 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 67f06f72c50e..6698b97b8e3b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -118,11 +118,9 @@ struct task_group;

#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)

-#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-
-#define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
-
-#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
+#define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0)
+#define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)

/*
* Special states are those that do not use the normal wait-loop pattern. See
diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
index fa067de9f1a9..ec8b312f7506 100644
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -20,6 +20,10 @@ struct task_struct;
#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */

+#define JOBCTL_STOPPED_BIT 24
+#define JOBCTL_TRACED_BIT 25
+#define JOBCTL_TRACED_FROZEN_BIT 26
+
#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT)
@@ -29,6 +33,10 @@ struct task_struct;
#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT)

+#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT)
+#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT)
+#define JOBCTL_TRACED_FROZEN (1UL << JOBCTL_TRACED_FROZEN_BIT)
+
#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3c8b34876744..3e4a14ed402e 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -294,8 +294,10 @@ static inline int kernel_dequeue_signal(void)
static inline void kernel_signal_stop(void)
{
spin_lock_irq(&current->sighand->siglock);
- if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+ if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
+ current->jobctl |= JOBCTL_STOPPED;
set_special_state(TASK_STOPPED);
+ }
spin_unlock_irq(&current->sighand->siglock);

schedule();
@@ -437,10 +439,21 @@ extern void signal_wake_up_state(struct task_struct *t, unsigned int state);

static inline void signal_wake_up(struct task_struct *t, bool resume)
{
+ lockdep_assert_held(t->sighand->siglock);
+
+ if (resume && !(t->jobctl & JOBCTL_TRACED_FROZEN))
+ t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
+
signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
}
+
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
+ lockdep_assert_held(t->sighand->siglock);
+
+ if (resume)
+ t->jobctl &= ~JOBCTL_TRACED;
+
signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
}

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ccc4b465775b..626f96d275c7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task)
return true;
}

-/* Ensure that nothing can wake it up, even SIGKILL */
+/*
+ * Ensure that nothing can wake it up, even SIGKILL
+ *
+ * A task is switched to this state while a ptrace operation is in progress;
+ * such that the ptrace operation is uninterruptible.
+ */
static bool ptrace_freeze_traced(struct task_struct *task)
{
bool ret = false;
@@ -197,6 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
!__fatal_signal_pending(task)) {
+ task->jobctl |= JOBCTL_TRACED_FROZEN;
WRITE_ONCE(task->__state, __TASK_TRACED);
ret = true;
}
@@ -218,9 +224,11 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
*/
spin_lock_irq(&task->sighand->siglock);
if (READ_ONCE(task->__state) == __TASK_TRACED) {
- if (__fatal_signal_pending(task))
+ task->jobctl &= ~JOBCTL_TRACED_FROZEN;
+ if (__fatal_signal_pending(task)) {
+ task->jobctl &= ~JOBCTL_TRACED;
wake_up_state(task, __TASK_TRACED);
- else
+ } else
WRITE_ONCE(task->__state, TASK_TRACED);
}
spin_unlock_irq(&task->sighand->siglock);
@@ -475,8 +483,10 @@ static int ptrace_attach(struct task_struct *task, long request,
* in and out of STOPPED are protected by siglock.
*/
if (task_is_stopped(task) &&
- task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_STOPPED;
signal_wake_up_state(task, __TASK_STOPPED);
+ }

spin_unlock(&task->sighand->siglock);

diff --git a/kernel/signal.c b/kernel/signal.c
index 30cd1ca43bcd..0aea3f0a8002 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -884,7 +884,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
static void ptrace_trap_notify(struct task_struct *t)
{
WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
- assert_spin_locked(&t->sighand->siglock);
+ lockdep_assert_held(&t->sighand->siglock);

task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
@@ -930,9 +930,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
for_each_thread(p, t) {
flush_sigqueue_mask(&flush, &t->pending);
task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
- if (likely(!(t->ptrace & PT_SEIZED)))
+ if (likely(!(t->ptrace & PT_SEIZED))) {
+ t->jobctl &= ~JOBCTL_STOPPED;
wake_up_state(t, __TASK_STOPPED);
- else
+ } else
ptrace_trap_notify(t);
}

@@ -2219,6 +2220,7 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
* schedule() will not sleep if there is a pending signal that
* can awaken the task.
*/
+ current->jobctl |= JOBCTL_TRACED;
set_special_state(TASK_TRACED);

/*
@@ -2460,6 +2462,7 @@ static bool do_signal_stop(int signr)
if (task_participate_group_stop(current))
notify = CLD_STOPPED;

+ current->jobctl |= JOBCTL_STOPPED;
set_special_state(TASK_STOPPED);
spin_unlock_irq(&current->sighand->siglock);