Re: [BUG] Use of probe_kernel_address() in task_rcu_dereference() without checking return value

From: Oleg Nesterov
Date: Mon Sep 02 2019 - 09:40:10 EST


On 08/30, Eric W. Biederman wrote:
>
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -182,6 +182,24 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
> put_task_struct(tsk);
> }
>
> +void put_dead_task_struct(struct task_struct *task)
> +{
> + bool delay = false;
> + unsigned long flags;
> +
> + /* Is the task both reaped and no longer being scheduled? */
> + raw_spin_lock_irqsave(&task->pi_lock, flags);
> + if ((task->state == TASK_DEAD) &&
> + (cmpxchg(&task->exit_state, EXIT_DEAD, EXIT_RCU) == EXIT_DEAD))
> + delay = true;
> + raw_spin_lock_irqrestore(&task->pi_lock, flags);
> +
> + /* If both are true use rcu delay the put_task_struct */
> + if (delay)
> + call_rcu(&task->rcu, delayed_put_task_struct);
> + else
> + put_task_struct(task);
> +}
>
> void release_task(struct task_struct *p)
> {
> @@ -222,76 +240,13 @@ void release_task(struct task_struct *p)
>
> write_unlock_irq(&tasklist_lock);
> release_thread(p);
> - call_rcu(&p->rcu, delayed_put_task_struct);
> + put_dead_task_struct(p);

I had a similar change in mind; see below. This is subjective, but to me
it looks simpler and cleaner.

Oleg.

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8dc1811..1f9b021 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1134,7 +1134,10 @@ struct task_struct {

struct tlbflush_unmap_batch tlb_ubc;

- struct rcu_head rcu;
+ union {
+ bool xxx;
+ struct rcu_head rcu;
+ };

/* Cache last used pipe for splice(): */
struct pipe_inode_info *splice_pipe;
diff --git a/kernel/exit.c b/kernel/exit.c
index a75b6a7..baacfce 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(tsk);
}

+void call_delayed_put_task_struct(struct task_struct *p)
+{
+ if (xchg(&p->xxx, 1))
+ call_rcu(&p->rcu, delayed_put_task_struct);
+}

void release_task(struct task_struct *p)
{
@@ -222,7 +227,7 @@ void release_task(struct task_struct *p)

write_unlock_irq(&tasklist_lock);
release_thread(p);
- call_rcu(&p->rcu, delayed_put_task_struct);
+ call_delayed_put_task_struct(p);

p = leader;
if (unlikely(zap_leader))
diff --git a/kernel/fork.c b/kernel/fork.c
index d8ae0f1..e90f6de 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -900,11 +900,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;

- /*
- * One for us, one for whoever does the "release_task()" (usually
- * parent)
- */
- refcount_set(&tsk->usage, 2);
+ refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b037f1..e77389c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3135,7 +3135,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
/* Task is done with its stack. */
put_task_stack(prev);

- put_task_struct(prev);
+ call_delayed_put_task_struct(prev);
}

tick_nohz_task_switch();