[PATCH 08/25] vtime: Exit vtime before exit_notify()

From: Frederic Weisbecker
Date: Tue Nov 13 2018 - 21:46:38 EST


In order to correctly implement kcpustat under nohz_full, we need to
track the task running on a given CPU and read its vtime state safely,
reliably and locklessly.

This leaves us with tracking and fetching that task under RCU, which will
be done in a later patch. Until then we need to prepare vtime to handle
that properly and close the accounting before the earliest point at which
the RCU-delayed put_task_struct() can be queued. That point happens to be
in exit_notify(), in the case of auto-reaping.

Therefore we need to finish the accounting right before exit_notify().
After that we shouldn't track the exiting task any further.

Signed-off-by: Frederic Weisbecker <frederic@xxxxxxxxxx>
Cc: Yauheni Kaliuta <yauheni.kaliuta@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Wanpeng Li <wanpengli@xxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
---
include/linux/sched.h | 2 ++
include/linux/vtime.h | 2 ++
kernel/exit.c | 1 +
kernel/sched/cputime.c | 56 ++++++++++++++++++++++++++++++++++++++++++--------
4 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d458d65..27e0544 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -265,6 +265,8 @@ struct task_cputime {
enum vtime_state {
/* Task is sleeping or running in a CPU with VTIME inactive: */
VTIME_INACTIVE = 0,
+ /* Task has passed exit_notify() */
+ VTIME_DEAD,
/* Task is idle */
VTIME_IDLE,
/* Task runs in kernelspace in a CPU with VTIME active: */
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index d9160ab..8350a0b 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -73,12 +73,14 @@ extern void vtime_user_exit(struct task_struct *tsk);
extern void vtime_guest_enter(struct task_struct *tsk);
extern void vtime_guest_exit(struct task_struct *tsk);
extern void vtime_init_idle(struct task_struct *tsk, int cpu);
+extern void vtime_exit_task(struct task_struct *tsk);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
static inline void vtime_user_enter(struct task_struct *tsk) { }
static inline void vtime_user_exit(struct task_struct *tsk) { }
static inline void vtime_guest_enter(struct task_struct *tsk) { }
static inline void vtime_guest_exit(struct task_struct *tsk) { }
static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
+static inline void vtime_exit_task(struct task_struct *tsk) { }
#endif

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/exit.c b/kernel/exit.c
index 0e21e6d..cae3fe9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -883,6 +883,7 @@ void __noreturn do_exit(long code)
*/
flush_ptrace_hw_breakpoint(tsk);

+ vtime_exit_task(tsk);
exit_tasks_rcu_start();
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f64afd7..a0c3a82 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -702,7 +702,7 @@ static u64 get_vtime_delta(struct vtime *vtime)
* errors from causing elapsed vtime to go negative.
*/
other = account_other_time(delta);
- WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+ WARN_ON_ONCE(vtime->state < VTIME_IDLE);
vtime->starttime += delta;

return delta - other;
@@ -813,17 +813,31 @@ void vtime_task_switch_generic(struct task_struct *prev)
{
struct vtime *vtime = &prev->vtime;

- write_seqcount_begin(&vtime->seqcount);
- if (vtime->state == VTIME_IDLE)
- vtime_account_idle(prev);
- else
- __vtime_account_kernel(prev, vtime);
- vtime->state = VTIME_INACTIVE;
- vtime->cpu = -1;
- write_seqcount_end(&vtime->seqcount);
+ /*
+ * Flush the prev task vtime, unless it has passed
+ * vtime_exit_task(), in which case there is nothing
+ * left to account.
+ */
+ if (vtime->state != VTIME_DEAD) {
+ write_seqcount_begin(&vtime->seqcount);
+ if (vtime->state == VTIME_IDLE)
+ vtime_account_idle(prev);
+ else
+ __vtime_account_kernel(prev, vtime);
+ vtime->state = VTIME_INACTIVE;
+ vtime->cpu = -1;
+ write_seqcount_end(&vtime->seqcount);
+ }

vtime = &current->vtime;

+ /*
+ * Ignore the next task if it has been preempted after
+ * vtime_exit_task().
+ */
+ if (vtime->state == VTIME_DEAD)
+ return;
+
write_seqcount_begin(&vtime->seqcount);
if (is_idle_task(current))
vtime->state = VTIME_IDLE;
@@ -850,6 +864,30 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_restore(flags);
}

+/*
+ * Final settlement point for @t: flush its pending vtime and mark it
+ * VTIME_DEAD so that no further vtime is accounted for this task.
+ */
+void vtime_exit_task(struct task_struct *t)
+{
+ struct vtime *vtime = &t->vtime;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ write_seqcount_begin(&vtime->seqcount);
+ /*
+ * A task that has never run on a nohz_full CPU hasn't been tracked
+ * by vtime and is still VTIME_INACTIVE: nothing to account. Else
+ * flush with the variant used under the seqcount on task switch.
+ */
+ if (vtime->state != VTIME_INACTIVE)
+ __vtime_account_kernel(t, vtime);
+ vtime->state = VTIME_DEAD;
+ vtime->cpu = -1;
+ write_seqcount_end(&vtime->seqcount);
+ local_irq_restore(flags);
+}
+
u64 task_gtime(struct task_struct *t)
{
struct vtime *vtime = &t->vtime;
--
2.7.4