[PATCH 20/25] sched/kcpustat: Introduce vtime-aware kcpustat accessor

From: Frederic Weisbecker
Date: Tue Nov 13 2018 - 21:47:04 EST


Kcpustat is not correctly supported on nohz_full CPUs: the tick doesn't
fire there, so the cputime doesn't move forward. The issue became
visible once the residual 1Hz tick was removed.

Solve that by tracking the task running on a CPU through RCU and adding
its pending vtime delta to the raw kcpustat values.

The raw kcpustat values and the vtime delta are fetched as a coherent
pair inside the vtime seqcount read section, which also checks that the
CPU referred to by the target vtime is the expected one.

Reported-by: Yauheni Kaliuta <yauheni.kaliuta@xxxxxxxxxx>
Signed-off-by: Frederic Weisbecker <frederic@xxxxxxxxxx>
Cc: Yauheni Kaliuta <yauheni.kaliuta@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Wanpeng Li <wanpengli@xxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
---
include/linux/kernel_stat.h | 25 +++++++++++++
kernel/sched/cputime.c | 90 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 115 insertions(+)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 049d973..2d4d301 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -79,6 +79,31 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
return kstat_cpu(cpu).irqs_sum;
}

+
+/*
+ * Copy the user, nice, system, guest and guest_nice counters stored in
+ * @kcpustat->cpustat[] into the caller-provided outputs. The values are
+ * returned as stored: nothing is added to them here.
+ */
+static inline void kcpustat_cputime_raw(struct kernel_cpustat *kcpustat,
+					u64 *user, u64 *nice, u64 *system,
+					u64 *guest, u64 *guest_nice)
+{
+	const u64 *stat = kcpustat->cpustat;
+
+	*user		= stat[CPUTIME_USER];
+	*nice		= stat[CPUTIME_NICE];
+	*system		= stat[CPUTIME_SYSTEM];
+	*guest		= stat[CPUTIME_GUEST];
+	*guest_nice	= stat[CPUTIME_GUEST_NICE];
+}
+/*
+ * kcpustat_cputime - read the user, nice, system, guest and guest_nice
+ * counters of @kcpustat into the caller-provided outputs.
+ *
+ * With CONFIG_VIRT_CPU_ACCOUNTING_GEN the accessor is only declared here
+ * and is defined out of line. Without it, the accessor is a plain wrapper
+ * around kcpustat_cputime_raw() and @cpu is unused.
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+extern void kcpustat_cputime(struct kernel_cpustat *kcpustat, int cpu,
+ u64 *user, u64 *nice, u64 *system,
+ u64 *guest, u64 *guest_nice);
+#else
+static inline void kcpustat_cputime(struct kernel_cpustat *kcpustat, int cpu,
+ u64 *user, u64 *nice, u64 *system,
+ u64 *guest, u64 *guest_nice)
+{
+ kcpustat_cputime_raw(kcpustat, user, nice, system, guest, guest_nice);
+}
+#endif
+
extern void account_user_time(struct task_struct *, u64);
extern void account_guest_time(struct task_struct *, u64);
extern void account_system_time(struct task_struct *, int, u64);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2b35132..3afde9f 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1024,4 +1024,94 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
*utime += vtime->utime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
}
+/*
+ * Snapshot the user/nice/system/guest/guest_nice counters of @kcpustat and
+ * add to them the pending time held in @vtime (its stime, utime or gtime,
+ * plus the value returned by vtime_delta()), according to @vtime->state.
+ *
+ * Both the raw counters and the vtime fields are read inside a
+ * @vtime->seqcount read section, which is redone for as long as
+ * read_seqcount_retry() asks for it.
+ *
+ * Returns 0 when the outputs have been filled, or -EAGAIN when @vtime->cpu
+ * is neither @cpu nor -1. The outputs are not refreshed on the -EAGAIN
+ * path, so the caller must not use them and should look the task up again.
+ */
+static int kcpustat_vtime(struct kernel_cpustat *kcpustat, struct vtime *vtime,
+ int cpu, u64 *user, u64 *nice,
+ u64 *system, u64 *guest, u64 *guest_nice)
+{
+ unsigned int seq;
+ u64 delta;
+ int err;
+
+ do {
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ /*
+ * We raced against context switch, fetch the
+ * kcpustat task again.
+ *
+ * "continue" jumps to the read_seqcount_retry() test at the
+ * bottom of the loop: -EAGAIN is returned once that test lets
+ * the loop end, otherwise the CPU check is simply redone.
+ *
+ * NOTE(review): -1 presumably marks a vtime that is not
+ * attached to any CPU -- confirm against the vtime code.
+ */
+ if (vtime->cpu != cpu && vtime->cpu != -1) {
+ err = -EAGAIN;
+ continue;
+ }
+
+ err = 0;
+
+ kcpustat_cputime_raw(kcpustat, user, nice,
+ system, guest, guest_nice);
+
+ /*
+ * Task is sleeping, dead or idle, nothing to add: the raw
+ * values copied just before are the result.
+ */
+ if (vtime->state < VTIME_SYS)
+ continue;
+
+ delta = vtime_delta(vtime);
+
+ /*
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place. @vtime->nice
+ * selects the nice variant of the counter, and guest time is
+ * added twice: to guest/guest_nice and to user/nice.
+ */
+ if (vtime->state == VTIME_SYS) {
+ *system += vtime->stime + delta;
+ } else if (vtime->state == VTIME_USER) {
+ if (vtime->nice)
+ *nice += vtime->utime + delta;
+ else
+ *user += vtime->utime + delta;
+ } else {
+ WARN_ON_ONCE(vtime->state != VTIME_GUEST);
+ if (vtime->nice) {
+ *guest_nice += vtime->gtime + delta;
+ *nice += vtime->gtime + delta;
+ } else {
+ *guest += vtime->gtime + delta;
+ *user += vtime->gtime + delta;
+ }
+ }
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return err;
+}
+
+/*
+ * Fill the outputs with the user, nice, system, guest and guest_nice
+ * counters of @kcpustat for @cpu.
+ *
+ * When vtime_accounting_enabled() is false, or when @kcpustat->curr is
+ * NULL, the raw counters are returned unmodified. Otherwise
+ * kcpustat_vtime() is run on the vtime of the task found in
+ * @kcpustat->curr, and the task is looked up again for as long as that
+ * call returns -EAGAIN.
+ *
+ * NOTE(review): the -EAGAIN retries spin inside a single RCU read-side
+ * critical section, without any relaxation between attempts -- confirm
+ * that this is acceptable for the expected race window.
+ */
+void kcpustat_cputime(struct kernel_cpustat *kcpustat, int cpu,
+		      u64 *user, u64 *nice, u64 *system,
+		      u64 *guest, u64 *guest_nice)
+{
+	if (!vtime_accounting_enabled()) {
+		kcpustat_cputime_raw(kcpustat, user, nice,
+				     system, guest, guest_nice);
+		return;
+	}
+
+	rcu_read_lock();
+
+	for (;;) {
+		struct task_struct *tsk = rcu_dereference(kcpustat->curr);
+
+		/* No task to read a vtime from: raw values only. */
+		if (!tsk) {
+			kcpustat_cputime_raw(kcpustat, user, nice,
+					     system, guest, guest_nice);
+			break;
+		}
+
+		/* Any result other than -EAGAIN ends the lookup. */
+		if (kcpustat_vtime(kcpustat, &tsk->vtime, cpu, user, nice,
+				   system, guest, guest_nice) != -EAGAIN)
+			break;
+	}
+
+	rcu_read_unlock();
+}
+
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
--
2.7.4