Re: [PATCH 1/1] cputime: Make the reported utime+stime correspond to the actual runtime.

From: Peter Zijlstra
Date: Tue Jun 30 2015 - 05:31:15 EST


On Mon, Jun 29, 2015 at 05:28:42PM +0200, Fredrik Markström wrote:
> Hello Peter, the locking part looks good, I don't have a strong
> opinion on per task/signal lock vs global lock.
>
> But with the patch we still update prev->utime and prev->stime
> independently, which was the original problem.

Ah,..

> > @@ -609,19 +593,19 @@ static void cputime_adjust(struct task_cputime *curr,
> > } else if (stime == 0) {
> > utime = rtime;
> > } else {
> > + stime = scale_stime((__force u64)stime, (__force u64)rtime,
> > + (__force u64)(stime + utime));
> > + if (stime < prev->stime)
> > + stime = prev->stime;
> > utime = rtime - stime;
> > }
> >
> > + prev->stime = max(prev->stime, stime);
> > + prev->utime = max(prev->utime, utime);
> > out:
> > *ut = prev->utime;
> > *st = prev->stime;
> > + raw_spin_unlock(&prev->lock);
> > }

So the things we want from this function is that:

- stime + utime == rtime
- stime_i+1 >= stime_i, utime_i+1 >= utime_i

Under the assumption that rtime_i+1 >= rtime_i.

There are 3 cases:

1) utime == 0 -> stime = rtime
2) stime == 0 -> utime = rtime

Both these trivially meet the above conditions, and:

3) stime_i+1 = max((stime * rtime) / (stime + utime), stime_i)
utime = rtime - stime

Which I thought would meet them because we keep stime from shrinking and
therefore utime from growing too big. But there is indeed the other side
to consider, what if stime grows too big and makes the new utime too
small. When we update stime but not utime and break the first condition.

Now, you propose:

> + if (stime < prev->stime) {
> + stime = prev->stime;
> + utime = rtime - stime;

Right, I have this branch, it keeps stime in check as per the above.

Since we set stime_i+1 = stime_i, utime_i+1 = rtime - stime_i >= utime_i
since rtime_i+1 >= rtime_i.

> + } else if (utime < prev->utime) {
> + utime = prev->utime;
> + stime = rtime - utime;
> + }

And that deals with the other side, similar proof.


> + if (prev->stime + prev->utime < rtime) {
> + prev->stime = stime;
> + prev->utime = utime;
> + }

This I don't really agree with, we can leave the inverse check as is and
simply bail early.

Folding this all together gives me

---
include/linux/init_task.h | 10 ++++++++++
include/linux/sched.h | 23 +++++++++++++--------
kernel/fork.c | 7 ++++---
kernel/sched/cputime.c | 51 ++++++++++++++++++++++-------------------------
4 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index bb9b075f0eb0..e38681f4912d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -39,6 +39,14 @@ extern struct fs_struct init_fs;
#define INIT_CPUSET_SEQ(tsk)
#endif

+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+#define INIT_PREV_CPUTIME(x) .prev_cputime = { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(x.prev_cputime.lock), \
+},
+#else
+#define INIT_PREV_CPUTIME(x)
+#endif
+
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
.thread_head = LIST_HEAD_INIT(init_task.thread_node), \
@@ -53,6 +61,7 @@ extern struct fs_struct init_fs;
.cputime_atomic = INIT_CPUTIME_ATOMIC, \
.running = 0, \
}, \
+ INIT_PREV_CPUTIME(sig) \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
INIT_GROUP_RWSEM(sig) \
@@ -254,6 +263,7 @@ extern struct task_group root_task_group;
INIT_TASK_RCU_TASKS(tsk) \
INIT_CPUSET_SEQ(tsk) \
INIT_RT_MUTEXES(tsk) \
+ INIT_PREV_CPUTIME(tsk) \
INIT_VTIME(tsk) \
INIT_NUMA_BALANCING(tsk) \
INIT_KASAN(tsk) \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6633e83e608a..5dfbe92ce04e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -531,17 +531,28 @@ struct cpu_itimer {
};

/**
- * struct cputime - snaphsot of system and user cputime
+ * struct prev_cputime - snaphsot of system and user cputime
* @utime: time spent in user mode
* @stime: time spent in system mode
*
* Gathers a generic snapshot of user and system time.
*/
-struct cputime {
+struct prev_cputime {
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
cputime_t utime;
cputime_t stime;
+ raw_spinlock_t lock;
+#endif
};

+static inline void prev_cputime_init(struct prev_cputime *prev)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ prev->utime = prev->stime = 0;
+ raw_spin_lock_init(&prev->lock);
+#endif
+}
+
/**
* struct task_cputime - collected CPU time counts
* @utime: time spent in user mode, in &cputime_t units
@@ -716,9 +727,7 @@ struct signal_struct {
cputime_t utime, stime, cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- struct cputime prev_cputime;
-#endif
+ struct prev_cputime prev_cputime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
unsigned long inblock, oublock, cinblock, coublock;
@@ -1494,9 +1503,7 @@ struct task_struct {

cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- struct cputime prev_cputime;
-#endif
+ struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_t vtime_seqlock;
unsigned long long vtime_snap;
diff --git a/kernel/fork.c b/kernel/fork.c
index c9507afd4a7d..306481d3a0e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1067,6 +1067,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
rcu_assign_pointer(tsk->sighand, sig);
if (!sig)
return -ENOMEM;
+
atomic_set(&sig->count, 1);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
return 0;
@@ -1128,6 +1129,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
seqlock_init(&sig->stats_lock);
+ prev_cputime_init(&sig->prev_cputime);

hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
@@ -1338,9 +1340,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,

p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
+ prev_cputime_init(&p->prev_cputime);
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_init(&p->vtime_seqlock);
p->vtime_snap = 0;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ffad176..f3af6d86f38b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,32 +555,16 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
}

/*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
- *
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
- cputime_t old;
-
- while (new > (old = READ_ONCE(*counter)))
- cmpxchg_cputime(counter, old, new);
-}
-
-/*
* Adjust tick based cputime random precision against scheduler
* runtime accounting.
*/
static void cputime_adjust(struct task_cputime *curr,
- struct cputime *prev,
+ struct prev_cputime *prev,
cputime_t *ut, cputime_t *st)
{
cputime_t rtime, stime, utime;

+ raw_spin_lock(&prev->lock);
/*
* Tick based cputime accounting depend on random scheduling
* timeslices of a task to be interrupted or not by the timer.
@@ -606,22 +590,35 @@ static void cputime_adjust(struct task_cputime *curr,

if (utime == 0) {
stime = rtime;
- } else if (stime == 0) {
- utime = rtime;
- } else {
- cputime_t total = stime + utime;
+ goto update;
+ }

- stime = scale_stime((__force u64)stime,
- (__force u64)rtime, (__force u64)total);
- utime = rtime - stime;
+ if (stime == 0) {
+ utime = rtime;
+ goto update;
}

- cputime_advance(&prev->stime, stime);
- cputime_advance(&prev->utime, utime);
+ stime = scale_stime((__force u64)stime, (__force u64)rtime,
+ (__force u64)(stime + utime));
+
+ /* make sure stime doesn't go backwards */
+ if (stime < prev->stime)
+ stime = prev->stime;
+ utime = rtime - stime;
+
+ /* make sure utime doesn't go backwards */
+ if (utime < prev->utime) {
+ utime = prev->utime;
+ stime = rtime - utime;
+ }

+update:
+ prev->stime = stime;
+ prev->utime = utime;
out:
*ut = prev->utime;
*st = prev->stime;
+ raw_spin_unlock(&prev->lock);
}

void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/