Re: [PATCH 2/2] nohz: use delayed iowait accounting to avoid race on idle time stats

From: Peter Zijlstra
Date: Thu Apr 17 2014 - 06:06:16 EST


On Wed, Apr 16, 2014 at 03:33:06PM +0900, Hidetoshi Seto wrote:
> [3] : new tricks
>
> To use seqcount, observers must be readers and never be writers.
> It means that:
>
> - Observed cpu's time stats are fixed at idle entry, and
> unchanged while sleeping (otherwise results of readers will
> not be coherent).
>
> - Readers must not refer nr_iowait of sleeping cpu because
> it can be changed by task woken up on other cpu.
>
> At this point:
>
> - As already pointed out, stating "I'll sleep as iowait"
> at idle entry will result in infinite iowait.
> => Then how about stating:
> "I'll sleep for <e.g. few nsec> as iowait
> and rest as idle"?
> => how to determine reasonable <few nsecs>?

Well, we actually _know_ when that counter drops to 0. We've got the
actual event there, we don't need to guess about any of this.

> - Original code accounts iowait only when nr_iowait is >0
> at idle exit. It means we can not determine whether the
> sleep time will be idle or iowait on the fly.
> => we cannot determine <few nsecs> at idle entry

Intel really should give us this crystal ball instruction already ;-)


Anyway, if you want to preserve the same broken-ass crap we had pre-NOHZ,
something like the below should do that.

I'm not really thrilled with iowait_{start,stop}() but I think they
should have the same general cost as the atomic ops we already had. In
particular on x86 an uncontended lock+unlock is a single atomic.

This is on top of the first patch from Frederic that both you and Denys
carried.

That said, I really hate duct-taping this together, for the generated
numbers are still useless.

--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -58,6 +58,8 @@ union ktime {

typedef union ktime ktime_t; /* Kill this */

+#define ktime_zero ((ktime_t){ .tv64 = 0 })
+
/*
* ktime_t definitions when using the 64-bit scalar representation:
*/
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2297,15 +2297,29 @@ unsigned long nr_iowait(void)
unsigned long i, sum = 0;

for_each_possible_cpu(i)
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
+ sum += cpu_rq(i)->nr_iowait;

return sum;
}

unsigned long nr_iowait_cpu(int cpu)
{
- struct rq *this = cpu_rq(cpu);
- return atomic_read(&this->nr_iowait);
+ return cpu_rq(cpu)->nr_iowait;
+}
+
+void nr_iowait_deltas(ktime_t start, ktime_t now,
+ ktime_t *iowait_delta, ktime_t *idle_delta)
+{
+ struct rq *rq = this_rq();
+
+ raw_spin_lock(&rq->iowait_lock);
+ if (rq->nr_iowait) {
+ *iowait_delta = ktime_sub(now, start);
+ } else {
+ *iowait_delta = ktime_sub(rq->last_iowait, start);
+ *idle_delta = ktime_sub(now, rq->last_iowait);
+ }
+ raw_spin_unlock(&rq->iowait_lock);
}

#ifdef CONFIG_SMP
@@ -4201,6 +4215,24 @@ bool __sched yield_to(struct task_struct
}
EXPORT_SYMBOL_GPL(yield_to);

+static inline void iowait_start(struct rq *rq)
+{
+ raw_spin_lock(&rq->iowait_lock);
+ rq->nr_iowait++;
+ raw_spin_unlock(&rq->iowait_lock);
+ current->in_iowait = 1;
+}
+
+static inline void iowait_stop(struct rq *rq)
+{
+ current->in_iowait = 0;
+ raw_spin_lock(&rq->iowait_lock);
+ rq->nr_iowait--;
+ if (!rq->nr_iowait && rq != this_rq())
+ rq->last_iowait = ktime_get();
+ raw_spin_unlock(&rq->iowait_lock);
+}
+
/*
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
@@ -4210,12 +4242,10 @@ void __sched io_schedule(void)
struct rq *rq = raw_rq();

delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
+ iowait_start();
blk_flush_plug(current);
- current->in_iowait = 1;
schedule();
- current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
+ iowait_stop();
delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);
@@ -4226,12 +4256,10 @@ long __sched io_schedule_timeout(long ti
long ret;

delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
+ iowait_start();
blk_flush_plug(current);
- current->in_iowait = 1;
ret = schedule_timeout(timeout);
- current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
+ iowait_stop();
delayacct_blkio_end();
return ret;
}
@@ -6880,7 +6908,10 @@ void __init sched_init(void)
#endif
#endif
init_rq_hrtick(rq);
- atomic_set(&rq->nr_iowait, 0);
+
+ raw_spinlock_init(&rq->iowait_lock);
+ rq->nr_iowait = 0;
+ rq->last_iowait = ktime_get();
}

set_load_weight(&init_task);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -571,7 +571,9 @@ struct rq {
u64 clock;
u64 clock_task;

- atomic_t nr_iowait;
+ raw_spinlock_t iowait_lock ____cacheline_aligned;
+ unsigned int nr_iowait;
+ ktime_t last_iowait

#ifdef CONFIG_SMP
struct root_domain *rd;
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -429,14 +429,16 @@ static void tick_nohz_update_jiffies(kti

static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
- ktime_t delta;
+ ktime_t iowait_delta = ktime_zero, idle_delta = ktime_zero;
+
+ if (ts->idle_active == 2) {
+ nr_iowait_deltas(ts->idle_entrytime, now, &iowait_delta, &idle_delta);
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, iowait_delta);
+ } else {
+ idle_delta = ktime_sub(now, ts->idle_entrytime);
+ }
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, idle_delta);

- /* Updates the per cpu time idle statistics counters */
- delta = ktime_sub(now, ts->idle_entrytime);
- if (nr_iowait_cpu(smp_processor_id()) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
- else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_active = 0;

sched_clock_idle_wakeup_event(0);
@@ -447,7 +449,7 @@ static ktime_t tick_nohz_start_idle(stru
ktime_t now = ktime_get();

ts->idle_entrytime = now;
- ts->idle_active = 1;
+ ts->idle_active = 1 + !!nr_iowait_cpu(smp_processor_id());
sched_clock_idle_sleep_event();
return now;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/