[RFC][PATCH 5/5] sched/core: Add debug code to catch missing update_rq_clock()

From: Matt Fleming
Date: Thu May 12 2016 - 15:50:19 EST


There's no diagnostic checks for figuring out when we've accidentally
missed update_rq_clock() calls. Let's add some by piggybacking on the
rq_*pin_lock() wrappers.

The idea behind the diagnostic checks is that upon pining rq lock the
rq clock should be updated, via update_rq_clock(), before anybody
reads the clock with rq_clock(). The exception to this rule is when
updates have explicitly been disabled with the rq_clock_skip_update()
optimisation.

There are some functions that only unpin the rq lock in order to grab
some other lock and avoid deadlock. In that case we don't need to
update the clock again and the previous diagnostic state can be
carried over in rq_repin_lock() by saving the state in the rq_flags
context.

Since this patch adds a new clock update flag and some already exist
in rq::clock_skip_update, that field has now been renamed. An attempt
has been made to keep the flag manipulation code small and fast since
it's used in the heart of the __schedule() fast path.

For the !CONFIG_SCHED_DEBUG case the only object code change (other
than addresses) is the following change to the two lines to reset
RQCF_ACT_SKIP inside of __schedule(),

- 41 c7 84 24 f0 08 00 movl $0x0,0x8f0(%r12)
- 00 00 00 00 00
+ 41 83 b4 24 f0 08 00 xorl $0x2,0x8f0(%r12)
+ 00 02

Suggested-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Mike Galbraith <umgwanakikbuti@xxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Frederic Weisbecker <fweisbec@xxxxxxxxx>
Cc: Yuyang Du <yuyang.du@xxxxxxxxx>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@xxxxxxxxx>
Signed-off-by: Matt Fleming <matt@xxxxxxxxxxxxxxxxxxx>
---
kernel/sched/core.c | 13 +++++++---
kernel/sched/sched.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 73 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2112c268fd1..0999b3f23dde 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -101,9 +101,12 @@ void update_rq_clock(struct rq *rq)

lockdep_assert_held(&rq->lock);

- if (rq->clock_skip_update & RQCF_ACT_SKIP)
+ if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;

+#ifdef CONFIG_SCHED_DEBUG
+ rq->clock_update_flags |= RQCF_UPDATED;
+#endif
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
if (delta < 0)
return;
@@ -2825,7 +2828,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
rq->prev_mm = oldmm;
}

- rq->clock_skip_update = 0;
+ /* Clear ACT, preserve everything else */
+ rq->clock_update_flags ^= RQCF_ACT_SKIP;

/*
* Since the runqueue lock will be released by the next
@@ -3287,7 +3291,7 @@ static void __sched notrace __schedule(bool preempt)
raw_spin_lock(&rq->lock);
rq_pin_lock(rq, &rf);

- rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+ rq->clock_update_flags <<= 1; /* promote REQ to ACT */

switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
@@ -3328,7 +3332,8 @@ static void __sched notrace __schedule(bool preempt)
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next, &rf); /* unlocks the rq */
} else {
- rq->clock_skip_update = 0;
+ /* Clear ACT, preserve everything else */
+ rq->clock_update_flags ^= RQCF_ACT_SKIP;
rq_unpin_lock(rq, &rf);
raw_spin_unlock_irq(&rq->lock);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cbeb830c7ac6..072c020cd8e3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -628,7 +628,7 @@ struct rq {
unsigned long next_balance;
struct mm_struct *prev_mm;

- unsigned int clock_skip_update;
+ unsigned int clock_update_flags;
u64 clock;
u64 clock_task;

@@ -735,9 +735,45 @@ static inline u64 __rq_clock_broken(struct rq *rq)
return READ_ONCE(rq->clock);
}

+/*
+ * rq::clock_update_flags bits
+ *
+ * %RQCF_REQ_SKIP - will request skipping of clock update on the next
+ * call to __schedule(). This is an optimisation to avoid
+ * neighbouring rq clock updates.
+ *
+ * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
+ * in effect and calls to update_rq_clock() are being ignored.
+ *
+ * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
+ * made to update_rq_clock() since the last time rq::lock was pinned.
+ *
+ * If inside of __schedule(), clock_update_flags will have been
+ * shifted left (a left shift is a cheap operation for the fast path
+ * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
+ *
+ * if (rq-clock_update_flags >= RQCF_UPDATED)
+ *
+ * to check if %RQCF_UPADTED is set. It'll never be shifted more than
+ * one position though, because the next rq_unpin_lock() will shift it
+ * back.
+ */
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+#define RQCF_UPDATED 0x04
+
static inline u64 rq_clock(struct rq *rq)
{
lockdep_assert_held(&rq->lock);
+
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * The only reason for not seeing a clock update since the
+ * last rq_pin_lock() is if we're currently skipping updates.
+ */
+ WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP);
+#endif
+
return rq->clock;
}

@@ -747,36 +783,58 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}

-#define RQCF_REQ_SKIP 0x01
-#define RQCF_ACT_SKIP 0x02
-
static inline void rq_clock_skip_update(struct rq *rq, bool skip)
{
lockdep_assert_held(&rq->lock);
if (skip)
- rq->clock_skip_update |= RQCF_REQ_SKIP;
+ rq->clock_update_flags |= RQCF_REQ_SKIP;
else
- rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+ rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}

struct rq_flags {
unsigned long flags;
struct pin_cookie cookie;
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
+ * current pin context is stashed here in case it needs to be
+ * restored in rq_repin_lock().
+ */
+ unsigned int clock_update_flags;
+#endif
};

static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
rf->cookie = lockdep_pin_lock(&rq->lock);
+
+#ifdef CONFIG_SCHED_DEBUG
+ rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ rf->clock_update_flags = 0;
+#endif
}

static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
{
+#ifdef CONFIG_SCHED_DEBUG
+ if (rq->clock_update_flags > RQCF_ACT_SKIP)
+ rf->clock_update_flags = RQCF_UPDATED;
+#endif
+
lockdep_unpin_lock(&rq->lock, rf->cookie);
}

static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
lockdep_repin_lock(&rq->lock, rf->cookie);
+
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * Restore the value we stashed in @rf for this pin context.
+ */
+ rq->clock_update_flags |= rf->clock_update_flags;
+#endif
}

#ifdef CONFIG_NUMA
--
2.7.3