Re: [PATCH v8 2/7] sched/fair: Decay task PELT values during wakeup migration

From: Vincent Guittot
Date: Fri May 06 2022 - 09:59:01 EST

Next message: Greg KH: "Re: [PATCH] kobject: Refactor kobject_set_name_vargs()"
Previous message: Xiaoyao Li: "Re: [RFC PATCH v6 011/104] KVM: TDX: Initialize TDX module when loading kvm_intel.ko"
In reply to: Dietmar Eggemann: "Re: [PATCH v8 2/7] sched/fair: Decay task PELT values during wakeup migration"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Le samedi 30 avril 2022 à 01:09:25 (+0800), Tao Zhou a écrit :
> On Fri, Apr 29, 2022 at 03:11:43PM +0100, Vincent Donnefort wrote:

[..]

> >
> > -static inline u64 rq_clock_pelt(struct rq *rq)
> > +#ifdef CONFIG_CFS_BANDWIDTH
> > +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
> > {
> > - lockdep_assert_rq_held(rq);
> > - assert_clock_updated(rq);
> > -
> > - return rq->clock_pelt - rq->lost_idle_time;
> > + /*
> > + * Make sure that pending update of rq->clock_pelt_idle and
> > + * rq->enter_idle are visible during update_blocked_average() before
> > + * updating cfs_rq->throttled_pelt_idle.
> > + */
>
> Two places to call update_idle_cfs_rq_clock_pelt():
>
> 1 dequeue_entity()
> (no pending update before. and this is fast path)
> update_idle_cfs_rq_clock_pelt
>
> 2 update_blocked_averages()
> update_clock_rq() -> pending update here.
> __update_blocked_fair()
> update_idle_cfs_rq_clock_pelt
>
> Another way will be to move the smp_wmb() to _update_idle_rq_clock_pelt()
>
> static inline void _update_idle_rq_clock_pelt(struct rq *rq)
> {
> rq->clock_pelt = rq_clock_task(rq);
>
> u64_u32_store(rq->enter_idle, rq_clock(rq));
> u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
> smp_wmb();
> }
>
> But does this function called more often enough than dequeue_entity(),
>
> pick_next_task_fair()
> (rq will be idle)
> update_idle_rq_clock_pelt()
>
> update_rq_clock_pelt()
> (curr is idle)
> _update_idle_rq_clock_pelt()
>
> The condition is they are all idle.
> And the migrate_se_pelt_lag() is for idle also.
>
> If smp_wmb() is here like the patch, smp_rmb() place in
> migrate_se_pelt_lag() here:
>
> #ifdef CONFIG_CFS_BANDWIDTH
> throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
> smp_rmb();
> /* The clock has been stopped for throttling */
> if (throttled == U64_MAX)
> return;
> #endif
>
> If smp_wmb() is in _update_idle_rq_clock_pelt(), smp_rmb() place in
> migrate_se_pelt_lag() here:
>
> #ifdef CONFIG_CFS_BANDWIDTH
> throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
> /* The clock has been stopped for throttling */
> if (throttled == U64_MAX)
> return;
> #endif
> smp_rmb();
> now = u64_u32_load(rq->clock_pelt_idle);
> now -= throttled;
>
> Sorry for these noise words.

I thought a bit more on this and the memory barrier should be put as below.
This will ensure that we will not over estimate the lag but only under estimate
it if an update happens in the middle of the computation of the lag.

---
kernel/sched/fair.c | 18 +++++++++++++-----
kernel/sched/pelt.h | 11 +++++------
2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ce94df5a6df6..1aeca8d518a2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3705,7 +3705,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
#ifdef CONFIG_NO_HZ_COMMON
static inline void migrate_se_pelt_lag(struct sched_entity *se)
{
- u64 throttled = 0, now;
+ u64 throttled = 0, now, lut;
struct cfs_rq *cfs_rq;
struct rq *rq;
bool is_idle;
@@ -3761,13 +3761,21 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se)
return;
#endif
now = u64_u32_load(rq->clock_pelt_idle);
+ smp_rmb();
now -= throttled;

- /* An update happened while computing lag */
- if (now < cfs_rq_last_update_time(cfs_rq))
- return;
+ lut = cfs_rq_last_update_time(cfs_rq);

- now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->enter_idle);
+ if (now < lut)
+ /*
+ * cfs_rq->avg.last_update_time is more recent than our
+ * estimation which means that an update happened while
+ * computing the lag. LUT is the most up to date value so use
+ * it instead of trying to estimate what it should be
+ */
+ now = lut;
+ else
+ now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->enter_idle);

__update_load_avg_blocked_se(now, se);
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 9aed92262bd9..330d582efafe 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -75,6 +75,11 @@ static inline void _update_idle_rq_clock_pelt(struct rq *rq)
rq->clock_pelt = rq_clock_task(rq);

u64_u32_store(rq->enter_idle, rq_clock(rq));
+ /*
+ * Make sure that pending update of rq->enter_idle is visible before
+ * rq->clock_pelt_idle so we will never overestimate the lag.
+ */
+ smp_wmb();
u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
}

@@ -153,12 +158,6 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
#ifdef CONFIG_CFS_BANDWIDTH
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- /*
- * Make sure that pending update of rq->clock_pelt_idle and
- * rq->enter_idle are visible during update_blocked_average() before
- * updating cfs_rq->throttled_pelt_idle.
- */
- smp_wmb();
if (unlikely(cfs_rq->throttle_count))
u64_u32_store(cfs_rq->throttled_pelt_idle, U64_MAX);
else
--
2.17.1

> > + smp_wmb();
> > + if (unlikely(cfs_rq->throttle_count))
> > + u64_u32_store(cfs_rq->throttled_pelt_idle, U64_MAX);
> > + else
> > + u64_u32_store(cfs_rq->throttled_pelt_idle,
> > + cfs_rq->throttled_clock_pelt_time);
> > }
> >
> > -#ifdef CONFIG_CFS_BANDWIDTH
> > /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
> > static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
> > {
> > @@ -150,6 +175,7 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
> > return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
> > }
> > #else
> > +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
> > static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
> > {
> > return rq_clock_pelt(rq_of(cfs_rq));
> > @@ -204,6 +230,7 @@ update_rq_clock_pelt(struct rq *rq, s64 delta) { }
> > static inline void
> > update_idle_rq_clock_pelt(struct rq *rq) { }
> >
> > +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
> > #endif
> >
> >
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index e2cf6e48b165..ea9365e1a24e 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -641,6 +641,10 @@ struct cfs_rq {
> > int runtime_enabled;
> > s64 runtime_remaining;
> >
> > + u64 throttled_pelt_idle;
> > +#ifndef CONFIG_64BIT
> > + u64 throttled_pelt_idle_copy;
> > +#endif
> > u64 throttled_clock;
> > u64 throttled_clock_pelt;
> > u64 throttled_clock_pelt_time;
> > @@ -1013,6 +1017,12 @@ struct rq {
> > u64 clock_task ____cacheline_aligned;
> > u64 clock_pelt;
> > unsigned long lost_idle_time;
> > + u64 clock_pelt_idle;
> > + u64 enter_idle;
> > +#ifndef CONFIG_64BIT
> > + u64 clock_pelt_idle_copy;
> > + u64 enter_idle_copy;
> > +#endif
> >
> > atomic_t nr_iowait;
> >
> > --
> > 2.25.1

Next message: Greg KH: "Re: [PATCH] kobject: Refactor kobject_set_name_vargs()"
Previous message: Xiaoyao Li: "Re: [RFC PATCH v6 011/104] KVM: TDX: Initialize TDX module when loading kvm_intel.ko"
In reply to: Dietmar Eggemann: "Re: [PATCH v8 2/7] sched/fair: Decay task PELT values during wakeup migration"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]