Re: [PATCH v5 4/5] sched/fair: Skip SIS domain scan if fully busy

From: Yicong Yang
Date: Wed Sep 14 2022 - 02:21:38 EST


On 2022/9/9 13:53, Abel Wu wrote:
> If a full domain scan failed, then no unoccupied cpus available
> and the LLC is fully busy. In this case we'd better use cpus
> more wisely, rather than wasting it trying to find an idle cpu
> that probably not exist. The fully busy status will be cleared
> when any cpu of that LLC goes idle and everything goes back to
> normal again.
>
> Make the has_idle_cores boolean hint more rich by turning it
> into a state machine.
>
> Signed-off-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
> ---
> include/linux/sched/topology.h | 35 +++++++++++++++++-
> kernel/sched/fair.c | 67 ++++++++++++++++++++++++++++------
> 2 files changed, 89 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 816df6cc444e..cc6089765b64 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -77,10 +77,43 @@ extern int sched_domain_level_max;
>
> struct sched_group;
>
> +/*
> + * States of the sched-domain
> + *
> + * - sd_has_icores
> + * This state is only used in LLC domains to indicate worthy
> + * of a full scan in SIS due to idle cores available.
> + *
> + * - sd_has_icpus
> + * This state indicates that unoccupied (sched-idle/idle) cpus
> + * might exist in this domain. For the LLC domains it is the
> + * default state since these cpus are the main targets of SIS
> + * search, and is also used as a fallback state of the other
> + * states.
> + *
> + * - sd_is_busy
> + * This state indicates there are no unoccupied cpus in this
> + * domain. So for LLC domains, it gives the hint on whether
> + * we should put efforts on the SIS search or not.
> + *
> + * For LLC domains, sd_has_icores is set when the last non-idle cpu of
> + * a core becomes idle. After a full SIS scan and if no idle cores found,
> + * sd_has_icores must be cleared and the state will be set to sd_has_icpus
> + * or sd_is_busy depending on whether there is any idle cpu. And during
> + * load balancing on each SMT domain inside the LLC, the state will be
> + * re-evaluated and switch from sd_is_busy to sd_has_icpus if idle cpus
> + * exist.
> + */
> +enum sd_state {
> + sd_has_icores,
> + sd_has_icpus,
> + sd_is_busy
> +};
> +
> struct sched_domain_shared {
> atomic_t ref;
> atomic_t nr_busy_cpus;
> - int has_idle_cores;
> + enum sd_state state;
> int nr_idle_scan;
> };
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index fad289530e07..25df73c7e73c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6262,26 +6262,47 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p)
> DEFINE_STATIC_KEY_FALSE(sched_smt_present);
> EXPORT_SYMBOL_GPL(sched_smt_present);
>
> -static inline void set_idle_cores(int cpu, int val)
> +static inline void set_llc_state(int cpu, enum sd_state state)
> {
> struct sched_domain_shared *sds;
>
> sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> if (sds)
> - WRITE_ONCE(sds->has_idle_cores, val);
> + WRITE_ONCE(sds->state, state);
> }
>
> -static inline bool test_idle_cores(int cpu, bool def)
> +static inline enum sd_state get_llc_state(int cpu, enum sd_state def)
> {
> struct sched_domain_shared *sds;
>
> sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> if (sds)
> - return READ_ONCE(sds->has_idle_cores);
> + return READ_ONCE(sds->state);
>
> return def;
> }
>
> +static inline void clear_idle_cpus(int cpu)
> +{
> + set_llc_state(cpu, sd_is_busy);
> +}
> +
> +static inline bool test_idle_cpus(int cpu)
> +{
> + return get_llc_state(cpu, sd_has_icpus) != sd_is_busy;
> +}
> +
> +static inline void set_idle_cores(int cpu, int core_idle)
> +{
> + set_llc_state(cpu, core_idle ? sd_has_icores : sd_has_icpus);
> +}
> +
> +static inline bool test_idle_cores(int cpu, bool def)
> +{
> + return sd_has_icores ==
> + get_llc_state(cpu, def ? sd_has_icores : sd_has_icpus);
> +}
> +
> /*
> * Scans the local SMT mask to see if the entire core is idle, and records this
> * information in sd_llc_shared->has_idle_cores.
> @@ -6291,25 +6312,29 @@ static inline bool test_idle_cores(int cpu, bool def)
> */
> void __update_idle_core(struct rq *rq)
> {
> - int core = cpu_of(rq);
> - int cpu;
> + enum sd_state old, new = sd_has_icores;
> + int core = cpu_of(rq), cpu;
>
> if (rq->ttwu_pending)
> return;
>
> rcu_read_lock();
> - if (test_idle_cores(core, true))
> + old = get_llc_state(core, sd_has_icores);
> + if (old == sd_has_icores)
> goto unlock;
>
> for_each_cpu(cpu, cpu_smt_mask(core)) {
> if (cpu == core)
> continue;
>
> - if (!available_idle_cpu(cpu))
> - goto unlock;
> + if (!available_idle_cpu(cpu)) {
> + new = sd_has_icpus;
> + break;
> + }
> }
>
> - set_idle_cores(core, 1);
> + if (old != new)
> + set_llc_state(core, new);
> unlock:
> rcu_read_unlock();
> }

We'll reach this function only when SMT is active (sched_smt_present == True)...

> @@ -6370,6 +6395,15 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
>
> #else /* CONFIG_SCHED_SMT */
>
> +static inline void clear_idle_cpus(int cpu)
> +{
> +}
> +
> +static inline bool test_idle_cpus(int cpu)
> +{
> + return true;
> +}
> +
> static inline void set_idle_cores(int cpu, int val)
> {
> }
> @@ -6406,6 +6440,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> struct sched_domain *this_sd;
> u64 time = 0;
>
> + if (!test_idle_cpus(target))
> + return -1;
> +

...and on a non-SMT machine, we'll always fail here after the first call to clear_idle_cpus() below,
since there is no place where sds->state gets set back to sd_has_icpus. You may also need to update
sds->state in set_next_task_idle() when SMT is inactive.

> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
> if (!this_sd)
> return -1;
> @@ -6482,8 +6519,14 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
> }
> }
>
> - if (has_idle_core)
> - set_idle_cores(target, false);
> + /*
> + * If no idle cpu can be found, set LLC state to busy to prevent
> + * us from SIS domain scan to save a few cycles.
> + */
> + if (idle_cpu == -1)
> + clear_idle_cpus(target);
> + else if (has_idle_core)
> + set_idle_cores(target, 0);
>
> if (sched_feat(SIS_PROP) && !has_idle_core) {
> time = cpu_clock(this) - time;
>