Re: [RFC v5 4/6] sched/fair: Tune task wake-up logic to pack small background tasks on fewer cores

From: Vincent Guittot
Date: Mon Oct 07 2019 - 08:20:15 EST


On Mon, 7 Oct 2019 at 10:31, Parth Shah <parth@xxxxxxxxxxxxx> wrote:
>
> The algorithm finds the first non idle core in the system and tries to
> place a task in the idle CPU in the chosen core. To maintain
> cache hotness, work of finding non idle core starts from the prev_cpu,
> which also reduces task ping-pong behaviour inside of the core.
>
> Define a new method to select_non_idle_core which keep tracks of the idle
> and non-idle CPUs in the core and based on the heuristics determines if the
> core is sufficiently busy to place the incoming backgroung task. The
> heuristic further defines the non-idle CPU into either busy (>12.5% util)
> CPU and overutilized (>80% util) CPU.
> - The core containing more idle CPUs and no busy CPUs is not selected for
> packing
> - The core if contains more than 1 overutilized CPUs are exempted from
> task packing
> - Pack if there is atleast one busy CPU and overutilized CPUs count is <2
>
> Value of 12.5% utilization for busy CPU gives sufficient heuristics for CPU
> doing enough work and not become idle in nearby timeframe.
>
> Signed-off-by: Parth Shah <parth@xxxxxxxxxxxxx>
> ---
> kernel/sched/core.c | 3 ++
> kernel/sched/fair.c | 95 ++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 97 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 6e1ae8046fe0..7e3aff59540a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6402,6 +6402,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
>
> DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
> DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
> +DECLARE_PER_CPU(cpumask_var_t, turbo_sched_mask);
>
> void __init sched_init(void)
> {
> @@ -6442,6 +6443,8 @@ void __init sched_init(void)
> cpumask_size(), GFP_KERNEL, cpu_to_node(i));
> per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
> cpumask_size(), GFP_KERNEL, cpu_to_node(i));
> + per_cpu(turbo_sched_mask, i) = (cpumask_var_t)kzalloc_node(
> + cpumask_size(), GFP_KERNEL, cpu_to_node(i));
> }
> #endif /* CONFIG_CPUMASK_OFFSTACK */
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b798fe7ff7cd..d4a1b6474338 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5353,6 +5353,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> /* Working cpumask for: load_balance, load_balance_newidle. */
> DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
> DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
> +/* A cpumask to find active cores in the system. */
> +DEFINE_PER_CPU(cpumask_var_t, turbo_sched_mask);
>
> #ifdef CONFIG_NO_HZ_COMMON
>
> @@ -5964,6 +5966,76 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
> return cpu;
> }
>
> +#ifdef CONFIG_SCHED_SMT
> +static inline bool is_background_task(struct task_struct *p)
> +{
> + if (p->flags & PF_CAN_BE_PACKED)
> + return true;
> +
> + return false;
> +}
> +
> +#define busyness_threshold (100 >> 3)
> +#define is_cpu_busy(util) ((util) > busyness_threshold)
> +
> +/*
> + * Try to find a non idle core in the system based on few heuristics:
> + * - Keep track of overutilized (>80% util) and busy (>12.5% util) CPUs
> + * - If none CPUs are busy then do not select the core for task packing
> + * - If atleast one CPU is busy then do task packing unless overutilized CPUs
> + * count is < busy/2 CPU count
> + * - Always select idle CPU for task packing
> + */
> +static int select_non_idle_core(struct task_struct *p, int prev_cpu, int target)
> +{
> + struct cpumask *cpus = this_cpu_cpumask_var_ptr(turbo_sched_mask);
> + int iter_cpu, sibling;
> +
> + cpumask_and(cpus, cpu_online_mask, p->cpus_ptr);
> +
> + for_each_cpu_wrap(iter_cpu, cpus, prev_cpu) {
> + int idle_cpu_count = 0, non_idle_cpu_count = 0;
> + int overutil_cpu_count = 0;
> + int busy_cpu_count = 0;
> + int best_cpu = iter_cpu;
> +
> + for_each_cpu(sibling, cpu_smt_mask(iter_cpu)) {
> + __cpumask_clear_cpu(sibling, cpus);
> + if (idle_cpu(iter_cpu)) {
> + idle_cpu_count++;
> + best_cpu = iter_cpu;
> + } else {
> + non_idle_cpu_count++;
> + if (cpu_overutilized(iter_cpu))
> + overutil_cpu_count++;
> + if (is_cpu_busy(cpu_util(iter_cpu)))
> + busy_cpu_count++;
> + }
> + }
> +
> + /*
> + * Pack tasks to this core if
> + * 1. Idle CPU count is higher and atleast one is busy
> + * 2. If idle_cpu_count < non_idle_cpu_count then ideally do
> + * packing but if there are more CPUs overutilized then don't
> + * overload it.

Could you give details about the rationale behind these conditions ?

> + */
> + if (idle_cpu_count > non_idle_cpu_count) {
> + if (busy_cpu_count)
> + return best_cpu;
> + } else {
> + /*
> + * Pack tasks if at max 1 CPU is overutilized
> + */
> + if (overutil_cpu_count < 2)
> + return best_cpu;
> + }
> + }
> +
> + return select_idle_sibling(p, prev_cpu, target);
> +}
> +#endif /* CONFIG_SCHED_SMT */
> +
> /*
> * Try and locate an idle core/thread in the LLC cache domain.
> */
> @@ -6418,6 +6490,23 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> return -1;
> }
>
> +#ifdef CONFIG_SCHED_SMT
> +/*
> + * Select all classified background tasks for task packing
> + */
> +static inline int turbosched_select_non_idle_core(struct task_struct *p,
> + int prev_cpu, int target)
> +{
> + return select_non_idle_core(p, prev_cpu, target);
> +}
> +#else
> +static inline int turbosched_select_non_idle_core(struct task_struct *p,
> + int prev_cpu, int target)
> +{
> + return select_idle_sibling(p, prev_cpu, target);

should be better to make turbosched_select_non_idle_core empty and
make sure that __turbo_sched_enabled is never enabled if
CONFIG_SCHED_SMT is disabled

> +}
> +#endif
> +
> /*
> * select_task_rq_fair: Select target runqueue for the waking task in domains
> * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -6483,7 +6572,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
> } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
> /* Fast path */
>
> - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
> + if (is_turbosched_enabled() && unlikely(is_background_task(p)))
> + new_cpu = turbosched_select_non_idle_core(p, prev_cpu,
> + new_cpu);

Could you add turbosched_select_non_idle_core() similarly to
find_energy_efficient_cpu() ?
Add it at the beg select_task_rq_fair()
Return immediately with theCPU if you have found one
Or let the normal path select a CPU if the
turbosched_select_non_idle_core() has not been able to find a suitable
CPU for packing


> + else
> + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
>
> if (want_affine)
> current->recent_used_cpu = cpu;
> --
> 2.17.1
>