Re: [PATCH 00/10] steal tasks to improve CPU utilization

From: Subhra Mazumdar
Date: Fri Nov 02 2018 - 19:39:17 EST



On 10/22/18 7:59 AM, Steve Sistare wrote:
> When a CPU has no more CFS tasks to run, and idle_balance() fails to
> find a task, then attempt to steal a task from an overloaded CPU in the
> same LLC. Maintain and use a bitmap of overloaded CPUs to efficiently
> identify candidates. To minimize search time, steal the first migratable
> task that is found when the bitmap is traversed. For fairness, search
> for migratable tasks on an overloaded CPU in order of next to run.
>
> This simple stealing yields a higher CPU utilization than idle_balance()
> alone, because the search is cheap, so it may be called every time the CPU
> is about to go idle. idle_balance() does more work because it searches
> widely for the busiest queue, so to limit its CPU consumption, it declines
> to search if the system is too busy. Simple stealing does not offload the
> globally busiest queue, but it is much better than running nothing at all.
>
> The bitmap of overloaded CPUs is a new type of sparse bitmap, designed to
> reduce cache contention vs the usual bitmap when many threads concurrently
> set, clear, and visit elements.
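
To make sure I follow the contention argument, here is a minimal userspace
sketch of how I picture it: one overload flag per CPU, padded out to its own
cache line so concurrent set/clear on different CPUs don't bounce the same
line, plus a wrapping scan that returns the first overloaded candidate. This
is only an illustration of the idea, not the sparse bitmap code from your
series; all names below are made up.

/* Illustrative only: per-CPU overload flags, one per cache line. */
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS      88          /* the X6-2 used in the results below */
#define CACHELINE  64

struct overload_flag {
        _Atomic int overloaded;
        char pad[CACHELINE - sizeof(_Atomic int)];
};

static struct overload_flag overload_set[NCPUS]
        __attribute__((aligned(CACHELINE)));

/* Set/clear when a CPU gains or loses its second runnable CFS task. */
static void mark_overloaded(int cpu, int val)
{
        atomic_store_explicit(&overload_set[cpu].overloaded, val,
                              memory_order_relaxed);
}

/*
 * Steal-side scan: visit the other CPUs starting after this_cpu, wrapping
 * around, and return the first one currently marked overloaded, or -1.
 * The real patch restricts this to CPUs in the same LLC and then tries to
 * detach one migratable task from that CPU's runqueue.
 */
static int find_overloaded_cpu(int this_cpu)
{
        int i, cpu;

        for (i = 1; i < NCPUS; i++) {
                cpu = (this_cpu + i) % NCPUS;
                if (atomic_load_explicit(&overload_set[cpu].overloaded,
                                         memory_order_relaxed))
                        return cpu;
        }
        return -1;
}

int main(void)
{
        mark_overloaded(7, 1);
        printf("steal candidate for cpu 3: %d\n", find_overloaded_cpu(3));
        return 0;
}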

Is the bitmask saving much? I tried a simple stealing approach that just
starts searching the domain from the current CPU and steals a thread from
the first CPU that has more than one runnable thread. It seems to perform
similarly to your patch.

hackbench on X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
groups (tasks)    baseline   %stdev    patch (%speedup)    %stdev
1  (40 tasks)     0.5524     2.36      0.5522 (0.045%)     3.82
2  (80 tasks)     0.6482     11.4      0.7241 (-11.7%)     20.34
4  (160 tasks)    0.9756     0.95      0.8276 (15.1%)      5.8
8  (320 tasks)    1.7699     1.62      1.6655 (5.9%)       1.57
16 (640 tasks)    3.1018     0.77      2.9858 (3.74%)      1.4
32 (1280 tasks)   5.565      0.62      5.3388 (4.1%)       0.72

X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Oracle database OLTP, logging _enabled_

Users %speedup
20 1.2
40 -0.41
60 0.83
80 2.37
100 1.54
120 3.0
140 2.24
160 1.82
180 1.94
200 2.23
220 1.49

Below is the patch (not in best shape):

----------->8--------------------

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f808ddf..1690451 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3540,6 +3540,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 }

 static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
+static int my_idle_balance(struct rq *this_rq, struct rq_flags *rf);

 static inline unsigned long task_util(struct task_struct *p)
 {
@@ -6619,6 +6620,8 @@ done: __maybe_unused;

 idle:
         new_tasks = idle_balance(rq, rf);
+        if (new_tasks == 0)
+                new_tasks = my_idle_balance(rq, rf);

         /*
          * Because idle_balance() releases (and re-acquires) rq->lock, it is
@@ -8434,6 +8437,75 @@ static int should_we_balance(struct lb_env *env)
         return balance_cpu == env->dst_cpu;
 }

+int get_best_cpu(int this_cpu, struct sched_domain *sd)
+{
+        struct rq *this_rq, *rq;
+        int i;
+        int best_cpu = -1;
+
+        this_rq = cpu_rq(this_cpu);
+        for_each_cpu_wrap(i, sched_domain_span(sd), this_cpu) {
+                if (this_rq->nr_running > 0)
+                        return (-1);
+                if (i == this_cpu)
+                        continue;
+                rq = cpu_rq(i);
+                if (rq->nr_running <= 1)
+                        continue;
+                best_cpu = i;
+                break;
+        }
+        return (best_cpu);
+}
+static int my_load_balance(int this_cpu, struct rq *this_rq,
+                           struct sched_domain *sd, enum cpu_idle_type idle)
+{
+        int ld_moved = 0;
+        struct rq *busiest;
+        unsigned long flags;
+        struct task_struct *p = NULL;
+        struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+        int best_cpu;
+
+        struct lb_env env = {
+                .sd             = sd,
+                .dst_cpu        = this_cpu,
+                .dst_rq         = this_rq,
+                .dst_grpmask    = sched_group_span(sd->groups),
+                .idle           = idle,
+                .cpus           = cpus,
+                .tasks          = LIST_HEAD_INIT(env.tasks),
+        };
+
+        if (idle == CPU_NEWLY_IDLE)
+                env.dst_grpmask = NULL;
+
+        cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
+
+        best_cpu = get_best_cpu(this_cpu, sd);
+
+        if (best_cpu >= 0)
+                busiest = cpu_rq(best_cpu);
+        else
+                goto out;
+
+        env.src_cpu = busiest->cpu;
+        env.src_rq = busiest;
+        raw_spin_lock_irqsave(&busiest->lock, flags);
+
+        p = detach_one_task(&env);
+        raw_spin_unlock(&busiest->lock);
+        if (p) {
+                attach_one_task(this_rq, p);
+                ld_moved++;
+        }
+        local_irq_restore(flags);
+
+out:
+        return ld_moved;
+}
+
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -9369,6 +9441,71 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle
 static inline void nohz_newidle_balance(struct rq *this_rq) { }
 #endif /* CONFIG_NO_HZ_COMMON */

+
+static int my_idle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+        int this_cpu = this_rq->cpu;
+        struct sched_domain *sd;
+        int pulled_task = 0;
+        int level;
+
+        /*
+         * We must set idle_stamp _before_ calling idle_balance(), such that we
+         * measure the duration of idle_balance() as idle time.
+         */
+        this_rq->idle_stamp = rq_clock(this_rq);
+
+        rq_unpin_lock(this_rq, rf);
+
+        /*
+         * Drop the rq->lock, but keep IRQ/preempt disabled.
+         */
+        raw_spin_unlock(&this_rq->lock);
+
+        rcu_read_lock();
+
+        level = 0;
+        for_each_domain(this_cpu, sd) {
+                if (level++ > 1)
+                        break;
+
+                if (!(sd->flags & SD_LOAD_BALANCE))
+                        continue;
+
+                if (sd->flags & SD_BALANCE_NEWIDLE)
+                        pulled_task = my_load_balance(this_cpu, this_rq,
+                                                      sd, CPU_NEWLY_IDLE);
+
+                /*
+                 * Stop searching for tasks to pull if there are
+                 * now runnable tasks on this rq.
+                 */
+                if (pulled_task || this_rq->nr_running > 0)
+                        break;
+        }
+        rcu_read_unlock();
+
+        raw_spin_lock(&this_rq->lock);
+
+        /*
+         * While browsing the domains, we released the rq lock, a task could
+         * have been enqueued in the meantime. Since we're not going idle,
+         * pretend we pulled a task.
+         */
+        if (this_rq->cfs.h_nr_running && !pulled_task)
+                pulled_task = 1;
+out:
+        /* Is there a task of a high priority class? */
+        if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+                pulled_task = -1;
+
+        if (pulled_task)
+                this_rq->idle_stamp = 0;
+
+        rq_repin_lock(this_rq, rf);
+        return pulled_task;
+}
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.