[RFC][PATCH 4/7] sched: Replace sd_busy/nr_busy_cpus with sched_domain_shared

From: Peter Zijlstra
Date: Mon May 09 2016 - 06:58:20 EST


Move the nr_busy_cpus thing from its hacky sd->parent->groups->sgc
location into the much more natural sched_domain_shared location.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
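Not part of the patch proper: below is a stand-alone C sketch of the access-path
change described above, for reviewers who want the before/after side by side.
The types are simplified stand-ins for the kernel structs (atomic_t is faked
with a plain int, and the RCU/per-cpu accessors are left out); only the members
touched by this series are mirrored, everything else is illustrative.

/*
 * Illustrative only -- not kernel code. Builds with a plain C compiler.
 */
#include <stdio.h>

struct sched_group_capacity { int nr_busy_cpus; };      /* old home of the count */
struct sched_group          { struct sched_group_capacity *sgc; };
struct sched_domain_shared  { int ref; int nr_busy_cpus; };     /* new home */

struct sched_domain {
        struct sched_domain *parent;
        struct sched_group *groups;
        struct sched_domain_shared *shared;
};

/* Before: walk up to the parent domain, into its group's capacity struct. */
static int nr_busy_old(struct sched_domain *llc_sd)
{
        return llc_sd->parent->groups->sgc->nr_busy_cpus;
}

/* After: the LLC domain's shared state holds the count directly. */
static int nr_busy_new(struct sched_domain *llc_sd)
{
        return llc_sd->shared->nr_busy_cpus;
}

int main(void)
{
        struct sched_group_capacity sgc = { .nr_busy_cpus = 3 };
        struct sched_group grp = { .sgc = &sgc };
        struct sched_domain parent = { .groups = &grp };
        struct sched_domain_shared sds = { .ref = 1, .nr_busy_cpus = 3 };
        struct sched_domain llc = { .parent = &parent, .shared = &sds };

        printf("old: %d, new: %d\n", nr_busy_old(&llc), nr_busy_new(&llc));
        return 0;
}

The practical difference is that readers such as nohz_kick_needed() now follow
a single pointer from the LLC domain's shared state, instead of going through
the parent domain and its first group, which is what sd_busy used to encode.
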
include/linux/sched.h    |    1 +
kernel/sched/core.c      |   10 +++++-----
kernel/sched/fair.c      |   22 ++++++++++++----------
kernel/sched/sched.h     |    6 +-----
kernel/time/tick-sched.c |   10 +++++-----
5 files changed, 24 insertions(+), 25 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1059,6 +1059,7 @@ struct sched_group;

struct sched_domain_shared {
atomic_t ref;
+ atomic_t nr_busy_cpus;
};

struct sched_domain {
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5866,14 +5866,14 @@ static void destroy_sched_domains(struct
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_busy);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);

static void update_top_cache_domain(int cpu)
{
+ struct sched_domain_shared *sds = NULL;
struct sched_domain *sd;
- struct sched_domain *busy_sd = NULL;
int id = cpu;
int size = 1;

@@ -5881,13 +5881,13 @@ static void update_top_cache_domain(int
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- busy_sd = sd->parent; /* sd_busy */
+ sds = sd->shared;
}
- rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
+ rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
@@ -6184,7 +6184,6 @@ static void init_sched_groups_capacity(i
return;

update_group_capacity(sd, cpu);
- atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}

/*
@@ -6388,6 +6387,7 @@ sd_init(struct sched_domain_topology_lev

sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
atomic_inc(&sd->shared->ref);
+ atomic_set(&sd->shared->nr_busy_cpus, sd_weight);

#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7842,13 +7842,13 @@ static inline void set_cpu_sd_state_busy
int cpu = smp_processor_id();

rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));

if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;

- atomic_inc(&sd->groups->sgc->nr_busy_cpus);
+ atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -7859,13 +7859,13 @@ void set_cpu_sd_state_idle(void)
int cpu = smp_processor_id();

rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));

if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;

- atomic_dec(&sd->groups->sgc->nr_busy_cpus);
+ atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -8092,8 +8092,8 @@ static void nohz_idle_balance(struct rq
static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
+ struct sched_domain_shared *sds;
struct sched_domain *sd;
- struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
bool kick = false;

@@ -8121,11 +8121,13 @@ static inline bool nohz_kick_needed(stru
return true;

rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd) {
- sgc = sd->groups->sgc;
- nr_busy = atomic_read(&sgc->nr_busy_cpus);
-
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * XXX: write a coherent comment on why we do this.
+ * See also: http://lkml.kernel.org/r/20111202010832.602203411@xxxxxxxxxxxxxxxxxxxxxxxxxx
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
kick = true;
goto unlock;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -856,8 +856,8 @@ static inline struct sched_domain *lowes
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);

struct sched_group_capacity {
@@ -869,10 +869,6 @@ struct sched_group_capacity {
unsigned int capacity;
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
- /*
- * Number of busy cpus in this group.
- */
- atomic_t nr_busy_cpus;

unsigned long cpumask[0]; /* iteration mask */
};
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void)
WARN_ON_ONCE(irqs_disabled());

/*
- * Update the idle state in the scheduler domain hierarchy
- * when tick_nohz_stop_sched_tick() is called from the idle loop.
- * State will be updated to busy during the first busy tick after
- * exiting idle.
- */
+ * Update the idle state in the scheduler domain hierarchy
+ * when tick_nohz_stop_sched_tick() is called from the idle loop.
+ * State will be updated to busy during the first busy tick after
+ * exiting idle.
+ */
set_cpu_sd_state_idle();

local_irq_disable();