RE: hotplug support for arch/arc/plat-eznps platform

From: Ofer Levi(SW)
Date: Mon Aug 14 2017 - 03:55:06 EST


Sorry for the late response, but this patch is a drawback. It's back to about 0.4 sec per cpu bring up.
This is when possible, present and isolcpus are 16-4095
Most time is spent at:
register_sched_domain_sysctl() calling sd_sysctl_header = register_sysctl_table(sd_ctl_root);

[ 22.150000] ## CPU16 LIVE ##: Executing Code...
[ 22.170000] partition_sched_domains start
[ 22.220000] register_sched_domain_sysctl start
[ 22.580000] register_sched_domain_sysctl end
[ 22.580000] partition_sched_domains end


> BTW, what physical size does your toy have? I'm thinking it's less than
> multiple racks worth like the SGI systems were.
It's a single chip with 4K cpus, capable of 400Gbps duplex. Evaluation board is pizza box size.

Thanks


> -----Original Message-----
> From: Peter Zijlstra [mailto:peterz@xxxxxxxxxxxxx]
> Sent: Thursday, August 10, 2017 6:45 PM
> To: Ofer Levi(SW) <oferle@xxxxxxxxxxxx>
> Cc: rusty@xxxxxxxxxxxxxxx; mingo@xxxxxxxxxx;
> Vineet.Gupta1@xxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; Tejun Heo
> <tj@xxxxxxxxxx>
> Subject: Re: hotplug support for arch/arc/plat-eznps platform
>
> On Thu, Aug 10, 2017 at 11:19:05AM +0200, Peter Zijlstra wrote:
> > On Thu, Aug 10, 2017 at 07:40:16AM +0000, Ofer Levi(SW) wrote:
> > > Well, this definitely has pleased the little toy :) Thank you. I
> > > really appreciate your time and effort.
> > >
> > > If I may, one more newbie question. What do I need to do for the two
> > > patches to find their way into formal kernel code?
> >
> > I'll split the first patch into two separate patches and line them up.
> >
> > I'm not sure about this last patch, I'll speak with Ingo once he's
> > back to see what would be the thing to do here.
> >
> > I suspect we can make it work, that sysctl stuff is only debug crud
> > after all and that should never get in the way of getting work done.
>
> Can you test this instead of the second patch? It should have the same
> effect.
>
>
> ---
> Subject: sched/debug: Optimize sched_domain sysctl generation
> From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Date: Thu Aug 10 17:10:26 CEST 2017
>
> Currently we unconditionally destroy all sysctl bits and regenerate them after
> we've rebuilt the domains (even if that rebuild is a no-op).
>
> And since we unconditionally (re)build the sysctl for all possible CPUs,
> onlining all CPUs gets us O(n^2) time. Instead change this to only rebuild the
> bits for CPUs we've actually installed new domains on.
>
> Reported-by: "Ofer Levi(SW)" <oferle@xxxxxxxxxxxx>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> ---
> kernel/sched/debug.c | 68
> ++++++++++++++++++++++++++++++++++++++----------
> kernel/sched/sched.h | 4 ++
> kernel/sched/topology.c | 1
> 3 files changed, 59 insertions(+), 14 deletions(-)
>
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cp
> return table;
> }
>
> +static cpumask_var_t sd_sysctl_cpus;
> static struct ctl_table_header *sd_sysctl_header;
> +
> void register_sched_domain_sysctl(void)
> {
> - int i, cpu_num = num_possible_cpus();
> - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
> + static struct ctl_table *cpu_entries;
> + static struct ctl_table **cpu_idx;
> char buf[32];
> + int i;
> +
> + if (!cpu_entries) {
> + cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
> + if (!cpu_entries)
> + return;
> +
> + WARN_ON(sd_ctl_dir[0].child);
> + sd_ctl_dir[0].child = cpu_entries;
> + }
> +
> + if (!cpu_idx) {
> + struct ctl_table *e = cpu_entries;
> +
> + cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*),
> GFP_KERNEL);
> + if (!cpu_idx)
> + return;
> +
> + /* deal with sparse possible map */
> + for_each_possible_cpu(i) {
> + cpu_idx[i] = e;
> + e++;
> + }
> + }
>
> - WARN_ON(sd_ctl_dir[0].child);
> - sd_ctl_dir[0].child = entry;
> + if (!cpumask_available(sd_sysctl_cpus)) {
> + if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
> + return;
>
> - if (entry == NULL)
> - return;
> + /* init to possible to not have holes in @cpu_entries */
> + cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
> + }
> +
> + for_each_cpu(i, sd_sysctl_cpus) {
> + struct ctl_table *e = cpu_idx[i];
> +
> + if (e->child)
> + sd_free_ctl_entry(&e->child);
> +
> + if (!e->procname) {
> + snprintf(buf, 32, "cpu%d", i);
> + e->procname = kstrdup(buf, GFP_KERNEL);
> + }
> + e->mode = 0555;
> + e->child = sd_alloc_ctl_cpu_table(i);
>
> - for_each_possible_cpu(i) {
> - snprintf(buf, 32, "cpu%d", i);
> - entry->procname = kstrdup(buf, GFP_KERNEL);
> - entry->mode = 0555;
> - entry->child = sd_alloc_ctl_cpu_table(i);
> - entry++;
> + __cpumask_clear_cpu(i, sd_sysctl_cpus);
> }
>
> WARN_ON(sd_sysctl_header);
> sd_sysctl_header = register_sysctl_table(sd_ctl_root);
> }
>
> +void dirty_sched_domain_sysctl(int cpu) {
> + if (cpumask_available(sd_sysctl_cpus))
> + __cpumask_set_cpu(cpu, sd_sysctl_cpus); }
> +
> /* may be called multiple times per register */ void
> unregister_sched_domain_sysctl(void)
> {
> unregister_sysctl_table(sd_sysctl_header);
> sd_sysctl_header = NULL;
> - if (sd_ctl_dir[0].child)
> - sd_free_ctl_entry(&sd_ctl_dir[0].child);
> }
> #endif /* CONFIG_SYSCTL */
> #endif /* CONFIG_SMP */
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sche
>
> #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) void
> register_sched_domain_sysctl(void);
> +void dirty_sched_domain_sysctl(int cpu);
> void unregister_sched_domain_sysctl(void);
> #else
> static inline void register_sched_domain_sysctl(void)
> {
> }
> +static inline void dirty_sched_domain_sysctl(int cpu) { }
> static inline void unregister_sched_domain_sysctl(void)
> {
> }
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -461,6 +461,7 @@ cpu_attach_domain(struct sched_domain *s
> rq_attach_root(rq, rd);
> tmp = rq->sd;
> rcu_assign_pointer(rq->sd, sd);
> + dirty_sched_domain_sysctl(cpu);
> destroy_sched_domains(tmp);
>
> update_top_cache_domain(cpu);