Re: [PATCH v3 2/6] drivers/cpufreq: implement init_cpu_capacity_default()

From: Vincent Guittot
Date: Wed Feb 03 2016 - 16:05:08 EST


On 3 February 2016 at 12:59, Juri Lelli <juri.lelli@xxxxxxx> wrote:
> To get default values for CPUs capacity we profile a simple (bogus)
> integer benchmark on such CPUs; then we normalize results to 1024
> (highest capacity in the system).
>
> Architectures that want this during boot have to define a weak function
> (arch_wants_init_cpu_capacity) to return true.
>
> Also, kernel has to boot with init_cpu_capacity parameter if profiling
> is needed, as it can be expensive and might add ~1 sec to boot time.
>
> Cc: Russell King <linux@xxxxxxxxxxxxxxxx>
> Cc: Catalin Marinas <catalin.marinas@xxxxxxx>
> Cc: Will Deacon <will.deacon@xxxxxxx>
> Cc: "Rafael J. Wysocki" <rjw@xxxxxxxxxxxxx>
> Cc: Viresh Kumar <viresh.kumar@xxxxxxxxxx>
> Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> Cc: Sudeep Holla <sudeep.holla@xxxxxxx>
> Cc: Mark Rutland <mark.rutland@xxxxxxx>
> Signed-off-by: Juri Lelli <juri.lelli@xxxxxxx>
> ---
> Changes since v1:
> - add kernel command line parameter to enable profiling
> - add define for max trials
>
> Documentation/kernel-parameters.txt | 4 +
> arch/arm/kernel/topology.c | 2 +-
> arch/arm64/kernel/topology.c | 12 +++
> drivers/cpufreq/Makefile | 2 +-
> drivers/cpufreq/cpufreq.c | 1 +
> drivers/cpufreq/cpufreq_capacity.c | 174 ++++++++++++++++++++++++++++++++++++
> include/linux/cpufreq.h | 2 +
> 7 files changed, 195 insertions(+), 2 deletions(-)
> create mode 100644 drivers/cpufreq/cpufreq_capacity.c
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index 87d40a7..fad2b89 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1570,6 +1570,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
>
> initrd= [BOOT] Specify the location of the initial ramdisk
>
> + init_cpu_capacity
> + [KNL,ARM] Enables dynamic CPUs capacity benchmarking
> + at boot.
> +
> inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver
> Format: <irq>
>
> diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
> index ec279d1..c9c87a5 100644
> --- a/arch/arm/kernel/topology.c
> +++ b/arch/arm/kernel/topology.c
> @@ -47,7 +47,7 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
> return per_cpu(cpu_scale, cpu);
> }
>
> -static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
> +void set_capacity_scale(unsigned int cpu, unsigned long capacity)
> {
> per_cpu(cpu_scale, cpu) = capacity;
> }
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index 694f6de..3b75d63 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -23,6 +23,18 @@
> #include <asm/cputype.h>
> #include <asm/topology.h>
>
> +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
> +
> +unsigned long arm_arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
> +{
> + return per_cpu(cpu_scale, cpu);
> +}
> +
> +void set_capacity_scale(unsigned int cpu, unsigned long capacity)
> +{
> + per_cpu(cpu_scale, cpu) = capacity;
> +}
> +
> static int __init get_cpu_for_node(struct device_node *node)
> {
> struct device_node *cpu_node;
> diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
> index 9e63fb1..c4025fd 100644
> --- a/drivers/cpufreq/Makefile
> +++ b/drivers/cpufreq/Makefile
> @@ -1,5 +1,5 @@
> # CPUfreq core
> -obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o
> +obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o cpufreq_capacity.o

Do you really want to make the calibration of capacity dependent on
cpufreq? It means that we can't use it without a cpufreq driver.
IMHO, this creates an unnecessary dependency. I understand that you
must ensure that the core runs at max frequency if a driver is present,
but you should still be able to calibrate the capacity when cpufreq is
not available and the CPUs have different capacities because of their
micro-architecture.

>
> # CPUfreq stats
> obj-$(CONFIG_CPU_FREQ_STAT) += cpufreq_stats.o
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index e979ec7..b22afe8 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2440,6 +2440,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
> }
>
> register_hotcpu_notifier(&cpufreq_cpu_notifier);
> + cpufreq_init_cpu_capacity();
> pr_debug("driver %s up and running\n", driver_data->name);
>
> out:
> diff --git a/drivers/cpufreq/cpufreq_capacity.c b/drivers/cpufreq/cpufreq_capacity.c
> new file mode 100644
> index 0000000..e54310b
> --- /dev/null
> +++ b/drivers/cpufreq/cpufreq_capacity.c
> @@ -0,0 +1,174 @@
> +/*
> + * Default CPU capacity calculation for u-arch invariance
> + *
> + * Copyright (C) 2015 ARM Ltd.
> + * Juri Lelli <juri.lelli@xxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed "as is" WITHOUT ANY WARRANTY of any
> + * kind, whether express or implied; without even the implied warranty
> + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + */
> +#include <linux/cpufreq.h>
> +#include <linux/sched.h>
> +
> +#define MAX_TRIALS 10 /* how many times benchmark is executed */
> +static unsigned long long elapsed[NR_CPUS];
> +
> +/*
> + * Don't let compiler optimize following two functions; we want to avoid any
> + * microarchitecture specific optimization that compiler would do and favour
> + * one CPU vs. another. Also, my_int_sqrt is cut-and-paste from
> + * lib/int_sqrt.c.
> + */
> +static unsigned long __attribute__((optimize("O0")))
> +my_int_sqrt(unsigned long x)
> +{
> + unsigned long b, m, y = 0;
> +
> + if (x <= 1)
> + return x;
> +
> + m = 1UL << (BITS_PER_LONG - 2);
> + while (m != 0) {
> + b = y + m;
> + y >>= 1;
> +
> + if (x >= b) {
> + x -= b;
> + y += m;
> + }
> + m >>= 2;
> + }
> +
> + return y;
> +}
> +
> +static unsigned long __attribute__((optimize("O0")))
> +bogus_bench(void)
> +{
> + unsigned long i, res;
> +
> + for (i = 0; i < 100000; i++)
> + res = my_int_sqrt(i);
> +
> + return res;
> +}
> +
> +static int run_bogus_benchmark(int cpu)
> +{
> + int ret, trials = MAX_TRIALS;
> + u64 begin, end, sample, mean = 0, count = 0;
> + unsigned long res;
> +
> + ret = set_cpus_allowed_ptr(current, cpumask_of(cpu));
> + if (ret) {
> + pr_warn("%s: failed to set allowed ptr\n", __func__);
> + return -EINVAL;
> + }
> +
> + while (trials--) {
> + begin = local_clock();
> + res = bogus_bench();
> + end = local_clock();
> + sample = end - begin;
> +
> + mean = mean * count + sample;
> + mean = div64_u64(mean, ++count);
> + pr_debug("%s: cpu=%d begin=%llu end=%llu"
> + " sample=%llu mean=%llu count=%llu res=%lu\n",
> + __func__, cpu, begin, end, sample,
> + mean, count, res);
> + }
> + elapsed[cpu] = mean;
> +
> + ret = set_cpus_allowed_ptr(current, cpu_active_mask);
> + if (ret) {
> + pr_warn("%s: failed to set allowed ptr\n", __func__);
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +bool __weak arch_wants_init_cpu_capacity(void)
> +{
> + return false;
> +}
> +
> +void __weak set_capacity_scale(int cpu, unsigned long capacity) { }
> +
> +static __read_mostly bool init_cpu_capacity_enabled;
> +
> +static int __init init_cpu_capacity_setup(char *str)
> +{
> + init_cpu_capacity_enabled = true;
> +
> + return 0;
> +}
> +early_param("init_cpu_capacity", init_cpu_capacity_setup);
> +
> +void cpufreq_init_cpu_capacity(void)
> +{
> + int cpu, fcpu;
> + unsigned long long elapsed_min = ULLONG_MAX;
> + unsigned int curr_min, curr_max;
> + struct cpufreq_policy *policy;
> +
> + if (!arch_wants_init_cpu_capacity() || !init_cpu_capacity_enabled)
> + return;
> +
> + for_each_possible_cpu(cpu) {
> + policy = cpufreq_cpu_get(cpu);
> + if (IS_ERR_OR_NULL(policy))
> + return;
> +
> + /*
> + * We profile only first CPU of each frequency domain;
> + * and use that value as capacity of every CPU in the domain.
> + */
> + fcpu = cpumask_first(policy->related_cpus);
> + if (cpu != fcpu) {
> + elapsed[cpu] = elapsed[fcpu];
> + cpufreq_cpu_put(policy);
> + continue;
> + }
> +
> + down_write(&policy->rwsem);
> + curr_min = policy->user_policy.min;
> + curr_max = policy->user_policy.max;
> + policy->user_policy.min = policy->cpuinfo.max_freq;
> + policy->user_policy.max = policy->cpuinfo.max_freq;
> + up_write(&policy->rwsem);
> + cpufreq_cpu_put(policy);
> + cpufreq_update_policy(cpu);
> +
> + run_bogus_benchmark(cpu);
> + if (elapsed[cpu] < elapsed_min)
> + elapsed_min = elapsed[cpu];
> + pr_debug("%s: cpu=%d elapsed=%llu (min=%llu)\n",
> + __func__, cpu, elapsed[cpu], elapsed_min);
> +
> + policy = cpufreq_cpu_get(cpu);
> + down_write(&policy->rwsem);
> + policy->user_policy.min = curr_min;
> + policy->user_policy.max = curr_max;
> + up_write(&policy->rwsem);
> + cpufreq_cpu_put(policy);
> + cpufreq_update_policy(cpu);
> + }
> +
> + for_each_possible_cpu(cpu) {
> + unsigned long capacity;
> +
> + capacity = div64_u64((elapsed_min << 10), elapsed[cpu]);
> + pr_debug("%s: CPU%d capacity=%lu\n", __func__, cpu, capacity);
> + set_capacity_scale(cpu, capacity);
> + }
> +
> + pr_info("dynamic CPUs capacity installed\n");
> +}
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index 88a4215..9924351 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -419,6 +419,8 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div,
> #endif
> }
>
> +void cpufreq_init_cpu_capacity(void);
> +
> /*********************************************************************
> * CPUFREQ GOVERNORS *
> *********************************************************************/
> --
> 2.7.0
>