Re: [PATCH v2 2/7] cpufreq/sched: Move cpufreq-specific EAS checks to cpufreq

From: Marek Szyprowski
Date: Fri May 09 2025 - 19:49:35 EST

Next message: patchwork-bot+netdevbpf: "Re: [PATCH net-next 0/3] dpaa_eth conversion to ndo_hwtstamp_get() and ndo_hwtstamp_set()"
Previous message: Edgecombe, Rick P: "Re: [RFC PATCH 08/21] KVM: TDX: Increase/decrease folio ref for huge pages"
Next in thread: Rafael J. Wysocki: "Re: [PATCH v2 2/7] cpufreq/sched: Move cpufreq-specific EAS checks to cpufreq"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On 06.05.2025 22:37, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
>
> Doing cpufreq-specific EAS checks that require accessing policy
> internals directly from sched_is_eas_possible() is a bit unfortunate,
> so introduce cpufreq_ready_for_eas() in cpufreq, move those checks
> into that new function and make sched_is_eas_possible() call it.
>
> While at it, address a possible race between the EAS governor check
> and governor change by doing the former under the policy rwsem.
>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
> Reviewed-by: Christian Loehle <christian.loehle@xxxxxxx>
> Tested-by: Christian Loehle <christian.loehle@xxxxxxx>
> Reviewed-by: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>

In my tests I've noticed that this patch, merged as commit 4854649b1fb4
("cpufreq/sched: Move cpufreq-specific EAS checks to cpufreq"), causes a
regression on ARM64 Amlogic Meson SoC based OdroidN2 board. The board
finally lockups. Reverting $subject on top of next-20250509 fixes this
issue. Here is the lockdep warning observed before the lockup:

======================================================
WARNING: possible circular locking dependency detected
6.15.0-rc5-next-20250509-dirty #10335 Tainted: G         C
cpufreq: cpufreq_policy_online: CPU2: Running at unlisted initial
frequency: 999999 kHz, changing to: 1000000 kHz
------------------------------------------------------
kworker/3:1/79 is trying to acquire lock:
ffff00000494b380 (&policy->rwsem){++++}-{4:4}, at:
cpufreq_ready_for_eas+0x60/0xbc

but task is already holding lock:
ffff8000832887a0 (sched_domains_mutex){+.+.}-{4:4}, at:
partition_sched_domains+0x54/0x938

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (sched_domains_mutex){+.+.}-{4:4}:
       __mutex_lock+0xa8/0x598
       mutex_lock_nested+0x24/0x30
       partition_sched_domains+0x54/0x938
       rebuild_sched_domains_locked+0x2d4/0x900
       rebuild_sched_domains+0x2c/0x48
       rebuild_sched_domains_energy+0x3c/0x58
       rebuild_sd_workfn+0x10/0x1c
       process_one_work+0x208/0x604
       worker_thread+0x244/0x388
       kthread+0x150/0x228
       ret_from_fork+0x10/0x20

-> #1 (cpuset_mutex){+.+.}-{4:4}:
       __mutex_lock+0xa8/0x598
       mutex_lock_nested+0x24/0x30
       cpuset_lock+0x1c/0x28
       __sched_setscheduler+0x31c/0x830
       sched_setattr_nocheck+0x18/0x24
       sugov_init+0x1b4/0x388
       cpufreq_init_governor.part.0+0x58/0xd4
       cpufreq_set_policy+0x2c8/0x3ec
       cpufreq_online+0x520/0xb20
       cpufreq_add_dev+0x80/0x98
       subsys_interface_register+0xfc/0x118
       cpufreq_register_driver+0x150/0x238
       dt_cpufreq_probe+0x148/0x488
       platform_probe+0x68/0xdc
       really_probe+0xbc/0x298
       __driver_probe_device+0x78/0x12c
       driver_probe_device+0xdc/0x164
       __device_attach_driver+0xb8/0x138
       bus_for_each_drv+0x80/0xdc
       __device_attach+0xa8/0x1b0
       device_initial_probe+0x14/0x20
       bus_probe_device+0xb0/0xb4
       deferred_probe_work_func+0x8c/0xc8
       process_one_work+0x208/0x604
       worker_thread+0x244/0x388
       kthread+0x150/0x228
       ret_from_fork+0x10/0x20

-> #0 (&policy->rwsem){++++}-{4:4}:
       __lock_acquire+0x1408/0x2254
       lock_acquire+0x1c8/0x354
       down_read+0x60/0x180
       cpufreq_ready_for_eas+0x60/0xbc
       sched_is_eas_possible+0x144/0x170
       partition_sched_domains+0x504/0x938
       rebuild_sched_domains_locked+0x2d4/0x900
       rebuild_sched_domains+0x2c/0x48
       rebuild_sched_domains_energy+0x3c/0x58
       rebuild_sd_workfn+0x10/0x1c
       process_one_work+0x208/0x604
       worker_thread+0x244/0x388
       kthread+0x150/0x228
       ret_from_fork+0x10/0x20

other info that might help us debug this:

Chain exists of:
&policy->rwsem --> cpuset_mutex --> sched_domains_mutex

Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
lock(sched_domains_mutex);
                               lock(cpuset_mutex);
                               lock(sched_domains_mutex);
rlock(&policy->rwsem);

*** DEADLOCK ***

6 locks held by kworker/3:1/79:
#0: ffff00000000a748 ((wq_completion)events){+.+.}-{0:0}, at:
process_one_work+0x18c/0x604
#1: ffff800084a83dd0 (rebuild_sd_work){+.+.}-{0:0}, at:
process_one_work+0x1b4/0x604
#2: ffff800083288908 (sched_energy_mutex){+.+.}-{4:4}, at:
rebuild_sched_domains_energy+0x30/0x58
#3: ffff80008327c2d8 (cpu_hotplug_lock){++++}-{0:0}, at:
cpus_read_lock+0x10/0x1c
#4: ffff80008331b098 (cpuset_mutex){+.+.}-{4:4}, at:
rebuild_sched_domains+0x28/0x48
#5: ffff8000832887a0 (sched_domains_mutex){+.+.}-{4:4}, at:
partition_sched_domains+0x54/0x938

stack backtrace:
CPU: 3 UID: 0 PID: 79 Comm: kworker/3:1 Tainted: G C
6.15.0-rc5-next-20250509-dirty #10335 PREEMPT
Tainted: [C]=CRAP
Hardware name: Hardkernel ODROID-N2 (DT)
Workqueue: events rebuild_sd_workfn
Call trace:
show_stack+0x18/0x24 (C)
dump_stack_lvl+0x90/0xd0
dump_stack+0x18/0x24
print_circular_bug+0x298/0x37c
check_noncircular+0x15c/0x170
__lock_acquire+0x1408/0x2254
lock_acquire+0x1c8/0x354
down_read+0x60/0x180
cpufreq_ready_for_eas+0x60/0xbc
sched_is_eas_possible+0x144/0x170
partition_sched_domains+0x504/0x938
rebuild_sched_domains_locked+0x2d4/0x900
rebuild_sched_domains+0x2c/0x48
rebuild_sched_domains_energy+0x3c/0x58
rebuild_sd_workfn+0x10/0x1c
process_one_work+0x208/0x604
worker_thread+0x244/0x388
kthread+0x150/0x228
ret_from_fork+0x10/0x20

> ---
>
> v1 -> v2:
> * Add missing newline characters in two places (Christian).
> * Pick up tags.
>
> ---
> drivers/cpufreq/cpufreq.c | 32 ++++++++++++++++++++++++++++++++
> include/linux/cpufreq.h | 2 ++
> kernel/sched/topology.c | 25 +++++--------------------
> 3 files changed, 39 insertions(+), 20 deletions(-)
>
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -3056,6 +3056,38 @@
>
> return 0;
> }
> +
> +static bool cpufreq_policy_is_good_for_eas(unsigned int cpu)
> +{
> + struct cpufreq_policy *policy __free(put_cpufreq_policy);
> +
> + policy = cpufreq_cpu_get(cpu);
> + if (!policy) {
> + pr_debug("cpufreq policy not set for CPU: %d\n", cpu);
> + return false;
> + }
> +
> + guard(cpufreq_policy_read)(policy);
> +
> + return sugov_is_governor(policy);
> +}
> +
> +bool cpufreq_ready_for_eas(const struct cpumask *cpu_mask)
> +{
> + unsigned int cpu;
> +
> + /* Do not attempt EAS if schedutil is not being used. */
> + for_each_cpu(cpu, cpu_mask) {
> + if (!cpufreq_policy_is_good_for_eas(cpu)) {
> + pr_debug("rd %*pbl: schedutil is mandatory for EAS\n",
> + cpumask_pr_args(cpu_mask));
> + return false;
> + }
> + }
> +
> + return true;
> +}
> +
> module_param(off, int, 0444);
> module_param_string(default_governor, default_governor, CPUFREQ_NAME_LEN, 0444);
> core_initcall(cpufreq_core_init);
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -1237,6 +1237,8 @@
> struct cpufreq_frequency_table *table,
> unsigned int transition_latency);
>
> +bool cpufreq_ready_for_eas(const struct cpumask *cpu_mask);
> +
> static inline void cpufreq_register_em_with_opp(struct cpufreq_policy *policy)
> {
> dev_pm_opp_of_register_em(get_cpu_device(policy->cpu),
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -212,8 +212,6 @@
> static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
> {
> bool any_asym_capacity = false;
> - struct cpufreq_policy *policy;
> - bool policy_is_ready;
> int i;
>
> /* EAS is enabled for asymmetric CPU capacity topologies. */
> @@ -248,25 +246,12 @@
> return false;
> }
>
> - /* Do not attempt EAS if schedutil is not being used. */
> - for_each_cpu(i, cpu_mask) {
> - policy = cpufreq_cpu_get(i);
> - if (!policy) {
> - if (sched_debug()) {
> - pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
> - cpumask_pr_args(cpu_mask), i);
> - }
> - return false;
> - }
> - policy_is_ready = sugov_is_governor(policy);
> - cpufreq_cpu_put(policy);
> - if (!policy_is_ready) {
> - if (sched_debug()) {
> - pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
> - cpumask_pr_args(cpu_mask));
> - }
> - return false;
> + if (!cpufreq_ready_for_eas(cpu_mask)) {
> + if (sched_debug()) {
> + pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n",
> + cpumask_pr_args(cpu_mask));
> }
> + return false;
> }
>
> return true;
>
>
>
>
Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

Next message: patchwork-bot+netdevbpf: "Re: [PATCH net-next 0/3] dpaa_eth conversion to ndo_hwtstamp_get() and ndo_hwtstamp_set()"
Previous message: Edgecombe, Rick P: "Re: [RFC PATCH 08/21] KVM: TDX: Increase/decrease folio ref for huge pages"
Next in thread: Rafael J. Wysocki: "Re: [PATCH v2 2/7] cpufreq/sched: Move cpufreq-specific EAS checks to cpufreq"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]