Re: [PATCH v3 18/19] x86/resctrl: Add cpu offline callback for resctrl work

From: Ilpo Järvinen
Date: Tue Mar 21 2023 - 11:33:00 EST


On Mon, 20 Mar 2023, James Morse wrote:

> The resctrl architecture-specific code may need to free a domain when
> a CPU goes offline; it also needs to reset the CPU's PQR_ASSOC register.
> The resctrl filesystem code needs to move the overflow and limbo work
> to run on a different CPU, and clear this CPU from the cpu_mask of
> control and monitor groups.
>
> Currently this is all done in core.c and called from
> resctrl_offline_cpu(), making the split between architecture and
> filesystem code unclear.
>
> Move the filesystem work into a filesystem helper called
> resctrl_offline_cpu(), and rename the one in core.c to
> resctrl_arch_offline_cpu().
>
> The rdtgroup_mutex is unlocked and locked again in the call in
> preparation for changing the locking rules for the architecture
> code.
>
> resctrl_offline_cpu() is called before any of the resources/domains
> are updated, and makes use of the exclude_cpu feature that was
> previously added.
>
> Tested-by: Shaopeng Tan <tan.shaopeng@xxxxxxxxxxx>
> Signed-off-by: James Morse <james.morse@xxxxxxx>
> ---
>  arch/x86/kernel/cpu/resctrl/core.c     | 41 ++++----------------------
>  arch/x86/kernel/cpu/resctrl/rdtgroup.c | 39 ++++++++++++++++++++++++
>  include/linux/resctrl.h                |  1 +
>  3 files changed, 45 insertions(+), 36 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
> index aafe4b74587c..4e5fc89dab6d 100644
> --- a/arch/x86/kernel/cpu/resctrl/core.c
> +++ b/arch/x86/kernel/cpu/resctrl/core.c
> @@ -578,22 +578,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
>
>  		return;
>  	}
> -
> -	if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) {
> -		if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
> -			cancel_delayed_work(&d->mbm_over);
> -			/*
> -			 * exclude_cpu=-1 as this CPU has already been removed
> -			 * by cpumask_clear_cpu()d
> -			 */

This comment was added in 17/19 and is now removed (not moved) in 18/19.
Please avoid such back-and-forth churn.

--
i.


> -			mbm_setup_overflow_handler(d, 0, RESCTRL_PICK_ANY_CPU);
> -		}
> -		if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
> -		    has_busy_rmid(r, d)) {
> -			cancel_delayed_work(&d->cqm_limbo);
> -			cqm_setup_limbo_handler(d, 0, RESCTRL_PICK_ANY_CPU);
> -		}
> -	}
>  }
>
>  static void clear_closid_rmid(int cpu)
> @@ -623,31 +607,15 @@ static int resctrl_arch_online_cpu(unsigned int cpu)
>  	return err;
>  }
>
> -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
> +static int resctrl_arch_offline_cpu(unsigned int cpu)
>  {
> -	struct rdtgroup *cr;
> -
> -	list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
> -		if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) {
> -			break;
> -		}
> -	}
> -}
> -
> -static int resctrl_offline_cpu(unsigned int cpu)
> -{
> -	struct rdtgroup *rdtgrp;
>  	struct rdt_resource *r;
>
>  	mutex_lock(&rdtgroup_mutex);
> +	resctrl_offline_cpu(cpu);
> +
>  	for_each_capable_rdt_resource(r)
>  		domain_remove_cpu(cpu, r);
> -	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
> -		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
> -			clear_childcpus(rdtgrp, cpu);
> -			break;
> -		}
> -	}
>  	clear_closid_rmid(cpu);
>  	mutex_unlock(&rdtgroup_mutex);
>
> @@ -970,7 +938,8 @@ static int __init resctrl_late_init(void)
>
>  	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
>  				  "x86/resctrl/cat:online:",
> -				  resctrl_arch_online_cpu, resctrl_offline_cpu);
> +				  resctrl_arch_online_cpu,
> +				  resctrl_arch_offline_cpu);
>  	if (state < 0)
>  		return state;
>
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index bf206bdb21ee..c27ec56c6c60 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -3710,6 +3710,45 @@ int resctrl_online_cpu(unsigned int cpu)
>  	return 0;
>  }
>
> +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
> +{
> +	struct rdtgroup *cr;
> +
> +	list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
> +		if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
> +			break;
> +	}
> +}
> +
> +void resctrl_offline_cpu(unsigned int cpu)
> +{
> +	struct rdt_domain *d;
> +	struct rdtgroup *rdtgrp;
> +	struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
> +
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
> +		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
> +			clear_childcpus(rdtgrp, cpu);
> +			break;
> +		}
> +	}
> +
> +	d = get_domain_from_cpu(cpu, l3);
> +	if (d) {
> +		if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
> +			cancel_delayed_work(&d->mbm_over);
> +			mbm_setup_overflow_handler(d, 0, cpu);
> +		}
> +		if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
> +		    has_busy_rmid(l3, d)) {
> +			cancel_delayed_work(&d->cqm_limbo);
> +			cqm_setup_limbo_handler(d, 0, cpu);
> +		}
> +	}
> +}
> +
>  /*
>   * rdtgroup_init - rdtgroup initialization
>   *
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index 3ea7d618f33f..f053527aaa5b 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -226,6 +226,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
>  int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d);
>  void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
>  int resctrl_online_cpu(unsigned int cpu);
> +void resctrl_offline_cpu(unsigned int cpu);
>
>  /**
>   * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid
>