Re: [PATCH RESEND v8 1/2] sched/numa: introduce per-cgroup NUMA locality info

From: Michael Wang
Date: Wed Feb 12 2020 - 21:35:49 EST


Hi, Peter

Could you please give some comments on this one?

Regards,
Michael Wang

On 2020/2/7 11:37, Michael Wang wrote:
> Forwarded:
>
> Hi, Peter, Ingo
>
> Could you give some comments please?
>
> As Mel replied previously, he doesn't disagree with the idea, so we're
> looking forward to the maintainers' opinion.
>
> Please allow me to highlight again the need for monitoring NUMA Balancing:
> this feature is critical to performance on NUMA platforms, and its cost
> and benefit can both be large or small, yet there is not enough
> information for an admin to analyze the trade-off. Locality could be
> the missing piece.
>
> Regards,
> Michael Wang
>
> On 2020/2/7 11:35, Michael Wang wrote:
>> Currently there is no good approach to monitoring per-cgroup NUMA
>> efficiency. This can be a problem, especially when groups are sharing
>> CPUs, as we don't know which one introduced the remote memory accesses.
>>
>> Although the per-task NUMA access info from the PMU is good for further
>> debugging, it is not lightweight enough for daily monitoring, especially
>> on a box with thousands of tasks.
>>
>> Fortunately, when NUMA Balancing is enabled, it periodically triggers
>> page faults and tries to increase the NUMA locality; by tracing the
>> results we can estimate the NUMA efficiency.
>>
>> On each NUMA Balancing page fault, when the task's executing CPU is on
>> the same node as the page, we call it a local page access, otherwise
>> a remote page access.
>>
>> By folding the task's access counters into its cgroup on each tick, we
>> get the per-cgroup NUMA locality info.
>>
>> For example, the new entry 'cpu.numa_stat' shows:
>> page_access local=1231412 remote=53453
>>
>> Here we know the workloads in the hierarchy have been traced for 1284865
>> page accesses in total, and 1231412 of them (about 96%) were local page
>> accesses, which implies a good NUMA efficiency.
>>
>> By monitoring the increments, we will be able to locate the per-cgroup
>> workloads which NUMA Balancing can't help with (usually caused by wrong
>> CPU and memory node bindings), and then we get a chance to fix that in time.
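>>
>> For illustration only (not part of the patch itself), below is a minimal
>> userspace sketch of how such monitoring could work: sample 'cpu.numa_stat'
>> twice and compute the locality of the increments. The cgroup mount path,
>> the group name 'mygroup', the read_stat() helper and the 10 second interval
>> are assumptions made for the example; the feature itself must first be
>> enabled via the 'numa_locality' boot option or the kernel.numa_locality
>> sysctl.
>>
>> #include <stdio.h>
>> #include <unistd.h>
>>
>> static int read_stat(const char *path, unsigned long long *local,
>>                      unsigned long long *remote)
>> {
>>     FILE *f = fopen(path, "r");
>>     int ret = -1;
>>
>>     if (!f)
>>         return -1;
>>     /* format written by cpu_numa_stat_show() */
>>     if (fscanf(f, "page_access local=%llu remote=%llu", local, remote) == 2)
>>         ret = 0;
>>     fclose(f);
>>     return ret;
>> }
>>
>> int main(void)
>> {
>>     /* hypothetical v1 cpu-controller path; adjust for the real mount point */
>>     const char *path = "/sys/fs/cgroup/cpu/mygroup/cpu.numa_stat";
>>     unsigned long long l0, r0, l1, r1, dl, dr;
>>
>>     if (read_stat(path, &l0, &r0))
>>         return 1;
>>     sleep(10);
>>     if (read_stat(path, &l1, &r1))
>>         return 1;
>>
>>     dl = l1 - l0;
>>     dr = r1 - r0;
>>     if (dl + dr)
>>         printf("locality: %.1f%% (%llu local of %llu accesses)\n",
>>                100.0 * dl / (dl + dr), dl, dl + dr);
>>     return 0;
>> }
>>
>> A low locality of the increments for a group would then be the trigger to
>> check its CPU and memory node bindings.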
>>
>> Cc: Mel Gorman <mgorman@xxxxxxx>
>> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>> Cc: Michal Koutný <mkoutny@xxxxxxxx>
>> Signed-off-by: Michael Wang <yun.wang@xxxxxxxxxxxxxxxxx>
>> ---
>> include/linux/sched.h | 15 +++++++++
>> include/linux/sched/sysctl.h | 6 ++++
>> init/Kconfig | 9 ++++++
>> kernel/sched/core.c | 75 ++++++++++++++++++++++++++++++++++++++++++++
>> kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++
>> kernel/sched/sched.h | 12 +++++++
>> kernel/sysctl.c | 11 +++++++
>> 7 files changed, 190 insertions(+)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index a6c924fa1c77..74bf234bae53 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1128,6 +1128,21 @@ struct task_struct {
>> unsigned long numa_pages_migrated;
>> #endif /* CONFIG_NUMA_BALANCING */
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> + /*
>> + * Counter indices stand for:
>> + * 0 -- remote page accesses
>> + * 1 -- local page accesses
>> + * 2 -- remote page accesses already updated into the cgroup
>> + * 3 -- local page accesses already updated into the cgroup
>> + *
>> + * We record the counters before the end of task_numa_fault(); this
>> + * is based on the fact that after the page fault is handled, the task
>> + * will access the page on the CPU where it triggered the PF.
>> + */
>> + unsigned long numa_page_access[4];
>> +#endif
>> +
>> #ifdef CONFIG_RSEQ
>> struct rseq __user *rseq;
>> u32 rseq_sig;
>> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
>> index d4f6215ee03f..bb3721cf48e0 100644
>> --- a/include/linux/sched/sysctl.h
>> +++ b/include/linux/sched/sysctl.h
>> @@ -101,4 +101,10 @@ extern int sched_energy_aware_handler(struct ctl_table *table, int write,
>> loff_t *ppos);
>> #endif
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +extern int sysctl_numa_locality(struct ctl_table *table, int write,
>> + void __user *buffer, size_t *lenp,
>> + loff_t *ppos);
>> +#endif
>> +
>> #endif /* _LINUX_SCHED_SYSCTL_H */
>> diff --git a/init/Kconfig b/init/Kconfig
>> index 322fd2c65304..63c6b90a515d 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -813,6 +813,15 @@ config NUMA_BALANCING_DEFAULT_ENABLED
>> If set, automatic NUMA balancing will be enabled if running on a NUMA
>> machine.
>>
>> +config CGROUP_NUMA_LOCALITY
>> + bool "per-cgroup NUMA Locality"
>> + default n
>> + depends on CGROUP_SCHED && NUMA_BALANCING
>> + help
>> + This option enables the collection of per-cgroup NUMA locality info,
>> + which tells whether NUMA Balancing is working well for a particular
>> + workload and also implies the NUMA efficiency.
>> +
>> menuconfig CGROUPS
>> bool "Control Group support"
>> select KERNFS
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index e7b08d52db93..40dd6b221eef 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -7657,6 +7657,68 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
>> }
>> #endif /* CONFIG_RT_GROUP_SCHED */
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +DEFINE_STATIC_KEY_FALSE(sched_numa_locality);
>> +
>> +#ifdef CONFIG_PROC_SYSCTL
>> +int sysctl_numa_locality(struct ctl_table *table, int write,
>> + void __user *buffer, size_t *lenp, loff_t *ppos)
>> +{
>> + struct ctl_table t;
>> + int err;
>> + int state = static_branch_likely(&sched_numa_locality);
>> +
>> + if (write && !capable(CAP_SYS_ADMIN))
>> + return -EPERM;
>> +
>> + t = *table;
>> + t.data = &state;
>> + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
>> + if (err < 0 || !write)
>> + return err;
>> +
>> + if (state)
>> + static_branch_enable(&sched_numa_locality);
>> + else
>> + static_branch_disable(&sched_numa_locality);
>> +
>> + return err;
>> +}
>> +#endif
>> +
>> +static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu)
>> +{
>> + return tg == &root_task_group ? &cpu_rq(cpu)->cfs : tg->cfs_rq[cpu];
>> +}
>> +
>> +static int cpu_numa_stat_show(struct seq_file *sf, void *v)
>> +{
>> + int cpu;
>> + u64 local = 0, remote = 0;
>> + struct task_group *tg = css_tg(seq_css(sf));
>> +
>> + if (!static_branch_likely(&sched_numa_locality))
>> + return 0;
>> +
>> + for_each_possible_cpu(cpu) {
>> + local += tg_cfs_rq(tg, cpu)->local_page_access;
>> + remote += tg_cfs_rq(tg, cpu)->remote_page_access;
>> + }
>> +
>> + seq_printf(sf, "page_access local=%llu remote=%llu\n", local, remote);
>> +
>> + return 0;
>> +}
>> +
>> +static __init int numa_locality_setup(char *opt)
>> +{
>> + static_branch_enable(&sched_numa_locality);
>> +
>> + return 0;
>> +}
>> +__setup("numa_locality", numa_locality_setup);
>> +#endif
>> +
>> static struct cftype cpu_legacy_files[] = {
>> #ifdef CONFIG_FAIR_GROUP_SCHED
>> {
>> @@ -7706,6 +7768,12 @@ static struct cftype cpu_legacy_files[] = {
>> .seq_show = cpu_uclamp_max_show,
>> .write = cpu_uclamp_max_write,
>> },
>> +#endif
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> + {
>> + .name = "numa_stat",
>> + .seq_show = cpu_numa_stat_show,
>> + },
>> #endif
>> { } /* Terminate */
>> };
>> @@ -7887,6 +7955,13 @@ static struct cftype cpu_files[] = {
>> .seq_show = cpu_uclamp_max_show,
>> .write = cpu_uclamp_max_write,
>> },
>> +#endif
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> + {
>> + .name = "numa_stat",
>> + .flags = CFTYPE_NOT_ON_ROOT,
>> + .seq_show = cpu_numa_stat_show,
>> + },
>> #endif
>> { } /* terminate */
>> };
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 2d170b5da0e3..eb838557bae2 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1049,7 +1049,63 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>> * Scheduling class queueing methods:
>> */
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +/*
>> + * We want to record the real local/remote page access statistics
>> + * here, so 'pnid' should be the page's real resident node after
>> + * migrate_misplaced_page(), and 'cnid' should be the node of the CPU
>> + * that triggered the PF.
>> + */
>> +static inline void
>> +update_task_locality(struct task_struct *p, int pnid, int cnid, int pages)
>> +{
>> + if (!static_branch_unlikely(&sched_numa_locality))
>> + return;
>> +
>> + /*
>> + * pnid != cnid --> remote idx 0
>> + * pnid == cnid --> local idx 1
>> + */
>> + p->numa_page_access[!!(pnid == cnid)] += pages;
>> +}
>> +
>> +static inline void update_group_locality(struct cfs_rq *cfs_rq)
>> +{
>> + unsigned long ldiff, rdiff;
>> +
>> + if (!static_branch_unlikely(&sched_numa_locality))
>> + return;
>> +
>> + rdiff = current->numa_page_access[0] - current->numa_page_access[2];
>> + ldiff = current->numa_page_access[1] - current->numa_page_access[3];
>> + if (!ldiff && !rdiff)
>> + return;
>> +
>> + cfs_rq->local_page_access += ldiff;
>> + cfs_rq->remote_page_access += rdiff;
>> +
>> + /*
>> + * Consider the counters updated once we reach the root cfs_rq; no NUMA
>> + * Balancing PF should happen on the current task during this update.
>> + */
>> + if (&cfs_rq->rq->cfs == cfs_rq) {
>> + current->numa_page_access[2] = current->numa_page_access[0];
>> + current->numa_page_access[3] = current->numa_page_access[1];
>> + }
>> +}
>> +#else
>> +static inline void
>> +update_task_locality(struct task_struct *p, int pnid, int cnid, int pages)
>> +{
>> +}
>> +
>> +static inline void update_group_locality(struct cfs_rq *cfs_rq)
>> +{
>> +}
>> +#endif /* CONFIG_CGROUP_NUMA_LOCALITY */
>> +
>> #ifdef CONFIG_NUMA_BALANCING
>> +
>> /*
>> * Approximate time to scan a full NUMA task in ms. The task scan period is
>> * calculated based on the tasks virtual memory size and
>> @@ -2465,6 +2521,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>> p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
>> p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
>> p->numa_faults_locality[local] += pages;
>> +
>> + update_task_locality(p, mem_node, numa_node_id(), pages);
>> }
>>
>> static void reset_ptenuma_scan(struct task_struct *p)
>> @@ -2650,6 +2708,9 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
>> p->last_sum_exec_runtime = 0;
>>
>> init_task_work(&p->numa_work, task_numa_work);
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> + memset(p->numa_page_access, 0, sizeof(p->numa_page_access));
>> +#endif
>>
>> /* New address space, reset the preferred nid */
>> if (!(clone_flags & CLONE_VM)) {
>> @@ -4313,6 +4374,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>> */
>> update_load_avg(cfs_rq, curr, UPDATE_TG);
>> update_cfs_group(curr);
>> + update_group_locality(cfs_rq);
>>
>> #ifdef CONFIG_SCHED_HRTICK
>> /*
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 1a88dc8ad11b..66b4e581b6ed 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -575,6 +575,14 @@ struct cfs_rq {
>> struct list_head throttled_list;
>> #endif /* CONFIG_CFS_BANDWIDTH */
>> #endif /* CONFIG_FAIR_GROUP_SCHED */
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> + /*
>> + * The local/remote page access info collected from all
>> + * the tasks in the hierarchy.
>> + */
>> + u64 local_page_access;
>> + u64 remote_page_access;
>> +#endif
>> };
>>
>> static inline int rt_bandwidth_enabled(void)
>> @@ -1601,6 +1609,10 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
>> extern struct static_key_false sched_numa_balancing;
>> extern struct static_key_false sched_schedstats;
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +extern struct static_key_false sched_numa_locality;
>> +#endif
>> +
>> static inline u64 global_rt_period(void)
>> {
>> return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
>> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
>> index d396aaaf19a3..a8f5951f92b3 100644
>> --- a/kernel/sysctl.c
>> +++ b/kernel/sysctl.c
>> @@ -428,6 +428,17 @@ static struct ctl_table kern_table[] = {
>> .extra2 = SYSCTL_ONE,
>> },
>> #endif /* CONFIG_NUMA_BALANCING */
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> + {
>> + .procname = "numa_locality",
>> + .data = NULL, /* filled in by handler */
>> + .maxlen = sizeof(unsigned int),
>> + .mode = 0644,
>> + .proc_handler = sysctl_numa_locality,
>> + .extra1 = SYSCTL_ZERO,
>> + .extra2 = SYSCTL_ONE,
>> + },
>> +#endif /* CONFIG_CGROUP_NUMA_LOCALITY */
>> #endif /* CONFIG_SCHED_DEBUG */
>> {
>> .procname = "sched_rt_period_us",
>>