[RFC PATCH 2/5] x86/ibs: Drive NUMA balancing via IBS access data

From: Bharata B Rao
Date: Wed Feb 08 2023 - 02:36:59 EST


Feed the page access data obtained from IBS to NUMA balancing
as hint fault equivalents. The existing per-task and per-group
fault stats are now built from IBS-provided page access information.
With this it will not be necessary to scan the address space to
introduce NUMA hinting faults.

Use the task_work framework to process the IBS-sampled data. The actual
programming of IBS to generate page access information isn't
done yet.

Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
arch/x86/mm/ibs.c | 38 ++++++++++++++-
include/linux/migrate.h | 1 +
include/linux/sched.h | 1 +
include/linux/vm_event_item.h | 1 +
kernel/sched/fair.c | 10 ++++
mm/memory.c | 92 +++++++++++++++++++++++++++++++++++
mm/vmstat.c | 1 +
7 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/ibs.c b/arch/x86/mm/ibs.c
index 411dba2a88d1..adbc587b1767 100644
--- a/arch/x86/mm/ibs.c
+++ b/arch/x86/mm/ibs.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0

#include <linux/init.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>

#include <asm/nmi.h>
#include <asm/perf_event.h> /* TODO: Move defns like IBS_OP_ENABLE into non-perf header */
@@ -8,12 +10,30 @@

static u64 ibs_config __read_mostly;

+struct ibs_access_work { /* one IBS access sample, queued as task_work */
+ struct callback_head work; /* task_work node; container_of() recovers the sample */
+ u64 laddr, paddr; /* sampled virtual and physical address */
+};
+
+/* task_work callback: feed one IBS access sample to NUMA balancing. */
+static void task_ibs_access_work(struct callback_head *work)
+{
+	struct ibs_access_work *iwork =
+		container_of(work, struct ibs_access_work, work);
+	u64 laddr = iwork->laddr, paddr = iwork->paddr;
+
+	/* The addresses were copied out above, so free the item up front. */
+	kfree(iwork);
+	do_numa_access(current, laddr, paddr);
+}
+
static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
{
u64 ops_ctl, ops_data3, ops_data2;
u64 remote_access;
u64 laddr = -1, paddr = -1;
struct mm_struct *mm = current->mm;
+ struct ibs_access_work *iwork;

rdmsrl(MSR_AMD64_IBSOPCTL, ops_ctl);

@@ -86,8 +106,24 @@ static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
/* Is phys addr valid? */
if (ops_data3 & MSR_AMD64_IBSOPDATA3_PADDR_VALID)
rdmsrl(MSR_AMD64_IBSDCPHYSAD, paddr);
- else
+ else {
count_vm_event(IBS_PADDR_INVALID);
+ goto handled;
+ }
+
+ /*
+ * TODO: GFP_ATOMIC! kzalloc() is not safe from NMI context; preallocate.
+ */
+ iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
+ if (!iwork)
+ goto handled;
+
+ count_vm_event(IBS_USEFUL_SAMPLES);
+
+ iwork->laddr = laddr;
+ iwork->paddr = paddr;
+ init_task_work(&iwork->work, task_ibs_access_work);
+ task_work_add(current, &iwork->work, TWA_RESUME);

handled:
return NMI_HANDLED;
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3ef77f52a4f0..4dcce7885b0c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -216,6 +216,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
unsigned long npages);
void migrate_device_finalize(unsigned long *src_pfns,
unsigned long *dst_pfns, unsigned long npages);
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr);

#endif /* CONFIG_MIGRATION */

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 853d08f7562b..19dd4ee07436 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2420,4 +2420,5 @@ static inline void sched_core_fork(struct task_struct *p) { }

extern void sched_set_stop_task(int cpu, struct task_struct *stop);

+DECLARE_STATIC_KEY_FALSE(hw_access_hints);
#endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 1d55e347d16c..2ccc7dee3c13 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -159,6 +159,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
IBS_LADDR_INVALID,
IBS_KERNEL_ADDR,
IBS_PADDR_INVALID,
+ IBS_USEFUL_SAMPLES,
#endif
#endif
NR_VM_EVENT_ITEMS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f8736991427..c9b9e62da779 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
+#include <linux/migrate.h>

#include <asm/switch_to.h>

@@ -3125,6 +3126,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
}
}

+DEFINE_STATIC_KEY_FALSE(hw_access_hints);
+
/*
* Drive the periodic memory faults..
*/
@@ -3133,6 +3136,13 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
struct callback_head *work = &curr->numa_work;
u64 period, now;

+ /*
+ * If we are using access hints from hardware (like using
+ * IBS), don't scan the address space.
+ */
+ if (static_branch_unlikely(&hw_access_hints))
+ return;
+
/*
* We don't care about NUMA placement if we don't have memory.
*/
diff --git a/mm/memory.c b/mm/memory.c
index aad226daf41b..79096aba197c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4668,6 +4668,98 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}

+/*
+ * Called from task_work context to act upon the page access: account it
+ * via task_numa_fault() and migrate the page if it is found misplaced.
+ * @p: task whose mm is consulted; @laddr: sampled virtual address (VMA
+ * lookup); @paddr: sampled physical address (page lookup).
+ *
+ * Physical address (provided by IBS) is used directly instead
+ * of walking the page tables to get to the PTE/page. Hence we
+ * don't check if PTE is writable for the TNF_NO_GROUP
+ * optimization, which means RO pages are considered for grouping.
+ */
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr)
+{
+	struct mm_struct *mm = p->mm;
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	int page_nid = NUMA_NO_NODE;
+	int last_cpupid, target_nid, flags = 0;
+
+	/* Nothing to do without an mm; don't wait for a contended mmap_lock. */
+	if (!mm || !mmap_read_trylock(mm))
+		return;
+
+	/* find_vma() may return a VMA lying entirely above laddr; reject it. */
+	vma = find_vma(mm, laddr);
+	if (!vma || laddr < vma->vm_start)
+		goto out_unlock;
+
+	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+	    is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
+		goto out_unlock;
+
+	if (!vma->vm_mm ||
+	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		goto out_unlock;
+
+	if (!vma_is_accessible(vma))
+		goto out_unlock;
+
+	/* NOTE(review): no page reference seems to be held yet - confirm. */
+	page = pfn_to_online_page(PHYS_PFN(paddr));
+	if (!page || is_zone_device_page(page))
+		goto out_unlock;
+
+	if (unlikely(!PageLRU(page)))
+		goto out_unlock;
+
+	/* TODO: handle PTE-mapped THP */
+	if (PageCompound(page))
+		goto out_unlock;
+
+	/*
+	 * Flag if the page is shared between multiple address spaces. This
+	 * is later used when determining whether to group tasks together.
+	 */
+	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+		flags |= TNF_SHARED;
+
+	page_nid = page_to_nid(page);
+
+	/*
+	 * For memory tiering mode, cpupid of slow memory page is used
+	 * to record page access time. So use default value.
+	 */
+	if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+	    !node_is_toptier(page_nid))
+		last_cpupid = (-1 & LAST_CPUPID_MASK);
+	else
+		last_cpupid = page_cpupid_last(page);
+
+	target_nid = numa_migrate_prep(page, vma, laddr, page_nid, &flags);
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	if (migrate_misplaced_page(page, vma, target_nid)) {
+		page_nid = target_nid;
+		flags |= TNF_MIGRATED;
+	} else {
+		flags |= TNF_MIGRATE_FAIL;
+	}
+
+out:
+	if (page_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, page_nid, 1, flags);
+
+out_unlock:
+	mmap_read_unlock(mm);
+}
+
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7a9d0d9ade8..33738426ae48 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1408,6 +1408,7 @@ const char * const vmstat_text[] = {
"ibs_invalid_laddr",
"ibs_kernel_addr",
"ibs_invalid_paddr",
+ "ibs_useful_samples",
#endif
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
--
2.25.1