[RFC PATCH 1/5] x86/ibs: In-kernel IBS driver for page access profiling

From: Bharata B Rao
Date: Wed Feb 08 2023 - 02:36:49 EST


Use the IBS (Instruction Based Sampling) feature present
in AMD processors for memory access tracking. The access
information obtained from IBS will be used in subsequent
patches to drive NUMA balancing.

An NMI handler is registered to obtain the IBS data. The
handler does nothing much yet. It just filters out the
non-useful samples and collects some stats. This patch
just builds the framework and IBS execution sampling is
enabled only in a subsequent patch.

TODOs
-----
1. Perf also uses IBS. For the purpose of this prototype
just disable the use of IBS in perf. This needs to be
done cleanly.
2. Only the required MSR bits are defined here.

About IBS
---------
IBS can be programmed to provide data about instruction
execution periodically. This is done by programming a desired
sample count (number of ops) in a control register. When the
programmed number of ops has been dispatched, a micro-op gets tagged,
various information about the tagged micro-op's execution is
populated in IBS execution MSRs and an interrupt is raised.
While IBS provides a lot of data for each sample, for the
purpose of memory access profiling, we are interested in
the linear and physical addresses of the memory access that reached
DRAM. Recent AMD processors provide further filtering where
it is possible to limit the sampling to those ops that had
an L3 miss, which greatly reduces the number of non-useful samples.

While IBS provides capability to sample instruction fetch
and execution, only IBS execution sampling is used here
to collect data about memory accesses that occur during
the instruction execution.

More information about IBS is available in Sec 13.3 of the
AMD64 Architecture Programmer's Manual, Volume 2: System
Programming, which is present at:
https://bugzilla.kernel.org/attachment.cgi?id=288923

Information about MSRs used for programming IBS can be
found in Sec 2.1.14.4 of PPR Vol 1 for AMD Family 19h
Model 11h B1 which is currently present at:
https://www.amd.com/system/files/TechDocs/55901_0.25.zip

Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
arch/x86/events/amd/ibs.c | 6 ++
arch/x86/include/asm/msr-index.h | 12 +++
arch/x86/mm/Makefile | 1 +
arch/x86/mm/ibs.c | 169 +++++++++++++++++++++++++++++++
include/linux/vm_event_item.h | 11 ++
mm/vmstat.c | 11 ++
6 files changed, 210 insertions(+)
create mode 100644 arch/x86/mm/ibs.c

diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index da3f5ebac4e1..290e6d221844 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -1512,6 +1512,12 @@ static __init int amd_ibs_init(void)
{
u32 caps;

+ /*
+ * TODO: Find a clean way to disable perf IBS so that IBS
+ * can be used for NUMA balancing.
+ */
+ return 0;
+
caps = __get_ibs_caps();
if (!caps)
return -ENODEV; /* ibs not supported by the cpu */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 37ff47552bcb..443d4cf73366 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -593,6 +593,18 @@
/* AMD Last Branch Record MSRs */
#define MSR_AMD64_LBR_SELECT 0xc000010e

+/* AMD IBS MSR bits */
+#define MSR_AMD64_IBSOPDATA2_DATASRC 0x7 /* mask selecting the data source field of IBSOPDATA2 */
+#define MSR_AMD64_IBSOPDATA2_DATASRC_DRAM 0x3 /* data source value: DRAM access */
+#define MSR_AMD64_IBSOPDATA2_DATASRC_FAR_CCX_CACHE 0x5 /* data source value: hit in peer cache in far ccx */
+
+#define MSR_AMD64_IBSOPDATA3_LDOP BIT_ULL(0) /* sampled op was a load */
+#define MSR_AMD64_IBSOPDATA3_STOP BIT_ULL(1) /* sampled op was a store */
+#define MSR_AMD64_IBSOPDATA3_DCMISS BIT_ULL(7) /* data cache (L1) miss */
+#define MSR_AMD64_IBSOPDATA3_LADDR_VALID BIT_ULL(17) /* linear address in MSR_AMD64_IBSDCLINAD is valid */
+#define MSR_AMD64_IBSOPDATA3_PADDR_VALID BIT_ULL(18) /* physical address in MSR_AMD64_IBSDCPHYSAD is valid */
+#define MSR_AMD64_IBSOPDATA3_L2MISS BIT_ULL(20) /* L2 miss */
+
/* Fam 17h MSRs */
#define MSR_F17H_IRPERF 0xc00000e9

diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c80febc44cd2..e74b95a57d86 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -27,6 +27,7 @@ endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o pgprot.o

+obj-$(CONFIG_NUMA_BALANCING) += ibs.o
obj-y += pat/

# Make sure __phys_addr has no stackprotector
diff --git a/arch/x86/mm/ibs.c b/arch/x86/mm/ibs.c
new file mode 100644
index 000000000000..411dba2a88d1
--- /dev/null
+++ b/arch/x86/mm/ibs.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+
+#include <asm/nmi.h>
+#include <asm/perf_event.h> /* TODO: Move defns like IBS_OP_ENABLE into non-perf header */
+#include <asm/apic.h>
+
+static u64 ibs_config __read_mostly; /* IBS op control bits, assembled once in ibs_access_profiling_init() */
+
+/*
+ * NMI handler that inspects one IBS op sample.
+ *
+ * @cmd:  NMI type this handler was invoked for (not used).
+ * @regs: interrupted register state (not used).
+ *
+ * A pending sample (IBS_OP_VAL set in MSR_AMD64_IBSOPCTL) is acknowledged
+ * by clearing that bit. The sample is then run through a series of filters;
+ * the first filter that rejects it bumps the matching IBS_* vm event
+ * counter and ends processing. For samples that pass every filter, the
+ * linear and physical addresses are read from the IBS MSRs, but they are
+ * not consumed yet.
+ *
+ * Always returns NMI_HANDLED.
+ */
+static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
+{
+	/* An access went beyond L2 only if it missed both L1 and L2 */
+	const u64 cache_miss = MSR_AMD64_IBSOPDATA3_DCMISS |
+			       MSR_AMD64_IBSOPDATA3_L2MISS;
+	u64 ops_ctl, ops_data3, ops_data2;
+	u64 remote_access;
+	u64 laddr = -1, paddr = -1;
+	struct mm_struct *mm = current->mm;
+
+	rdmsrl(MSR_AMD64_IBSOPCTL, ops_ctl);
+
+	/*
+	 * When IBS sampling period is reprogrammed via read-modify-update
+	 * of MSR_AMD64_IBSOPCTL, overflow NMIs could be generated with
+	 * IBS_OP_ENABLE not set. Such NMIs carry no valid sample, so
+	 * there is nothing to process; return as HANDLED.
+	 *
+	 * As a consequence, this handler reports NMI_HANDLED even for
+	 * NMIs that were not raised by IBS at all. This stems from the
+	 * limitation of having both status and control bits in one MSR.
+	 */
+	if (!(ops_ctl & IBS_OP_VAL))
+		goto handled;
+
+	/* Acknowledge the sample by clearing the valid bit */
+	wrmsrl(MSR_AMD64_IBSOPCTL, ops_ctl & ~IBS_OP_VAL);
+
+	count_vm_event(IBS_NR_EVENTS);
+
+	/* No mm: the sample was taken in a kernel thread */
+	if (!mm) {
+		count_vm_event(IBS_KTHREAD);
+		goto handled;
+	}
+
+	rdmsrl(MSR_AMD64_IBSOPDATA3, ops_data3);
+
+	/* Load/Store ops only */
+	if (!(ops_data3 & (MSR_AMD64_IBSOPDATA3_LDOP |
+			   MSR_AMD64_IBSOPDATA3_STOP))) {
+		count_vm_event(IBS_NON_LOAD_STORES);
+		goto handled;
+	}
+
+	/*
+	 * Discard the sample if it was L1 or L2 hit, i.e. unless both the
+	 * DC miss and the L2 miss bits are set. Testing for "either bit
+	 * set" would let L1-miss/L2-hit samples through.
+	 */
+	if ((ops_data3 & cache_miss) != cache_miss) {
+		count_vm_event(IBS_DC_L2_HITS);
+		goto handled;
+	}
+
+	/*
+	 * NOTE(review): the data source field may only be meaningful for
+	 * load ops - confirm against the PPR before relying on it for
+	 * store samples.
+	 */
+	rdmsrl(MSR_AMD64_IBSOPDATA2, ops_data2);
+	remote_access = ops_data2 & MSR_AMD64_IBSOPDATA2_DATASRC;
+
+	/* Consider only DRAM accesses, exclude cache accesses from near ccx */
+	if (remote_access < MSR_AMD64_IBSOPDATA2_DATASRC_DRAM) {
+		count_vm_event(IBS_NEAR_CACHE_HITS);
+		goto handled;
+	}
+
+	/* Exclude hits from peer cache in far ccx */
+	if (remote_access == MSR_AMD64_IBSOPDATA2_DATASRC_FAR_CCX_CACHE) {
+		count_vm_event(IBS_FAR_CACHE_HITS);
+		goto handled;
+	}
+
+	/* Is linear addr valid? */
+	if (ops_data3 & MSR_AMD64_IBSOPDATA3_LADDR_VALID)
+		rdmsrl(MSR_AMD64_IBSDCLINAD, laddr);
+	else {
+		count_vm_event(IBS_LADDR_INVALID);
+		goto handled;
+	}
+
+	/* Discard kernel address accesses (top address bit set) */
+	if (laddr & (1UL << 63)) {
+		count_vm_event(IBS_KERNEL_ADDR);
+		goto handled;
+	}
+
+	/* Is phys addr valid? An invalid one is only counted, not fatal. */
+	if (ops_data3 & MSR_AMD64_IBSOPDATA3_PADDR_VALID)
+		rdmsrl(MSR_AMD64_IBSDCPHYSAD, paddr);
+	else
+		count_vm_event(IBS_PADDR_INVALID);
+
+handled:
+	return NMI_HANDLED;
+}
+
+/*
+ * Extract the LVT offset field from MSR_AMD64_IBSCTL.
+ *
+ * Returns the offset (the MSR value masked with IBSCTL_LVT_OFFSET_MASK),
+ * or -EINVAL when IBSCTL_LVT_OFFSET_VALID is not set in the MSR.
+ */
+static inline int get_ibs_lvt_offset(void)
+{
+	u64 ibsctl;
+
+	rdmsrl(MSR_AMD64_IBSCTL, ibsctl);
+
+	if (ibsctl & IBSCTL_LVT_OFFSET_VALID)
+		return ibsctl & IBSCTL_LVT_OFFSET_MASK;
+
+	return -EINVAL;
+}
+
+/*
+ * Program the extended LVT entry used by IBS on the current CPU so that
+ * it delivers NMIs (APIC_EILVT_MSG_NMI).
+ *
+ * A warning naming the CPU is logged if the LVT offset is not valid or
+ * if setup_APIC_eilvt() returns non-zero.
+ */
+static void setup_APIC_ibs(void)
+{
+	int lvt_off = get_ibs_lvt_offset();
+
+	/* Done only when the offset is usable and programming it succeeded */
+	if (lvt_off >= 0 &&
+	    !setup_APIC_eilvt(lvt_off, 0, APIC_EILVT_MSG_NMI, 0))
+		return;
+
+	pr_warn("IBS APIC setup failed on cpu #%d\n",
+		smp_processor_id());
+}
+
+/*
+ * Undo setup_APIC_ibs() on the current CPU: re-program the IBS extended
+ * LVT entry with APIC_EILVT_MSG_FIX and a final argument of 1 (presumably
+ * the mask flag - verify against setup_APIC_eilvt()).
+ *
+ * Does nothing when the LVT offset is not valid.
+ */
+static void clear_APIC_ibs(void)
+{
+	int lvt_off = get_ibs_lvt_offset();
+
+	if (lvt_off < 0)
+		return;
+
+	setup_APIC_eilvt(lvt_off, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
+/*
+ * CPU hotplug "startup" callback, registered by
+ * ibs_access_profiling_init() through cpuhp_setup_state().
+ *
+ * @cpu: CPU number supplied by the hotplug core (not used).
+ *
+ * Sets up the IBS APIC LVT entry and always returns 0; a setup failure
+ * is only logged by setup_APIC_ibs().
+ */
+static int x86_amd_ibs_access_profile_startup(unsigned int cpu)
+{
+	setup_APIC_ibs();
+
+	return 0;
+}
+
+/*
+ * CPU hotplug "teardown" callback, registered by
+ * ibs_access_profiling_init() through cpuhp_setup_state().
+ *
+ * @cpu: CPU number supplied by the hotplug core (not used).
+ *
+ * Clears the IBS APIC LVT entry and always returns 0.
+ */
+static int x86_amd_ibs_access_profile_teardown(unsigned int cpu)
+{
+	clear_APIC_ibs();
+
+	return 0;
+}
+
+/*
+ * Boot-time setup of IBS based memory access profiling.
+ *
+ * Builds the IBS op control configuration in ibs_config (adding
+ * IBS_OP_L3MISSONLY when the IBS_CAPS_ZEN4 capability is reported),
+ * registers ibs_overflow_handler() as a local NMI handler and installs
+ * the CPU hotplug callbacks that program the IBS APIC LVT entry.
+ *
+ * Returns 0 on success and also when the CPU has no IBS support (nothing
+ * is registered in that case). Returns the negative error code from
+ * register_nmi_handler() or cpuhp_setup_state() if either registration
+ * fails; the NMI handler is unregistered again if the hotplug setup fails,
+ * so no half-initialised state is left behind.
+ */
+int __init ibs_access_profiling_init(void)
+{
+	u32 caps;
+	int ret;
+
+	ibs_config = IBS_OP_CNT_CTL | IBS_OP_ENABLE;
+
+	if (!boot_cpu_has(X86_FEATURE_IBS)) {
+		pr_info("IBS capability is unavailable for access profiling\n");
+		return 0;
+	}
+
+	caps = cpuid_eax(IBS_CPUID_FEATURES);
+	if (caps & IBS_CAPS_ZEN4)
+		ibs_config |= IBS_OP_L3MISSONLY;
+
+	ret = register_nmi_handler(NMI_LOCAL, ibs_overflow_handler, 0, "ibs");
+	if (ret) {
+		pr_warn("IBS access profiling: NMI handler registration failed (%d)\n",
+			ret);
+		return ret;
+	}
+
+	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
+				"x86/amd/ibs_access_profile:starting",
+				x86_amd_ibs_access_profile_startup,
+				x86_amd_ibs_access_profile_teardown);
+	if (ret < 0) {
+		pr_warn("IBS access profiling: CPU hotplug setup failed (%d)\n",
+			ret);
+		/* Do not leave the NMI handler installed without LVT setup */
+		unregister_nmi_handler(NMI_LOCAL, "ibs");
+		return ret;
+	}
+
+	pr_info("IBS access profiling setup for NUMA Balancing\n");
+	return 0;
+}
+
+arch_initcall(ibs_access_profiling_init);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 7f5d1caf5890..1d55e347d16c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -149,6 +149,17 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_X86
DIRECT_MAP_LEVEL2_SPLIT,
DIRECT_MAP_LEVEL3_SPLIT,
+#ifdef CONFIG_NUMA_BALANCING
+ IBS_NR_EVENTS,
+ IBS_KTHREAD,
+ IBS_NON_LOAD_STORES,
+ IBS_DC_L2_HITS,
+ IBS_NEAR_CACHE_HITS,
+ IBS_FAR_CACHE_HITS,
+ IBS_LADDR_INVALID,
+ IBS_KERNEL_ADDR,
+ IBS_PADDR_INVALID,
+#endif
#endif
NR_VM_EVENT_ITEMS
};
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1ea6a5ce1c41..c7a9d0d9ade8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1398,6 +1398,17 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_X86
"direct_map_level2_splits",
"direct_map_level3_splits",
+#ifdef CONFIG_NUMA_BALANCING
+ "ibs_nr_events",
+ "ibs_kthread",
+ "ibs_non_load_stores",
+ "ibs_dc_l2_hits",
+ "ibs_near_cache_hits",
+ "ibs_far_cache_hits",
+ "ibs_invalid_laddr",
+ "ibs_kernel_addr",
+ "ibs_invalid_paddr",
+#endif
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
--
2.25.1