[PATCH RFC 2/3] x86: Add Intel PT logger

From: Takao Indoh
Date: Wed Jul 29 2015 - 01:02:44 EST


This patch provides an Intel PT logging feature. When the system boots
with the parameter "intel_pt_log", log buffers for Intel PT are
allocated and logging starts; processor execution flow information is
then written into the log buffers by hardware, like a flight recorder.
This is very helpful for investigating the cause of a kernel panic.

The log buffer size is specified by the parameter
"intel_pt_log_buf_len=<size>". The buffer is used as a circular
buffer, so old events are overwritten by new events.
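
For example, booting with:

  intel_pt_log intel_pt_log_buf_len=8388608

allocates an 8MB (2048-page, with 4KB pages) circular trace buffer on
each online CPU and starts logging during boot. The size is parsed by
get_option() as a plain byte count, so suffixes such as "4M" are not
accepted; if the parameter is omitted, the default is 1024 pages (4MB)
per CPU.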

Signed-off-by: Takao Indoh <indou.takao@xxxxxxxxxxxxxx>
---
arch/x86/Kconfig | 16 ++
arch/x86/kernel/cpu/Makefile | 2 +
arch/x86/kernel/cpu/intel_pt_log.c | 288 ++++++++++++++++++++++++++++++++++++
3 files changed, 306 insertions(+), 0 deletions(-)
create mode 100644 arch/x86/kernel/cpu/intel_pt_log.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 55bced1..c31400f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1658,6 +1658,22 @@ config X86_INTEL_MPX

If unsure, say N.

+config X86_INTEL_PT_LOG
+ prompt "Intel PT logger"
+ def_bool n
+ depends on CPU_SUP_INTEL
+ ---help---
+ Intel PT is a hardware feature that can capture information
+ about program execution flow. Once Intel PT is enabled, the
+ events which change program flow, such as branch instructions,
+ exceptions, interrupts and traps, are logged in memory.
+
+ This option enables the Intel PT logging feature at boot
+ time. When a kernel panic occurs, the Intel PT log buffers
+ can be retrieved from the crash dump file and used to
+ reconstruct the detailed execution flow that led to the panic.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 77d371c..24629ff 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -58,6 +58,8 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o

obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o

+obj-$(CONFIG_X86_INTEL_PT_LOG) += intel_pt_log.o
+
ifdef CONFIG_X86_FEATURE_NAMES
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
diff --git a/arch/x86/kernel/cpu/intel_pt_log.c b/arch/x86/kernel/cpu/intel_pt_log.c
new file mode 100644
index 0000000..b1c4d66
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pt_log.c
@@ -0,0 +1,288 @@
+/*
+ * Intel Processor Trace Logger
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/intel_pt.h>
+
+#define PT_LOG_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
+
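+/*
+ * Per-CPU logging state: the output regions, the ToPA tables that map
+ * them, and the PT MSR values saved at panic time.
+ */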
+struct pt_log_buf {
+ int cpu;
+
+ void **region; /* array of pointer to output region */
+ int region_size; /* size of region array */
+ int region_order; /* page order of region */
+
+ void **tbl; /* array of pointer to ToPA table */
+ int tbl_size; /* size of tbl array */
+
+ /* Saved registers on panic */
+ u64 saved_msr_ctl;
+ u64 saved_msr_status;
+ u64 saved_msr_output_base;
+ u64 saved_msr_output_mask;
+};
+
+static int pt_log_enabled;
+/* Per-CPU log buffer size in pages (1024 pages = 4MB with 4KB pages) */
+static int pt_log_buf_nr_pages = 1024;
+
+static DEFINE_PER_CPU(struct pt_log_buf, pt_log_buf_ptr);
+static struct cpumask pt_cpu_mask;
+
+static void enable_pt(int enable)
+{
+ u64 ctl;
+
+ rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ if (enable)
+ ctl |= RTIT_CTL_TRACEEN;
+ else
+ ctl &= ~RTIT_CTL_TRACEEN;
+
+ wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+}
+
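+/*
+ * Stop tracing and snapshot the PT MSRs so that the final write
+ * position in the log buffer can be recovered from a crash dump.
+ * Expected to be called from the panic path (wired up elsewhere in
+ * this series).
+ */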
+void save_intel_pt_registers(void)
+{
+ struct pt_log_buf *buf = this_cpu_ptr(&pt_log_buf_ptr);
+
+ if (!cpumask_test_cpu(smp_processor_id(), &pt_cpu_mask))
+ return;
+
+ enable_pt(0);
+
+ rdmsrl(MSR_IA32_RTIT_CTL, buf->saved_msr_ctl);
+ rdmsrl(MSR_IA32_RTIT_STATUS, buf->saved_msr_status);
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, buf->saved_msr_output_base);
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, buf->saved_msr_output_mask);
+}
+
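+/*
+ * Trace both kernel (OS) and user (USR) execution, select ToPA output
+ * (TOPA), and emit TSC timestamp packets (TSC_EN) in addition to the
+ * branch packets (BRANCH_EN).
+ */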
+static void setup_pt_ctl_register(void)
+{
+ u64 reg;
+
+ rdmsrl(MSR_IA32_RTIT_CTL, reg);
+
+ reg |= RTIT_CTL_OS | RTIT_CTL_USR | RTIT_CTL_TOPA |
+ RTIT_CTL_TSC_EN | RTIT_CTL_BRANCH_EN;
+
+ wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
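+/*
+ * In ToPA mode, MSR_IA32_RTIT_OUTPUT_MASK holds the current output
+ * position: bits 6:0 are reserved and must be set, bits 31:7 hold the
+ * index of the current ToPA table entry, and bits 63:32 hold the
+ * offset into the current output region.
+ */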
+static void setup_pt_output_register(void *base, unsigned int topa_idx,
+ unsigned int output_off)
+{
+ u64 reg;
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(base));
+
+ reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+static void *pt_alloc_pages(void **buf, int *index, int node, int order)
+{
+ struct page *page;
+ void *ptr = NULL;
+
+ page = alloc_pages_node(node, PT_LOG_GFP, order);
+ if (page) {
+ ptr = page_address(page);
+ buf[(*index)++] = ptr;
+ }
+
+ return ptr;
+}
+
+/* Free pages allocated by pt_alloc_pages(); order must match the allocation */
+static void pt_free_pages(void **buf, int size, int order)
+{
+ int i;
+
+ for (i = 0; i < size; i++)
+ free_pages((unsigned long)buf[i], order);
+}
+
+static int setup_pt_buffer(struct pt_log_buf *buf)
+{
+ int node = cpu_to_node(buf->cpu);
+ int size, order;
+
+ if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ /* A page is used as one output region */
+ size = pt_log_buf_nr_pages;
+ order = 0;
+ } else {
+ /* One contiguous memory range is used as one output region */
+ size = 1;
+ order = min(get_order(pt_log_buf_nr_pages * PAGE_SIZE),
+ TOPA_SZ_END - 1);
+ }
+
+ buf->region = kzalloc_node(size * sizeof(void *), GFP_KERNEL, node);
+ if (!buf->region)
+ return -ENOMEM;
+
+ buf->region_size = 0;
+ buf->region_order = order;
+
+ while (buf->region_size < size) {
+ if (!pt_alloc_pages(buf->region, &(buf->region_size),
+ node, order)) {
+ pt_free_pages(buf->region, buf->region_size, order);
+ kfree(buf->region);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
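+/*
+ * Build the ToPA (Table of Physical Addresses) describing the log
+ * buffer. Each table is one page of 8-byte entries pointing to the
+ * output regions; tables are chained via END entries, and the END
+ * entry of the last table points back to the first one, so the output
+ * wraps around and the buffer acts as a circular log.
+ */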
+static int setup_pt_topa_tbl(struct pt_log_buf *buf)
+{
+ int node = cpu_to_node(buf->cpu);
+ int nr_pages, nr_entries_per_page, i;
+ struct topa_entry *entry;
+ int topa_offset = 0;
+ void *new_tbl;
+
+ /*
+ * Count the number of ToPA entries in a page. A ToPA entry
+ * is 8 bytes, therefore there are (PAGE_SIZE >> 3) entries in
+ * one page, and the last one is reserved for the END entry.
+ * With 4KB pages that leaves 512 - 1 = 511 usable entries.
+ */
+ nr_entries_per_page = (PAGE_SIZE >> 3) - 1;
+
+ nr_pages = DIV_ROUND_UP(buf->region_size, nr_entries_per_page);
+
+ buf->tbl = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
+ if (!buf->tbl)
+ return -ENOMEM;
+
+ buf->tbl_size = 0;
+ entry = pt_alloc_pages(buf->tbl, &(buf->tbl_size), node, 0);
+ if (!entry)
+ goto fail;
+
+ /* Insert all buf->region pages into ToPA table */
+ for (i = 0; i < buf->region_size; i++) {
+ if (topa_offset == nr_entries_per_page) {
+ /* Use the last entry as END entry */
+ new_tbl = pt_alloc_pages(buf->tbl, &(buf->tbl_size),
+ node, 0);
+ if (!new_tbl)
+ goto fail;
+
+ entry[topa_offset].end = 1;
+ entry[topa_offset].base =
+ virt_to_phys(new_tbl) >> TOPA_SHIFT;
+ topa_offset = 0;
+ entry = new_tbl;
+ }
+
+ /* Add region to ToPA table */
+ entry[topa_offset].size = buf->region_order;
+ entry[topa_offset].base =
+ virt_to_phys(buf->region[i]) >> TOPA_SHIFT;
+ topa_offset++;
+ }
+
+ /* END entry */
+ entry[topa_offset].end = 1;
+ entry[topa_offset].base = virt_to_phys(buf->tbl[0]) >> TOPA_SHIFT;
+
+ return 0;
+
+fail:
+ pt_free_pages(buf->tbl, buf->tbl_size, 0);
+ kfree(buf->tbl);
+ return -ENOMEM;
+}
+
+static void pt_log_start(void *data)
+{
+ struct pt_log_buf *buf = this_cpu_ptr(&pt_log_buf_ptr);
+
+ setup_pt_output_register(buf->tbl[0], 0, 0);
+ setup_pt_ctl_register();
+
+ enable_pt(1);
+ cpumask_set_cpu(smp_processor_id(), &pt_cpu_mask);
+}
+
+static int __init pt_log_init(void)
+{
+ int cpu;
+ struct cpumask status;
+
+ cpumask_clear(&pt_cpu_mask);
+ cpumask_clear(&status);
+
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+ return 0;
+
+ if (!pt_log_enabled)
+ return 0;
+
+ pt_cap_init();
+
+ if (!pt_cap_get(PT_CAP_topa_output)) {
+ pr_err("ToPA table is not supported.\n");
+ return -ENODEV;
+ }
+
+ /* Prepare log buffer */
+ for_each_online_cpu(cpu) {
+ struct pt_log_buf *buf = per_cpu_ptr(&pt_log_buf_ptr, cpu);
+
+ buf->cpu = cpu;
+ if (setup_pt_buffer(buf)) {
+ pr_err("[%d]: Failed to set up log buffer\n", cpu);
+ continue;
+ }
+
+ if (setup_pt_topa_tbl(buf)) {
+ pt_free_pages(buf->region, buf->region_size,
+ buf->region_order);
+ kfree(buf->region);
+ pr_err("[%d]: Failed to set up ToPA table\n", cpu);
+ continue;
+ }
+
+ cpumask_set_cpu(cpu, &status);
+ }
+
+ /*
+ * Start logging on each CPU. smp_call_function_many() skips
+ * the calling CPU and must be called with preemption disabled.
+ */
+ cpu = get_cpu();
+ smp_call_function_many(&status, pt_log_start, NULL, 1);
+ if (cpumask_test_cpu(cpu, &status))
+ pt_log_start(NULL);
+ put_cpu();
+
+ pr_info("logging started: %*pb\n", cpumask_pr_args(&pt_cpu_mask));
+
+ return 0;
+}
+postcore_initcall(pt_log_init);
+
+/* "intel_pt_log_buf_len=<bytes>": per-CPU log buffer size in bytes */
+static int __init pt_log_buf_setup(char *str)
+{
+ int len;
+
+ if (get_option(&str, &len))
+ pt_log_buf_nr_pages = len >> PAGE_SHIFT;
+
+ return 1;
+}
+__setup("intel_pt_log_buf_len=", pt_log_buf_setup);
+
+static int __init pt_log_setup(char *str)
+{
+ pt_log_enabled = 1;
+ return 1;
+}
+__setup("intel_pt_log", pt_log_setup);
--
1.7.1

