[PATCH] Add support for disabling Intel PT trace in ftrace

From: Andi Kleen
Date: Fri Nov 18 2016 - 11:55:32 EST


From: Andi Kleen <ak@xxxxxxxxxxxxxxx>

ftrace has powerful trigger functions. Intel PT on modern Intel CPUs
can trace execution flow.

For debugging I found it useful to disable the PT trace from ftrace triggers,
for example when specific kernel functions are hit, which indicate
a problem. Then we can see the exact execution trace up to this point.

This patch adds a "ptoff" ftrace trigger/function that disables the PT trace
on the current CPU when the function is hit. The PT trace still has to be
set up with perf:

% perf record -e intel_pt// -a ... &
% cd /sys/kernel/debug/tracing
% echo do_page_fault:ptoff > set_ftrace_filter
...
% cd -
% kill %1
% perf script --itrace=i0ns

I only implemented local disabling. Enabling would be much more complicated
and would require a blacklist of functions to avoid recursion. Global
disabling with IPIs would be possible, but would also risk some deadlock
scenarios. Local disabling is very easy and can be done without
accessing any special state, so there are no such problems. It is
usually good enough for debugging purposes. The trace can always be
re-enabled from perf.

This patch adds "ptoff" both as an ftrace trigger and as an ftrace function
command. This makes it work from "set_ftrace_filter" and through the trigger
file of trace points.

The PT driver exports a pt_disable() function for this, which can also be
used for manual instrumentation.

Cc: tom.zanussi@xxxxxxxxxxxxxxx
Cc: rostedt@xxxxxxxxxxx
Cc: peterz@xxxxxxxxxxxxx
Cc: alexander.shishkin@xxxxxxxxx
Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
---
Documentation/trace/ftrace.txt | 5 +++
arch/x86/events/intel/pt.c | 16 ++++++++
include/linux/perf_event.h | 2 +
include/linux/trace_events.h | 1 +
kernel/trace/trace.c | 6 +++
kernel/trace/trace_events_trigger.c | 79 +++++++++++++++++++++++++++++++++++++
kernel/trace/trace_functions.c | 58 +++++++++++++++++++++++++++
7 files changed, 167 insertions(+)

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 185c39fea2a0..5dc8ec658678 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -2549,6 +2549,11 @@ The following commands are supported:
command, it only prints out the contents of the ring buffer for the
CPU that executed the function that triggered the dump.

+- ptoff
+ When the function is hit, disable the Intel PT trace. The Intel PT
+ trace has to be set up beforehand with perf record -a -e intel_pt// ...
+ This disables the trace on the current CPU only.
+
trace_pipe
----------

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index c5047b8f777b..cf15881da9a5 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1455,3 +1455,19 @@ static __init int pt_init(void)
return ret;
}
arch_initcall(pt_init);
+
+/*
+ * pt_disable - stop the Intel PT trace on the current CPU.
+ *
+ * Intended for debugging: clears RTIT_CTL_TRACEEN in MSR_IA32_RTIT_CTL.
+ * Does nothing when the boot CPU lacks X86_FEATURE_INTEL_PT or when the
+ * control MSR cannot be read. The final write is best effort; its result
+ * is not reported to the caller.
+ */
+void pt_disable(void)
+{
+	u64 val;
+
+	if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
+		return;
+
+	/*
+	 * If the read failed, 'val' does not hold the MSR contents; writing
+	 * it back would clobber the trace configuration.
+	 */
+	if (rdmsrl_safe(MSR_IA32_RTIT_CTL, &val))
+		return;
+
+	val &= ~RTIT_CTL_TRACEEN;
+	wrmsrl_safe(MSR_IA32_RTIT_CTL, val);
+}
+EXPORT_SYMBOL(pt_disable);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecdb9817..a408d288298b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1387,4 +1387,6 @@ int perf_event_exit_cpu(unsigned int cpu);
#define perf_event_exit_cpu NULL
#endif

+/* Disable the Intel PT trace; defined in arch/x86/events/intel/pt.c. */
+extern void pt_disable(void);
+
#endif /* _LINUX_PERF_EVENT_H */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index be007610ceb0..4d2d4a1b738e 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -391,6 +391,7 @@ enum event_trigger_type {
ETT_EVENT_ENABLE = (1 << 3),
ETT_EVENT_HIST = (1 << 4),
ETT_HIST_ENABLE = (1 << 5),
+ ETT_PTOFF = (1 << 6),
};

extern int filter_match_preds(struct event_filter *filter, void *rec);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8696ce6bf2f6..e55405dce821 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4082,6 +4082,9 @@ static const char readme_msg[] =
#endif
"\t\t dump\n"
"\t\t cpudump\n"
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+ "\t\t ptoff\n"
+#endif
"\t example: echo do_fault:traceoff > set_ftrace_filter\n"
"\t echo do_trap:traceoff:3 > set_ftrace_filter\n"
"\t The first one will disable tracing every time do_fault is hit\n"
@@ -4175,6 +4178,9 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
"\t\t hist (see below)\n"
#endif
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+ "\t\t ptoff\t\t- Disable PT trace on current CPU\n"
+#endif
"\t example: echo traceoff > events/block/block_unplug/trigger\n"
"\t echo traceoff:3 > events/block/block_unplug/trigger\n"
"\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n"
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index a26ff1345784..b4ec8c417c12 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -22,6 +22,7 @@
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/slab.h>
+#include <linux/perf_event.h>

#include "trace.h"

@@ -1044,6 +1045,83 @@ static struct event_command trigger_traceoff_cmd = {
.set_filter = set_trigger_filter,
};

+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+
+/* Trigger callback without a count: simply calls pt_disable(). */
+static void ptoff_trigger(struct event_trigger_data *data, void *rec)
+{
+	pt_disable();
+}
+
+/*
+ * Counted variant of the ptoff trigger: does nothing once data->count is
+ * zero; otherwise decrements the count (a count of -1 is never decremented)
+ * and invokes ptoff_trigger().
+ */
+static void ptoff_count_trigger(struct event_trigger_data *data, void *rec)
+{
+	if (data->count == 0)
+		return;
+
+	if (data->count != -1)
+		data->count--;
+
+	ptoff_trigger(data, rec);
+}
+
+/*
+ * Print callback: hands the name "ptoff", the trigger's count and its
+ * filter string to event_trigger_print() and returns that helper's result.
+ */
+static int ptoff_trigger_print(struct seq_file *m,
+			       struct event_trigger_ops *ops,
+			       struct event_trigger_data *data)
+{
+	void *count = (void *)data->count;
+
+	return event_trigger_print("ptoff", m, count, data->filter_str);
+}
+
+/* Ops returned by ptoff_get_trigger_ops() when no parameter is given. */
+static struct event_trigger_ops ptoff_trigger_ops = {
+	.init	= event_trigger_init,
+	.free	= event_trigger_free,
+	.func	= ptoff_trigger,
+	.print	= ptoff_trigger_print,
+};
+
+/* Ops returned by ptoff_get_trigger_ops() when a parameter is given. */
+static struct event_trigger_ops ptoff_count_trigger_ops = {
+	.init	= event_trigger_init,
+	.free	= event_trigger_free,
+	.func	= ptoff_count_trigger,
+	.print	= ptoff_trigger_print,
+};
+
+/* Pick the counted ops when a parameter was supplied, the plain ops otherwise. */
+static struct event_trigger_ops *ptoff_get_trigger_ops(char *cmd, char *param)
+{
+	if (param)
+		return &ptoff_count_trigger_ops;
+
+	return &ptoff_trigger_ops;
+}
+
+/* Command descriptor for "ptoff"; registered by register_trigger_ptoff_cmd(). */
+static struct event_command trigger_ptoff_cmd = {
+	.name			= "ptoff",
+	.trigger_type		= ETT_PTOFF,
+	.func			= event_trigger_callback,
+	.reg			= register_trigger,
+	.unreg			= unregister_trigger,
+	.get_trigger_ops	= ptoff_get_trigger_ops,
+	.set_filter		= set_trigger_filter,
+};
+
+/*
+ * Register the "ptoff" trigger command. Registration is skipped (returning
+ * 0) when the boot CPU has no Intel PT; a negative result from
+ * register_event_command() raises a WARN and is returned to the caller.
+ */
+static __init int register_trigger_ptoff_cmd(void)
+{
+	int err = 0;
+
+	if (boot_cpu_has(X86_FEATURE_INTEL_PT)) {
+		err = register_event_command(&trigger_ptoff_cmd);
+		WARN_ON(err < 0);
+	}
+
+	return err;
+}
+
+#else
+
+/* Stub used when CONFIG_PERF_EVENTS or CONFIG_CPU_SUP_INTEL is not set. */
+static inline int register_trigger_ptoff_cmd(void)
+{
+	return 0;
+}
+
+#endif
+
#ifdef CONFIG_TRACER_SNAPSHOT
static void
snapshot_trigger(struct event_trigger_data *data, void *rec)
@@ -1609,6 +1687,7 @@ __init int register_trigger_cmds(void)
register_trigger_enable_disable_cmds();
register_trigger_hist_enable_disable_cmds();
register_trigger_hist_cmd();
+ register_trigger_ptoff_cmd();

return 0;
}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0efa00d80623..80867e3166f7 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -15,6 +15,7 @@
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/fs.h>
+#include <linux/perf_event.h>

#include "trace.h"

@@ -643,6 +644,57 @@ static struct ftrace_func_command ftrace_cpudump_cmd = {
.func = ftrace_cpudump_callback,
};

+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+/* Probe handler: calls pt_disable() only when update_count() returns non-zero. */
+static void
+ftrace_ptoff_probe(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	if (!update_count(data))
+		return;
+
+	pt_disable();
+}
+
+/* Print callback: forwards to ftrace_probe_print() under the name "ptoff". */
+static int ftrace_ptoff_print(struct seq_file *m, unsigned long ip,
+			      struct ftrace_probe_ops *ops, void *data)
+{
+	return ftrace_probe_print("ptoff", m, ip, data);
+}
+
+/* Probe ops installed by ftrace_ptoff_callback() for the "ptoff" command. */
+static struct ftrace_probe_ops ptoff_probe_ops = {
+	.print	= ftrace_ptoff_print,
+	.func	= ftrace_ptoff_probe,
+};
+
+/*
+ * Handler for the "ptoff" function command. The user-supplied param is
+ * ignored: the count passed on to ftrace_trace_probe_callback() is always
+ * "1".
+ */
+static int ftrace_ptoff_callback(struct ftrace_hash *hash, char *glob,
+				 char *cmd, char *param, int enable)
+{
+	/* Only disable once. */
+	return ftrace_trace_probe_callback(&ptoff_probe_ops, hash, glob, cmd,
+					   "1", enable);
+}
+
+/* The "ptoff" function command; registered by register_ptoff_command(). */
+static struct ftrace_func_command ftrace_ptoff_cmd = {
+	.name	= "ptoff",
+	.func	= ftrace_ptoff_callback,
+};
+
+/*
+ * Register the "ptoff" function command. Returns 0 without registering
+ * anything when the boot CPU has no Intel PT, otherwise the result of
+ * register_ftrace_command().
+ *
+ * Marked __init like its caller init_func_cmd_traceon() and like
+ * register_trigger_ptoff_cmd(), so the code is discarded after boot.
+ * NOTE(review): register_ftrace_command() is believed to be __init as
+ * well, in which case the annotation also avoids a section mismatch.
+ */
+static int __init register_ptoff_command(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
+		return 0;
+
+	return register_ftrace_command(&ftrace_ptoff_cmd);
+}
+
+#else
+
+/* Stub used when CONFIG_PERF_EVENTS or CONFIG_CPU_SUP_INTEL is not set. */
+static inline int register_ptoff_command(void)
+{
+	return 0;
+}
+
+#endif
+
static int __init init_func_cmd_traceon(void)
{
int ret;
@@ -667,8 +719,14 @@ static int __init init_func_cmd_traceon(void)
if (ret)
goto out_free_dump;

+ ret = register_ptoff_command();
+ if (ret)
+ goto out_free_cpudump;
+
return 0;

+ out_free_cpudump:
+ unregister_ftrace_command(&ftrace_cpudump_cmd);
out_free_dump:
unregister_ftrace_command(&ftrace_dump_cmd);
out_free_stacktrace:
--
2.5.5