[RFC,PATCH] VMWARE faults on accessing disabled counters

From: Jiri Olsa
Date: Wed Aug 31 2016 - 08:04:12 EST


hi,
when booting under VMWARE we get the following dmesg lines:

[ 0.051567] perf_event_intel: CPUID marked event: 'cpu cycles' unavailable
[ 0.051567] perf_event_intel: CPUID marked event: 'instructions' unavailable
[ 0.051568] perf_event_intel: CPUID marked event: 'bus cycles' unavailable
[ 0.051568] perf_event_intel: CPUID marked event: 'cache references' unavailable
[ 0.051569] perf_event_intel: CPUID marked event: 'cache misses' unavailable
[ 0.051570] perf_event_intel: CPUID marked event: 'branch instructions' unavailable
[ 0.051570] perf_event_intel: CPUID marked event: 'branch misses' unavailable

that means all the architectural events are marked unavailable
by CPUID(0xa).
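
Just for reference, the availability bits come from CPUID leaf 0xa:
EAX[31:24] gives the number of valid bits in EBX and a set bit in
EBX means the corresponding architectural event is not available.
A minimal userspace sketch to dump them (illustration only, not part
of the patch, assuming gcc's <cpuid.h>):

  #include <stdio.h>
  #include <cpuid.h>

  /* architectural events in CPUID.0AH:EBX bit order */
  static const char *arch_events[] = {
          "core cycles", "instructions retired", "reference cycles",
          "LLC references", "LLC misses",
          "branch instructions retired", "branch mispredicts retired",
  };

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx, i, len;

          if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx))
                  return 1;

          len = (eax >> 24) & 0xff;       /* number of valid EBX bits */

          for (i = 0; i < 7 && i < len; i++)
                  printf("%-30s %s\n", arch_events[i],
                         (ebx >> i) & 1 ? "unavailable" : "available");
          return 0;
  }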

The kernel zeroes the corresponding intel_perfmon_event_map
entries to prevent those events from being configured through
the PERF_TYPE_HARDWARE pmu type. However, they can still be
configured via the PERF_TYPE_RAW type.
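
For example, a minimal reproducer sketch (illustration only; it
assumes 0x003c, the architectural 'cpu cycles' encoding, is one of
the events marked unavailable) that opens the same encoding through
PERF_TYPE_RAW and reads the counter:

  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/perf_event.h>

  int main(void)
  {
          struct perf_event_attr attr;
          long long count;
          int fd;

          memset(&attr, 0, sizeof(attr));
          attr.size   = sizeof(attr);
          attr.type   = PERF_TYPE_RAW;    /* bypasses intel_perfmon_event_map */
          attr.config = 0x003c;           /* architectural 'cpu cycles' event/umask */

          fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
          if (fd < 0) {
                  perror("perf_event_open");
                  return 1;
          }

          /* read() ends up in __perf_event_read -> x86_pmu_read -> rdpmc */
          if (read(fd, &count, sizeof(count)) == sizeof(count))
                  printf("raw 0x003c: %lld\n", count);

          close(fd);
          return 0;
  }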

We're getting a GP fault on VMWARE when reading the cycles PMC
configured through the PERF_TYPE_RAW interface:

#4 [ffff88007c603e10] do_general_protection at ffffffff8163da9e
#5 [ffff88007c603e40] general_protection at ffffffff8163d3a8
[exception RIP: native_read_pmc+6]
RIP: ffffffff81058d66 RSP: ffff88007c603ef0 RFLAGS: 00010083
RAX: ffffffff81957ee0 RBX: 0000000000000000 RCX: 0000000040000002
RDX: 000000000ff8f719 RSI: ffff88007c617fa8 RDI: 0000000040000002
RBP: ffff88007c603ef0 R8: 00007ffde5053150 R9: 0000000000000000
R10: 00007ffde5052530 R11: 00007fbb22aedc70 R12: ffffffff80000001
R13: ffff880079b74400 R14: ffff880079b74578 R15: 0000000000000010
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000
#6 [ffff88007c603ef8] x86_perf_event_update at ffffffff81029e03
#7 [ffff88007c603f30] x86_pmu_read at ffffffff8102a079
#8 [ffff88007c603f40] __perf_event_read at ffffffff811590de

I couldn't find out what rdpmc does on real HW in this situation,
so I'm not sure if we actually want to prevent this. The patch
below tries to catch this case.
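
For illustration, the same config can also be requested from the
perf tool via a raw event spec; with the patch applied, the
perf_event_open() for such an event is expected to fail with ENOENT
instead of the counter read faulting later on:

  # perf stat -e r003c -a sleep 1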

thanks,
jirka


---
arch/x86/events/core.c | 8 ++++-
arch/x86/events/intel/core.c | 72 ++++++++++++++++++++++++++++++++------------
arch/x86/events/perf_event.h | 6 ++++
3 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 473519100b11..d836c5922b12 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -534,8 +534,14 @@ int x86_pmu_hw_config(struct perf_event *event)
if (!event->attr.exclude_kernel)
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

- if (event->attr.type == PERF_TYPE_RAW)
+ if (event->attr.type == PERF_TYPE_RAW) {
+ u64 arch_config = event->attr.config & INTEL_ARCH_EVENT_MASK;
+
+ if (x86_pmu_event_disabled(arch_config))
+ return -ENOENT;
+
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+ }

if (event->attr.sample_period && x86_pmu.limit_period) {
if (x86_pmu.limit_period(event, event->attr.sample_period) >
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9049d62f34ae..99a83529c7ff 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -23,16 +23,22 @@
/*
* Intel PerfMon, used on Core and later.
*/
-static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
- [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
+struct intel_perfmon_event {
+ u64 config;
+ bool disabled;
+ u64 replacement;
+};
+
+static struct intel_perfmon_event intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = { .config = 0x003c },
+ [PERF_COUNT_HW_INSTRUCTIONS] = { .config = 0x00c0 },
+ [PERF_COUNT_HW_CACHE_REFERENCES] = { .config = 0x4f2e },
+ [PERF_COUNT_HW_CACHE_MISSES] = { .config = 0x412e },
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { .config = 0x00c4 },
+ [PERF_COUNT_HW_BRANCH_MISSES] = { .config = 0x00c5 },
+ [PERF_COUNT_HW_BUS_CYCLES] = { .config = 0x013c },
+ [PERF_COUNT_HW_REF_CPU_CYCLES] = { .config = 0x0300 }, /* pseudo-encoding */
};

static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -268,7 +274,31 @@ struct event_constraint intel_bdw_event_constraints[] = {

static u64 intel_pmu_event_map(int hw_event)
{
- return intel_perfmon_event_map[hw_event];
+ struct intel_perfmon_event *event = &intel_perfmon_event_map[hw_event];
+
+ if (event->disabled)
+ return 0;
+ if (event->replacement)
+ return event->replacement;
+
+ return event->config;
+}
+
+static bool intel_pmu_event_disabled(int hw_event)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(intel_perfmon_event_map); i++) {
+ struct intel_perfmon_event *event = &intel_perfmon_event_map[i];
+
+ if (event->config != hw_event)
+ continue;
+
+ if (event->disabled)
+ return true;
+ }
+
+ return false;
}

/*
@@ -3165,6 +3195,7 @@ static __initconst const struct x86_pmu core_pmu = {
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
.event_map = intel_pmu_event_map,
+ .event_disabled = intel_pmu_event_disabled,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
.free_running_flags = PEBS_FREERUNNING_FLAGS,
@@ -3205,6 +3236,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
.event_map = intel_pmu_event_map,
+ .event_disabled = intel_pmu_event_disabled,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
.free_running_flags = PEBS_FREERUNNING_FLAGS,
@@ -3357,7 +3389,7 @@ static __init void intel_arch_events_quirk(void)

/* disable event that reported as not presend by cpuid */
for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
- intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+ intel_perfmon_event_map[intel_arch_events_map[bit].id].disabled = true;
pr_warn("CPUID marked event: \'%s\' unavailable\n",
intel_arch_events_map[bit].name);
}
@@ -3375,7 +3407,7 @@ static __init void intel_nehalem_quirk(void)
* branch-misses, but it's still much better than the
* architectural event which is often completely bogus:
*/
- intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+ intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES].replacement = 0x7f89;
ebx.split.no_branch_misses_retired = 0;
x86_pmu.events_maskl = ebx.full;
pr_info("CPU erratum AAJ80 worked around\n");
@@ -3543,10 +3575,10 @@ __init int intel_pmu_init(void)
x86_pmu.cpu_events = nhm_events_attrs;

/* UOPS_ISSUED.STALLED_CYCLES */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND].replacement =
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);

intel_pmu_pebs_data_source_nhm();
@@ -3630,10 +3662,10 @@ __init int intel_pmu_init(void)
x86_pmu.cpu_events = nhm_events_attrs;

/* UOPS_ISSUED.STALLED_CYCLES */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND].replacement =
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);

intel_pmu_pebs_data_source_nhm();
@@ -3667,10 +3699,10 @@ __init int intel_pmu_init(void)
x86_pmu.cpu_events = snb_events_attrs;

/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND].replacement =
X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);

pr_cont("SandyBridge events, ");
@@ -3704,7 +3736,7 @@ __init int intel_pmu_init(void)
x86_pmu.cpu_events = snb_events_attrs;

/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);

pr_cont("IvyBridge events, ");
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 01ddfeadaee6..69cca7dc8de4 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -514,6 +514,7 @@ struct x86_pmu {
int (*addr_offset)(int index, bool eventsel);
int (*rdpmc_index)(int index);
u64 (*event_map)(int);
+ bool (*event_disabled)(int);
int max_events;
int num_counters;
int num_counters_fixed;
@@ -715,6 +716,11 @@ static inline int x86_pmu_rdpmc_index(int index)
return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}

+static inline bool x86_pmu_event_disabled(u64 config)
+{
+ return x86_pmu.event_disabled ? x86_pmu.event_disabled(config) : false;
+}
+
int x86_add_exclusive(unsigned int what);

void x86_del_exclusive(unsigned int what);
--
2.7.4