[PATCH] perf/x86/intel/lbr: Enable NO_{CYCLES,FLAGS} for all LBR formats

From: kan.liang
Date: Tue Jul 07 2020 - 14:57:25 EST


From: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>

An option to disable reading branch flags/cycles was introduced in
commit b16a5b52eb90 ("perf/x86: Add option to disable reading branch
flags/cycles"). Currently, the option is only honored for the
LBR_FORMAT_INFO format. For the other LBR formats, including the
legacy LBR, Architectural LBR, and the LBR PEBS record, setting the
NO_{CYCLES,FLAGS} flags has no effect: the flags/cycles information
is still output, which breaks the ABI.
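
For reference, the fields in question are the flag/cycle bits of
struct perf_branch_entry (include/uapi/linux/perf_event.h), which
must stay zero when the user opts out. Abbreviated excerpt:

  struct perf_branch_entry {
          __u64   from;
          __u64   to;
          __u64   mispred:1,    /* target mispredicted */
                  predicted:1,  /* target predicted */
                  in_tx:1,      /* in transaction */
                  abort:1,      /* transaction abort */
                  cycles:16,    /* cycle count to last branch */
                  type:4,       /* branch type */
                  reserved:40;
  };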

For all LBR formats, avoid outputting the flags and cycles if the user
explicitly sets the PERF_SAMPLE_BRANCH_NO_{CYCLES,FLAGS} branch sample
flags.
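
As a minimal userspace sketch (hypothetical sampling values, error
handling and the perf_event_open() call omitted), an event that opts
out of both would be configured as:

  struct perf_event_attr attr = { 0 };

  attr.size = sizeof(attr);
  attr.type = PERF_TYPE_HARDWARE;
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  attr.sample_period = 100000;
  attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
  /* Take any branch, but do not read its flags or cycle count. */
  attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
                            PERF_SAMPLE_BRANCH_NO_FLAGS |
                            PERF_SAMPLE_BRANCH_NO_CYCLES;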

For Architectural LBR, the branch type information is normally
retrieved from the LBR_INFO field/MSR. With the NO_{CYCLES,FLAGS}
flags set, LBR_INFO is not read, and the branch type information will
rely on software decoding instead, just like the legacy LBR.
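
Conceptually, the per-entry type fallback in intel_pmu_lbr_filter()
then looks like the simplified sketch below (not the exact filter
code):

  if (e->type > ARCH_LBR_BR_TYPE_KNOWN_MAX)
          /* No valid hw type recorded: decode the branch itself. */
          type = branch_type(e->from, e->to, e->abort);
  else
          type = arch_lbr_br_type_map[e->type];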

Fixes: b16a5b52eb90 ("perf/x86: Add option to disable reading branch flags/cycles")
Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
---
arch/x86/events/intel/lbr.c | 130 ++++++++++++++++++++++++++------------------
1 file changed, 78 insertions(+), 52 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 63f58bd..944291a 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -799,6 +799,14 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
cpuc->lbr_stack.hw_idx = tos;
}

+static inline bool lbr_need_info(struct cpu_hw_events *cpuc)
+{
+ if (cpuc->lbr_sel)
+ return !(cpuc->lbr_sel->config & LBR_NO_INFO);
+
+ return false;
+}
+
/*
* Due to lack of segmentation in Linux the effective address (offset)
* is the same as the linear address, allowing us to merge the LIP and EIP
@@ -806,7 +814,8 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
*/
void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
- bool need_info = false, call_stack = false;
+ bool need_info = lbr_need_info(cpuc);
+ bool call_stack = false;
unsigned long mask = x86_pmu.lbr_nr - 1;
int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
@@ -814,11 +823,8 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
int out = 0;
int num = x86_pmu.lbr_nr;

- if (cpuc->lbr_sel) {
- need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
- if (cpuc->lbr_sel->config & LBR_CALL_STACK)
- call_stack = true;
- }
+ if (cpuc->lbr_sel && (cpuc->lbr_sel->config & LBR_CALL_STACK))
+ call_stack = true;

for (i = 0; i < num; i++) {
unsigned long lbr_idx = (tos - i) & mask;
@@ -849,23 +855,28 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
}

if (lbr_format == LBR_FORMAT_TIME) {
- mis = !!(from & LBR_FROM_FLAG_MISPRED);
- pred = !mis;
skip = 1;
- cycles = ((to >> 48) & LBR_INFO_CYCLES);
-
+ if (need_info) {
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
+ cycles = ((to >> 48) & LBR_INFO_CYCLES);
+ }
to = (u64)((((s64)to) << 16) >> 16);
}

if (lbr_flags & LBR_EIP_FLAGS) {
- mis = !!(from & LBR_FROM_FLAG_MISPRED);
- pred = !mis;
skip = 1;
+ if (need_info) {
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
+ }
}
if (lbr_flags & LBR_TSX) {
- in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
- abort = !!(from & LBR_FROM_FLAG_ABORT);
skip = 3;
+ if (need_info) {
+ in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+ abort = !!(from & LBR_FROM_FLAG_ABORT);
+ }
}
from = (u64)((((s64)from) << skip) >> skip);

@@ -928,8 +939,21 @@ static __always_inline bool get_lbr_cycles(u64 info)
return info & LBR_INFO_CYCLES;
}

+enum {
+ ARCH_LBR_BR_TYPE_JCC = 0,
+ ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1,
+ ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2,
+ ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3,
+ ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4,
+ ARCH_LBR_BR_TYPE_NEAR_RET = 5,
+ ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET,
+
+ ARCH_LBR_BR_TYPE_MAP_MAX = 16,
+};
+
static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
- struct lbr_entry *entries)
+ struct lbr_entry *entries,
+ bool need_info)
{
struct perf_branch_entry *e;
struct lbr_entry *lbr;
@@ -948,16 +972,33 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
break;

to = rdlbr_to(i, lbr);
- info = rdlbr_info(i, lbr);

e->from = from;
e->to = to;
- e->mispred = get_lbr_mispred(info);
- e->predicted = get_lbr_predicted(info);
- e->in_tx = !!(info & LBR_INFO_IN_TX);
- e->abort = !!(info & LBR_INFO_ABORT);
- e->cycles = get_lbr_cycles(info);
- e->type = get_lbr_br_type(info);
+ if (need_info) {
+ info = rdlbr_info(i, lbr);
+ e->mispred = get_lbr_mispred(info);
+ e->predicted = get_lbr_predicted(info);
+ e->in_tx = !!(info & LBR_INFO_IN_TX);
+ e->abort = !!(info & LBR_INFO_ABORT);
+ e->cycles = get_lbr_cycles(info);
+ e->type = get_lbr_br_type(info);
+ } else {
+ e->mispred = 0;
+ e->predicted = 0;
+ e->in_tx = 0;
+ e->abort = 0;
+ e->cycles = 0;
+ /*
+ * For Architectural LBR, 0 means X86_BR_JCC. Assign an
+ * invalid branch type, which will be ignored in the
+ * intel_pmu_lbr_filter().
+ *
+ * For the legacy LBR, there is no branch type
+ * information available. The field is always ignored.
+ */
+ e->type = ARCH_LBR_BR_TYPE_KNOWN_MAX + 1;
+ }
e->reserved = 0;
}

@@ -966,7 +1007,7 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,

static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
{
- intel_pmu_store_lbr(cpuc, NULL);
+ intel_pmu_store_lbr(cpuc, NULL, lbr_need_info(cpuc));
}

static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
@@ -974,12 +1015,12 @@ static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave;

if (!xsave) {
- intel_pmu_store_lbr(cpuc, NULL);
+ intel_pmu_store_lbr(cpuc, NULL, lbr_need_info(cpuc));
return;
}
copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR);

- intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
+ intel_pmu_store_lbr(cpuc, xsave->lbr.entries, lbr_need_info(cpuc));
}

void intel_pmu_lbr_read(void)
@@ -1096,23 +1137,20 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
reg = &event->hw.branch_reg;
reg->idx = EXTRA_REG_LBR;

- if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
reg->config = mask;
- return 0;
+ } else {
+ /*
+ * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+ * in suppress mode. So LBR_SELECT should be set to
+ * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+ * But the 10th bit LBR_CALL_STACK does not operate
+ * in suppress mode.
+ */
+ reg->config = mask ^ (x86_pmu.lbr_sel_mask & ~LBR_CALL_STACK);
}
-
- /*
- * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
- * in suppress mode. So LBR_SELECT should be set to
- * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
- * But the 10th bit LBR_CALL_STACK does not operate
- * in suppress mode.
- */
- reg->config = mask ^ (x86_pmu.lbr_sel_mask & ~LBR_CALL_STACK);
-
if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
- (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
- (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+ (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS))
reg->config |= LBR_NO_INFO;

return 0;
@@ -1357,18 +1395,6 @@ common_branch_type(int type)
return PERF_BR_UNKNOWN;
}

-enum {
- ARCH_LBR_BR_TYPE_JCC = 0,
- ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1,
- ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2,
- ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3,
- ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4,
- ARCH_LBR_BR_TYPE_NEAR_RET = 5,
- ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET,
-
- ARCH_LBR_BR_TYPE_MAP_MAX = 16,
-};
-
static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = {
[ARCH_LBR_BR_TYPE_JCC] = X86_BR_JCC,
[ARCH_LBR_BR_TYPE_NEAR_IND_JMP] = X86_BR_IND_JMP,
@@ -1460,7 +1486,7 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
else
cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos();

- intel_pmu_store_lbr(cpuc, lbr);
+ intel_pmu_store_lbr(cpuc, lbr, lbr_need_info(cpuc));
intel_pmu_lbr_filter(cpuc);
}

--
2.7.4