[PATCH] perf arm-spe: Use SPE data source for neoverse cores

From: Ali Saidi
Date: Fri Jan 21 2022 - 13:26:08 EST


When synthesizing data from SPE, augment the type with source information
for Arm Neoverse cores. The field is IMPLDEF but the Neoverse cores all use
the same encoding. I can't find encoding information for any other SPE
implementations to unify their choices with Arm's, so that is left for future
work.

This change enables the expected behavior of perf c2c on a system with SPE, where
lines that are shared among multiple cores show up in perf c2c output.

Signed-off-by: Ali Saidi <alisaidi@xxxxxxxxxx>
---
.../util/arm-spe-decoder/arm-spe-decoder.c | 1 +
.../util/arm-spe-decoder/arm-spe-decoder.h | 12 +++++
tools/perf/util/arm-spe.c | 48 ++++++++++++++-----
3 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 5e390a1a79ab..091987dd3966 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -220,6 +220,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)

break;
case ARM_SPE_DATA_SOURCE:
+ decoder->record.source = payload;
break;
case ARM_SPE_BAD:
break;
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 69b31084d6be..1ecf4ee99415 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -29,10 +29,22 @@ enum arm_spe_op_type {
ARM_SPE_ST = 1 << 1,
};

+enum arm_spe_neoverse_data_source {
+ ARM_SPE_NV_L1D = 0x0,
+ ARM_SPE_NV_L2 = 0x8,
+ ARM_SPE_NV_PEER_CORE = 0x9,
+ ARM_SPE_NV_LCL_CLSTR = 0xa,
+ ARM_SPE_NV_SYS_CACHE = 0xb,
+ ARM_SPE_NV_PEER_CLSTR = 0xc,
+ ARM_SPE_NV_REMOTE = 0xd,
+ ARM_SPE_NV_DRAM = 0xe,
+};
+
struct arm_spe_record {
enum arm_spe_sample_type type;
int err;
u32 op;
+ u16 source;
u32 latency;
u64 from_ip;
u64 to_ip;
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index d2b64e3f588b..d025af13f5e4 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -34,6 +34,7 @@
#include "arm-spe-decoder/arm-spe-decoder.h"
#include "arm-spe-decoder/arm-spe-pkt-decoder.h"

+#include <../../../arch/arm64/include/asm/cputype.h>
#define MAX_TIMESTAMP (~0ULL)

struct arm_spe {
@@ -45,6 +46,7 @@ struct arm_spe {
struct perf_session *session;
struct machine *machine;
u32 pmu_type;
+ u64 midr;

struct perf_tsc_conversion tc;

@@ -399,9 +401,16 @@ static bool arm_spe__is_memory_event(enum arm_spe_sample_type type)
return false;
}

-static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
+static const struct midr_range neoverse_spe[] = {
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+ {},
+};
+
+static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
{
union perf_mem_data_src data_src = { 0 };
+ bool is_neoverse = is_midr_in_range(midr, neoverse_spe);

if (record->op == ARM_SPE_LD)
data_src.mem_op = PERF_MEM_OP_LOAD;
@@ -409,19 +418,30 @@ static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
data_src.mem_op = PERF_MEM_OP_STORE;

if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
- data_src.mem_lvl = PERF_MEM_LVL_L3;
+ if (is_neoverse && record->source == ARM_SPE_NV_DRAM) {
+ data_src.mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
+ } else if (is_neoverse && record->source == ARM_SPE_NV_PEER_CLSTR) {
+ data_src.mem_snoop = PERF_MEM_SNOOP_HITM;
+ data_src.mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+ } else {
+ data_src.mem_lvl = PERF_MEM_LVL_L3;

- if (record->type & ARM_SPE_LLC_MISS)
- data_src.mem_lvl |= PERF_MEM_LVL_MISS;
- else
- data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+ if (record->type & ARM_SPE_LLC_MISS)
+ data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+ else
+ data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+ }
} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
- data_src.mem_lvl = PERF_MEM_LVL_L1;
+ if (is_neoverse && record->source == ARM_SPE_NV_L2) {
+ data_src.mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+ } else {
+ data_src.mem_lvl = PERF_MEM_LVL_L1;

- if (record->type & ARM_SPE_L1D_MISS)
- data_src.mem_lvl |= PERF_MEM_LVL_MISS;
- else
- data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+ if (record->type & ARM_SPE_L1D_MISS)
+ data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+ else
+ data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+ }
}

if (record->type & ARM_SPE_REMOTE_ACCESS)
@@ -446,7 +466,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
u64 data_src;
int err;

- data_src = arm_spe__synth_data_source(record);
+ data_src = arm_spe__synth_data_source(record, spe->midr);

if (spe->sample_flc) {
if (record->type & ARM_SPE_L1D_MISS) {
@@ -796,6 +816,10 @@ static int arm_spe_process_event(struct perf_session *session,
u64 timestamp;
struct arm_spe *spe = container_of(session->auxtrace,
struct arm_spe, auxtrace);
+ const char *cpuid = perf_env__cpuid(session->evlist->env);
+ u64 midr = strtol(cpuid, NULL, 16);
+
+ spe->midr = midr;

if (dump_trace)
return 0;
--
2.33.1