Re: [RFC PATCH 5/6] riscv: perf: introduce DT mechanism

From: Zong Li
Date: Mon Jun 29 2020 - 16:44:49 EST


On Mon, Jun 29, 2020 at 12:37 PM Anup Patel <anup@xxxxxxxxxxxxxx> wrote:
>
> On Mon, Jun 29, 2020 at 8:49 AM Zong Li <zong.li@xxxxxxxxxx> wrote:
> >
> > Each architecture is responsible for mapping generic hardware and cache
> > events to their own specific encoding of hardware events. For each
> > architecture, it also have to distinguish the defination of hardware
> > events of different platforms of each vendor. We use DT file to describe
> > platform-specific information to make our perf implementation more
> > generic and common.
> >
> > Signed-off-by: Zong Li <zong.li@xxxxxxxxxx>
> > ---
> > arch/riscv/include/asm/perf_event.h | 55 ++----
> > arch/riscv/kernel/perf_event.c | 273 +++++++++++++---------------
> > 2 files changed, 139 insertions(+), 189 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/perf_event.h b/arch/riscv/include/asm/perf_event.h
> > index 41d515a1f331..e95d3bbaae3e 100644
> > --- a/arch/riscv/include/asm/perf_event.h
> > +++ b/arch/riscv/include/asm/perf_event.h
> > @@ -17,6 +17,8 @@
> > #define RISCV_EVENT_COUNTERS 29
> > #define RISCV_TOTAL_COUNTERS (RISCV_BASE_COUNTERS + RISCV_EVENT_COUNTERS)
> >
> > +#define RISCV_DEFAULT_WIDTH_COUNTER 64
> > +
> > /*
> > * According to the spec, an implementation can support counter up to
> > * mhpmcounter31, but many high-end processors has at most 6 general
> > @@ -33,9 +35,21 @@
> >
> > #define RISCV_PMU_HPMCOUNTER_FIRST 3
> > #define RISCV_PMU_HPMCOUNTER_LAST \
> > - (RISCV_PMU_HPMCOUNTER_FIRST + riscv_pmu->num_counters - 1)
> > + (RISCV_PMU_HPMCOUNTER_FIRST + riscv_pmu.num_event_cntr - 1)
> > +
> > +#define RISCV_OP_UNSUPP (-EOPNOTSUPP)
> > +
> > +#define RISCV_MAP_ALL_UNSUPPORTED \
> > + [0 ... PERF_COUNT_HW_MAX - 1] = RISCV_OP_UNSUPP
> >
> > -#define RISCV_OP_UNSUPP (-EOPNOTSUPP)
> > +#define C(x) PERF_COUNT_HW_CACHE_##x
> > +
> > +#define RISCV_CACHE_MAP_ALL_UNSUPPORTED \
> > +[0 ... C(MAX) - 1] = { \
> > + [0 ... C(OP_MAX) - 1] = { \
> > + [0 ... C(RESULT_MAX) - 1] = RISCV_OP_UNSUPP, \
> > + }, \
> > +}
> >
> > /* Hardware cache event encoding */
> > #define PERF_HW_CACHE_TYPE 0
> > @@ -65,43 +79,6 @@
> > #define CSR_MHPMEVENT7 0x327
> > #define CSR_MHPMEVENT8 0x328
> >
> > -struct cpu_hw_events {
> > - /* # currently enabled events*/
> > - int n_events;
> > - /* currently enabled events */
> > - struct perf_event *events[RISCV_EVENT_COUNTERS];
> > - /* bitmap of used event counters */
> > - unsigned long used_cntr_mask;
> > - /* vendor-defined PMU data */
> > - void *platform;
> > -};
> > -
> > -struct riscv_pmu {
> > - struct pmu *pmu;
> > -
> > - /* generic hw/cache events table */
> > - const int *hw_events;
> > - const int (*cache_events)[PERF_COUNT_HW_CACHE_MAX]
> > - [PERF_COUNT_HW_CACHE_OP_MAX]
> > - [PERF_COUNT_HW_CACHE_RESULT_MAX];
> > - /* method used to map hw/cache events */
> > - int (*map_hw_event)(u64 config);
> > - int (*map_cache_event)(u64 config);
> > -
> > - /* max generic hw events in map */
> > - int max_events;
> > - /* number total counters, 2(base) + x(general) */
> > - int num_counters;
> > - /* the width of the counter */
> > - int counter_width;
> > -
> > - /* vendor-defined PMU features */
> > - void *platform;
> > -
> > - irqreturn_t (*handle_irq)(int irq_num, void *dev);
> > - int irq;
> > -};
> > -
> > #endif
> > #ifdef CONFIG_PERF_EVENTS
> > #define perf_arch_bpf_user_pt_regs(regs) (struct user_regs_struct *)regs
> > diff --git a/arch/riscv/kernel/perf_event.c b/arch/riscv/kernel/perf_event.c
> > index 0cfcd6f1e57b..3bdfbe4efd5c 100644
> > --- a/arch/riscv/kernel/perf_event.c
> > +++ b/arch/riscv/kernel/perf_event.c
> > @@ -9,6 +9,7 @@
> > * Copyright (C) 2009 Google, Inc., Stephane Eranian
> > * Copyright 2014 Tilera Corporation. All Rights Reserved.
> > * Copyright (C) 2018 Andes Technology Corporation
> > + * Copyright (C) 2020 SiFive
> > *
> > * Perf_events support for RISC-V platforms.
> > *
> > @@ -30,113 +31,55 @@
> > #include <linux/perf_event.h>
> > #include <linux/atomic.h>
> > #include <linux/of.h>
> > +#include <asm/csr.h>
> > #include <asm/perf_event.h>
> >
> > -static const struct riscv_pmu *riscv_pmu __read_mostly;
> > +static struct riscv_pmu {
> > + struct pmu *pmu;
> > +
> > + /* number of event counters */
> > + int num_event_cntr;
> > +
> > + /* the width of base counters */
> > + int width_base_cntr;
> > +
> > + /* the width of event counters */
> > + int width_event_cntr;
> > +
> > + irqreturn_t (*handle_irq)(int irq_num, void *dev);
> > +
> > + int irq;
> > +} riscv_pmu;
> > +
> > +struct cpu_hw_events {
> > + /* # currently enabled events*/
> > + int n_events;
> > +
> > + /* currently enabled events */
> > + struct perf_event *events[RISCV_EVENT_COUNTERS];
> > +
> > + /* bitmap of used event counters */
> > + unsigned long used_cntr_mask;
> > +};
> > +
> > static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
> >
> > /*
> > * Hardware & cache maps and their methods
> > */
> >
> > -static const int riscv_hw_event_map[] = {
> > - [PERF_COUNT_HW_CPU_CYCLES] = RISCV_PMU_CYCLE,
> > - [PERF_COUNT_HW_INSTRUCTIONS] = RISCV_PMU_INSTRET,
> > - [PERF_COUNT_HW_CACHE_REFERENCES] = RISCV_OP_UNSUPP,
> > - [PERF_COUNT_HW_CACHE_MISSES] = RISCV_OP_UNSUPP,
> > - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = RISCV_OP_UNSUPP,
> > - [PERF_COUNT_HW_BRANCH_MISSES] = RISCV_OP_UNSUPP,
> > - [PERF_COUNT_HW_BUS_CYCLES] = RISCV_OP_UNSUPP,
> > +static int riscv_hw_event_map[PERF_COUNT_HW_MAX] = {
> > + RISCV_MAP_ALL_UNSUPPORTED,
> > +
> > + /* Specify base pmu, even if they aren't present in DT file */
> > + [PERF_COUNT_HW_CPU_CYCLES] = RISCV_PMU_CYCLE,
> > + [PERF_COUNT_HW_INSTRUCTIONS] = RISCV_PMU_INSTRET,
> > };
> >
> > -#define C(x) PERF_COUNT_HW_CACHE_##x
> > -static const int riscv_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
> > -[PERF_COUNT_HW_CACHE_OP_MAX]
> > -[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> > - [C(L1D)] = {
> > - [C(OP_READ)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_WRITE)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_PREFETCH)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - },
> > - [C(L1I)] = {
> > - [C(OP_READ)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_WRITE)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_PREFETCH)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - },
> > - [C(LL)] = {
> > - [C(OP_READ)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_WRITE)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_PREFETCH)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - },
> > - [C(DTLB)] = {
> > - [C(OP_READ)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_WRITE)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_PREFETCH)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - },
> > - [C(ITLB)] = {
> > - [C(OP_READ)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_WRITE)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_PREFETCH)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - },
> > - [C(BPU)] = {
> > - [C(OP_READ)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_WRITE)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - [C(OP_PREFETCH)] = {
> > - [C(RESULT_ACCESS)] = RISCV_OP_UNSUPP,
> > - [C(RESULT_MISS)] = RISCV_OP_UNSUPP,
> > - },
> > - },
> > +static int riscv_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
> > + [PERF_COUNT_HW_CACHE_OP_MAX]
> > + [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> > + RISCV_CACHE_MAP_ALL_UNSUPPORTED,
> > };
> >
> > /*
> > @@ -154,6 +97,17 @@ static inline int is_event_counter(int idx)
> > idx <= RISCV_PMU_HPMCOUNTER_LAST);
> > }
> >
> > +static inline int get_counter_width(int idx)
> > +{
> > + if (is_base_counter(idx))
> > + return riscv_pmu.width_base_cntr;
> > +
> > + if (is_event_counter(idx))
> > + return riscv_pmu.width_event_cntr;
> > +
> > + return 0;
> > +}
> > +
> > static inline int get_available_counter(struct perf_event *event)
> > {
> > struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> > @@ -188,10 +142,14 @@ static inline int get_available_counter(struct perf_event *event)
> > */
> > static int riscv_map_hw_event(u64 config)
> > {
> > - if (config >= riscv_pmu->max_events)
> > + int ret;
> > +
> > + if (config >= PERF_COUNT_HW_MAX)
> > return -EINVAL;
> >
> > - return riscv_pmu->hw_events[config];
> > + ret = riscv_hw_event_map[config];
> > +
> > + return ret == RISCV_OP_UNSUPP ? -ENOENT : ret;
> > }
> >
> > /*
> > @@ -355,7 +313,7 @@ static void riscv_pmu_read(struct perf_event *event)
> > * delta is the value to update the counter we maintain in the kernel.
> > */
> > delta = (new_raw_count - prev_raw_count) &
> > - ((1ULL << riscv_pmu->counter_width) - 1);
> > + ((1ULL << get_counter_width(idx)) - 1);
> >
> > local64_add(delta, &event->count);
> > /*
> > @@ -386,7 +344,7 @@ static void riscv_pmu_stop(struct perf_event *event, int flags)
> > hwc->state |= PERF_HES_STOPPED;
> >
> > if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
> > - riscv_pmu->pmu->read(event);
> > + riscv_pmu_read(event);
> > hwc->state |= PERF_HES_UPTODATE;
> > }
> > }
> > @@ -429,7 +387,7 @@ static int riscv_pmu_add(struct perf_event *event, int flags)
> > struct hw_perf_event *hwc = &event->hw;
> > int count_idx;
> >
> > - if (cpuc->n_events == riscv_pmu->num_counters)
> > + if (cpuc->n_events == riscv_pmu.num_event_cntr)
> > return -ENOSPC;
> >
> > count_idx = get_available_counter(event);
> > @@ -437,13 +395,13 @@ static int riscv_pmu_add(struct perf_event *event, int flags)
> > return -ENOSPC;
> >
> > cpuc->n_events++;
> > +
> > hwc->idx = count_idx;
> > - cpuc->events[hwc->idx] = event;
> >
> > hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
> >
> > if (flags & PERF_EF_START)
> > - riscv_pmu->pmu->start(event, PERF_EF_RELOAD);
> > + riscv_pmu_start(event, PERF_EF_RELOAD);
> >
> > return 0;
> > }
> > @@ -459,8 +417,8 @@ static void riscv_pmu_del(struct perf_event *event, int flags)
> > cpuc->n_events--;
> > __clear_bit(hwc->idx, &cpuc->used_cntr_mask);
> >
> > - cpuc->events[hwc->idx] = NULL;
> > - riscv_pmu->pmu->stop(event, PERF_EF_UPDATE);
> > + riscv_pmu_stop(event, PERF_EF_UPDATE);
> > +
> > perf_event_update_userpage(event);
> > }
> >
> > @@ -470,7 +428,7 @@ static void riscv_pmu_del(struct perf_event *event, int flags)
> >
> > static DEFINE_MUTEX(pmc_reserve_mutex);
> >
> > -static irqreturn_t riscv_base_pmu_handle_irq(int irq_num, void *dev)
> > +static irqreturn_t riscv_pmu_handle_irq(int irq_num, void *dev)
> > {
> > return IRQ_NONE;
> > }
> > @@ -480,8 +438,8 @@ static int reserve_pmc_hardware(void)
> > int err = 0;
> >
> > mutex_lock(&pmc_reserve_mutex);
> > - if (riscv_pmu->irq >= 0 && riscv_pmu->handle_irq) {
> > - err = request_irq(riscv_pmu->irq, riscv_pmu->handle_irq,
> > + if (riscv_pmu.irq >= 0 && riscv_pmu.handle_irq) {
> > + err = request_irq(riscv_pmu.irq, riscv_pmu.handle_irq,
> > IRQF_PERCPU, "riscv-base-perf", NULL);
> > }
> > mutex_unlock(&pmc_reserve_mutex);
> > @@ -492,8 +450,8 @@ static int reserve_pmc_hardware(void)
> > static void release_pmc_hardware(void)
> > {
> > mutex_lock(&pmc_reserve_mutex);
> > - if (riscv_pmu->irq >= 0)
> > - free_irq(riscv_pmu->irq, NULL);
> > + if (riscv_pmu.irq >= 0)
> > + free_irq(riscv_pmu.irq, NULL);
> > mutex_unlock(&pmc_reserve_mutex);
> > }
> >
> > @@ -529,10 +487,10 @@ static int riscv_event_init(struct perf_event *event)
> >
> > switch (event->attr.type) {
> > case PERF_TYPE_HARDWARE:
> > - code = riscv_pmu->map_hw_event(attr->config);
> > + code = riscv_map_hw_event(attr->config);
> > break;
> > case PERF_TYPE_HW_CACHE:
> > - code = riscv_pmu->map_cache_event(attr->config);
> > + code = riscv_map_cache_event(attr->config);
> > break;
> > case PERF_TYPE_RAW:
> > code = attr->config;
> > @@ -555,9 +513,6 @@ static int riscv_event_init(struct perf_event *event)
> > /*
> > * idx is set to -1 because the index of a general event should not be
> > * decided until binding to some counter in pmu->add().
> > - *
> > - * But since we don't have such support, later in pmu->add(), we just
> > - * use hwc->config as the index instead.
> > */
> > hwc->config_base = config_base;
> > hwc->config = code;
> > @@ -570,52 +525,70 @@ static int riscv_event_init(struct perf_event *event)
> > * Initialization
> > */
> >
> > -static struct pmu min_pmu = {
> > - .name = "riscv-base",
> > - .event_init = riscv_event_init,
> > - .add = riscv_pmu_add,
> > - .del = riscv_pmu_del,
> > - .start = riscv_pmu_start,
> > - .stop = riscv_pmu_stop,
> > - .read = riscv_pmu_read,
> > -};
> > +static struct riscv_pmu riscv_pmu = {
> > + .pmu = &(struct pmu) {
> > + .name = "riscv-pmu",
> > + .event_init = riscv_event_init,
> > + .add = riscv_pmu_add,
> > + .del = riscv_pmu_del,
> > + .start = riscv_pmu_start,
> > + .stop = riscv_pmu_stop,
> > + .read = riscv_pmu_read,
> > + },
> >
> > -static const struct riscv_pmu riscv_base_pmu = {
> > - .pmu = &min_pmu,
> > - .max_events = ARRAY_SIZE(riscv_hw_event_map),
> > - .map_hw_event = riscv_map_hw_event,
> > - .hw_events = riscv_hw_event_map,
> > - .map_cache_event = riscv_map_cache_event,
> > - .cache_events = &riscv_cache_event_map,
> > - .counter_width = 63,
> > - .num_counters = RISCV_BASE_COUNTERS + 0,
> > - .handle_irq = &riscv_base_pmu_handle_irq,
> > + .num_event_cntr = 0,
> > + .width_event_cntr = RISCV_DEFAULT_WIDTH_COUNTER,
> > + .width_base_cntr = RISCV_DEFAULT_WIDTH_COUNTER,
> >
> > + .handle_irq = &riscv_pmu_handle_irq,
> > /* This means this PMU has no IRQ. */
> > .irq = -1,
> > };
> >
> > -static const struct of_device_id riscv_pmu_of_ids[] = {
> > - {.compatible = "riscv,base-pmu", .data = &riscv_base_pmu},
> > - { /* sentinel value */ }
> > -};
> > +static int __init init_riscv_pmu(struct device_node *node)
> > +{
> > + int num_events, key, value, i;
> > +
> > + of_property_read_u32(node, "riscv,width-base-cntr", &riscv_pmu.width_base_cntr);
> > +
> > + of_property_read_u32(node, "riscv,width-event-cntr", &riscv_pmu.width_event_cntr);
> > +
> > + of_property_read_u32(node, "riscv,n-event-cntr", &riscv_pmu.num_event_cntr);
> > + if (riscv_pmu.num_event_cntr > RISCV_EVENT_COUNTERS)
> > + riscv_pmu.num_event_cntr = RISCV_EVENT_COUNTERS;
> > +
> > + num_events = of_property_count_u32_elems(node, "riscv,hw-event-map");
> > + if (num_events > 0 && num_events % 2 == 0)
> > + for (i = 0; i < num_events;) {
> > + of_property_read_u32_index(node, "riscv,hw-event-map", i++, &key);
> > + of_property_read_u32_index(node, "riscv,hw-event-map", i++, &value);
> > + riscv_hw_event_map[key] = value;
> > + }
> > +
> > + num_events = of_property_count_u32_elems(node, "riscv,hw-cache-event-map");
> > + if (num_events > 0 && num_events % 2 == 0)
> > + for (i = 0; i < num_events;) {
> > + of_property_read_u32_index(node, "riscv,hw-cache-event-map", i++, &key);
> > + of_property_read_u32_index(node, "riscv,hw-cache-event-map", i++, &value);
> > + riscv_cache_event_map
> > + [(key >> PERF_HW_CACHE_TYPE) & PERF_HW_CACHE_MASK]
> > + [(key >> PERF_HW_CACHE_OP) & PERF_HW_CACHE_MASK]
> > + [(key >> PERF_HW_CACHE_RESULT) & PERF_HW_CACHE_MASK] = value;
> > + }
> > +
> > + return 0;
> > +}
> >
> > static int __init init_hw_perf_events(void)
> > {
> > - struct device_node *node = of_find_node_by_type(NULL, "pmu");
> > - const struct of_device_id *of_id;
> > + struct device_node *node = of_find_compatible_node(NULL, NULL, "riscv,pmu");
> >
> > - riscv_pmu = &riscv_base_pmu;
> > + if (node)
> > + init_riscv_pmu(node);
> >
> > - if (node) {
> > - of_id = of_match_node(riscv_pmu_of_ids, node);
> > -
> > - if (of_id)
> > - riscv_pmu = of_id->data;
> > - of_node_put(node);
> > - }
> > + /* Even if there is no pmu node in DT, we reach here for base PMU. */
> > + perf_pmu_register(riscv_pmu.pmu, "cpu", PERF_TYPE_RAW);
> >
> > - perf_pmu_register(riscv_pmu->pmu, "cpu", PERF_TYPE_RAW);
> > return 0;
> > }
> > arch_initcall(init_hw_perf_events);
>
> Why does this have to arch_initcall() ??
> Why can't we just probe this like a regular DT based driver ??
>
> This driver needs total re-write because it has to be a platform driver.
> Even the names of counters are not registered. The implementation
> specific counter names should come from DT property.
>

We have some discussion in the cover letter, let us talk about in one place. :)

> Regards,
> Anup
>
> > --
> > 2.27.0
> >