Re: [v5 3/9] RISC-V: Add a perf core library for pmu drivers

From: Anup Patel
Date: Tue Jan 18 2022 - 00:02:19 EST


On Sat, Dec 25, 2021 at 11:17 AM Atish Patra <atishp@xxxxxxxxxxxxxx> wrote:
>
> From: Atish Patra <atish.patra@xxxxxxx>
>
> Implement a perf core library that can support all the essential perf
> features in the future. It can also accommodate any type of PMU
> implementation in the future. Currently, both the SBI-based perf driver
> and the legacy driver use this library. Most of the common perf
> functionality is kept in this core library, while PMU-specific drivers
> can implement PMU-specific features. For example, the SBI-specific
> functionality will be implemented in the SBI-specific driver.
>
> Signed-off-by: Atish Patra <atish.patra@xxxxxxx>
> Signed-off-by: Atish Patra <atishp@xxxxxxxxxxxx>
> ---
> drivers/perf/Kconfig | 10 +
> drivers/perf/Makefile | 1 +
> drivers/perf/riscv_pmu.c | 328 +++++++++++++++++++++++++++++++++
> include/linux/cpuhotplug.h | 1 +
> include/linux/perf/riscv_pmu.h | 65 +++++++
> 5 files changed, 405 insertions(+)
> create mode 100644 drivers/perf/riscv_pmu.c
> create mode 100644 include/linux/perf/riscv_pmu.h
>
> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
> index 77522e5efe11..03ca0315df73 100644
> --- a/drivers/perf/Kconfig
> +++ b/drivers/perf/Kconfig
> @@ -56,6 +56,16 @@ config ARM_PMU
> Say y if you want to use CPU performance monitors on ARM-based
> systems.
>
> +config RISCV_PMU
> + depends on RISCV
> + bool "RISC-V PMU framework"
> + default y
> + help
> + Say y if you want to use CPU performance monitors on RISCV-based
> + systems. This provides the core PMU framework that abstracts common
> + PMU functionalities in a core library so that different PMU drivers
> + can reuse it.
> +
> config ARM_PMU_ACPI
> depends on ARM_PMU && ACPI
> def_bool y
> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
> index 5260b116c7da..76e5c50e24bb 100644
> --- a/drivers/perf/Makefile
> +++ b/drivers/perf/Makefile
> @@ -10,6 +10,7 @@ obj-$(CONFIG_FSL_IMX8_DDR_PMU) += fsl_imx8_ddr_perf.o
> obj-$(CONFIG_HISI_PMU) += hisilicon/
> obj-$(CONFIG_QCOM_L2_PMU) += qcom_l2_pmu.o
> obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
> +obj-$(CONFIG_RISCV_PMU) += riscv_pmu.o
> obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
> obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
> obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
> diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c
> new file mode 100644
> index 000000000000..83c3e8d085c7
> --- /dev/null
> +++ b/drivers/perf/riscv_pmu.c
> @@ -0,0 +1,328 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * RISC-V performance counter support.
> + *
> + * Copyright (C) 2021 Western Digital Corporation or its affiliates.
> + *
> + * This implementation is based on old RISC-V perf and ARM perf event code
> + * which are in turn based on sparc64 and x86 code.
> + */
> +
> +#include <linux/cpumask.h>
> +#include <linux/irq.h>
> +#include <linux/irqdesc.h>
> +#include <linux/perf/riscv_pmu.h>
> +#include <linux/printk.h>
> +#include <linux/smp.h>
> +
> +static unsigned long csr_read_num(int csr_num)
> +{
> +#define switchcase_csr_read(__csr_num, __val) {\
> + case __csr_num: \
> + __val = csr_read(__csr_num); \
> + break; }
> +#define switchcase_csr_read_2(__csr_num, __val) {\
> + switchcase_csr_read(__csr_num + 0, __val) \
> + switchcase_csr_read(__csr_num + 1, __val)}
> +#define switchcase_csr_read_4(__csr_num, __val) {\
> + switchcase_csr_read_2(__csr_num + 0, __val) \
> + switchcase_csr_read_2(__csr_num + 2, __val)}
> +#define switchcase_csr_read_8(__csr_num, __val) {\
> + switchcase_csr_read_4(__csr_num + 0, __val) \
> + switchcase_csr_read_4(__csr_num + 4, __val)}
> +#define switchcase_csr_read_16(__csr_num, __val) {\
> + switchcase_csr_read_8(__csr_num + 0, __val) \
> + switchcase_csr_read_8(__csr_num + 8, __val)}
> +#define switchcase_csr_read_32(__csr_num, __val) {\
> + switchcase_csr_read_16(__csr_num + 0, __val) \
> + switchcase_csr_read_16(__csr_num + 16, __val)}
> +
> + unsigned long ret = 0;
> +
> + switch (csr_num) {
> + switchcase_csr_read_32(CSR_CYCLE, ret)
> + switchcase_csr_read_32(CSR_CYCLEH, ret)
> + default :
> + break;
> + }
> +
> + return ret;
> +#undef switchcase_csr_read_32
> +#undef switchcase_csr_read_16
> +#undef switchcase_csr_read_8
> +#undef switchcase_csr_read_4
> +#undef switchcase_csr_read_2
> +#undef switchcase_csr_read
> +}
> +
> +/*
> + * Read the CSR of a corresponding counter.
> + */
> +unsigned long riscv_pmu_ctr_read_csr(unsigned long csr)
> +{
> + if (csr < CSR_CYCLE || csr > CSR_HPMCOUNTER31H ||
> + (csr > CSR_HPMCOUNTER31 && csr < CSR_CYCLEH)) {
> + pr_err("Invalid performance counter csr %lx\n", csr);
> + return -EINVAL;
> + }
> +
> + return csr_read_num(csr);
> +}
> +
> +uint64_t riscv_pmu_ctr_get_width_mask(struct perf_event *event)

Use u64 in place of uint64_t for consistency.
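
For example, something like this (untested, just to illustrate the type
change; the declaration in riscv_pmu.h and the local uint64_t variables
further down would want the same treatment):

	u64 riscv_pmu_ctr_get_width_mask(struct perf_event *event)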

> +{
> + int cwidth;
> + struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> +
> + if (!rvpmu->ctr_get_width)
> + /**
> + * If the pmu driver doesn't support counter width, set it to default maximum
> + * allowed by the specification.
> + */

Improve indentation of this comment block and ensure it does not go
beyond 80 characters.
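
Something along these lines, for example (just a sketch):

	if (!rvpmu->ctr_get_width)
		/*
		 * If the pmu driver doesn't support counter width, set
		 * it to the default maximum allowed by the specification.
		 */
		cwidth = 63;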

> + cwidth = 63;
> + else {
> + if (hwc->idx == -1)
> + /* Handle init case where idx is not initialized yet */
> + cwidth = rvpmu->ctr_get_width(0);
> + else
> + cwidth = rvpmu->ctr_get_width(hwc->idx);
> + }
> +
> + return GENMASK_ULL(cwidth, 0);
> +}
> +
> +u64 riscv_pmu_event_update(struct perf_event *event)
> +{
> + struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
> + struct hw_perf_event *hwc = &event->hw;
> + u64 prev_raw_count, new_raw_count;
> + unsigned long cmask;
> + u64 oldval, delta;
> +
> + if (!rvpmu->ctr_read)
> + return 0;
> +
> + cmask = riscv_pmu_ctr_get_width_mask(event);
> +
> + do {
> + prev_raw_count = local64_read(&hwc->prev_count);
> + new_raw_count = rvpmu->ctr_read(event);
> + oldval = local64_cmpxchg(&hwc->prev_count, prev_raw_count,
> + new_raw_count);
> + } while (oldval != prev_raw_count);
> +
> + delta = (new_raw_count - prev_raw_count) & cmask;
> + local64_add(delta, &event->count);
> + local64_sub(delta, &hwc->period_left);
> +
> + return delta;
> +}
> +
> +static void riscv_pmu_stop(struct perf_event *event, int flags)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
> +
> + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
> +
> + if (!(hwc->state & PERF_HES_STOPPED)) {
> + if (rvpmu->ctr_stop) {
> + rvpmu->ctr_stop(event, 0);
> + hwc->state |= PERF_HES_STOPPED;
> + }
> + riscv_pmu_event_update(event);
> + hwc->state |= PERF_HES_UPTODATE;
> + }
> +}
> +
> +int riscv_pmu_event_set_period(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + s64 left = local64_read(&hwc->period_left);
> + s64 period = hwc->sample_period;
> + int overflow = 0;
> + uint64_t max_period = riscv_pmu_ctr_get_width_mask(event);
> +
> + if (unlikely(left <= -period)) {
> + left = period;
> + local64_set(&hwc->period_left, left);
> + hwc->last_period = period;
> + overflow = 1;
> + }
> +
> + if (unlikely(left <= 0)) {
> + left += period;
> + local64_set(&hwc->period_left, left);
> + hwc->last_period = period;
> + overflow = 1;
> + }
> +
> + /*
> + * Limit the maximum period to prevent the counter value
> + * from overtaking the one we are about to program. In
> + * effect we are reducing max_period to account for
> + * interrupt latency (and we are being very conservative).
> + */
> + if (left > (max_period >> 1))
> + left = (max_period >> 1);
> +
> + local64_set(&hwc->prev_count, (u64)-left);
> + perf_event_update_userpage(event);
> +
> + return overflow;
> +}
> +
> +static void riscv_pmu_start(struct perf_event *event, int flags)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
> + uint64_t max_period = riscv_pmu_ctr_get_width_mask(event);
> + u64 init_val;
> +
> + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> + return;
> +
> + if (flags & PERF_EF_RELOAD) {
> + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
> +
> + /*
> + * Set the counter to the period to the next interrupt here,
> + * if you have any.
> + */

This comment block is not followed by any code.

Do you expect code to be added here in the future?
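
If not, the whole block could probably be reduced to just the check,
something like:

	if (flags & PERF_EF_RELOAD)
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));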

> + }
> +
> + hwc->state = 0;
> + riscv_pmu_event_set_period(event);
> + init_val = local64_read(&hwc->prev_count) & max_period;
> + rvpmu->ctr_start(event, init_val);
> + perf_event_update_userpage(event);
> +}
> +
> +static int riscv_pmu_add(struct perf_event *event, int flags)
> +{
> + struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
> + struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events);
> + struct hw_perf_event *hwc = &event->hw;
> + int idx;
> +
> + idx = rvpmu->ctr_get_idx(event);
> + if (idx < 0)
> + return idx;
> +
> + hwc->idx = idx;
> + cpuc->events[idx] = event;
> + cpuc->n_events++;
> + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
> + if (flags & PERF_EF_START)
> + riscv_pmu_start(event, PERF_EF_RELOAD);
> +
> + /* Propagate our changes to the userspace mapping. */
> + perf_event_update_userpage(event);
> +
> + return 0;
> +}
> +
> +static void riscv_pmu_del(struct perf_event *event, int flags)
> +{
> + struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
> + struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events);
> + struct hw_perf_event *hwc = &event->hw;
> +
> + riscv_pmu_stop(event, PERF_EF_UPDATE);
> + cpuc->events[hwc->idx] = NULL;
> + /* The firmware need to reset the counter mapping */
> + if (rvpmu->ctr_stop)
> + rvpmu->ctr_stop(event, RISCV_PMU_STOP_FLAG_RESET);
> + cpuc->n_events--;
> + if (rvpmu->ctr_clear_idx)
> + rvpmu->ctr_clear_idx(event);
> + perf_event_update_userpage(event);
> + hwc->idx = -1;
> +}
> +
> +static void riscv_pmu_read(struct perf_event *event)
> +{
> + riscv_pmu_event_update(event);
> +}
> +
> +static int riscv_pmu_event_init(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
> + int mapped_event;
> + u64 event_config = 0;
> + uint64_t cmask;
> +
> + hwc->flags = 0;
> + mapped_event = rvpmu->event_map(event, &event_config);
> + if (mapped_event < 0) {
> + pr_debug("event %x:%llx not supported\n", event->attr.type,
> + event->attr.config);
> + return mapped_event;
> + }
> +
> + /*
> + * idx is set to -1 because the index of a general event should not be
> + * decided until binding to some counter in pmu->add().
> + * config will contain the information about counter CSR
> + * the idx will contain the counter index
> + */
> + hwc->config = event_config;
> + hwc->idx = -1;
> + hwc->event_base = mapped_event;
> +
> + if (!is_sampling_event(event)) {
> + /*
> + * For non-sampling runs, limit the sample_period to half
> + * of the counter width. That way, the new counter value
> + * is far less likely to overtake the previous one unless
> + * you have some serious IRQ latency issues.
> + */
> + cmask = riscv_pmu_ctr_get_width_mask(event);
> + hwc->sample_period = cmask >> 1;
> + hwc->last_period = hwc->sample_period;
> + local64_set(&hwc->period_left, hwc->sample_period);
> + }
> +
> + return 0;
> +}
> +
> +struct riscv_pmu *riscv_pmu_alloc(void)
> +{
> + struct riscv_pmu *pmu;
> + int cpuid, i;
> + struct cpu_hw_events *cpuc;
> +
> + pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
> + if (!pmu)
> + goto out;
> +
> + pmu->hw_events = alloc_percpu_gfp(struct cpu_hw_events, GFP_KERNEL);
> + if (!pmu->hw_events) {
> + pr_info("failed to allocate per-cpu PMU data.\n");
> + goto out_free_pmu;
> + }
> +
> + for_each_possible_cpu(cpuid) {
> + cpuc = per_cpu_ptr(pmu->hw_events, cpuid);
> + cpuc->n_events = 0;
> + for (i = 0; i < RISCV_MAX_COUNTERS; i++)
> + cpuc->events[i] = NULL;
> + }
> + pmu->pmu = (struct pmu) {
> + .event_init = riscv_pmu_event_init,
> + .add = riscv_pmu_add,
> + .del = riscv_pmu_del,
> + .start = riscv_pmu_start,
> + .stop = riscv_pmu_stop,
> + .read = riscv_pmu_read,
> + };
> +
> + return pmu;
> +
> +out_free_pmu:
> + kfree(pmu);
> +out:
> + return NULL;
> +}
> diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
> index 39cf84a30b9f..1743b8a4624d 100644
> --- a/include/linux/cpuhotplug.h
> +++ b/include/linux/cpuhotplug.h
> @@ -123,6 +123,7 @@ enum cpuhp_state {
> CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING,
> CPUHP_AP_PERF_ARM_ACPI_STARTING,
> CPUHP_AP_PERF_ARM_STARTING,
> + CPUHP_AP_PERF_RISCV_STARTING,

First of all, CPUHP_AP_PERF_RISCV_STARTING should not
be part of this patch.

In general, we should avoid adding a RISC-V-specific hotplug
state when CPUHP_AP_ONLINE_DYN is sufficient.
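
With CPUHP_AP_ONLINE_DYN, the driver can register its own state at
probe time, roughly like this (sketch only, the state name and the
callback names are made up):

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
				      "perf/riscv/pmu:starting",
				      riscv_pmu_starting_cpu,
				      riscv_pmu_dying_cpu);
	if (ret < 0)
		return ret;
	cpuhp_state_add_instance(ret, &pmu->node);

The callbacks get (unsigned int cpu, struct hlist_node *node) and can
recover the PMU instance with hlist_entry(node, struct riscv_pmu, node).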

> CPUHP_AP_ARM_L2X0_STARTING,
> CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING,
> CPUHP_AP_ARM_ARCH_TIMER_STARTING,
> diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h
> new file mode 100644
> index 000000000000..0d8979765d79
> --- /dev/null
> +++ b/include/linux/perf/riscv_pmu.h
> @@ -0,0 +1,65 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2018 SiFive
> + * Copyright (C) 2018 Andes Technology Corporation
> + * Copyright (C) 2021 Western Digital Corporation or its affiliates.
> + *
> + */
> +
> +#ifndef _ASM_RISCV_PERF_EVENT_H
> +#define _ASM_RISCV_PERF_EVENT_H
> +
> +#include <linux/perf_event.h>
> +#include <linux/ptrace.h>
> +#include <linux/interrupt.h>
> +
> +#ifdef CONFIG_RISCV_PMU
> +
> +/*
> + * The RISCV_MAX_COUNTERS parameter should be specified.
> + */
> +
> +#define RISCV_MAX_COUNTERS 64
> +#define RISCV_OP_UNSUPP (-EOPNOTSUPP)
> +#define RISCV_PMU_PDEV_NAME "riscv-pmu"
> +
> +#define RISCV_PMU_STOP_FLAG_RESET 1
> +
> +struct cpu_hw_events {
> + /* currently enabled events */
> + int n_events;
> + /* currently enabled events */
> + struct perf_event *events[RISCV_MAX_COUNTERS];
> + /* currently enabled counters */
> + DECLARE_BITMAP(used_event_ctrs, RISCV_MAX_COUNTERS);
> +};
> +
> +struct riscv_pmu {
> + struct pmu pmu;
> + char *name;
> +
> + irqreturn_t (*handle_irq)(int irq_num, void *dev);
> +
> + int num_counters;
> + u64 (*ctr_read)(struct perf_event *event);
> + int (*ctr_get_idx)(struct perf_event *event);
> + int (*ctr_get_width)(int idx);
> + void (*ctr_clear_idx)(struct perf_event *event);
> + void (*ctr_start)(struct perf_event *event, u64 init_val);
> + void (*ctr_stop)(struct perf_event *event, unsigned long flag);
> + int (*event_map)(struct perf_event *event, u64 *config);
> +
> + struct cpu_hw_events __percpu *hw_events;
> + struct hlist_node node;
> +};
> +
> +#define to_riscv_pmu(p) (container_of(p, struct riscv_pmu, pmu))
> +unsigned long riscv_pmu_ctr_read_csr(unsigned long csr);
> +int riscv_pmu_event_set_period(struct perf_event *event);
> +uint64_t riscv_pmu_ctr_get_width_mask(struct perf_event *event);
> +u64 riscv_pmu_event_update(struct perf_event *event);
> +struct riscv_pmu *riscv_pmu_alloc(void);
> +
> +#endif /* CONFIG_RISCV_PMU */
> +
> +#endif /* _ASM_RISCV_PERF_EVENT_H */
> --
> 2.33.1
>

Apart from the minor comments above, this looks good to me.

Reviewed-by: Anup Patel <anup@xxxxxxxxxxxxxx>

Regards,
Anup