Re: [RFC 2/2] perf/core: Invoke pmu::sched_task callback for per-cpu events

From: Liang, Kan
Date: Thu Nov 05 2020 - 09:48:18 EST




On 11/2/2020 9:52 AM, Namhyung Kim wrote:
The commit 44fae179ce73 ("perf/core: Pull pmu::sched_task() into
perf_event_context_sched_out()") moved the pmu::sched_task callback to
be called for task event context. But it missed to call it for
per-cpu events to flush PMU internal buffers (i.e. for PEBS, ...).

This commit basically reverts the commit but keeps the optimization
for task events and only add missing calls for cpu events.

Fixes: 44fae179ce73 ("perf/core: Pull pmu::sched_task() into perf_event_context_sched_out()") > Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
---
include/linux/perf_event.h | 1 +
kernel/events/core.c | 38 ++++++++++++++++++++++++++++++++++++--
2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0defb526cd0c..abb70a557cb5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -872,6 +872,7 @@ struct perf_cpu_context {
struct list_head cgrp_cpuctx_entry;
#endif
+ struct list_head sched_cb_entry;
int sched_cb_usage;
int online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index aaa0155c4142..c04d9a913537 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -383,6 +383,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -3480,11 +3481,16 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
}
}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
void perf_sched_cb_dec(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- --cpuctx->sched_cb_usage;
+ this_cpu_dec(perf_sched_cb_usages);
+
+ if (!--cpuctx->sched_cb_usage)
+ list_del(&cpuctx->sched_cb_entry);
}
@@ -3492,7 +3498,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- cpuctx->sched_cb_usage++;
+ if (!cpuctx->sched_cb_usage++)
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+ this_cpu_inc(perf_sched_cb_usages);
}
/*
@@ -3521,6 +3530,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (prev == next)
+ return;
+
+ list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+ /* will be handled in perf_event_context_sched_in/out */
+ if (cpuctx->task_ctx)
+ continue;
+
+ __perf_pmu_sched_task(cpuctx, sched_in);
+ }
+}
+
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
@@ -3543,6 +3570,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
@@ -3850,6 +3880,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
+
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
}

It looks like the ("perf/core: Pull pmu::sched_task() into perf_event_context_sched_in()") is also reverted in the patch.

static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -12999,6 +13032,7 @@ static void __init perf_event_init_all_cpus(void)
#ifdef CONFIG_CGROUP_PERF
INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
+ INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}


Can we only update the perf_sched_cb_usages and sched_cb_list for per-cpu event as below patch (not tested)?

If user only uses the per-process event, we don't need to go through the list.


diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6586f7e71cfb..63c9b87cab5e 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -380,7 +380,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
cpuhw->bhrb_context = event->ctx;
}
cpuhw->bhrb_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->ctx->pmu, !(event->attach_state & PERF_ATTACH_TASK));
}

static void power_pmu_bhrb_disable(struct perf_event *event)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 444e5f061d04..a34b90c7fa6d 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1022,9 +1022,9 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,

if (needed_cb != pebs_needs_sched_cb(cpuc)) {
if (!needed_cb)
- perf_sched_cb_inc(pmu);
+ perf_sched_cb_inc(pmu, !(event->attach_state & PERF_ATTACH_TASK));
else
- perf_sched_cb_dec(pmu);
+ perf_sched_cb_dec(pmu, !(event->attach_state & PERF_ATTACH_TASK));

update = true;
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8961653c5dd2..8d4d02cde3d4 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -693,7 +693,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
*/
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->ctx->pmu, !(event->attach_state & PERF_ATTACH_TASK));
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();

@@ -740,7 +740,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->ctx->pmu, !(event->attach_state & PERF_ATTACH_TASK));
}

static inline bool vlbr_exclude_host(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index abb70a557cb5..5a02239ca8fd 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -966,8 +966,8 @@ extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
-extern void perf_sched_cb_dec(struct pmu *pmu);
-extern void perf_sched_cb_inc(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct pmu *pmu, bool systemwide);
+extern void perf_sched_cb_inc(struct pmu *pmu, bool systemwide);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);

diff --git a/kernel/events/core.c b/kernel/events/core.c
index e6b98507734a..2d7c07af02f8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3484,25 +3484,29 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,

static DEFINE_PER_CPU(struct list_head, sched_cb_list);

-void perf_sched_cb_dec(struct pmu *pmu)
+void perf_sched_cb_dec(struct pmu *pmu, bool systemwide)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

- this_cpu_dec(perf_sched_cb_usages);
+ --cpuctx->sched_cb_usage;

- if (!--cpuctx->sched_cb_usage)
+ if (systemwide) {
+ this_cpu_dec(perf_sched_cb_usages);
list_del(&cpuctx->sched_cb_entry);
+ }
}


-void perf_sched_cb_inc(struct pmu *pmu)
+void perf_sched_cb_inc(struct pmu *pmu, bool systemwide)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ cpuctx->sched_cb_usage++;

- this_cpu_inc(perf_sched_cb_usages);
+ if (systemwide) {
+ this_cpu_inc(perf_sched_cb_usages);
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ }
}

/*

Thanks,
Kan