Re: [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring

From: Liang, Kan
Date: Tue Apr 30 2019 - 11:46:06 EST




On 4/30/2019 4:56 AM, Peter Zijlstra wrote:
> On Mon, Apr 29, 2019 at 07:44:02AM -0700, kan.liang@xxxxxxxxxxxxxxx wrote:
> > 
> > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > index e47ef76..039e2f2 100644
> > --- a/include/linux/perf_event.h
> > +++ b/include/linux/perf_event.h
> > @@ -795,6 +795,7 @@ struct perf_cpu_context {
> >  #ifdef CONFIG_CGROUP_PERF
> >  	struct perf_cgroup		*cgrp;
> >  	struct list_head		cgrp_cpuctx_entry;
> > +	unsigned int			cgrp_switch :1;
> 
> If you're not adding more bits, why not just keep it an int?


Yes, there are no more bits needed for now.
Sure, I will change it to int.


> >  #endif
> >  	struct list_head		sched_cb_entry;
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index dc7dead..388dd42 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -809,6 +809,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
> >  		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> >  		perf_pmu_disable(cpuctx->ctx.pmu);
> > +		cpuctx->cgrp_switch = true;
> >  		if (mode & PERF_CGROUP_SWOUT) {
> >  			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> > @@ -832,6 +833,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
> >  							     &cpuctx->ctx);
> >  			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
> >  		}
> > +		cpuctx->cgrp_switch = false;
> >  		perf_pmu_enable(cpuctx->ctx.pmu);
> >  		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> >  	}
> 
> That is a bit of a hack...

We need an indicator that a cgroup context switch is in progress, so we
can apply the optimization.
I considered reusing existing state, e.g. cpuctx->cgrp, but that doesn't
work: checking cpuctx->cgrp cannot tell whether this is the first
sched_in or a context switch.
A dedicated cgrp_switch flag seems to be the simplest method.
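
To illustrate why cpuctx->cgrp alone is ambiguous, a rough sketch
(simplified, not the exact code path):

	/*
	 * By the time ctx_sched_out()/ctx_sched_in() run,
	 * cpuctx->cgrp is already set both on the first sched_in
	 * of a cgroup event and during a cgroup context switch,
	 * so a check like this cannot tell the two apart:
	 */
	if (cpuctx->cgrp && !event->cgrp)
		continue;	/* would also skip the first sched_in */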


> > @@ -2944,13 +2946,25 @@ static void ctx_sched_out(struct perf_event_context *ctx,
> >  	perf_pmu_disable(ctx->pmu);
> >  	if (is_active & EVENT_PINNED) {
> > -		list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
> > +		list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) {
> > +#ifdef CONFIG_CGROUP_PERF
> > +			/* Don't sched system-wide events during a cgroup context switch */
> > +			if (cpuctx->cgrp_switch && !event->cgrp)
> > +				continue;
> > +#endif
> >  			group_sched_out(event, cpuctx, ctx);
> > +		}
> >  	}
> 
> This works by accident, however...

> >  	if (is_active & EVENT_FLEXIBLE) {
> > -		list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
> > +		list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) {
> > +#ifdef CONFIG_CGROUP_PERF
> > +			/* Don't sched system-wide events during a cgroup context switch */
> > +			if (cpuctx->cgrp_switch && !event->cgrp)
> > +				continue;
> > +#endif
> >  			group_sched_out(event, cpuctx, ctx);
> > +		}
> >  	}
> >  	perf_pmu_enable(ctx->pmu);
> >  }
> 
> this one is just wrong afaict.
> 
> Suppose the new cgroup has pinned events, which we cannot schedule
> because you left !cgroup flexible events on.


Right, the pinned/flexible priority order could be violated.
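
To make it concrete: suppose all the GP counters are occupied by !cgroup
flexible events. On a switch into a cgroup that has a pinned event, the
fast path leaves those flexible events running, so the pinned event
cannot find a free counter and goes into error state. A flexible event
would end up blocking a pinned one.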

I think we can track the event types in perf_cgroup.
If the new cgroup has pinned events, we can fall back to the slow path,
which switches everything.

How about the patch below? (Not tested yet.)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7eff286..f751852 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -843,6 +843,8 @@ struct perf_cgroup {
 	struct perf_cgroup_info	__percpu *info;
 	/* perf cgroup ID = (CPU ID << 32) | css subsys-unique ID */
 	u64 __percpu		*cgrp_id;
+
+	int			cgrp_event_type;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2066322..557d371 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -839,6 +839,22 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 			 */
 			cpuctx->cgrp = perf_cgroup_from_task(task,
 							     &cpuctx->ctx);
+
+			/*
+			 * The system-wide events weren't sched out.
+			 * To keep the following priority order:
+			 *	cpu pinned, cpu flexible
+			 * we need to switch the system-wide events as well
+			 * if the new cgroup has pinned events.
+			 *
+			 * Disable the fast path and sched out the flexible events.
+			 */
+			if (cpuctx->cgrp->cgrp_event_type & EVENT_PINNED) {
+				/* disable fast path */
+				cpuctx->cgrp_switch = false;
+				cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+			}
+
 			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 		}
 		cpuctx->cgrp_switch = false;
@@ -924,6 +940,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 	cgrp = container_of(css, struct perf_cgroup, css);
 	event->cgrp = cgrp;
 
+	if (event->attr.pinned)
+		cgrp->cgrp_event_type |= EVENT_PINNED;
+	else
+		cgrp->cgrp_event_type |= EVENT_FLEXIBLE;
+
 	cgrp_id = ((u64)smp_processor_id() << 32) | css->id;
 	event->cgrp_id = cgrp_id;
 	*per_cpu_ptr(cgrp->cgrp_id, event->cpu) = cgrp_id;


Thanks,
Kan