[RFC PATCH] trace/perf: cure locking issue in perf_event_open() error path

From: Sebastian Siewior
Date: Fri Apr 28 2017 - 10:25:12 EST


As trinity figured out, there is a recursive get_online_cpus() in
perf_event_open()'s error path:
| Call Trace:
| dump_stack+0x86/0xce
| __lock_acquire+0x2520/0x2cd0
| lock_acquire+0x27c/0x2f0
| get_online_cpus+0x3d/0x80
| static_key_slow_dec+0x5a/0x70
| sw_perf_event_destroy+0x8e/0x100
| _free_event+0x61b/0x800
| free_event+0x68/0x70
| SyS_perf_event_open+0x19db/0x1d80
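
Boiled down to the locking calls, the offending path is (a simplified
sketch, not the literal code):

    SYSCALL_DEFINE5(perf_event_open, ...)
    {
            get_online_cpus();
            ...
            free_event(event);
                    /* _free_event()
                     *  -> event->destroy(), here sw_perf_event_destroy()
                     *      -> static_key_slow_dec()
                     *          -> get_online_cpus()   <- taken recursively
                     */
            ...
            put_online_cpus();
    }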

In order to cure this, I am moving free_event() past the
put_online_cpus() block, as shown below.
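
After the move, the tail of the error unwinding in sys_perf_event_open()
looks like this (condensed from the diff below):

    err_cred:
            if (task)
                    mutex_unlock(&task->signal->cred_guard_mutex);
    err_cpus:
            put_online_cpus();
            /* outside of the get_online_cpus() section now */
            if (event && !event_file)
                    free_event(event);
    err_task:
            if (task)
                    put_task_struct(task);
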
Besides that one, there is also the error path in perf_event_alloc(),
which also invokes event->destroy. Here I defer the destroy work via
schedule_work().
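
The deferral itself is the usual work_struct pattern; a minimal sketch of
the idea, with made-up names (my_event, my_destroy_worker) rather than the
actual ones from the patch:

    struct my_event {
            void (*destroy)(struct my_event *ev);
            struct work_struct destroy_work;
    };

    static void my_destroy_worker(struct work_struct *work)
    {
            struct my_event *ev;

            ev = container_of(work, struct my_event, destroy_work);
            ev->destroy(ev);        /* may take get_online_cpus() safely */
            kfree(ev);
    }

    /* in the error path, instead of calling ev->destroy() directly: */
    INIT_WORK(&ev->destroy_work, my_destroy_worker);
    schedule_work(&ev->destroy_work);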

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---

I am not quite happy with the schedule_work() part.

 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 52 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a635887f28..d6a874dbbd21 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -718,6 +718,7 @@ struct perf_event {
 #endif
 
 	struct list_head		sb_list;
+	struct work_struct		destroy_work;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7aed78b516fc..3358889609f8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9320,6 +9320,16 @@ static void account_event(struct perf_event *event)
 	account_pmu_sb_event(event);
 }
 
+static void perf_alloc_destroy_ev(struct work_struct *work)
+{
+	struct perf_event *event;
+
+	event = container_of(work, struct perf_event, destroy_work);
+	event->destroy(event);
+	module_put(event->pmu->module);
+	kfree(event);
+}
+
 /*
  * Allocate and initialize a event structure
  */
@@ -9334,6 +9344,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	struct pmu *pmu;
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
+	bool delay_destroy = false;
 	long err = -EINVAL;
 
 	if ((unsigned)cpu >= nr_cpu_ids) {
@@ -9497,15 +9508,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	exclusive_event_destroy(event);
 
 err_pmu:
-	if (event->destroy)
-		event->destroy(event);
-	module_put(pmu->module);
+	if (event->destroy) {
+		/* delay ->destroy due to nested get_online_cpus() */
+		INIT_WORK(&event->destroy_work, perf_alloc_destroy_ev);
+		delay_destroy = true;
+	} else {
+		module_put(pmu->module);
+	}
 err_ns:
 	if (is_cgroup_event(event))
 		perf_detach_cgroup(event);
 	if (event->ns)
 		put_pid_ns(event->ns);
-	kfree(event);
+	if (delay_destroy)
+		schedule_work(&event->destroy_work);
+	else
+		kfree(event);
 
 	return ERR_PTR(err);
 }
@@ -9798,7 +9816,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
 	struct perf_event *group_leader = NULL, *output_event = NULL;
-	struct perf_event *event, *sibling;
+	struct perf_event *event = NULL, *sibling;
 	struct perf_event_attr attr;
 	struct perf_event_context *ctx, *uninitialized_var(gctx);
 	struct file *event_file = NULL;
@@ -9908,13 +9926,14 @@ SYSCALL_DEFINE5(perf_event_open,
 				 NULL, NULL, cgroup_fd);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
+		event = NULL;
 		goto err_cred;
 	}
 
 	if (is_sampling_event(event)) {
 		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
 			err = -EOPNOTSUPP;
-			goto err_alloc;
+			goto err_cred;
 		}
 	}
 
@@ -9927,7 +9946,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (attr.use_clockid) {
 		err = perf_event_set_clock(event, attr.clockid);
 		if (err)
-			goto err_alloc;
+			goto err_cred;
 	}
 
 	if (pmu->task_ctx_nr == perf_sw_context)
@@ -9962,7 +9981,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	ctx = find_get_context(pmu, task, event);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
-		goto err_alloc;
+		goto err_cred;
 	}
 
 	if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
@@ -10186,18 +10205,21 @@ SYSCALL_DEFINE5(perf_event_open,
 err_context:
 	perf_unpin_context(ctx);
 	put_ctx(ctx);
-err_alloc:
-	/*
-	 * If event_file is set, the fput() above will have called ->release()
-	 * and that will take care of freeing the event.
-	 */
-	if (!event_file)
-		free_event(event);
 err_cred:
 	if (task)
 		mutex_unlock(&task->signal->cred_guard_mutex);
 err_cpus:
 	put_online_cpus();
+	/*
+	 * The event cleanup should happen earlier (as per cleanup in reverse
+	 * allocation order). It is delayed until after the put_online_cpus()
+	 * section so that we don't invoke event->destroy() within it and risk
+	 * a recursive get_online_cpus() via static_key_slow_dec().
+	 * If event_file is set, the fput() above will have called ->release()
+	 * and that will take care of freeing the event.
+	 */
+	if (event && !event_file)
+		free_event(event);
 err_task:
 	if (task)
 		put_task_struct(task);
--
2.11.0