Re: [PATCH v2] perf: Synchronously cleanup child events

From: Peter Zijlstra
Date: Mon Jan 25 2016 - 09:54:27 EST


On Mon, Jan 25, 2016 at 12:48:46PM +0100, Peter Zijlstra wrote:
> On Fri, Jan 22, 2016 at 11:44:03AM -0800, Alexei Starovoitov wrote:
> > On Fri, Jan 22, 2016 at 01:38:47PM +0100, Peter Zijlstra wrote:
> > > On Fri, Jan 22, 2016 at 01:35:40PM +0200, Alexander Shishkin wrote:
> > > > Peter Zijlstra <peterz@xxxxxxxxxxxxx> writes:
> > > >
> > > > > So I think there's a number of problems still :-(
> >
> > I've been looking at how perf_event->owner is handled and couldn't
> > figure out how you deal with the case of passing perf_event_fd via scm_rights.
> > It seems one process can open an event, pass it to another process,
> > but when the current process exits, owner will still point to the dead task,
> > since refcount > 0.
> > Which part am I missing?
>
> Nothing, you raised a good point. I think this shows we cannot link
> !event->owner to an event being 'dead'.
>
> If we keep these two states separate, the scm_rights thing should work
> again.

Alexander, Alexei,

How about the below? That uses event->state == PERF_EVENT_STATE_EXIT to
indicate the event has been given up by its 'owner' and decouples us
from the actual event->owner logic.

This retains the event->owner and event->owner_list thing purely for the
prctl(.option = PR_TASK_PERF_EVENTS_{EN,DIS}ABLE) calls, but does give
us strict 'owner' semantics in that:

struct perf_event *my_event = perf_event_create_kernel_counter();

/* ... */

perf_event_release_kernel(my_event);

Or

int fd = sys_perf_event_open(...);

close(fd); /* last, calls fops::release */

Will destroy the event dead. event::refcount will 'retain' the object,
but it will become non-functional and is strictly meant as a temporary
existence guarantee (for when RCU isn't good enough).

So this should restore the scm_rights case: the fd is preserved, but
the event may end up without an event->owner (and therefore be removed
from its owner_list), which is fine.

BPF still needs to get fixed to use filedesc references instead.
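
To make the filedesc-reference idea concrete, a rough kernel-side
sketch of the direction (hypothetical helper name, not the eventual
BPF patch): pin the struct file with fget(), so that the final fput(),
and with it perf_release()/perf_event_release_kernel(), cannot run
while the event is still in use, and drop the file reference when done.

	#include <linux/err.h>
	#include <linux/file.h>
	#include <linux/perf_event.h>

	/* hypothetical helper, for illustration only */
	static struct perf_event *bpf_perf_event_pin(int ufd, struct file **filep)
	{
		struct file *file = fget(ufd);

		if (!file)
			return ERR_PTR(-EBADF);

		/*
		 * A real implementation would also verify file->f_op is
		 * the perf fops before trusting private_data.
		 */
		*filep = file;
		return file->private_data; /* the event, see perf_release() */
	}

	/* ... and fput(*filep) once the BPF program is done with it. */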

---

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -634,9 +634,6 @@ struct perf_event_context {
int nr_cgroups; /* cgroup evts */
void *task_ctx_data; /* pmu specific data */
struct rcu_head rcu_head;
-
- struct delayed_work orphans_remove;
- bool orphans_remove_sched;
};

/*
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,8 +49,6 @@

#include <asm/irq_regs.h>

-static struct workqueue_struct *perf_wq;
-
typedef int (*remote_function_f)(void *);

struct remote_function_call {
@@ -1086,8 +1084,8 @@ static void put_ctx(struct perf_event_co
* Lock order:
* task_struct::perf_event_mutex
* perf_event_context::mutex
- * perf_event_context::lock
* perf_event::child_mutex;
+ * perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
*/
@@ -1646,45 +1644,11 @@ static void perf_group_detach(struct per
perf_event__header_size(tmp);
}

-/*
- * User event without the task.
- */
static bool is_orphaned_event(struct perf_event *event)
{
- return event && !is_kernel_event(event) && !event->owner;
-}
-
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
-{
- return is_orphaned_event(event->parent);
-}
-
-static void orphans_remove_work(struct work_struct *work);
-
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
- if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
- return;
-
- if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
- get_ctx(ctx);
- ctx->orphans_remove_sched = true;
- }
+ return event->state == PERF_EVENT_STATE_EXIT;
}

-static int __init perf_workqueue_init(void)
-{
- perf_wq = create_singlethread_workqueue("perf");
- WARN(!perf_wq, "failed to create perf workqueue\n");
- return perf_wq ? 0 : -1;
-}
-
-core_initcall(perf_workqueue_init);
-
static inline int pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
@@ -1745,9 +1709,6 @@ event_sched_out(struct perf_event *event
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;

- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
perf_pmu_enable(event->pmu);
}

@@ -1771,6 +1732,9 @@ group_sched_out(struct perf_event *group
cpuctx->exclusive = 0;
}

+#define DETACH_GROUP 0x01
+#define DETACH_STATE 0x02
+
/*
* Cross CPU call to remove a performance event
*
@@ -1783,13 +1747,19 @@ __perf_remove_from_context(struct perf_e
struct perf_event_context *ctx,
void *info)
{
- bool detach_group = (unsigned long)info;
+ unsigned long flags = (unsigned long)info;

event_sched_out(event, cpuctx, ctx);
- if (detach_group)
+
+ if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);

+ if (flags & DETACH_STATE) {
+ event->state = PERF_EVENT_STATE_EXIT;
+ perf_event_wakeup(event);
+ }
+
if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
if (ctx->task) {
@@ -1809,12 +1779,11 @@ __perf_remove_from_context(struct perf_e
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
lockdep_assert_held(&event->ctx->mutex);

- event_function_call(event, __perf_remove_from_context,
- (void *)(unsigned long)detach_group);
+ event_function_call(event, __perf_remove_from_context, (void *)flags);
}

/*
@@ -1980,9 +1949,6 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;

- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
out:
perf_pmu_enable(event->pmu);

@@ -2253,7 +2219,8 @@ static void __perf_event_enable(struct p
struct perf_event *leader = event->group_leader;
struct perf_event_context *task_ctx;

- if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state <= PERF_EVENT_STATE_ERROR)
return;

update_context_time(ctx);
@@ -2298,7 +2265,8 @@ static void _perf_event_enable(struct pe
struct perf_event_context *ctx = event->ctx;

raw_spin_lock_irq(&ctx->lock);
- if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state < PERF_EVENT_STATE_ERROR) {
raw_spin_unlock_irq(&ctx->lock);
return;
}
@@ -3363,7 +3331,6 @@ static void __perf_event_init_context(st
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
- INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}

static struct perf_event_context *
@@ -3665,29 +3632,6 @@ static bool exclusive_event_installable(
return true;
}

-static void __free_event(struct perf_event *event)
-{
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
-
- if (event->destroy)
- event->destroy(event);
-
- if (event->ctx)
- put_ctx(event->ctx);
-
- if (event->pmu) {
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
- }
-
- call_rcu(&event->rcu_head, free_event_rcu);
-}
-
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
@@ -3709,7 +3653,25 @@ static void _free_event(struct perf_even
if (is_cgroup_event(event))
perf_detach_cgroup(event);

- __free_event(event);
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
+
+ perf_event_free_bpf_prog(event);
+
+ if (event->destroy)
+ event->destroy(event);
+
+ if (event->ctx)
+ put_ctx(event->ctx);
+
+ if (event->pmu) {
+ exclusive_event_destroy(event);
+ module_put(event->pmu->module);
+ }
+
+ call_rcu(&event->rcu_head, free_event_rcu);
}

/*
@@ -3771,8 +3733,10 @@ static void perf_remove_from_owner(struc
* ensured they're done, and we can proceed with freeing the
* event.
*/
- if (event->owner)
+ if (event->owner) {
list_del_init(&event->owner_entry);
+ event->owner = NULL;
+ }
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
@@ -3780,11 +3744,23 @@ static void perf_remove_from_owner(struc

static void put_event(struct perf_event *event)
{
- struct perf_event_context *ctx;
-
if (!atomic_long_dec_and_test(&event->refcount))
return;

+ _free_event(event);
+}
+
+/*
+ * Kill an event dead; while event::refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *child, *tmp;
+ LIST_HEAD(child_list);
+
if (!is_kernel_event(event))
perf_remove_from_owner(event);

@@ -3802,14 +3778,57 @@ static void put_event(struct perf_event
*/
ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, true);
+ perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
perf_event_ctx_unlock(event, ctx);

- _free_event(event);
-}
+ /*
+ * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
+ * either from the above perf_remove_from_context() or through
+ * __perf_event_exit_task().
+ *
+ * Therefore, anybody acquiring event->child_mutex after the below
+ * list_splice_init() _must_ also see this, most importantly
+ * inherit_event() which will avoid placing more children on the
+ * list.
+ *
+ * Thus this guarantees that we will in fact observe and kill _ALL_
+ * child events.
+ */
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);

-int perf_event_release_kernel(struct perf_event *event)
-{
+ /*
+ * event::child_mutex nests inside ctx::mutex, so move children
+ * to a safe place first and avoid inversion
+ */
+ mutex_lock(&event->child_mutex);
+ list_splice_init(&event->child_list, &child_list);
+ mutex_unlock(&event->child_mutex);
+
+ list_for_each_entry_safe(child, tmp, &child_list, child_list) {
+ struct perf_event_context *ctx;
+
+ /*
+ * This is somewhat similar to perf_free_event(),
+ * except that these events are still alive and need
+ * proper perf_remove_from_context().
+ */
+ ctx = perf_event_ctx_lock(child);
+ perf_remove_from_context(child, DETACH_GROUP);
+ perf_event_ctx_unlock(child, ctx);
+
+ list_del(&child->child_list);
+
+ /* Children will have exactly one reference */
+ free_event(child);
+
+ /*
+ * This matches the refcount bump in inherit_event();
+ * this can't be the last reference.
+ */
+ put_event(event);
+ }
+
+ /* Must be the last reference */
put_event(event);
return 0;
}
@@ -3820,46 +3839,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_ker
*/
static int perf_release(struct inode *inode, struct file *file)
{
- put_event(file->private_data);
+ perf_event_release_kernel(file->private_data);
return 0;
}

-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = container_of(work, struct perf_event_context,
- orphans_remove.work);
-
- mutex_lock(&ctx->mutex);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
- struct perf_event *parent_event = event->parent;
-
- if (!is_orphaned_child(event))
- continue;
-
- perf_remove_from_context(event, true);
-
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- free_event(event);
- put_event(parent_event);
- }
-
- raw_spin_lock_irq(&ctx->lock);
- ctx->orphans_remove_sched = false;
- raw_spin_unlock_irq(&ctx->lock);
- mutex_unlock(&ctx->mutex);
-
- put_ctx(ctx);
-}
-
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
@@ -8432,11 +8415,11 @@ SYSCALL_DEFINE5(perf_event_open,
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
- perf_remove_from_context(group_leader, false);
+ perf_remove_from_context(group_leader, 0);

list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling, false);
+ perf_remove_from_context(sibling, 0);
put_ctx(gctx);
}

@@ -8616,7 +8599,7 @@ void perf_pmu_migrate_context(struct pmu
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event, false);
+ perf_remove_from_context(event, 0);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
@@ -8665,7 +8648,7 @@ void perf_pmu_migrate_context(struct pmu
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);

static void sync_child_event(struct perf_event *child_event,
- struct task_struct *child)
+ struct task_struct *child)
{
struct perf_event *parent_event = child_event->parent;
u64 child_val;
@@ -8684,25 +8667,6 @@ static void sync_child_event(struct perf
atomic64_add(child_event->total_time_running,
&parent_event->child_total_time_running);

- /*
- * Remove this event from the parent's list
- */
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&child_event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- /*
- * Make sure user/parent get notified, that we just
- * lost one event.
- */
- perf_event_wakeup(parent_event);
-
- /*
- * Release the parent event, if this was the last
- * reference to it.
- */
- put_event(parent_event);
}

static void
@@ -8710,6 +8674,8 @@ __perf_event_exit_task(struct perf_event
struct perf_event_context *child_ctx,
struct task_struct *child)
{
+ struct perf_event *parent_event = child_event->parent;
+
/*
* Do not destroy the 'original' grouping; because of the context
* switch optimization the original events could've ended up in a
@@ -8728,20 +8694,39 @@ __perf_event_exit_task(struct perf_event
if (!!child_event->parent)
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
+ child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
raw_spin_unlock_irq(&child_ctx->lock);

/*
- * It can happen that the parent exits first, and has events
- * that are still around due to the child reference. These
- * events need to be zapped.
+ * Parent events are governed by their filedesc, retain them.
*/
- if (child_event->parent) {
- sync_child_event(child_event, child);
- free_event(child_event);
- } else {
- child_event->state = PERF_EVENT_STATE_EXIT;
+ if (!child_event->parent) {
perf_event_wakeup(child_event);
+ return;
}
+ /*
+ * This is a child event, cleanup and free.
+ */
+
+ /*
+ * Fold delta back into parent counts.
+ */
+ sync_child_event(child_event, child);
+
+ /*
+ * Remove this event from the parent's list.
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&child_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ /*
+ * Kick perf_poll() for is_event_hup().
+ */
+ perf_event_wakeup(parent_event);
+ free_event(child_event);
+ put_event(parent_event);
}

static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
@@ -8973,8 +8958,15 @@ inherit_event(struct perf_event *parent_
if (IS_ERR(child_event))
return child_event;

+ /*
+ * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+ * must be under the same lock in order to serialize against
+ * perf_event_release_kernel().
+ */
+ mutex_lock(&parent_event->child_mutex);
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ mutex_unlock(&parent_event->child_mutex);
free_event(child_event);
return NULL;
}
@@ -9022,8 +9014,6 @@ inherit_event(struct perf_event *parent_
/*
* Link this into the parent event's child list
*/
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
list_add_tail(&child_event->child_list, &parent_event->child_list);
mutex_unlock(&parent_event->child_mutex);