Re: [RFC] perf: Delayed userspace unwind (Was: [PATCH v3 00/10] x86: ORC unwinder)

From: Andy Lutomirski
Date: Fri Jul 28 2017 - 23:42:05 EST


> On Jul 25, 2017, at 7:55 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
>> On Thu, Jul 13, 2017 at 11:19:11AM +0200, Ingo Molnar wrote:
>>
>> * Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>>
>>>> One gloriously ugly hack would be to delay the userspace unwind to
>>>> return-to-userspace, at which point we have a schedulable context and can take
>>>> faults.
>>
>> I don't think it's ugly, and it has various advantages:
>>
>>>> Of course, then you have to somehow identify this later unwind sample with all
>>>> relevant prior samples and stitch the whole thing back together, but that
>>>> should be doable.
>>>>
>>>> In fact, it would not be at all hard to do, just queue a task_work from the
>>>> NMI and have that do the EH based unwind.
>>

I haven't checked task_work specifically, but a bunch of the exit work
is permitted to sleep, which is potentially useful.

If this becomes successful enough that we could eventually deprecate
the old code, I wonder if copy_from_user_nmi() could go away? :)

>
> ---
> include/linux/perf_event.h | 1 +
> include/uapi/linux/perf_event.h | 14 ++++++-
> kernel/events/callchain.c | 86 ++++++++++++++++++++++++++++++++++++++---
> kernel/events/core.c | 18 +++------
> 4 files changed, 100 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index a3b873fc59e4..241251533e39 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -682,6 +682,7 @@ struct perf_event {
> int pending_disable;
> struct irq_work pending;
>
> + struct callback_head pending_callchain;
> atomic_t event_limit;
>
> /* address range filters */
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 642db5fa3286..342def57ef34 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -368,7 +368,8 @@ struct perf_event_attr {
> context_switch : 1, /* context switch data */
> write_backward : 1, /* Write ring buffer from end to beginning */
> namespaces : 1, /* include namespaces data */
> - __reserved_1 : 35;
> + delayed_user_callchain : 1, /* ... */
> + __reserved_1 : 34;
>
> union {
> __u32 wakeup_events; /* wakeup every n events */
> @@ -915,6 +916,17 @@ enum perf_event_type {
> */
> PERF_RECORD_NAMESPACES = 16,
>
> + /*
> + * struct {
> + * struct perf_event_header header;
> + * { u64 nr,
> + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
> + * struct sample_id sample_id;
> + * };
> + *
> + */
> + PERF_RECORD_CALLCHAIN = 17,
> +
> PERF_RECORD_MAX, /* non-ABI */
> };
>
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 1b2be63c8528..c98a12f3592c 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -12,6 +12,7 @@
> #include <linux/perf_event.h>
> #include <linux/slab.h>
> #include <linux/sched/task_stack.h>
> +#include <linux/task_work.h>
>
> #include "internal.h"
>
> @@ -178,19 +179,94 @@ put_callchain_entry(int rctx)
> put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
> }
>
> +static struct perf_callchain_entry __empty = { .nr = 0, };
> +
> +static void perf_callchain_work(struct callback_head *work)
> +{
> + struct perf_event *event = container_of(work, struct perf_event, pending_callchain);
> + struct perf_output_handle handle;
> + struct perf_sample_data sample;
> + size_t size;
> + int ret;
> +
> + struct {
> + struct perf_event_header header;
> + } callchain_event = {
> + .header = {
> + .type = PERF_RECORD_CALLCHAIN,
> + .misc = 0,
> + .size = sizeof(callchain_event),
> + },
> + };
> +
> + perf_event_header__init_id(&callchain_event.header, &sample, event);
> +
> + sample.callchain = get_perf_callchain(task_pt_regs(current),
> + /* init_nr */ 0,
> + /* kernel */ false,
> + /* user */ true,
> + event->attr.sample_max_stack,
> + /* crosstask */ false,
> + /* add_mark */ true);
> +
> + if (!sample.callchain)
> + sample.callchain = &__empty;
> +
> + size = sizeof(u64) * (1 + sample.callchain->nr);
> + callchain_event.header.size += size;
> +
> + ret = perf_output_begin(&handle, event, callchain_event.header.size);
> + if (ret)
> + return;
> +
> + perf_output_put(&handle, callchain_event);
> + __output_copy(&handle, sample.callchain, size);
> + perf_event__output_id_sample(event, &handle, &sample);
> + perf_output_end(&handle);
> +
> + barrier();
> + work->func = NULL; /* done */
> +}
> +
> struct perf_callchain_entry *
> perf_callchain(struct perf_event *event, struct pt_regs *regs)
> {
> - bool kernel = !event->attr.exclude_callchain_kernel;
> - bool user = !event->attr.exclude_callchain_user;
> + bool kernel = !event->attr.exclude_callchain_kernel;
> + bool user = !event->attr.exclude_callchain_user;
> + bool delayed = event->attr.delayed_user_callchain;
> +
> /* Disallow cross-task user callchains. */
> bool crosstask = event->ctx->task && event->ctx->task != current;
> const u32 max_stack = event->attr.sample_max_stack;
>
> - if (!kernel && !user)
> - return NULL;
> + struct perf_callchain_entry *callchain = NULL;
> +
> + if (user && delayed && !crosstask) {
> + struct callback_head *work = &event->pending_callchain;
> +
> + if (!work->func) {
> + work->func = perf_callchain_work;
> + /*
> + * We cannot do set_notify_resume() from NMI context,
> + * also, knowing we are already in an interrupted
> + * context and will pass return to userspace, we can
> + * simply set TIF_NOTIFY_RESUME.
> + */
> + task_work_add(current, work, false);
> + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);

There's a more or leas unavoidable window in which this won't be
noticed, which could plausibly confuse userspace. It might be
possible to figure out a way for an NMI to tell if it lands in this
window, but it would be a bit tricky. Also, is the task_work code
prepared to handle task_work_add during exit?