Re: [PATCH v10 06/14] unwind_user/deferred: Add deferred unwinding interface
From: Steven Rostedt
Date: Wed Jun 18 2025 - 15:09:32 EST
On Wed, 18 Jun 2025 20:46:20 +0200
Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> > +struct unwind_work;
> > +
> > +typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 timestamp);
> > +
> > +struct unwind_work {
> > + struct list_head list;
>
> Does this really need to be a list? Single linked list like
> callback_head not good enough?
Doesn't a list head make it easier to remove without having to iterate the
list?
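To illustrate what I mean (a rough sketch, not patch code; the singly
linked variant and its sl_* names are made up for comparison):

/* With list_head, the node knows its own prev/next, so cancel is O(1): */
void unwind_deferred_cancel(struct unwind_work *work)
{
	guard(mutex)(&callback_mutex);
	list_del(&work->list);
}

/*
 * With a singly linked list, cancel has to walk the list to find the
 * pointer that links to the node being removed:
 */
struct sl_work {
	struct sl_work		*next;
	unwind_callback_t	func;
};
static struct sl_work *sl_callbacks;

void sl_cancel(struct sl_work *work)
{
	struct sl_work **p;

	guard(mutex)(&callback_mutex);
	for (p = &sl_callbacks; *p; p = &(*p)->next) {
		if (*p == work) {
			*p = work->next;	/* splice it out */
			break;
		}
	}
}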
>
> > + unwind_callback_t func;
> > +};
> > +
> > #ifdef CONFIG_UNWIND_USER
> >
> > void unwind_task_init(struct task_struct *task);
> > @@ -12,10 +22,15 @@ void unwind_task_free(struct task_struct *task);
> >
> > int unwind_deferred_trace(struct unwind_stacktrace *trace);
> >
> > +int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
> > +int unwind_deferred_request(struct unwind_work *work, u64 *timestamp);
> > +void unwind_deferred_cancel(struct unwind_work *work);
> > +
> > static __always_inline void unwind_exit_to_user_mode(void)
> > {
> > if (unlikely(current->unwind_info.cache))
> > current->unwind_info.cache->nr_entries = 0;
> > + current->unwind_info.timestamp = 0;
>
> Surely clearing that timestamp is only relevant when there is a cache
> around? Better to not add this unconditional write to the exit path.
That's actually not quite true. If the allocation fails, we still want to
clear the timestamp. But later patches add more data to check, and the
function does exit early if there have been no requests:
{
	struct unwind_task_info *info = &current->unwind_info;
	unsigned long bits;

	/* Was there any unwinding? */
	if (likely(!info->unwind_mask))
		return;

	bits = info->unwind_mask;
	do {
		/* Is a task_work going to run again before going back */
		if (bits & UNWIND_PENDING)
			return;
	} while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));

	if (likely(info->cache))
		info->cache->nr_entries = 0;
	info->timestamp = 0;
}
But to make review easier, I could add a comment in this patch stating
that this will eventually exit early once it does more work.
>
> > }
> >
> > #else /* !CONFIG_UNWIND_USER */
> > @@ -24,6 +39,9 @@ static inline void unwind_task_init(struct task_struct *task) {}
> > static inline void unwind_task_free(struct task_struct *task) {}
> > static inline int unwind_deferred_trace(struct unwind_stacktrace *trace) { return -ENOSYS; }
> > +static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
> > +static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
> > +static inline void unwind_deferred_cancel(struct unwind_work *work) {}
> > static inline void unwind_exit_to_user_mode(void) {}
> > diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
> > index db5b54b18828..5df264cf81ad 100644
> > --- a/include/linux/unwind_deferred_types.h
> > +++ b/include/linux/unwind_deferred_types.h
> > @@ -9,6 +9,9 @@ struct unwind_cache {
> > struct unwind_task_info {
> > struct unwind_cache *cache;
> > + struct callback_head work;
> > + u64 timestamp;
> > + int pending;
> > };
> >
> > #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
> > diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> > index e3913781c8c6..b76c704ddc6d 100644
> > --- a/kernel/unwind/deferred.c
> > +++ b/kernel/unwind/deferred.c
> > @@ -2,13 +2,35 @@
> > /*
> > * Deferred user space unwinding
> > */
> > +#include <linux/sched/task_stack.h>
> > +#include <linux/unwind_deferred.h>
> > +#include <linux/sched/clock.h>
> > +#include <linux/task_work.h>
> > #include <linux/kernel.h>
> > #include <linux/sched.h>
> > #include <linux/slab.h>
> > -#include <linux/unwind_deferred.h>
> > +#include <linux/mm.h>
> >
> > #define UNWIND_MAX_ENTRIES 512
> >
> > +/* Guards adding to and reading the list of callbacks */
> > +static DEFINE_MUTEX(callback_mutex);
> > +static LIST_HEAD(callbacks);
>
> Global state.. smells like failure.
Yes, the unwind infrastructure is global; it is how a task knows which
tracers' callbacks to call.
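For context, a tracer's usage looks roughly like this (a sketch based
only on the signatures above; the my_* names are made up, and the real
consumers show up later in the series):

static struct unwind_work my_work;

static void my_callback(struct unwind_work *work,
			struct unwind_stacktrace *trace, u64 timestamp)
{
	/* Record trace->entries[0..trace->nr) tagged with timestamp */
}

static int my_tracer_enable(void)
{
	/* Register the callback on the global list */
	return unwind_deferred_init(&my_work, my_callback);
}

static void my_tracer_event(void)
{
	u64 timestamp;

	/*
	 * Request a deferred unwind. The timestamp identifies this
	 * task's current entry into the kernel, so the kernel event
	 * can be matched to the user stacktrace later.
	 */
	unwind_deferred_request(&my_work, &timestamp);
}

static void my_tracer_disable(void)
{
	/* Remove the callback; after this my_work can be freed */
	unwind_deferred_cancel(&my_work);
}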
>
> > +/*
> > + * Read the task context timestamp, if this is the first caller then
> > + * it will set the timestamp.
> > + */
> > +static u64 get_timestamp(struct unwind_task_info *info)
> > +{
> > + lockdep_assert_irqs_disabled();
> > +
> > + if (!info->timestamp)
> > + info->timestamp = local_clock();
> > +
> > + return info->timestamp;
> > +}
> > +
> > /**
> > * unwind_deferred_trace - Produce a user stacktrace in faultable context
> > * @trace: The descriptor that will store the user stacktrace
> > @@ -59,11 +81,117 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
> > return 0;
> > }
> >
> > +static void unwind_deferred_task_work(struct callback_head *head)
> > +{
> > + struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
> > + struct unwind_stacktrace trace;
> > + struct unwind_work *work;
> > + u64 timestamp;
> > +
> > + if (WARN_ON_ONCE(!info->pending))
> > + return;
> > +
> > + /* Allow work to come in again */
> > + WRITE_ONCE(info->pending, 0);
> > +
> > + /*
> > + * From here on out, the callback must always be called, even if it's
> > + * just an empty trace.
> > + */
> > + trace.nr = 0;
> > + trace.entries = NULL;
> > +
> > + unwind_deferred_trace(&trace);
> > +
> > + timestamp = info->timestamp;
> > +
> > + guard(mutex)(&callback_mutex);
> > + list_for_each_entry(work, &callbacks, list) {
> > + work->func(work, &trace, timestamp);
> > + }
>
> So now you're globally serializing all return-to-user instances. How is
> that not a problem?
It was the original way we did things. The next patch changes this to
SRCU, but that requires a bit more care. To break up the series, I
preferred to keep that logic out of this patch and add it as a separate
one.
To make that easier to review, I'll add a comment here that says:
/* TODO switch this global lock to SRCU */
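Roughly, the read side would then become something like this (sketch
only; the actual conversion is the next patch, and the unwind_srcu name
here is just a placeholder):

DEFINE_STATIC_SRCU(unwind_srcu);

static void unwind_deferred_task_work(struct callback_head *head)
{
	/* ... same setup and unwind_deferred_trace() call as above ... */
	int srcu_idx;

	srcu_idx = srcu_read_lock(&unwind_srcu);
	list_for_each_entry_srcu(work, &callbacks, list,
				 srcu_read_lock_held(&unwind_srcu))
		work->func(work, &trace, timestamp);
	srcu_read_unlock(&unwind_srcu, srcu_idx);
}

with registration/removal still serialized by callback_mutex but using
list_add_rcu()/list_del_rcu(), plus a synchronize_srcu() in cancel
before the caller is allowed to free the unwind_work.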
-- Steve