Re: [PATCH] sched: Add trace for task wake up latency and leave running time

From: Peter Zijlstra
Date: Fri Aug 21 2020 - 14:50:27 EST



> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 93ecd930efd3..edb622c40a90 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1324,6 +1324,13 @@ struct task_struct {
> /* CPU-specific state of this task: */
> struct thread_struct thread;
>
> + /* Task wake up time stamp */
> + u64 ts_wakeup;
> + /* Previous task switch out time stamp */
> + u64 pre_ts_end;
> + /* Next task switch in time stamp */
> + u64 next_ts_start;
> + bool wakeup_state;
> /*
> * WARNING: on x86, 'thread_struct' contains a variable-sized
> * structure. It *MUST* be at the end of 'task_struct'.

^^^ did you read that comment?

> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index fec25b9cfbaf..e99c6d573a42 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -183,6 +183,72 @@ TRACE_EVENT(sched_switch,
> __entry->next_comm, __entry->next_pid, __entry->next_prio)
> );
>
> +DECLARE_EVENT_CLASS(sched_latency_template,
> +
> + TP_PROTO(bool preempt,
> + struct task_struct *prev,
> + struct task_struct *next,
> + u64 time),
> +
> + TP_ARGS(preempt, prev, next, time),
> +
> + TP_STRUCT__entry(
> + __array( char, prev_comm, TASK_COMM_LEN )
> + __field( pid_t, prev_pid )
> + __field( int, prev_prio )
> + __field( long, prev_state )
> + __array( char, next_comm, TASK_COMM_LEN )
> + __field( pid_t, next_pid )
> + __field( int, next_prio )
> + __field( u64, time )
> + ),
> +
> + TP_fast_assign(
> + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> + __entry->prev_pid = prev->pid;
> + __entry->prev_prio = prev->prio;
> + __entry->prev_state = __trace_sched_switch_state(preempt, prev);
> + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> + __entry->next_pid = next->pid;
> + __entry->next_prio = next->prio;
> + __entry->time = time;
> + /* XXX SCHED_DEADLINE */
> + ),
> +
> + TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d passed time=%llu (ns)",
> + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
> +
> + (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
> + __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
> + { TASK_INTERRUPTIBLE, "S" },
> + { TASK_UNINTERRUPTIBLE, "D" },
> + { __TASK_STOPPED, "T" },
> + { __TASK_TRACED, "t" },
> + { EXIT_DEAD, "X" },
> + { EXIT_ZOMBIE, "Z" },
> + { TASK_PARKED, "P" },
> + { TASK_DEAD, "I" }) :
> + "R",
> +
> + __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
> + __entry->next_comm, __entry->next_pid, __entry->next_prio,
> + __entry->time)
> +);

NAK, that tracepoint is already broken, we don't want to proliferate the
broken.

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8471a0f7eb32..b5a1928dc948 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2464,6 +2464,8 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
> {
> check_preempt_curr(rq, p, wake_flags);
> p->state = TASK_RUNNING;
> + p->ts_wakeup = local_clock();
> + p->wakeup_state = true;
> trace_sched_wakeup(p);
>
> #ifdef CONFIG_SMP

NAK, userless overhead.

> @@ -2846,6 +2848,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
> success = 1;
> trace_sched_waking(p);
> p->state = TASK_RUNNING;
> + p->ts_wakeup = local_clock();
> + p->wakeup_state = true;
> trace_sched_wakeup(p);
> goto out;
> }

idem


> @@ -3355,6 +3362,8 @@ void wake_up_new_task(struct task_struct *p)
> post_init_entity_util_avg(p);
>
> activate_task(rq, p, ENQUEUE_NOCLOCK);
> + p->ts_wakeup = local_clock();
> + p->wakeup_state = true;
> trace_sched_wakeup_new(p);
> check_preempt_curr(rq, p, WF_FORK);
> #ifdef CONFIG_SMP

idem

> @@ -4521,8 +4530,19 @@ static void __sched notrace __schedule(bool preempt)
>
> psi_sched_switch(prev, next, !task_on_rq_queued(prev));
>
> + next->next_ts_start = prev->pre_ts_end = local_clock();
> +
> trace_sched_switch(preempt, prev, next);
>
> + if (next->ts_wakeup && next->wakeup_state) {
> + trace_sched_wakeup_latency(preempt, prev, next,
> + next->next_ts_start - next->ts_wakeup);
> + next->wakeup_state = false;
> + }
> +
> + trace_sched_leave_running_time(preempt, prev,
> + next, next->next_ts_start - next->pre_ts_end);
> +
> /* Also unlocks the rq: */
> rq = context_switch(rq, prev, next, &rf);
> } else {

This really isn't ever going to happen.