Re: [RFC][PATCH] ftrace: function graph trace context switches

From: Frederic Weisbecker
Date: Mon Mar 16 2009 - 14:03:50 EST


On Mon, Jan 26, 2009 at 03:38:12PM +0100, Peter Zijlstra wrote:
> On Mon, 2009-01-26 at 12:35 +0100, Peter Zijlstra wrote:
> >
> > Another nice thing would be to have ctx switches like:
> >
> > foo-1 => bar-2 ran: ${time foo spent on the cpu} since: ${time bar
> > spent away from the cpu}
>
> Steve, Frederic, how's this?
>
> (compile tested only)


Hi Peter,

This patch was very useful. Would you still like to post
an updated one?
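
For reference, the context-switch annotation it produces in the
graph output looks roughly like this (a hand-written sample; the
pids and the two nanosecond counts are purely illustrative):

 ------------------------------------------
 1)    bash-2264  =>  sshd-2263   ran: 129012, since: 582394
 ------------------------------------------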

Maybe this feature could be a separate option, so that we can
select it only when needed.
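
Something like the sketch below, perhaps (just a rough idea; the
option name FUNCTION_GRAPH_SWITCH_TRACE is made up here):

config FUNCTION_GRAPH_SWITCH_TRACE
	bool "Trace context switches in the function graph tracer"
	depends on FUNCTION_GRAPH_TRACER
	select SCHEDSTATS
	help
	  Annotate the function graph trace with one line per context
	  switch, showing how long the previous task ran on the cpu
	  and how long the next task had been waiting for it.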

Thanks,
Frederic.


> ---
>  include/linux/ftrace.h               |    5 ++
>  kernel/sched_fair.c                  |    1 -
>  kernel/trace/Kconfig                 |    1 +
>  kernel/trace/trace.c                 |   51 +++++++++++++++++++++++++++
>  kernel/trace/trace.h                 |   10 ++++
>  kernel/trace/trace_functions_graph.c |   88 ++++++++++++-------------
>  6 files changed, 104 insertions(+), 52 deletions(-)
>
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index 9f7880d..411b027 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -381,6 +381,11 @@ struct ftrace_graph_ret {
>  	int depth;
>  };
>  
> +struct ftrace_graph_switch {
> +	pid_t	prev, next;
> +	u64	ran, since;
> +};
> +
>  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
>  
>  /*
> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index f34cf42..fa477ac 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -530,7 +530,6 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  	schedstat_set(se->wait_count, se->wait_count + 1);
>  	schedstat_set(se->wait_sum, se->wait_sum +
>  			rq_of(cfs_rq)->clock - se->wait_start);
> -	schedstat_set(se->wait_start, 0);
>  }
>  
>  static inline void
> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index dde1d46..7aa1c13 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -67,6 +67,7 @@ config FUNCTION_GRAPH_TRACER
>  	bool "Kernel Function Graph Tracer"
>  	depends on HAVE_FUNCTION_GRAPH_TRACER
>  	depends on FUNCTION_TRACER
> +	select SCHEDSTATS
>  	default y
>  	help
>  	  Enable the kernel to trace a function at both its return
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 2129ab9..380a334 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -31,6 +31,7 @@
>  #include <linux/fs.h>
>  #include <linux/kprobes.h>
>  #include <linux/writeback.h>
> +#include <linux/sched.h>
>  
>  #include <linux/stacktrace.h>
>  #include <linux/ring_buffer.h>
> @@ -820,6 +821,35 @@ static void __trace_graph_return(struct trace_array *tr,
>  	entry->ret = *trace;
>  	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
>  }
> +
> +static void __trace_graph_switch(struct trace_array *tr,
> +				 struct trace_array_cpu *data,
> +				 unsigned long flags, int pc,
> +				 struct task_struct *prev,
> +				 struct task_struct *next)
> +{
> +	struct ring_buffer_event *event;
> +	struct ftrace_graph_switch_entry *entry;
> +	unsigned long irq_flags;
> +
> +	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
> +		return;
> +
> +	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
> +					 &irq_flags);
> +	if (!event)
> +		return;
> +	entry = ring_buffer_event_data(event);
> +	tracing_generic_entry_update(&entry->ent, flags, pc);
> +	entry->ent.type = TRACE_GRAPH_SWITCH;
> +	entry->ctx.prev = prev->pid;
> +	entry->ctx.next = next->pid;
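> +	/*
> +	 * ran: time prev just spent on the cpu;
> +	 * since: time next spent waiting to run again
> +	 * (se.wait_start is only maintained with schedstats,
> +	 * hence the "select SCHEDSTATS" in the Kconfig hunk).
> +	 */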
> +	entry->ctx.ran = prev->se.sum_exec_runtime -
> +			 prev->se.prev_sum_exec_runtime;
> +	entry->ctx.since = next->se.exec_start - next->se.wait_start;
> +
> +	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
> +}
>  #endif
>  
>  void
> @@ -1097,6 +1127,27 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
>  	atomic_dec(&data->disabled);
>  	local_irq_restore(flags);
>  }
> +
> +void trace_graph_switch(struct task_struct *prev, struct task_struct *next)
> +{
> +	struct trace_array *tr = &global_trace;
> +	struct trace_array_cpu *data;
> +	unsigned long flags;
> +	long disabled;
> +	int cpu;
> +	int pc;
> +
> +	local_irq_save(flags);
> +	cpu = raw_smp_processor_id();
> +	data = tr->data[cpu];
> +	disabled = atomic_inc_return(&data->disabled);
> +	if (likely(disabled == 1)) {
> +		pc = preempt_count();
> +		__trace_graph_switch(tr, data, flags, pc, prev, next);
> +	}
> +	atomic_dec(&data->disabled);
> +	local_irq_restore(flags);
> +}
>  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
>  
>  enum trace_file_type {
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index b96037d..781fbce 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -27,6 +27,7 @@ enum trace_type {
>  	TRACE_BOOT_RET,
>  	TRACE_GRAPH_RET,
>  	TRACE_GRAPH_ENT,
> +	TRACE_GRAPH_SWITCH,
>  	TRACE_USER_STACK,
>  	TRACE_HW_BRANCHES,
>  	TRACE_KMEM_ALLOC,
> @@ -71,6 +72,12 @@ struct ftrace_graph_ret_entry {
>  	struct trace_entry	ent;
>  	struct ftrace_graph_ret	ret;
>  };
> +
> +struct ftrace_graph_switch_entry {
> +	struct trace_entry		ent;
> +	struct ftrace_graph_switch	ctx;
> +};
> +
>  extern struct tracer boot_tracer;
>  
>  /*
> @@ -295,6 +302,8 @@ extern void __ftrace_bad_type(void);
>  			  TRACE_GRAPH_ENT);		\
>  		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
>  			  TRACE_GRAPH_RET);		\
> +		IF_ASSIGN(var, ent, struct ftrace_graph_switch_entry,	\
> +			  TRACE_GRAPH_SWITCH);		\
>  		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
>  		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER);	\
>  		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
> @@ -438,6 +447,7 @@ void trace_function(struct trace_array *tr,
>  
>  void trace_graph_return(struct ftrace_graph_ret *trace);
>  int trace_graph_entry(struct ftrace_graph_ent *trace);
> +void trace_graph_switch(struct task_struct *prev, struct task_struct *next);
>  
>  void tracing_start_cmdline_record(void);
>  void tracing_stop_cmdline_record(void);
> diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
> index 66fc7b8..cf6494b 100644
> --- a/kernel/trace/trace_functions_graph.c
> +++ b/kernel/trace/trace_functions_graph.c
> @@ -10,6 +10,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/ftrace.h>
>  #include <linux/fs.h>
> +#include <trace/sched.h>
>  
>  #include "trace.h"
>  #include "trace_output.h"
> @@ -158,28 +159,13 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
>  	return TRACE_TYPE_HANDLED;
>  }
>
> -
> -/* If the pid changed since the last trace, output this event */
>  static enum print_line_t
> -verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
> +print_graph_switch(struct ftrace_graph_switch_entry *field, struct trace_seq *s,
> +		   struct trace_iterator *iter)
>  {
> -	pid_t prev_pid;
> -	pid_t *last_pid;
> +	int cpu = iter->cpu;
>  	int ret;
>  
> -	if (!last_pids_cpu)
> -		return TRACE_TYPE_HANDLED;
> -
> -	last_pid = per_cpu_ptr(last_pids_cpu, cpu);
> -
> -	if (*last_pid == pid)
> -		return TRACE_TYPE_HANDLED;
> -
> -	prev_pid = *last_pid;
> -	*last_pid = pid;
> -
> -	if (prev_pid == -1)
> -		return TRACE_TYPE_HANDLED;
>  	/*
>  	 * Context-switch trace line:
>  
> @@ -197,7 +183,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
>  	if (ret == TRACE_TYPE_PARTIAL_LINE)
>  		TRACE_TYPE_PARTIAL_LINE;
>  
> -	ret = print_graph_proc(s, prev_pid);
> +	ret = print_graph_proc(s, field->ctx.prev);
>  	if (ret == TRACE_TYPE_PARTIAL_LINE)
>  		TRACE_TYPE_PARTIAL_LINE;
>  
> @@ -205,16 +191,21 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
>  	if (!ret)
>  		TRACE_TYPE_PARTIAL_LINE;
>  
> -	ret = print_graph_proc(s, pid);
> +	ret = print_graph_proc(s, field->ctx.next);
>  	if (ret == TRACE_TYPE_PARTIAL_LINE)
>  		TRACE_TYPE_PARTIAL_LINE;
>  
> +	ret = trace_seq_printf(s, " ran: %Lu, since: %Lu",
> +			       field->ctx.ran, field->ctx.since);
> +	if (!ret)
> +		return TRACE_TYPE_PARTIAL_LINE;
> +
>  	ret = trace_seq_printf(s,
>  		"\n ------------------------------------------\n\n");
>  	if (!ret)
>  		TRACE_TYPE_PARTIAL_LINE;
>  
> -	return ret;
> +	return TRACE_TYPE_HANDLED;
>  }
>  
>  static bool
> @@ -471,14 +462,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
>  {
>  	int ret;
>  	int cpu = iter->cpu;
> -	pid_t *last_entry = iter->private;
>  	struct trace_entry *ent = iter->ent;
>  	struct ftrace_graph_ent *call = &field->graph_ent;
>  
> -	/* Pid */
> -	if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE)
> -		return TRACE_TYPE_PARTIAL_LINE;
> -
>  	/* Interrupt */
>  	ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid);
>  	if (ret == TRACE_TYPE_PARTIAL_LINE)
> @@ -523,12 +509,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
>  	int i;
>  	int ret;
>  	int cpu = iter->cpu;
> -	pid_t *last_pid = iter->private;
>  	unsigned long long duration = trace->rettime - trace->calltime;
>  
> -	/* Pid */
> -	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
> -		return TRACE_TYPE_PARTIAL_LINE;
>  
>  	/* Absolute time */
>  	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
> @@ -600,7 +582,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
>  	int i;
>  	int ret;
>  	int cpu = iter->cpu;
> -	pid_t *last_pid = iter->private;
>  
>  	/* Absolute time */
>  	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
> @@ -609,10 +590,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
>  			return TRACE_TYPE_PARTIAL_LINE;
>  	}
>  
> -	/* Pid */
> -	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
> -		return TRACE_TYPE_PARTIAL_LINE;
> -
>  	/* Cpu */
>  	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
>  		ret = print_graph_cpu(s, cpu);
> @@ -677,6 +654,11 @@ print_graph_function(struct trace_iterator *iter)
>  	struct trace_entry *entry = iter->ent;
>  
>  	switch (entry->type) {
> +	case TRACE_GRAPH_SWITCH: {
> +		struct ftrace_graph_switch_entry *field;
> +		trace_assign_type(field, entry);
> +		return print_graph_switch(field, s, iter);
> +	}
>  	case TRACE_GRAPH_ENT: {
>  		struct ftrace_graph_ent_entry *field;
>  		trace_assign_type(field, entry);
> @@ -724,34 +706,38 @@ static void print_graph_headers(struct seq_file *s)
>  	seq_printf(s, " | | | |\n");
>  }
>
> -static void graph_trace_open(struct trace_iterator *iter)
> +static void probe_sched_switch(struct rq *rq,
> +			       struct task_struct *prev,
> +			       struct task_struct *next)
>  {
> -	/* pid on the last trace processed */
> -	pid_t *last_pid = alloc_percpu(pid_t);
> -	int cpu;
> +	trace_graph_switch(prev, next);
> +}
>  
> -	if (!last_pid)
> -		pr_warning("function graph tracer: not enough memory\n");
> -	else
> -		for_each_possible_cpu(cpu) {
> -			pid_t *pid = per_cpu_ptr(last_pid, cpu);
> -			*pid = -1;
> -		}
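> +
> +/*
> + * Refcount the sched_switch probe: register it when the first
> + * start comes in, unregister it again on the last stop.
> + */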
> +static DEFINE_MUTEX(graph_trace_mutex);
> +static int graph_trace_ref;
>  
> -	iter->private = last_pid;
> +static void graph_trace_start(struct trace_array *tr)
> +{
> +	mutex_lock(&graph_trace_mutex);
> +	if (!(graph_trace_ref++))
> +		register_trace_sched_switch(probe_sched_switch);
> +	mutex_unlock(&graph_trace_mutex);
>  }
>  
> -static void graph_trace_close(struct trace_iterator *iter)
> +static void graph_trace_stop(struct trace_array *tr)
>  {
> -	percpu_free(iter->private);
> +	mutex_lock(&graph_trace_mutex);
> +	if (!(--graph_trace_ref))
> +		unregister_trace_sched_switch(probe_sched_switch);
> +	mutex_unlock(&graph_trace_mutex);
>  }
>
>  static struct tracer graph_trace __read_mostly = {
>  	.name		= "function_graph",
> -	.open		= graph_trace_open,
> -	.close		= graph_trace_close,
>  	.init		= graph_trace_init,
>  	.reset		= graph_trace_reset,
> +	.start		= graph_trace_start,
> +	.stop		= graph_trace_stop,
>  	.print_line	= print_graph_function,
>  	.print_header	= print_graph_headers,
>  	.flags		= &tracer_flags,
>
>
