Re: [PATCH v2] tracing: Expose event tracing infrastructure
From: Jovi Zhang
Date: Fri Mar 15 2013 - 08:31:23 EST
On Wed, Mar 13, 2013 at 6:41 PM, zhangwei(Jovi)
<jovi.zhangwei@xxxxxxxxxx> wrote:
> [change from v1: add missing type assignment in ftrace_event_register]
>
> Currently, event tracing can only be used by ftrace and perf;
> there is no mechanism that lets modules (like external tracing tools)
> register their own tracing callback functions.
>
> Event tracing is implemented on top of tracepoints. Compared with raw
> tracepoints, the event tracing infrastructure provides a built-in
> structured event annotation format; this feature should be exposed to
> external users.
>
> For example, this simple pseudo ktap script demonstrates how the
> exposed event tracing could be used:
>
> function event_trace(e)
> {
> printf(e.annotate);
> }
>
> os.trace("sched:sched_switch", event_trace);
> os.trace("irq:softirq_raise", event_trace);
>
> The resulting output:
> sched_switch: prev_comm=rcu_sched prev_pid=10 prev_prio=120 prev_state=S ==> next_comm=swapper/1 next_pid=0 next_prio=120
> softirq_raise: vec=1 [action=TIMER]
> ...
>
> This change could also be used by other tracing tools, like
> systemtap/lttng, if they choose to implement against it.
>
> This patch introduces struct event_trace_ops, which has two function
> pointers, pre_trace and do_trace. When the ftrace_raw_event_<call>
> function is hit, it calls all registered event_trace_ops.
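>
> As a minimal sketch of how an external module might hook in through
> this API (the my_* names below are illustrative only, not part of
> this patch):
>
> 	/* illustrative consumer callbacks, not part of this patch */
> 	static void *my_pre_trace(struct ftrace_event_call *call,
> 				  int entry_size, void *data)
> 	{
> 		/* Reserve space for the event entry. This runs in
> 		 * tracepoint context, so only atomic allocation is
> 		 * safe; returning NULL skips this event for this
> 		 * consumer. */
> 		return kmalloc(entry_size, GFP_ATOMIC);
> 	}
>
> 	static void my_do_trace(struct ftrace_event_call *call,
> 				void *entry, int entry_size, void *data)
> 	{
> 		/* entry now holds the assigned event fields;
> 		 * consume it, then release the buffer. */
> 		kfree(entry);
> 	}
>
> 	static struct event_trace_ops my_ops = {
> 		.pre_trace = my_pre_trace,
> 		.do_trace  = my_do_trace,
> 	};
>
> 	/* e.g. from a module's init path, given a resolved
> 	 * struct ftrace_event_call *call for the target event
> 	 * (the type tags this registration; this patch defines
> 	 * TRACE_REG_FTRACE and TRACE_REG_PERF): */
> 	static int my_hook_event(struct ftrace_event_call *call)
> 	{
> 		return ftrace_event_register(call, TRACE_REG_FTRACE,
> 					     &my_ops);
> 	}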
>
> With this unified callback mechanism, ftrace_raw_event_<call> and
> perf_trace_<call> are integrated into one function; as a side benefit,
> the kernel size shrinks by ~52K (with ftrace and perf compiled in).
>
> text data bss dec hex filename
> 7801238 841596 3473408 12116242 b8e112 vmlinux.old
> 7757064 833596 3473408 12064068 b81544 vmlinux.new
>
> Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@xxxxxxxxxx>
> ---
> include/linux/ftrace_event.h | 63 +++++++++++++-
> include/trace/ftrace.h | 198 ++++++++----------------------------------
> kernel/trace/trace_events.c | 174 ++++++++++++++++++++++++++++++++++---
> 3 files changed, 260 insertions(+), 175 deletions(-)
>
> diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
> index 13a54d0..4539a79 100644
> --- a/include/linux/ftrace_event.h
> +++ b/include/linux/ftrace_event.h
> @@ -167,9 +167,6 @@ struct ftrace_event_call;
> struct ftrace_event_class {
> char *system;
> void *probe;
> -#ifdef CONFIG_PERF_EVENTS
> - void *perf_probe;
> -#endif
> int (*reg)(struct ftrace_event_call *event,
> enum trace_reg type, void *data);
> int (*define_fields)(struct ftrace_event_call *);
> @@ -199,6 +196,57 @@ enum {
> TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
> };
>
> +struct ftrace_trace_descriptor_t {
> + struct ring_buffer_event *event;
> + struct ring_buffer *buffer;
> + unsigned long irq_flags;
> + int pc;
> +};
> +
> +#ifdef CONFIG_PERF_EVENTS
> +struct perf_trace_descriptor_t {
> + struct pt_regs __regs;
> + struct task_struct *__task;
> + u64 __addr;
> + u64 __count;
> + int rctx;
> +};
> +#endif
> +
> +/*
> + * trace_descriptor_t is used to pass arguments between the
> + * pre_trace and do_trace functions.
> + * This definition is ugly; clean it up in the future.
> + */
> +struct trace_descriptor_t {
> + struct ftrace_trace_descriptor_t f;
> +#ifdef CONFIG_PERF_EVENTS
> + struct perf_trace_descriptor_t p;
> +#endif
> + void *data;
> +};
> +
> +enum TRACE_REG_TYPE {
> + TRACE_REG_FTRACE,
> + TRACE_REG_PERF,
> +};
> +
> +/* callback function for tracing */
> +struct event_trace_ops {
> + void *(*pre_trace)(struct ftrace_event_call *event_call,
> + int entry_size, void *data);
> + void (*do_trace)(struct ftrace_event_call *event_call,
> + void *entry, int entry_size, void *data);
> +};
> +
> +struct ftrace_probe {
> + struct list_head list;
> +
> + /* 0: TRACE_REG_FTRACE; 1: TRACE_REG_PERF */
> + int type;
> + struct event_trace_ops *ops;
> +};
> +
> struct ftrace_event_call {
> struct list_head list;
> struct ftrace_event_class *class;
> @@ -210,6 +258,10 @@ struct ftrace_event_call {
> void *mod;
> void *data;
>
> + /* list head of "struct ftrace_probe" */
> + struct list_head probe_ops_list;
> + int probe_count;
> +
> /*
> * 32 bit flags:
> * bit 1: enabled
> @@ -274,6 +326,11 @@ extern int trace_define_field(struct ftrace_event_call *call, const char *type,
> extern int trace_add_event_call(struct ftrace_event_call *call);
> extern void trace_remove_event_call(struct ftrace_event_call *call);
>
> +extern int ftrace_event_register(struct ftrace_event_call *call, int type,
> + struct event_trace_ops *ops);
> +extern void ftrace_event_unregister(struct ftrace_event_call *call, int type,
> + struct event_trace_ops *ops);
> +
> #define is_signed_type(type) (((type)(-1)) < (type)0)
>
> int trace_set_clr_event(const char *system, const char *event, int set);
> diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
> index 40dc5e8..c1f526a 100644
> --- a/include/trace/ftrace.h
> +++ b/include/trace/ftrace.h
> @@ -412,38 +412,6 @@ static inline notrace int ftrace_get_offsets_##call( \
> *
> * static struct ftrace_event_call event_<call>;
> *
> - * static void ftrace_raw_event_<call>(void *__data, proto)
> - * {
> - * struct ftrace_event_call *event_call = __data;
> - * struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
> - * struct ring_buffer_event *event;
> - * struct ftrace_raw_<call> *entry; <-- defined in stage 1
> - * struct ring_buffer *buffer;
> - * unsigned long irq_flags;
> - * int __data_size;
> - * int pc;
> - *
> - * local_save_flags(irq_flags);
> - * pc = preempt_count();
> - *
> - * __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
> - *
> - * event = trace_current_buffer_lock_reserve(&buffer,
> - * event_<call>->event.type,
> - * sizeof(*entry) + __data_size,
> - * irq_flags, pc);
> - * if (!event)
> - * return;
> - * entry = ring_buffer_event_data(event);
> - *
> - * { <assign>; } <-- Here we assign the entries by the __field and
> - * __array macros.
> - *
> - * if (!filter_current_check_discard(buffer, event_call, entry, event))
> - * trace_current_buffer_unlock_commit(buffer,
> - * event, irq_flags, pc);
> - * }
> - *
> * static struct trace_event ftrace_event_type_<call> = {
> * .trace = ftrace_raw_output_<call>, <-- stage 2
> * };
> @@ -472,20 +440,6 @@ static inline notrace int ftrace_get_offsets_##call( \
> *
> */
>
> -#ifdef CONFIG_PERF_EVENTS
> -
> -#define _TRACE_PERF_PROTO(call, proto) \
> - static notrace void \
> - perf_trace_##call(void *__data, proto);
> -
> -#define _TRACE_PERF_INIT(call) \
> - .perf_probe = perf_trace_##call,
> -
> -#else
> -#define _TRACE_PERF_PROTO(call, proto)
> -#define _TRACE_PERF_INIT(call)
> -#endif /* CONFIG_PERF_EVENTS */
> -
> #undef __entry
> #define __entry entry
>
> @@ -509,44 +463,56 @@ static inline notrace int ftrace_get_offsets_##call( \
> #undef TP_fast_assign
> #define TP_fast_assign(args...) args
>
> +#ifdef CONFIG_PERF_EVENTS
> +#undef __perf_addr
> +#define __perf_addr(a) __desc.p.__addr = (a)
> +
> +#undef __perf_count
> +#define __perf_count(c) __desc.p.__count = (c)
> +
> +#undef __perf_task
> +#define __perf_task(t) __desc.p.__task = (t)
> +
> #undef TP_perf_assign
> -#define TP_perf_assign(args...)
> +#define TP_perf_assign(args...) args
> +#endif /* CONFIG_PERF_EVENTS */
>
> #undef DECLARE_EVENT_CLASS
> -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
> +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
> \
> -static notrace void \
> -ftrace_raw_event_##call(void *__data, proto) \
> -{ \
> - struct ftrace_event_call *event_call = __data; \
> +static notrace void \
> +ftrace_raw_event_##call(void *__data, proto) \
> +{ \
> + struct ftrace_event_call *event_call = __data; \
> struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
> - struct ring_buffer_event *event; \
> - struct ftrace_raw_##call *entry; \
> - struct ring_buffer *buffer; \
> - unsigned long irq_flags; \
> - int __data_size; \
> - int pc; \
> - \
> - local_save_flags(irq_flags); \
> - pc = preempt_count(); \
> + struct trace_descriptor_t __desc; \
> + struct ftrace_raw_##call *entry; \
> + struct ftrace_probe *probe_data; \
> + int __data_size, __entry_size; \
> \
> __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
> + __entry_size = sizeof(*entry) + __data_size; \
> \
> - event = trace_current_buffer_lock_reserve(&buffer, \
> - event_call->event.type, \
> - sizeof(*entry) + __data_size, \
> - irq_flags, pc); \
> - if (!event) \
> - return; \
> - entry = ring_buffer_event_data(event); \
> + list_for_each_entry_rcu(probe_data, &event_call->probe_ops_list,\
> + list) { \
> + struct event_trace_ops *probe_ops = probe_data->ops; \
> \
> - tstruct \
> + if (probe_data->type == TRACE_REG_PERF) \
> + perf_fetch_caller_regs(&__desc.p.__regs); \
> \
> - { assign; } \
> + entry = probe_ops->pre_trace(event_call, __entry_size, \
> + &__desc); \
> + if (!entry) \
> + continue; \
> \
> - if (!filter_current_check_discard(buffer, event_call, entry, event)) \
> - trace_buffer_unlock_commit(buffer, event, irq_flags, pc); \
> + tstruct \
> + \
> + { assign; } \
> + \
> + probe_ops->do_trace(event_call, entry, __entry_size, &__desc); \
> + } \
> }
> +
> /*
> * The ftrace_test_probe is compiled out, it is only here as a build time check
> * to make sure that if the tracepoint handling changes, the ftrace probe will
> @@ -579,7 +545,6 @@ static inline void ftrace_test_probe_##call(void) \
>
> #undef DECLARE_EVENT_CLASS
> #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
> -_TRACE_PERF_PROTO(call, PARAMS(proto)); \
> static const char print_fmt_##call[] = print; \
> static struct ftrace_event_class __used event_class_##call = { \
> .system = __stringify(TRACE_SYSTEM), \
> @@ -588,7 +553,6 @@ static struct ftrace_event_class __used event_class_##call = { \
> .raw_init = trace_event_raw_init, \
> .probe = ftrace_raw_event_##call, \
> .reg = ftrace_event_reg, \
> - _TRACE_PERF_INIT(call) \
> };
>
> #undef DEFINE_EVENT
> @@ -619,91 +583,5 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
>
> #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
>
> -
> -#ifdef CONFIG_PERF_EVENTS
> -
> -#undef __entry
> -#define __entry entry
> -
> -#undef __get_dynamic_array
> -#define __get_dynamic_array(field) \
> - ((void *)__entry + (__entry->__data_loc_##field & 0xffff))
> -
> -#undef __get_str
> -#define __get_str(field) (char *)__get_dynamic_array(field)
> -
> -#undef __perf_addr
> -#define __perf_addr(a) __addr = (a)
> -
> -#undef __perf_count
> -#define __perf_count(c) __count = (c)
> -
> -#undef __perf_task
> -#define __perf_task(t) __task = (t)
> -
> -#undef TP_perf_assign
> -#define TP_perf_assign(args...) args
> -
> -#undef DECLARE_EVENT_CLASS
> -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
> -static notrace void \
> -perf_trace_##call(void *__data, proto) \
> -{ \
> - struct ftrace_event_call *event_call = __data; \
> - struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
> - struct ftrace_raw_##call *entry; \
> - struct pt_regs __regs; \
> - u64 __addr = 0, __count = 1; \
> - struct task_struct *__task = NULL; \
> - struct hlist_head *head; \
> - int __entry_size; \
> - int __data_size; \
> - int rctx; \
> - \
> - perf_fetch_caller_regs(&__regs); \
> - \
> - __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
> - __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
> - sizeof(u64)); \
> - __entry_size -= sizeof(u32); \
> - \
> - if (WARN_ONCE(__entry_size > PERF_MAX_TRACE_SIZE, \
> - "profile buffer not large enough")) \
> - return; \
> - \
> - entry = (struct ftrace_raw_##call *)perf_trace_buf_prepare( \
> - __entry_size, event_call->event.type, &__regs, &rctx); \
> - if (!entry) \
> - return; \
> - \
> - tstruct \
> - \
> - { assign; } \
> - \
> - head = this_cpu_ptr(event_call->perf_events); \
> - perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
> - __count, &__regs, head, __task); \
> -}
> -
> -/*
> - * This part is compiled out, it is only here as a build time check
> - * to make sure that if the tracepoint handling changes, the
> - * perf probe will fail to compile unless it too is updated.
> - */
> -#undef DEFINE_EVENT
> -#define DEFINE_EVENT(template, call, proto, args) \
> -static inline void perf_test_probe_##call(void) \
> -{ \
> - check_trace_callback_type_##call(perf_trace_##template); \
> -}
> -
> -
> -#undef DEFINE_EVENT_PRINT
> -#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
> - DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
> -
> -#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
> -#endif /* CONFIG_PERF_EVENTS */
> -
> #undef _TRACE_PROFILE_INIT
>
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 57e9b28..69304ff 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -142,33 +142,183 @@ int trace_event_raw_init(struct ftrace_event_call *call)
> if (!id)
> return -ENODEV;
>
> + INIT_LIST_HEAD(&call->probe_ops_list);
> + call->probe_count = 0;
> +
> return 0;
> }
> EXPORT_SYMBOL_GPL(trace_event_raw_init);
>
> +static void *ftrace_events_pre_trace(struct ftrace_event_call *event_call,
> + int entry_size, void *data)
> +{
> + struct ftrace_trace_descriptor_t *desc = &((struct trace_descriptor_t *)
> + data)->f;
> + struct ring_buffer_event *event;
> + struct ring_buffer *buffer;
> + unsigned long irq_flags;
> + int pc;
> +
> + local_save_flags(irq_flags);
> + pc = preempt_count();
> +
> + event = trace_current_buffer_lock_reserve(&buffer,
> + event_call->event.type,
> + entry_size, irq_flags, pc);
> +
> + if (!event)
> + return NULL;
> +
> + desc->event = event;
> + desc->buffer = buffer;
> + desc->irq_flags = irq_flags;
> + desc->pc = pc;
> +
> + return ring_buffer_event_data(event);
> +}
> +
> +static void ftrace_events_do_trace(struct ftrace_event_call *event_call,
> + void *entry, int entry_size, void *data)
> +{
> + struct ftrace_trace_descriptor_t *desc = &((struct trace_descriptor_t *)
> + data)->f;
> + struct ring_buffer_event *event = desc->event;
> + struct ring_buffer *buffer = desc->buffer;
> + unsigned long irq_flags = desc->irq_flags;
> + int pc = desc->pc;
> +
> + if (!filter_current_check_discard(buffer, event_call, entry, event))
> + trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
> +}
> +
> +static struct event_trace_ops ftrace_events_ops = {
> + .pre_trace = ftrace_events_pre_trace,
> + .do_trace = ftrace_events_do_trace,
> +};
> +
> +#ifdef CONFIG_PERF_EVENTS
> +static void *perf_events_pre_trace(struct ftrace_event_call *event_call,
> + int entry_size, void *data)
> +{
> + struct perf_trace_descriptor_t *desc = &((struct trace_descriptor_t *)
> + data)->p;
> + struct pt_regs *__regs = &desc->__regs;
> + int *rctx = &desc->rctx;
> + int __entry_size;
> +
> + __entry_size = ALIGN(entry_size + sizeof(u32), sizeof(u64));
> + __entry_size -= sizeof(u32);
> +
> + if (WARN_ONCE(__entry_size > PERF_MAX_TRACE_SIZE,
> + "profile buffer not large enough"))
> + return NULL;
> +
> + return perf_trace_buf_prepare(__entry_size, event_call->event.type,
> + __regs, rctx);
> +}
> +
> +static void perf_events_do_trace(struct ftrace_event_call *event_call,
> + void *entry, int entry_size, void *data)
> +{
> + struct perf_trace_descriptor_t *desc = &((struct trace_descriptor_t *)
> + data)->p;
> + struct hlist_head *head;
> +
> + head = this_cpu_ptr(event_call->perf_events);
> + perf_trace_buf_submit(entry, entry_size, desc->rctx, desc->__addr,
> + desc->__count, &desc->__regs, head, desc->__task);
> +}
> +
> +static struct event_trace_ops perf_events_ops = {
> + .pre_trace = perf_events_pre_trace,
> + .do_trace = perf_events_do_trace,
> +};
> +#endif /* CONFIG_PERF_EVENTS */
> +
> +int ftrace_event_register(struct ftrace_event_call *call, int type,
> + struct event_trace_ops *ops)
> +{
> + struct ftrace_probe *probe_data;
> + int ret = 0;
> +
> + if (call->probe_count == 0) {
> + ret = tracepoint_probe_register(call->name,
> + call->class->probe, call);
> + if (ret)
> + return ret;
> + } else {
> + /* reject duplicate register */
> + list_for_each_entry_rcu(probe_data, &call->probe_ops_list,
> + list) {
> + if ((probe_data->type == type) &&
> + (probe_data->ops == ops))
> + return -EBUSY;
> + }
> + }
> +
> + probe_data = kmalloc(sizeof(struct ftrace_probe), GFP_KERNEL);
> + if (!probe_data)
> + return -ENOMEM;
> +
> + INIT_LIST_HEAD(&probe_data->list);
> + probe_data->ops = ops;
> + probe_data->type = type;
> + list_add_tail_rcu(&probe_data->list, &call->probe_ops_list);
> + call->probe_count++;
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(ftrace_event_register);
> +
> +void ftrace_event_unregister(struct ftrace_event_call *call, int type,
> + struct event_trace_ops *ops)
> +{
> + struct ftrace_probe *probe_data;
> + int found = 0;
> +
> + if (call->probe_count == 0)
> + return;
> +
> + list_for_each_entry_rcu(probe_data, &call->probe_ops_list, list) {
> + if ((probe_data->type == type) && (probe_data->ops == ops)) {
> + list_del_rcu(&probe_data->list);
> + kfree(probe_data);
> + found = 1;
> + break;
> + }
> + }
> +
> + if (!found)
> + return;
> +
> + call->probe_count--;
> +
> + if (!call->probe_count)
> + tracepoint_probe_unregister(call->name,
> + call->class->probe, call);
> +}
> +EXPORT_SYMBOL_GPL(ftrace_event_unregister);
> +
> int ftrace_event_reg(struct ftrace_event_call *call,
> enum trace_reg type, void *data)
> {
> switch (type) {
> case TRACE_REG_REGISTER:
> - return tracepoint_probe_register(call->name,
> - call->class->probe,
> - call);
> + return ftrace_event_register(call, TRACE_REG_FTRACE,
> + &ftrace_events_ops);
> +
> case TRACE_REG_UNREGISTER:
> - tracepoint_probe_unregister(call->name,
> - call->class->probe,
> - call);
> + ftrace_event_unregister(call, TRACE_REG_FTRACE,
> + &ftrace_events_ops);
> return 0;
>
> #ifdef CONFIG_PERF_EVENTS
> case TRACE_REG_PERF_REGISTER:
> - return tracepoint_probe_register(call->name,
> - call->class->perf_probe,
> - call);
> + return ftrace_event_register(call, TRACE_REG_PERF,
> + &perf_events_ops);
> +
> case TRACE_REG_PERF_UNREGISTER:
> - tracepoint_probe_unregister(call->name,
> - call->class->perf_probe,
> - call);
> + ftrace_event_unregister(call, TRACE_REG_PERF, &perf_events_ops);
> return 0;
> case TRACE_REG_PERF_OPEN:
> case TRACE_REG_PERF_CLOSE:
> --
> 1.7.9.7
>
Hi Steven,

Would you please give some comments? The patch works normally on my box.

.jovi