Re: [PATCH v2 00/14] perf, persistent: Kernel updates for perf tool integration

From: Ingo Molnar
Date: Wed Jun 26 2013 - 08:25:36 EST



* Ingo Molnar <mingo@xxxxxxxxxx> wrote:

> Note, for tracing the PERF_FLAG_FD_OUTPUT method of multiplexing
> multiple events onto a single mmap buffer is probably useful (also
> usable via the PERF_EVENT_IOC_SET_OUTPUT ioctl()), so please make sure
> the scheme works naturally with that model as well, not just with 1:1
> event+buffer mappings.
>
> See the uses of PERF_EVENT_IOC_SET_OUTPUT in tools/perf/.

Note that another facility that would be very useful for tracing is
PeterZ's and tglx's patch that enables multiple tracepoints to be attached
to a single event.

See the 2+ years old (bitrotten and unfinished) WIP patch below.

It adds a PERF_EVENT_IOC_ADD_TP ioctl() that adds a new tracepoint to an
existing event. This makes perf based tracing scale up to an arbitrary
number of tracepoints in essence.

Thanks,

Ingo

------------------>
Subject: perf-tracepoint-idr.patch
From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Date: Wed, 24 Nov 2010 12:09:26 +0100

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
---
include/linux/ftrace_event.h | 10
include/linux/perf_event.h | 9
include/linux/sched.h | 9
include/trace/ftrace.h | 4
kernel/events/core.c | 407 ++++++++++++++++++++++++++++++++++++++--
kernel/trace/trace_event_perf.c | 95 +++------
kernel/trace/trace_kprobe.c | 10
kernel/trace/trace_output.c | 116 +++--------
kernel/trace/trace_syscalls.c | 8
9 files changed, 498 insertions(+), 170 deletions(-)

Index: linux/include/linux/ftrace_event.h
===================================================================
--- linux.orig/include/linux/ftrace_event.h
+++ linux/include/linux/ftrace_event.h
@@ -87,8 +87,6 @@ struct trace_event_functions {
};

struct trace_event {
- struct hlist_node node;
- struct list_head list;
int type;
struct trace_event_functions *funcs;
};
@@ -194,7 +192,6 @@ struct ftrace_event_call {

#ifdef CONFIG_PERF_EVENTS
int perf_refcount;
- struct hlist_head __percpu *perf_events;
#endif
};

@@ -263,8 +260,9 @@ struct perf_event;

DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);

-extern int perf_trace_init(struct perf_event *event);
+extern int perf_trace_init(struct perf_event *event, int event_id);
extern void perf_trace_destroy(struct perf_event *event);
+extern void perf_trace_destroy_id(int id);
extern int perf_trace_add(struct perf_event *event, int flags);
extern void perf_trace_del(struct perf_event *event, int flags);
extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -275,9 +273,9 @@ extern void *perf_trace_buf_prepare(int

static inline void
perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
- u64 count, struct pt_regs *regs, void *head)
+ u64 count, struct pt_regs *regs, int id)
{
- perf_tp_event(addr, count, raw_data, size, regs, head, rctx);
+ perf_tp_event(addr, count, raw_data, size, regs, rctx, id);
}
#endif

Index: linux/include/linux/perf_event.h
===================================================================
--- linux.orig/include/linux/perf_event.h
+++ linux/include/linux/perf_event.h
@@ -247,6 +247,7 @@ struct perf_event_attr {
#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64)
#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5)
#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *)
+#define PERF_EVENT_IOC_ADD_TP _IO ('$', 7)

enum perf_event_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
@@ -568,6 +569,11 @@ struct hw_perf_event {
struct task_struct *bp_target;
};
#endif
+ /*
+ * Same fudge as for breakpoints; trace-events need
+ * it too... convert the bp crap over.
+ */
+ struct task_struct *event_target;
};
int state;
local64_t prev_count;
@@ -859,6 +865,7 @@ struct perf_event {
#ifdef CONFIG_EVENT_TRACING
struct ftrace_event_call *tp_event;
struct event_filter *filter;
+ struct perf_tp_idr tp_idr;
#endif

#ifdef CONFIG_CGROUP_PERF
@@ -1133,7 +1140,7 @@ static inline bool perf_paranoid_kernel(
extern void perf_event_init(void);
extern void perf_tp_event(u64 addr, u64 count, void *record,
int entry_size, struct pt_regs *regs,
- struct hlist_head *head, int rctx);
+ int rctx, int id);
extern void perf_bp_event(struct perf_event *event, void *data);

#ifndef perf_misc_flags
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -82,6 +82,7 @@ struct sched_param {
#include <linux/rculist.h>
#include <linux/rtmutex.h>

+#include <linux/idr.h>
#include <linux/time.h>
#include <linux/param.h>
#include <linux/resource.h>
@@ -1199,6 +1200,11 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};

+struct perf_tp_idr {
+ struct mutex lock;
+ struct idr idr;
+};
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1485,6 +1491,9 @@ struct task_struct {
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
+#ifdef CONFIG_EVENT_TRACING
+ struct perf_tp_idr *perf_tp_idr;
+#endif
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy; /* Protected by alloc_lock */
Index: linux/include/trace/ftrace.h
===================================================================
--- linux.orig/include/trace/ftrace.h
+++ linux/include/trace/ftrace.h
@@ -708,7 +708,6 @@ perf_trace_##call(void *__data, proto)
struct ftrace_raw_##call *entry; \
struct pt_regs __regs; \
u64 __addr = 0, __count = 1; \
- struct hlist_head *head; \
int __entry_size; \
int __data_size; \
int rctx; \
@@ -733,9 +732,8 @@ perf_trace_##call(void *__data, proto)
\
{ assign; } \
\
- head = this_cpu_ptr(event_call->perf_events); \
perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
- __count, &__regs, head); \
+ __count, &__regs, event_call->event.type); \
}

/*
Index: linux/kernel/events/core.c
===================================================================
--- linux.orig/kernel/events/core.c
+++ linux/kernel/events/core.c
@@ -823,6 +823,7 @@ list_add_event(struct perf_event *event,
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ ++ctx->generation;
}

/*
@@ -976,6 +977,7 @@ list_del_event(struct perf_event *event,
*/
if (event->state > PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_OFF;
+ ++ctx->generation;
}

static void perf_group_detach(struct perf_event *event)
@@ -1894,6 +1896,12 @@ static void perf_event_context_sched_out
if (!cpuctx->task_ctx)
return;

+#if 0
+ /*
+ * Need to sort out how to make task_struct::perf_tp_idr
+ * work with this fancy switching stuff.. tracepoints could be
+ * in multiple contexts due to the software event muck.
+ */
rcu_read_lock();
parent = rcu_dereference(ctx->parent_ctx);
next_ctx = next->perf_event_ctxp[ctxn];
@@ -1927,6 +1935,7 @@ static void perf_event_context_sched_out
raw_spin_unlock(&ctx->lock);
}
rcu_read_unlock();
+#endif

if (do_switch) {
ctx_sched_out(ctx, cpuctx, EVENT_ALL);
@@ -3261,6 +3270,7 @@ static struct perf_event *perf_fget_ligh
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_add_tp(struct perf_event *event, int tp_id);

static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
@@ -3307,6 +3317,9 @@ static long perf_ioctl(struct file *file
case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);

+ case PERF_EVENT_IOC_ADD_TP:
+ return perf_event_add_tp(event, arg);
+
default:
return -ENOTTY;
}
@@ -5471,6 +5484,9 @@ static struct pmu perf_swevent = {

#ifdef CONFIG_EVENT_TRACING

+#include <linux/ftrace_event.h>
+#include "../trace/trace_output.h"
+
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
@@ -5485,8 +5501,9 @@ static int perf_tp_event_match(struct pe
struct perf_sample_data *data,
struct pt_regs *regs)
{
- if (event->hw.state & PERF_HES_STOPPED)
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
return 0;
+
/*
* All tracepoints are from kernel-space.
*/
@@ -5499,8 +5516,60 @@ static int perf_tp_event_match(struct pe
return 1;
}

+static void perf_tp_idr_init(struct perf_tp_idr *idr)
+{
+ idr_init(&idr->idr);
+ mutex_init(&idr->lock);
+}
+
+static DEFINE_PER_CPU(struct perf_tp_idr, perf_tp_idr);
+
+struct perf_tp_node {
+ struct list_head list;
+ struct perf_event *event;
+ struct rcu_head rcu;
+};
+
+static void do_perf_tp_event(struct perf_event *event, u64 count,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ if (perf_tp_event_match(event, data, regs))
+ perf_swevent_event(event, count, 1, data, regs);
+}
+
+static void perf_tp_idr_event(struct perf_tp_idr *tp_idr,
+ int id, u64 count,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct perf_tp_node *tp_node, *node;
+ struct perf_event *event;
+
+ if (!tp_idr)
+ return;
+
+ /*
+ * Most of this is done under rcu_read_lock_sched(), which doesn't
+ * exclude regular RCU grace periods, but the IDR code uses call_rcu()
+ * so we have to use rcu_read_lock() here as well.
+ */
+ rcu_read_lock();
+ tp_node = idr_find(&tp_idr->idr, id);
+ rcu_read_unlock();
+
+ if (!tp_node)
+ return;
+
+ event = tp_node->event;
+
+ do_perf_tp_event(event, count, data, regs);
+ list_for_each_entry_rcu(node, &tp_node->list, list)
+ do_perf_tp_event(node->event, count, data, regs);
+}
+
void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
- struct pt_regs *regs, struct hlist_head *head, int rctx)
+ struct pt_regs *regs, int rctx, int id)
{
struct perf_sample_data data;
struct perf_event *event;
@@ -5514,18 +5583,197 @@ void perf_tp_event(u64 addr, u64 count,
perf_sample_data_init(&data, addr);
data.raw = &raw;

- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, 1, &data, regs);
- }
+ perf_tp_idr_event(&__get_cpu_var(perf_tp_idr), id, count, &data, regs);
+ perf_tp_idr_event(current->perf_tp_idr, id, count, &data, regs);

perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);

+static struct perf_tp_idr *
+perf_tp_init_task(struct perf_event *event, struct task_struct *task)
+{
+ struct perf_tp_idr *idr;
+
+ mutex_lock(&task->perf_event_mutex);
+ idr = task->perf_tp_idr;
+ if (idr)
+ goto unlock;
+
+ idr = kzalloc(sizeof(struct perf_tp_idr), GFP_KERNEL);
+ if (!idr)
+ goto unlock;
+
+ perf_tp_idr_init(idr);
+
+ task->perf_tp_idr = idr;
+unlock:
+ mutex_unlock(&task->perf_event_mutex);
+
+ return idr;
+}
+
+static struct perf_tp_idr *perf_event_idr(struct perf_event *event, bool create)
+{
+ struct perf_tp_idr *tp_idr;
+ struct task_struct *task;
+
+ if (event->attach_state & PERF_ATTACH_TASK) {
+ task = event->hw.event_target;
+ tp_idr = task->perf_tp_idr;
+ if (!tp_idr && create)
+ tp_idr = perf_tp_init_task(event, task);
+ } else
+ tp_idr = &per_cpu(perf_tp_idr, event->cpu);
+
+ return tp_idr;
+}
+
+static void perf_tp_free_node(struct rcu_head *rcu)
+{
+ struct perf_tp_node *node = container_of(rcu, struct perf_tp_node, rcu);
+
+ kfree(node);
+}
+
+static int perf_tp_remove_idr(int id, void *p, void *data)
+{
+ struct perf_tp_node *node = p;
+ struct perf_tp_node *first, *next;
+ struct perf_tp_idr *tp_idr = data;
+
+ if (!tp_idr)
+ goto no_idr;
+
+ mutex_lock(&tp_idr->lock);
+ first = idr_find(&tp_idr->idr, id);
+ if (first == node) {
+ next = list_first_entry(&first->list, struct perf_tp_node, list);
+ if (next != first)
+ idr_replace(&tp_idr->idr, next, id);
+ else
+ idr_remove(&tp_idr->idr, id);
+ }
+ list_del_rcu(&node->list);
+ mutex_unlock(&tp_idr->lock);
+
+no_idr:
+ perf_trace_destroy_id(id);
+ call_rcu_sched(&node->rcu, perf_tp_free_node);
+ return 0;
+}
+
static void tp_perf_event_destroy(struct perf_event *event)
{
- perf_trace_destroy(event);
+ /*
+ * Since this is the free path, the fd is gone and there
+ * can be no concurrency on event->tp_idr.
+ */
+
+ idr_for_each(&event->tp_idr.idr, perf_tp_remove_idr,
+ perf_event_idr(event, false));
+
+ idr_remove_all(&event->tp_idr.idr);
+ idr_destroy(&event->tp_idr.idr);
+}
+
+static int __perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+ struct perf_tp_node *node, *first;
+ struct perf_tp_idr *idr;
+ int tmp_id, err, ret = -ENOMEM;
+
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ goto out;
+
+ node->event = event;
+ INIT_LIST_HEAD(&node->list);
+
+ /*
+ * Insert the node into event->tp_idr. This idr tracks the
+ * tracepoints we're interested in; it has a 1:1 relation
+ * with the node.
+ */
+ idr = &event->tp_idr;
+ mutex_lock(&idr->lock);
+ err = idr_pre_get(&idr->idr, GFP_KERNEL);
+ if (!err) {
+ ret = -ENOMEM;
+ goto free_node;
+ }
+
+ ret = idr_get_new_above(&idr->idr, node, tp_id, &tmp_id);
+ if (ret)
+ goto free_node;
+
+ if (WARN_ON(tp_id != tmp_id)) {
+ printk(KERN_ERR "fail: %d %d\n" , tp_id, tmp_id);
+ ret = -EBUSY;
+ goto free_idr1;
+ }
+ mutex_unlock(&idr->lock);
+
+ /*
+ * Insert the node into the task/cpu idr. This idr tracks
+ * all active tracepoints for the task/cpu; it has a 1:n relation
+ * with the node.
+ */
+ idr = perf_event_idr(event, true);
+ if (!idr) {
+ if (event->attach_state & PERF_ATTACH_CONTEXT)
+ ret = -ENOMEM;
+ else
+ ret = -ESRCH;
+ goto free_idr1_set;
+ }
+ mutex_lock(&idr->lock);
+ first = idr_find(&idr->idr, tp_id);
+ if (first) {
+ list_add_rcu(&node->list, &first->list);
+ goto unlock;
+ }
+
+ err = idr_pre_get(&idr->idr, GFP_KERNEL);
+ if (!err) {
+ ret = -ENOMEM;
+ goto free_idr1_set_unlock;
+ }
+
+ ret = idr_get_new_above(&idr->idr, node, tp_id, &tmp_id);
+ if (ret)
+ goto free_idr1_set;
+
+ if (WARN_ON(tp_id != tmp_id)) {
+ ret = -EBUSY;
+ goto free_idr2;
+ }
+unlock:
+ mutex_unlock(&idr->lock);
+
+ ret = perf_trace_init(event, tp_id);
+ if (ret)
+ goto free_all;
+
+out:
+ return ret;
+
+free_all:
+ mutex_lock(&idr->lock);
+free_idr2:
+ idr_remove(&idr->idr, tmp_id);
+free_idr1_set_unlock:
+ mutex_unlock(&idr->lock);
+free_idr1_set:
+ idr = &event->tp_idr;
+ tmp_id = tp_id;
+ mutex_lock(&idr->lock);
+free_idr1:
+ idr_remove(&idr->idr, tmp_id);
+free_node:
+ mutex_unlock(&idr->lock);
+ kfree(node);
+ goto out;
}

static int perf_tp_event_init(struct perf_event *event)
@@ -5535,21 +5783,35 @@ static int perf_tp_event_init(struct per
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;

- err = perf_trace_init(event);
- if (err)
- return err;
+ perf_tp_idr_init(&event->tp_idr);

event->destroy = tp_perf_event_destroy;

+ if (event->attr.config != ~0ULL) {
+ err = __perf_event_add_tp(event, event->attr.config);
+ if (err)
+ return err;
+ }
+
return 0;
}

+static int perf_tp_event_add(struct perf_event *event, int flags)
+{
+ event->hw.state = flags & PERF_EF_START ? 0 : PERF_HES_STOPPED;
+ return 0;
+}
+
+static void perf_tp_event_del(struct perf_event *event, int flags)
+{
+}
+
static struct pmu perf_tracepoint = {
.task_ctx_nr = perf_sw_context,

.event_init = perf_tp_event_init,
- .add = perf_trace_add,
- .del = perf_trace_del,
+ .add = perf_tp_event_add,
+ .del = perf_tp_event_del,
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
@@ -5557,6 +5819,11 @@ static struct pmu perf_tracepoint = {

static inline void perf_tp_register(void)
{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ perf_tp_idr_init(&per_cpu(perf_tp_idr, cpu));
+
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}

@@ -5565,7 +5832,8 @@ static int perf_event_set_filter(struct
char *filter_str;
int ret;

- if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ if (event->attr.type != PERF_TYPE_TRACEPOINT ||
+ event->attr.config == ~0ULL)
return -EINVAL;

filter_str = strndup_user(arg, PAGE_SIZE);
@@ -5583,6 +5851,74 @@ static void perf_event_free_filter(struc
ftrace_profile_free_filter(event);
}

+static int perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+ if (event->attr.type != PERF_TYPE_TRACEPOINT &&
+ event->attr.config != ~0ULL)
+ return -EINVAL;
+
+ return __perf_event_add_tp(event, tp_id);
+}
+
+/*
+ * Called from the exit path, _after_ all events have been detached from it.
+ */
+static void perf_tp_event_exit(struct task_struct *tsk)
+{
+ struct perf_tp_idr *idr = tsk->perf_tp_idr;
+
+ if (!idr)
+ return;
+
+ idr_remove_all(&idr->idr);
+ idr_destroy(&idr->idr);
+}
+
+static void perf_tp_event_delayed_put(struct task_struct *tsk)
+{
+ struct perf_tp_idr *idr = tsk->perf_tp_idr;
+
+ tsk->perf_tp_idr = NULL;
+ kfree(idr);
+}
+
+static int perf_tp_inherit_idr(int id, void *p, void *data)
+{
+ struct perf_event *child = data;
+
+ return __perf_event_add_tp(child, id);
+}
+
+static int perf_tp_event_inherit(struct perf_event *parent_event,
+ struct perf_event *child_event)
+{
+ int ret;
+
+ if (parent_event->attr.type != PERF_TYPE_TRACEPOINT ||
+ parent_event->attr.config != ~0ULL)
+ return 0;
+
+ /*
+ * The child is not yet exposed, hence no need to serialize things
+ * on that side.
+ */
+ mutex_lock(&parent_event->tp_idr.lock);
+ ret = idr_for_each(&parent_event->tp_idr.idr,
+ perf_tp_inherit_idr,
+ child_event);
+ mutex_unlock(&parent_event->tp_idr.lock);
+
+ return ret;
+}
+
+static void perf_tp_event_init_task(struct task_struct *child)
+{
+ /*
+ * Clear the idr pointer copied from the parent.
+ */
+ child->perf_tp_idr = NULL;
+}
+
#else

static inline void perf_tp_register(void)
@@ -5598,6 +5934,29 @@ static void perf_event_free_filter(struc
{
}

+static int perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+ return -ENOENT;
+}
+
+static void perf_tp_event_exit(struct task_struct *tsk)
+{
+}
+
+static void perf_tp_event_delayed_put(struct task_struct *tsk)
+{
+}
+
+static int perf_tp_event_inherit(struct perf_event *parent_event,
+ struct perf_event *child_event)
+{
+ return 0;
+}
+
+static void perf_tp_event_init_task()(struct task_struct *child)
+{
+}
+
#endif /* CONFIG_EVENT_TRACING */

#ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -6173,6 +6532,9 @@ perf_event_alloc(struct perf_event_attr
INIT_LIST_HEAD(&event->sibling_list);
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending, perf_pending_event);
+#ifdef CONFIG_EVENT_TRACING
+ perf_tp_idr_init(&event->tp_idr);
+#endif

mutex_init(&event->mmap_mutex);

@@ -6191,6 +6553,7 @@ perf_event_alloc(struct perf_event_attr

if (task) {
event->attach_state = PERF_ATTACH_TASK;
+ event->hw.event_target = task;
#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
* hw_breakpoint is a bit difficult here..
@@ -6236,7 +6599,7 @@ done:
if (err) {
if (event->ns)
put_pid_ns(event->ns);
- kfree(event);
+ free_event(event);
return ERR_PTR(err);
}

@@ -6604,7 +6967,6 @@ SYSCALL_DEFINE5(perf_event_open,
}

perf_install_in_context(ctx, event, cpu);
- ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);

@@ -6681,7 +7043,6 @@ perf_event_create_kernel_counter(struct
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
- ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);

@@ -6858,6 +7219,8 @@ void perf_event_exit_task(struct task_st

for_each_task_context_nr(ctxn)
perf_event_exit_task_context(child, ctxn);
+
+ perf_tp_event_exit(child);
}

static void perf_free_event(struct perf_event *event,
@@ -6920,6 +7283,8 @@ void perf_event_delayed_put(struct task_

for_each_task_context_nr(ctxn)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+
+ perf_tp_event_delayed_put(task);
}

/*
@@ -6935,6 +7300,7 @@ inherit_event(struct perf_event *parent_
{
struct perf_event *child_event;
unsigned long flags;
+ int ret;

/*
* Instead of creating recursive hierarchies of events,
@@ -6952,6 +7318,13 @@ inherit_event(struct perf_event *parent_
NULL);
if (IS_ERR(child_event))
return child_event;
+
+ ret = perf_tp_event_inherit(parent_event, child_event);
+ if (ret) {
+ free_event(child_event);
+ return ERR_PTR(ret);
+ }
+
get_ctx(child_ctx);

/*
@@ -7177,6 +7550,8 @@ int perf_event_init_task(struct task_str
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);

+ perf_tp_event_init_task(child);
+
for_each_task_context_nr(ctxn) {
ret = perf_event_init_context(child, ctxn);
if (ret)
Index: linux/kernel/trace/trace_event_perf.c
===================================================================
--- linux.orig/kernel/trace/trace_event_perf.c
+++ linux/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"
+#include "trace_output.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

@@ -47,9 +48,7 @@ static int perf_trace_event_perm(struct
static int perf_trace_event_init(struct ftrace_event_call *tp_event,
struct perf_event *p_event)
{
- struct hlist_head __percpu *list;
int ret;
- int cpu;

ret = perf_trace_event_perm(tp_event, p_event);
if (ret)
@@ -61,15 +60,6 @@ static int perf_trace_event_init(struct

ret = -ENOMEM;

- list = alloc_percpu(struct hlist_head);
- if (!list)
- goto fail;
-
- for_each_possible_cpu(cpu)
- INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
-
- tp_event->perf_events = list;
-
if (!total_ref_count) {
char __percpu *buf;
int i;
@@ -100,63 +90,40 @@ fail:
}
}

- if (!--tp_event->perf_refcount) {
- free_percpu(tp_event->perf_events);
- tp_event->perf_events = NULL;
- }
+ --tp_event->perf_refcount;

return ret;
}

-int perf_trace_init(struct perf_event *p_event)
+int perf_trace_init(struct perf_event *p_event, int event_id)
{
struct ftrace_event_call *tp_event;
- int event_id = p_event->attr.config;
+ struct trace_event *t_event;
int ret = -EINVAL;

+ trace_event_read_lock();
+ t_event = ftrace_find_event(event_id);
+ if (!t_event)
+ goto out;
+
+ tp_event = container_of(t_event, struct ftrace_event_call, event);
+
mutex_lock(&event_mutex);
- list_for_each_entry(tp_event, &ftrace_events, list) {
- if (tp_event->event.type == event_id &&
- tp_event->class && tp_event->class->reg &&
- try_module_get(tp_event->mod)) {
- ret = perf_trace_event_init(tp_event, p_event);
- if (ret)
- module_put(tp_event->mod);
- break;
- }
+ if (tp_event->class && tp_event->class->reg &&
+ try_module_get(tp_event->mod)) {
+ ret = perf_trace_event_init(tp_event, p_event);
+ if (ret)
+ module_put(tp_event->mod);
}
mutex_unlock(&event_mutex);
+out:
+ trace_event_read_unlock();

return ret;
}

-int perf_trace_add(struct perf_event *p_event, int flags)
-{
- struct ftrace_event_call *tp_event = p_event->tp_event;
- struct hlist_head __percpu *pcpu_list;
- struct hlist_head *list;
-
- pcpu_list = tp_event->perf_events;
- if (WARN_ON_ONCE(!pcpu_list))
- return -EINVAL;
-
- if (!(flags & PERF_EF_START))
- p_event->hw.state = PERF_HES_STOPPED;
-
- list = this_cpu_ptr(pcpu_list);
- hlist_add_head_rcu(&p_event->hlist_entry, list);
-
- return 0;
-}
-
-void perf_trace_del(struct perf_event *p_event, int flags)
-{
- hlist_del_rcu(&p_event->hlist_entry);
-}
-
-void perf_trace_destroy(struct perf_event *p_event)
+static void __perf_trace_destroy(struct ftrace_event_call *tp_event)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
int i;

mutex_lock(&event_mutex);
@@ -171,9 +138,6 @@ void perf_trace_destroy(struct perf_even
*/
tracepoint_synchronize_unregister();

- free_percpu(tp_event->perf_events);
- tp_event->perf_events = NULL;
-
if (!--total_ref_count) {
for (i = 0; i < PERF_NR_CONTEXTS; i++) {
free_percpu(perf_trace_buf[i]);
@@ -185,6 +149,27 @@ out:
mutex_unlock(&event_mutex);
}

+void perf_trace_destroy(struct perf_event *p_event)
+{
+ __perf_trace_destroy(p_event->tp_event);
+}
+
+void perf_trace_destroy_id(int event_id)
+{
+ struct ftrace_event_call *tp_event;
+ struct trace_event *t_event;
+
+ trace_event_read_lock();
+ t_event = ftrace_find_event(event_id);
+ if (!t_event)
+ goto unlock;
+
+ tp_event = container_of(t_event, struct ftrace_event_call, event);
+ __perf_trace_destroy(tp_event);
+unlock:
+ trace_event_read_unlock();
+}
+
__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
struct pt_regs *regs, int *rctxp)
{
Index: linux/kernel/trace/trace_kprobe.c
===================================================================
--- linux.orig/kernel/trace/trace_kprobe.c
+++ linux/kernel/trace/trace_kprobe.c
@@ -1659,7 +1659,6 @@ static __kprobes void kprobe_perf_func(s
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct ftrace_event_call *call = &tp->call;
struct kprobe_trace_entry_head *entry;
- struct hlist_head *head;
int size, __size, dsize;
int rctx;

@@ -1679,8 +1678,8 @@ static __kprobes void kprobe_perf_func(s
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);

- head = this_cpu_ptr(call->perf_events);
- perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+ perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs,
+ call->event.type);
}

/* Kretprobe profile handler */
@@ -1690,7 +1689,6 @@ static __kprobes void kretprobe_perf_fun
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct ftrace_event_call *call = &tp->call;
struct kretprobe_trace_entry_head *entry;
- struct hlist_head *head;
int size, __size, dsize;
int rctx;

@@ -1710,8 +1708,8 @@ static __kprobes void kretprobe_perf_fun
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);

- head = this_cpu_ptr(call->perf_events);
- perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
+ perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
+ regs, call->event.type);
}

static int probe_perf_enable(struct ftrace_event_call *call)
Index: linux/kernel/trace/trace_output.c
===================================================================
--- linux.orig/kernel/trace/trace_output.c
+++ linux/kernel/trace/trace_output.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
+#include <linux/idr.h>

#include "trace_output.h"

@@ -16,9 +17,9 @@

DECLARE_RWSEM(trace_event_mutex);

-static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+static const int first_event_type = __TRACE_LAST_TYPE + 1;

-static int next_event_type = __TRACE_LAST_TYPE + 1;
+static DEFINE_IDR(trace_type_idr);

int trace_print_seq(struct seq_file *m, struct trace_seq *s)
{
@@ -664,58 +665,43 @@ static int task_state_char(unsigned long
*/
struct trace_event *ftrace_find_event(int type)
{
- struct trace_event *event;
- struct hlist_node *n;
- unsigned key;
-
- key = type & (EVENT_HASHSIZE - 1);
-
- hlist_for_each_entry(event, n, &event_hash[key], node) {
- if (event->type == type)
- return event;
- }
-
- return NULL;
+ return idr_find(&trace_type_idr, type);
}

-static LIST_HEAD(ftrace_event_list);
+void trace_event_read_lock(void)
+{
+ down_read(&trace_event_mutex);
+}

-static int trace_search_list(struct list_head **list)
+void trace_event_read_unlock(void)
{
- struct trace_event *e;
- int last = __TRACE_LAST_TYPE;
+ up_read(&trace_event_mutex);
+}

- if (list_empty(&ftrace_event_list)) {
- *list = &ftrace_event_list;
- return last + 1;
- }
+static int register_event(struct trace_event *event, int id, bool strict)
+{
+ int ret, type;

- /*
- * We used up all possible max events,
- * lets see if somebody freed one.
- */
- list_for_each_entry(e, &ftrace_event_list, list) {
- if (e->type != last + 1)
- break;
- last++;
- }
+ ret = idr_pre_get(&trace_type_idr, GFP_KERNEL);
+ if (!ret)
+ return 0;

- /* Did we used up all 65 thousand events??? */
- if ((last + 1) > FTRACE_MAX_EVENT)
+ ret = idr_get_new_above(&trace_type_idr, event, id, &type);
+ if (ret)
return 0;

- *list = &e->list;
- return last + 1;
-}
+ if (strict && id != type) {
+ idr_remove(&trace_type_idr, type);
+ return 0;
+ }

-void trace_event_read_lock(void)
-{
- down_read(&trace_event_mutex);
-}
+ if (type > FTRACE_MAX_EVENT) {
+ idr_remove(&trace_type_idr, type);
+ return 0;
+ }

-void trace_event_read_unlock(void)
-{
- up_read(&trace_event_mutex);
+ event->type = type;
+ return type;
}

/**
@@ -735,7 +721,6 @@ void trace_event_read_unlock(void)
*/
int register_ftrace_event(struct trace_event *event)
{
- unsigned key;
int ret = 0;

down_write(&trace_event_mutex);
@@ -746,35 +731,18 @@ int register_ftrace_event(struct trace_e
if (WARN_ON(!event->funcs))
goto out;

- INIT_LIST_HEAD(&event->list);
-
if (!event->type) {
- struct list_head *list = NULL;
-
- if (next_event_type > FTRACE_MAX_EVENT) {
-
- event->type = trace_search_list(&list);
- if (!event->type)
- goto out;
-
- } else {
-
- event->type = next_event_type++;
- list = &ftrace_event_list;
- }
-
- if (WARN_ON(ftrace_find_event(event->type)))
+ ret = register_event(event, first_event_type, false);
+ if (!ret)
goto out;
-
- list_add_tail(&event->list, list);
-
- } else if (event->type > __TRACE_LAST_TYPE) {
- printk(KERN_WARNING "Need to add type to trace.h\n");
- WARN_ON(1);
- goto out;
} else {
- /* Is this event already used */
- if (ftrace_find_event(event->type))
+ if (event->type > __TRACE_LAST_TYPE) {
+ printk(KERN_WARNING "Need to add type to trace.h\n");
+ WARN_ON(1);
+ goto out;
+ }
+ ret = register_event(event, event->type, true);
+ if (!ret)
goto out;
}

@@ -787,11 +755,6 @@ int register_ftrace_event(struct trace_e
if (event->funcs->binary == NULL)
event->funcs->binary = trace_nop_print;

- key = event->type & (EVENT_HASHSIZE - 1);
-
- hlist_add_head(&event->node, &event_hash[key]);
-
- ret = event->type;
out:
up_write(&trace_event_mutex);

@@ -804,8 +767,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event)
*/
int __unregister_ftrace_event(struct trace_event *event)
{
- hlist_del(&event->node);
- list_del(&event->list);
+ idr_remove(&trace_type_idr, event->type);
return 0;
}

Index: linux/kernel/trace/trace_syscalls.c
===================================================================
--- linux.orig/kernel/trace/trace_syscalls.c
+++ linux/kernel/trace/trace_syscalls.c
@@ -499,7 +499,6 @@ static void perf_syscall_enter(void *ign
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
- struct hlist_head *head;
int syscall_nr;
int rctx;
int size;
@@ -530,8 +529,7 @@ static void perf_syscall_enter(void *ign
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);

- head = this_cpu_ptr(sys_data->enter_event->perf_events);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, rec->ent.type);
}

int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -573,7 +571,6 @@ static void perf_syscall_exit(void *igno
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
- struct hlist_head *head;
int syscall_nr;
int rctx;
int size;
@@ -606,8 +603,7 @@ static void perf_syscall_exit(void *igno
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);

- head = this_cpu_ptr(sys_data->exit_event->perf_events);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, rec->ent.type);
}

int perf_sysexit_enable(struct ftrace_event_call *call)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/