[RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v2)

From: Stephane Eranian
Date: Wed Sep 08 2010 - 09:46:28 EST


This kernel patch adds the ability to filter monitoring based on
container groups (cgroups). This is for use in per-cpu mode only.

The patch adds perf_event_attr.cgroup, a boolean, to activate
this new mode. The cgroup is designated by passing in
perf_event_attr.cgroup_fd, an opened file descriptor to
the <mnt>/<cgroup>/perf_event.perf file.

This is the second version of this patch. It corrects the way
time_enabled is accounted for. In cgroup mode, time_enabled reflects
the time the cgroup was active, i.e., the time during which threads
from the cgroup executed on the monitored CPU. This is a more useful
metric than plain wall-clock time. The meaning of time_enabled without
cgroup is unaffected.

Signed-off-by: Stephane Eranian <eranian@xxxxxxxxxx>

--

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3cb7d04..ed76357 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -618,6 +618,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
unsigned short css_id(struct cgroup_subsys_state *css);
unsigned short css_depth(struct cgroup_subsys_state *css);

+struct cgroup_subsys_state *cgroup_css_from_file(struct file *f, int id);
+
#else /* !CONFIG_CGROUPS */

static inline int cgroup_init_early(void) { return 0; }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ccefff0..93f86b7 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,4 +65,8 @@ SUBSYS(net_cls)
SUBSYS(blkio)
#endif

+#ifdef CONFIG_PERF_EVENTS
+SUBSYS(perf)
+#endif
+
/* */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 000610c..ba43996 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -215,8 +215,9 @@ struct perf_event_attr {
*/
precise_ip : 2, /* skid constraint */
mmap_data : 1, /* non-exec mmap data */
+ cgroup : 1, /* cgroup aggregation */

- __reserved_1 : 46;
+ __reserved_1 : 45;

union {
__u32 wakeup_events; /* wakeup every n events */
@@ -226,6 +227,8 @@ struct perf_event_attr {
__u32 bp_type;
__u64 bp_addr;
__u64 bp_len;
+
+ int cgroup_fd;
};

/*
@@ -463,6 +466,7 @@ enum perf_callchain_context {
#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
# include <asm/local64.h>
+# include <linux/cgroup.h>
#endif

struct perf_guest_info_callbacks {
@@ -657,6 +661,16 @@ struct swevent_hlist {
#define PERF_ATTACH_CONTEXT 0x01
#define PERF_ATTACH_GROUP 0x02

+#ifdef CONFIG_CGROUPS
+struct perf_cgroup {
+ struct cgroup_subsys_state css; /* embedded css; container_of() maps it back to perf_cgroup */
+ struct {
+ u64 time; /* time accumulated by update_css_time() */
+ u64 timestamp; /* perf_clock() value at the last update or cgroup switch-in */
+ } times[NR_CPUS] ____cacheline_aligned_in_smp; /* indexed by CPU number */
+};
+#endif
+
/**
* struct perf_event - performance event kernel representation:
*/
@@ -759,7 +773,9 @@ struct perf_event {
struct ftrace_event_call *tp_event;
struct event_filter *filter;
#endif
-
+#ifdef CONFIG_CGROUPS
+ struct perf_cgroup *css;
+#endif
#endif /* CONFIG_PERF_EVENTS */
};

@@ -806,6 +822,8 @@ struct perf_event_context {
u64 generation;
int pin_count;
struct rcu_head rcu_head;
+
+ int nr_cgroups;
};

/*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e5c5497..3e56354 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4722,6 +4722,23 @@ css_get_next(struct cgroup_subsys *ss, int id,
return ret;
}

+/* Map a cgroupfs file to its css for subsystem @id; ERR_PTR() on failure. */
+struct cgroup_subsys_state *cgroup_css_from_file(struct file *f, int id)
+{
+	struct cgroup_subsys_state *css;
+
+	/* only files that live in a cgroup filesystem are acceptable */
+	if (f->f_op != &cgroup_seqfile_operations)
+		return ERR_PTR(-EBADF);
+	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+		return ERR_PTR(-EINVAL);
+
+	/* the cgroup is looked up from the file's parent directory */
+	css = __d_cgrp(f->f_dentry->d_parent)->subsys[id];
+	/* no css here (subsystem not part of this hierarchy?): not a match */
+	return css ? css : ERR_PTR(-ENOENT);
+}
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
struct cgroup *cont)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 4b84e63..9c5d1f9 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -35,6 +35,7 @@

#include <asm/irq_regs.h>

+#define PERF_TSTAMP_ENABLE_INVALID (~0ULL) /* invalid u64 timestamp marker, cannot be zero */
/*
* Each CPU has a list of per CPU events:
*/
@@ -49,6 +50,84 @@ static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

+#ifdef CONFIG_CGROUPS
+
+/* Return the perf cgroup @task currently belongs to, or NULL without a task. */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return task == NULL ? NULL :
+		container_of(task_subsys_state(task, perf_subsys_id),
+			     struct perf_cgroup, css);
+}
+
+static inline struct perf_cgroup *perf_cgroup_from_cont(struct cgroup *cont)
+{
+	struct cgroup_subsys_state *css;	/* embedded in struct perf_cgroup */
+	css = cgroup_subsys_state(cont, perf_subsys_id);
+	return container_of(css, struct perf_cgroup, css);
+}
+
+/* An event without a cgroup matches any task, otherwise only its cgroup's. */
+static inline bool
+perf_cgroup_match(struct perf_event *event, struct task_struct *task)
+{
+	return event->css == NULL || event->css == perf_cgroup_from_task(task);
+}
+
+/* Resolve @fd to a css of the perf subsystem and take a reference on it. */
+static void *perf_get_cgroup(int fd)
+{
+	struct cgroup_subsys_state *css;
+	struct file *file;
+	int fput_needed;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return ERR_PTR(-EBADF);
+	css = cgroup_css_from_file(file, perf_subsys_id);
+	if (!css)	/* a NULL css must never reach css_get() or the caller */
+		css = ERR_PTR(-ENOENT);
+	else if (!IS_ERR(css))
+		css_get(css);
+	fput_light(file, fput_needed);
+	return css;	/* referenced css on success, ERR_PTR() otherwise */
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+ if (event->css) /* events without a cgroup hold no css reference */
+ css_put(&event->css->css);
+}
+#else /* !CONFIG_CGROUPS */
+static inline bool
+perf_cgroup_match(struct perf_event *event, struct task_struct *task)
+{
+ return true; /* built without CONFIG_CGROUPS: no cgroup filter, always match */
+}
+
+static inline void *perf_get_cgroup(int fd)
+{
+	return ERR_PTR(-EOPNOTSUPP);	/* no cgroups; ENOTSUPP is kernel-internal */
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{} /* built without CONFIG_CGROUPS: no css reference to drop */
+
+#endif
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return event->css != NULL; /* NOTE(review): ->css needs CONFIG_CGROUPS */
+}
+
+/* Tell whether the current task belongs to @event's perf cgroup. */
+static inline int is_css_current(struct perf_event *event)
+{
+	/* NOTE(review): perf_cgroup_from_task() needs CONFIG_CGROUPS */
+	return perf_cgroup_from_task(current) == event->css;
+}
+
/*
* perf event paranoia level:
* -1 - not paranoid at all
@@ -228,29 +307,60 @@ static void update_context_time(struct perf_event_context *ctx)
ctx->timestamp = now;
}

+static void update_css_time(struct perf_cgroup *css)
+{
+	int cpu = smp_processor_id();
+	u64 now;
+
+	if (css) {
+		/* charge the time since the last stamp to this CPU's slot */
+		now = perf_clock();
+		css->times[cpu].time += now - css->times[cpu].timestamp;
+		css->times[cpu].timestamp = now;
+	}
+}
+
+static u64 get_event_time(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+
+	/* non-cgroup events keep using the context clock (0 if no context) */
+	if (!is_cgroup_event(event))
+		return ctx ? ctx->time : 0;
+	/* cgroup time is kept per CPU, so the event must be bound to one */
+	if (event->cpu == -1) {
+		WARN_ON(event->cpu != smp_processor_id());
+		return 0;
+	}
+	return event->css->times[event->cpu].time;
+}
+
/*
* Update the total_time_enabled and total_time_running fields for a event.
*/
static void update_event_times(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- u64 run_end;
+ u64 run_end, run_start;
+ int cpu = smp_processor_id();

if (event->state < PERF_EVENT_STATE_INACTIVE ||
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;

- if (ctx->is_active)
- run_end = ctx->time;
- else
- run_end = event->tstamp_stopped;
+ run_end = get_event_time(event);
+ run_start = event->tstamp_enabled;
+
+ /*
+ * that means the cgroup never got scheduled in
+ * so ensure total_time_enabled is zero
+ */
+ if (run_start == PERF_TSTAMP_ENABLE_INVALID)
+ run_start = run_end;

- event->total_time_enabled = run_end - event->tstamp_enabled;
+ event->total_time_enabled = run_end - run_start;

if (event->state == PERF_EVENT_STATE_INACTIVE)
run_end = event->tstamp_stopped;
- else
- run_end = ctx->time;

event->total_time_running = run_end - event->tstamp_running;
}
@@ -301,6 +411,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_tail(&event->group_entry, list);
}

+ if (is_cgroup_event(event))
+ ctx->nr_cgroups++;
+
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
@@ -340,6 +453,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)

event->attach_state &= ~PERF_ATTACH_CONTEXT;

+ if (is_cgroup_event(event))
+ ctx->nr_cgroups--;
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -403,9 +519,10 @@ static void perf_group_detach(struct perf_event *event)
}

static inline int
-event_filter_match(struct perf_event *event)
+event_filter_match(struct perf_event *event, struct task_struct *task)
{
- return event->cpu == -1 || event->cpu == smp_processor_id();
+ return (event->cpu == -1 || event->cpu == smp_processor_id())
+ && perf_cgroup_match(event, task);
}

static void
@@ -413,6 +530,7 @@ event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
+ u64 tstamp = get_event_time(event);
u64 delta;
/*
* An event which could not be activated because of
@@ -421,10 +539,10 @@ event_sched_out(struct perf_event *event,
* via read() for time_enabled, time_running:
*/
if (event->state == PERF_EVENT_STATE_INACTIVE
- && !event_filter_match(event)) {
- delta = ctx->time - event->tstamp_stopped;
+ && !event_filter_match(event, current)) {
+ delta = tstamp - event->tstamp_stopped;
event->tstamp_running += delta;
- event->tstamp_stopped = ctx->time;
+ event->tstamp_stopped = tstamp;
}

if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -435,7 +553,7 @@ event_sched_out(struct perf_event *event,
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = ctx->time;
+ event->tstamp_stopped = tstamp;
event->pmu->disable(event);
event->oncpu = -1;

@@ -589,6 +707,12 @@ static void __perf_event_disable(void *info)
* If it is in error state, leave it in error state.
*/
if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+ /*
+ * update css time only if the current task's css matches
+ * the event's. The time is used to update event->tstamp_stopped
+ */
+ if (is_css_current(event))
+ update_css_time(event->css);
update_context_time(ctx);
update_group_times(event);
if (event == event->group_leader)
@@ -673,7 +797,7 @@ event_sched_in(struct perf_event *event,
return -EAGAIN;
}

- event->tstamp_running += ctx->time - event->tstamp_stopped;
+ event->tstamp_running += get_event_time(event) - event->tstamp_stopped;

if (!is_software_event(event))
cpuctx->active_oncpu++;
@@ -775,11 +899,33 @@ static int group_can_go_on(struct perf_event *event,
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
+ u64 tstamp = get_event_time(event);
+
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = ctx->time;
- event->tstamp_running = ctx->time;
- event->tstamp_stopped = ctx->time;
+
+ event->tstamp_running = tstamp;
+ event->tstamp_stopped = tstamp;
+ event->tstamp_enabled = tstamp;
+
+ /*
+ * an event is added to a context even if the css constraint
+ * is not satisfied. In per-cgroup mode, time_enabled only
+ * counts when threads from the css are active on the CPU.
+ *
+ * tstamp_enabled denotes the first time the event CAN be
+ * enabled, i.e., the first time threads from the css are
+ * scheduled in. Note that the event may not be scheduled
+ * immediately if the PMU is overcommitted yet the timestamp
+ * points to the first css activation.
+ *
+ * If css is not currently active, then we mark
+ * tstamp_enabled = ~0 to remember that it needs to be
+ * corrected in ctx_flexible_sched_in() and
+ * ctx_pinned_sched_in()
+ */
+ if (is_cgroup_event(event) && !is_css_current(event))
+ event->tstamp_enabled = PERF_TSTAMP_ENABLE_INVALID;
}

/*
@@ -818,9 +964,17 @@ static void __perf_install_in_context(void *info)
*/
perf_disable();

+ /*
+ * in cgroup mode, if the event matches the
+ * current cgroup, update the cgroup's time
+ * first so that we timestamp correctly.
+ */
+ if (is_css_current(event))
+ update_css_time(event->css);
+
add_event_to_ctx(event, ctx);

- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
goto unlock;

/*
@@ -928,13 +1082,14 @@ static void __perf_event_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *sub;
+ u64 tstamp = get_event_time(event);

event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = ctx->time - event->total_time_enabled;
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
+
list_for_each_entry(sub, &event->sibling_list, group_entry)
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- sub->tstamp_enabled =
- ctx->time - sub->total_time_enabled;
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
}

/*
@@ -964,9 +1119,18 @@ static void __perf_event_enable(void *info)

if (event->state >= PERF_EVENT_STATE_INACTIVE)
goto unlock;
+
+ /*
+ * in cgroup mode, if the event matches the
+ * current cgroup, update the cgroup's time
+ * first so that we timestamp correctly.
+ */
+ if (is_css_current(event))
+ update_css_time(event->css);
+
__perf_event_mark_enabled(event, ctx);

- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
goto unlock;

/*
@@ -1090,12 +1254,14 @@ static void ctx_sched_out(struct perf_event_context *ctx,
enum event_type_t event_type)
{
struct perf_event *event;
+ struct perf_cgroup *css_out = perf_cgroup_from_task(current);

raw_spin_lock(&ctx->lock);
ctx->is_active = 0;
if (likely(!ctx->nr_events))
goto out;
update_context_time(ctx);
+ update_css_time(css_out);

perf_disable();
if (!ctx->nr_active)
@@ -1209,71 +1375,6 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}

-/*
- * Called from scheduler to remove the events of the current task,
- * with interrupts disabled.
- *
- * We stop each event and update the event value in event->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * not restart the event.
- */
-void perf_event_task_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
- struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
- struct perf_event_context *ctx = task->perf_event_ctxp;
- struct perf_event_context *next_ctx;
- struct perf_event_context *parent;
- int do_switch = 1;
-
- perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
-
- if (likely(!ctx || !cpuctx->task_ctx))
- return;
-
- rcu_read_lock();
- parent = rcu_dereference(ctx->parent_ctx);
- next_ctx = next->perf_event_ctxp;
- if (parent && next_ctx &&
- rcu_dereference(next_ctx->parent_ctx) == parent) {
- /*
- * Looks like the two contexts are clones, so we might be
- * able to optimize the context switch. We lock both
- * contexts and check that they are clones under the
- * lock (including re-checking that neither has been
- * uncloned in the meantime). It doesn't matter which
- * order we take the locks because no other cpu could
- * be trying to lock both of these tasks.
- */
- raw_spin_lock(&ctx->lock);
- raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
- if (context_equiv(ctx, next_ctx)) {
- /*
- * XXX do we need a memory barrier of sorts
- * wrt to rcu_dereference() of perf_event_ctxp
- */
- task->perf_event_ctxp = next_ctx;
- next->perf_event_ctxp = ctx;
- ctx->task = next;
- next_ctx->task = task;
- do_switch = 0;
-
- perf_event_sync_stat(ctx, next_ctx);
- }
- raw_spin_unlock(&next_ctx->lock);
- raw_spin_unlock(&ctx->lock);
- }
- rcu_read_unlock();
-
- if (do_switch) {
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
- cpuctx->task_ctx = NULL;
- }
-}
-
static void task_ctx_sched_out(struct perf_event_context *ctx,
enum event_type_t event_type)
{
@@ -1308,16 +1409,40 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,

static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+ struct perf_cpu_context *cpuctx,
+ struct task_struct *task, int css_sw)
{
struct perf_event *event;

list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+ u64 tstamp = get_event_time(event);
+
if (event->state <= PERF_EVENT_STATE_OFF)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, task))
continue;

+ if (is_cgroup_event(event)) {
+ /*
+ * if css was not active when the event was
+ * added to ctx, then this is the first time
+ * the event can be effectively scheduled, thus
+ * we update tstamp_enabled
+ */
+ if (event->tstamp_enabled == PERF_TSTAMP_ENABLE_INVALID)
+ event->tstamp_enabled = tstamp;
+ /*
+ * if we come here because of a context switch
+ * with cgroup switch, then we need to update
+ * the point in time at which all cgroup events
+ * have been stopped. Otherwise, we would compute
+ * bogus tstamp_running deltas, which would include
+ * time the cgroup is not active.
+ */
+ if (css_sw)
+ event->tstamp_stopped = tstamp;
+ }
+
if (group_can_go_on(event, cpuctx, 1))
group_sched_in(event, cpuctx, ctx);

@@ -1334,7 +1459,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,

static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+ struct perf_cpu_context *cpuctx,
+ struct task_struct *task, int css_sw)
{
struct perf_event *event;
int can_add_hw = 1;
@@ -1347,9 +1473,31 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
* Listen to the 'cpu' scheduling filter constraint
* of events:
*/
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, task))
continue;

+ if (is_cgroup_event(event)) {
+ u64 tstamp = get_event_time(event);
+ /*
+ * if css was not active when the event was
+ * added to ctx, then this is the first time
+ * the event can be effectively scheduled, thus
+ * we update tstamp_enabled
+ */
+ if (event->tstamp_enabled == PERF_TSTAMP_ENABLE_INVALID)
+ event->tstamp_enabled = tstamp;
+ /*
+ * if we come here because of a context switch
+ * with cgroup switch, then we need to update
+ * the point in time at which all cgroup events
+ * have been stopped. Otherwise, we would compute
+ * bogus tstamp_running deltas, which would include
+ * time the cgroup is not active.
+ */
+ if (css_sw)
+ event->tstamp_stopped = tstamp;
+ }
+
if (group_can_go_on(event, cpuctx, can_add_hw))
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
@@ -1359,7 +1507,8 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ struct task_struct *task, int css_sw)
{
raw_spin_lock(&ctx->lock);
ctx->is_active = 1;
@@ -1375,11 +1524,11 @@ ctx_sched_in(struct perf_event_context *ctx,
* in order to give them the best chance of going on.
*/
if (event_type & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ ctx_pinned_sched_in(ctx, cpuctx, task, css_sw);

/* Then walk through the lower prio flexible groups */
if (event_type & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ ctx_flexible_sched_in(ctx, cpuctx, task, css_sw);

perf_enable();
out:
@@ -1387,11 +1536,12 @@ ctx_sched_in(struct perf_event_context *ctx,
}

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ struct task_struct *task, int css_sw)
{
struct perf_event_context *ctx = &cpuctx->ctx;

- ctx_sched_in(ctx, cpuctx, event_type);
+ ctx_sched_in(ctx, cpuctx, event_type, task, css_sw);
}

static void task_ctx_sched_in(struct task_struct *task,
@@ -1404,7 +1554,7 @@ static void task_ctx_sched_in(struct task_struct *task,
return;
if (cpuctx->task_ctx == ctx)
return;
- ctx_sched_in(ctx, cpuctx, event_type);
+ ctx_sched_in(ctx, cpuctx, event_type, task, 0);
cpuctx->task_ctx = ctx;
}
/*
@@ -1438,15 +1588,103 @@ void perf_event_task_sched_in(struct task_struct *task)
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

- ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, 0);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, 0);
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, 0);

cpuctx->task_ctx = ctx;

perf_enable();
}

+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void perf_event_task_sched_out(struct task_struct *task,
+ struct task_struct *next)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event_context *ctx = task->perf_event_ctxp;
+ struct perf_event_context *next_ctx;
+ struct perf_event_context *parent;
+ struct perf_cgroup *css_out = perf_cgroup_from_task(task);
+ struct perf_cgroup *css_in = perf_cgroup_from_task(next);
+ int do_switch = 1, css_sw = 0;
+
+ perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+ /*
+ * switching cgroups:
+ * must update time in the outgoing cgroup and
+ * mark the new start time in the incoming cgroup
+ */
+ if (css_out != css_in) {
+ css_sw = 1;
+ update_css_time(css_out);
+ css_in->times[smp_processor_id()].timestamp = perf_clock(); /* restart incoming cgroup's clock on this CPU */
+ }
+
+ /*
+ * if cpu context has at least one event with cgroup constraint,
+ * then flush out all existing events and schedule them again taking
+ * into account the incoming cgroup. This is a cgroup switch
+ */
+ if (cpuctx->ctx.nr_cgroups > 0 && css_sw) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, next, 1); /* filter against next; 1 = cgroup switch */
+ }
+ if (likely(!ctx || !cpuctx->task_ctx))
+ return;
+
+ rcu_read_lock();
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_ctx = next->perf_event_ctxp;
+ if (parent && next_ctx &&
+ rcu_dereference(next_ctx->parent_ctx) == parent) {
+ /*
+ * Looks like the two contexts are clones, so we might be
+ * able to optimize the context switch. We lock both
+ * contexts and check that they are clones under the
+ * lock (including re-checking that neither has been
+ * uncloned in the meantime). It doesn't matter which
+ * order we take the locks because no other cpu could
+ * be trying to lock both of these tasks.
+ */
+ raw_spin_lock(&ctx->lock);
+ raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+ if (context_equiv(ctx, next_ctx)) {
+ /*
+ * XXX do we need a memory barrier of sorts
+ * wrt to rcu_dereference() of perf_event_ctxp
+ */
+ task->perf_event_ctxp = next_ctx;
+ next->perf_event_ctxp = ctx;
+ ctx->task = next;
+ next_ctx->task = task;
+ do_switch = 0;
+
+ perf_event_sync_stat(ctx, next_ctx);
+ }
+ raw_spin_unlock(&next_ctx->lock);
+ raw_spin_unlock(&ctx->lock);
+ }
+ rcu_read_unlock();
+
+ if (do_switch) {
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ cpuctx->task_ctx = NULL;
+ }
+}
+
+
#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);
@@ -1579,7 +1817,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;

- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
continue;

hwc = &event->hw;
@@ -1660,7 +1898,7 @@ void perf_event_task_tick(struct task_struct *curr)
if (ctx)
rotate_ctx(ctx);

- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, curr, 0);
if (ctx)
task_ctx_sched_in(curr, EVENT_FLEXIBLE);
perf_enable();
@@ -1747,6 +1985,8 @@ static void __perf_event_read(void *info)
return;

raw_spin_lock(&ctx->lock);
+ if (is_css_current(event))
+ update_css_time(event->css);
update_context_time(ctx);
update_event_times(event);
raw_spin_unlock(&ctx->lock);
@@ -1773,6 +2013,8 @@ static u64 perf_event_read(struct perf_event *event)
unsigned long flags;

raw_spin_lock_irqsave(&ctx->lock, flags);
+ if (is_css_current(event))
+ update_css_time(event->css);
update_context_time(ctx);
update_event_times(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -2132,6 +2374,9 @@ static void free_event(struct perf_event *event)
event->buffer = NULL;
}

+ if (is_cgroup_event(event))
+ perf_put_cgroup(event);
+
if (event->destroy)
event->destroy(event);

@@ -3764,7 +4009,7 @@ static int perf_event_task_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;

- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
return 0;

if (event->attr.comm || event->attr.mmap ||
@@ -3878,7 +4123,7 @@ static int perf_event_comm_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;

- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
return 0;

if (event->attr.comm)
@@ -3999,7 +4244,7 @@ static int perf_event_mmap_match(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;

- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
return 0;

if ((!executable && event->attr.mmap_data) ||
@@ -4660,6 +4905,8 @@ static void task_clock_perf_event_read(struct perf_event *event)
u64 time;

if (!in_nmi()) {
+ if (is_css_current(event))
+ update_css_time(event->css);
update_context_time(event->ctx);
time = event->ctx->time;
} else {
@@ -5031,12 +5278,32 @@ perf_event_alloc(struct perf_event_attr *attr,
const struct pmu *pmu;
struct perf_event *event;
struct hw_perf_event *hwc;
+ struct perf_cgroup *css = NULL;
long err;

event = kzalloc(sizeof(*event), gfpflags);
if (!event)
return ERR_PTR(-ENOMEM);

+ if (attr->cgroup) {
+ css = perf_get_cgroup(attr->cgroup_fd);
+ if (IS_ERR(css)) {
+ kfree(event);
+ return (void *)css;
+ }
+ /*
+ * all events in a group must monitor
+ * the same cgroup because a thread belongs
+ * to only one cgroup at a time
+ */
+ if (group_leader && group_leader->css != css) {
+ event->css = css;
+ perf_put_cgroup(event);
+ kfree(event);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
/*
* Single events are their own group leaders, with an
* empty sibling list:
@@ -5067,6 +5334,7 @@ perf_event_alloc(struct perf_event_attr *attr,
event->id = atomic64_inc_return(&perf_event_id);

event->state = PERF_EVENT_STATE_INACTIVE;
+ event->css = css;

if (!overflow_handler && parent_event)
overflow_handler = parent_event->overflow_handler;
@@ -5125,6 +5393,7 @@ done:
if (err) {
if (event->ns)
put_pid_ns(event->ns);
+ perf_put_cgroup(event);
kfree(event);
return ERR_PTR(err);
}
@@ -5320,6 +5589,10 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}

+ /* cgroup reserved for system-wide */
+ if (attr.cgroup && pid != -1)
+ return -EINVAL;
+
event_fd = get_unused_fd_flags(O_RDWR);
if (event_fd < 0)
return event_fd;
@@ -6094,3 +6367,51 @@ static int __init perf_event_sysfs_init(void)
&perfclass_attr_group);
}
device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUPS
+static int perf_cgroup_read_map(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ return 0; /* placeholder: returns 0 without ever using cb */
+}
+
+static struct cftype perf_cgroup_files[] = {
+ { .name = "perf", /* single entry, read through perf_cgroup_read_map() */
+ .read_map = perf_cgroup_read_map,
+ },
+};
+
+/* Allocate the zero-filled struct perf_cgroup backing a new cgroup. */
+static struct cgroup_subsys_state *
+perf_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct perf_cgroup *pcg = vmalloc(sizeof(*pcg));
+
+	if (pcg == NULL)
+		return ERR_PTR(-ENOMEM);
+	memset(pcg, 0, sizeof(*pcg));
+	return &pcg->css;
+}
+
+/* Release the struct perf_cgroup that perf_cgroup_create() set up for @cont. */
+static void perf_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	vfree(perf_cgroup_from_cont(cont));
+}
+
+/* Register perf_cgroup_files with cgroup @cont through cgroup_add_files(). */
+static int perf_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	const int nr_files = ARRAY_SIZE(perf_cgroup_files);
+	return cgroup_add_files(cont, ss, perf_cgroup_files, nr_files);
+}
+
+struct cgroup_subsys perf_subsys = {
+ .name = "perf_event",
+ .subsys_id = perf_subsys_id,
+ .create = perf_cgroup_create, /* allocates a struct perf_cgroup */
+ .destroy = perf_cgroup_destroy, /* vfree()s it again */
+ .populate = perf_cgroup_populate, /* registers perf_cgroup_files */
+ .early_init = 0,
+};
+#endif /* CONFIG_CGROUPS */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/