[RFC][PATCH] perf: Implement read_group() PMU operation

From: Sukadev Bhattiprolu
Date: Thu Feb 05 2015 - 21:59:49 EST


From: Sukadev Bhattiprolu <sukadev@xxxxxxxxxxxxxxxxxx>
Date: Thu Feb 5 20:56:20 EST 2015 -0300
Subject: [RFC][PATCH] perf: Implement read_group() PMU operation

This is a lightly tested, exploratory patch to allow PMUs to return
several counters at once. Appreciate any comments :-)

Unlike normal hardware PMCs, the 24x7 counters[1] in Power8 are stored
in memory and accessed via a hypervisor call (HCALL). A major aspect
of the HCALL is that it allows retrieving _SEVERAL_ counters at once
(unlike regular PMCs, which are read one at a time).

This patch implements a ->read_group() PMU operation that tries to
take advantage of this ability to read several counters at once. A
PMU that implements the ->read_group() operation would allow users
to retrieve several counters at once and get a more consistent
snapshot.

NOTE: This patch has a TODO in h_24x7_event_read_group() in that it
still does multiple HCALLS. I think that can be optimized
independently, once the pmu->read_group() interface itself is
finalized.

Appreciate comments on the ->read_group interface and on how best to manage
the interfaces between the core and PMU layers - e.g.: is it OK for the
hv-24x7 PMU to walk the ->sibling_list ?

[1] Some notes about 24x7 counters:

Power8 supports 24x7 counters[1] which differ from traditional PMCs
in several ways:

- The 24x7 counters are always on and counting. Rather than
start/stop the PMCs, we read/report the _change_ in values
in the counters during the execution of the workload.

- The 24x7 counters are not tied to a task context (they are
always on).

- Rather than reading the event counts from registers, we make
a hypervisor call (HCALL) to retrieve counts. The HCALL allows
retrieving a large number of counters in a single call.

- These counters don't generate interrupts when they overflow (so
sampling does not apply to these counters).
---

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1d36314..b69fbdf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -232,6 +232,13 @@ struct pmu {
void (*read) (struct perf_event *event);

/*
+ * Read a group of counters.
+ */
+ int (*read_group) (struct perf_event *event,
+ u64 *values,
+ int ncounters);
+
+ /*
* Group events scheduling is treated as a transaction, add
* group events as a whole and perform one schedulability test.
* If the test fails, roll back the whole group
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 934687f..026a9d0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3549,10 +3549,43 @@ static int perf_event_read_group(struct perf_event *event,
struct perf_event *leader = event->group_leader, *sub;
int n = 0, size = 0, ret = -EFAULT;
struct perf_event_context *ctx = leader->ctx;
+ u64 *valuesp;
u64 values[5];
+ int use_group_read;
u64 count, enabled, running;
+ struct pmu *pmu = event->pmu;
+
+ /*
+ * If PMU supports group read and group read is requested,
+ * allocate memory before taking the mutex.
+ */
+ use_group_read = 0;
+ if ((read_format & PERF_FORMAT_GROUP) && pmu->read_group) {
+ use_group_read++;
+ }
+
+ if (use_group_read) {
+ valuesp = kzalloc(leader->nr_siblings * sizeof(u64), GFP_KERNEL);
+ if (!valuesp)
+ return -ENOMEM;
+ }

mutex_lock(&ctx->mutex);
+
+ if (use_group_read) {
+ ret = pmu->read_group(leader, valuesp, leader->nr_siblings);
+ if (ret >= 0) {
+ size = ret * sizeof(u64);
+
+ ret = size;
+ if (copy_to_user(buf, valuesp, size))
+ ret = -EFAULT;
+ }
+
+ kfree(valuesp);
+ goto unlock;
+ }
+
count = perf_event_read_value(leader, &enabled, &running);

values[n++] = 1 + leader->nr_siblings;
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 9445a82..cd48cf0 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -1071,12 +1071,33 @@ static int h_24x7_event_init(struct perf_event *event)
struct hv_perf_caps caps;
unsigned domain;
unsigned long hret;
+ u64 read_format, inv_flags;
u64 ct;

/* Not our event */
if (event->attr.type != event->pmu->type)
return -ENOENT;

+ /*
+ * We don't support enabled/running times with PERF_FORMAT_GROUP.
+ * The ->read_group() operation is intended to be used in continuous
+ * monitoring mode, so these time values are not important at least
+ * for now.
+ *
+ * Not sure if the PERF_FORMAT_ID is useful. Block it for now.
+ */
+ read_format = event->attr.read_format;
+ inv_flags = PERF_FORMAT_TOTAL_TIME_ENABLED;
+ inv_flags |= PERF_FORMAT_TOTAL_TIME_RUNNING;
+ inv_flags |= PERF_FORMAT_ID;
+
+ if ((read_format & PERF_FORMAT_GROUP) && (read_format & inv_flags)) {
+ pr_devel("%s(): Invalid flags: rf 0x%llx, invf 0x%llx\n",
+ __func__, (unsigned long long)read_format,
+ (unsigned long long)inv_flags);
+ return -EINVAL;
+ }
+
/* Unused areas must be 0 */
if (event_get_reserved1(event) ||
event_get_reserved2(event) ||
@@ -1181,6 +1202,50 @@ static int h_24x7_event_add(struct perf_event *event, int flags)
return 0;
}

+/*
+ * h_24x7_event_read_group() - ->read_group() operation of the hv-24x7 PMU.
+ *
+ * @leader:    group leader; its read_format must include PERF_FORMAT_GROUP.
+ * @values:    output array, filled with one count per ACTIVE event: the
+ *             leader first (if active), then the siblings in sibling_list
+ *             order. Events that are not active are skipped.
+ * @ncounters: capacity of @values, in u64 entries.
+ *
+ * Returns the number of entries stored in @values, or -EINVAL if @values
+ * is too small to hold every active event of the group.
+ */
+static int h_24x7_event_read_group(struct perf_event *leader, u64 *values,
+				   int ncounters)
+{
+	struct perf_event *sub;
+	int n = 0;
+
+	BUG_ON(!(leader->attr.read_format & PERF_FORMAT_GROUP));
+
+	/*
+	 * sys_perf_event_open() for now prevents inheritance with
+	 * PERF_FORMAT_GROUP. Ensure that hasn't changed.
+	 */
+	BUG_ON(!list_empty(&leader->child_list));
+
+	if (ncounters < leader->nr_siblings) {
+		pr_devel("%s(): Insufficient buffer : ns %d, nc %d\n",
+				__func__, leader->nr_siblings, ncounters);
+		return -EINVAL;
+	}
+
+	raw_spin_lock(&leader->ctx->lock);
+
+	/*
+	 * The leader takes a slot of its own in addition to the siblings,
+	 * so up to nr_siblings + 1 values may be produced. The check above
+	 * only guarantees nr_siblings slots; bound every store by
+	 * ncounters so we never write past the end of values[].
+	 */
+	if (leader->state == PERF_EVENT_STATE_ACTIVE) {
+		if (n >= ncounters)
+			goto overflow;
+
+		h_24x7_event_update(leader);
+		values[n++] = local64_read(&leader->count);
+	}
+
+	/*
+	 * TODO: For now, make one HCALL per event. We will soon retrieve
+	 * several events with one HCALL.
+	 */
+	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+		if (sub->state != PERF_EVENT_STATE_ACTIVE)
+			continue;
+
+		if (n >= ncounters)
+			goto overflow;
+
+		h_24x7_event_update(sub);
+		values[n++] = local64_read(&sub->count);
+	}
+
+	raw_spin_unlock(&leader->ctx->lock);
+
+	return n;
+
+overflow:
+	/* Drop the lock before reporting; nothing past values[n-1] was touched. */
+	raw_spin_unlock(&leader->ctx->lock);
+	pr_devel("%s(): Insufficient buffer : ns %d, nc %d\n",
+			__func__, leader->nr_siblings, ncounters);
+	return -EINVAL;
+}
+
static struct pmu h_24x7_pmu = {
.task_ctx_nr = perf_invalid_context,

@@ -1192,6 +1257,7 @@ static struct pmu h_24x7_pmu = {
.start = h_24x7_event_start,
.stop = h_24x7_event_stop,
.read = h_24x7_event_update,
+ .read_group = h_24x7_event_read_group,
};

static int hv_24x7_init(void)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/