[RFC PATCH 2/2] perf stat: Use event group to simulate PMI on PMI-less hardware counter

From: Lin Ming
Date: Wed Nov 10 2010 - 01:13:45 EST


Some hardware counters (for example, Intel RAPL) can't generate an
interrupt when they overflow. So we need to simulate the interrupt to
periodically record the counter values. Otherwise, the counter may
overflow and a wrong value is read.

This patch uses an event group to simulate the PMI, as suggested by Peter
Zijlstra: http://marc.info/?l=linux-kernel&m=128220854801819&w=2

create_group_counters() creates a group with two events: an hrtimer-based
event as the group leader, and the event to count as its sibling. The
hrtimer fires periodically, so the sibling event's counter value is
recorded periodically as well.

Signed-off-by: Lin Ming <ming.m.lin@xxxxxxxxx>
---
include/linux/perf_event.h | 4 ++-
tools/perf/builtin-stat.c | 58 +++++++++++++++++++++++++++++++++++++++----
2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 057bf22..8a4c0aa 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -216,7 +216,9 @@ struct perf_event_attr {
precise_ip : 2, /* skid constraint */
mmap_data : 1, /* non-exec mmap data */

- __reserved_1 : 46;
+ pmi_simulate : 1, /* simulate pmi with group events */
+
+ __reserved_1 : 45;

union {
__u32 wakeup_events; /* wakeup every n events */
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a6b4d44..e0497cf 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -148,6 +148,38 @@ struct stats runtime_branches_stats;
#define ERR_PERF_OPEN \
"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"

+/*
+ * Simulate PMI: open a task-clock (hrtimer) leader, then 'attr' as its
+ * sibling. Returns the leader fd, or the failing open's negative result.
+ */
+static int create_group_counters(struct perf_event_attr *attr,
+ pid_t pid, int cpu, int flags)
+{
+ int leader_fd, counter_fd;
+ struct perf_event_attr leader;
+
+ memset(&leader, 0, sizeof(struct perf_event_attr));
+ leader.type = PERF_TYPE_SOFTWARE;
+ leader.config = PERF_COUNT_SW_TASK_CLOCK;
+ leader.sample_type = PERF_SAMPLE_READ;
+ leader.read_format = attr->read_format | PERF_FORMAT_GROUP;
+ leader.sample_period = attr->sample_period; /* period, enable state: */
+ leader.disabled = attr->disabled; /* copied from the counted event */
+ leader.enable_on_exec = attr->enable_on_exec;
+
+ leader_fd = sys_perf_event_open(&leader, pid, cpu, -1, flags); /* no group yet */
+ if (leader_fd < 0)
+ return leader_fd;
+
+ counter_fd = sys_perf_event_open(attr, pid, cpu, leader_fd, flags); /* join leader */
+ if (counter_fd < 0) {
+ close(leader_fd); /* NOTE(review): may clobber errno the callers print */
+ return counter_fd;
+ }
+
+ return leader_fd; /* counter_fd is left open and is not handed to the caller */
+}
+
static int create_perf_stat_counter(int counter)
{
struct perf_event_attr *attr = attrs + counter;
@@ -162,8 +194,12 @@ static int create_perf_stat_counter(int counter)
int cpu;

for (cpu = 0; cpu < nr_cpus; cpu++) {
- fd[cpu][counter][0] = sys_perf_event_open(attr,
+ if (!attr->pmi_simulate)
+ fd[cpu][counter][0] = sys_perf_event_open(attr,
-1, cpumap[cpu], -1, 0);
+ else
+ fd[cpu][counter][0] = create_group_counters(attr,
+ -1, cpumap[cpu], 0);
if (fd[cpu][counter][0] < 0)
pr_debug(ERR_PERF_OPEN, counter,
fd[cpu][counter][0], strerror(errno));
@@ -177,8 +213,12 @@ static int create_perf_stat_counter(int counter)
attr->enable_on_exec = 1;
}
for (thread = 0; thread < thread_num; thread++) {
- fd[0][counter][thread] = sys_perf_event_open(attr,
- all_tids[thread], -1, -1, 0);
+ if (!attr->pmi_simulate)
+ fd[0][counter][thread] = sys_perf_event_open(attr,
+ all_tids[thread], -1, -1, 0);
+ else
+ fd[0][counter][thread] = create_group_counters(attr,
+ all_tids[thread], -1, 0);
if (fd[0][counter][thread] < 0)
pr_debug(ERR_PERF_OPEN, counter,
fd[0][counter][thread],
@@ -208,15 +248,21 @@ static inline int nsec_counter(int counter)
*/
static void read_counter(int counter)
{
- u64 count[3], single_count[3];
+ u64 count[3], single_count[5];
int cpu;
size_t res, nv;
int scaled;
int i, thread;
+ int data_idx = 0;

count[0] = count[1] = count[2] = 0;

- nv = scale ? 3 : 1;
+ if (!attrs[counter].pmi_simulate)
+ nv = scale ? 3 : 1;
+ else {
+ nv = scale ? 5 : 3;
+ data_idx = nv - 1;
+ }
for (cpu = 0; cpu < nr_cpus; cpu++) {
for (thread = 0; thread < thread_num; thread++) {
if (fd[cpu][counter][thread] < 0)
@@ -229,7 +275,7 @@ static void read_counter(int counter)
close(fd[cpu][counter][thread]);
fd[cpu][counter][thread] = -1;

- count[0] += single_count[0];
+ count[0] += single_count[data_idx];
if (scale) {
count[1] += single_count[1];
count[2] += single_count[2];
--
1.7.1



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/