[RFC] perf: Add "-f" and "-F" flags to watch a "/sys" style file

From: Luck, Tony
Date: Mon May 15 2017 - 15:27:14 EST


From: Tony Luck <tony.luck@xxxxxxxxx>

Thomas Gleixner is encouraging us to extend the /sys/fs/resctrl file system
to include monitoring data (LLC occupancy, memory bandwidth) from the
(weird) counters that come as part of "Resource Director Technology".
See Intel Software Developer Manual volume 3, section 17.17.1.

Our current plan will provide readout files for each counter type in
each of the existing "control" directories, and also in the new
"monitoring" directories that we plan to add.

With this change, people are asking how this will be integrated with
"perf" ... this patch represents a concept for how we might do this.

Basically we teach perf how to read /sys style files (single number)
that are dynamically updated. I just hacked in two new arguments
that allow the user to name a file to be read and included in the
output along with any normal events that they requested with "-e".

There are two options because some files report a "snapshot" value
that should be reported as-is, while others report an ever-increasing
value, where the user most likely cares about how much it changed from
one timepoint to the next.

Cache occupancy fits the first category, while memory bandwidth (which
actually reports total bytes since boot) fits the second.

This may be useful in other contexts. There are many /sys files
on my desktop that might also be interesting to monitor in this way.
E.g.
/sys/devices/pci0000:00/0000:00:1c.4/0000:03:00.0/net/enp3s0/statistics/tx_packets
/sys/devices/pci0000:00/0000:00:1c.4/0000:03:00.0/net/enp3s0/statistics/tx_bytes
/sys/fs/ext4/sda2/delayed_allocation_blocks
/sys/fs/ext4/sda2/session_write_kbytes
/sys/fs/ext4/sda2/lifetime_write_kbytes
/sys/fs/ext4/sdb/delayed_allocation_blocks
/sys/fs/ext4/sdb/session_write_kbytes
/sys/fs/ext4/sdb/lifetime_write_kbytes
/sys/devices/platform/coretemp.0/hwmon/hwmon2/temp2_input
/sys/devices/platform/coretemp.0/hwmon/hwmon2/temp5_input
/sys/devices/platform/coretemp.0/hwmon/hwmon2/temp1_input
/sys/devices/platform/coretemp.0/hwmon/hwmon2/temp4_input

Thoughts? [Both on the concept, and on my hacky implementation]

Cc: Vikas Shivappa <vikas.shivappa@xxxxxxxxx>
Cc: Stephane Eranian <eranian@xxxxxxxxxx>
Cc: David Carrillo-Cisneros <davidcc@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx

---
tools/perf/builtin-c2c.c | 2 ++
tools/perf/builtin-mem.c | 2 ++
tools/perf/builtin-record.c | 4 ++++
tools/perf/builtin-stat.c | 4 ++++
tools/perf/builtin-top.c | 4 ++++
tools/perf/builtin-trace.c | 4 ++++
tools/perf/util/evsel.c | 34 ++++++++++++++++++++++++++++------
tools/perf/util/evsel.h | 1 +
tools/perf/util/parse-events.c | 17 +++++++++++++++++
tools/perf/util/parse-events.h | 1 +
10 files changed, 67 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index e33b4acece90..e0a0a668fa5b 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -2691,6 +2691,8 @@ static int perf_c2c__record(int argc, const char **argv)
OPT_CALLBACK('e', "event", &event_set, "event",
"event selector. Use 'perf mem record -e list' to list available events",
parse_record_events),
+ OPT_CALLBACK('f', "file", &event_set, "file", "file selector", parse_files_option),
+ OPT_CALLBACK('F', "file", &event_set, "file", "delta file selector", parse_files_option),
OPT_BOOLEAN('u', "all-user", &all_user, "collect only user level data"),
OPT_BOOLEAN('k', "all-kernel", &all_kernel, "collect only kernel level data"),
OPT_UINTEGER('l', "ldlat", &perf_mem_events__loads_ldlat, "setup mem-loads latency"),
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index e001c0290793..86a332dd30cc 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -72,6 +72,8 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
OPT_CALLBACK('e', "event", &mem, "event",
"event selector. use 'perf mem record -e list' to list available events",
parse_record_events),
+ OPT_CALLBACK('f', "file", &mem, "file", "file selector", parse_files_option),
+ OPT_CALLBACK('F', "file", &mem, "file", "delta file selector", parse_files_option),
OPT_UINTEGER(0, "ldlat", &perf_mem_events__loads_ldlat, "mem-loads latency"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show counter open errors, etc)"),
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index ee7d0a82ccd0..024539e29437 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1551,6 +1551,10 @@ static struct option __record_options[] = {
OPT_CALLBACK('e', "event", &record.evlist, "event",
"event selector. use 'perf list' to list available events",
parse_events_option),
+ OPT_CALLBACK('f', "file", &record.evlist, "file",
+ "file selector", parse_files_option),
+ OPT_CALLBACK('F', "file", &record.evlist, "file",
+ "delta file selector", parse_files_option),
OPT_CALLBACK(0, "filter", &record.evlist, "filter",
"event filter", parse_filter),
OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a935b5023732..8bd10f7e0026 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1722,6 +1722,10 @@ static const struct option stat_options[] = {
OPT_CALLBACK('e', "event", &evsel_list, "event",
"event selector. use 'perf list' to list available events",
parse_events_option),
+ OPT_CALLBACK('f', "file", &evsel_list, "file",
+ "file selector", parse_files_option),
+ OPT_CALLBACK('F', "file", &evsel_list, "file",
+ "delta file selector", parse_files_option),
OPT_CALLBACK(0, "filter", &evsel_list, "filter",
"event filter", parse_filter),
OPT_BOOLEAN('i', "no-inherit", &no_inherit,
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 7ab42b8311a1..beafd78217df 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1104,6 +1104,10 @@ int cmd_top(int argc, const char **argv)
OPT_CALLBACK('e', "event", &top.evlist, "event",
"event selector. use 'perf list' to list available events",
parse_events_option),
+ OPT_CALLBACK('f', "file", &top.evlist, "file",
+ "file selector", parse_files_option),
+ OPT_CALLBACK('F', "file", &top.evlist, "file",
+ "delta file selector", parse_files_option),
OPT_U64('c', "count", &opts->user_interval, "event period to sample"),
OPT_STRING('p', "pid", &target->pid, "pid",
"profile events on existing process id"),
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index eaa66fb57347..9116d5d46056 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2857,6 +2857,10 @@ int cmd_trace(int argc, const char **argv)
OPT_CALLBACK('e', "event", &trace, "event",
"event/syscall selector. use 'perf list' to list available events",
trace__parse_events_option),
+ OPT_CALLBACK('f', "file", &evsel_list, "file",
+ "file selector", parse_files_option),
+ OPT_CALLBACK('F', "file", &evsel_list, "file",
+ "delta file selector", parse_files_option),
OPT_BOOLEAN(0, "comm", &trace.show_comm,
"show the thread COMM next to its id"),
OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 0e879097adfb..381eb0c085b6 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1236,6 +1236,21 @@ void perf_counts_values__scale(struct perf_counts_values *count,
*pscaled = scaled;
}

+/*
+ * Read one number from an already-open "/sys" style file.
+ *
+ * The file is read from offset 0 on every call and the text is parsed
+ * with base auto-detection (base 0). An empty read yields a value of 0.
+ * A static counter is incremented on each successful read and stored in
+ * both count->ena and count->run.
+ *
+ * Returns 0 on success, or -errno if pread() fails.
+ */
+static int read_sys_file(int fd, struct perf_counts_values *count)
+{
+	char buf[100];
+	static u64 fake;
+	ssize_t n;
+
+	/* Leave room for the terminator: pread() does not add one. */
+	n = pread(fd, buf, sizeof(buf) - 1, 0);
+	if (n < 0)
+		return -errno;
+	buf[n] = '\0';
+
+	/*
+	 * Parse as unsigned long long: strtol() would clamp at LONG_MAX,
+	 * which is only 2^31-1 on 32-bit builds.
+	 */
+	count->val = n ? strtoull(buf, NULL, 0) : 0;
+	count->ena = count->run = ++fake;
+	return 0;
+}
+
int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
struct perf_counts_values *count)
{
@@ -1244,6 +1259,8 @@ int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
if (FD(evsel, cpu, thread) < 0)
return -EINVAL;

+ if (evsel->sysfile)
+ return read_sys_file(FD(evsel, cpu, thread), count);
if (readn(FD(evsel, cpu, thread), count, sizeof(*count)) <= 0)
return -errno;

@@ -1539,18 +1556,23 @@ int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
for (cpu = 0; cpu < cpus->nr; cpu++) {

for (thread = 0; thread < nthreads; thread++) {
- int fd, group_fd;
+ int fd;

if (!evsel->cgrp && !evsel->system_wide)
pid = thread_map__pid(threads, thread);

- group_fd = get_group_fd(evsel, cpu, thread);
+ if (evsel->sysfile) {
+ fd = open(evsel->name, O_RDONLY, 0);
+ } else {
+ int group_fd;
retry_open:
- pr_debug2("sys_perf_event_open: pid %d cpu %d group_fd %d flags %#lx",
- pid, cpus->map[cpu], group_fd, flags);
+ group_fd = get_group_fd(evsel, cpu, thread);
+ pr_debug2("sys_perf_event_open: pid %d cpu %d group_fd %d flags %#lx",
+ pid, cpus->map[cpu], group_fd, flags);

- fd = sys_perf_event_open(&evsel->attr, pid, cpus->map[cpu],
- group_fd, flags);
+ fd = sys_perf_event_open(&evsel->attr, pid, cpus->map[cpu],
+ group_fd, flags);
+ }

FD(evsel, cpu, thread) = fd;

diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index d101695c482c..ede30e111947 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -121,6 +121,7 @@ struct perf_evsel {
bool per_pkg;
bool precise_max;
bool ignore_missing_thread;
+ bool sysfile;
/* parse modifier helper */
int exclude_GH;
int nr_members;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 01e779b91c8e..0d725eda476d 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1793,6 +1793,23 @@ int parse_events_option(const struct option *opt, const char *str,
return ret;
}

+/*
+ * Option callback for "-f" / "-F": add a pseudo-event that is backed by
+ * the file named in 'str' instead of a perf_event_open() counter.
+ *
+ * opt->value points at the evlist pointer the new evsel is appended to.
+ * The evsel is marked as a snapshot when the short option is 'f'; with
+ * 'F' it is not.
+ *
+ * Returns 0 on success, or -ENOMEM if the evsel or its name cannot be
+ * allocated (nothing is added to the evlist in that case).
+ */
+int parse_files_option(const struct option *opt, const char *str,
+		       int unset __maybe_unused)
+{
+	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
+	struct perf_evsel *evsel = calloc(1, sizeof(*evsel));
+
+	if (evsel == NULL)
+		return -ENOMEM;
+
+	/* The name doubles as the path that is opened later, so it must exist. */
+	evsel->name = strdup(str);
+	if (evsel->name == NULL) {
+		free(evsel);
+		return -ENOMEM;
+	}
+
+	evsel->unit = "";
+	evsel->sysfile = true;
+	evsel->snapshot = (opt->short_name == 'f');
+	evsel->scale = 1.0;
+	INIT_LIST_HEAD(&evsel->config_terms);
+	evsel->bpf_fd = -1;
+	perf_evlist__add(evlist, evsel);
+	return 0;
+}
+
static int
foreach_evsel_in_last_glob(struct perf_evlist *evlist,
int (*func)(struct perf_evsel *evsel,
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index a235f4d6d5e5..7e14a4c66fad 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -30,6 +30,7 @@ bool have_tracepoints(struct list_head *evlist);
const char *event_type(int type);

int parse_events_option(const struct option *opt, const char *str, int unset);
+int parse_files_option(const struct option *opt, const char *str, int unset);
int parse_events(struct perf_evlist *evlist, const char *str,
struct parse_events_error *error);
int parse_events_terms(struct list_head *terms, const char *str);
--
2.11.0