[PATCH 09/15] perf ftrace: Add 'record' sub-command

From: Namhyung Kim
Date: Tue May 14 2013 - 06:15:57 EST


From: Namhyung Kim <namhyung.kim@xxxxxxx>

The ftrace record command is for saving raw ftrace buffer contents
which can be get from per_cpu/cpuX/trace_pipe_raw.

Since ftrace events are generated very frequently so single thread for
recording mostly resulted in buffer overruns. Thus it uses per-cpu
recorder thread to prevent such cases and they save the contents to
their own files.

These per-cpu data files are saved in a directory so that they can be
easily found when needed. I chose the default directory name as
"perf.data.dir" and the first two (i.e. "perf.data") can be changed
with -o option. The structure of the directory looks like:

$ tree perf.data.dir
perf.data.dir/
|-- perf.header
|-- trace-cpu0.buf
|-- trace-cpu1.buf
|-- trace-cpu2.buf
`-- trace-cpu3.buf

In addition to trace-cpuX.buf files, it has perf.header file also.
The perf.header file is compatible with existing perf.data format and
contains usual event information, feature mask and sample data. The
sample data is synthesized to indicate given cpu has a record file.

Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Frederic Weisbecker <fweisbec@xxxxxxxxx>
Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
---
tools/perf/builtin-ftrace.c | 629 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 611 insertions(+), 18 deletions(-)

diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index 4ec96413d3a5..4392e08d545a 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -12,6 +12,7 @@
#include <unistd.h>
#include <signal.h>
#include <fcntl.h>
+#include <dirent.h>

#include "util/debug.h"
#include "util/parse-options.h"
@@ -19,14 +20,17 @@
#include "util/target.h"
#include "util/thread_map.h"
#include "util/cpumap.h"
+#include "util/trace-event.h"


#define DEFAULT_TRACER "function_graph"
+#define DEFAULT_DIRNAME "perf.data"

struct perf_ftrace {
struct perf_evlist *evlist;
struct perf_target target;
const char *tracer;
+ const char *dirname;
};

static bool done;
@@ -170,41 +174,57 @@ static int reset_tracing_cpu(void)
return 0;
}

-static int do_ftrace_live(struct perf_ftrace *ftrace)
+static int setup_tracing_files(struct perf_ftrace *ftrace)
{
- char *trace_file;
- int trace_fd;
- char buf[4096];
- struct pollfd pollfd = {
- .events = POLLIN,
- };
-
- signal(SIGINT, sig_handler);
- signal(SIGUSR1, sig_handler);
- signal(SIGCHLD, sig_handler);
+ int ret = -1;

- if (reset_tracing_files(ftrace) < 0)
+ if (reset_tracing_files(ftrace) < 0) {
+ pr_err("failed to reset tracing files\n");
goto out;
+ }

/* reset ftrace buffer */
- if (write_tracing_file("trace", "0") < 0)
+ if (write_tracing_file("trace", "0") < 0) {
+ pr_err("failed to reset ftrace buffer\n");
goto out;
+ }

if (set_tracing_pid(ftrace) < 0) {
pr_err("failed to set ftrace pid\n");
- goto out_reset;
+ goto out;
}

if (set_tracing_cpu(ftrace) < 0) {
pr_err("failed to set tracing cpumask\n");
- goto out_reset;
+ goto out;
}

if (write_tracing_file("current_tracer", ftrace->tracer) < 0) {
pr_err("failed to set current_tracer to %s\n", ftrace->tracer);
- goto out_reset;
+ goto out;
}

+ ret = 0;
+out:
+ return ret;
+}
+
+static int do_ftrace_live(struct perf_ftrace *ftrace)
+{
+ char *trace_file;
+ int trace_fd;
+ char buf[4096];
+ struct pollfd pollfd = {
+ .events = POLLIN,
+ };
+
+ signal(SIGINT, sig_handler);
+ signal(SIGUSR1, sig_handler);
+ signal(SIGCHLD, sig_handler);
+
+ if (setup_tracing_files(ftrace) < 0)
+ goto out_reset;
+
trace_file = get_tracing_file("trace_pipe");
if (!trace_file) {
pr_err("failed to open trace_pipe\n");
@@ -258,7 +278,510 @@ out_close_fd:
close(trace_fd);
out_reset:
reset_tracing_files(ftrace);
+ return done ? 0 : -1;
+}
+
+static int alloc_ftrace_evsel(struct perf_ftrace *ftrace)
+{
+ if (!strcmp(ftrace->tracer, "function")) {
+ if (perf_evlist__add_newtp(ftrace->evlist, "ftrace",
+ "function", NULL) < 0) {
+ pr_err("failed to allocate ftrace event\n");
+ return -1;
+ }
+ } else if (!strcmp(ftrace->tracer, "function_graph")) {
+ if (perf_evlist__add_newtp(ftrace->evlist, "ftrace",
+ "funcgraph_entry", NULL) ||
+ perf_evlist__add_newtp(ftrace->evlist, "ftrace",
+ "funcgraph_exit", NULL)) {
+ pr_err("failed to allocate ftrace event\n");
+ return -1;
+ }
+ } else {
+ pr_err("Not supported tracer: %s\n", ftrace->tracer);
+ return -1;
+ }
+ return 0;
+}
+
+static int remove_directory(const char *pathname)
+{
+ DIR *dir;
+ int ret = 0;
+ struct dirent *dent;
+ char namebuf[PATH_MAX];
+
+ dir = opendir(pathname);
+ if (dir == NULL)
+ return 0;
+
+ while ((dent = readdir(dir)) != NULL && !ret) {
+ struct stat statbuf;
+
+ if (dent->d_name[0] == '.')
+ continue;
+
+ scnprintf(namebuf, sizeof(namebuf), "%s/%s",
+ pathname, dent->d_name);
+
+ ret = stat(namebuf, &statbuf);
+ if (ret < 0) {
+ pr_debug("stat failed\n");
+ break;
+ }
+
+ if (S_ISREG(statbuf.st_mode))
+ ret = unlink(namebuf);
+ else if (S_ISDIR(statbuf.st_mode))
+ ret = remove_directory(namebuf);
+ else {
+ pr_debug("unknown file.\n");
+ ret = -1;
+ }
+ }
+ closedir(dir);
+
+ if (ret < 0)
+ return ret;
+
+ return rmdir(pathname);
+}
+
+static int create_perf_directory(struct perf_ftrace *ftrace)
+{
+ int err;
+ char buf[PATH_MAX];
+ struct stat statbuf;
+
+ scnprintf(buf, sizeof(buf), "%s.dir", ftrace->dirname);
+
+ if (stat(buf, &statbuf) == 0) {
+ /* same name already exists - rename to *.old.dir */
+ char *old_name = malloc(strlen(buf) + 5);
+ if (old_name == NULL)
+ return -1;
+
+ scnprintf(old_name, strlen(buf) + 5,
+ "%s.old.dir", ftrace->dirname);
+
+ if (remove_directory(old_name) < 0) {
+ perror("rmdir");
+ return -1;
+ }
+
+ if (rename(buf, old_name) < 0) {
+ perror("rename");
+ free(old_name);
+ return -1;
+ }
+
+ free(old_name);
+ }
+
+ err = mkdir(buf, 0755);
+ if (err < 0) {
+ perror("mkdir");
+ return -1;
+ }
+
+ strcat(buf, "/perf.header");
+
+ err = open(buf, O_RDWR | O_CREAT | O_TRUNC, 0644);
+ return err;
+}
+
+static void sig_dummy_handler(int sig __maybe_unused)
+{
+ while (!done)
+ continue;
+}
+
+enum {
+ RECORD_STATE__ERROR = -1,
+ RECORD_STATE__INIT,
+ RECORD_STATE__READY,
+ RECORD_STATE__DONE,
+};
+
+struct ftrace_record_arg {
+ struct perf_ftrace *ftrace;
+ int cpu;
+ int state;
+ pthread_t id;
+ struct list_head node;
+};
+
+static int recorder_count;
+pthread_cond_t recorder_ready_cond = PTHREAD_COND_INITIALIZER;
+pthread_cond_t recorder_start_cond = PTHREAD_COND_INITIALIZER;
+pthread_mutex_t recorder_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void *record_ftrace_raw_buffer(void *arg)
+{
+ struct ftrace_record_arg *fra = arg;
+ char buf[4096];
+ char *trace_file;
+ int trace_fd;
+ int output_fd;
+ int ret, n;
+ off_t byte_written = 0;
+ sigset_t sigmask;
+ struct pollfd pollfd = {
+ .events = POLLIN,
+ };
+
+ fra->state = RECORD_STATE__ERROR;
+
+ snprintf(buf, sizeof(buf), "per_cpu/cpu%d/trace_pipe_raw", fra->cpu);
+
+ trace_file = get_tracing_file(buf);
+ if (!trace_file) {
+ pr_err("failed to get trace_pipe_raw\n");
+ goto out;
+ }
+
+ trace_fd = open(trace_file, O_RDONLY);
+
+ put_tracing_file(trace_file);
+
+ if (trace_fd < 0) {
+ pr_err("failed to open trace_pipe_raw\n");
+ goto out;
+ }
+
+ snprintf(buf, sizeof(buf), "%s.dir/trace-cpu%d.buf",
+ fra->ftrace->dirname, fra->cpu);
+
+ output_fd = open(buf, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+ if (output_fd < 0) {
+ pr_err("failed to open output file\n");
+ goto out_close;
+ }
+
+ fra->state = RECORD_STATE__READY;
+
+ /*
+ * block all signals but SIGUSR2.
+ * It'll be used to unblock a recorder to finish.
+ */
+ sigfillset(&sigmask);
+ sigdelset(&sigmask, SIGUSR2);
+ pthread_sigmask(SIG_SETMASK, &sigmask,NULL);
+
+ signal(SIGUSR2, sig_dummy_handler);
+
+ fcntl(trace_fd, F_SETFL, O_NONBLOCK);
+ pollfd.fd = trace_fd;
+
+ /* Now I'm ready */
+ pthread_mutex_lock(&recorder_mutex);
+ recorder_count++;
+ pthread_cond_signal(&recorder_ready_cond);
+ pthread_cond_wait(&recorder_start_cond, &recorder_mutex);
+ pthread_mutex_unlock(&recorder_mutex);
+
+ pr_debug2("now recording for cpu%d\n", fra->cpu);
+
+ while (!done) {
+ ret = poll(&pollfd, 1, -1);
+ if (ret < 0 && errno != EINTR)
+ goto out_close;
+
+ if (pollfd.revents & POLLIN) {
+ n = read(trace_fd, buf, sizeof(buf));
+ if (n < 0)
+ goto out_close;
+ if (write(output_fd, buf, n) != n)
+ goto out_close;
+ byte_written += n;
+ }
+ }
+
+ /* read remaining buffer contents */
+ while (true) {
+ n = read(trace_fd, buf, sizeof(buf));
+ if (n < 0)
+ goto out_close;
+ if (n == 0)
+ break;
+ if (write(output_fd, buf, n) != n)
+ goto out_close;
+ byte_written += n;
+ }
+ fra->state = RECORD_STATE__DONE;
+
+out_close:
+ close(trace_fd);
out:
+ if (fra->state == RECORD_STATE__ERROR) {
+ /*
+ * We need to update recorder_count in this case also
+ * in order to prevent deadlocking in the main thread.
+ */
+ pthread_mutex_lock(&recorder_mutex);
+ recorder_count++;
+ pthread_cond_signal(&recorder_ready_cond);
+ pthread_mutex_unlock(&recorder_mutex);
+ }
+ return fra;
+}
+
+static void *synthesize_raw_data(struct perf_evsel *evsel)
+{
+ void *data = NULL;
+ u32 data_size;
+
+ if (!strcmp(evsel->tp_format->name, "function")) {
+ struct {
+ unsigned short common_type;
+ unsigned char common_flags;
+ unsigned char common_preempt_count;
+ int common_pid;
+ int common_padding;
+
+ unsigned long ip;
+ unsigned long parent_ip;
+ } function_format = {
+ .common_type = evsel->attr.config,
+ };
+
+ data_size = sizeof(function_format);
+
+ data = malloc(data_size + sizeof(u32));
+ if (data == NULL)
+ return NULL;
+
+ memcpy(data, &data_size, sizeof(data_size));
+
+ data += sizeof(data_size);
+ memcpy(data, &function_format, sizeof(function_format));
+ } else if (!strcmp(evsel->tp_format->name, "funcgraph_entry")) {
+ struct {
+ unsigned short common_type;
+ unsigned char common_flags;
+ unsigned char common_preempt_count;
+ int common_pid;
+ int common_padding;
+
+ unsigned long func;
+ int depth;
+ } funcgraph_entry_format = {
+ .common_type = evsel->attr.config,
+ };
+
+ data_size = sizeof(funcgraph_entry_format);
+
+ data = malloc(data_size + sizeof(u32));
+ if (data == NULL)
+ return NULL;
+
+ memcpy(data, &data_size, sizeof(data_size));
+
+ data += sizeof(data_size);
+ memcpy(data, &funcgraph_entry_format,
+ sizeof(funcgraph_entry_format));
+ }
+ return data;
+}
+
+static int do_ftrace_record(struct perf_ftrace *ftrace)
+{
+ int i, err, feat;
+ int perf_fd;
+ LIST_HEAD(recorders);
+ struct perf_session *session;
+ struct ftrace_record_arg *fra, *tmp;
+
+ signal(SIGINT, sig_handler);
+ signal(SIGUSR1, sig_handler);
+ signal(SIGCHLD, sig_handler);
+
+ if (setup_tracing_files(ftrace) < 0)
+ goto out_reset;
+
+ alloc_ftrace_evsel(ftrace);
+
+ perf_fd = create_perf_directory(ftrace);
+ if (perf_fd < 0) {
+ pr_err("failed to create perf directory\n");
+ goto out_reset;
+ }
+
+ /* just use a dummy session for header recording */
+ session = zalloc(sizeof(*session));
+ if (session == NULL) {
+ pr_err("failed to allocate perf session\n");
+ goto out_close;
+ }
+ session->evlist = ftrace->evlist;
+
+ for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
+ perf_header__set_feat(&session->header, feat);
+
+ perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
+ perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
+ perf_header__clear_feat(&session->header, HEADER_PMU_MAPPINGS);
+ perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
+
+ err = perf_session__write_header(session, ftrace->evlist,
+ perf_fd, false);
+ if (err < 0) {
+ pr_err("failed to write perf header\n");
+ goto out_session;
+ }
+
+ /*
+ * We record ftrace's per_cpu buffer so that it'd better having
+ * corresponding cpu maps anyway.
+ */
+ if (!perf_target__has_cpu(&ftrace->target)) {
+ struct cpu_map *new_map;
+
+ new_map = cpu_map__new(NULL);
+ if (new_map == NULL) {
+ pr_err("failed to create new cpu map\n");
+ goto out_session;
+ }
+
+ /* replace existing cpu map */
+ cpu_map__delete(ftrace->evlist->cpus);
+ ftrace->evlist->cpus = new_map;
+ }
+
+ for (i = 0; i < cpu_map__nr(ftrace->evlist->cpus); i++) {
+ fra = malloc(sizeof(*fra));
+ if (fra == NULL) {
+ pr_err("not enough memory!\n");
+ goto out_fra;
+ }
+
+ fra->ftrace = ftrace;
+ fra->cpu = ftrace->evlist->cpus->map[i];
+ fra->state = RECORD_STATE__INIT;
+ list_add_tail(&fra->node, &recorders);
+
+ err = pthread_create(&fra->id, NULL,
+ record_ftrace_raw_buffer, fra);
+ if (err < 0) {
+ pr_err("failed to create recorder thread\n");
+ goto out_fra;
+ }
+ }
+
+ /* wait for all recorders ready */
+ pthread_mutex_lock(&recorder_mutex);
+ while (recorder_count != cpu_map__nr(ftrace->evlist->cpus))
+ pthread_cond_wait(&recorder_ready_cond, &recorder_mutex);
+ pthread_mutex_unlock(&recorder_mutex);
+
+ list_for_each_entry(fra, &recorders, node) {
+ if (fra->state != RECORD_STATE__READY) {
+ pr_err("cpu%d: failed to start recorder", fra->cpu);
+ goto out_fra;
+ }
+ }
+
+ if (write_tracing_file("tracing_on", "1") < 0) {
+ pr_err("can't enable tracing\n");
+ goto out_fra;
+ }
+
+ perf_evlist__start_workload(ftrace->evlist);
+
+ pr_debug2("start recording per cpu buffers\n");
+ pthread_mutex_lock(&recorder_mutex);
+ pthread_cond_broadcast(&recorder_start_cond);
+ pthread_mutex_unlock(&recorder_mutex);
+
+ /* wait for signal/finish */
+ pause();
+
+ if (write_tracing_file("tracing_on", "0") < 0) {
+ pr_err("can't disable tracing\n");
+ goto out_fra;
+ }
+
+ /* signal recorders to terminate */
+ list_for_each_entry(fra, &recorders, node) {
+ pr_debug2("killing recorder thread for cpu%d\n", fra->cpu);
+ pthread_kill(fra->id, SIGUSR2);
+ }
+
+ list_for_each_entry(fra, &recorders, node)
+ pthread_join(fra->id, NULL);
+
+ /* synthesize sample data */
+ list_for_each_entry(fra, &recorders, node) {
+ struct perf_evsel *evsel = perf_evlist__first(ftrace->evlist);
+ union perf_event event = {
+ .sample = {
+ .header = {
+ .type = PERF_RECORD_SAMPLE,
+ .misc = PERF_RECORD_MISC_KERNEL,
+ .size = sizeof(event.sample.header) +
+ evsel->sample_size,
+ },
+ },
+ };
+ struct perf_sample sample = {
+ .cpu = fra->cpu,
+ .period = 1,
+ };
+ void *raw_data;
+ u32 raw_size;
+ int orig_size;
+
+ if (fra->state != RECORD_STATE__DONE) {
+ pr_warning("recorder failed for some reason on cpu%d\n",
+ fra->cpu);
+ continue;
+ }
+
+ perf_event__synthesize_sample(&event, evsel->attr.sample_type,
+ &sample, false);
+
+ raw_data = synthesize_raw_data(evsel);
+ if (raw_data == NULL) {
+ pr_err("synthesizing raw sample failed\n");
+ goto out_fra;
+ }
+
+ /*
+ * start of raw data is the size of raw data excluding itself.
+ */
+ raw_size = sizeof(u32) + (*(u32 *) raw_data);
+
+ orig_size = event.sample.header.size;
+ event.sample.header.size += raw_size;
+
+ err = write(perf_fd, &event.sample, orig_size);
+ if (err != orig_size) {
+ pr_err("write error occurred\n");
+ goto out_fra;
+ }
+
+ err = write(perf_fd, raw_data, raw_size);
+ if (err != (int)raw_size) {
+ pr_err("write error occurred\n");
+ goto out_fra;
+ }
+
+ session->header.data_size += event.sample.header.size;
+ }
+
+ perf_session__write_header(session, ftrace->evlist, perf_fd, true);
+
+out_fra:
+ list_for_each_entry_safe(fra, tmp, &recorders, node) {
+ list_del(&fra->node);
+ free(fra);
+ }
+out_session:
+ free(session);
+out_close:
+ close(perf_fd);
+out_reset:
+ reset_tracing_files(ftrace);
return done ? 0 : -1;
}

@@ -325,6 +848,74 @@ out:
return ret;
}

+static int
+__cmd_ftrace_record(struct perf_ftrace *ftrace, int argc, const char **argv)
+{
+ int ret = -1;
+ const char * const record_usage[] = {
+ "perf ftrace record [<options>] [<command>]",
+ "perf ftrace record [<options>] -- <command> [<options>]",
+ NULL
+ };
+ const struct option record_options[] = {
+ OPT_STRING('t', "tracer", &ftrace->tracer, "tracer",
+ "tracer to use"),
+ OPT_STRING('p', "pid", &ftrace->target.tid, "pid",
+ "trace on existing process id"),
+ OPT_INCR('v', "verbose", &verbose,
+ "be more verbose"),
+ OPT_BOOLEAN('a', "all-cpus", &ftrace->target.system_wide,
+ "system-wide collection from all CPUs"),
+ OPT_STRING('C', "cpu", &ftrace->target.cpu_list, "cpu",
+ "list of cpus to monitor"),
+ OPT_STRING('o', "output", &ftrace->dirname, "dirname",
+ "input directory name to use (default: perf.data)"),
+ OPT_END()
+ };
+
+ argc = parse_options(argc, argv, record_options, record_usage,
+ PARSE_OPT_STOP_AT_NON_OPTION);
+ if (!argc && perf_target__none(&ftrace->target))
+ usage_with_options(record_usage, record_options);
+
+ ret = perf_target__validate(&ftrace->target);
+ if (ret) {
+ char errbuf[512];
+
+ perf_target__strerror(&ftrace->target, ret, errbuf, 512);
+ pr_err("%s\n", errbuf);
+ return -EINVAL;
+ }
+
+ ftrace->evlist = perf_evlist__new();
+ if (ftrace->evlist == NULL)
+ return -ENOMEM;
+
+ ret = perf_evlist__create_maps(ftrace->evlist, &ftrace->target);
+ if (ret < 0)
+ goto out;
+
+ if (ftrace->tracer == NULL)
+ ftrace->tracer = DEFAULT_TRACER;
+
+ if (ftrace->dirname == NULL)
+ ftrace->dirname = DEFAULT_DIRNAME;
+
+ if (argc && perf_evlist__prepare_workload(ftrace->evlist,
+ &ftrace->target,
+ argv, false, true) < 0)
+ goto out_maps;
+
+ ret = do_ftrace_record(ftrace);
+
+out_maps:
+ perf_evlist__delete_maps(ftrace->evlist);
+out:
+ perf_evlist__delete(ftrace->evlist);
+
+ return ret;
+}
+
int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused)
{
int ret;
@@ -332,8 +923,8 @@ int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused)
.target = { .uid = UINT_MAX, },
};
const char * const ftrace_usage[] = {
- "perf ftrace {live} [<options>] [<command>]",
- "perf ftrace {live} [<options>] -- <command> [<options>]",
+ "perf ftrace {live|record} [<options>] [<command>]",
+ "perf ftrace {live|record} [<options>] -- <command> [<options>]",
NULL
};
const struct option ftrace_options[] = {
@@ -352,6 +943,8 @@ int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused)

if (strcmp(argv[0], "live") == 0) {
ret = __cmd_ftrace_live(&ftrace, argc, argv);
+ } else if (strncmp(argv[0], "rec", 3) == 0) {
+ ret = __cmd_ftrace_record(&ftrace, argc, argv);
} else {
usage_with_options(ftrace_usage, ftrace_options);
}
--
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/