Re: [PATCH v2 2/3] perf kvm: enable record|report feature on powerpc

From: Arnaldo Carvalho de Melo
Date: Mon Feb 01 2016 - 16:06:15 EST


Em Fri, Jan 22, 2016 at 11:28:11AM +0530, Ravi Bangoria escreveu:
> This patch contains core logic for enabling perf kvm {record|report} on
> powerpc.
>
> For perf kvm record,
> This patch will replace default event(cycle) with kvm_hv:kvm_guest_exit
> while recording guest data from host.
>
> For perf kvm report,
> This patch makes use of the 'kvm_guest_exit' tracepoint and checks the
> exit reason for any kvm exit. If it is HV_DECREMENTER, then the
> instruction pointer dumped along with this tracepoint is retrieved and
> mapped with the guest kallsyms.
>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@xxxxxxxxxxxxxxxxxx>
> Signed-off-by: Hemant Kumar <hemant@xxxxxxxxxxxxxxxxxx>
> ---
> changes in v2:
> - Breakdown of v1 patch into two sub patches
> - Merged parse-tp.c and evlist.c from tools/perf/arch/powerpc/util/ into
> single file with name kvm.c
>
> tools/perf/arch/powerpc/util/Build | 1 +
> tools/perf/arch/powerpc/util/kvm.c | 104 +++++++++++++++++++++++++++++++++++++
> tools/perf/util/event.c | 12 ++++-
> tools/perf/util/evlist.c | 9 ++++
> tools/perf/util/evlist.h | 1 +
> tools/perf/util/evsel.c | 7 +++
> tools/perf/util/evsel.h | 4 ++
> tools/perf/util/session.c | 9 ++--
> tools/perf/util/util.c | 5 ++
> tools/perf/util/util.h | 1 +
> 10 files changed, 147 insertions(+), 6 deletions(-)
> create mode 100644 tools/perf/arch/powerpc/util/kvm.c
>
> diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
> index 7b8b0d1..eb819e0 100644
> --- a/tools/perf/arch/powerpc/util/Build
> +++ b/tools/perf/arch/powerpc/util/Build
> @@ -1,5 +1,6 @@
> libperf-y += header.o
> libperf-y += sym-handling.o
> +libperf-y += kvm.o
>
> libperf-$(CONFIG_DWARF) += dwarf-regs.o
> libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
> diff --git a/tools/perf/arch/powerpc/util/kvm.c b/tools/perf/arch/powerpc/util/kvm.c
> new file mode 100644
> index 0000000..317f29a
> --- /dev/null
> +++ b/tools/perf/arch/powerpc/util/kvm.c
> @@ -0,0 +1,104 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * Copyright (C) 2016 Hemant Kumar Shaw, IBM Corporation
> + * Copyright (C) 2016 Ravikumar B. Bangoria, IBM Corporation
> + */
> +
> +#include <linux/err.h>
> +#include "../../../util/evsel.h"
> +#include "../../../util/evlist.h"
> +#include "../../../util/trace-event.h"
> +#include "../../../util/session.h"
> +#include "../../../util/util.h"
> +
> +#define KVMPPC_EXIT "kvm_hv:kvm_guest_exit"
> +#define HV_DECREMENTER 2432
> +#define HV_BIT 3
> +#define PR_BIT 49
> +#define PPC_MAX 63
> +
> +/*
> + * To sample for only guest, record kvm_hv:kvm_guest_exit.
> + * Otherwise go via normal way(cycles).
> + */
> +int perf_evlist__arch_add_default(struct perf_evlist *evlist)
> +{
> + struct perf_evsel *evsel;
> +
> + if (!perf_guest_only())
> + return -1;
> +
> + evsel = perf_evsel__newtp_idx("kvm_hv", "kvm_guest_exit", 0);
> + if (IS_ERR(evsel))
> + return PTR_ERR(evsel);
> +
> + perf_evlist__add(evlist, evsel);
> + return 0;
> +}
> +
> +static bool is_kvmppc_exit_event(struct perf_evsel *evsel)
> +{
> + static unsigned int kvmppc_exit;
> +
> + if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
> + return false;
> +
> + if (unlikely(kvmppc_exit == 0)) {
> + if (strcmp(KVMPPC_EXIT, evsel->name))
> + return false;
> + kvmppc_exit = evsel->attr.config;
> + } else if (kvmppc_exit != evsel->attr.config) {
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static bool is_hv_dec_trap(struct perf_evsel *evsel, struct perf_sample *sample)
> +{
> + int trap = perf_evsel__intval(evsel, sample, "trap");
> + return trap == HV_DECREMENTER;
> +}
> +
> +/*
> + * Get the instruction pointer from the tracepoint data
> + */
> +u64 arch__get_ip(struct perf_evsel *evsel, struct perf_sample *sample)
> +{
> + if (perf_guest_only() &&
> + is_kvmppc_exit_event(evsel) &&
> + is_hv_dec_trap(evsel, sample))
> + return perf_evsel__intval(evsel, sample, "pc");
> +
> + return sample->ip;
> +}
> +
> +/*
> + * Get the HV and PR bits and accordingly, determine the cpumode
> + */
> +u8 arch__get_cpumode(const union perf_event *event, struct perf_evsel *evsel,
> + struct perf_sample *sample)
> +{
> + unsigned long hv, pr, msr;
> + u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
> +
> + if (!perf_guest_only() || !is_kvmppc_exit_event(evsel))
> + goto ret;
> +
> + if (sample->raw_data && is_hv_dec_trap(evsel, sample)) {
> + msr = perf_evsel__intval(evsel, sample, "msr");
> + hv = msr & ((unsigned long)1 << (PPC_MAX - HV_BIT));
> + pr = msr & ((unsigned long)1 << (PPC_MAX - PR_BIT));
> +
> + if (!hv && pr)
> + cpumode = PERF_RECORD_MISC_GUEST_USER;
> + else
> + cpumode = PERF_RECORD_MISC_GUEST_KERNEL;
> + }
> +
> +ret:
> + return cpumode;
> +}
> diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
> index f86e172..b8105a6 100644
> --- a/tools/perf/util/event.c
> +++ b/tools/perf/util/event.c
> @@ -1291,6 +1291,13 @@ void thread__find_addr_location(struct thread *thread,
> al->sym = NULL;
> }
>
> +u8 __weak arch__get_cpumode(const union perf_event *event,
> + struct perf_evsel *evsel __maybe_unused,
> + struct perf_sample *sample __maybe_unused)
> +{
> + return event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
> +}

This hunk and the next should be in the previous patch, which does not
even compile...

You have to compile patch by patch; we can't just test at the end of a
patchkit like this, as that destroys bisection ;-\

Also you first need to put in place a way to override how to obtain the
cpumode, then you should use it.

Also this mode doesn't look feasible at all: think about processing
perf.data files generated on !powerpc systems being analysed on a
powerpc system. This has to be dependent on the architecture of the
machine where the perf.data file was recorded, not on the architecture
of the machine the binary was built for.

It is only when you do live analysis, like with 'perf trace' and 'perf
top', that it's guaranteed to be all on the same machine.

IIRC in one of the patches in this series you introduce and use a
library function in the same patch; please break it into two patches as
well. Lemme see what the name is...

Yeah, it is also in this patch:

perf_evlist__arch_add_default(struct perf_evlist *evlist)

Please add this in a separate patch, stating in the changeset comment
why it is needed and how architectures can override it.

- Arnaldo

> +
> /*
> * Callers need to drop the reference to al->thread, obtained in
> * machine__findnew_thread()
> @@ -1301,13 +1308,14 @@ int perf_event__preprocess_sample(const union perf_event *event,
> struct perf_sample *sample,
> struct perf_evsel *evsel)
> {
> - u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
> + u8 cpumode;
> struct thread *thread = machine__findnew_thread(machine, sample->pid,
> sample->tid);
> -
> if (thread == NULL)
> return -1;
>
> + al->cpumode = cpumode = arch__get_cpumode(event, evsel, sample);
> +
> dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread->tid);
> /*
> * Have we already created the kernel maps for this machine?
> diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
> index d81f13d..d0dca72 100644
> --- a/tools/perf/util/evlist.c
> +++ b/tools/perf/util/evlist.c
> @@ -231,6 +231,12 @@ void perf_event_attr__set_max_precise_ip(struct perf_event_attr *attr)
> }
> }
>
> +int __weak
> +perf_evlist__arch_add_default(struct perf_evlist *evlist __maybe_unused)
> +{
> + return -1;
> +}
> +
> int perf_evlist__add_default(struct perf_evlist *evlist)
> {
> struct perf_event_attr attr = {
> @@ -239,6 +245,9 @@ int perf_evlist__add_default(struct perf_evlist *evlist)
> };
> struct perf_evsel *evsel;
>
> + if (!perf_evlist__arch_add_default(evlist))
> + return 0;
> +
> event_attr_init(&attr);
>
> perf_event_attr__set_max_precise_ip(&attr);
> diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
> index 7c4d9a2..98e24cd 100644
> --- a/tools/perf/util/evlist.h
> +++ b/tools/perf/util/evlist.h
> @@ -75,6 +75,7 @@ void perf_evlist__delete(struct perf_evlist *evlist);
>
> void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
> void perf_evlist__remove(struct perf_evlist *evlist, struct perf_evsel *evsel);
> +int perf_evlist__arch_add_default(struct perf_evlist *evlist);
> int perf_evlist__add_default(struct perf_evlist *evlist);
> int __perf_evlist__add_default_attrs(struct perf_evlist *evlist,
> struct perf_event_attr *attrs, size_t nr_attrs);
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 4678086..afe1091 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -1607,6 +1607,12 @@ static inline bool overflow(const void *endp, u16 max_size, const void *offset,
> #define OVERFLOW_CHECK_u64(offset) \
> OVERFLOW_CHECK(offset, sizeof(u64), sizeof(u64))
>
> +u64 __weak arch__get_ip(struct perf_evsel *evsel __maybe_unused,
> + struct perf_sample *sample)
> +{
> + return sample->ip;
> +}
> +
> int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
> struct perf_sample *data)
> {
> @@ -1780,6 +1786,7 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
> OVERFLOW_CHECK(array, data->raw_size, max_size);
> data->raw_data = (void *)array;
> array = (void *)array + data->raw_size;
> + data->ip = arch__get_ip(evsel, data);
> }
>
> if (type & PERF_SAMPLE_BRANCH_STACK) {
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index 8e75434..eb6f52e 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -400,4 +400,8 @@ typedef int (*attr__fprintf_f)(FILE *, const char *, const char *, void *);
> int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
> attr__fprintf_f attr__fprintf, void *priv);
>
> +u64 arch__get_ip(struct perf_evsel *evsel, struct perf_sample *sample);
> +u8 arch__get_cpumode(const union perf_event *event, struct perf_evsel *evsel,
> + struct perf_sample *sample);
> +
> #endif /* __PERF_EVSEL_H */
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 40b7a0d..1081ee0 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -1130,10 +1130,11 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
> }
>
> static struct machine *machines__find_for_cpumode(struct machines *machines,
> - union perf_event *event,
> - struct perf_sample *sample)
> + union perf_event *event,
> + struct perf_sample *sample,
> + struct perf_evsel *evsel)
> {
> - const u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
> + u8 cpumode = arch__get_cpumode(event, evsel, sample);
> struct machine *machine;
>
> if (perf_guest &&
> @@ -1237,7 +1238,7 @@ static int machines__deliver_event(struct machines *machines,
>
> evsel = perf_evlist__id2evsel(evlist, sample->id);
>
> - machine = machines__find_for_cpumode(machines, event, sample);
> + machine = machines__find_for_cpumode(machines, event, sample, evsel);
>
> switch (event->header.type) {
> case PERF_RECORD_SAMPLE:
> diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
> index 7a2da7e..5e48ef1 100644
> --- a/tools/perf/util/util.c
> +++ b/tools/perf/util/util.c
> @@ -37,6 +37,11 @@ bool test_attr__enabled;
> bool perf_host = true;
> bool perf_guest = false;
>
> +bool perf_guest_only(void)
> +{
> + return !perf_host && perf_guest;
> +}
> +
> void event_attr_init(struct perf_event_attr *attr)
> {
> if (!perf_host)
> diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
> index 61650f0..eff1d8f 100644
> --- a/tools/perf/util/util.h
> +++ b/tools/perf/util/util.h
> @@ -344,5 +344,6 @@ int fetch_kernel_version(unsigned int *puint,
>
> const char *perf_tip(const char *dirpath);
> bool is_regular_file(const char *file);
> +bool perf_guest_only(void);
>
> #endif /* GIT_COMPAT_UTIL_H */
> --
> 2.1.4