Re: [PATCH] perf_events: fix bogus warn_on(_once) in perf_prepare_sample()

From: Stephane Eranian
Date: Thu Apr 08 2010 - 17:18:31 EST


On Thu, Apr 8, 2010 at 11:03 PM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Thu, 2010-04-08 at 22:55 +0200, Peter Zijlstra wrote:
>> On Thu, 2010-04-08 at 22:45 +0200, Stephane Eranian wrote:
>> > Â Â There is a warn_on_once() check for PERF_SAMPLE_RAW which trips
>> > Â Â when using PEBS on both Core and Nehalem. Core PEBS sample size is 144
>> > Â Â bytes and 176 bytes for Nehalem. Both are multiples of 8, but the size
>> > Â Â field is encoded as int, thus the total is never a multiple of 8 which
>> > Â Â trips the check. I think the size should have been u64, but now it is
>> > Â Â too late to change given it is ABI.
>>
>> PEBS hasn't seen -linus yet, so we can fix that.
>>
>> There's various things that do indeed rely on the perf buffer to always
>> be u64 aligned, so this warning isn't bogus at all.
>
> On the subject of PEBS, we need to change the ABI before it does hit
> -linus, I've got something like the below,. but I'm not quite sure of
> it.
>
> We could keep the PERF_RECORD_MISC_EXACT bit and allow non-fixed up IPs
> in PERF_SAMPLE_EVENT_IP fields, that way we can avoid having to add both
> IP and EVENT_IP to samples.
>
You always have the original IP in the PEBS sample, so why add yet another way
of getting to the same data?

> ---
> Subject: perf: Change the PEBS interface
> From: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> Date: Tue Mar 30 13:35:58 CEST 2010
>
> This patch changes the PEBS interface from perf_event_attr::precise
> and PERF_RECORD_MISC_EXACT to perf_event_attr::precise and
> PERF_SAMPLE_EVENT_IP.
>
> The rationale is that .precise=1 !PERF_SAMPLE_EVENT_IP could be used
> to implement buffered PEBS.
>
I am not sure I understand what you mean by buffered PEBS. Are you talking
about using PEBS buffer bigger than one entry?
If so, how can you do:
- the LBR based fixups for multiple samples (on PMU interrupt)
- attribute PID/TID in system-wide mode


> Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> LKML-Reference: <new-submission>
> ---
> Âarch/x86/include/asm/perf_event.h     |  Â9 --
> Âarch/x86/kernel/cpu/perf_event_intel_ds.c | Â133 +++++++++++++-----------------
> Âinclude/linux/perf_event.h        Â|  Â7 +
> Âkernel/perf_event.c            |  Â6 +
> Âtools/perf/builtin-top.c         Â|  Â9 +-
> Â5 files changed, 79 insertions(+), 85 deletions(-)
>
> Index: linux-2.6/arch/x86/include/asm/perf_event.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/perf_event.h
> +++ linux-2.6/arch/x86/include/asm/perf_event.h
> @@ -128,21 +128,12 @@ extern void perf_events_lapic_init(void)
>
> Â#define PERF_EVENT_INDEX_OFFSET Â Â Â Â Â Â Â Â Â Â Â Â0
>
> -/*
> - * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
> - * This flag is otherwise unused and ABI specified to be 0, so nobody should
> - * care what we do with it.
> - */
> -#define PERF_EFLAGS_EXACT Â Â Â(1UL << 3)
> -
> Â#define perf_misc_flags(regs) Â Â Â Â Â Â Â Â Â Â Â Â Â\
> Â({ Â Â int misc = 0; Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â \
> Â Â Â Âif (user_mode(regs)) Â Â Â Â Â Â Â Â Â Â Â Â Â Â\
> Â Â Â Â Â Â Â Âmisc |= PERF_RECORD_MISC_USER; Â Â Â Â Â\
>    Âelse                      Â\
> Â Â Â Â Â Â Â Âmisc |= PERF_RECORD_MISC_KERNEL; Â Â Â Â\
> - Â Â Â if (regs->flags & PERF_EFLAGS_EXACT) Â Â Â Â Â Â\
> - Â Â Â Â Â Â Â misc |= PERF_RECORD_MISC_EXACT; Â Â Â Â \
> Â Â Â Âmisc; })
>
> Â#define perf_instruction_pointer(regs) ((regs)->ip)
> Index: linux-2.6/arch/x86/kernel/cpu/perf_event_intel_ds.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ linux-2.6/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -330,7 +330,8 @@ static void intel_pmu_pebs_enable(struct
> Â Â Â Âcpuc->pebs_enabled |= 1ULL << hwc->idx;
> Â Â Â ÂWARN_ON_ONCE(cpuc->enabled);
>
> - Â Â Â if (x86_pmu.intel_cap.pebs_trap)
> + Â Â Â if (x86_pmu.intel_cap.pebs_trap &&
> + Â Â Â Â Â Â Â Â Â Â Â (event->attr.sample_type & PERF_SAMPLE_EVENT_IP))
> Â Â Â Â Â Â Â Âintel_pmu_lbr_enable(event);
> Â}
>
> @@ -345,7 +346,8 @@ static void intel_pmu_pebs_disable(struc
>
> Â Â Â Âhwc->config |= ARCH_PERFMON_EVENTSEL_INT;
>
> - Â Â Â if (x86_pmu.intel_cap.pebs_trap)
> + Â Â Â if (x86_pmu.intel_cap.pebs_trap &&
> + Â Â Â Â Â Â Â Â Â Â Â (event->attr.sample_type & PERF_SAMPLE_EVENT_IP))
> Â Â Â Â Â Â Â Âintel_pmu_lbr_disable(event);
> Â}
>
> @@ -376,12 +378,12 @@ static inline bool kernel_ip(unsigned lo
> Â#endif
> Â}
>
> -static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
> +static int intel_pmu_pebs_fixup_ip(unsigned long *ipp)
> Â{
> Â Â Â Âstruct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> Â Â Â Âunsigned long from = cpuc->lbr_entries[0].from;
> Â Â Â Âunsigned long old_to, to = cpuc->lbr_entries[0].to;
> - Â Â Â unsigned long ip = regs->ip;
> + Â Â Â unsigned long ip = *ipp;
>
> Â Â Â Â/*
> Â Â Â Â * We don't need to fixup if the PEBS assist is fault like
> @@ -412,7 +414,7 @@ static int intel_pmu_pebs_fixup_ip(struc
> Â Â Â Â * We sampled a branch insn, rewind using the LBR stack
> Â Â Â Â */
> Â Â Â Âif (ip == to) {
> - Â Â Â Â Â Â Â regs->ip = from;
> + Â Â Â Â Â Â Â *ipp = from;
> Â Â Â Â Â Â Â Âreturn 1;
> Â Â Â Â}
>
> @@ -439,7 +441,7 @@ static int intel_pmu_pebs_fixup_ip(struc
> Â Â Â Â} while (to < ip);
>
> Â Â Â Âif (to == ip) {
> - Â Â Â Â Â Â Â regs->ip = old_to;
> + Â Â Â Â Â Â Â *ipp = old_to;
> Â Â Â Â Â Â Â Âreturn 1;
> Â Â Â Â}
>
> @@ -452,15 +454,62 @@ static int intel_pmu_pebs_fixup_ip(struc
>
> Âstatic int intel_pmu_save_and_restart(struct perf_event *event);
>
> +static void __intel_pmu_pebs_event(struct perf_event *event,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âstruct pt_regs *iregs, void *__pebs)
> +{
> + Â Â Â /*
> + Â Â Â Â* We cast to pebs_record_core since that is a subset of
> + Â Â Â Â* both formats and we don't use the other fields in this
> + Â Â Â Â* routine.
> + Â Â Â Â*/
> + Â Â Â struct pebs_record_core *pebs = __pebs;
> + Â Â Â struct perf_sample_data data;
> + Â Â Â struct perf_raw_record raw;
> + Â Â Â struct pt_regs regs;
> +
> + Â Â Â if (!intel_pmu_save_and_restart(event))
> + Â Â Â Â Â Â Â return;
> +
> + Â Â Â perf_sample_data_init(&data, 0);
> + Â Â Â data.period = event->hw.last_period;
> +
> + Â Â Â if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> + Â Â Â Â Â Â Â raw.size = x86_pmu.pebs_record_size;
> + Â Â Â Â Â Â Â raw.data = pebs;
> + Â Â Â Â Â Â Â data.raw = &raw;
> + Â Â Â }
> +
> + Â Â Â /*
> + Â Â Â Â* We use the interrupt regs as a base because the PEBS record
> + Â Â Â Â* does not contain a full regs set, specifically it seems to
> + Â Â Â Â* lack segment descriptors, which get used by things like
> + Â Â Â Â* user_mode().
> + Â Â Â Â*
> + Â Â Â Â* In the simple case fix up only the IP and BP,SP regs, for
> + Â Â Â Â* PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
> + Â Â Â Â* A possible PERF_SAMPLE_REGS will have to transfer all regs.
> + Â Â Â Â*/
> + Â Â Â regs = *iregs;
> + Â Â Â regs.ip = pebs->ip;
> + Â Â Â regs.bp = pebs->bp;
> + Â Â Â regs.sp = pebs->sp;
> +
> + Â Â Â if (event->attr.sample_type & PERF_SAMPLE_EVENT_IP) {
> + Â Â Â Â Â Â Â unsigned long event_ip = pebs->ip;
> + Â Â Â Â Â Â Â if (intel_pmu_pebs_fixup_ip(&event_ip))
> + Â Â Â Â Â Â Â Â Â Â Â data.event_ip = event_ip;
> + Â Â Â }
> +
> + Â Â Â if (perf_event_overflow(event, 1, &data, &regs))
> + Â Â Â Â Â Â Â x86_pmu_stop(event);
> +}
> +
> Âstatic void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
> Â{
> Â Â Â Âstruct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> Â Â Â Âstruct debug_store *ds = cpuc->ds;
> Â Â Â Âstruct perf_event *event = cpuc->events[0]; /* PMC0 only */
> Â Â Â Âstruct pebs_record_core *at, *top;
> - Â Â Â struct perf_sample_data data;
> - Â Â Â struct perf_raw_record raw;
> - Â Â Â struct pt_regs regs;
> Â Â Â Âint n;
>
> Â Â Â Âif (!ds || !x86_pmu.pebs)
> @@ -486,9 +535,6 @@ static void intel_pmu_drain_pebs_core(st
> Â Â Â Âif (n <= 0)
> Â Â Â Â Â Â Â Âreturn;
>
> - Â Â Â if (!intel_pmu_save_and_restart(event))
> - Â Â Â Â Â Â Â return;
> -
> Â Â Â Â/*
> Â Â Â Â * Should not happen, we program the threshold at 1 and do not
> Â Â Â Â * set a reset value.
> @@ -496,37 +542,7 @@ static void intel_pmu_drain_pebs_core(st
> Â Â Â ÂWARN_ON_ONCE(n > 1);
> Â Â Â Âat += n - 1;
>
> - Â Â Â perf_sample_data_init(&data, 0);
> - Â Â Â data.period = event->hw.last_period;
> -
> - Â Â Â if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> - Â Â Â Â Â Â Â raw.size = x86_pmu.pebs_record_size;
> - Â Â Â Â Â Â Â raw.data = at;
> - Â Â Â Â Â Â Â data.raw = &raw;
> - Â Â Â }
> -
> - Â Â Â /*
> - Â Â Â Â* We use the interrupt regs as a base because the PEBS record
> - Â Â Â Â* does not contain a full regs set, specifically it seems to
> - Â Â Â Â* lack segment descriptors, which get used by things like
> - Â Â Â Â* user_mode().
> - Â Â Â Â*
> - Â Â Â Â* In the simple case fix up only the IP and BP,SP regs, for
> - Â Â Â Â* PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
> - Â Â Â Â* A possible PERF_SAMPLE_REGS will have to transfer all regs.
> - Â Â Â Â*/
> - Â Â Â regs = *iregs;
> - Â Â Â regs.ip = at->ip;
> - Â Â Â regs.bp = at->bp;
> - Â Â Â regs.sp = at->sp;
> -
> - Â Â Â if (intel_pmu_pebs_fixup_ip(&regs))
> - Â Â Â Â Â Â Â regs.flags |= PERF_EFLAGS_EXACT;
> - Â Â Â else
> - Â Â Â Â Â Â Â regs.flags &= ~PERF_EFLAGS_EXACT;
> -
> - Â Â Â if (perf_event_overflow(event, 1, &data, &regs))
> - Â Â Â Â Â Â Â x86_pmu_stop(event);
> + Â Â Â __intel_pmu_pebs_event(event, iregs, at);
> Â}
>
> Âstatic void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> @@ -534,10 +550,7 @@ static void intel_pmu_drain_pebs_nhm(str
> Â Â Â Âstruct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> Â Â Â Âstruct debug_store *ds = cpuc->ds;
> Â Â Â Âstruct pebs_record_nhm *at, *top;
> - Â Â Â struct perf_sample_data data;
> Â Â Â Âstruct perf_event *event = NULL;
> - Â Â Â struct perf_raw_record raw;
> - Â Â Â struct pt_regs regs;
> Â Â Â Âu64 status = 0;
> Â Â Â Âint bit, n;
>
> @@ -579,33 +592,7 @@ static void intel_pmu_drain_pebs_nhm(str
> Â Â Â Â Â Â Â Âif (!event || bit >= MAX_PEBS_EVENTS)
> Â Â Â Â Â Â Â Â Â Â Â Âcontinue;
>
> - Â Â Â Â Â Â Â if (!intel_pmu_save_and_restart(event))
> - Â Â Â Â Â Â Â Â Â Â Â continue;
> -
> - Â Â Â Â Â Â Â perf_sample_data_init(&data, 0);
> - Â Â Â Â Â Â Â data.period = event->hw.last_period;
> -
> - Â Â Â Â Â Â Â if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> - Â Â Â Â Â Â Â Â Â Â Â raw.size = x86_pmu.pebs_record_size;
> - Â Â Â Â Â Â Â Â Â Â Â raw.data = at;
> - Â Â Â Â Â Â Â Â Â Â Â data.raw = &raw;
> - Â Â Â Â Â Â Â }
> -
> - Â Â Â Â Â Â Â /*
> - Â Â Â Â Â Â Â Â* See the comment in intel_pmu_drain_pebs_core()
> - Â Â Â Â Â Â Â Â*/
> - Â Â Â Â Â Â Â regs = *iregs;
> - Â Â Â Â Â Â Â regs.ip = at->ip;
> - Â Â Â Â Â Â Â regs.bp = at->bp;
> - Â Â Â Â Â Â Â regs.sp = at->sp;
> -
> - Â Â Â Â Â Â Â if (intel_pmu_pebs_fixup_ip(&regs))
> - Â Â Â Â Â Â Â Â Â Â Â regs.flags |= PERF_EFLAGS_EXACT;
> - Â Â Â Â Â Â Â else
> - Â Â Â Â Â Â Â Â Â Â Â regs.flags &= ~PERF_EFLAGS_EXACT;
> -
> - Â Â Â Â Â Â Â if (perf_event_overflow(event, 1, &data, &regs))
> - Â Â Â Â Â Â Â Â Â Â Â x86_pmu_stop(event);
> + Â Â Â Â Â Â Â __intel_pmu_pebs_event(event, iregs, at);
> Â Â Â Â}
> Â}
>
> Index: linux-2.6/include/linux/perf_event.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_event.h
> +++ linux-2.6/include/linux/perf_event.h
> @@ -125,8 +125,9 @@ enum perf_event_sample_format {
> Â Â Â ÂPERF_SAMPLE_PERIOD Â Â Â Â Â Â Â Â Â Â Â= 1U << 8,
> Â Â Â ÂPERF_SAMPLE_STREAM_ID Â Â Â Â Â Â Â Â Â = 1U << 9,
> Â Â Â ÂPERF_SAMPLE_RAW Â Â Â Â Â Â Â Â Â Â Â Â = 1U << 10,
> + Â Â Â PERF_SAMPLE_EVENT_IP Â Â Â Â Â Â Â Â Â Â= 1U << 11,
>
> - Â Â Â PERF_SAMPLE_MAX = 1U << 11, Â Â Â Â Â Â /* non-ABI */
> + Â Â Â PERF_SAMPLE_MAX = 1U << 12, Â Â Â Â Â Â /* non-ABI */
> Â};
>
> Â/*
> @@ -294,7 +295,6 @@ struct perf_event_mmap_page {
> Â#define PERF_RECORD_MISC_USER Â Â Â Â Â Â Â Â Â(2 << 0)
> Â#define PERF_RECORD_MISC_HYPERVISOR Â Â Â Â Â Â(3 << 0)
>
> -#define PERF_RECORD_MISC_EXACT Â Â Â Â Â Â Â Â (1 << 14)
> Â/*
> Â* Reserve the last bit to indicate some extended misc field
> Â*/
> @@ -389,6 +389,7 @@ enum perf_event_type {
>     *   Âstruct perf_event_header    Âheader;
> Â Â Â Â *
> Â Â Â Â * Â Â Â{ u64 Â Â Â Â Â Â Â Â Â ip; Â Â Â } && PERF_SAMPLE_IP
> + Â Â Â Â* Â Â Â{ u64 Â Â Â Â Â Â Â Â Â event_ip; } && PERF_SAMPLE_EVENT_IP
> Â Â Â Â * Â Â Â{ u32 Â Â Â Â Â Â Â Â Â pid, tid; } && PERF_SAMPLE_TID
> Â Â Â Â * Â Â Â{ u64 Â Â Â Â Â Â Â Â Â time; Â Â } && PERF_SAMPLE_TIME
> Â Â Â Â * Â Â Â{ u64 Â Â Â Â Â Â Â Â Â addr; Â Â } && PERF_SAMPLE_ADDR
> @@ -804,6 +805,7 @@ struct perf_sample_data {
> Â Â Â Âu64 Â Â Â Â Â Â Â Â Â Â Â Â Â Â type;
>
> Â Â Â Âu64 Â Â Â Â Â Â Â Â Â Â Â Â Â Â ip;
> + Â Â Â u64 Â Â Â Â Â Â Â Â Â Â Â Â Â Â event_ip;
> Â Â Â Âstruct {
> Â Â Â Â Â Â Â Âu32 Â Â pid;
> Â Â Â Â Â Â Â Âu32 Â Â tid;
> @@ -824,6 +826,7 @@ struct perf_sample_data {
> Âstatic inline
> Âvoid perf_sample_data_init(struct perf_sample_data *data, u64 addr)
> Â{
> + Â Â Â data->event_ip = 0;
> Â Â Â Âdata->addr = addr;
> Â Â Â Âdata->raw Â= NULL;
> Â}
> Index: linux-2.6/kernel/perf_event.c
> ===================================================================
> --- linux-2.6.orig/kernel/perf_event.c
> +++ linux-2.6/kernel/perf_event.c
> @@ -3158,6 +3158,9 @@ void perf_output_sample(struct perf_outp
> Â Â Â Âif (sample_type & PERF_SAMPLE_IP)
> Â Â Â Â Â Â Â Âperf_output_put(handle, data->ip);
>
> + Â Â Â if (sample_type & PERF_SAMPLE_EVENT_IP)
> + Â Â Â Â Â Â Â perf_output_put(handle, data->event_ip);
> +
> Â Â Â Âif (sample_type & PERF_SAMPLE_TID)
> Â Â Â Â Â Â Â Âperf_output_put(handle, data->tid_entry);
>
> @@ -3237,6 +3240,9 @@ void perf_prepare_sample(struct perf_eve
> Â Â Â Â Â Â Â Âheader->size += sizeof(data->ip);
> Â Â Â Â}
>
> + Â Â Â if (sample_type & PERF_SAMPLE_EVENT_IP)
> + Â Â Â Â Â Â Â header->size += sizeof(data->event_ip);
> +
> Â Â Â Âif (sample_type & PERF_SAMPLE_TID) {
> Â Â Â Â Â Â Â Â/* namespace issues */
> Â Â Â Â Â Â Â Âdata->tid_entry.pid = perf_event_pid(event, current);
> Index: linux-2.6/tools/perf/builtin-top.c
> ===================================================================
> --- linux-2.6.orig/tools/perf/builtin-top.c
> +++ linux-2.6/tools/perf/builtin-top.c
> @@ -975,8 +975,10 @@ static void event__process_sample(const
> Â Â Â Â Â Â Â Âreturn;
> Â Â Â Â}
>
> - Â Â Â if (self->header.misc & PERF_RECORD_MISC_EXACT)
> + Â Â Â if (ip)
> Â Â Â Â Â Â Â Âexact_samples++;
> + Â Â Â else
> + Â Â Â Â Â Â Â return;
>
> Â Â Â Âif (event__preprocess_sample(self, session, &al, symbol_filter) < 0 ||
> Â Â Â Â Â Âal.filtered)
> @@ -1169,6 +1171,11 @@ static void start_counter(int i, int cou
>
>    Âattr->sample_type    = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
>
> + Â Â Â if (attr->precise) {
> + Â Â Â Â Â Â Â attr->sample_type &= ~PERF_SAMPLE_IP;
> + Â Â Â Â Â Â Â attr->sample_type |= PERF_SAMPLE_EVENT_IP;
> + Â Â Â }
> +
> Â Â Â Âif (freq) {
>        Âattr->sample_type    |= PERF_SAMPLE_PERIOD;
>        Âattr->freq       Â= 1;
>
>
>
¢éì®&Þ~º&¶¬–+-±éÝ¥Šw®žË±Êâmébžìdz¹Þ)í…æèw*jg¬±¨¶‰šŽŠÝj/êäz¹ÞŠà2ŠÞ¨è­Ú&¢)ß«a¶Úþø®G«éh®æj:+v‰¨Šwè†Ù>Wš±êÞiÛaxPjØm¶Ÿÿà -»+ƒùdš_