Re: bts & perf_counters

From: Peter Zijlstra
Date: Mon Jul 06 2009 - 11:34:30 EST


On Tue, 2009-06-30 at 08:32 +0100, Metzger, Markus T wrote:
>
> >> A debugger is interested in the tail of the execution trace. It
> >> won't poll the trace data (which would be far too much overhead).
> >> How would a user synchronize on the profile stream when the
> >> profiled process is stopped?
> >
> >Yeah, with a new perf_attr flag that activates overwrite this
> >use case would be solved, right? The debugger has to make sure the
> >task is stopped before reading out the buffer, but that's pretty
> >much all.
>
> I'm not sure about that. The way I read struct perf_counter_mmap_page,
> data_head points to the end of the stream (I would guess one byte
> beyond the last record).
>
> I think we can ignore data_tail in the debug scenario since debuggers
> won't poll. We can further assume a buffer overflow no matter how big
> the ring buffer - branch trace grows terribly fast and we don't want
> normal uses to lock megabytes of memory, do we?
>
> How would a debugger find the beginning of the event stream to start
> reading?

Something like the below? (utterly untested)

---
include/linux/perf_counter.h | 3 ++-
kernel/perf_counter.c | 35 +++++++++++++++++++++++++++++++++++
2 files changed, 37 insertions(+), 1 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5e970c7..95b5257 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -180,8 +180,9 @@ struct perf_counter_attr {
freq : 1, /* use freq, not period */
inherit_stat : 1, /* per task counts */
enable_on_exec : 1, /* next exec enables */
+ overwrite : 1, /* overwrite mmap data */

- __reserved_1 : 51;
+ __reserved_1 : 50;

__u32 wakeup_events; /* wakeup every n events */
__u32 __reserved_2;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d55a50d..0c64d53 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2097,6 +2097,13 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
nr_pages = (vma_size / PAGE_SIZE) - 1;

/*
+ * attr->overwrite and PROT_WRITE both use ->data_tail in an exclusive
+ * manner, disallow this combination.
+ */
+ if ((vma->vm_flags & VM_WRITE) && counter->attr.overwrite)
+ return -EINVAL;
+
+ /*
* If we have data pages ensure they're a power-of-two number, so we
* can do bitmasks instead of modulo.
*/
@@ -2329,6 +2336,7 @@ struct perf_output_handle {
struct perf_counter *counter;
struct perf_mmap_data *data;
unsigned long head;
+ unsigned long tail;
unsigned long offset;
int nmi;
int sample;
@@ -2363,6 +2371,31 @@ static bool perf_output_space(struct perf_mmap_data *data,
return true;
}

+/* Overwrite mode: advance ->data_tail past old records until @head fits. */
+static void perf_output_tail(struct perf_mmap_data *data, unsigned long head)
+{
+	__u64 *tailp = &data->user_page->data_tail;
+	struct perf_event_header *header;
+	unsigned long pages_mask, nr;
+	unsigned long tail, new;
+	unsigned long size;
+
+	if (data->writable)
+		return;
+
+	size = data->nr_pages << PAGE_SHIFT;
+	pages_mask = data->nr_pages - 1;
+	tail = ACCESS_ONCE(*tailp);
+
+	/* All operands are unsigned: without the cast this is never negative. */
+	while ((long)(tail + size - head) < 0) {
+		nr = (tail >> PAGE_SHIFT) & pages_mask;
+		header = (struct perf_event_header *)(data->pages[nr] + (tail & (PAGE_SIZE - 1)));
+		new = tail + header->size;
+		tail = atomic64_cmpxchg(tailp, tail, new);
+	}
+}
+
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->data->poll, POLL_IN);
@@ -2535,6 +2568,8 @@ static int perf_output_begin(struct perf_output_handle *handle,
head += size;
if (unlikely(!perf_output_space(data, offset, head)))
goto fail;
+ if (unlikely(counter->attr.overwrite))
+ perf_output_tail(data, head);
} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);

handle->offset = offset;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/