[RFC PATCH 0/2] Introducing trace buffer mapping by user-space
From: Vincent Donnefort
Date:  Sun Feb 12 2023 - 10:33:13 EST
Hi all,
We (Android folks) have been recently working on bringing tracing to the
pKVM hypervisor (more about pKVM? [1] [2]) reusing as much as possible the
tracefs support already available in the host. More specifically, sharing
the ring_buffer_per_cpu between the kernel and the hypervisor, the latter
being the writer while the former is only reading. After presenting this
endeavour at the tracingsummit, end of last year [3], Steven observed this
is a similar problem to another idea he had a while ago: mapping the
tracing ring buffers directly into userspace.
The tracing ring-buffer can be stored or sent to network without any copy
via splice. However, the latter doesn't allow real-time processing of the
traces by userspace without a copy, which can only be achieved by letting
userspace directly map the ring-buffer.
And indeed, in both ideas, we have a ring-buffer, an entity being the
writer, the other being a reader and both share the ring buffer pages while
having different VA spaces. So here's an RFC bringing userspace mapping of
a ring-buffer; although it doesn't cover the pKVM hypervisor, it nonetheless
brings building blocks that will be reused later.
Any feedback very much appreciated.
Vincent
[1] https://lwn.net/Articles/836693/
[2] https://www.youtube.com/watch?v=9npebeVFbFw
[3] https://tracingsummit.org/ts/2022/hypervisortracing/
-- 
As an example, Steve wrote this quick demo that only needs libtracefs:
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <stdarg.h>
  #include <errno.h>
  #include <unistd.h>
  #include <tracefs.h>
  #include <kbuffer.h>
  #include <event-parse.h>
  
  #include <asm/types.h>
  #include <sys/mman.h>
  #include <sys/ioctl.h>
  
  /*
   * ioctl request numbers issued on the trace_pipe_raw file descriptor.
   * NOTE(review): presumably these mirror include/uapi/linux/trace_mmap.h
   * introduced by this series -- confirm they stay in sync.
   */
  /* Issued by next_reader_page() before it re-reads meta->reader_page. */
  #define TRACE_MMAP_IOCTL_GET_READER_PAGE	_IO('T', 0x1)
  /* Issued by main() after mapping the meta page, before printing its fields. */
  #define TRACE_MMAP_IOCTL_UPDATE_META_PAGE	_IO('T', 0x2)
  
  /*
   * View of the first page mapped from trace_pipe_raw (file offset 0).
   * main() points a struct of this type at that mapping and prints its
   * counters.
   * NOTE(review): presumably must match the kernel-side layout -- confirm.
   */
  struct ring_buffer_meta_page {
  	__u64		entries;	/* printed by main(); presumably the event count -- confirm */
  	__u64		overrun;	/* printed by main(); presumably overwritten events -- confirm */
  	__u32		pages_touched;	/* printed by main(); not otherwise used here */
  	__u32		reader_page;	/* page index into the data mapping parsed by read_page() */
  	__u32		nr_data_pages;	/* main() maps page_size * nr_data_pages bytes of data */
  	__u32		data_page_head;	/* not used by this demo */
  	__u32		data_pages[];	/* not used by this demo */
  };
  
  /* Program invocation path; set from argv[0] at the start of main(). */
  static char *argv0;
  /* System page size; set from getpagesize() in main(), used to index data pages. */
  static int page_size;
  
  static char *get_this_name(void)
  {
  	static char *this_name;
  	char *arg;
  	char *p;
  
  	if (this_name)
  		return this_name;
  
  	arg = argv0;
  	p = arg+strlen(arg);
  
  	while (p >= arg && *p != '/')
  		p--;
  	p++;
  
  	this_name = p;
  	return p;
  }
  
  /* Print the usage text for this program to stdout and exit with status -1. */
  static void usage(void)
  {
  	printf("usage: %s exec\n"
  	       "\n", get_this_name());
  
  	exit(-1);
  }
  
  /*
   * Common tail of die() and pdie(): print a formatted message to stderr
   * and terminate the process. Never returns.
   *
   * @fmt: printf-style format string
   * @ap:  arguments for @fmt
   * @err: when non-zero and errno is set, perror() is called first and the
   *       errno value captured on entry becomes the exit status; otherwise
   *       the exit status is -1
   */
  static void __vdie(const char *fmt, va_list ap, int err)
  {
  	int status = errno;
  	char *name = get_this_name();
  
  	if (!err || !errno)
  		status = -1;
  	else
  		perror(name);
  
  	/* The message is indented by two spaces and newline-terminated. */
  	fputs("  ", stderr);
  	vfprintf(stderr, fmt, ap);
  	fputc('\n', stderr);
  
  	exit(status);
  }
  
  /*
   * Print a formatted message to stderr and exit, without reporting errno
   * (passes err = 0 to __vdie()).
   */
  void die(const char *fmt, ...)
  {
  	va_list args;
  
  	va_start(args, fmt);
  	__vdie(fmt, args, 0);	/* calls exit(); the va_end below is for form only */
  	va_end(args);
  }
  
  /*
   * Print a formatted message to stderr and exit, asking __vdie() to report
   * errno as well (passes err = 1).
   */
  void pdie(const char *fmt, ...)
  {
  	va_list args;
  
  	va_start(args, fmt);
  	__vdie(fmt, args, 1);	/* calls exit(); the va_end below is for form only */
  	va_end(args);
  }
  
  static void read_page(struct tep_handle *tep, struct kbuffer *kbuf,
  		      void *data, int page)
  {
  	static struct trace_seq seq;
  	struct tep_record record;
  
  	if (seq.buffer)
  		trace_seq_reset(&seq);
  	else
  		trace_seq_init(&seq);
  
  	kbuffer_load_subbuffer(kbuf, data + page_size * page);
  	while ((record.data = kbuffer_read_event(kbuf, &record.ts))) {
  		kbuffer_next_event(kbuf, NULL);
  		tep_print_event(tep, &seq, &record,
  				"%s-%d %9d\t%s: %s\n",
  				TEP_PRINT_COMM,
  				TEP_PRINT_PID,
  				TEP_PRINT_TIME,
  				TEP_PRINT_NAME,
  				TEP_PRINT_INFO);
  		trace_seq_do_printf(&seq);
  		trace_seq_reset(&seq);
  	}
  }
  
  /*
   * Return the reader page index currently stored in the meta page.
   * @fd is accepted for symmetry with next_reader_page() but is not used.
   */
  static int get_reader_page(int fd, struct ring_buffer_meta_page *meta)
  {
  	int current_page = meta->reader_page;
  
  	(void)fd;
  	return current_page;
  }
  
  /*
   * Issue TRACE_MMAP_IOCTL_GET_READER_PAGE on @fd, then return the reader
   * page index found in the meta page. Exits through pdie() if the ioctl
   * fails.
   */
  static int next_reader_page(int fd, struct ring_buffer_meta_page *meta)
  {
  	int rc = ioctl(fd, TRACE_MMAP_IOCTL_GET_READER_PAGE);
  
  	if (rc < 0)
  		pdie("ioctl");
  
  	return meta->reader_page;
  }
  
  /*
   * Demo entry point: map the CPU 0 ring buffer through trace_pipe_raw,
   * print the meta-page counters, then print the events of each reader page
   * until the reader page index returns to its starting value. Finishes by
   * dumping the per-CPU stats file.
   *
   * Any failure terminates the program through pdie().
   */
  int main(int argc, char **argv)
  {
  	struct ring_buffer_meta_page *map;
  	struct tep_handle *tep;
  	struct kbuffer *kbuf;
  	unsigned long *p;
  	void *meta;
  	void *data;
  	char *buf;
  	size_t data_len;
  	int start;
  	int page;
  	int fd;
  
  	(void)argc;
  	argv0 = argv[0];
  
  	/* Both handles are dereferenced later, so fail early if either is missing. */
  	tep = tracefs_local_events(NULL);
  	if (!tep)
  		pdie("tracefs_local_events");
  
  	kbuf = tep_kbuffer(tep);
  	if (!kbuf)
  		pdie("tep_kbuffer");
  
  	page_size = getpagesize();
  
  	fd = tracefs_instance_file_open(NULL, "per_cpu/cpu0/trace_pipe_raw",
  					O_RDONLY);
  	if (fd < 0)
  		pdie("raw");
  
  	/* The meta page sits at file offset 0. */
  	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
  	if (meta == MAP_FAILED)
  		pdie("mmap");
  
  	if (ioctl(fd, TRACE_MMAP_IOCTL_UPDATE_META_PAGE) < 0)
  		pdie("ioctl");
  
  	map = meta;
  	printf("entries:	%llu\n", map->entries);
  	printf("overrun:	%llu\n", map->overrun);
  	printf("pages_touched:	%u\n", map->pages_touched);
  	printf("reader_page:	%u\n", map->reader_page);
  	printf("nr_data_pages:	%u\n\n", map->nr_data_pages);
  
  	/* Computed in size_t so a large page count cannot overflow an int. */
  	data_len = (size_t)page_size * map->nr_data_pages;
  
  	/* The data pages follow the meta page, starting at offset page_size. */
  	data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, fd, page_size);
  	if (data == MAP_FAILED)
  		pdie("mmap data");
  
  	page = get_reader_page(fd, meta);
  	start = page;
  	do {
  		read_page(tep, kbuf, data, page);
  		printf("reader_page:	%u\n", map->reader_page);
  		printf("PAGE: %d\n", page);
  	} while ((page = next_reader_page(fd, meta)) != start);
  	
  	/* Debug dump of the first three words of the data mapping. */
  	p = data;
  	printf("%lx\n%lx\n%lx\n\n", p[0], p[1], p[2]);
  
  	munmap(data, data_len);
  	munmap(meta, page_size);
  	close(fd);
  
  	buf = tracefs_instance_file_read(NULL, "per_cpu/cpu0/stats", NULL);
  	if (!buf)
  		pdie("stats");
  	printf("%s\n", buf);
  	free(buf);
  
  	/* Release the parsing handles obtained at start-up. */
  	kbuffer_free(kbuf);
  	tep_free(tep);
  
  	return 0;
  }
Vincent Donnefort (2):
  ring-buffer: Introducing ring-buffer mapping functions
  tracing: Allow user-space mapping of the ring-buffer
 include/linux/ring_buffer.h     |   8 +
 include/uapi/linux/trace_mmap.h |  17 ++
 kernel/trace/ring_buffer.c      | 355 +++++++++++++++++++++++++++++++-
 kernel/trace/trace.c            |  74 ++++++-
 4 files changed, 441 insertions(+), 13 deletions(-)
 create mode 100644 include/uapi/linux/trace_mmap.h
-- 
2.39.1.581.gbfd45094c4-goog