[PATCH bpf-next] bpf: Add bpf_read_raw_record() helper

From: Namhyung Kim
Date: Tue Aug 23 2022 - 17:04:07 EST


The helper is for BPF programs attached to a perf_event, in order to
read event-specific raw data. I followed the convention of the
bpf_read_branch_records() helper, so that it can report the size of the
record when the BPF_F_GET_RAW_RECORD_SIZE flag is given.

The use case is to filter perf event samples based on the HW provided
data which have more detailed information about the sample.

Note that it only reads the first fragment of the raw record. But that
seems mostly ok, since all the existing PMU raw data have only a single
fragment and multi-fragment records are only used for BPF output attached
to sockets. So unless it's used in such an extreme case, it'd work
for most tracing use cases.

Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
---
I don't know how to test this, as the raw data is only available on some
hardware PMUs (e.g. AMD IBS). I tried a tracepoint event but it was
rejected by the verifier. Actually the helper needs a bpf_perf_event_data
context, so that's not an option IIUC.

include/uapi/linux/bpf.h | 23 ++++++++++++++++++++++
kernel/trace/bpf_trace.c | 41 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 64 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 934a2a8beb87..af7f70564819 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5355,6 +5355,23 @@ union bpf_attr {
* Return
* Current *ktime*.
*
+ * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
+ * Description
+ * For an eBPF program attached to a perf event, retrieve the
+ * raw record associated to *ctx* and store it in the buffer
+ * pointed by *buf* up to size *size* bytes.
+ * Return
+ * On success, number of bytes written to *buf*. On error, a
+ * negative value.
+ *
+ * The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
+ * instead return the number of bytes required to store the raw
+ * record. If this flag is set, *buf* may be NULL.
+ *
+ * **-EINVAL** if arguments invalid or **size** not a multiple
+ * of **sizeof**\ (u32\ ).
+ *
+ * **-ENOENT** if the event does not have raw records.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5566,6 +5583,7 @@ union bpf_attr {
FN(tcp_raw_check_syncookie_ipv4), \
FN(tcp_raw_check_syncookie_ipv6), \
FN(ktime_get_tai_ns), \
+ FN(read_raw_record), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5749,6 +5767,11 @@ enum {
BPF_F_EXCLUDE_INGRESS = (1ULL << 4),
};

+/* Flags accepted by the BPF_FUNC_read_raw_record helper. */
+enum {
+	/* Return the size of the raw record instead of copying it. */
+	BPF_F_GET_RAW_RECORD_SIZE	= (1ULL << 0),
+};
+
#define __bpf_md_ptr(type, name) \
union { \
type name; \
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 68e5cdd24cef..db172b12e5f8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@
#include <linux/fprobe.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/perf_event.h>

#include <net/bpf_sk_storage.h>

@@ -1532,6 +1533,44 @@ static const struct bpf_func_proto bpf_read_branch_records_proto = {
.arg4_type = ARG_ANYTHING,
};

+/*
+ * bpf_read_raw_record() - copy the raw record of a perf sample to a BPF buffer.
+ * @ctx:   perf event program context; the record is taken from ctx->data->raw.
+ * @buf:   destination buffer; may be NULL only with BPF_F_GET_RAW_RECORD_SIZE.
+ * @size:  capacity of @buf in bytes, must be a multiple of sizeof(u32).
+ * @flags: 0, or BPF_F_GET_RAW_RECORD_SIZE to query the size without copying.
+ *
+ * Only the first fragment of the record is read, so the size query reports
+ * the size of that fragment: exactly what a subsequent copy can return.
+ *
+ * Return: number of bytes copied (or available, for the size query),
+ * -EINVAL on unknown flags, a NULL @buf or a @size that is not a multiple
+ * of sizeof(u32), and -ENOENT when the sample carries no raw record.
+ */
+BPF_CALL_4(bpf_read_raw_record, struct bpf_perf_event_data_kern *, ctx,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct perf_raw_record *raw = ctx->data->raw;
+	struct perf_raw_frag *frag;
+	u32 to_copy;
+
+	if (unlikely(flags & ~BPF_F_GET_RAW_RECORD_SIZE))
+		return -EINVAL;
+
+	if (unlikely(!raw))
+		return -ENOENT;
+
+	/* Multi-fragment records are not expected; only the first one is used. */
+	frag = &raw->frag;
+	WARN_ON_ONCE(!perf_raw_frag_last(frag));
+
+	/*
+	 * Report the fragment size, not raw->size, so that the size query and
+	 * the copy below agree on the same bound.
+	 * NOTE(review): raw->size appears to be filled in by perf core when the
+	 * sample is prepared for output, which may be after this program runs;
+	 * confirm against perf_prepare_sample().
+	 */
+	if (flags & BPF_F_GET_RAW_RECORD_SIZE)
+		return frag->size;
+
+	if (!buf || (size % sizeof(u32) != 0))
+		return -EINVAL;
+
+	/* Truncate silently when the caller's buffer is smaller than the data. */
+	to_copy = min_t(u32, frag->size, size);
+	memcpy(buf, frag->data, to_copy);
+
+	return to_copy;
+}
+
+/*
+ * Verifier prototype for bpf_read_raw_record(): a perf event context, an
+ * optional memory buffer with its constant size (zero allowed), and a
+ * free-form flags word.
+ */
+static const struct bpf_func_proto bpf_read_raw_record_proto = {
+	.func		= bpf_read_raw_record,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM_OR_NULL,	/* buf, NULL allowed */
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,	/* size of buf */
+	.arg4_type	= ARG_ANYTHING,			/* flags */
+};
+
static const struct bpf_func_proto *
pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1548,6 +1587,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_read_branch_records_proto;
case BPF_FUNC_get_attach_cookie:
return &bpf_get_attach_cookie_proto_pe;
+ case BPF_FUNC_read_raw_record:
+ return &bpf_read_raw_record_proto;
default:
return bpf_tracing_func_proto(func_id, prog);
}
--
2.37.2.609.g9ff673ca1a-goog