[RFC PATCH V2 2/9] perf: Extend ABI to support post-processing monotonic raw conversion

From: kan . liang
Date: Mon Feb 13 2023 - 14:08:06 EST


From: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>

The monotonic raw clock is not affected by NTP/PTP correction, so its
calculation can be deferred to post-processing, which reduces the
kernel overhead.

Add hw_time to struct perf_event_attr to tell the kernel to dump the
raw HW time to user space. The perf tool will then convert the raw HW
time to the requested clock in post-processing.
Currently, only the monotonic raw conversion is supported.
The raw HW time is only dumped with PERF_RECORD_SAMPLE, because the
accurate HW time can only be provided by HW in a sample. For other
types of records, the user-requested clock is returned as usual; their
behavior is unchanged.

Add perf_event_mmap_page::cap_user_time_mono_raw ABI to dump the
conversion information. The cap_user_time_mono_raw also indicates
whether the monotonic raw conversion information is available.
If yes, the clock monotonic raw can be calculated as
mono_raw = base + (((cyc - last) * mult + nsec) >> shift)

Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
---
include/uapi/linux/perf_event.h | 21 ++++++++++++++++++---
kernel/events/core.c | 7 +++++++
2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ccb7f5dad59b..9d56fe027f6c 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -455,7 +455,8 @@ struct perf_event_attr {
inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
remove_on_exec : 1, /* event is removed from task on exec */
sigtrap : 1, /* send synchronous SIGTRAP on event */
- __reserved_1 : 26;
+ hw_time : 1, /* generate raw HW time for samples */
+ __reserved_1 : 25;

union {
__u32 wakeup_events; /* wakeup every n events */
@@ -615,7 +616,8 @@ struct perf_event_mmap_page {
cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */
cap_user_time_zero : 1, /* The time_zero field is used */
cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */
- cap_____res : 58;
+ cap_user_time_mono_raw : 1, /* The time_mono_* fields are used */
+ cap_____res : 57;
};
};

@@ -692,11 +694,24 @@ struct perf_event_mmap_page {
__u64 time_cycles;
__u64 time_mask;

+ /*
+ * If cap_user_time_mono_raw, the monotonic raw clock can be calculated
+ * from the hardware clock (e.g. TSC) 'cyc'.
+ *
+ * mono_raw = base + (((cyc - last) * mult + nsec) >> shift)
+ *
+ */
+ __u64 time_mono_last;
+ __u32 time_mono_mult;
+ __u32 time_mono_shift;
+ __u64 time_mono_nsec;
+ __u64 time_mono_base;
+
/*
* Hole for extension of the self monitor capabilities
*/

- __u8 __reserved[116*8]; /* align to 1k. */
+ __u8 __reserved[112*8]; /* align to 1k. */

/*
* Control data for the mmap() data buffer.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 380476a934e8..f062cce2dafc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12135,6 +12135,13 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->sigtrap && !attr->remove_on_exec)
return -EINVAL;

+ if (attr->use_clockid) {
+ /*
+ * Only support post-processing for the monotonic raw clock
+ */
+ if (attr->hw_time && (attr->clockid != CLOCK_MONOTONIC_RAW))
+ return -EINVAL;
+ }
out:
return ret;

--
2.35.1