Re: [PATCH v2 1/3] timekeeping: Introduce fast accessor to clock tai

From: Steven Rostedt
Date: Wed Apr 27 2022 - 15:23:51 EST


On Wed, 27 Apr 2022 13:22:05 -0400
Steven Rostedt <rostedt@xxxxxxxxxxx> wrote:

> I can see if I can play some games to detect this and replace the top 5
> bits with the saved timestamp at the head of the sub buffer.

This appears to fix the issue. It adds a function when reading the absolute
time stamp comparing it to a previous time stamp. If the previous time
stamp has any of the 5 MSB set, then it will OR it into the absolute time
stamp, and then compare it to the previous time stamp to check for overflow.

It also does not complain of "big deltas" if the update is an absolute
time stamp and the current time stamp has any of the 5 MSB set.

I will need to update libtraceevent to handle this case as well.

But looks like there's nothing on your end that needs to be done. Probably
just set those other functions you found to notrace. But that can be a
separate patch.

-- Steve

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 655d6db3e3c3..3a0c7ed0e93f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -29,6 +29,14 @@

#include <asm/local.h>

+/*
+ * The "absolute" timestamp in the buffer is only 59 bits.
+ * If a clock has the 5 MSBs set, it needs to be saved and
+ * reinserted.
+ */
+#define TS_MSB (0xf8ULL << 56)
+#define ABS_TS_MASK (~TS_MSB)
+
static void update_pages_handler(struct work_struct *work);

/*
@@ -783,6 +791,24 @@ static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
}
#endif

+/*
+ * The absolute time stamp drops the 5 MSBs and some clocks may
+ * require them. The rb_fix_abs_ts() will take a previous full
+ * time stamp, and add the 5 MSB of that time stamp on to the
+ * saved absolute time stamp. Then they are compared in case of
+ * the unlikely event that the latest time stamp incremented
+ * the 5 MSB.
+ */
+static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts)
+{
+ if (save_ts & TS_MSB) {
+ abs |= save_ts & TS_MSB;
+ /* Check for overflow */
+ if (unlikely(abs < save_ts))
+ abs += 1ULL << 59;
+ }
+ return abs;
+}

static inline u64 rb_time_stamp(struct trace_buffer *buffer);

@@ -811,8 +837,10 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
u64 ts;

/* If the event includes an absolute time, then just use that */
- if (event->type_len == RINGBUF_TYPE_TIME_STAMP)
- return rb_event_time_stamp(event);
+ if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
+ ts = rb_event_time_stamp(event);
+ return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp);
+ }

nest = local_read(&cpu_buffer->committing);
verify_event(cpu_buffer, event);
@@ -2754,8 +2782,15 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);

if (unlikely(info->delta > (1ULL << 59))) {
+ /*
+ * Some timers can use more than 59 bits, and when a timestamp
+ * is added to the buffer, it will lose those bits.
+ */
+ if (abs && (info->ts & TS_MSB)) {
+ info->delta &= ABS_TS_MASK;
+
/* did the clock go backwards */
- if (info->before == info->after && info->before > info->ts) {
+ } else if (info->before == info->after && info->before > info->ts) {
/* not interrupted */
static int once;

@@ -3304,7 +3339,7 @@ static void dump_buffer_page(struct buffer_data_page *bpage,

case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = delta;
+ ts = rb_fix_abs_ts(delta, ts);
pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
break;

@@ -3380,7 +3415,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,

case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = delta;
+ ts = rb_fix_abs_ts(delta, ts);
break;

case RINGBUF_TYPE_PADDING:
@@ -4367,6 +4402,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,

case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp);
cpu_buffer->read_stamp = delta;
return;

@@ -4397,6 +4433,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,

case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, iter->read_stamp);
iter->read_stamp = delta;
return;

@@ -4650,6 +4687,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
*ts = rb_event_time_stamp(event);
+ *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -4741,6 +4779,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
*ts = rb_event_time_stamp(event);
+ *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}