[RFC][PATCH 4/4] time: Do leapsecond adjustment in gettime fastpaths

From: John Stultz
Date: Fri May 29 2015 - 16:25:42 EST


Currently, leapsecond adjustments are done at tick time.

As a result, the leapsecond is applied at the first timer
tick *after* the leapsecond (~1-10ms late depending on HZ),
rather than exactly on the second edge.

This was in part historical from back when we were always
tick based, but correcting this has been avoided since
it adds extra conditional checks in the gettime fastpath,
which has performance overhead.

However, it was recently pointed out that ABS_TIME
CLOCK_REALTIME timers set for right after the leapsecond
could fire a second early, since some timers may be expired
before we trigger the timekeeping timer, which then applies
the leapsecond.

This isn't quite as bad as it sounds, since behaviorally
it is similar to what is possible w/ ntpd made leapsecond
adjustments done w/o using the kernel discipline, where,
due to latencies, timers may fire just prior to the
settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be
careful, since they are prone to quirks from settimeofday()
disturbances.)

However, the purpose of having the kernel do the leap adjustment
is to avoid such latencies, so I think this is worth fixing.

So in order to properly keep those timers from firing a second
early, this patch modifies the gettime accessors to do the
extra checks to apply the leapsecond adjustment on the second
edge. This prevents the timer core from expiring timers too
early.

This patch does not handle VDSO time implementations, so
userspace using vdso gettime will still see the leapsecond
applied at the first timer tick after the leapsecond.
This is a bit of a tradeoff, since the performance impact
would be greatest to VDSO implementations, and since vdso
interfaces don't provide the TIME_OOP flag, one can't
distinguish the leapsecond from a time discontinuity (such
as settimeofday), so correcting the VDSO may not be as
important there.

Apologies to Richard Cochran, who pushed for such a change
years ago, which I resisted due to the concerns about the
performance overhead.

While I suspect this isn't extremely critical, folks who
care about strict leap-second correctness will likely
want to watch this, and it will likely be a -stable candidate.

Cc: Prarit Bhargava <prarit@xxxxxxxxxx>
Cc: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
Cc: Richard Cochran <richardcochran@xxxxxxxxx>
Cc: Jan Kara <jack@xxxxxxx>
Cc: Jiri Bohac <jbohac@xxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Shuah Khan <shuahkh@xxxxxxxxxxxxxxx>
Originally-suggested-by: Richard Cochran <richardcochran@xxxxxxxxx>
Reported-by: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
Reported-by: Prarit Bhargava <prarit@xxxxxxxxxx>
Signed-off-by: John Stultz <john.stultz@xxxxxxxxxx>
---
include/linux/time64.h | 1 +
include/linux/timekeeper_internal.h | 7 +++
kernel/time/ntp.c | 73 +++++++++++++++++++++++++---
kernel/time/ntp_internal.h | 1 +
kernel/time/timekeeping.c | 97 ++++++++++++++++++++++++++++++++-----
5 files changed, 159 insertions(+), 20 deletions(-)

diff --git a/include/linux/time64.h b/include/linux/time64.h
index a383147..ff46e87 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -28,6 +28,7 @@ struct timespec64 {
#define FSEC_PER_SEC 1000000000000000LL

/* Located here for timespec[64]_valid_strict */
+#define TIME64_MAX ((s64)~((u64)1 << 63))
#define KTIME_MAX ((s64)~((u64)1 << 63))
#define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC)

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index fb86963..78980ea 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -60,6 +60,9 @@ struct tk_read_base {
* shifted nano seconds.
* @ntp_error_shift: Shift conversion between clock shifted nano seconds and
* ntp shifted nano seconds.
+ * @next_leap_sec: Second value of the next leap sec (or TIME64_MAX)
+ * @next_leap_ktime: ktime_t value of the next leap sec (or KTIME_MAX)
+ * @leap_direction: Direction of pending leap adjustment
*
* Note: For timespec(64) based interfaces wall_to_monotonic is what
* we need to add to xtime (or xtime corrected for sub jiffie times)
@@ -104,6 +107,10 @@ struct timekeeper {
s64 ntp_error;
u32 ntp_error_shift;
u32 ntp_err_mult;
+ /* Leapsecond status */
+ time64_t next_leap_sec;
+ ktime_t next_leap_ktime;
+ int leap_direction;
};

#ifdef CONFIG_GENERIC_TIME_VSYSCALL
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 472591e..6e15fbb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -35,6 +35,7 @@ unsigned long tick_nsec;
static u64 tick_length;
static u64 tick_length_base;

+#define SECS_PER_DAY 86400
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -76,6 +77,9 @@ static long time_adjust;
/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
static s64 ntp_tick_adj;

+/* second value of the next pending leapsecond, or TIME64_MAX if no leap */
+static time64_t ntp_next_leap_sec = TIME64_MAX;
+
#ifdef CONFIG_NTP_PPS

/*
@@ -349,6 +353,7 @@ void ntp_clear(void)
tick_length = tick_length_base;
time_offset = 0;

+ ntp_next_leap_sec = TIME64_MAX;
/* Clear PPS state variables */
pps_clear();
}
@@ -359,6 +364,33 @@ u64 ntp_tick_length(void)
return tick_length;
}

+/**
+ * get_leap_state - Returns the NTP leap state
+ * @next_leap_sec: Next leapsecond in time64_t
+ * @next_leap_ktime: Next leapsecond in ktime_t
+ *
+ * Provides NTP leapsecond state. Returns direction
+ * of the leapsecond adjustment as an integer.
+ */
+int get_leap_state(time64_t *next_leap_sec, ktime_t *next_leap_ktime)
+{
+ int dir;
+
+ if ((time_state == TIME_INS) && (time_status & STA_INS)) {
+ dir = -1;
+ *next_leap_sec = ntp_next_leap_sec;
+ *next_leap_ktime = ktime_set(ntp_next_leap_sec, 0);
+ } else if ((time_state == TIME_DEL) && (time_status & STA_DEL)) {
+ dir = 1;
+ *next_leap_sec = ntp_next_leap_sec;
+ *next_leap_ktime = ktime_set(ntp_next_leap_sec, 0);
+ } else {
+ dir = 0;
+ *next_leap_sec = TIME64_MAX;
+ next_leap_ktime->tv64 = KTIME_MAX;
+ }
+ return dir;
+}

/*
* this routine handles the overflow of the microsecond field
@@ -382,15 +414,21 @@ int second_overflow(unsigned long secs)
*/
switch (time_state) {
case TIME_OK:
- if (time_status & STA_INS)
+ if (time_status & STA_INS) {
time_state = TIME_INS;
- else if (time_status & STA_DEL)
+ ntp_next_leap_sec = secs + SECS_PER_DAY -
+ (secs % SECS_PER_DAY);
+ } else if (time_status & STA_DEL) {
time_state = TIME_DEL;
+ ntp_next_leap_sec = secs + SECS_PER_DAY -
+ ((secs+1) % SECS_PER_DAY);
+ }
break;
case TIME_INS:
- if (!(time_status & STA_INS))
+ if (!(time_status & STA_INS)) {
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- else if (secs % 86400 == 0) {
+ } else if (secs % SECS_PER_DAY == 0) {
leap = -1;
time_state = TIME_OOP;
printk_deferred(KERN_NOTICE
@@ -398,19 +436,21 @@ int second_overflow(unsigned long secs)
}
break;
case TIME_DEL:
- if (!(time_status & STA_DEL))
+ if (!(time_status & STA_DEL)) {
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- else if ((secs + 1) % 86400 == 0) {
+ } else if ((secs + 1) % SECS_PER_DAY == 0) {
leap = 1;
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_WAIT;
printk_deferred(KERN_NOTICE
"Clock: deleting leap second 23:59:59 UTC\n");
}
break;
case TIME_OOP:
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_WAIT;
break;
-
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
@@ -547,6 +587,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
+ ntp_next_leap_sec = TIME64_MAX;
/* restart PPS frequency calibration */
pps_reset_freq_interval();
}
@@ -711,6 +752,24 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
if (!(time_status & STA_NANO))
txc->time.tv_usec /= NSEC_PER_USEC;

+ /* Handle leapsec adjustments */
+ if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
+ if ((time_state == TIME_INS) && (time_status & STA_INS)) {
+ result = TIME_OOP;
+ txc->tai++;
+ txc->time.tv_sec--;
+ }
+ if ((time_state == TIME_DEL) && (time_status & STA_DEL)) {
+ result = TIME_WAIT;
+ txc->tai--;
+ txc->time.tv_sec++;
+ }
+ if ((time_state == TIME_OOP) &&
+ (ts->tv_sec == ntp_next_leap_sec)) {
+ result = TIME_WAIT;
+ }
+ }
+
return result;
}

diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index bbd102a..cd831b6 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -5,6 +5,7 @@ extern void ntp_init(void);
extern void ntp_clear(void);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
extern u64 ntp_tick_length(void);
+extern int get_leap_state(time64_t *next_leap_sec, ktime_t *next_leap_ktime);
extern int second_overflow(unsigned long secs);
extern int ntp_validate_timex(struct timex *);
extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 946acb7..9313190 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -591,6 +591,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
ntp_clear();
}

+ /* Capture leapsecond state */
+ tk->leap_direction = get_leap_state(&tk->next_leap_sec,
+ &tk->next_leap_ktime);
+
tk_update_ktime_data(tk);

update_vsyscall(tk);
@@ -634,6 +638,24 @@ static void timekeeping_forward_now(struct timekeeper *tk)
}

/**
+ * __getnstimeofday64_preleap - Returns the time of day in a timespec64,
+ * @tk: pointer to the timekeeper structure to use
+ * @ts: pointer to the timespec to be set
+ *
+ * Internal function. Does not take lock. Updates the time of day in the
+ * timespec, WITHOUT the leapsecond edge adjustment.
+ */
+static void __getnstimeofday64_preleap(struct timekeeper *tk, struct timespec64 *ts)
+{
+ s64 nsecs;
+
+ ts->tv_sec = tk->xtime_sec;
+ ts->tv_nsec = 0;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ timespec64_add_ns(ts, nsecs);
+}
+
+/**
* __getnstimeofday64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
*
@@ -643,20 +665,22 @@ static void timekeeping_forward_now(struct timekeeper *tk)
int __getnstimeofday64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
+ time64_t next_leap;
+ int dir;
unsigned long seq;
- s64 nsecs = 0;

do {
seq = read_seqcount_begin(&tk_core.seq);

- ts->tv_sec = tk->xtime_sec;
- nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ __getnstimeofday64_preleap(tk, ts);
+ next_leap = tk->next_leap_sec;
+ dir = tk->leap_direction;

} while (read_seqcount_retry(&tk_core.seq, seq));

- ts->tv_nsec = 0;
- timespec64_add_ns(ts, nsecs);
-
+ /* Apply leapsecond adjustment */
+ if (unlikely(ts->tv_sec >= next_leap))
+ ts->tv_sec += dir;
/*
* Do not bail out early, in case there were callers still using
* the value, even in the face of the WARN_ON.
@@ -710,6 +734,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base, *offset = offsets[offs];
+ ktime_t next_leap;
+ int dir;
s64 nsecs;

WARN_ON(timekeeping_suspended);
@@ -718,11 +744,17 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ next_leap = tk->next_leap_ktime;
+ dir = tk->leap_direction;

} while (read_seqcount_retry(&tk_core.seq, seq));

- return ktime_add_ns(base, nsecs);
-
+ base = ktime_add_ns(base, nsecs);
+ /* apply leapsecond adjustment */
+ if (offs == TK_OFFS_REAL)
+ if (unlikely(base.tv64 >= next_leap.tv64))
+ base = ktime_add(base, ktime_set(dir, 0));
+ return base;
}
EXPORT_SYMBOL_GPL(ktime_get_with_offset);

@@ -733,15 +765,23 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset);
*/
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
+ struct timekeeper *tk = &tk_core.timekeeper;
ktime_t *offset = offsets[offs];
unsigned long seq;
- ktime_t tconv;
+ ktime_t tconv, next_leap;
+ int dir;

do {
seq = read_seqcount_begin(&tk_core.seq);
tconv = ktime_add(tmono, *offset);
+ next_leap = tk->next_leap_ktime;
+ dir = tk->leap_direction;
} while (read_seqcount_retry(&tk_core.seq, seq));

+ /* apply leapsecond adjustment */
+ if (offs == TK_OFFS_REAL)
+ if (unlikely(tconv.tv64 >= next_leap.tv64))
+ tconv = ktime_add(tconv, ktime_set(dir, 0));
return tconv;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);
@@ -862,6 +902,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
s64 nsecs_raw, nsecs_real;
+ time64_t next_leap;
+ int dir;

WARN_ON_ONCE(timekeeping_suspended);

@@ -875,10 +917,17 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
nsecs_real = timekeeping_get_ns(&tk->tkr_mono);

+ next_leap = tk->next_leap_sec;
+ dir = tk->leap_direction;
+
} while (read_seqcount_retry(&tk_core.seq, seq));

timespec_add_ns(ts_raw, nsecs_raw);
timespec_add_ns(ts_real, nsecs_real);
+
+ /* apply leapsecond adjustment */
+ if (unlikely(ts_real->tv_sec >= next_leap))
+ ts_real->tv_sec += dir;
}
EXPORT_SYMBOL(getnstime_raw_and_real);

@@ -1252,6 +1301,10 @@ void __init timekeeping_init(void)
set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
tk_set_wall_to_mono(tk, tmp);

+ /* Capture leapsecond state */
+ tk->leap_direction = get_leap_state(&tk->next_leap_sec,
+ &tk->next_leap_ktime);
+
timekeeping_update(tk, TK_MIRROR);

write_seqcount_end(&tk_core.seq);
@@ -1422,6 +1475,10 @@ void timekeeping_resume(void)
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;

+ /* Capture leapsecond state */
+ tk->leap_direction = get_leap_state(&tk->next_leap_sec,
+ &tk->next_leap_ktime);
+
tk->ntp_error = 0;
timekeeping_suspended = 0;
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1825,6 +1882,10 @@ void update_wall_time(void)
*/
clock_set |= accumulate_nsecs_to_secs(tk);

+ /* Capture leapsecond state */
+ tk->leap_direction = get_leap_state(&tk->next_leap_sec,
+ &tk->next_leap_ktime);
+
write_seqcount_begin(&tk_core.seq);
/*
* Update the real timekeeper.
@@ -1970,7 +2031,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
- ktime_t base;
+ ktime_t base, next_leap;
+ int dir;
u64 nsecs;

do {
@@ -1982,9 +2044,18 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
*offs_tai = tk->offs_tai;
+
+ next_leap = tk->next_leap_ktime;
+ dir = tk->leap_direction;
} while (read_seqcount_retry(&tk_core.seq, seq));

- return ktime_add_ns(base, nsecs);
+ base = ktime_add_ns(base, nsecs);
+
+ /* apply leapsecond adjustment */
+ if (unlikely(ktime_add(base, *offs_real).tv64 >= next_leap.tv64))
+ *offs_real = ktime_add(*offs_real, ktime_set(dir, 0));
+
+ return base;
}
#endif

@@ -2015,11 +2086,11 @@ int do_adjtimex(struct timex *txc)
return ret;
}

- getnstimeofday64(&ts);
-
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);

+ __getnstimeofday64_preleap(tk, &ts);
+
orig_tai = tai = tk->tai_offset;
ret = __do_adjtimex(txc, &ts, &tai);

--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/