diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d..050aea7 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -21,6 +21,7 @@
 #include <linux/math64.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <asm/vdso_tsc_calibration.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -57,7 +58,6 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 	return ret;
 }
 
-
 #else
 
 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
@@ -182,6 +182,29 @@ notrace static u64 vread_tsc(void)
 	return last;
 }
 
+notrace static u64 vread_tsc_raw(void)
+{
+	u64 tsc, last = gtod->raw_cycle_last;
+
+	if (likely(gtod->has_rdtscp)) {
+		u32 tsc_lo, tsc_hi,
+		    tsc_cpu __attribute__((unused));
+
+		/*
+		 * rdtscp waits until all previous instructions have
+		 * executed, so it needs no leading barrier. Because
+		 * the outputs are 32-bit, the compiler allocates
+		 * eax, edx and ecx here - NOT rax, rdx and rcx.
+		 */
+		asm volatile("rdtscp"
+			     : "=a" (tsc_lo), "=d" (tsc_hi), "=c" (tsc_cpu));
+		tsc = ((u64)tsc_hi << 32) | tsc_lo;
+	} else {
+		tsc = rdtsc_ordered();
+	}
+	if (likely(tsc >= last))
+		return tsc;
+	/* Keep the compiler from reordering around the fallback. */
+	asm volatile ("");
+	return last;
+}
+
 notrace static inline u64 vgetsns(int *mode)
 {
 	u64 v;
@@ -203,6 +226,27 @@ notrace static inline u64 vgetsns(int *mode)
 	return v * gtod->mult;
 }
 
+notrace static inline u64 vgetsns_raw(int *mode)
+{
+	u64 v;
+	cycles_t cycles;
+
+	if (gtod->vclock_mode == VCLOCK_TSC)
+		cycles = vread_tsc_raw();
+#ifdef CONFIG_PARAVIRT_CLOCK
+	else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
+		cycles = vread_pvclock(mode);
+#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+	else if (gtod->vclock_mode == VCLOCK_HVCLOCK)
+		cycles = vread_hvclock(mode);
+#endif
+	else
+		return 0;
+	v = (cycles - gtod->raw_cycle_last) & gtod->raw_mask;
+	return v * gtod->raw_mult;
+}
+
 /* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
 notrace static int __always_inline do_realtime(struct timespec *ts)
 {
@@ -246,6 +290,27 @@ notrace static int __always_inline do_monotonic(struct timespec *ts)
 	return mode;
 }
 
+notrace static int __always_inline do_monotonic_raw(struct timespec *ts)
+{
+	unsigned long seq;
+	u64 ns;
+	int mode;
+
+	do {
+		seq = gtod_read_begin(gtod);
+		mode = gtod->vclock_mode;
+		ts->tv_sec = gtod->monotonic_time_raw_sec;
+		ns = gtod->monotonic_time_raw_nsec;
+		ns += vgetsns_raw(&mode);
+		ns >>= gtod->raw_shift;
+	} while (unlikely(gtod_read_retry(gtod, seq)));
+
+	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+	ts->tv_nsec = ns;
+
+	return mode;
+}
+
 notrace static void do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
@@ -277,6 +342,10 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 		if (do_monotonic(ts) == VCLOCK_NONE)
 			goto fallback;
 		break;
+	case CLOCK_MONOTONIC_RAW:
+		if (do_monotonic_raw(ts) == VCLOCK_NONE)
+			goto fallback;
+		break;
 	case CLOCK_REALTIME_COARSE:
 		do_realtime_coarse(ts);
 		break;
@@ -326,3 +395,23 @@ notrace time_t __vdso_time(time_t *t)
 }
 time_t time(time_t *t) __attribute__((weak, alias("__vdso_time")));
+
+extern unsigned
+__vdso_linux_tsc_calibration(struct linux_tsc_calibration *);
+
+notrace unsigned
+__vdso_linux_tsc_calibration(struct linux_tsc_calibration *tsc_cal)
+{
+	if (gtod->vclock_mode == VCLOCK_TSC && tsc_cal) {
+		tsc_cal->tsc_khz = gtod->tsc_khz;
+		tsc_cal->mult = gtod->raw_mult;
+		tsc_cal->shift = gtod->raw_shift;
+		return 1;
+	}
+	return 0;
+}
+
+unsigned linux_tsc_calibration(struct linux_tsc_calibration *)
+	__attribute__((weak, alias("__vdso_linux_tsc_calibration")));
+
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index d3a2dce..e0b5cce 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -25,6 +25,8 @@ VERSION {
 		__vdso_getcpu;
 		time;
 		__vdso_time;
+		linux_tsc_calibration;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 422764a..17fd07f 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -26,6 +26,7 @@ VERSION
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	};
 
 	LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
index 05cd1c5..b69b0bf 100644
--- a/arch/x86/entry/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -21,6 +21,7 @@ VERSION {
 		__vdso_gettimeofday;
 		__vdso_getcpu;
 		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index e1216dd..5fc49b9b 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -16,11 +16,14 @@
 #include <linux/timekeeper_internal.h>
 #include <asm/vgtod.h>
 #include <asm/vvar.h>
+#include <asm/vdso_tsc_calibration.h>
 
 int vclocks_used __read_mostly;
 
 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
 
+extern unsigned int tsc_khz;
+
 void update_vsyscall_tz(void)
 {
 	vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest;
@@ -45,6 +48,12 @@ void update_vsyscall(struct timekeeper *tk)
 	vdata->mult = tk->tkr_mono.mult;
 	vdata->shift = tk->tkr_mono.shift;
 
+	vdata->raw_cycle_last = tk->tkr_raw.cycle_last;
+	vdata->raw_mask = tk->tkr_raw.mask;
+	vdata->raw_mult = tk->tkr_raw.mult;
+	vdata->raw_shift = tk->tkr_raw.shift;
+	vdata->has_rdtscp = static_cpu_has(X86_FEATURE_RDTSCP);
+
 	vdata->wall_time_sec = tk->xtime_sec;
 	vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
 
@@ -74,5 +83,9 @@ void update_vsyscall(struct timekeeper *tk)
 		vdata->monotonic_time_coarse_sec++;
 	}
 
+	vdata->monotonic_time_raw_sec = tk->raw_sec;
+	vdata->monotonic_time_raw_nsec = tk->tkr_raw.xtime_nsec;
+	vdata->tsc_khz = tsc_khz;
+
 	gtod_write_end(vdata);
 }
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index fb856c9..4c474fa 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -22,7 +22,12 @@ struct vsyscall_gtod_data {
 	u64	mask;
 	u32	mult;
 	u32	shift;
-
+	u64	raw_cycle_last;
+	u64	raw_mask;
+	u32	raw_mult;
+	u32	raw_shift;
+	u32	has_rdtscp;
+	u32	tsc_khz;
 	/* open coded 'struct timespec' */
 	u64	wall_time_snsec;
 	gtod_long_t	wall_time_sec;
@@ -32,6 +37,8 @@ struct vsyscall_gtod_data {
 	gtod_long_t	wall_time_coarse_nsec;
 	gtod_long_t	monotonic_time_coarse_sec;
 	gtod_long_t	monotonic_time_coarse_nsec;
+	gtod_long_t	monotonic_time_raw_sec;
+	gtod_long_t	monotonic_time_raw_nsec;
 
 	int		tz_minuteswest;
 	int		tz_dsttime;
diff --git a/arch/x86/include/uapi/asm/vdso_tsc_calibration.h b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
new file mode 100644
index 0000000..3eded59
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_VDSO_TSC_CALIBRATION_H
+#define _ASM_X86_VDSO_TSC_CALIBRATION_H
+/*
+ * Programs that want to use the rdtsc / rdtscp instructions from
+ * user space can make use of the Linux kernel TSC calibration by
+ * calling:
+ *	__vdso_linux_tsc_calibration(struct linux_tsc_calibration *);
+ * (the symbol has to be resolved by hand, as in
+ * tools/testing/selftests/vDSO/parse_vdso.c), which fills in a
+ * structure with the following layout:
+ */
+
+/**
+ * struct linux_tsc_calibration - TSC to nanosecond conversion factors
+ * @mult:    amount to multiply a 64-bit TSC value by
+ * @shift:   right shift to apply to (mult * TSC), yielding nanoseconds
+ * @tsc_khz: the calibrated TSC frequency in kHz, from which the
+ *           previous members are calculated
+ */
+struct linux_tsc_calibration {
+	unsigned int mult;
+	unsigned int shift;
+	unsigned int tsc_khz;
+};
+
+/* To use:
+ *
+ *	static unsigned (*linux_tsc_cal)(struct linux_tsc_calibration *) =
+ *		vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
+ *	if (linux_tsc_cal == NULL) {
+ *		fprintf(stderr, "the patch providing __vdso_linux_tsc_calibration is not applied to the kernel.\n");
+ *		return ERROR;
+ *	}
+ *	static struct linux_tsc_calibration clock_source;
+ *	if (!(*linux_tsc_cal)(&clock_source))
+ *		fprintf(stderr, "TSC is not the system clocksource.\n");
+ *	unsigned int tsc_lo, tsc_hi, tsc_cpu;
+ *	asm volatile("rdtscp"
+ *		     : "=a" (tsc_lo), "=d" (tsc_hi), "=c" (tsc_cpu));
+ *	unsigned long tsc = ((unsigned long)tsc_hi << 32) | tsc_lo;
+ *	unsigned long nanoseconds =
+ *		(clock_source.mult * tsc) >> clock_source.shift;
+ *
+ * nanoseconds is now the TSC value converted to nanoseconds,
+ * according to Linux's clocksource calibration values.
+ * Incidentally, 'tsc_cpu' is the number of the CPU the task is running on.
+ *
+ * But better results are obtained by applying this to the difference
+ * (delta) between TSC samples and adding the result to some previous
+ * timespec value:
+ *	static u64 previous_tsc, previous_nsec, previous_sec;
+ *	u64 tsc = rdtscp();
+ *	u64 delta = tsc - previous_tsc;
+ *	u64 nsec = ((delta * clock_source.mult) + previous_nsec)
+ *		   >> clock_source.shift;
+ *	ts->tv_sec = previous_sec + (nsec / NSEC_PER_SEC);
+ *	ts->tv_nsec = nsec % NSEC_PER_SEC;
+ *	previous_tsc = tsc;
+ *	previous_sec = ts->tv_sec;
+ *	previous_nsec = ts->tv_nsec << clock_source.shift;
+ *	return ts;
+ *
+ * This is the approach taken by the Linux kernel and by the vDSO itself.
+ *
+ * Or, in user space, with floating point, one can convert a TSC reading
+ * directly to nanoseconds using the calibrated frequency:
+ *	u64 ns = lround(((double)rdtscp()) /
+ *			(((double)clock_source.tsc_khz) / 1e6));
+ * (i.e. if tsc_khz is 3000000 (3 GHz), there are 3 TSC ticks per
+ * nanosecond, so divide the tick count by 3).
+ *
+ * There should be very little difference (about 0.02%) between the
+ * values obtained by the two methods.
+ */
+
+#endif
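
A note on testing (not part of the patch): below is a minimal user-space
sketch of the new export in use. It assumes vdso_init_from_sysinfo_ehdr()
and vdso_sym() from tools/testing/selftests/vDSO/parse_vdso.c are compiled
in alongside it; the struct definition simply mirrors the uapi header above,
and the conversion is the same (delta * mult) >> shift arithmetic the vDSO
applies internally.

#include <stdio.h>
#include <stdint.h>
#include <elf.h>
#include <sys/auxv.h>
#include <x86intrin.h>

/* Mirrors struct linux_tsc_calibration from the uapi header above. */
struct linux_tsc_calibration {
	unsigned int mult;
	unsigned int shift;
	unsigned int tsc_khz;
};

/* From tools/testing/selftests/vDSO/parse_vdso.c, assumed compiled in. */
extern void vdso_init_from_sysinfo_ehdr(unsigned long base);
extern void *vdso_sym(const char *version, const char *name);

typedef unsigned (*tsc_cal_fn)(struct linux_tsc_calibration *);

int main(void)
{
	struct linux_tsc_calibration cal;
	unsigned int cpu;
	uint64_t t0, t1;
	tsc_cal_fn tsc_cal;

	vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR));
	tsc_cal = (tsc_cal_fn)vdso_sym("LINUX_2.6",
				       "__vdso_linux_tsc_calibration");
	if (!tsc_cal) {
		fprintf(stderr, "vDSO lacks __vdso_linux_tsc_calibration\n");
		return 1;
	}
	if (!tsc_cal(&cal)) {
		fprintf(stderr, "TSC is not the system clocksource\n");
		return 1;
	}
	t0 = __rdtscp(&cpu);
	t1 = __rdtscp(&cpu);
	/* The same (delta * mult) >> shift conversion the vDSO performs. */
	printf("tsc_khz=%u back-to-back rdtscp: %llu ns\n", cal.tsc_khz,
	       (unsigned long long)(((t1 - t0) * (uint64_t)cal.mult)
				    >> cal.shift));
	return 0;
}

Build it together with parse_vdso.c, e.g. (file name illustrative):
gcc -O2 tsc_cal_test.c parse_vdso.c -o tsc_cal_test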
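A second sketch exercises the new CLOCK_MONOTONIC_RAW fast path itself:
with this patch applied, running the program below under strace should show
no clock_gettime() syscalls, since the vDSO now services CLOCK_MONOTONIC_RAW
entirely in user space via do_monotonic_raw().

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec a, b;

	/* Both calls stay in the vDSO once CLOCK_MONOTONIC_RAW is handled there. */
	clock_gettime(CLOCK_MONOTONIC_RAW, &a);
	clock_gettime(CLOCK_MONOTONIC_RAW, &b);
	printf("CLOCK_MONOTONIC_RAW delta: %lld ns\n",
	       (long long)(b.tv_sec - a.tv_sec) * 1000000000LL +
	       (b.tv_nsec - a.tv_nsec));
	return 0;
}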