[RFC PATCH v3 06/12] lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time

From: Christophe Leroy
Date: Mon Jan 13 2020 - 12:09:25 EST


Using __iter_div_u64_rem() is suboptimal on 32 bits.
Nanoseconds are only 32 bits, and VDSO data is updated every 10ms
so nsec will never overflow 32 bits.

Add an equivalent of __iter_div_u64_rem() that operates on
unsigned long, which is a better fit for 32-bit architectures.

Before:
gettimeofday: vdso: 1078 nsec/call
clock-gettime-monotonic-raw: vdso: 1317 nsec/call
clock-gettime-monotonic: vdso: 1255 nsec/call

After:
gettimeofday: vdso: 1032 nsec/call
clock-gettime-monotonic-raw: vdso: 1312 nsec/call
clock-gettime-monotonic: vdso: 1243 nsec/call
Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxx>
---
lib/vdso/gettimeofday.c | 26 +++++++++++++++++++++++---
1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index decd3f2b37af..da15a8842825 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -38,12 +38,32 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
}
#endif

+static __always_inline u32
+__iter_div_ulong_rem(unsigned long dividend, u32 divisor, unsigned long *remainder)
+{
+ u32 ret = 0;
+
+ while (dividend >= divisor) {
+ /* The following asm() prevents the compiler from
+ optimising this loop into a modulo operation. */
+ asm("" : "+rm"(dividend));
+
+ dividend -= divisor;
+ ret++;
+ }
+
+ *remainder = dividend;
+
+ return ret;
+}
+
static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
struct __kernel_timespec *ts)
{
const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
u64 cycles, last, sec, ns;
u32 seq;
+ unsigned long nsec;

do {
seq = vdso_read_begin(vd);
@@ -54,7 +74,7 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
return -1;

ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
- ns >>= vd->shift;
+ nsec = ns >> vd->shift;
sec = vdso_ts->sec;
} while (unlikely(vdso_read_retry(vd, seq)));

@@ -62,8 +82,8 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
* Do this outside the loop: a race inside the loop could result
* in __iter_div_u64_rem() being extremely slow.
*/
- ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
- ts->tv_nsec = ns;
+ ts->tv_sec = sec + __iter_div_ulong_rem(nsec, NSEC_PER_SEC, &nsec);
+ ts->tv_nsec = nsec;

return 0;
}
--
2.13.3