Re: enabling libgcc for 64-bit divisions, was Re: PROBLEM: XFS on ARM corruption 'Structure needs cl

From: George Spelvin
Date: Thu Aug 13 2015 - 02:18:49 EST


Linus wrote:
> Ugh. gcc still does a pretty horrible job at it. While gcc knows that
> a widening 32x32->64 multiplication can be simplified, it doesn't do
> the same thing for a 64/32->64 division, and always calls __udivdi3
> for it.

Agreed. But some x86 code I'm working on now, I have a *lot* of
asm("divl") calls because even do_div isn't great for 64/32->32 division.

If I know that the quotient must fit into 32 bits, then do_div does
unnecessary work. Even if I have an explicit test and special case for
(x >> 32) >= y, GCC doessn't know it can eliminate the first divide, and
sometimes the guarantee is more obscure. The usual case is I'm computing
x * y / z, where I know that y <= z, so I know the final result must be
less than x.

Point gcc -m32 -mregparm=3 -O3 -S at the following and compare the
three functions' objhect code:

#include <stdint.h>

#define do_div(n, base) \
({ \
uint32_t __upper, __low, __high, __mod; \
uint32_t __base = (base); \
asm("" : "=a" (__low), "=d" (__high) : "A" (n)); \
__upper = __high; \
if (__high) { \
__upper = __high % __base; \
__high = __high / __base; \
} \
asm("divl %2" : "=a" (__low), "=d" (__mod) \
: "rm" (__base), "0" (__low), "1" (__upper)); \
asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
__mod; \
})

uint32_t saturating_divide(uint64_t x, uint32_t y)
{
if (x >> 32 > y)
return 0xffffffff;
do_div(x, y);
return (uint32_t)x;
}

uint32_t truncating_divide(uint64_t x, uint32_t y)
{
do_div(x, y);
return (uint32_t)x;
}

uint32_t simple_divide(uint64_t x, uint32_t y)
{
asm("divl %1" : "=A" (x) : "rm" (y));
return (uint32_t)x;
}


For a more practical example, consider the following wonder of clarity,
which answers "if X processor cycles take the same time as Y ticks of
the HPET (which is ticking at hpet_readl(HPET_PERIOD) fs per tick),
what is the processor frequency in kHz?"

/*
* Calculate the TSC frequency from HPET reference.
*
* Doing this exactly involves some numbers larger than 64 bits,
* which is a pain to deal with portably. Fortunately, this is
* highly machine-dependent code, so we can indulge in x86 assembly
* hackery.
*
* We want the tsc in kHz, i.e. tsc_delta/ms.
* What we have are tsc_delta, hpet_delta, and fs_per_hpet_tick.
*
* So we want (tsc_delta * fs_per_ms) / (hpet_delta * fs_per_hpet_tick).
* But fs_per_ms is 1e12, an awkwardly large number.
* Fortunately, 1e12 >> 12 is only 28 bits.
*
* We still have to be a bit careful about overflows in tsc_delta;
* the current code can only handle up to 0x119799812D = 75.557
* billion. That's large enough (compared to the 1s duration of
* tsc_refine_calibration) that a wider multiply isn't needed.
*/
#define FSEC_PER_MSEC 1000000000000ull /* A 40-bit number */

static unsigned __attribute_const__
calc_hpet_ref(u64 tsc_delta, u32 hpet_delta)
{
u32 hpet_period;

/*
* If the tsc_delta is greater than 75 billion (several seconds,
* should never happen), don't even try.
*/
if (unlikely(tsc_delta > -1ull / (FSEC_PER_MSEC >> 12)))
return UINT_MAX;

hpet_period = hpet_readl(HPET_PERIOD);
{
#ifdef __i386
/*
* Form a 77-bit numerator delta_tsc * 2e12. This is done in two
* steps: first we multiply by 5**12, which is a 28-bit number and
* produces a 64-bit product, and then shift it up 13 more bits.
*/
u64 hi64 = tsc_delta * (FSEC_PER_MSEC >> 12);
u32 rem, hi32, lo = (uint32_t)hi64 << 13;

hi64 >>= (32 - 13);

/*
* hpet_delta * hpet_period is a 64-bit number, and dividing by
* that is awkward on a 32-bit machine. But they're each 32 bits,
* so we can simply divide by each in turn.
*
* First, 96/32 -> 64-bit division.
* Divide (hi64,lo) by the first 32-bit value, hpet_period.
* Because hi64 is at most 45 bits and hpet_period is more
* than 20 bits (the HPET timer period is more than 1 ns),
* we are guaranteed the first division's quotient (hi32)
* easily fits into 32 bits.
*/
asm("divl %2" : "=a" (hi32), "=d" (rem)
: "rm" (hpet_period), "A" (hi64));
asm("divl %2" : "+a" (lo), "+d" (rem) : "rm" (hpet_period));

/*
* Second, 64/32->32-bit division.
* Divide (hi32,lo) by the second 32-bit value, hpet_ticks.
* This quotient is guaranteed to fit into 32 bits.
*/
if (unlikely(hi32 >= hpet_delta))
return UINT_MAX;
asm("divl %2" : "+a" (lo), "+d" (hi32) : "rm" (hpet_delta));
#else
/*
* x86_64 supports 64x64->128-bit multply and 128/64-bit
* divides, so this is easy, although not expressible in C.
*/
u64 hi, lo;

/* (hi,lo) = delta_tsc * 2*FSEC_PER_MSEC */
asm("mulq %2" : "=a" (lo), "=d" (hi)
: "%rm" ((u64)tsc_delta), "0" (2*FSEC_PER_MSEC));
/* lo = (hi, lo) / (hpet_period * hpet_ticks) */
asm("divq %2" : "+a" (lo), "+d" (hi)
: "rm" ((u64)hpet_period * hpet_delta));
if (unlikely(lo >> 32))
return UINT_MAX;
#endif
/* Finally, add one and divide by 2 to produce a rounded result */
return ((u32)lo + 1) / 2;
}
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/