RE: [PATCH v3 2/2] riscv: Optimize bitops with Zbb extension

From: Wang, Xiao W
Date: Sun Oct 29 2023 - 22:12:15 EST




> -----Original Message-----
> From: Charlie Jenkins <charlie@xxxxxxxxxxxx>
> Sent: Friday, October 27, 2023 2:25 PM
> To: Wang, Xiao W <xiao.w.wang@xxxxxxxxx>
> Cc: paul.walmsley@xxxxxxxxxx; palmer@xxxxxxxxxxx;
> aou@xxxxxxxxxxxxxxxxx; ardb@xxxxxxxxxx; anup@xxxxxxxxxxxxxx; Li, Haicheng
> <haicheng.li@xxxxxxxxx>; ajones@xxxxxxxxxxxxxxxx; Liu, Yujie
> <yujie.liu@xxxxxxxxx>; linux-riscv@xxxxxxxxxxxxxxxxxxx; linux-
> efi@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx
> Subject: Re: [PATCH v3 2/2] riscv: Optimize bitops with Zbb extension
>
> On Tue, Sep 26, 2023 at 05:46:55PM +0800, Xiao Wang wrote:
> > This patch leverages the alternative mechanism to dynamically optimize
> > bitops (including __ffs, __fls, ffs, fls) with Zbb instructions. When
> > Zbb ext is not supported by the runtime CPU, legacy implementation is
> > used. If Zbb is supported, then the optimized variants will be selected
> > via alternative patching.
> >
> > The legacy bitops support is taken from the generic C implementation as
> > fallback.
> >
> > If the parameter is a build-time constant, we leverage compiler builtin to
> > calculate the result directly, this approach is inspired by x86 bitops
> > implementation.
> >
> > EFI stub runs before the kernel, so alternative mechanism should not be
> > used there, this patch introduces a macro NO_ALTERNATIVE for this
> purpose.
> >
> > Signed-off-by: Xiao Wang <xiao.w.wang@xxxxxxxxx>
> > ---
> > arch/riscv/include/asm/bitops.h | 266 +++++++++++++++++++++++++-
> > drivers/firmware/efi/libstub/Makefile | 2 +-
> > 2 files changed, 264 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/bitops.h
> b/arch/riscv/include/asm/bitops.h
> > index 3540b690944b..c97e774cb647 100644
> > --- a/arch/riscv/include/asm/bitops.h
> > +++ b/arch/riscv/include/asm/bitops.h
> > @@ -15,13 +15,273 @@
> > #include <asm/barrier.h>
> > #include <asm/bitsperlong.h>
> >
> > +#if !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE)
> > #include <asm-generic/bitops/__ffs.h>
> > -#include <asm-generic/bitops/ffz.h>
> > -#include <asm-generic/bitops/fls.h>
> > #include <asm-generic/bitops/__fls.h>
> > +#include <asm-generic/bitops/ffs.h>
> > +#include <asm-generic/bitops/fls.h>
> > +
> > +#else
> > +#include <asm/alternative-macros.h>
> > +#include <asm/hwcap.h>
> > +
> > +#if (BITS_PER_LONG == 64)
> > +#define CTZW "ctzw "
> > +#define CLZW "clzw "
> > +#elif (BITS_PER_LONG == 32)
> > +#define CTZW "ctz "
> > +#define CLZW "clz "
> > +#else
> > +#error "Unexpected BITS_PER_LONG"
> > +#endif
> > +
> > +static __always_inline unsigned long variable__ffs(unsigned long word)
> > +{
> > + int num;
> > +
> > + asm_volatile_goto(
> > + ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> > + : : : : legacy);
> > +
> > + asm volatile (
> > + ".option push\n"
> > + ".option arch,+zbb\n"
> > + "ctz %0, %1\n"
> > + ".option pop\n"
> > + : "=r" (word) : "r" (word) :);
> > +
> > + return word;
>
> That's so clean!
>
> > +
> > +legacy:
> > + num = 0;
> > +#if BITS_PER_LONG == 64
> > + if ((word & 0xffffffff) == 0) {
> > + num += 32;
> > + word >>= 32;
> > + }
> > +#endif
> > + if ((word & 0xffff) == 0) {
> > + num += 16;
> > + word >>= 16;
> > + }
> > + if ((word & 0xff) == 0) {
> > + num += 8;
> > + word >>= 8;
> > + }
> > + if ((word & 0xf) == 0) {
> > + num += 4;
> > + word >>= 4;
> > + }
> > + if ((word & 0x3) == 0) {
> > + num += 2;
> > + word >>= 2;
> > + }
> > + if ((word & 0x1) == 0)
> > + num += 1;
> > + return num;
> > +}
> > +
> > +/**
> > + * __ffs - find first set bit in a long word
> > + * @word: The word to search
> > + *
> > + * Undefined if no set bit exists, so code should check against 0 first.
> > + */
> > +#define __ffs(word) \
> > + (__builtin_constant_p(word) ? \
> > + (unsigned long)__builtin_ctzl(word) : \
> > + variable__ffs(word))
> > +
> > +static __always_inline unsigned long variable__fls(unsigned long word)
> > +{
> > + int num;
> > +
> > + asm_volatile_goto(
> > + ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> > + : : : : legacy);
> > +
> > + asm volatile (
> > + ".option push\n"
> > + ".option arch,+zbb\n"
> > + "clz %0, %1\n"
> > + ".option pop\n"
> > + : "=r" (word) : "r" (word) :);
> > +
> > + return BITS_PER_LONG - 1 - word;
> > +
> > +legacy:
> > + num = BITS_PER_LONG - 1;
> > +#if BITS_PER_LONG == 64
> > + if (!(word & (~0ul << 32))) {
> > + num -= 32;
> > + word <<= 32;
> > + }
> > +#endif
> > + if (!(word & (~0ul << (BITS_PER_LONG-16)))) {
> > + num -= 16;
> > + word <<= 16;
> > + }
> > + if (!(word & (~0ul << (BITS_PER_LONG-8)))) {
> > + num -= 8;
> > + word <<= 8;
> > + }
> > + if (!(word & (~0ul << (BITS_PER_LONG-4)))) {
> > + num -= 4;
> > + word <<= 4;
> > + }
> > + if (!(word & (~0ul << (BITS_PER_LONG-2)))) {
> > + num -= 2;
> > + word <<= 2;
> > + }
> > + if (!(word & (~0ul << (BITS_PER_LONG-1))))
> > + num -= 1;
> > + return num;
> > +}
> > +
> > +/**
> > + * __fls - find last set bit in a long word
> > + * @word: the word to search
> > + *
> > + * Undefined if no set bit exists, so code should check against 0 first.
> > + */
> > +#define __fls(word) \
> > + (__builtin_constant_p(word) ? \
> > + (unsigned long)(BITS_PER_LONG - 1 - __builtin_clzl(word)) : \
> > + variable__fls(word))
> > +
> > +static __always_inline int variable_ffs(int x)
> > +{
> > + int r;
> > +
> > + asm_volatile_goto(
> > + ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> > + : : : : legacy);
> > +
> > + asm volatile (
> > + ".option push\n"
> > + ".option arch,+zbb\n"
> > + "bnez %1, 1f\n"
> > + "li %0, 0\n"
> > + "j 2f\n"
> > + "1:\n"
> > + CTZW "%0, %1\n"
> > + "addi %0, %0, 1\n"
> > + "2:\n"
> > + ".option pop\n"
> > + : "=r" (r) : "r" (x) :);
> > +
> > + return r;
> > +
>
> I generally like to move as much code out of asm as possible. You are
> also able to remove one of the branches if you rearrange as follows:
>
> if (!x)
> return 0;
>
> asm volatile (
> ".option push\n"
> ".option arch,+zbb\n"
> CTZW "%0, %1\n"
> ".option pop\n"
> : "=r" (r) : "r" (x) :);
>
> return r + 1;
>
> You could then also extract out the "if (!x)" at the beginning of the
> legacy section at put it at the top of the function.
>

Agree, less asm code will be more reader friendly. And your sample code looks
nice, thanks for the suggestion. Will do the adjustment in next version.

> > +legacy:
> > + r = 1;
> > + if (!x)
> > + return 0;
> > + if (!(x & 0xffff)) {
> > + x >>= 16;
> > + r += 16;
> > + }
> > + if (!(x & 0xff)) {
> > + x >>= 8;
> > + r += 8;
> > + }
> > + if (!(x & 0xf)) {
> > + x >>= 4;
> > + r += 4;
> > + }
> > + if (!(x & 3)) {
> > + x >>= 2;
> > + r += 2;
> > + }
> > + if (!(x & 1)) {
> > + x >>= 1;
> > + r += 1;
> > + }
> > + return r;
> > +}
> > +
> > +/**
> > + * ffs - find first set bit in a word
> > + * @x: the word to search
> > + *
> > + * This is defined the same way as the libc and compiler builtin ffs routines.
> > + *
> > + * ffs(value) returns 0 if value is 0 or the position of the first set bit if
> > + * value is nonzero. The first (least significant) bit is at position 1.
> > + */
> > +#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))
> > +
> > +static __always_inline int variable_fls(unsigned int x)
> > +{
> > + int r;
> > +
> > + asm_volatile_goto(
> > + ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1)
> > + : : : : legacy);
> > +
> > + asm volatile (
> > + ".option push\n"
> > + ".option arch,+zbb\n"
> > + "bnez %1, 1f\n"
> > + "li %0, 0\n"
> > + "j 2f\n"
> > + "1:\n"
> > + CLZW "%0, %1\n"
> > + "neg %0, %0\n"
> > + "addi %0, %0, 32\n"
> > + "2:\n"
> > + ".option pop\n"
> > + : "=r" (r) : "r" (x) :);
> > +
> > + return r;
> > +
>
> Same comment as above but with appropriate changes.

Will refine it in next version.

>
> > +legacy:
> > + r = 32;
> > + if (!x)
> > + return 0;
> > + if (!(x & 0xffff0000u)) {
> > + x <<= 16;
> > + r -= 16;
> > + }
> > + if (!(x & 0xff000000u)) {
> > + x <<= 8;
> > + r -= 8;
> > + }
> > + if (!(x & 0xf0000000u)) {
> > + x <<= 4;
> > + r -= 4;
> > + }
> > + if (!(x & 0xc0000000u)) {
> > + x <<= 2;
> > + r -= 2;
> > + }
> > + if (!(x & 0x80000000u)) {
> > + x <<= 1;
> > + r -= 1;
> > + }
> > + return r;
> > +}
> > +
> > +/**
> > + * fls - find last set bit in a word
> > + * @x: the word to search
> > + *
> > + * This is defined in a similar way as ffs, but returns the position of the most
> > + * significant set bit.
> > + *
> > + * fls(value) returns 0 if value is 0 or the position of the last set bit if
> > + * value is nonzero. The last (most significant) bit is at position 32.
> > + */
> > +#define fls(x)
> \
> > + (__builtin_constant_p(x) ? \
> > + (int)(((x) != 0) ? \
> > + (sizeof(unsigned int) * 8 - __builtin_clz(x)) : 0) : \
> > + variable_fls(x))
> > +
> > +#endif
>
> As a nit, it's nice when large #ifdef blocks have a comment like
>
> /* !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE) */
>

Yes, I will add it in next version. Thanks for the comments.

BRs,
Xiao

> Overall great changes, just that small optimization to get rid of the
> jump.
>
> - Charlie
>
> > +
> > +#include <asm-generic/bitops/ffz.h>
> > #include <asm-generic/bitops/fls64.h>
> > #include <asm-generic/bitops/sched.h>
> > -#include <asm-generic/bitops/ffs.h>
> >
> > #include <asm-generic/bitops/hweight.h>
> >