Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension

From: cp0613
Date: Mon Jun 30 2025 - 08:14:51 EST


On Sun, 29 Jun 2025 11:38:40 +0100, david.laight.linux@xxxxxxxxx wrote:

> > It can be found that the zbb optimized implementation uses fewer instructions,
> > even for 16-bit and 8-bit data.
>
> Far too many register spills to stack.
> I think you've forgotten to specify -O2

Yes, my earlier listing was extracted from the vmlinux disassembly, which was not built with -O2. I have now recompiled with the web tool you suggested, as follows:
```
/*
 * Portable 32-bit rotate-right.
 * Masking both shift counts with 31 keeps each individual shift in
 * range (shifting a 32-bit value by >= 32 is undefined behaviour) and
 * makes a rotate count of 0 a no-op.
 */
unsigned int generic_ror32(unsigned int word, unsigned int shift)
{
unsigned int right = shift & 31;         /* bits moved toward LSB */
unsigned int left = (0u - shift) & 31;   /* complementary left shift */

return (word >> right) | (word << left);
}

/*
 * 32-bit rotate-right using the RISC-V Zbb "rorw" instruction.
 *
 * The leading nop stands in for the kernel's ALTERNATIVE() patch site,
 * as noted in the inline comment; this is demo code for comparing
 * generated instruction sequences, not the in-kernel implementation.
 *
 * On non-RISC-V targets the whole asm is compiled out and the input is
 * returned unchanged.
 */
unsigned int zbb_opt_ror32(unsigned int word, unsigned int shift)
{
#ifdef __riscv
__asm__ volatile("nop"); // ALTERNATIVE(nop)

/*
 * .option push/pop enables the Zbb extension just for this one
 * instruction, so the rest of the TU can be built for a base ISA
 * without Zbb.
 */
__asm__ volatile(
".option push\n"
".option arch,+zbb\n"
"rorw %0, %1, %2\n"
".option pop\n"
: "=r" (word) : "r" (word), "r" (shift) :);
#endif
return word;
}

/*
 * Portable 16-bit rotate-right.
 * Both shift counts are reduced mod 16; the OR of the two partial
 * shifts is truncated back to 16 bits by the return type, discarding
 * the high bits produced by the (promoted-to-int) left shift.
 */
unsigned short generic_ror16(unsigned short word, unsigned int shift)
{
unsigned int n = shift & 15;
unsigned int hi = word >> n;
unsigned int lo = word << ((0u - shift) & 15);

return (unsigned short)(hi | lo);
}

/*
 * 16-bit rotate-right via the Zbb 32-bit "rorw" instruction.
 *
 * The 16-bit input is duplicated into both halves of a 32-bit word, so
 * a 32-bit rotate by any count produces the 16-bit rotation in the low
 * half (the two identical halves make the rotate effectively mod 16).
 *
 * Fix: return the rotated value word32, not the untouched input word —
 * the original returned `word`, which silently discarded the rorw
 * result (visible in the disassembly: the result lands in a5 but a0 is
 * returned).
 *
 * On non-RISC-V targets the asm is compiled out and the low half of
 * word32 (the unrotated input) is returned.
 */
unsigned short zbb_opt_ror16(unsigned short word, unsigned int shift)
{
unsigned int word32 = ((unsigned int)word << 16) | word;
#ifdef __riscv
__asm__ volatile("nop"); // ALTERNATIVE(nop)

/* Enable Zbb only for this instruction; see zbb_opt_ror32. */
__asm__ volatile(
".option push\n"
".option arch,+zbb\n"
"rorw %0, %1, %2\n"
".option pop\n"
: "=r" (word32) : "r" (word32), "r" (shift) :);
#endif
return (unsigned short)word32;
}
```
The disassembly obtained is:
```
generic_ror32:
andi a1,a1,31
negw a5,a1
sllw a5,a0,a5
srlw a0,a0,a1
or a0,a5,a0
ret

zbb_opt_ror32:
nop
rorw a0, a0, a1
sext.w a0,a0
ret

generic_ror16:
andi a1,a1,15
negw a5,a1
andi a5,a5,15
sllw a5,a0,a5
srlw a0,a0,a1
or a0,a0,a5
slli a0,a0,48
srli a0,a0,48
ret

zbb_opt_ror16:
slliw a5,a0,16
addw a5,a5,a0
nop
rorw a5, a5, a1
ret
```

Thanks,
Pei