[PATCH 3/4] bitops: squeeze even more out of fns()

From: Yury Norov
Date: Thu May 02 2024 - 19:32:50 EST


The function clears N-1 first set bits to find the N'th one with:

while (word && n--)
word &= word - 1;

In the worst case, it would take 63 iterations.

Instead of linear walk through the set bits, we can do a binary search
by using hweight(). This would work even better on platforms supporting
hardware-assisted hweight() - pretty much every modern arch.

On my Ryzen 9 5900X, the test_fns() benchmark runs binary fns() twice
faster, comparing to linear one.

The fns8() returns 64 to make sure that in case of no bit found, the
return value will be greater than the bit capacity of arguments of all
fnsXX() functions up to fns64().

Signed-off-by: Yury Norov <yury.norov@xxxxxxxxx>
---
include/linux/bitops.h | 42 +++++++++++++++++++++++++++++++++++++-----
1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 57ecef354f47..1c4773db56e0 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -247,17 +247,49 @@ static inline unsigned long __ffs64(u64 word)
return __ffs((unsigned long)word);
}

+static inline unsigned long fns8(u8 word, unsigned int n)
+{
+ while (word && n--)
+ word &= word - 1;
+
+ return word ? __ffs(word) : 64;
+}
+
+static inline unsigned long fns16(u16 word, unsigned int n)
+{
+ unsigned int w = hweight8((u8)word);
+
+ return n < w ? fns8((u8)word, n) : 8 + fns8((u8)(word >> 8), n - w);
+}
+
+static inline unsigned long fns32(u32 word, unsigned int n)
+{
+ unsigned int w = hweight16((u16)word);
+
+ return n < w ? fns16((u16)word, n) : 16 + fns16((u16)(word >> 16), n - w);
+}
+
+static inline unsigned long fns64(u64 word, unsigned int n)
+{
+ unsigned int w = hweight32((u32)word);
+
+ return n < w ? fns32((u32)word, n) : 32 + fns32((u32)(word >> 32), n - w);
+}
+
/**
* fns - find N'th set bit in a word
* @word: The word to search
- * @n: Bit to find
+ * @n: Bit to find, counting from 0
+ *
+ * Returns N'th set bit. If no such bit found, returns >= BITS_PER_LONG
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
- while (word && n--)
- word &= word - 1;
-
- return word ? __ffs(word) : BITS_PER_LONG;
+#if BITS_PER_LONG == 64
+ return fns64(word, n);
+#else
+ return fns32(word, n);
+#endif
}

/**
--
2.40.1