Re: [PATCH v2 1/3] lib: find_*_bit reimplementation

From: Yury
Date: Wed Feb 04 2015 - 17:52:51 EST



On 02.02.2015 13:43, Rasmus Villemoes wrote:
> On Sat, Jan 31 2015, yury.norov@xxxxxxxxx wrote:
>
>> From: Yury Norov <y.norov@xxxxxxxxxxx>
>>
>> New implementations takes less space in source file (see diffstat)
>> and in object. For me it's 710 vs 453 bytes of text.
>>
> New version generally looks good. Please include a summary of the
> changes between the versions either below the --- line or in a 0/n cover
> letter, especially since you've now expanded the scope of the series.
>
> Comments below.
>
>> Patch was boot-tested on x86_64 and MIPS (big-endian) machines.
>> Performance tests were ran on userspace with code like this:
>>
>> /* addr[] is filled from /dev/urandom */
>> start = clock();
>> while (ret < nbits)
>> ret = find_next_bit(addr, nbits, ret + 1);
>>
>> end = clock();
>> printf("%ld\t", (unsigned long) end - start);
>>
>> On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz rezults are next:
>> (for find_next_bit, nbits is 8M, for find_first_bit - 80K)
>>
>> find_next_bit: find_first_bit:
>> new current new current
>> 26932 43151 14777 14925
>> 26947 43182 14521 15423
>> 26507 43824 15053 14705
>> 27329 43759 14473 14777
>> 26895 43367 14847 15023
>> 26990 43693 15103 15163
>> 26775 43299 15067 15232
>> 27282 42752 14544 15121
>> 27504 43088 14644 14858
>> 26761 43856 14699 15193
>> 26692 43075 14781 14681
>> 27137 42969 14451 15061
>> ... ...
>>
>> find_next_bit performance gain is 35-40%;
>> find_first_bit - no measurable difference.
>>
>> Signed-off-by: Yury Norov <yury.norov@xxxxxxxxx>
>> ---
>> lib/find_last_bit.c | 31 ++-----
>> lib/find_next_bit.c | 254 +++++++++++++++-------------------------------------
>> 2 files changed, 79 insertions(+), 206 deletions(-)
>>
>> diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
>> index 91ca09f..e67e970 100644
>> --- a/lib/find_last_bit.c
>> +++ b/lib/find_last_bit.c
>> @@ -4,44 +4,29 @@
>> * Written by Rusty Russell <rusty@xxxxxxxxxxxxxxx>
>> * (Inspired by David Howell's find_next_bit implementation)
>> *
>> + * Rewritten by Yury Norov <yury.norov@xxxxxxxxx> to decrease
>> + * size and improve performance, 2015.
>> + *
>> * This program is free software; you can redistribute it and/or
>> * modify it under the terms of the GNU General Public License
>> * as published by the Free Software Foundation; either version
>> * 2 of the License, or (at your option) any later version.
>> */
>>
>> -#include <linux/bitops.h>
> Why do you remove that #include? It is rather important that the header
> and implementation don't get out of sync. I know that kernel.h includes
> bitops.h, but please don't rely on such things. Quoting SubmitChecklist:
>
> 1: If you use a facility then #include the file that defines/declares
> that facility. Don't depend on other header files pulling in ones
> that you use.
>
>
>> #include <linux/export.h>
>> -#include <asm/types.h>
>> -#include <asm/byteorder.h>
> However, getting rid of includes that are no longer needed is certainly
> a good thing.
Yes, linux/bitops.h are to get back.
>> +#include <linux/kernel.h>
>>
>> #ifndef find_last_bit
>>
>> unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
>> {
>> - unsigned long words;
>> - unsigned long tmp;
>> -
>> - /* Start at final word. */
>> - words = size / BITS_PER_LONG;
>> -
>> - /* Partial final word? */
>> - if (size & (BITS_PER_LONG-1)) {
>> - tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
>> - - (size & (BITS_PER_LONG-1)))));
>> - if (tmp)
>> - goto found;
>> - }
>> + unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
>>
>> - while (words) {
>> - tmp = addr[--words];
>> - if (tmp) {
>> -found:
>> - return words * BITS_PER_LONG + __fls(tmp);
>> - }
>> + while (idx--) {
>> + if (addr[idx])
>> + return min(idx * BITS_PER_LONG + __fls(addr[idx]), size);
>> }
>>
>> - /* Not found */
>> return size;
>> }
>> EXPORT_SYMBOL(find_last_bit);
>> diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
>> index 0cbfc0b..ebfb3dc 100644
>> --- a/lib/find_next_bit.c
>> +++ b/lib/find_next_bit.c
>> @@ -3,18 +3,45 @@
>> * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
>> * Written by David Howells (dhowells@xxxxxxxxxx)
>> *
>> + * Rewritten by Yury Norov <yury.norov@xxxxxxxxx> to decrease
>> + * size and improve performance, 2015.
>> + *
>> * This program is free software; you can redistribute it and/or
>> * modify it under the terms of the GNU General Public License
>> * as published by the Free Software Foundation; either version
>> * 2 of the License, or (at your option) any later version.
>> */
>>
>> -#include <linux/bitops.h>
>> #include <linux/export.h>
>> -#include <asm/types.h>
>> -#include <asm/byteorder.h>
>> +#include <linux/kernel.h>
> Same as above.
>
>> +#define HIGH_BITS_MASK(nr) (ULONG_MAX << (nr))
>> +
>> +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
>> +static unsigned long _find_next_bit(const unsigned long *addr,
>> + unsigned long nbits, unsigned long start, bool set)
>> +{
>> + unsigned long tmp = set ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> +
>> + /* Handle 1st word. */
>> + if (!IS_ALIGNED(start, BITS_PER_LONG)) {
>> + tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
>> + start = round_down(start, BITS_PER_LONG);
>> + }
>>
>> -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
>> + while (!tmp) {
>> + start += BITS_PER_LONG;
>> + if (start >= nbits)
>> + return nbits;
>> +
>> + tmp = set ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> + }
>> +
>> + return start + __ffs(tmp);
>> +}
>> +#endif
>>
>> #ifndef find_next_bit
>> /*
>> @@ -23,86 +50,22 @@
>> unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
>> unsigned long offset)
>> {
>> - const unsigned long *p = addr + BITOP_WORD(offset);
>> - unsigned long result = offset & ~(BITS_PER_LONG-1);
>> - unsigned long tmp;
>> -
>> if (offset >= size)
>> return size;
> Why can't this ...
>
>
>> - size -= result;
>> - offset %= BITS_PER_LONG;
>> - if (offset) {
>> - tmp = *(p++);
>> - tmp &= (~0UL << offset);
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> - }
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if ((tmp = *(p++)))
>> - goto found_middle;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = *p;
>>
>> -found_first:
>> - tmp &= (~0UL >> (BITS_PER_LONG - size));
>> - if (tmp == 0UL) /* Are any bits set? */
>> - return result + size; /* Nope. */
>> -found_middle:
>> - return result + __ffs(tmp);
>> + return min(_find_next_bit(addr, size, offset, 1), size);
> ... and this be part of _find_next_bit? Can find_next_bit not be simply
> 'return _find_next_bit(addr, size, offset, 1);', and similarly for
> find_next_zero_bit? Btw., passing true and false for the boolean
> parameter may be a little clearer.
I moved size checkers out of '_find_next_bit' to let user call it from his code
if he knows for sure that size/offset pair is valid. This may help save a couple
of clocks. I think, I'll walk over the code to find how many such places we have.
If not too much / not in critical paths, checks may be moved into the function.

Or maybe leave as is and place some comment for future?...
>
>> }
>> EXPORT_SYMBOL(find_next_bit);
>> #endif
>>
>> #ifndef find_next_zero_bit
>> -/*
>> - * This implementation of find_{first,next}_zero_bit was stolen from
>> - * Linus' asm-alpha/bitops.h.
>> - */
>> unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
>> unsigned long offset)
>> {
>> - const unsigned long *p = addr + BITOP_WORD(offset);
>> - unsigned long result = offset & ~(BITS_PER_LONG-1);
>> - unsigned long tmp;
>> -
>> if (offset >= size)
>> return size;
> See above.
>
>> - size -= result;
>> - offset %= BITS_PER_LONG;
>> - if (offset) {
>> - tmp = *(p++);
>> - tmp |= ~0UL >> (BITS_PER_LONG - offset);
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (~tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> - }
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if (~(tmp = *(p++)))
>> - goto found_middle;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = *p;
>>
>> -found_first:
>> - tmp |= ~0UL << size;
>> - if (tmp == ~0UL) /* Are any bits zero? */
>> - return result + size; /* Nope. */
>> -found_middle:
>> - return result + ffz(tmp);
>> + return min(_find_next_bit(addr, size, offset, 0), size);
> See above.
>
>> }
>> EXPORT_SYMBOL(find_next_zero_bit);
>> #endif
>> @@ -113,24 +76,14 @@ EXPORT_SYMBOL(find_next_zero_bit);
>> */
>> unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = 0;
>> - unsigned long tmp;
>> + unsigned long idx;
>>
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if ((tmp = *(p++)))
>> - goto found;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> + for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
>> + if (addr[idx])
>> + return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
>> }
>> - if (!size)
>> - return result;
>>
>> - tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
>> - if (tmp == 0UL) /* Are any bits set? */
>> - return result + size; /* Nope. */
>> -found:
>> - return result + __ffs(tmp);
>> + return size;
>> }
>> EXPORT_SYMBOL(find_first_bit);
>> #endif
>> @@ -141,24 +94,14 @@ EXPORT_SYMBOL(find_first_bit);
>> */
>> unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = 0;
>> - unsigned long tmp;
>> + unsigned long idx;
>>
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if (~(tmp = *(p++)))
>> - goto found;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> + for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
>> + if (addr[idx] != ULONG_MAX)
>> + return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
>> }
>> - if (!size)
>> - return result;
>>
>> - tmp = (*p) | (~0UL << size);
>> - if (tmp == ~0UL) /* Are any bits zero? */
>> - return result + size; /* Nope. */
>> -found:
>> - return result + ffz(tmp);
>> + return size;
>> }
>> EXPORT_SYMBOL(find_first_zero_bit);
>> #endif
>> @@ -166,18 +109,6 @@ EXPORT_SYMBOL(find_first_zero_bit);
>> #ifdef __BIG_ENDIAN
>>
>> /* include/linux/byteorder does not support "unsigned long" type */
>> -static inline unsigned long ext2_swabp(const unsigned long * x)
>> -{
>> -#if BITS_PER_LONG == 64
>> - return (unsigned long) __swab64p((u64 *) x);
>> -#elif BITS_PER_LONG == 32
>> - return (unsigned long) __swab32p((u32 *) x);
>> -#else
>> -#error BITS_PER_LONG not defined
>> -#endif
>> -}
>> -
>> -/* include/linux/byteorder doesn't support "unsigned long" type */
>> static inline unsigned long ext2_swab(const unsigned long y)
>> {
>> #if BITS_PER_LONG == 64
>> @@ -189,48 +120,40 @@ static inline unsigned long ext2_swab(const unsigned long y)
>> #endif
>> }
>>
>> +#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
>> +static unsigned long _find_next_bit_le(const unsigned long *addr,
>> + unsigned long nbits, unsigned long start, bool set)
>> +{
>> + unsigned long tmp = set ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> +
>> + /* Handle 1st word. */
>> + if (!IS_ALIGNED(start, BITS_PER_LONG)) {
>> + tmp &= ext2_swab(HIGH_BITS_MASK(start % BITS_PER_LONG));
>> + start = round_down(start, BITS_PER_LONG);
>> + }
>> +
>> + while (!tmp) {
>> + start += BITS_PER_LONG;
>> + if (start >= nbits)
>> + return nbits;
>> +
>> + tmp = set ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> + }
>> +
>> + return start + __ffs(ext2_swab(tmp));
>> +}
>> +#endif
>> +
>> #ifndef find_next_zero_bit_le
>> unsigned long find_next_zero_bit_le(const void *addr, unsigned
>> long size, unsigned long offset)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = offset & ~(BITS_PER_LONG - 1);
>> - unsigned long tmp;
>> -
>> if (offset >= size)
>> return size;
> Again, I think this should be moved to the common implementation in
> _find_next_bit_le, and similarly for find_next_bit_le below.
>
>> - p += BITOP_WORD(offset);
>> - size -= result;
>> - offset &= (BITS_PER_LONG - 1UL);
>> - if (offset) {
>> - tmp = ext2_swabp(p++);
>> - tmp |= (~0UL >> (BITS_PER_LONG - offset));
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (~tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> - }
>>
>> - while (size & ~(BITS_PER_LONG - 1)) {
>> - if (~(tmp = *(p++)))
>> - goto found_middle_swap;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = ext2_swabp(p);
>> -found_first:
>> - tmp |= ~0UL << size;
>> - if (tmp == ~0UL) /* Are any bits zero? */
>> - return result + size; /* Nope. Skip ffz */
>> -found_middle:
>> - return result + ffz(tmp);
>> -
>> -found_middle_swap:
>> - return result + ffz(ext2_swab(tmp));
>> + return min(_find_next_bit_le(addr, size, offset, 0), size);
>> }
>> EXPORT_SYMBOL(find_next_zero_bit_le);
>> #endif
>> @@ -239,45 +162,10 @@ EXPORT_SYMBOL(find_next_zero_bit_le);
>> unsigned long find_next_bit_le(const void *addr, unsigned
>> long size, unsigned long offset)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = offset & ~(BITS_PER_LONG - 1);
>> - unsigned long tmp;
>> -
>> if (offset >= size)
>> return size;
>> - p += BITOP_WORD(offset);
>> - size -= result;
>> - offset &= (BITS_PER_LONG - 1UL);
>> - if (offset) {
>> - tmp = ext2_swabp(p++);
>> - tmp &= (~0UL << offset);
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> - }
>> -
>> - while (size & ~(BITS_PER_LONG - 1)) {
>> - tmp = *(p++);
>> - if (tmp)
>> - goto found_middle_swap;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = ext2_swabp(p);
>> -found_first:
>> - tmp &= (~0UL >> (BITS_PER_LONG - size));
>> - if (tmp == 0UL) /* Are any bits set? */
>> - return result + size; /* Nope. */
>> -found_middle:
>> - return result + __ffs(tmp);
>>
>> -found_middle_swap:
>> - return result + __ffs(ext2_swab(tmp));
>> + return min(_find_next_bit_le(addr, size, offset, 1), size);
>> }
>> EXPORT_SYMBOL(find_next_bit_le);
>> #endif

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/