Re: [PATCH v2 1/3] lib: find_*_bit reimplementation

From: Rasmus Villemoes
Date: Mon Feb 02 2015 - 05:44:05 EST

Next message: Hans Verkuil: "Re: [PATCH] media: i2c: ADV7604: Migrate to regmap"
Previous message: Roger Pau Monné: "Re: [PATCH 2/2] drivers: xen/block: add multi-page ring support"
In reply to: Yury: "Re: [PATCH v2 1/3] lib: find_*_bit reimplementation"
Next in thread: George Spelvin: "Re: [PATCH v2 1/3] lib: find_*_bit reimplementation"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On Sat, Jan 31 2015, yury.norov@xxxxxxxxx wrote:

> From: Yury Norov <y.norov@xxxxxxxxxxx>
>
> New implementations takes less space in source file (see diffstat)
> and in object. For me it's 710 vs 453 bytes of text.
>

New version generally looks good. Please include a summary of the
changes between the versions either below the --- line or in a 0/n cover
letter, especially since you've now expanded the scope of the series.

Comments below.

>
> Patch was boot-tested on x86_64 and MIPS (big-endian) machines.
> Performance tests were ran on userspace with code like this:
>
> /* addr[] is filled from /dev/urandom */
> start = clock();
> while (ret < nbits)
> ret = find_next_bit(addr, nbits, ret + 1);
>
> end = clock();
> printf("%ld\t", (unsigned long) end - start);
>
> On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz rezults are next:
> (for find_next_bit, nbits is 8M, for find_first_bit - 80K)
>
> find_next_bit: find_first_bit:
> new current new current
> 26932 43151 14777 14925
> 26947 43182 14521 15423
> 26507 43824 15053 14705
> 27329 43759 14473 14777
> 26895 43367 14847 15023
> 26990 43693 15103 15163
> 26775 43299 15067 15232
> 27282 42752 14544 15121
> 27504 43088 14644 14858
> 26761 43856 14699 15193
> 26692 43075 14781 14681
> 27137 42969 14451 15061
> ... ...
>
> find_next_bit performance gain is 35-40%;
> find_first_bit - no measurable difference.
>
> Signed-off-by: Yury Norov <yury.norov@xxxxxxxxx>
> ---
> lib/find_last_bit.c | 31 ++-----
> lib/find_next_bit.c | 254 +++++++++++++++-------------------------------------
> 2 files changed, 79 insertions(+), 206 deletions(-)
>
> diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
> index 91ca09f..e67e970 100644
> --- a/lib/find_last_bit.c
> +++ b/lib/find_last_bit.c
> @@ -4,44 +4,29 @@
> * Written by Rusty Russell <rusty@xxxxxxxxxxxxxxx>
> * (Inspired by David Howell's find_next_bit implementation)
> *
> + * Rewritten by Yury Norov <yury.norov@xxxxxxxxx> to decrease
> + * size and improve performance, 2015.
> + *
> * This program is free software; you can redistribute it and/or
> * modify it under the terms of the GNU General Public License
> * as published by the Free Software Foundation; either version
> * 2 of the License, or (at your option) any later version.
> */
>
> -#include <linux/bitops.h>

Why do you remove that #include? It is rather important that the header
and implementation don't get out of sync. I know that kernel.h includes
bitops.h, but please don't rely on such things. Quoting SubmitChecklist:

1: If you use a facility then #include the file that defines/declares
that facility. Don't depend on other header files pulling in ones
that you use.

> #include <linux/export.h>
> -#include <asm/types.h>
> -#include <asm/byteorder.h>

However, getting rid of includes that are no longer needed is certainly
a good thing.

> +#include <linux/kernel.h>
>
> #ifndef find_last_bit
>
> unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
> {
> - unsigned long words;
> - unsigned long tmp;
> -
> - /* Start at final word. */
> - words = size / BITS_PER_LONG;
> -
> - /* Partial final word? */
> - if (size & (BITS_PER_LONG-1)) {
> - tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
> - - (size & (BITS_PER_LONG-1)))));
> - if (tmp)
> - goto found;
> - }
> + unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
>
> - while (words) {
> - tmp = addr[--words];
> - if (tmp) {
> -found:
> - return words * BITS_PER_LONG + __fls(tmp);
> - }
> + while (idx--) {
> + if (addr[idx])
> + return min(idx * BITS_PER_LONG + __fls(addr[idx]), size);
> }
>
> - /* Not found */
> return size;
> }
> EXPORT_SYMBOL(find_last_bit);
> diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
> index 0cbfc0b..ebfb3dc 100644
> --- a/lib/find_next_bit.c
> +++ b/lib/find_next_bit.c
> @@ -3,18 +3,45 @@
> * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
> * Written by David Howells (dhowells@xxxxxxxxxx)
> *
> + * Rewritten by Yury Norov <yury.norov@xxxxxxxxx> to decrease
> + * size and improve performance, 2015.
> + *
> * This program is free software; you can redistribute it and/or
> * modify it under the terms of the GNU General Public License
> * as published by the Free Software Foundation; either version
> * 2 of the License, or (at your option) any later version.
> */
>
> -#include <linux/bitops.h>
> #include <linux/export.h>
> -#include <asm/types.h>
> -#include <asm/byteorder.h>
> +#include <linux/kernel.h>

Same as above.

> +#define HIGH_BITS_MASK(nr) (ULONG_MAX << (nr))
> +
> +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
> +static unsigned long _find_next_bit(const unsigned long *addr,
> + unsigned long nbits, unsigned long start, bool set)
> +{
> + unsigned long tmp = set ? addr[start / BITS_PER_LONG]
> + : ~addr[start / BITS_PER_LONG];
> +
> + /* Handle 1st word. */
> + if (!IS_ALIGNED(start, BITS_PER_LONG)) {
> + tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
> + start = round_down(start, BITS_PER_LONG);
> + }
>
> -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
> + while (!tmp) {
> + start += BITS_PER_LONG;
> + if (start >= nbits)
> + return nbits;
> +
> + tmp = set ? addr[start / BITS_PER_LONG]
> + : ~addr[start / BITS_PER_LONG];
> + }
> +
> + return start + __ffs(tmp);
> +}
> +#endif
>
> #ifndef find_next_bit
> /*
> @@ -23,86 +50,22 @@
> unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
> unsigned long offset)
> {
> - const unsigned long *p = addr + BITOP_WORD(offset);
> - unsigned long result = offset & ~(BITS_PER_LONG-1);
> - unsigned long tmp;
> -
> if (offset >= size)
> return size;

Why can't this ...

> - size -= result;
> - offset %= BITS_PER_LONG;
> - if (offset) {
> - tmp = *(p++);
> - tmp &= (~0UL << offset);
> - if (size < BITS_PER_LONG)
> - goto found_first;
> - if (tmp)
> - goto found_middle;
> - size -= BITS_PER_LONG;
> - result += BITS_PER_LONG;
> - }
> - while (size & ~(BITS_PER_LONG-1)) {
> - if ((tmp = *(p++)))
> - goto found_middle;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> - }
> - if (!size)
> - return result;
> - tmp = *p;
>
> -found_first:
> - tmp &= (~0UL >> (BITS_PER_LONG - size));
> - if (tmp == 0UL) /* Are any bits set? */
> - return result + size; /* Nope. */
> -found_middle:
> - return result + __ffs(tmp);
> + return min(_find_next_bit(addr, size, offset, 1), size);

... and this be part of _find_next_bit? Can find_next_bit not be simply
'return _find_next_bit(addr, size, offset, 1);', and similarly for
find_next_zero_bit? Btw., passing true and false for the boolean
parameter may be a little clearer.

> }
> EXPORT_SYMBOL(find_next_bit);
> #endif
>
> #ifndef find_next_zero_bit
> -/*
> - * This implementation of find_{first,next}_zero_bit was stolen from
> - * Linus' asm-alpha/bitops.h.
> - */
> unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
> unsigned long offset)
> {
> - const unsigned long *p = addr + BITOP_WORD(offset);
> - unsigned long result = offset & ~(BITS_PER_LONG-1);
> - unsigned long tmp;
> -
> if (offset >= size)
> return size;

See above.

> - size -= result;
> - offset %= BITS_PER_LONG;
> - if (offset) {
> - tmp = *(p++);
> - tmp |= ~0UL >> (BITS_PER_LONG - offset);
> - if (size < BITS_PER_LONG)
> - goto found_first;
> - if (~tmp)
> - goto found_middle;
> - size -= BITS_PER_LONG;
> - result += BITS_PER_LONG;
> - }
> - while (size & ~(BITS_PER_LONG-1)) {
> - if (~(tmp = *(p++)))
> - goto found_middle;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> - }
> - if (!size)
> - return result;
> - tmp = *p;
>
> -found_first:
> - tmp |= ~0UL << size;
> - if (tmp == ~0UL) /* Are any bits zero? */
> - return result + size; /* Nope. */
> -found_middle:
> - return result + ffz(tmp);
> + return min(_find_next_bit(addr, size, offset, 0), size);

See above.

> }
> EXPORT_SYMBOL(find_next_zero_bit);
> #endif
> @@ -113,24 +76,14 @@ EXPORT_SYMBOL(find_next_zero_bit);
> */
> unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
> {
> - const unsigned long *p = addr;
> - unsigned long result = 0;
> - unsigned long tmp;
> + unsigned long idx;
>
> - while (size & ~(BITS_PER_LONG-1)) {
> - if ((tmp = *(p++)))
> - goto found;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> + for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
> + if (addr[idx])
> + return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
> }
> - if (!size)
> - return result;
>
> - tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
> - if (tmp == 0UL) /* Are any bits set? */
> - return result + size; /* Nope. */
> -found:
> - return result + __ffs(tmp);
> + return size;
> }
> EXPORT_SYMBOL(find_first_bit);
> #endif
> @@ -141,24 +94,14 @@ EXPORT_SYMBOL(find_first_bit);
> */
> unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
> {
> - const unsigned long *p = addr;
> - unsigned long result = 0;
> - unsigned long tmp;
> + unsigned long idx;
>
> - while (size & ~(BITS_PER_LONG-1)) {
> - if (~(tmp = *(p++)))
> - goto found;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> + for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
> + if (addr[idx] != ULONG_MAX)
> + return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
> }
> - if (!size)
> - return result;
>
> - tmp = (*p) | (~0UL << size);
> - if (tmp == ~0UL) /* Are any bits zero? */
> - return result + size; /* Nope. */
> -found:
> - return result + ffz(tmp);
> + return size;
> }
> EXPORT_SYMBOL(find_first_zero_bit);
> #endif
> @@ -166,18 +109,6 @@ EXPORT_SYMBOL(find_first_zero_bit);
> #ifdef __BIG_ENDIAN
>
> /* include/linux/byteorder does not support "unsigned long" type */
> -static inline unsigned long ext2_swabp(const unsigned long * x)
> -{
> -#if BITS_PER_LONG == 64
> - return (unsigned long) __swab64p((u64 *) x);
> -#elif BITS_PER_LONG == 32
> - return (unsigned long) __swab32p((u32 *) x);
> -#else
> -#error BITS_PER_LONG not defined
> -#endif
> -}
> -
> -/* include/linux/byteorder doesn't support "unsigned long" type */
> static inline unsigned long ext2_swab(const unsigned long y)
> {
> #if BITS_PER_LONG == 64
> @@ -189,48 +120,40 @@ static inline unsigned long ext2_swab(const unsigned long y)
> #endif
> }
>
> +#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
> +static unsigned long _find_next_bit_le(const unsigned long *addr,
> + unsigned long nbits, unsigned long start, bool set)
> +{
> + unsigned long tmp = set ? addr[start / BITS_PER_LONG]
> + : ~addr[start / BITS_PER_LONG];
> +
> + /* Handle 1st word. */
> + if (!IS_ALIGNED(start, BITS_PER_LONG)) {
> + tmp &= ext2_swab(HIGH_BITS_MASK(start % BITS_PER_LONG));
> + start = round_down(start, BITS_PER_LONG);
> + }
> +
> + while (!tmp) {
> + start += BITS_PER_LONG;
> + if (start >= nbits)
> + return nbits;
> +
> + tmp = set ? addr[start / BITS_PER_LONG]
> + : ~addr[start / BITS_PER_LONG];
> + }
> +
> + return start + __ffs(ext2_swab(tmp));
> +}
> +#endif
> +
> #ifndef find_next_zero_bit_le
> unsigned long find_next_zero_bit_le(const void *addr, unsigned
> long size, unsigned long offset)
> {
> - const unsigned long *p = addr;
> - unsigned long result = offset & ~(BITS_PER_LONG - 1);
> - unsigned long tmp;
> -
> if (offset >= size)
> return size;

Again, I think this should be moved to the common implementation in
_find_next_bit_le, and similarly for find_next_bit_le below.

> - p += BITOP_WORD(offset);
> - size -= result;
> - offset &= (BITS_PER_LONG - 1UL);
> - if (offset) {
> - tmp = ext2_swabp(p++);
> - tmp |= (~0UL >> (BITS_PER_LONG - offset));
> - if (size < BITS_PER_LONG)
> - goto found_first;
> - if (~tmp)
> - goto found_middle;
> - size -= BITS_PER_LONG;
> - result += BITS_PER_LONG;
> - }
>
> - while (size & ~(BITS_PER_LONG - 1)) {
> - if (~(tmp = *(p++)))
> - goto found_middle_swap;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> - }
> - if (!size)
> - return result;
> - tmp = ext2_swabp(p);
> -found_first:
> - tmp |= ~0UL << size;
> - if (tmp == ~0UL) /* Are any bits zero? */
> - return result + size; /* Nope. Skip ffz */
> -found_middle:
> - return result + ffz(tmp);
> -
> -found_middle_swap:
> - return result + ffz(ext2_swab(tmp));
> + return min(_find_next_bit_le(addr, size, offset, 0), size);
> }
> EXPORT_SYMBOL(find_next_zero_bit_le);
> #endif
> @@ -239,45 +162,10 @@ EXPORT_SYMBOL(find_next_zero_bit_le);
> unsigned long find_next_bit_le(const void *addr, unsigned
> long size, unsigned long offset)
> {
> - const unsigned long *p = addr;
> - unsigned long result = offset & ~(BITS_PER_LONG - 1);
> - unsigned long tmp;
> -
> if (offset >= size)
> return size;
> - p += BITOP_WORD(offset);
> - size -= result;
> - offset &= (BITS_PER_LONG - 1UL);
> - if (offset) {
> - tmp = ext2_swabp(p++);
> - tmp &= (~0UL << offset);
> - if (size < BITS_PER_LONG)
> - goto found_first;
> - if (tmp)
> - goto found_middle;
> - size -= BITS_PER_LONG;
> - result += BITS_PER_LONG;
> - }
> -
> - while (size & ~(BITS_PER_LONG - 1)) {
> - tmp = *(p++);
> - if (tmp)
> - goto found_middle_swap;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> - }
> - if (!size)
> - return result;
> - tmp = ext2_swabp(p);
> -found_first:
> - tmp &= (~0UL >> (BITS_PER_LONG - size));
> - if (tmp == 0UL) /* Are any bits set? */
> - return result + size; /* Nope. */
> -found_middle:
> - return result + __ffs(tmp);
>
> -found_middle_swap:
> - return result + __ffs(ext2_swab(tmp));
> + return min(_find_next_bit_le(addr, size, offset, 1), size);
> }
> EXPORT_SYMBOL(find_next_bit_le);
> #endif
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Hans Verkuil: "Re: [PATCH] media: i2c: ADV7604: Migrate to regmap"
Previous message: Roger Pau Monné: "Re: [PATCH 2/2] drivers: xen/block: add multi-page ring support"
In reply to: Yury: "Re: [PATCH v2 1/3] lib: find_*_bit reimplementation"
Next in thread: George Spelvin: "Re: [PATCH v2 1/3] lib: find_*_bit reimplementation"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]