[RFC] Rebasing the IDR

From: Matthew Wilcox
Date: Thu Nov 30 2017 - 12:36:40 EST


About 40 of the approximately 180 users of the IDR in the kernel are
"1-based" instead of "0-based". That is, they never want to have ID 0
allocated; they want to see IDs allocated between 1 and N. Usually, that's
expressed like this:

/* Get the user-visible handle using idr. */
ret = idr_alloc(&file_priv->object_idr, obj, 1, 0, GFP_KERNEL);

The current implementation of this grieves me. You see, we mark each
node of the radix tree according to whether it has any free entries
or not, and entry 0 is always free! If we've already allocated 10,000
entries from this IDR, and see this call, we'll walk all the way down
the left side of the tree looking for entry 1, get to the bottom,
see that entries 1-63 are allocated, then walk up to the next level,
see that 64-4095 are allocated, walk up to the next level, see that
8192-12287 has a free entry, and then start walking down again.

It'd be somewhere around two to three times quicker to know not to
look down the left hand side of the tree, see that 1-8191 are used and
start looking in the range 8192-12287. I considered a function like
idr_reserve(idr, N) to reserve the first N entries (we have at least one
consumer in the tree that is 2-based, not 1-based), but that struck me
as inelegant. We also have the cool optimisation that if there's only
one element in the radix tree at offset 0, then we don't even need
to allocate a node to store it, we just store it right in the head.
And that optimisation was never available to the 1-based users.

I've come up with this solution instead, idr_base. It's free in terms
of memory consumption on 64-bit as it fits in the gap at the end of the
struct idr. Essentially, we just subtract off idr_base when looking
up the ID, and add it back on when reporting the ID. We need to do some
saturating arithmetic in idr_get_next(), because starting a walk at 4
billion or negative 8 quintillion doesn't work out terribly well.

It does require the user to call idr_init_base() instead of idr_init().
That should be a relatively small conversion effort. Opinions? Feedback?
Better test cases for the test-suite (ahem)?

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 91d27a9bcdf4..a657b1f925f8 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -18,6 +18,7 @@

struct idr {
struct radix_tree_root idr_rt;
+ unsigned int idr_base;
unsigned int idr_next;
};

@@ -30,9 +31,10 @@ struct idr {
/* Set the IDR flag and the IDR_FREE tag */
#define IDR_RT_MARKER ((__force gfp_t)(3 << __GFP_BITS_SHIFT))

-#define IDR_INIT \
-{ \
- .idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER) \
+#define IDR_INIT { \
+ .idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER), \
+ .idr_base = 0, \
+ .idr_next = 0, \
}
#define DEFINE_IDR(name) struct idr name = IDR_INIT

@@ -123,12 +125,20 @@ static inline int __must_check idr_alloc_u32(struct idr *idr, void *ptr,

static inline void *idr_remove(struct idr *idr, unsigned long id)
{
- return radix_tree_delete_item(&idr->idr_rt, id, NULL);
+ return radix_tree_delete_item(&idr->idr_rt, id - idr->idr_base, NULL);
}

static inline void idr_init(struct idr *idr)
{
INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER);
+ idr->idr_base = 0;
+ idr->idr_next = 0;
+}
+
+static inline void idr_init_base(struct idr *idr, int base)
+{
+ INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER);
+ idr->idr_base = base;
idr->idr_next = 0;
}

@@ -163,7 +173,7 @@ static inline void idr_preload_end(void)
*/
static inline void *idr_find(const struct idr *idr, unsigned long id)
{
- return radix_tree_lookup(&idr->idr_rt, id);
+ return radix_tree_lookup(&idr->idr_rt, id - idr->idr_base);
}

/**
diff --git a/lib/idr.c b/lib/idr.c
index 1aaeb5a8c426..a1d3531bd62f 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -33,21 +33,22 @@ int idr_alloc_ul(struct idr *idr, void *ptr, unsigned long *nextid,
{
struct radix_tree_iter iter;
void __rcu **slot;
+ int base = idr->idr_base;

if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
return -EINVAL;
if (WARN_ON_ONCE(!(idr->idr_rt.gfp_mask & ROOT_IS_IDR)))
idr->idr_rt.gfp_mask |= IDR_RT_MARKER;

- radix_tree_iter_init(&iter, *nextid);
- slot = idr_get_free(&idr->idr_rt, &iter, gfp, max);
+ radix_tree_iter_init(&iter, *nextid - base);
+ slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base);
if (IS_ERR(slot))
return PTR_ERR(slot);

radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);

- *nextid = iter.index;
+ *nextid = iter.index + base;
return 0;
}
EXPORT_SYMBOL_GPL(idr_alloc_ul);
@@ -143,13 +144,14 @@ int idr_for_each(const struct idr *idr,
{
struct radix_tree_iter iter;
void __rcu **slot;
+ int base = idr->idr_base;

radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
int ret;

if (WARN_ON_ONCE(iter.index > INT_MAX))
break;
- ret = fn(iter.index, rcu_dereference_raw(*slot), data);
+ ret = fn(iter.index + base, rcu_dereference_raw(*slot), data);
if (ret)
return ret;
}
@@ -172,15 +174,19 @@ void *idr_get_next(struct idr *idr, int *nextid)
{
struct radix_tree_iter iter;
void __rcu **slot;
+ int base = idr->idr_base;
+ int id = *nextid;

- slot = radix_tree_iter_find(&idr->idr_rt, &iter, *nextid);
+ id = (id < base) ? 0 : id - base;
+ slot = radix_tree_iter_find(&idr->idr_rt, &iter, id);
if (!slot)
return NULL;
+ id = iter.index + base;

- if (WARN_ON_ONCE(iter.index > INT_MAX))
+ if (WARN_ON_ONCE(id > INT_MAX))
return NULL;

- *nextid = iter.index;
+ *nextid = id;
return rcu_dereference_raw(*slot);
}
EXPORT_SYMBOL(idr_get_next);
@@ -199,12 +205,15 @@ void *idr_get_next_ul(struct idr *idr, unsigned long *nextid)
{
struct radix_tree_iter iter;
void __rcu **slot;
+ unsigned long base = idr->idr_base;
+ unsigned long id = *nextid;

- slot = radix_tree_iter_find(&idr->idr_rt, &iter, *nextid);
+ id = (id < base) ? 0 : id - base;
+ slot = radix_tree_iter_find(&idr->idr_rt, &iter, id);
if (!slot)
return NULL;

- *nextid = iter.index;
+ *nextid = iter.index + base;
return rcu_dereference_raw(*slot);
}
EXPORT_SYMBOL(idr_get_next_ul);
@@ -231,6 +240,7 @@ void *idr_replace(struct idr *idr, void *ptr, unsigned long id)

if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
return ERR_PTR(-EINVAL);
+ id -= idr->idr_base;

entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot);
if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE))
diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c
index 36437ade429c..44ef9eba5a7a 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -153,11 +153,12 @@ void idr_nowait_test(void)
idr_destroy(&idr);
}

-void idr_get_next_test(void)
+void idr_get_next_test(int base)
{
unsigned long i;
int nextid;
DEFINE_IDR(idr);
+ idr_init_base(&idr, base);

int indices[] = {4, 7, 9, 15, 65, 128, 1000, 99999, 0};

@@ -244,7 +245,9 @@ void idr_checks(void)
idr_alloc_test();
idr_null_test();
idr_nowait_test();
- idr_get_next_test();
+ idr_get_next_test(0);
+ idr_get_next_test(1);
+ idr_get_next_test(4);
}

/*