[PATCH v2 2/4] futex: Larger hash table

From: Davidlohr Bueso
Date: Tue Dec 03 2013 - 04:46:51 EST


Currently, the futex global hash table suffers from it's fixed, smallish
(for today's standards) size of 256 entries, as well as its lack of NUMA
awareness. Large systems, using many futexes, can be prone to high amounts
of collisions; where these futexes hash to the same bucket and lead to
extra contention on the same hb->lock. Furthermore, cacheline bouncing is a
reality when we have multiple hb->locks residing on the same cacheline and
different futexes hash to adjacent buckets.

This patch keeps the current static size of 16 entries for small systems,
or otherwise, 256 * ncpus (or larger as we need to round the number to a
power of 2). Note that this number of CPUs accounts for all CPUs that can
ever be available in the system, taking into consideration things like
hotpluging. While we do impose extra overhead at bootup by making the hash
table larger, this is a one time thing, and does not shadow the benefits
of this patch.

Furthermore, as suggested by tglx, by cache aligning the hash buckets we can
avoid access across cacheline boundaries and also avoid massive cache line
bouncing if multiple cpus are hammering away at different hash buckets which
happen to reside in the same cache line.

Also, similar to other core kernel components (pid, dcache, tcp), by using
alloc_large_system_hash() we benefit from its NUMA awareness and thus the
table is distributed among the nodes instead of in a single one.

For a custom microbenchmark that pounds on the uaddr hashing -- making the wait
path fail at futex_wait_setup() returning -EWOULDBLOCK for large amounts of
futexes, we can see the following benefits on a 80-core, 8-socket 1Tb server:

+---------+--------------------+------------------------+-----------------------+-------------------------------+
| threads | baseline (ops/sec) | aligned-only (ops/sec) | large table (ops/sec) | large table+aligned (ops/sec) |
+---------+--------------------+------------------------+-----------------------+-------------------------------+
|ÂÂÂÂ 512 |ÂÂÂÂÂÂÂÂÂÂÂÂÂ 32426 | 50531Â (+55.8%)ÂÂÂÂÂÂÂ | 255274Â (+687.2%)ÂÂÂÂ | 292553Â (+802.2%)ÂÂÂÂÂÂÂÂÂÂÂÂ |
|ÂÂÂÂ 256 |ÂÂÂÂÂÂÂÂÂÂÂÂÂ 65360 | 99588Â (+52.3%)ÂÂÂÂÂÂÂ | 443563Â (+578.6%)ÂÂÂÂ | 508088Â (+677.3%)ÂÂÂÂÂÂÂÂÂÂÂÂ |
|ÂÂÂÂ 128 |ÂÂÂÂÂÂÂÂÂÂÂÂ 125635 | 200075 (+59.2%)ÂÂÂÂÂÂÂ | 742613Â (+491.1%)ÂÂÂÂ | 835452Â (+564.9%)ÂÂÂÂÂÂÂÂÂÂÂÂ |
|ÂÂÂÂÂ 80 |ÂÂÂÂÂÂÂÂÂÂÂÂ 193559 | 323425 (+67.1%)ÂÂÂÂÂÂÂ | 1028147 (+431.1%)ÂÂÂÂ | 1130304 (+483.9%)ÂÂÂÂÂÂÂÂÂÂÂÂ |
|ÂÂÂÂÂ 64 |ÂÂÂÂÂÂÂÂÂÂÂÂ 247667 | 443740 (+79.1%)ÂÂÂÂÂÂÂ | 997300Â (+302.6%)ÂÂÂÂ | 1145494 (+362.5%)ÂÂÂÂÂÂÂÂÂÂÂÂ |
|ÂÂÂÂÂ 32 |ÂÂÂÂÂÂÂÂÂÂÂÂ 628412 | 721401 (+14.7%)ÂÂÂÂÂÂÂ | 965996Â (+53.7%)ÂÂÂÂÂ | 1122115 (+78.5%)ÂÂÂÂÂÂÂÂÂÂÂÂÂ |
+---------+--------------------+------------------------+-----------------------+-------------------------------+

Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Reviewed-by: Darren Hart <dvhart@xxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Mike Galbraith <efault@xxxxxx>
Cc: Jeff Mahoney <jeffm@xxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Scott Norton <scott.norton@xxxxxx>
Cc: Tom Vaden <tom.vaden@xxxxxx>
Cc: Aswin Chandramouleeswaran <aswin@xxxxxx>
Signed-off-by: Waiman Long <Waiman.Long@xxxxxx>
Signed-off-by: Jason Low <jason.low2@xxxxxx>
Signed-off-by: Davidlohr Bueso <davidlohr@xxxxxx>
---
kernel/futex.c | 26 +++++++++++++++++++-------
1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index e6ffe73..e603520 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -63,6 +63,7 @@
#include <linux/sched/rt.h>
#include <linux/hugetlb.h>
#include <linux/freezer.h>
+#include <linux/bootmem.h>

#include <asm/futex.h>

@@ -70,8 +71,6 @@

int __read_mostly futex_cmpxchg_enabled;

-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
-
/*
* Futex flags used to encode options to functions and preserve them across
* restarts.
@@ -149,9 +148,11 @@ static const struct futex_q futex_q_init = {
struct futex_hash_bucket {
spinlock_t lock;
struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;

-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static unsigned long __read_mostly futex_hashsize;
+
+static struct futex_hash_bucket *futex_queues;

/*
* We hash on the keys returned from get_futex_key (see below).
@@ -161,7 +162,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
u32 hash = jhash2((u32*)&key->both.word,
(sizeof(key->both.word)+sizeof(key->both.ptr))/4,
key->both.offset);
- return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+ return &futex_queues[hash & (futex_hashsize - 1)];
}

/*
@@ -2718,7 +2719,18 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
static int __init futex_init(void)
{
u32 curval;
- int i;
+ unsigned long i;
+
+#if CONFIG_BASE_SMALL
+ futex_hashsize = 16;
+#else
+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+ futex_hashsize, 0,
+ futex_hashsize < 256 ? HASH_SMALL : 0,
+ NULL, NULL, futex_hashsize, futex_hashsize);

/*
* This will fail and we want it. Some arch implementations do
@@ -2733,7 +2745,7 @@ static int __init futex_init(void)
if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;

- for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ for (i = 0; i < futex_hashsize; i++) {
plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}
--
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/