[patch V2 2/7] futex: Hash private futexes per process

From: Thomas Gleixner
Date: Thu May 05 2016 - 15:06:29 EST


From: Sebastian Siewior <bigeasy@xxxxxxxxxxxxx>

The standard futex mechanism in the Linux kernel uses a global hash to store
transient state. Collisions on that hash can lead to performance degradation
especially on NUMA systems and on real-time enabled kernels even to priority
inversions.

To mitigate that problem we provide per process private hashing. On the first
futex operation in a process the kernel allocates a hash table. The hash table
is accessible via the process mm_struct. On Numa systems the hash is allocated
node local.

If the allocation fails then the global hash table is used as fallback, so
there is no user space visible impact of this feature.

The hash size is a default value which can be tweaked by the sys admin. The
sysctl interface is implemented in a follow up patch to make the review
simpler. For applications which have special requirements for the private hash
and to allow preallocation of the hash for RT applications, we'll provide a
futex OP in a follow up patch.

Performance data acquired on a 4 socket (node) Intel machine with perf bench
futex-hash:

Threads G 65536 P 4 P 8 P 16 P 32 P 64 P 128 P 256

1 8175006 8645465 8617469 8628686 8625223 8664491 8590934 8631582
2 8149869 8618385 8578185 8622267 8603253 8618787 8595073 8590591
4 7479482 5867840 7882991 7604838 7894380 7882850 7884911 7886278
8 7308822 2378057 5731051 5550479 7691198 7672814 7711939 7681549
16 7295893 677414 2670682 3453552 7158906 7688978 7677603 7690290

So with the proper hash size of the private hash is ~5% faster than the global
hash.

With a full perf bench futex-hash run with one process (36 threads) per node
and 1024 futexes per thread the following results are achieved:

G 65536 P 4 P 8 P 16 P 32 P 64 P 128 P 256 P 512 P 1024 P 2048
2673390 368952 682626 1223908 1845922 3003524 3538313 4118533 4286925 4289589 4274020

Ratio: 0,14 0,26 0,46 0,69 1,12 1,32 1,54 1,60 1,60 1,60

So with a private hash size of 256 buckets and above the performance is almost
steady in this pathological test case and factor 1.6 better than the global
hash. Even a 64 buckets hash is already 10% faster,

Signed-off-by: Sebastian Siewior <bigeasy@xxxxxxxxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
include/linux/futex.h | 38 ++++++++--
include/linux/futex_types.h | 12 +++
include/linux/mm_types.h | 4 +
init/Kconfig | 4 +
kernel/fork.c | 3
kernel/futex.c | 162 +++++++++++++++++++++++++++++++++++++++++++-
6 files changed, 212 insertions(+), 11 deletions(-)
create mode 100644 include/linux/futex_types.h

--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -1,6 +1,7 @@
#ifndef _LINUX_FUTEX_H
#define _LINUX_FUTEX_H

+#include <linux/futex_types.h>
#include <uapi/linux/futex.h>

struct inode;
@@ -21,16 +22,19 @@ handle_futex_death(u32 __user *uaddr, st
*
* offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
* We use the two low order bits of offset to tell what is the kind of key :
- * 00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
- * (no reference on an inode or mm)
+ * 00 : Private process futex (PTHREAD_PROCESS_PRIVATE) using process private
+ * hash (no reference on an inode or mm)
* 01 : Shared futex (PTHREAD_PROCESS_SHARED)
* mapped on a file (reference on the underlying inode)
* 10 : Shared futex (PTHREAD_PROCESS_SHARED)
* (but private mapping on an mm, and reference taken on it)
+ * 11 : Private process futex (PTHREAD_PROCESS_PRIVATE) using global hash
+ * (no reference on an inode or mm)
*/

-#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */
-#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+#define FUT_OFF_INODE 0x01 /* Key has a reference on inode */
+#define FUT_OFF_MMSHARED 0x02 /* Key has a reference on mm */
+#define FUT_OFF_PRIVATE 0x03 /* Key has no ref on inode/mm */

union futex_key {
struct {
@@ -60,12 +64,30 @@ extern void exit_pi_state_list(struct ta
#else
extern int futex_cmpxchg_enabled;
#endif
+
#else
-static inline void exit_robust_list(struct task_struct *curr)
-{
-}
-static inline void exit_pi_state_list(struct task_struct *curr)
+static inline void exit_robust_list(struct task_struct *curr) { }
+static inline void exit_pi_state_list(struct task_struct *curr) { }
+#endif
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+/* Process private hash data for futexes */
+
+extern unsigned int futex_default_hash_bits;
+extern unsigned int futex_max_hash_bits;
+
+extern void futex_mm_hash_exit(struct mm_struct *mm);
+
+static inline void futex_mm_hash_init(struct mm_struct *mm)
{
+ raw_spin_lock_init(&mm->futex_hash.lock);
+ mm->futex_hash.hash = NULL;
}
+
+#else
+
+static inline void futex_mm_hash_init(struct mm_struct *mm) { }
+static inline void futex_mm_hash_exit(struct mm_struct *mm) { }
#endif
+
#endif
--- /dev/null
+++ b/include/linux/futex_types.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_FUTEX_TYPES_H
+#define _LINUX_FUTEX_TYPES_H
+
+struct futex_hash_bucket;
+
+struct futex_hash {
+ struct raw_spinlock lock;
+ unsigned int hash_bits;
+ struct futex_hash_bucket *hash;
+};
+
+#endif
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -11,6 +11,7 @@
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/uprobes.h>
+#include <linux/futex_types.h>
#include <linux/page-flags-layout.h>
#include <asm/page.h>
#include <asm/mmu.h>
@@ -442,6 +443,9 @@ struct mm_struct {

struct linux_binfmt *binfmt;

+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ struct futex_hash futex_hash;
+#endif
cpumask_var_t cpu_vm_mask_var;

/* Architecture-specific MM context */
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1498,6 +1498,10 @@ config FUTEX
support for "fast userspace mutexes". The resulting kernel may not
run glibc-based applications correctly.

+config FUTEX_PRIVATE_HASH
+ bool
+ default FUTEX && SMP
+
config HAVE_FUTEX_CMPXCHG
bool
depends on FUTEX
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -617,6 +617,8 @@ static struct mm_struct *mm_init(struct
mm_init_owner(mm, p);
mmu_notifier_mm_init(mm);
clear_tlb_flush_pending(mm);
+ futex_mm_hash_init(mm);
+
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
@@ -713,6 +715,7 @@ void mmput(struct mm_struct *mm)
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
set_mm_exe_file(mm, NULL);
+ futex_mm_hash_exit(mm);
if (!list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
list_del(&mm->mmlist);
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -23,6 +23,9 @@
* Copyright (C) IBM Corporation, 2009
* Thanks to Thomas Gleixner for conceptual design and careful reviews.
*
+ * Private hashed futex support by Sebastian Siewior and Thomas Gleixner
+ * Copyright (C) Linutronix GmbH, 2016
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -49,6 +52,7 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
+#include <linux/hash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
@@ -169,6 +173,34 @@
* the code that actually moves the futex(es) between hash buckets (requeue_futex)
* will do the additional required waiter count housekeeping. This is done for
* double_lock_hb() and double_unlock_hb(), respectively.
+ *
+ * For private futexes we (pre)allocate a per process hash. We check lockless
+ * whether the hash is already allocated. To access the hash later we need
+ * information about the hash properties as well. This requires barriers as
+ * follows:
+ *
+ * CPU 0 CPU 1
+ * check_hash_allocation()
+ * if (mm->futex_hash.hash)
+ * return;
+ * hash = alloc_hash()
+ * lock(&mm->futex_hash.lock);
+ * if (!mm->futex_hash.hash) {
+ * mm->futex_hash.par = params;
+ *
+ * smp_wmb(); (A0) <-paired with-|
+ * |
+ * mm->futex_hash.hash = hash; |
+ * | check_hash_allocation()
+ * | if (mm->futex_hash.hash)
+ * | return;
+ * unlock(&mm->futex_hash.lock); | get_futex_key_refs()
+ * |
+ * |--------- smp_mb() (B)
+ * s = hash(f, mm->futex_hash.par);
+ * hb = &mm->futex_hash.hash[s];
+ *
+ * So we utilize the existing smp_mb() in get_futex_key_refs().
*/

#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
@@ -255,6 +287,22 @@ struct futex_hash_bucket {
struct plist_head chain;
} ____cacheline_aligned_in_smp;

+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+/*
+ * Process private hash for non-shared futexes
+ */
+#define FUTEX_USE_GLOBAL_HASH ((void *) 0x03)
+
+#define FUTEX_MIN_HASH_BITS order_base_2(4UL)
+#define FUTEX_DEF_HASH_BITS order_base_2(8UL)
+#define FUTEX_MAX_HASH_BITS order_base_2(256UL)
+
+unsigned int futex_default_hash_bits = FUTEX_DEF_HASH_BITS;
+unsigned int futex_max_hash_bits = FUTEX_MAX_HASH_BITS;
+#else
+static const unsigned int futex_default_hash_bits = 0;
+#endif
+
/*
* The base of the bucket array and its size are always used together
* (after initialization only in hash_futex()), so ensure that they
@@ -374,13 +422,13 @@ static inline int hb_waiters_pending(str
}

/**
- * hash_futex - Return the hash bucket in the global hash
+ * hash_global_futex - Return the hash bucket in the global hash
* @key: Pointer to the futex key for which the hash is calculated
*
* We hash on the keys returned from get_futex_key (see below) and return the
* corresponding hash bucket in the global hash.
*/
-static struct futex_hash_bucket *hash_futex(union futex_key *key)
+static struct futex_hash_bucket *hash_global_futex(union futex_key *key)
{
u32 hash = jhash2((u32*)&key->both.word,
(sizeof(key->both.word)+sizeof(key->both.ptr))/4,
@@ -388,9 +436,33 @@ static struct futex_hash_bucket *hash_fu
return &futex_queues[hash & (futex_hashsize - 1)];
}

+/**
+ * hash_futex - Get the hash bucket for a futex
+ *
+ * Returns either the process private or the global hash bucket which fits the
+ * key.
+ */
+static struct futex_hash_bucket *hash_futex(union futex_key *key)
+{
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ struct mm_struct *mm = current->mm;
+ unsigned int slot;
+
+ /*
+ * Futexes which use the per process hash have the lower bits cleared
+ */
+ if (key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED))
+ return hash_global_futex(key);
+
+ slot = hash_long(key->private.address, mm->futex_hash.hash_bits);
+ return &mm->futex_hash.hash[slot];
+#else
+ return hash_global_futex(key);
+#endif
+}

/**
- * match_futex - Check whether to futex keys are equal
+ * match_futex - Check whether two futex keys are equal
* @key1: Pointer to key1
* @key2: Pointer to key2
*
@@ -505,7 +577,20 @@ get_futex_key(u32 __user *uaddr, int fsh
*/
if (!fshared) {
key->private.mm = mm;
+ /*
+ * If we have a process private hash, then we store uaddr
+ * instead of the page base address.
+ */
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ if (mm->futex_hash.hash != FUTEX_USE_GLOBAL_HASH) {
+ key->private.address = (unsigned long) uaddr;
+ } else {
+ key->private.address = address;
+ key->both.offset |= FUT_OFF_PRIVATE;
+ }
+#else
key->private.address = address;
+#endif
get_futex_key_refs(key); /* implies smp_mb(); (B) */
return 0;
}
@@ -3153,6 +3238,75 @@ void exit_robust_list(struct task_struct
curr, pip);
}

+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+void futex_mm_hash_exit(struct mm_struct *mm)
+{
+ if (mm->futex_hash.hash && mm->futex_hash.hash != FUTEX_USE_GLOBAL_HASH)
+ kfree(mm->futex_hash.hash);
+ mm->futex_hash.hash = NULL;
+}
+
+static struct futex_hash_bucket *futex_alloc_hash(unsigned int hash_bits)
+{
+ struct futex_hash_bucket *hb;
+ size_t hash_size, size;
+ int i;
+
+ hash_size = 1 << hash_bits;
+ size = hash_size * sizeof(struct futex_hash_bucket);
+ hb = kzalloc_node(size, GFP_KERNEL, numa_node_id());
+ if (!hb)
+ return NULL;
+
+ for (i = 0; i < hash_size; i++) {
+ atomic_set(&hb[i].waiters, 0);
+ plist_head_init(&hb[i].chain);
+ spin_lock_init(&hb[i].lock);
+ }
+ return hb;
+}
+
+static void futex_populate_hash(unsigned int hash_bits)
+{
+ struct mm_struct *mm = current->mm;
+ struct futex_hash_bucket *hb = NULL;
+
+ /*
+ * We don't need an explicit smp_mb() when the hash is populated
+ * because before we dereference mm->futex_hash.hash_bits in the hash
+ * function we have an smp_mb() in futex_get_key_refs() already.
+ */
+ if (mm->futex_hash.hash)
+ return;
+
+ /*
+ * If we failed to allocate a hash on the fly, fall back to the global
+ * hash.
+ */
+ hb = futex_alloc_hash(hash_bits);
+ if (!hb)
+ hb = FUTEX_USE_GLOBAL_HASH;
+
+ raw_spin_lock(&mm->futex_hash.lock);
+ /* We might have raced with another task allocating the hash. */
+ if (!mm->futex_hash.hash) {
+ mm->futex_hash.hash_bits = hash_bits;
+ /*
+ * Ensure that the above is visible before we store
+ * the pointer.
+ */
+ smp_wmb(); /* (A0) Pairs with (B) */
+ mm->futex_hash.hash = hb;
+ hb = NULL;
+ }
+ raw_spin_unlock(&mm->futex_hash.lock);
+ kfree(hb);
+}
+#else /* CONFIG_FUTEX_PRIVATE_HASH */
+static inline void futex_populate_hash(unsigned int hash_bits) { }
+#endif
+
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
@@ -3161,6 +3315,8 @@ long do_futex(u32 __user *uaddr, int op,

if (!(op & FUTEX_PRIVATE_FLAG))
flags |= FLAGS_SHARED;
+ else
+ futex_populate_hash(futex_default_hash_bits);

if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT;