[RFC PATCH] membarrier: handle nohz_full with expedited thread registration

From: Mathieu Desnoyers
Date: Mon Jan 16 2017 - 14:59:06 EST


Threads running on nohz_full CPUs are not considered by
synchronize_sched(), but they should be covered by a membarrier system
call with the MEMBARRIER_CMD_SHARED command.

Introduce two new commands to membarrier:
MEMBARRIER_CMD_REGISTER_EXPEDITED and
MEMBARRIER_CMD_UNREGISTER_EXPEDITED.

Threads running on nohz_full CPUs that pair compiler barriers with the
membarrier system call, and therefore need to receive an interrupt to
guarantee correct memory ordering, should register as "expedited"
threads.

[ This RFC patch lacks documentation. I mainly want feedback to see if
everyone is OK with the general approach. ]

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Cc: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Josh Triplett <josh@xxxxxxxxxxxxxxxx>
Cc: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Nicholas Miell <nmiell@xxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Alan Cox <gnomes@xxxxxxxxxxxxxxxxxxx>
Cc: Lai Jiangshan <laijs@xxxxxxxxxxxxxx>
Cc: Stephen Hemminger <stephen@xxxxxxxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: David Howells <dhowells@xxxxxxxxxx>
Cc: Pranith Kumar <bobby.prani@xxxxxxxxx>
Cc: Michael Kerrisk <mtk.manpages@xxxxxxxxx>
Cc: Shuah Khan <shuahkh@xxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
---
fs/exec.c | 1 +
include/linux/sched.h | 27 +++++++++++++++
include/uapi/linux/membarrier.h | 6 ++++
kernel/fork.c | 2 ++
kernel/membarrier.c | 77 +++++++++++++++++++++++++++++++++++++++--
5 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index e579466..2cf1f87 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1771,6 +1771,7 @@ static int do_execveat_common(int fd, struct filename *filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
+ membarrier_execve(current);
acct_update_integrals(current);
task_numa_free(current);
free_bprm(bprm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad3ec9e..1242eb9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1998,6 +1998,9 @@ struct task_struct {
/* A live task holds one reference. */
atomic_t stack_refcount;
#endif
+#ifdef CONFIG_MEMBARRIER
+ unsigned int membarrier_expedited;
+#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/*
@@ -3671,4 +3674,28 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
void cpufreq_remove_update_util_hook(int cpu);
#endif /* CONFIG_CPU_FREQ */

+#ifdef CONFIG_MEMBARRIER
+/* New threads (CLONE_THREAD) start unregistered; other children copy current's count. */
+static inline void membarrier_fork(struct task_struct *t,
+		unsigned long clone_flags)
+{
+	bool is_thread = clone_flags & CLONE_THREAD;
+
+	t->membarrier_expedited = is_thread ? 0 : current->membarrier_expedited;
+}
+
+static inline void membarrier_execve(struct task_struct *t)
+{
+ t->membarrier_expedited = 0; /* reset t's expedited registration count to zero */
+}
+#else
+static inline void membarrier_fork(struct task_struct *t,
+ unsigned long clone_flags)
+{ /* no-op stub used when CONFIG_MEMBARRIER is not defined */
+}
+static inline void membarrier_execve(struct task_struct *t)
+{ /* no-op stub used when CONFIG_MEMBARRIER is not defined */
+}
+#endif
+
#endif
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108b..4b78f07 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,6 +40,10 @@
* (non-running threads are de facto in such a
* state). This covers threads from all processes
* running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_REGISTER_EXPEDITED:
+ * TODO
+ * @MEMBARRIER_CMD_UNREGISTER_EXPEDITED:
+ * TODO
*
* Command to be passed to the membarrier system call. The commands need to
* be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
@@ -48,6 +52,8 @@
enum membarrier_cmd {
MEMBARRIER_CMD_QUERY = 0,
MEMBARRIER_CMD_SHARED = (1 << 0),
+ MEMBARRIER_CMD_REGISTER_EXPEDITED = (1 << 1),
+ MEMBARRIER_CMD_UNREGISTER_EXPEDITED = (1 << 2),
};

#endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8a..cec23e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1769,6 +1769,8 @@ static __latent_entropy struct task_struct *copy_process(
*/
copy_seccomp(p);

+ membarrier_fork(p, clone_flags);
+
/*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 536c727..65a6fbf 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -16,12 +16,79 @@

#include <linux/syscalls.h>
#include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+
+/*
+ * TODO: private sched.h is needed for runqueue. Should we move the
+ * sched code under kernel/sched/ ?
+ */
+#include "sched/sched.h"

/*
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
* except MEMBARRIER_CMD_QUERY.
*/
-#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
+#define MEMBARRIER_CMD_BITMASK \
+ (MEMBARRIER_CMD_SHARED \
+ | MEMBARRIER_CMD_REGISTER_EXPEDITED \
+ | MEMBARRIER_CMD_UNREGISTER_EXPEDITED)
+
+/* Add one "expedited" registration to @t: returns 0, or -EOVERFLOW at UINT_MAX. */
+static int membarrier_register_expedited(struct task_struct *t)
+{
+	struct rq_flags rf;	/* IRQ state saved by task_rq_lock() */
+	struct rq *rq;
+	if (t->membarrier_expedited == UINT_MAX)
+		return -EOVERFLOW;
+	rq = task_rq_lock(t, &rf);	/* t's own rq, IRQs off; this_rq() could be stale */
+	t->membarrier_expedited++;
+	task_rq_unlock(rq, t, &rf);
+	return 0;
+}
+
+/* Drop one "expedited" registration of @t: returns 0, or -ENOENT if none is held. */
+static int membarrier_unregister_expedited(struct task_struct *t)
+{
+	struct rq_flags rf;	/* IRQ state saved by task_rq_lock() */
+	struct rq *rq;
+	if (!t->membarrier_expedited)
+		return -ENOENT;
+	rq = task_rq_lock(t, &rf);	/* t's own rq, IRQs off; this_rq() could be stale */
+	t->membarrier_expedited--;
+	task_rq_unlock(rq, t, &rf);
+	return 0;
+}
+
+static void memory_barrier(void *info)
+{
+ smp_mb(); /* @info is unused; this is the callback handed to smp_call_function_single() */
+}
+
+/* IPI smp_mb() to nohz_full CPUs whose current task is registered expedited. */
+static void membarrier_nohz_full_expedited(void)
+{
+	int cpu;
+
+	if (!tick_nohz_full_enabled())
+		return;
+	for_each_cpu(cpu, tick_nohz_full_mask) {
+		struct rq *rq = cpu_rq(cpu);
+		bool expedited;
+
+		raw_spin_lock_irq(&rq->lock);	/* rq->lock must be held IRQs-off */
+		expedited = rq->curr->membarrier_expedited;
+		raw_spin_unlock_irq(&rq->lock);
+		/*
+		 * IPI only after unlocking, with IRQs on: waiting on a CPU that
+		 * spins for this rq->lock with IRQs off would never complete.
+		 */
+		if (expedited)
+			WARN_ON_ONCE(smp_call_function_single(cpu,
+						memory_barrier, NULL, 1));
+	}
+}

/**
* sys_membarrier - issue memory barriers on a set of threads
@@ -57,9 +124,15 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
case MEMBARRIER_CMD_QUERY:
return MEMBARRIER_CMD_BITMASK;
case MEMBARRIER_CMD_SHARED:
- if (num_online_cpus() > 1)
+ if (num_online_cpus() > 1) {
synchronize_sched();
+ membarrier_nohz_full_expedited();
+ }
return 0;
+ case MEMBARRIER_CMD_REGISTER_EXPEDITED:
+ return membarrier_register_expedited(current);
+ case MEMBARRIER_CMD_UNREGISTER_EXPEDITED:
+ return membarrier_unregister_expedited(current);
default:
return -EINVAL;
}
--
2.1.4