[PATCH v2 2/3] locking/percpu-rwsem: Rework writer block/wake to not use wait-queues

From: Davidlohr Bueso
Date: Fri Dec 02 2016 - 21:19:17 EST


The use of any kind of wait queue is an overkill for pcpu-rwsems.
While one option would be to use the less heavy simple (swait)
flavor, this is still too much for what pcpu-rwsems need. For one,
we do not care about any sort of queuing in that the only (rare) time
writers (and readers, for that matter) are queued is when trying to
acquire the regular contended rw_sem. There cannot be any further
queuing as writers are serialized by the rw_sem in the first place.

This patch, therefore, implements custom wait/wake, with an rcu-aware
writer task pointer. The only time this is non-NULL is when a writer
is determining if it is going to block, and it is reset as soon as we
know that the percpu_down_write() call has succeeded. All this is
obviously done while
holding the regular rw_sem. As such, we can avoid the queue handling and
locking overhead (although we currently end up taking the waitqueue
spinlock in the fastpath, so it wouldn't have a very big impact).

Signed-off-by: Davidlohr Bueso <dbueso@xxxxxxx>
---
include/linux/percpu-rwsem.h | 5 ++---
kernel/locking/percpu-rwsem.c | 26 +++++++++++++++++++++-----
2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 5b2e6159b744..9942b7e8bde8 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -4,7 +4,6 @@
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/percpu.h>
-#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>

@@ -12,7 +11,7 @@ struct percpu_rw_semaphore {
struct rcu_sync rss;
unsigned int __percpu *read_count;
struct rw_semaphore rw_sem;
- wait_queue_head_t writer;
+ struct task_struct *writer; /* blocked writer */
int readers_block;
};

@@ -22,7 +21,7 @@ static struct percpu_rw_semaphore name = { \
.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
.read_count = &__percpu_rwsem_rc_##name, \
.rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
- .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
+ .writer = NULL, \
}

extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index ce182599cf2e..7856a77396d3 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,7 +1,6 @@
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/percpu.h>
-#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
@@ -18,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
__init_rwsem(&sem->rw_sem, name, rwsem_key);
- init_waitqueue_head(&sem->writer);
+ sem->writer = NULL;
sem->readers_block = 0;
return 0;
}
@@ -94,6 +93,8 @@ EXPORT_SYMBOL_GPL(__percpu_down_read);

void __percpu_up_read(struct percpu_rw_semaphore *sem)
{
+ struct task_struct *writer;
+
smp_mb(); /* B matches C */
/*
* In other words, if they see our decrement (presumably to aggregate
@@ -102,8 +103,13 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
*/
__this_cpu_dec(*sem->read_count);

+ rcu_read_lock();
+ writer = rcu_dereference(sem->writer);
+
/* Prod writer to recheck readers_active */
- wake_up(&sem->writer);
+ if (writer)
+ wake_up_process(writer);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(__percpu_up_read);

@@ -159,8 +165,18 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
* will wait for them.
*/

- /* Wait for all now active readers to complete. */
- wait_event(sem->writer, readers_active_check(sem));
+ WRITE_ONCE(sem->writer, current);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ if (readers_active_check(sem))
+ break;
+
+ schedule();
+ }
+
+ rcu_assign_pointer(sem->writer, NULL);
+ __set_current_state(TASK_RUNNING);
}
EXPORT_SYMBOL_GPL(percpu_down_write);

--
2.6.6