[tip:locking/core] locking/rwsem: Guard against making count negative

From: tip-bot for Waiman Long
Date: Mon Jun 17 2019 - 10:40:41 EST


Commit-ID: a15ea1a35f1b2782befc8b958c123c5d6a7cab0a
Gitweb: https://git.kernel.org/tip/a15ea1a35f1b2782befc8b958c123c5d6a7cab0a
Author: Waiman Long <longman@xxxxxxxxxx>
AuthorDate: Mon, 20 May 2019 16:59:15 -0400
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Mon, 17 Jun 2019 12:28:11 +0200

locking/rwsem: Guard against making count negative

The upper bits of the count field are used as the reader count. When a
sufficient number of active readers are present, the most significant
bit will be set and the count becomes negative. If the number of active
readers keeps on piling up, we may eventually overflow the reader count.
This is not likely to happen unless the number of bits reserved for the
reader count is reduced because those bits are needed for other purposes.

To prevent this count overflow from happening, the most significant
bit is now treated as a guard bit (RWSEM_FLAG_READFAIL). Read-lock
attempts will now fail for both the fast and slow paths whenever this
bit is set. So all those extra readers will be put to sleep in the wait
list. Wakeup will not happen until the reader count reaches 0.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Davidlohr Bueso <dave@xxxxxxxxxxxx>
Cc: H. Peter Anvin <hpa@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
Cc: Will Deacon <will.deacon@xxxxxxx>
Cc: huang ying <huang.ying.caritas@xxxxxxxxx>
Link: https://lkml.kernel.org/r/20190520205918.22251-17-longman@xxxxxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
kernel/locking/rwsem.c | 53 ++++++++++++++++++++++++++++++++++++++------------
1 file changed, 41 insertions(+), 12 deletions(-)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e1e0bac957c4..37524a47f002 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -116,13 +116,28 @@
#endif

/*
- * The definition of the atomic counter in the semaphore:
+ * On 64-bit architectures, the bit definitions of the count are:
*
- * Bit 0 - writer locked bit
- * Bit 1 - waiters present bit
- * Bit 2 - lock handoff bit
- * Bits 3-7 - reserved
- * Bits 8-X - 24-bit (32-bit) or 56-bit reader count
+ * Bit 0 - writer locked bit
+ * Bit 1 - waiters present bit
+ * Bit 2 - lock handoff bit
+ * Bits 3-7 - reserved
+ * Bits 8-62 - 55-bit reader count
+ * Bit 63 - read fail bit
+ *
+ * On 32-bit architectures, the bit definitions of the count are:
+ *
+ * Bit 0 - writer locked bit
+ * Bit 1 - waiters present bit
+ * Bit 2 - lock handoff bit
+ * Bits 3-7 - reserved
+ * Bits 8-30 - 23-bit reader count
+ * Bit 31 - read fail bit
+ *
+ * It is not likely that the most significant bit (read fail bit) will ever
+ * be set. This guard bit is still checked anyway in the down_read() fastpath
+ * just in case we need to use up more of the reader bits for other purpose
+ * in the future.
*
* atomic_long_fetch_add() is used to obtain reader lock, whereas
* atomic_long_cmpxchg() will be used to obtain writer lock.
@@ -139,6 +154,7 @@
#define RWSEM_WRITER_LOCKED (1UL << 0)
#define RWSEM_FLAG_WAITERS (1UL << 1)
#define RWSEM_FLAG_HANDOFF (1UL << 2)
+#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))

#define RWSEM_READER_SHIFT 8
#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT)
@@ -146,7 +162,7 @@
#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
- RWSEM_FLAG_HANDOFF)
+ RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)

/*
* All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -254,6 +270,14 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
owner | RWSEM_NONSPINNABLE));
}

+static inline bool rwsem_read_trylock(struct rw_semaphore *sem)
+{
+ long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
+ if (WARN_ON_ONCE(cnt < 0))
+ rwsem_set_nonspinnable(sem);
+ return !(cnt & RWSEM_READ_FAILED_MASK);
+}
+
/*
* Return just the real task structure pointer of the owner
*/
@@ -402,6 +426,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
return;
}

+ /*
+ * No reader wakeup if there are too many of them already.
+ */
+ if (unlikely(atomic_long_read(&sem->count) < 0))
+ return;
+
/*
* Writers might steal the lock before we grant it to the next reader.
* We prefer to do the first reader grant before counting readers
@@ -949,9 +979,9 @@ static struct rw_semaphore __sched *
rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
{
long count, adjustment = -RWSEM_READER_BIAS;
- bool wake = false;
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
+ bool wake = false;

/*
* Save the current read-owner of rwsem, if available, and the
@@ -1270,8 +1300,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
*/
inline void __down_read(struct rw_semaphore *sem)
{
- if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
- &sem->count) & RWSEM_READ_FAILED_MASK)) {
+ if (!rwsem_read_trylock(sem)) {
rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
} else {
@@ -1281,8 +1310,7 @@ inline void __down_read(struct rw_semaphore *sem)

static inline int __down_read_killable(struct rw_semaphore *sem)
{
- if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
- &sem->count) & RWSEM_READ_FAILED_MASK)) {
+ if (!rwsem_read_trylock(sem)) {
if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
return -EINTR;
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
@@ -1359,6 +1387,7 @@ inline void __up_read(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
+ DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
RWSEM_FLAG_WAITERS)) {
clear_wr_nonspinnable(sem);