[RFC] locking/mutex: Optimize __mutex_trylock_fast

From: Peter Zijlstra
Date: Thu Apr 05 2018 - 05:38:51 EST


Subject: locking/mutex: Optimize __mutex_trylock_fast()
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Thu Apr 5 11:05:35 CEST 2018

Use try_cmpxchg to avoid the pointless TEST instruction..
And add the (missing) atomic_long_try_cmpxchg*() wrappery.

On x86_64 this gives:

0000000000000710 <mutex_lock>: 0000000000000710 <mutex_lock>:
710: 65 48 8b 14 25 00 00 mov %gs:0x0,%rdx 710: 65 48 8b 14 25 00 00 mov %gs:0x0,%rdx
717: 00 00 717: 00 00
715: R_X86_64_32S current_task 715: R_X86_64_32S current_task
719: 31 c0 xor %eax,%eax 719: 31 c0 xor %eax,%eax
71b: f0 48 0f b1 17 lock cmpxchg %rdx,(%rdi) 71b: f0 48 0f b1 17 lock cmpxchg %rdx,(%rdi)
720: 48 85 c0 test %rax,%rax 720: 75 02 jne 724 <mutex_lock+0x14>
723: 75 02 jne 727 <mutex_lock+0x17> 722: f3 c3 repz retq
725: f3 c3 repz retq 724: eb da jmp 700 <__mutex_lock_slowpath>
727: eb d7 jmp 700 <__mutex_lock_slowpath>

On ARM64 this gives:

000000000000638 <mutex_lock>: 0000000000000638 <mutex_lock>:
638: d5384101 mrs x1, sp_el0 638: d5384101 mrs x1, sp_el0
63c: d2800002 mov x2, #0x0 63c: d2800002 mov x2, #0x0
640: f9800011 prfm pstl1strm, [x0] 640: f9800011 prfm pstl1strm, [x0]
644: c85ffc03 ldaxr x3, [x0] 644: c85ffc03 ldaxr x3, [x0]
648: ca020064 eor x4, x3, x2 648: ca020064 eor x4, x3, x2
64c: b5000064 cbnz x4, 658 <mutex_lock+0x20> 64c: b5000064 cbnz x4, 658 <mutex_lock+0x20>
650: c8047c01 stxr w4, x1, [x0] 650: c8047c01 stxr w4, x1, [x0]
654: 35ffff84 cbnz w4, 644 <mutex_lock+0xc> 654: 35ffff84 cbnz w4, 644 <mutex_lock+0xc>
658: b40000c3 cbz x3, 670 <mutex_lock+0x38> 658: b5000043 cbnz x3, 660 <mutex_lock+0x28>
65c: a9bf7bfd stp x29, x30, [sp,#-16]! 65c: d65f03c0 ret
660: 910003fd mov x29, sp 660: a9bf7bfd stp x29, x30, [sp,#-16]!
664: 97ffffef bl 620 <__mutex_lock_slowpath> 664: 910003fd mov x29, sp
668: a8c17bfd ldp x29, x30, [sp],#16 668: 97ffffee bl 620 <__mutex_lock_slowpath>
66c: d65f03c0 ret 66c: a8c17bfd ldp x29, x30, [sp],#16
670: d65f03c0 ret 670: d65f03c0 ret

Which to my untrained eye just looks different, not worse. Will?

Reported-by: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/asm-generic/atomic-long.h | 17 +++++++++++++++++
kernel/locking/mutex.c | 3 ++-
2 files changed, 19 insertions(+), 1 deletion(-)

--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -25,6 +25,7 @@ typedef atomic64_t atomic_long_t;

#define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i)
#define ATOMIC_LONG_PFX(x) atomic64 ## x
+#define ATOMIC_LONG_TYPE s64

#else

@@ -32,6 +33,7 @@ typedef atomic_t atomic_long_t;

#define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i)
#define ATOMIC_LONG_PFX(x) atomic ## x
+#define ATOMIC_LONG_TYPE int

#endif

@@ -90,6 +92,21 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release)
#define atomic_long_cmpxchg(l, old, new) \
(ATOMIC_LONG_PFX(_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), (old), (new)))

+
+#define atomic_long_try_cmpxchg_relaxed(l, old, new) \
+ (ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \
+ (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg_acquire(l, old, new) \
+ (ATOMIC_LONG_PFX(_try_cmpxchg_acquire)((ATOMIC_LONG_PFX(_t) *)(l), \
+ (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg_release(l, old, new) \
+ (ATOMIC_LONG_PFX(_try_cmpxchg_release)((ATOMIC_LONG_PFX(_t) *)(l), \
+ (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+#define atomic_long_try_cmpxchg(l, old, new) \
+ (ATOMIC_LONG_PFX(_try_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), \
+ (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
+
+
#define atomic_long_xchg_relaxed(v, new) \
(ATOMIC_LONG_PFX(_xchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(v), (new)))
#define atomic_long_xchg_acquire(v, new) \
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struc
static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
{
unsigned long curr = (unsigned long)current;
+ unsigned long zero = 0UL;

- if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
return true;

return false;