[tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

From: tip-bot2 for Uros Bizjak
Date: Wed Mar 20 2024 - 07:22:34 EST


The following commit has been merged into the x86/percpu branch of tip:

Commit-ID: 50c6d2457d01944d58af355d324cb7106de19a66
Gitweb: https://git.kernel.org/tip/50c6d2457d01944d58af355d324cb7106de19a66
Author: Uros Bizjak <ubizjak@xxxxxxxxx>
AuthorDate: Wed, 20 Mar 2024 09:30:40 +01:00
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitterDate: Wed, 20 Mar 2024 12:08:11 +01:00

x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

Rewrite percpu_xchg_op() using generic percpu primitives instead
of using asm. The new implementation is similar to local_xchg() and
allows the compiler to perform various optimizations: e.g. the
compiler is able to create a fast path through the loop, according
to the likely/unlikely annotations in percpu_try_cmpxchg_op().

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@xxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
Link: https://lore.kernel.org/r/20240320083127.493250-1-ubizjak@xxxxxxxxx
---
arch/x86/include/asm/percpu.h | 32 +++++++++++---------------------
1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958eb..de991e6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,25 +230,15 @@ do { \
})

/*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix. The processor cannot prefetch
- * cachelines if xchg is used.
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
*/
-#define percpu_xchg_op(size, qual, _var, _nval) \
+#define this_percpu_xchg_op(_var, _nval) \
({ \
- __pcpu_type_##size pxo_old__; \
- __pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval); \
- asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), \
- "%[oval]") \
- "\n1:\t" \
- __pcpu_op2_##size("cmpxchg", "%[nval]", \
- __percpu_arg([var])) \
- "\n\tjnz 1b" \
- : [oval] "=&a" (pxo_old__), \
- [var] "+m" (__my_cpu_var(_var)) \
- : [nval] __pcpu_reg_##size(, pxo_new__) \
- : "memory"); \
- (typeof(_var))(unsigned long) pxo_old__; \
+ typeof(_var) pxo_old__ = this_cpu_read(_var); \
+ do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \
+ pxo_old__; \
})

/*
@@ -534,9 +524,9 @@ do { \
#define this_cpu_or_1(pcp, val) percpu_to_op(1, volatile, "or", (pcp), val)
#define this_cpu_or_2(pcp, val) percpu_to_op(2, volatile, "or", (pcp), val)
#define this_cpu_or_4(pcp, val) percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) this_percpu_xchg_op(pcp, nval)

#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val)
#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val)
@@ -575,7 +565,7 @@ do { \
#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val)
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval) this_percpu_xchg_op(pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
#endif