Re: [block IO crash] Re: 2.6.39-rc5-git2 boot crashs

From: Christoph Lameter
Date: Wed May 04 2011 - 16:04:48 EST


On Wed, 4 May 2011, Linus Torvalds wrote:

> On Wed, May 4, 2011 at 12:30 PM, Christoph Lameter <cl@xxxxxxxxx> wrote:
> >
> > The naming convention came about from the existing this_cpu_xxx
> > operations
>
> You're missing my point.
>
> An "add" operation makes sense even if it isn't atomic, because
> atomicity isn't a part of the definition of "add".
>
> But cmpxchg DOES NOT MAKE SENSE without atomicity guarantees.

This is not a real cmpxchg after all. It's not atomic in the sense of the
other functions. It's only "percpu atomic" if you want it that way. This is
*not* a full cmpxchg_double().

> The whole operation is about atomicity.
>
> Having a version that isn't atomic is STUPID. It's misleading. It's _wrong_.

Its "atomic" in the sense that it is an instruction that is either
executed or not in total and that fact alone allow the avoiding of
synchronization for preemption and interrupts. We just push as much
processing as possible into this single instruction and then we dont have
to worry about preemption or interrupts while this function is
executed by the processor.
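
F.e. a per cpu counter can then be kept like this (made up sketch, the
counter name is invented, but this_cpu_read()/this_cpu_cmpxchg() are the
existing interfaces):

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(unsigned long, event_count);

	static void count_event(void)
	{
		unsigned long old;

		do {
			old = this_cpu_read(event_count);
			/*
			 * A single unlocked cmpxchg on x86: nothing else
			 * can run on this cpu in the middle of it, so no
			 * preempt_disable()/irq disable is needed. It does
			 * NOT serialize against other cpus.
			 */
		} while (this_cpu_cmpxchg(event_count, old, old + 1) != old);
	}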

> In contrast, having a non-atomic "add" version is understandable.
>
> So when you say "naming convention", you're missing the much bigger
> naming convention. Namely the "cmpxchg" part!

Well, this is not really a true cmpxchg: there is no lock prefix.

The semantics of the this_cpu_xxx functions are not atomic but only per-cpu
atomic. That per-cpu atomicity can be provided by merely excluding
preemption, or by merely excluding interrupts.

In extreme cases we don't care about preemption or interrupts interfering
with the operation. We just want to opportunistically take advantage of
sophisticated instructions if they are available (f.e. for accurate vm
counters). Or we may have some other external means of serialization (like
holding a lock, or having already disabled preemption). That's what the __
operations are for.
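
Sketch of the externally serialized case (struct and field names invented;
this assumes all updaters run in process context under the lock, which also
disables preemption):

	#include <linux/percpu.h>
	#include <linux/spinlock.h>

	static DEFINE_PER_CPU(unsigned long, cached_objects);

	struct my_cache {		/* made up for illustration */
		spinlock_t lock;
		unsigned long batch;
	};

	static void refill(struct my_cache *c)
	{
		spin_lock(&c->lock);	/* disables preemption too */
		/*
		 * Every updater holds c->lock and none run from irq
		 * context, so the cheaper __ variant needs no
		 * preempt/irq protection of its own.
		 */
		__this_cpu_add(cached_objects, c->batch);
		spin_unlock(&c->lock);
	}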

Maybe I should have pushed cmpxchg_double() before this_cpu_cmpxchg() to
avoid these misunderstandings.
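
The kind of use I have in mind is a two word update where a generation
count travels with a pointer to avoid ABA. Roughly like this (a sketch
only, the names and the freelist convention are made up; the real users
come in later patches):

	struct freelist {
		void *head;		/* first free object */
		unsigned long gen;	/* generation count against ABA */
	} __attribute__((aligned(2 * sizeof(void *))));

	/* Each free object stores the pointer to the next one in its
	   first word (invented convention for this sketch). */
	#define next_object(p) (*(void **)(p))

	static void *pop(struct freelist *l)
	{
		void *head;
		unsigned long gen;

		do {
			head = l->head;
			gen = l->gen;
			if (!head)
				return NULL;
			/* Succeeds only if head AND gen are unchanged. */
		} while (!cmpxchg_double(&l->head, head, gen,
					 next_object(head), gen + 1));
		return head;
	}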


Here is the patch for the fully atomic cmpxchg_double(), which will later
be needed to make the non-per-cpu-specific processing lockless:


Subject: x86: Add support for cmpxchg_double

A simple implementation that only supports machine-word-sized operands and
does not have a fallback mode (that would require a spinlock).

Add 32 and 64 bit support for cmpxchg_double. cmpxchg_double uses
the cmpxchg8b or cmpxchg16b instruction on x86 processors to compare
and swap 2 machine words. This allows lockless algorithms to move more
context information through critical sections.

Set a flag CONFIG_CMPXCHG_DOUBLE to signal the support of that feature
during kernel builds.

Signed-off-by: Christoph Lameter <cl@xxxxxxxxx>

---
arch/x86/Kconfig.cpu | 3 ++
arch/x86/include/asm/cmpxchg_32.h | 46 ++++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/cmpxchg_64.h | 45 +++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/cpufeature.h | 1 +
4 files changed, 95 insertions(+)

Index: linux-2.6/arch/x86/include/asm/cmpxchg_64.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/cmpxchg_64.h 2011-04-13 15:19:53.000000000 -0500
+++ linux-2.6/arch/x86/include/asm/cmpxchg_64.h 2011-04-15 13:14:45.000000000 -0500
@@ -151,4 +151,49 @@ extern void __cmpxchg_wrong_size(void);
cmpxchg_local((ptr), (o), (n)); \
})

+#define cmpxchg16b(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __junk; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile(LOCK_PREFIX_HERE "lock; cmpxchg16b (%%rsi); setz %1" \
+ : "=d"(__junk), "=a"(__ret) \
+ : "S"(ptr), "b"(__new1), "c"(__new2), \
+ "a"(__old1), "d"(__old2) : "memory"); \
+ __ret; })
+
+
+#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __junk; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile("cmpxchg16b (%%rsi)\n\t\tsetz %1\n\t" \
+ : "=d"(__junk)_, "=a"(__ret) \
+ : "S"((ptr)), "b"(__new1), "c"(__new2), \
+ "a"(__old1), "d"(__old2)); \
+ __ret; })
+
+#define cmpxchg_double(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ VM_BUG_ON((unsigned long)(ptr) % 16); \
+ cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ VM_BUG_ON((unsigned long)(ptr) % 16); \
+ cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define system_has_cmpxchg_double() cpu_has_cx16
+
#endif /* _ASM_X86_CMPXCHG_64_H */
Index: linux-2.6/arch/x86/include/asm/cmpxchg_32.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/cmpxchg_32.h 2011-04-13 15:19:53.000000000 -0500
+++ linux-2.6/arch/x86/include/asm/cmpxchg_32.h 2011-04-15 13:14:45.000000000 -0500
@@ -280,4 +280,50 @@ static inline unsigned long cmpxchg_386(

#endif

+#define cmpxchg8b(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __dummy; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile(LOCK_PREFIX_HERE "lock; cmpxchg8b (%%esi); setz %1" \
+ : "=d"(__dummy), "=a" (__ret) \
+ : "S" ((ptr)), "a" (__old1), "d"(__old2), \
+ "b" (__new1), "c" (__new2) \
+ : "memory"); \
+ __ret; })
+
+
+#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __dummy; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile("cmpxchg8b (%%esi); tsetz %1" \
+ : "d="(__dummy), "=a"(__ret) \
+ : "S" ((ptr)), "a" (__old), "d"(__old2), \
+ "b" (__new1), "c" (__new2), \
+ : "memory"); \
+ __ret; })
+
+
+#define cmpxchg_double(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
+ VM_BUG_ON((unsigned long)(ptr) % 8); \
+ cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
+ VM_BUG_ON((unsigned long)(ptr) % 8); \
+ cmpxchg8b_local((ptr), (o1), (o2), (n1), (n2)); \
+})
+
#endif /* _ASM_X86_CMPXCHG_32_H */
Index: linux-2.6/arch/x86/Kconfig.cpu
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig.cpu 2011-04-13 15:19:53.000000000 -0500
+++ linux-2.6/arch/x86/Kconfig.cpu 2011-04-15 13:14:45.000000000 -0500
@@ -308,6 +308,9 @@ config X86_CMPXCHG
config CMPXCHG_LOCAL
def_bool X86_64 || (X86_32 && !M386)

+config CMPXCHG_DOUBLE
+ def_bool X86_64 || (X86_32 && !M386)
+
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
Index: linux-2.6/arch/x86/include/asm/cpufeature.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/cpufeature.h 2011-04-15 12:51:51.000000000 -0500
+++ linux-2.6/arch/x86/include/asm/cpufeature.h 2011-04-15 13:14:45.000000000 -0500
@@ -286,6 +286,7 @@ extern const char * const x86_power_flag
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)

#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
# define cpu_has_invlpg 1
--