Re: [patches] Re: [PATCH v9 05/12] RISC-V: Atomic and Locking Code

From: Palmer Dabbelt
Date: Wed Nov 15 2017 - 14:48:15 EST

Next message: SF Markus Elfring: "[PATCH 0/2] ALSA: intel8x0: Fine-tuning for seven function implementations"
Previous message: Dan Murphy: "[PATCH v3 2/2] leds: lm3692x: Introduce LM3692x dual string driver"
In reply to: Will Deacon: "Re: [PATCH v9 05/12] RISC-V: Atomic and Locking Code"
Next in thread: Daniel Lustig: "RE: [patches] Re: [PATCH v9 05/12] RISC-V: Atomic and Locking Code"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On Wed, 15 Nov 2017 10:06:01 PST (-0800), will.deacon@xxxxxxx wrote:

Hi Palmer,

On Tue, Nov 14, 2017 at 12:30:59PM -0800, Palmer Dabbelt wrote:

On Tue, 24 Oct 2017 07:10:33 PDT (-0700), will.deacon@xxxxxxx wrote:
>On Tue, Sep 26, 2017 at 06:56:31PM -0700, Palmer Dabbelt wrote:
>>+ATOMIC_OPS(add, add, +, i)
>>+ATOMIC_OPS(sub, add, +, -i)
>>+ATOMIC_OPS(and, and, &, i)
>>+ATOMIC_OPS( or, or, |, i)
>>+ATOMIC_OPS(xor, xor, ^, i)
>
>What is the point in the c_op parameter to these things?

I guess there isn't one, it just lingered from a handful of refactorings.
It's used in some of the other functions if we need to do a C operation
after the atomic. How does this look?

commit 5db229491a205ad0e1aa18287e3b342176c62d30 (HEAD -> staging-mm)
Author: Palmer Dabbelt <palmer@xxxxxxxxxxx>
Date: Tue Nov 14 11:35:37 2017 -0800

RISC-V: Remove c_op from ATOMIC_OP

This was an unused macro parameter.

Signed-off-by: Palmer Dabbelt <palmer@xxxxxxxxxxx>

You can do the same thing for FETCH_OP, I think.

I agree. I think I got them all this time.

commit be88c8a5f714569b56fffcc9f2d3bc673bdaf16b
Author: Palmer Dabbelt <palmer@xxxxxxxxxxx>
Date: Wed Nov 15 10:19:15 2017 -0800

Remove c_op from more places

diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 77a7fc55daab..1982c865ba59 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -83,7 +83,7 @@ ATOMIC_OPS(xor, xor, i)
* There's two flavors of these: the arithmatic ops have both fetch and return
* versions, while the logical ops only have fetch versions.
*/
-#define ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, asm_type, c_type, prefix) \
+#define ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, asm_type, c_type, prefix) \
static __always_inline c_type atomic##prefix##_fetch_##op##c_or(c_type i, atomic##prefix##_t *v) \
{ \
register c_type ret; \
@@ -103,13 +103,13 @@ static __always_inline c_type atomic##prefix##_##op##_return##c_or(c_type i, ato

#ifdef CONFIG_GENERIC_ATOMIC64
#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or) \
- ATOMIC_FETCH_OP (op, asm_op, c_op, I, asm_or, c_or, w, int, ) \
+ ATOMIC_FETCH_OP (op, asm_op, I, asm_or, c_or, w, int, ) \
ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w, int, )
#else
#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or) \
- ATOMIC_FETCH_OP (op, asm_op, c_op, I, asm_or, c_or, w, int, ) \
+ ATOMIC_FETCH_OP (op, asm_op, I, asm_or, c_or, w, int, ) \
ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w, int, ) \
- ATOMIC_FETCH_OP (op, asm_op, c_op, I, asm_or, c_or, d, long, 64) \
+ ATOMIC_FETCH_OP (op, asm_op, I, asm_or, c_or, d, long, 64) \
ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, d, long, 64)
#endif

@@ -126,28 +126,28 @@ ATOMIC_OPS(sub, add, +, -i, .aqrl, )
#undef ATOMIC_OPS

#ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or) \
- ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, w, int, )
+#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or) \
+ ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w, int, )
#else
-#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or) \
- ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, w, int, ) \
- ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, d, long, 64)
+#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or) \
+ ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w, int, ) \
+ ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, d, long, 64)
#endif

-ATOMIC_OPS(and, and, &, i, , _relaxed)
-ATOMIC_OPS(and, and, &, i, .aq , _acquire)
-ATOMIC_OPS(and, and, &, i, .rl , _release)
-ATOMIC_OPS(and, and, &, i, .aqrl, )
+ATOMIC_OPS(and, and, i, , _relaxed)
+ATOMIC_OPS(and, and, i, .aq , _acquire)
+ATOMIC_OPS(and, and, i, .rl , _release)
+ATOMIC_OPS(and, and, i, .aqrl, )

-ATOMIC_OPS( or, or, |, i, , _relaxed)
-ATOMIC_OPS( or, or, |, i, .aq , _acquire)
-ATOMIC_OPS( or, or, |, i, .rl , _release)
-ATOMIC_OPS( or, or, |, i, .aqrl, )
+ATOMIC_OPS( or, or, i, , _relaxed)
+ATOMIC_OPS( or, or, i, .aq , _acquire)
+ATOMIC_OPS( or, or, i, .rl , _release)
+ATOMIC_OPS( or, or, i, .aqrl, )

-ATOMIC_OPS(xor, xor, ^, i, , _relaxed)
-ATOMIC_OPS(xor, xor, ^, i, .aq , _acquire)
-ATOMIC_OPS(xor, xor, ^, i, .rl , _release)
-ATOMIC_OPS(xor, xor, ^, i, .aqrl, )
+ATOMIC_OPS(xor, xor, i, , _relaxed)
+ATOMIC_OPS(xor, xor, i, .aq , _acquire)
+ATOMIC_OPS(xor, xor, i, .rl , _release)
+ATOMIC_OPS(xor, xor, i, .aqrl, )

#undef ATOMIC_OPS

@@ -182,13 +182,13 @@ ATOMIC_OPS(add_negative, add, <, 0)
#undef ATOMIC_OP
#undef ATOMIC_OPS

-#define ATOMIC_OP(op, func_op, c_op, I, c_type, prefix) \
+#define ATOMIC_OP(op, func_op, I, c_type, prefix) \
static __always_inline void atomic##prefix##_##op(atomic##prefix##_t *v) \
{ \
atomic##prefix##_##func_op(I, v); \
}

-#define ATOMIC_FETCH_OP(op, func_op, c_op, I, c_type, prefix) \
+#define ATOMIC_FETCH_OP(op, func_op, I, c_type, prefix) \
static __always_inline c_type atomic##prefix##_fetch_##op(atomic##prefix##_t *v) \
{ \
return atomic##prefix##_fetch_##func_op(I, v); \
@@ -202,16 +202,16 @@ static __always_inline c_type atomic##prefix##_##op##_return(atomic##prefix##_t

#ifdef CONFIG_GENERIC_ATOMIC64
#define ATOMIC_OPS(op, asm_op, c_op, I) \
- ATOMIC_OP (op, asm_op, c_op, I, int, ) \
- ATOMIC_FETCH_OP (op, asm_op, c_op, I, int, ) \
+ ATOMIC_OP (op, asm_op, I, int, ) \
+ ATOMIC_FETCH_OP (op, asm_op, I, int, ) \
ATOMIC_OP_RETURN(op, asm_op, c_op, I, int, )
#else
#define ATOMIC_OPS(op, asm_op, c_op, I) \
- ATOMIC_OP (op, asm_op, c_op, I, int, ) \
- ATOMIC_FETCH_OP (op, asm_op, c_op, I, int, ) \
+ ATOMIC_OP (op, asm_op, I, int, ) \
+ ATOMIC_FETCH_OP (op, asm_op, I, int, ) \
ATOMIC_OP_RETURN(op, asm_op, c_op, I, int, ) \
- ATOMIC_OP (op, asm_op, c_op, I, long, 64) \
- ATOMIC_FETCH_OP (op, asm_op, c_op, I, long, 64) \
+ ATOMIC_OP (op, asm_op, I, long, 64) \
+ ATOMIC_FETCH_OP (op, asm_op, I, long, 64) \
ATOMIC_OP_RETURN(op, asm_op, c_op, I, long, 64)
#endif

>>+ATOMIC_OPS(add, add, +, i, , _relaxed)
>>+ATOMIC_OPS(add, add, +, i, .aq , _acquire)
>>+ATOMIC_OPS(add, add, +, i, .rl , _release)
>>+ATOMIC_OPS(add, add, +, i, .aqrl, )
>
>Have you checked that .aqrl is equivalent to "ordered", since there are
>interpretations where that isn't the case. Specifically:
>
>// all variables zero at start of time
>P0:
>WRITE_ONCE(x) = 1;
>atomic_add_return(y, 1);
>WRITE_ONCE(z) = 1;
>
>P1:
>READ_ONCE(z) // reads 1
>smp_rmb();
>READ_ONCE(x) // must not read 0

I haven't. We don't quite have a formal memory model specification yet.
I've added Daniel Lustig, who is creating that model. He should have a
better idea

Thanks. You really do need to ensure that, as it's heavily relied upon.

I know it's the case for our current processors, and I'm pretty sure it's the case for what's formally specified, but we'll have to wait for the spec in order to prove it.

>>+static inline void arch_read_lock(arch_rwlock_t *lock)
>>+{
>>+ int tmp;
>>+
>>+ __asm__ __volatile__(
>>+ "1: lr.w %1, %0\n"
>>+ " bltz %1, 1b\n"
>>+ " addi %1, %1, 1\n"
>>+ " sc.w.aq %1, %1, %0\n"
>>+ " bnez %1, 1b\n"
>>+ : "+A" (lock->lock), "=&r" (tmp)
>>+ :: "memory");
>>+}
>>+
>>+static inline void arch_write_lock(arch_rwlock_t *lock)
>>+{
>>+ int tmp;
>>+
>>+ __asm__ __volatile__(
>>+ "1: lr.w %1, %0\n"
>>+ " bnez %1, 1b\n"
>>+ " li %1, -1\n"
>>+ " sc.w.aq %1, %1, %0\n"
>>+ " bnez %1, 1b\n"
>>+ : "+A" (lock->lock), "=&r" (tmp)
>>+ :: "memory");
>>+}
>
>I think you have the same starvation issues as we have on arm64 here. I
>strongly recommend moving over to qrwlock :)

I still strongly recommend this!

I agree. I think we can replace both of these locks with the generic versions -- they're a lot better than ours. It's on the TODO list.

>
>>+#ifndef _ASM_RISCV_TLBFLUSH_H
>>+#define _ASM_RISCV_TLBFLUSH_H
>>+
>>+#ifdef CONFIG_MMU
>>+
>>+/* Flush entire local TLB */
>>+static inline void local_flush_tlb_all(void)
>>+{
>>+ __asm__ __volatile__ ("sfence.vma" : : : "memory");
>>+}
>>+
>>+/* Flush one page from local TLB */
>>+static inline void local_flush_tlb_page(unsigned long addr)
>>+{
>>+ __asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory");
>>+}
>
>Is this serialised against prior updates to the page table (set_pte) and
>also against subsequent instruction fetch?

This is a store -> (load, store) fence. The ordering is between stores that
touch paging data structures and the implicit loads that come from any
memory access when paging is enabled. As far as I can tell, it does not
enforce any instruction fetch ordering constraints.

That sounds pretty suspicious to me. You need to ensure that speculative
instruction fetch doesn't fetch instructions from the old mapping. Also,
store -> (load, store) fences don't generally order the page table walk,
which is what you need to order here.

It's the specific type of store -> (load, store) fence that orders page table walking. "sfence.vma" orders between the page table walker and the store buffer on a single core, as opposed to "fence w,rw" which would order between the memory stages of two different cores.

I'll have to check with a memory model guy for sure, but I don't think "sfence.vma" enforces anything WRT the instruction stream. If that's the case, this should do it

commit 4fa4374b3c73fc0022e51e648e8c9285829f6155
Author: Palmer Dabbelt <palmer@xxxxxxxxxxx>
Date: Wed Nov 15 10:28:16 2017 -0800

fence.i on TLB flushes

diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 5ee4ae370b5e..cda08a9734c5 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -17,16 +17,28 @@

#ifdef CONFIG_MMU

-/* Flush entire local TLB */
+/*
+ * Flush entire local TLB. We need the fence.i to ensure newly mapped pages
+ * are fetched correctly.
+ */
static inline void local_flush_tlb_all(void)
{
- __asm__ __volatile__ ("sfence.vma" : : : "memory");
+ __asm__ __volatile__ (
+ "sfence.vma\n\t"
+ "fence.i"
+ : : : "memory"
+ );
}

/* Flush one page from local TLB */
static inline void local_flush_tlb_page(unsigned long addr)
{
- __asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory");
+ __asm__ __volatile__ (
+ "sfence.vma %0\n\t"
+ "fence.i"
+ : : "r" (addr)
+ : "memory"
+ );
}

#ifndef CONFIG_SMP

Thanks!

Next message: SF Markus Elfring: "[PATCH 0/2] ALSA: intel8x0: Fine-tuning for seven function implementations"
Previous message: Dan Murphy: "[PATCH v3 2/2] leds: lm3692x: Introduce LM3692x dual string driver"
In reply to: Will Deacon: "Re: [PATCH v9 05/12] RISC-V: Atomic and Locking Code"
Next in thread: Daniel Lustig: "RE: [patches] Re: [PATCH v9 05/12] RISC-V: Atomic and Locking Code"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]