Re: locking/atomic: Introduce atomic_try_cmpxchg()

From: Dmitry Vyukov
Date: Fri Mar 24 2017 - 13:51:50 EST


On Fri, Mar 24, 2017 at 6:23 PM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Fri, Mar 24, 2017 at 09:54:46AM -0700, Andy Lutomirski wrote:
>> > So the first snipped I tested regressed like so:
>> >
>> >
>> > 0000000000000000 <T_refcount_inc>: 0000000000000000 <T_refcount_inc>:
>> > 0: 8b 07 mov (%rdi),%eax 0: 8b 17 mov (%rdi),%edx
>> > 2: 83 f8 ff cmp $0xffffffff,%eax 2: 83 fa ff cmp $0xffffffff,%edx
>> > 5: 74 13 je 1a <T_refcount_inc+0x1a> 5: 74 1a je 21 <T_refcount_inc+0x21>
>> > 7: 85 c0 test %eax,%eax 7: 85 d2 test %edx,%edx
>> > 9: 74 0d je 18 <T_refcount_inc+0x18> 9: 74 13 je 1e <T_refcount_inc+0x1e>
>> > b: 8d 50 01 lea 0x1(%rax),%edx b: 8d 4a 01 lea 0x1(%rdx),%ecx
>> > e: f0 0f b1 17 lock cmpxchg %edx,(%rdi) e: 89 d0 mov %edx,%eax
>> > 12: 75 ee jne 2 <T_refcount_inc+0x2> 10: f0 0f b1 0f lock cmpxchg %ecx,(%rdi)
>> > 14: ff c2 inc %edx 14: 74 04 je 1a <T_refcount_inc+0x1a>
>> > 16: 75 02 jne 1a <T_refcount_inc+0x1a> 16: 89 c2 mov %eax,%edx
>> > 18: 0f 0b ud2 18: eb e8 jmp 2 <T_refcount_inc+0x2>
>> > 1a: c3 retq 1a: ff c1 inc %ecx
>> > 1c: 75 03 jne 21 <T_refcount_inc+0x21>
>> > 1e: 0f 0b ud2
>> > 20: c3 retq
>> > 21: c3 retq
>>
>> Can you re-send the better asm you got earlier?
>
> On the left?
>
>> If I pretend to be a dumb compiler, I wonder if you'd get better results with:
>>
>> if (!success) {
>> *_old = __old;
>> return false;
>> } else {
>> return true;
>> }
>>
>> or however you jam that into a statement expression. That way you
>> aren't relying on the compiler to merge the branches.
>
> I tried a few variants, but nothing really made it better.
>
> Find the tiny.c file below; I'm using:
>
> gcc (Debian 6.3.0-5) 6.3.0 20170124
>
> it has both an inline and an stmt-expr try_cmpxchg variant to play with;
> the 'expected' output is at the bottom (same as above left).
>
> Note that clang doesn't compile this stuff due to missing features.
>
> ---
>
> /* gcc -Os -std=gnu99 -fno-strict-overflow -falign-jumps=1 -falign-loops=1 -c tiny.c; objdump -dr tiny.o */
>
> typedef _Bool bool;
>
> static inline bool try_cmpxchg(unsigned int *ptr, unsigned int *val, unsigned int new)
> {
> unsigned int old = *val;
> bool success;
>
> asm volatile("lock cmpxchgl %[new], %[ptr]"
> : "=@ccz" (success),
> [ptr] "+m" (*ptr),
> [old] "+a" (old)
> : [new] "r" (new)
> : "memory");
>
> // if (!success)
> *val = old;
>
> return success;
> }
>
> #define __try_cmpxchg(_ptr, _pold, _new) \
> ({ \
> bool success; \
> __typeof__(_ptr) _old = (_pold); \
> __typeof__(*(_ptr)) __old = *_old; \
> __typeof__(*(_ptr)) __new = (_new); \
> volatile unsigned int * __ptr = (volatile unsigned int *)(_ptr);\
> \
> asm volatile("lock cmpxchgl %[new], %[ptr]" \
> : "=@ccz" (success), \
> [ptr] "+m" (*__ptr), \
> [old] "+a" (__old) \
> : [new] "r" (__new) \
> : "memory"); \
> /* if (!success) */ \
> *_old = __old; \
> success; \
> })
>
> #define EXCEPTION_VALUE(val, handler) asm volatile ("ud2 # %0" : : "r" (val))
>
> #define UINT_MAX (~0U)
>
> #define likely(x) __builtin_expect(!!(x), 1)
> #define unlikely(x) __builtin_expect(!!(x), 0)
>
> static inline void refcount_inc(unsigned int *r)
> {
> unsigned int new, val = *(unsigned int volatile *)r;
>
> do {
> if (unlikely(val == UINT_MAX)) /* saturated */
> return;
>
> if (unlikely(!val)) /* use-after-free */
> goto exception;
>
> /* cannot overflow because we already checked UINT_MAX */
> new = val + 1;
>
> } while (unlikely(!try_cmpxchg(r, &val, new)));
>
> if (unlikely(new == UINT_MAX))
> exception: EXCEPTION_VALUE(val, __refcount_warn);
> }
>
> void T_refcount_inc(unsigned int *r)
> {
> refcount_inc(r);
> }
>
> /*
> 0: 8b 07 mov (%rdi),%eax
> 2: 83 f8 ff cmp $0xffffffff,%eax
> 5: 74 13 je 1a <T_refcount_inc+0x1a>
> 7: 85 c0 test %eax,%eax
> 9: 74 0d je 18 <T_refcount_inc+0x18>
> b: 8d 50 01 lea 0x1(%rax),%edx
> e: f0 0f b1 17 lock cmpxchg %edx,(%rdi)
> 12: 75 ee jne 2 <T_refcount_inc+0x2>
> 14: ff c2 inc %edx
> 16: 75 02 jne 1a <T_refcount_inc+0x1a>
> 18: 0f 0b ud2
> 1a: c3 retq
> */



This seems to help ;)

#define try_cmpxchg(ptr, pold, new) __atomic_compare_exchange_n(ptr,
pold, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)