Re: locking/atomic: Introduce atomic_try_cmpxchg()

From: Peter Zijlstra
Date: Fri Mar 24 2017 - 13:24:06 EST


On Fri, Mar 24, 2017 at 09:54:46AM -0700, Andy Lutomirski wrote:
> > So the first snippet I tested regressed like so:
> >
> >
> > 0000000000000000 <T_refcount_inc>:                  0000000000000000 <T_refcount_inc>:
> >    0: 8b 07       mov    (%rdi),%eax                   0: 8b 17       mov    (%rdi),%edx
> >    2: 83 f8 ff    cmp    $0xffffffff,%eax              2: 83 fa ff    cmp    $0xffffffff,%edx
> >    5: 74 13       je     1a <T_refcount_inc+0x1a>      5: 74 1a       je     21 <T_refcount_inc+0x21>
> >    7: 85 c0       test   %eax,%eax                     7: 85 d2       test   %edx,%edx
> >    9: 74 0d       je     18 <T_refcount_inc+0x18>      9: 74 13       je     1e <T_refcount_inc+0x1e>
> >    b: 8d 50 01    lea    0x1(%rax),%edx                b: 8d 4a 01    lea    0x1(%rdx),%ecx
> >    e: f0 0f b1 17 lock cmpxchg %edx,(%rdi)             e: 89 d0       mov    %edx,%eax
> >   12: 75 ee       jne    2 <T_refcount_inc+0x2>       10: f0 0f b1 0f lock cmpxchg %ecx,(%rdi)
> >   14: ff c2       inc    %edx                         14: 74 04       je     1a <T_refcount_inc+0x1a>
> >   16: 75 02       jne    1a <T_refcount_inc+0x1a>     16: 89 c2       mov    %eax,%edx
> >   18: 0f 0b       ud2                                 18: eb e8       jmp    2 <T_refcount_inc+0x2>
> >   1a: c3          retq                                1a: ff c1       inc    %ecx
> >                                                       1c: 75 03       jne    21 <T_refcount_inc+0x21>
> >                                                       1e: 0f 0b       ud2
> >                                                       20: c3          retq
> >                                                       21: c3          retq
>
> Can you re-send the better asm you got earlier?

On the left?

> If I pretend to be a dumb compiler, I wonder if you'd get better results with:
>
> if (!success) {
>         *_old = __old;
>         return false;
> } else {
>         return true;
> }
>
> or however you jam that into a statement expression. That way you
> aren't relying on the compiler to merge the branches.

I tried a few variants, but nothing really made it better.
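
One of them, for illustration: the stmt-expr form with the write-back made
conditional, as Andy suggests. (A sketch; __try_cmpxchg_cond is an
illustrative name, and it differs from the tiny.c macro below only in the
final if.)

#define __try_cmpxchg_cond(_ptr, _pold, _new)				\
({									\
	_Bool success;							\
	__typeof__(_ptr) _old = (_pold);				\
	__typeof__(*(_ptr)) __old = *_old;				\
	__typeof__(*(_ptr)) __new = (_new);				\
	volatile unsigned int * __ptr = (volatile unsigned int *)(_ptr);\
									\
	asm volatile("lock cmpxchgl %[new], %[ptr]"			\
		     : "=@ccz" (success),				\
		       [ptr] "+m" (*__ptr),				\
		       [old] "+a" (__old)				\
		     : [new] "r" (__new)				\
		     : "memory");					\
	if (!success)		/* write back only on failure */	\
		*_old = __old;						\
	success;							\
})

Per the above, this doesn't come out any better with this gcc.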

Find the tiny.c file below; I'm using:

gcc (Debian 6.3.0-5) 6.3.0 20170124

It has both an inline and a stmt-expr try_cmpxchg variant to play with;
the 'expected' output is at the bottom (same as the left column above).

Note that clang doesn't compile this stuff due to missing features; it
doesn't support the "=@ccz" flag-output constraint used below.
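
(For reference: the pre-"=@ccz" way to get ZF out, which clang does
accept, is an explicit sete into a byte register. A sketch, not part of
tiny.c:

static inline _Bool try_cmpxchg_sete(unsigned int *ptr, unsigned int *val, unsigned int new)
{
	unsigned int old = *val;
	_Bool success;

	asm volatile("lock cmpxchgl %[new], %[ptr]\n\t"
		     "sete %[success]"		/* ZF -> byte register */
		     : [success] "=q" (success),
		       [ptr] "+m" (*ptr),
		       [old] "+a" (old)
		     : [new] "r" (new)
		     : "memory");

	*val = old;
	return success;
}

The downside is the sete plus the test the compiler then emits to branch
on the result, which is exactly what the flag-output constraints avoid.)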

---

/* gcc -Os -std=gnu99 -fno-strict-overflow -falign-jumps=1 -falign-loops=1 -c tiny.c; objdump -dr tiny.o */

typedef _Bool bool;

static inline bool try_cmpxchg(unsigned int *ptr, unsigned int *val, unsigned int new)
{
	unsigned int old = *val;
	bool success;

	asm volatile("lock cmpxchgl %[new], %[ptr]"
		     : "=@ccz" (success),	/* success = ZF (gcc-6 flag output) */
		       [ptr] "+m" (*ptr),
		       [old] "+a" (old)		/* %eax in: expected, out: actual */
		     : [new] "r" (new)
		     : "memory");

	// if (!success)
	*val = old;

	return success;
}

#define __try_cmpxchg(_ptr, _pold, _new)				\
({									\
	bool success;							\
	__typeof__(_ptr) _old = (_pold);				\
	__typeof__(*(_ptr)) __old = *_old;				\
	__typeof__(*(_ptr)) __new = (_new);				\
	volatile unsigned int * __ptr = (volatile unsigned int *)(_ptr);\
									\
	asm volatile("lock cmpxchgl %[new], %[ptr]"			\
		     : "=@ccz" (success),				\
		       [ptr] "+m" (*__ptr),				\
		       [old] "+a" (__old)				\
		     : [new] "r" (__new)				\
		     : "memory");					\
	/* if (!success) */						\
	*_old = __old;							\
	success;							\
})

/* 'handler' is ignored here, which is why __refcount_warn below needn't exist */
#define EXCEPTION_VALUE(val, handler) asm volatile ("ud2 # %0" : : "r" (val))

#define UINT_MAX (~0U)

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

static inline void refcount_inc(unsigned int *r)
{
	unsigned int new, val = *(unsigned int volatile *)r;

	do {
		if (unlikely(val == UINT_MAX))	/* saturated */
			return;

		if (unlikely(!val))		/* use-after-free */
			goto exception;

		/* cannot overflow because we already checked UINT_MAX */
		new = val + 1;

	} while (unlikely(!try_cmpxchg(r, &val, new)));

	if (unlikely(new == UINT_MAX))
exception:	EXCEPTION_VALUE(val, __refcount_warn);
}

void T_refcount_inc(unsigned int *r)
{
	refcount_inc(r);
}

/*
   0: 8b 07       mov    (%rdi),%eax
   2: 83 f8 ff    cmp    $0xffffffff,%eax
   5: 74 13       je     1a <T_refcount_inc+0x1a>
   7: 85 c0       test   %eax,%eax
   9: 74 0d       je     18 <T_refcount_inc+0x18>
   b: 8d 50 01    lea    0x1(%rax),%edx
   e: f0 0f b1 17 lock cmpxchg %edx,(%rdi)
  12: 75 ee       jne    2 <T_refcount_inc+0x2>
  14: ff c2       inc    %edx
  16: 75 02       jne    1a <T_refcount_inc+0x1a>
  18: 0f 0b       ud2
  1a: c3          retq
*/
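
/*
 * Not part of the original tiny.c: a quick smoke test, in case you want
 * to run it rather than just read the disassembly (x86-64 only; the
 * use-after-free path would trap on the ud2, so it isn't exercised).
 */
#include <stdio.h>

int main(void)
{
	unsigned int normal = 5, saturated = UINT_MAX;

	T_refcount_inc(&normal);	/* 5 -> 6 */
	T_refcount_inc(&saturated);	/* saturated, stays UINT_MAX */

	printf("%u %u\n", normal, saturated);	/* expect: 6 4294967295 */
	return 0;
}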