Re: queued spinlock code and results

From: Davide Libenzi
Date: Mon Jul 09 2007 - 15:01:25 EST


On Sun, 8 Jul 2007, Nick Piggin wrote:

> I ran some tests of the queued spinlock code, using userspace test code on
> 64-bit processors. I believe the xadd-based code no longer has any
> theoretical memory-ordering problems.
>
> The tests were done on 3 different architectures of different speeds and
> vintages (so the columns are not really comparable to one another). I have
> attached the two programs I used to produce the results. Times are total
> elapsed time divided by the number of lock acquisitions (so effectively you
> get the time required for a lock+unlock pair).
>
> The threaded results also include an unfairness count: the maximum number
> of times in a row a lock is acquired while all other threads are also
> executing in the loop. The reason xadd, for example, is not always 0 there
> is that the other threads may not have reached the lock before the current
> thread was able to take it several more times (e.g. this could happen if
> an interrupt comes in).

I was trying to see how costly a double-short counter is, compared to a
single byte.
The V-lock uses two short counters w/out the double-xadd optimization (a
single 32-bit xadd covering both counters), while the Z-lock uses it.
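To make the layouts explicit, here is a sketch that mirrors the code
attached below (little-endian; the type and field names are just for
illustration):

/* xadd-lock: one 16-bit word; a single "lock xaddw" takes a ticket
 * and samples the now-serving byte at the same time. */
union xadd_lock {
	short whole;
	struct {
		unsigned char serving;	/* low byte: now-serving */
		unsigned char ticket;	/* high byte: next-ticket */
	} b;
};

/* V-lock: two independent shorts; "lock xaddw" hits only the ticket
 * counter, and now-serving has to be read separately. */
struct v_lock {
	short serving;	/* lock[0] */
	short ticket;	/* lock[1] */
};

/* Z-lock: the same two shorts as the V-lock, but viewed as one 32-bit
 * word, so a single "lock xaddl" takes a ticket (high word) and samples
 * now-serving (low word) in one shot. */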
This is a dual Opteron 252, but the test is single-threaded only
(multi-threading really has no meaning for this test):

inc-lock in cache takes 7.28ns
xadd-lock in cache takes 8.93ns
vadd-lock in cache takes 10.35ns
zadd-lock in cache takes 8.43ns
inc-lock out of cache takes 87.85ns
xadd-lock out of cache takes 88.15ns
vadd-lock out of cache takes 89.38ns
zadd-lock out of cache takes 88.10ns

On this box, the always-lfence V-lock code pays for it. If I remove the
lfence from the V-lock (not that we actually could, correctness-wise), I get:

inc-lock in cache takes 7.28ns
xadd-lock in cache takes 8.93ns
vadd-lock in cache takes 7.67ns
zadd-lock in cache takes 8.44ns
inc-lock out of cache takes 87.87ns
xadd-lock out of cache takes 88.15ns
vadd-lock out of cache takes 88.24ns
zadd-lock out of cache takes 88.12ns
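For reference, this is the variant I timed above: the vlock below with the
trailing lfence dropped (the function name is just for illustration; as
said, this is not something you could safely use):

static inline void vlock_nofence(short *lock)
{
	short i = 1;

	__asm__ __volatile__ ("lock ; xaddw %%ax, %0\n\t"
			      "1:\n\t"
			      "cmpw %%ax, %2\n\t"
			      "je 2f\n\t"
			      "rep ; nop\n\t"
			      "jmp 1b\n\t"
			      "2:\n\t"
			      : "+m" (lock[1]), "+a" (i)
			      : "m" (lock[0]) : "memory");
}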

So on this box, and in this test, the double-short Z-lock seems faster than
the double-byte xadd-lock. I've no idea why, since it uses two more
instructions and an extra register.




- Davide



#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>

/* Each "page" carries a lock and the index of the next page in a
 * pseudo-randomly linked chain, so the benchmark walks pointer-chased
 * memory, taking and releasing the lock at each hop. */
struct page {
	short lock[2];
	unsigned int next;
};
#define NR_PAGES (2*1024*1024)
static struct page pages[NR_PAGES];

/* inc-lock: the classic decb-based spinlock. */
#define LOCK_INIT(l) (l)[0] = 1
static inline void lock(short *lock)
{
	__asm__ __volatile__ ("1:\n\t"
			      "lock ; decb %0\n\t"
			      "jns 2f\n\t"
			      "3:\n\t"
			      "rep ; nop\n\t"
			      "cmpb $0,%0\n\t"
			      "jle 3b\n\t"
			      "jmp 1b\n\t"
			      "2:\n\t"
			      : "+m" (*lock)
			      : : "memory");
}

static inline void unlock(short *lock)
{
	/* A plain store is enough to release on x86. */
	__asm__ __volatile__ ("movb $1,%0\n\t"
			      : "+m" (*lock)
			      : : "memory");
}

/* xadd-lock: byte-sized ticket lock in a single short; the low byte is
 * the now-serving counter (%al), the high byte the next-ticket counter
 * (%ah).  One "lock xaddw" takes a ticket and samples now-serving. */
#define XLOCK_INIT(l) (l)[0] = 0
static inline void xlock(short *lock)
{
	short i = 0x0100;

	__asm__ __volatile__ ("lock ; xaddw %%ax, %1\n\t"
			      "1:\n\t"
			      "cmpb %%ah, %%al\n\t"
			      "je 2f\n\t"
			      "rep ; nop\n\t"
			      "movb %1, %%al\n\t"
			      "lfence\n\t"
			      "jmp 1b\n\t"
			      "2:\n\t"
			      : "+a" (i), "+m" (*lock)
			      : : "memory");
}

static inline void xunlock(short *lock)
{
	/* Bump now-serving; only the owner writes it, so no lock prefix. */
	__asm__ __volatile__ ("incb %0\n\t"
			      : "+m" (*lock)
			      : : "memory");
}

/* V-lock: two separate short counters.  lock[1] is the next-ticket
 * counter (the "lock xaddw" target), lock[0] the now-serving one, read
 * separately.  Note the unconditional lfence on the acquire path. */
#define VLOCK_INIT(l) (l)[0] = 0, (l)[1] = 0
static inline void vlock(short *lock)
{
	short i = 1;

	__asm__ __volatile__ ("lock ; xaddw %%ax, %0\n\t"
			      "1:\n\t"
			      "cmpw %%ax, %2\n\t"
			      "je 2f\n\t"
			      "rep ; nop\n\t"
			      "jmp 1b\n\t"
			      "2:\n\t"
			      "lfence\n\t"
			      : "+m" (lock[1]), "+a" (i)
			      : "m" (lock[0]) : "memory");
}

static inline void vunlock(short *lock)
{
	/* Bump now-serving. */
	__asm__ __volatile__ ("incw %0\n\t"
			      : "+m" (lock[0])
			      : : "memory");
}

/* Z-lock: the same two short counters as the V-lock, but accessed as one
 * 32-bit word, so a single "lock xaddl" takes a ticket (high word) and
 * samples the now-serving counter (low word) in one shot. */
#define ZLOCK_INIT(l) (l)[0] = 0, (l)[1] = 0
static inline void zlock(short *lock)
{
	int i = 0x10000;

	__asm__ __volatile__ ("lock ; xaddl %%eax, %0\n\t"
			      "mov %%eax, %%ebx\n\t"
			      "shr $16, %%ebx\n\t"	/* %bx = our ticket */
			      "1:\n\t"
			      "cmpw %%ax, %%bx\n\t"
			      "je 2f\n\t"
			      "rep ; nop\n\t"
			      "movw %2, %%ax\n\t"	/* reload now-serving */
			      "lfence\n\t"
			      "jmp 1b\n\t"
			      "2:\n\t"
			      : "+m" (*(int *) lock), "+a" (i)
			      : "m" (lock[0]) : "ebx", "memory");
}

static inline void zunlock(short *lock)
{
	/* Bump now-serving. */
	__asm__ __volatile__ ("incw %0\n\t"
			      : "+m" (lock[0])
			      : : "memory");
}

/* Unused in this test: the xadd-lock is held whenever the now-serving
 * and next-ticket bytes differ. */
static int xlock_is_locked(short *lock)
{
	short tmp = *lock;
	char *x = (char *)&tmp;

	return (*x != *(x+1));
}

#define ITERS (16*1024*1024)
#define IN_ITERS (ITERS*5)

int main(void)
{
	int nr_pages;
	int nr;
	int i;
	struct page *p;
	struct timeval start, end;
	unsigned long long usec;
	unsigned int tmp;

	/* Build a random cyclic chain of 10 pages: small enough that the
	 * whole working set stays in cache. */
	nr_pages = 10;
	srandom(10);
	p = &pages[0];
	i = 0;
	nr = 0;
	while (nr < nr_pages-1) {
		unsigned int n;

		n = random() % NR_PAGES;
		while (p == &pages[n] || pages[n].next)
			n = (n+1) % NR_PAGES;
		p->next = n;
		p = &pages[n];
		nr++;
	}
	p->next = 0;

	for (i = 0; i < NR_PAGES; i++)
		LOCK_INIT(pages[i].lock);
	gettimeofday(&start, NULL);
	p = &pages[0];
	for (i = 0; i < IN_ITERS; i++) {
		lock(&p->lock[0]);
		tmp = p->next;
		unlock(&p->lock[0]);
		p = &pages[tmp];
	}
	gettimeofday(&end, NULL);
	usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
	printf("inc-lock in cache takes %0.2lfns\n", (double)usec * 1000 / IN_ITERS);

	for (i = 0; i < NR_PAGES; i++)
		XLOCK_INIT(pages[i].lock);
	gettimeofday(&start, NULL);
	p = &pages[0];
	for (i = 0; i < IN_ITERS; i++) {
		xlock(&p->lock[0]);
		tmp = p->next;
		xunlock(&p->lock[0]);
		p = &pages[tmp];
	}
	gettimeofday(&end, NULL);
	usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
	printf("xadd-lock in cache takes %0.2lfns\n", (double)usec * 1000 / IN_ITERS);

	for (i = 0; i < NR_PAGES; i++)
		VLOCK_INIT(pages[i].lock);
	gettimeofday(&start, NULL);
	p = &pages[0];
	for (i = 0; i < IN_ITERS; i++) {
		vlock(&p->lock[0]);
		tmp = p->next;
		vunlock(&p->lock[0]);
		p = &pages[tmp];
	}
	gettimeofday(&end, NULL);
	usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
	printf("vadd-lock in cache takes %0.2lfns\n", (double)usec * 1000 / IN_ITERS);

	for (i = 0; i < NR_PAGES; i++)
		ZLOCK_INIT(pages[i].lock);
	gettimeofday(&start, NULL);
	p = &pages[0];
	for (i = 0; i < IN_ITERS; i++) {
		zlock(&p->lock[0]);
		tmp = p->next;
		zunlock(&p->lock[0]);
		p = &pages[tmp];
	}
	gettimeofday(&end, NULL);
	usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
	printf("zadd-lock in cache takes %0.2lfns\n", (double)usec * 1000 / IN_ITERS);

	for (i = 0; i < NR_PAGES; i++)
		pages[i].next = 0;

	/* Now rebuild the chain across all the pages, so every hop is
	 * (almost certainly) a cache miss. */
	nr_pages = NR_PAGES;
	srandom(10);
	p = &pages[0];
	i = 0;
	nr = 0;
	while (nr < nr_pages-1) {
		unsigned int n;

		n = random() % NR_PAGES;
		while (p == &pages[n] || pages[n].next)
			n = (n+1) % NR_PAGES;
		p->next = n;
		p = &pages[n];
		nr++;
	}
	p->next = 0;

	for (i = 0; i < NR_PAGES; i++)
		LOCK_INIT(pages[i].lock);
	gettimeofday(&start, NULL);
	p = &pages[0];
	for (i = 0; i < ITERS; i++) {
		lock(&p->lock[0]);
		tmp = p->next;
		unlock(&p->lock[0]);
		p = &pages[tmp];
	}
	gettimeofday(&end, NULL);
	usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
	printf("inc-lock out of cache takes %0.2lfns\n", (double)usec * 1000 / ITERS);

	for (i = 0; i < NR_PAGES; i++)
		XLOCK_INIT(pages[i].lock);
	gettimeofday(&start, NULL);
	p = &pages[0];
	for (i = 0; i < ITERS; i++) {
		xlock(&p->lock[0]);
		tmp = p->next;
		xunlock(&p->lock[0]);
		p = &pages[tmp];
	}
	gettimeofday(&end, NULL);
	usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
	printf("xadd-lock out of cache takes %0.2lfns\n", (double)usec * 1000 / ITERS);

	for (i = 0; i < NR_PAGES; i++)
		VLOCK_INIT(pages[i].lock);
	gettimeofday(&start, NULL);
	p = &pages[0];
	for (i = 0; i < ITERS; i++) {
		vlock(&p->lock[0]);
		tmp = p->next;
		vunlock(&p->lock[0]);
		p = &pages[tmp];
	}
	gettimeofday(&end, NULL);
	usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
	printf("vadd-lock out of cache takes %0.2lfns\n", (double)usec * 1000 / ITERS);

	for (i = 0; i < NR_PAGES; i++)
		ZLOCK_INIT(pages[i].lock);
	gettimeofday(&start, NULL);
	p = &pages[0];
	for (i = 0; i < ITERS; i++) {
		zlock(&p->lock[0]);
		tmp = p->next;
		zunlock(&p->lock[0]);
		p = &pages[tmp];
	}
	gettimeofday(&end, NULL);
	usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
	printf("zadd-lock out of cache takes %0.2lfns\n", (double)usec * 1000 / ITERS);

	return 0;
}
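
In case anyone wants to reproduce this: the program is plain GCC inline asm
for x86, so something along these lines should build and run it (assuming
GCC on an x86 or x86-64 Linux box; the file and binary names are arbitrary):

	gcc -O2 -o locktest locktest.c
	./locktest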
