RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy

From: Ma, Ling
Date: Thu Oct 14 2010 - 15:44:05 EST


The attachment includes memcpy-kernel.c (compile with: cc -O2 memcpy-kernel.c -o memcpy-kernel)
and the unaligned test cases on Atom.

Thanks
Ling

-----Original Message-----
From: Ma, Ling
Sent: Thursday, October 14, 2010 9:14 AM
To: 'H. Peter Anvin'; miaox@xxxxxxxxxxxxxx
Cc: Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
Subject: RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy

Sure, I will post the benchmark tool and the benchmark results on 64-bit Atom soon.

Thanks
Ling

-----Original Message-----
From: H. Peter Anvin [mailto:hpa@xxxxxxxxx]
Sent: Thursday, October 14, 2010 5:32 AM
To: miaox@xxxxxxxxxxxxxx
Cc: Ma, Ling; Ingo Molnar; Andi Kleen; Thomas Gleixner; Zhao, Yakui; Linux Kernel
Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy

On 10/08/2010 02:02 AM, Miao Xie wrote:
> On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
>> Could you please give us the full addresses for each comparison result? We will do some tests on my machine.
>> For unaligned cases, older CPUs cross cache lines and are slowed down by the loads and stores, but for NHM there is no need to care about it.
>> By the way, in kernel 64-bit mode our accesses should mostly be 8-byte aligned.
>
> Do you need my benchmark tool? I think it would be helpful for your test.
>

If you could post the benchmark tool that would be great.

-hpa

Attachment: memcpy-atom-unaligned-cases
Description: memcpy-atom-unaligned-cases

#include <stdio.h>
#include <stdlib.h>


typedef unsigned long long int hp_timing_t;
#define MAXSAMPLESTPT 1000
#define MAXCOPYSIZE (1024 * 1024 * 100)
#define ORIG 0
#define NEW 1
static char* buf1 = NULL;
static char* buf2 = NULL;
static int repeat_one_test = 32;

hp_timing_t _dl_hp_timing_overhead;
# define HP_TIMING_NOW(Var) \
  ({ unsigned long long _hi, _lo; \
     asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
     (Var) = _hi << 32 | _lo; })

#define HP_TIMING_DIFF(Diff, Start, End) (Diff) = ((End) - (Start))

#define HP_TIMING_TOTAL(total_time, start, end) \
  do \
    { \
      hp_timing_t tmptime; \
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
      total_time += tmptime; \
    } \
  while (0)

#define HP_TIMING_BEST(best_time, start, end) \
  do \
    { \
      hp_timing_t tmptime; \
      HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
      if (best_time > tmptime) \
        best_time = tmptime; \
    } \
  while (0)


void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);
void memcpy_c(char *dst, char *src, int len);
void (*do_memcpy)(char *dst, char *src, int len);
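
/*
 * Note: memcpy_c is declared above but was not defined in the original
 * attachment.  The definition below is only an illustrative sketch (a
 * plain byte-wise C copy, not the kernel's memcpy_c) so the prototype has
 * a matching body if it is ever wired into do_memcpy.
 */
void memcpy_c(char *dst, char *src, int len)
{
  while (len-- > 0)
    *dst++ = *src++;
}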

static void
do_one_test (char *dst, char *src, size_t len)
{
  hp_timing_t start __attribute ((unused));
  hp_timing_t stop __attribute ((unused));
  hp_timing_t best_time = ~ (hp_timing_t) 0;
  size_t i;

  for (i = 0; i < repeat_one_test; ++i)
    {
      HP_TIMING_NOW (start);
      do_memcpy (dst, src, len);
      HP_TIMING_NOW (stop);
      HP_TIMING_BEST (best_time, start, stop);
    }

  printf ("\t%zd", (size_t) best_time);
}

static void
do_test (size_t align1, size_t align2, size_t len)
{
  char *s1, *s2;

  s1 = (char *) (buf1 + align1);
  s2 = (char *) (buf2 + align2);

  printf ("LAT: Len %4zd, alignment %2zd/%2zd:", len, align1, align2);
  do_memcpy = memcpy_orig;
  do_one_test (s2, s1, len);
  do_memcpy = memcpy_new;
  do_one_test (s2, s1, len);

  putchar ('\n');
}

static void
test_init (void)
{
  int i;

  buf1 = valloc (MAXCOPYSIZE);
  buf2 = valloc (MAXCOPYSIZE);

  /* Touch one byte per cache line so both buffers are faulted in. */
  for (i = 0; i < MAXCOPYSIZE; i += 64) {
    buf1[i] = buf2[i] = i & 0xff;
  }
}


void memcpy_new(char *dst, char *src, int len)
{

__asm__("movq %rdi, %rax");
__asm__("cmp $0x28, %rdx");
__asm__("jb 1f");

/*
* We check whether a memory false dependence could occur,
* then jump to the corresponding copy mode.
*/
__asm__("cmp %dil, %sil");
__asm__("jl 2f");
/*
* Copy the first quadword and round the destination up to an 8-byte
* boundary so that the stores in the loop below do not cross cache lines.
*/
__asm__("movq (%rsi), %rcx");
__asm__("movq %rdi, %r8");
__asm__("addq $8, %rdi");
__asm__("andq $-8, %rdi");
__asm__("movq %rcx, (%r8)");
__asm__("subq %rdi, %r8");
__asm__("addq %r8, %rdx");
__asm__("subq %r8, %rsi");

__asm__("subq $0x20, %rdx");
__asm__("3:");
__asm__("subq $0x20, %rdx");

/*
* Move in blocks of 4x8 bytes:
*/
__asm__("movq 0*8(%rsi), %r8");
__asm__("movq 1*8(%rsi), %r9");
__asm__("movq 2*8(%rsi), %r10");
__asm__("movq 3*8(%rsi), %r11");
__asm__("leaq 4*8(%rsi), %rsi");

__asm__("movq %r8, 0*8(%rdi)");
__asm__("movq %r9, 1*8(%rdi)");
__asm__("movq %r10, 2*8(%rdi)");
__asm__("movq %r11, 3*8(%rdi)");
__asm__("leaq 4*8(%rdi), %rdi");
__asm__("jae 3b");
__asm__("addq $0x20, %rdx");
__asm__("jmp 10f");

__asm__("2:");
/*
* Calculate copy position to tail.
*/
__asm__("addq %rdx, %rsi");
__asm__("addq %rdx, %rdi");
/*
* Copy the last quadword and round the destination down to an 8-byte
* boundary so that the stores in the backward loop do not cross cache lines.
*/

__asm__("movq -8(%rsi), %rcx");
__asm__("movq %rdi, %r8");
__asm__("andq $-8, %rdi");
__asm__("movq %rcx, -8(%r8)");
__asm__("subq %rdi, %r8");
__asm__("subq %r8, %rdx");
__asm__("subq %r8, %rsi");

__asm__("subq $0x20, %rdx");
__asm__(".p2align 4");
__asm__("4:");
__asm__("subq $0x20, %rdx");
__asm__("movq -1*8(%rsi), %r8");
__asm__("movq -2*8(%rsi), %r9");
__asm__("movq -3*8(%rsi), %r10");
__asm__("movq -4*8(%rsi), %r11");
__asm__("leaq -4*8(%rsi), %rsi");
__asm__("movq %r8, -1*8(%rdi)");
__asm__("movq %r9, -2*8(%rdi)");
__asm__("movq %r10, -3*8(%rdi)");
__asm__("movq %r11, -4*8(%rdi)");
__asm__("leaq -4*8(%rdi), %rdi");
__asm__("jae 4b");

/*
* Calculate copy position to head.
*/
__asm__("addq $0x20, %rdx");
__asm__("subq %rdx, %rsi");
__asm__("subq %rdx, %rdi");
__asm__("jmp 10f");
__asm__("1:");
__asm__("cmpq $32, %rdx");
__asm__("jb 10f");
/*
* Move data from 32 bytes to 39 bytes.
*/
__asm__("movq 0*8(%rsi), %rcx");
__asm__("movq 1*8(%rsi), %r8");
__asm__("movq -3*8(%rsi, %rdx), %r9");
__asm__("movq -2*8(%rsi, %rdx), %r10");
__asm__("movq -1*8(%rsi, %rdx), %r11");
__asm__("movq %rcx, 0*8(%rdi)");
__asm__("movq %r8, 1*8(%rdi)");
__asm__("movq %r9, -3*8(%rdi, %rdx)");
__asm__("movq %r10, -2*8(%rdi, %rdx)");
__asm__("movq %r11, -1*8(%rdi, %rdx)");
__asm__("retq");

/*
* Move data from 16 bytes to 31 bytes.
*/
__asm__("10:");
__asm__("cmpq $16, %rdx");
__asm__("jb 5f");
__asm__("movq 0*8(%rsi), %r8");
__asm__("movq 1*8(%rsi), %r9");
__asm__("movq -2*8(%rsi, %rdx), %r10");
__asm__("movq -1*8(%rsi, %rdx), %r11");
__asm__("movq %r8, 0*8(%rdi)");
__asm__("movq %r9, 1*8(%rdi)");
__asm__("movq %r10, -2*8(%rdi, %rdx)");
__asm__("movq %r11, -1*8(%rdi, %rdx)");
__asm__("retq");
__asm__(".p2align 4");
__asm__("5:");
__asm__("cmpq $8, %rdx");
__asm__("jb 6f");
/*
* Move data from 8 bytes to 15 bytes.
*/
__asm__("movq 0*8(%rsi), %r8");
__asm__("movq -1*8(%rsi, %rdx), %r9");
__asm__("movq %r8, 0*8(%rdi)");
__asm__("movq %r9, -1*8(%rdi, %rdx)");
__asm__("retq");
__asm__(".p2align 4");
__asm__("6:");
__asm__("cmpq $4, %rdx");
__asm__("jb 7f");

/*
* Move data from 4 bytes to 7 bytes.
*/
__asm__("movl (%rsi), %ecx");
__asm__("movl -4(%rsi, %rdx), %r8d");
__asm__("movl %ecx, (%rdi)");
__asm__("movl %r8d, -4(%rdi, %rdx)");
__asm__("retq");
__asm__(".p2align 4");
__asm__("7:");
__asm__("cmpl $0, %edx");
__asm__("je 8f");
/*
* Move data from 1 byte to 3 bytes.
*/
__asm__("9:");
__asm__("movb (%rsi), %r8b");
__asm__("movb %r8b, (%rdi)");
__asm__("incq %rdi");
__asm__("incq %rsi");
__asm__("decl %edx");
__asm__("jnz 9b");

__asm__("8:");
}
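
/*
 * Illustration added for readability, not part of the original attachment:
 * a rough C rendering of the dispatch logic in memcpy_new above (small
 * copy / forward copy / backward copy).  The asm compares only the low
 * address bytes (%sil vs %dil) and moves 32-byte blocks with alignment
 * fixups; this sketch shows only the direction decision, byte-wise, on
 * the assumption that a plain address comparison captures the intent.
 * memcpy_dispatch_sketch is a hypothetical helper name.
 */
static void memcpy_dispatch_sketch(char *dst, char *src, int len)
{
  int i;

  if (len < 0x28) {
    for (i = 0; i < len; i++)           /* small sizes: tail cases (1f/10f) */
      dst[i] = src[i];
  } else if ((unsigned long) src >= (unsigned long) dst) {
    for (i = 0; i < len; i++)           /* forward copy, cf. label 3 */
      dst[i] = src[i];
  } else {
    for (i = len - 1; i >= 0; i--)      /* backward copy, cf. labels 2/4 */
      dst[i] = src[i];
  }
}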

void memcpy_orig(char *dst, char *src, int len)
{

__asm("movq %rdi, %rax");

/*
* Use 32bit CMP here to avoid long NOP padding.
*/
__asm("cmp $0x20, %edx");
__asm("jbe 1f");

/*
* The code for unaligned copy only helps for large copies (> 100 bytes),
* so if the size is small we need not check whether dst and src are
* aligned.
*/
__asm("cmp $100, %edx");
__asm("jb 2f");

/*
* Unaligned access always leads to bad performance, so to avoid it we
* align the addresses (both src and dst) first, and then copy from the
* aligned src to the aligned dst by using shifts.
* But we found that if src is aligned, even though dst is unaligned,
* a generic memory copy (that is, reading aligned data from the source
* and writing unaligned data to the dest) performs better than the
* variant that uses shifts to avoid unaligned access.
* So if src is aligned we need not check whether dst is aligned, just
* go to 2:
*/
__asm("test $7, %esi"); /* src align check */
__asm("jz 2f");

/* if dest and src both are unaligned, goto unaligned copy */
__asm("test $7, %edi");
__asm("jnz 3f");

__asm("jmp 4f");

__asm("2:");
/*
* We check whether a memory false dependence could occur,
* then jump to the corresponding copy mode.
*/
__asm("cmp %dil, %sil");
__asm("jl 5f");
__asm("subl $0x20, %edx");
__asm("6:");
__asm("subq $0x20, %rdx");

/*
* Move in blocks of 4x8 bytes:
*/
__asm("movq 0*8(%rsi), %r8");
__asm("movq 1*8(%rsi), %r9");
__asm("movq 2*8(%rsi), %r10");
__asm("movq 3*8(%rsi), %r11");
__asm("leaq 4*8(%rsi), %rsi");

__asm("movq %r8, 0*8(%rdi)");
__asm("movq %r9, 1*8(%rdi)");
__asm("movq %r10, 2*8(%rdi)");
__asm("movq %r11, 3*8(%rdi)");
__asm("leaq 4*8(%rdi), %rdi");
__asm("jae 6b");
__asm("addq $0x20, %rdx");
__asm("jmp 1f");

__asm("5:");
/*
* Calculate copy position to tail.
*/
__asm("addq %rdx, %rsi");
__asm("addq %rdx, %rdi");
__asm("subq $0x20, %rdx");
/*
* At most 3 ALU operations issue in one cycle, so keep the padding
* NOPs within the same 16-byte chunk.
*/
__asm(".p2align 4");
__asm("6:");
__asm("subq $0x20, %rdx");
__asm("movq -1*8(%rsi), %r8");
__asm("movq -2*8(%rsi), %r9");
__asm("movq -3*8(%rsi), %r10");
__asm("movq -4*8(%rsi), %r11");
__asm("leaq -4*8(%rsi), %rsi");
__asm("movq %r8, -1*8(%rdi)");
__asm("movq %r9, -2*8(%rdi)");
__asm("movq %r10, -3*8(%rdi)");
__asm("movq %r11, -4*8(%rdi)");
__asm("leaq -4*8(%rdi), %rdi");
__asm("jae 6b");

/*
* Calculate copy position to head.
*/
__asm("addq $0x20, %rdx");
__asm("subq %rdx, %rsi");
__asm("subq %rdx, %rdi");
__asm__("1:");
__asm("cmpq $16, %rdx");
__asm("jb 7f");

/*
* Move data from 16 bytes to 31 bytes.
*/
__asm("movq 0*8(%rsi), %r8");
__asm("movq 1*8(%rsi), %r9");
__asm("movq -2*8(%rsi, %rdx), %r10");
__asm("movq -1*8(%rsi, %rdx), %r11");
__asm("movq %r8, 0*8(%rdi)");
__asm("movq %r9, 1*8(%rdi)");
__asm("movq %r10, -2*8(%rdi, %rdx)");
__asm("movq %r11, -1*8(%rdi, %rdx)");
__asm("retq");
__asm(".p2align 4");
__asm__("7:");
__asm("cmpq $8, %rdx");
__asm("jb 8f");
/*
* Move data from 8 bytes to 15 bytes.
*/
__asm("movq 0*8(%rsi), %r8");
__asm("movq -1*8(%rsi, %rdx), %r9");
__asm("movq %r8, 0*8(%rdi)");
__asm("movq %r9, -1*8(%rdi, %rdx)");
__asm("retq");
__asm(".p2align 4");
__asm__("8:");
__asm("cmpq $4, %rdx");
__asm("jb 9f");

/*
* Move data from 4 bytes to 7 bytes.
*/
__asm("movl (%rsi), %ecx");
__asm("movl -4(%rsi, %rdx), %r8d");
__asm("movl %ecx, (%rdi)");
__asm("movl %r8d, -4(%rdi, %rdx)");
__asm("retq");
__asm(".p2align 4");
__asm__("9:");
__asm("cmpl $0, %edx");
__asm("je 10f");
/*
* Move data from 1 byte to 3 bytes.
*/
__asm__("11:");
__asm("movb (%rsi), %r8b");
__asm("movb %r8b, (%rdi)");
__asm("incq %rdi");
__asm("incq %rsi");
__asm("decl %edx");
__asm("jnz 11b");

__asm__("10:");
__asm("retq");

__asm(".p2align 4");
__asm__("3:");
__asm("movq %rdi, %rcx");
__asm("andq $7, %rcx"); /* Align the destination */
__asm("negq %rcx");
__asm("andq $7, %rcx");
__asm("subq %rcx, %rdx");

/* tune dst address */
__asm("movq (%rsi), %r8");
__asm("movq %r8, (%rdi)");
__asm("addq %rcx, %rdi");
__asm("addq %rcx, %rsi");

__asm("test $7, %esi"); /* src align check */
__asm("jz 2b");

__asm(".p2align 4");
__asm__("4:");
__asm("push %rbx");
__asm("push %r12");
__asm("push %r13");
__asm("push %r14");
__asm("push %r15");
/*
* Calculate how far to shift each word read from the 8-byte-aligned
* source so that it can be merged into correctly aligned data for the copy.
*/
__asm("movq %rsi, %r14");
__asm("andq $7, %r14");
__asm("shlq $3, %r14");

__asm("movq $64, %r15");
__asm("subq %r14, %r15");

__asm("andq $-8, %rsi"); /* src aligned */
__asm("movq 0*8(%rsi), %r8");

__asm("movq %rdx, %rbx");
__asm("shrq $5, %rbx");
__asm("jz 12f");

/*
* %r8 : store src[0]
* %r9 : store src[1]
* %r10: store src[2]
* %r11: store src[3]
* %r12: store src[4]
* %r13: store the tmp data
*/
__asm(".p2align 4");
__asm("13:");
__asm("movq 1*8(%rsi), %r9");
__asm("movq 2*8(%rsi), %r10");
__asm("movq 3*8(%rsi), %r11");
__asm("movq 4*8(%rsi), %r12");

__asm("movq %r9, %r13");
__asm("movb %r14b, %cl");
__asm("shrq %cl, %r8");
__asm("shrq %cl, %r13");
__asm("movb %r15b, %cl");
__asm("shlq %cl, %r9");
__asm("orq %r8, %r9");
__asm("movq %r10, %r8");
__asm("shlq %cl, %r10");
__asm("orq %r13, %r10");

__asm("movq %r11, %r13");
__asm("movb %r14b, %cl");
__asm("shrq %cl, %r8");
__asm("shrq %cl, %r13");
__asm("movb %r15b, %cl");
__asm("shlq %cl, %r11");
__asm("orq %r8, %r11");
__asm("movq %r12, %r8");
__asm("shlq %cl, %r12");
__asm("orq %r13, %r12");

__asm("movq %r9, 0*8(%rdi)");
__asm("movq %r10, 1*8(%rdi)");
__asm("movq %r11, 2*8(%rdi)");
__asm("movq %r12, 3*8(%rdi)");

__asm("leaq 4*8(%rdi), %rdi");
__asm("leaq 4*8(%rsi), %rsi");
__asm("decq %rbx");
__asm("jnz 13b");

__asm(".p2align 4");
__asm("12:");
__asm("shrq $3, %r14");
__asm("addq %r14, %rsi");
__asm("pop %r15");
__asm("pop %r14");
__asm("pop %r13");
__asm("pop %r12");
__asm("pop %rbx");
__asm("andq $31, %rdx");
__asm("jnz 1b");
__asm("retq");



}
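
/*
 * Illustration added for readability, not part of the original attachment:
 * a C sketch of the shift-and-merge technique used at label 4 in
 * memcpy_orig above.  The source is rounded down to an 8-byte boundary
 * and each 8-byte store is built from two aligned loads.  Assumptions:
 * little-endian 64-bit machine, dst already 8-byte aligned, len a
 * multiple of 8, and src not already aligned (a shift by 64 would be
 * undefined).  Like the asm, it may read up to 7 bytes beyond src + len.
 * memcpy_shift_sketch is a hypothetical helper name.
 */
static void memcpy_shift_sketch(char *dst, char *src, int len)
{
  unsigned long ofs = (unsigned long) src & 7;           /* src misalignment */
  unsigned long long *s = (unsigned long long *) (src - ofs);
  unsigned long long *d = (unsigned long long *) dst;
  unsigned rshift = ofs * 8, lshift = 64 - rshift;       /* cf. %r14 / %r15 */
  unsigned long long lo = *s++;
  int i;

  for (i = 0; i < len / 8; i++) {
    unsigned long long hi = *s++;
    *d++ = (lo >> rshift) | (hi << lshift);              /* merge two aligned loads */
    lo = hi;
  }
}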


int
main (void)
{
  int i;

  test_init ();
  printf ("%23s", "");
  printf ("\t%s\t%s\n", "memcpy_orig", "memcpy_new");

  for (i = 0; i <= 12; ++i)
    {
      do_test (i, 0, 1 << i);
      do_test (0, i, 1 << i);
    }

  for (i = 0; i < 32; ++i)
    {
      do_test (i, 0, i);
      do_test (0, i, i);
    }

  for (i = 0; i < 48; ++i)
    {
      do_test (0, 8, i);
      do_test (1, 8, i);
      do_test (4, 8, i);
    }

  for (i = 3; i < 32; ++i)
    {
      /* Skip powers of two; they are covered above. */
      if ((i & (i - 1)) == 0)
        continue;
      do_test (i, 0, 16 * i);
      do_test (0, i, 16 * i);
    }

  return 0;
}