RE: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast string.

From: Ma, Ling
Date: Wed Nov 11 2009 - 23:50:07 EST


Hi All
Attached is the latest memcpy.c; please build it with
"cc -o memcpy memcpy.c -O2 -m64".

Thanks
Ling


>-----Original Message-----
>From: Cyrill Gorcunov [mailto:gorcunov@xxxxxxxxx]
>Sent: 2009年11月12日 12:28
>To: H. Peter Anvin
>Cc: Ma, Ling; Ingo Molnar; Ingo Molnar; Thomas Gleixner; linux-kernel
>Subject: Re: [PATCH RFC] [X86] performance improvement for memcpy_64.S by fast
>string.
>
>On Thu, Nov 12, 2009 at 1:39 AM, H. Peter Anvin <hpa@xxxxxxxxx> wrote:
>> On 11/11/2009 12:34 PM, Cyrill Gorcunov wrote:
>>>                                     memcpy_orig   memcpy_new
>>> TPT: Len  1024, alignment  8/ 0:        490          570
>>> TPT: Len  2048, alignment  8/ 0:        826          329
>>> TPT: Len  3072, alignment  8/ 0:        441          464
>>> TPT: Len  4096, alignment  8/ 0:        579          596
>>> TPT: Len  5120, alignment  8/ 0:        723          729
>>> TPT: Len  6144, alignment  8/ 0:        859          861
>>> TPT: Len  7168, alignment  8/ 0:        996          994
>>> TPT: Len  8192, alignment  8/ 0:       1165         1127
>>> TPT: Len  9216, alignment  8/ 0:       1273         1260
>>> TPT: Len 10240, alignment  8/ 0:       1402         1395
>>> TPT: Len 11264, alignment  8/ 0:       1543         1525
>>> TPT: Len 12288, alignment  8/ 0:       1682         1659
>>> TPT: Len 13312, alignment  8/ 0:       1869         1815
>>> TPT: Len 14336, alignment  8/ 0:       1982         1951
>>> TPT: Len 15360, alignment  8/ 0:       2185         2110
>>>
>>> I've run this test a few times and the results are almost the same;
>>> for lengths 1024, 3072, 4096, 5120 and 6144 the new version is a bit slower.
>>>
>>
>> Was the result for 2048 consistent (it seems odd in the extreme)... the
>> discrepancy between this result and Ling's results bothers me; perhaps
>> the right answer is to leave the current code for Core2 and use new code
>> (with a lower than 1024 threshold?) for NHM and K8?
>>
>>        -hpa
>>
>
>Hi Peter,
>
>no, the results for 2048 are not repeatable (that is why I didn't mention this number
>in the earlier report).
>
>Test1:
>TPT: Len 2048, alignment 8/ 0: 826 329
>Test2:
>TPT: Len 2048, alignment 8/ 0: 359 329
>Test3:
>TPT: Len 2048, alignment 8/ 0: 306 331
>Test4:
>TPT: Len 2048, alignment 8/ 0: 415 329
>
>I guess this was due to the CPU frequency changing from 800 MHz to 2.1 GHz, since
>I ran the tests manually,
>not using any kind of bash loop to run the test program.
#include<stdio.h>
#include <stdlib.h>


/* Unsigned 64-bit-capable type used to hold time-stamp counter readings. */
typedef unsigned long long int hp_timing_t;
/* Number of timed copy calls performed per measurement. */
#define MAXSAMPLESTPT 100000
/* Size in bytes of each of the two test buffers (32 KiB). */
#define MAXCOPYSIZE (1024 * 32)
/* ORIG and NEW are not referenced anywhere in the visible code. */
#define ORIG 0
#define NEW 1
/* Test buffers; allocated and seeded by test_init(). */
static char* buf1 = NULL;
static char* buf2 = NULL;

/*
 * Overhead that HP_TIMING_TOTAL adds to the start stamp before taking the
 * difference.  No visible code assigns it, so it keeps its zero initial
 * value.
 */
hp_timing_t _dl_hp_timing_overhead;
/*
 * HP_TIMING_NOW(Var): execute rdtsc and store the 64-bit counter value
 * (EDX:EAX combined as hi << 32 | lo) into Var.  Uses a GCC statement
 * expression.
 */
# define HP_TIMING_NOW(Var) \
({ unsigned long long _hi, _lo; \
asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
(Var) = _hi << 32 | _lo; })

/* HP_TIMING_DIFF(Diff, Start, End): Diff = End - Start. */
#define HP_TIMING_DIFF(Diff, Start, End) (Diff) = ((End) - (Start))
/*
 * HP_TIMING_TOTAL(total_time, start, end): add the elapsed count
 * end - (start + _dl_hp_timing_overhead) to total_time.
 */
#define HP_TIMING_TOTAL(total_time, start, end) \
do \
{ \
hp_timing_t tmptime; \
HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
total_time += tmptime; \
} \
while (0)

/* Copy routines under test; both are defined later in this file. */
void memcpy_orig(char *dst, char *src, int len);
void memcpy_new(char *dst, char *src, int len);
/* Declared here, but no definition or use appears in the visible code. */
void memcpy_c(char *dst, char *src, int len);
/* Routine currently being timed; do_tpt_test() sets it before each run. */
void (*do_memcpy)(char *dst, char *src, int len);

/*
 * Time MAXSAMPLESTPT calls of the routine selected in do_memcpy and print
 * the average time-stamp-counter delta per call as "\t<ticks>".
 *
 * dst, src: destination and source of every timed copy.  These are the
 *           pointers the caller prepared (including any alignment
 *           offset), so the requested alignment is what gets measured.
 * len:      number of bytes copied per call.
 */
static void
do_one_throughput (char *dst, char *src, size_t len)
{
	size_t i;
	hp_timing_t start;
	hp_timing_t stop;
	hp_timing_t total_time = 0;

	/* cpuid is a serializing instruction; issue it before timing starts. */
	__asm__ volatile ("cpuid" : : : "eax", "ebx", "ecx", "edx");

	for (i = 0; i < MAXSAMPLESTPT; ++i) {
		HP_TIMING_NOW (start);
		do_memcpy (dst, src, (int) len);
		HP_TIMING_NOW (stop);
		HP_TIMING_TOTAL (total_time, start, stop);
	}

	/* size_t is unsigned, so print it with %zu. */
	printf ("\t%zu", (size_t) (total_time / MAXSAMPLESTPT));
}

/*
 * Print one result row for a given length and alignment pair.
 *
 * align1: byte offset applied to buf1.
 * align2: byte offset applied to buf2.
 * len:    number of bytes per copy.
 *
 * The row label is printed first, then do_one_throughput() is run once
 * with do_memcpy set to memcpy_orig and once with it set to memcpy_new;
 * each run appends its own column.  buf2 + align2 is passed as the
 * destination argument and buf1 + align1 as the source argument.
 */
static void
do_tpt_test (size_t align1, size_t align2, size_t len)
{
	char *s1 = buf1 + align1;
	char *s2 = buf2 + align2;

	/* All three values are size_t, hence %zu rather than %zd. */
	printf ("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);

	do_memcpy = memcpy_orig;
	do_one_throughput (s2, s1, len);

	do_memcpy = memcpy_new;
	do_one_throughput (s2, s1, len);

	putchar ('\n');
}

/*
 * Allocate the two MAXCOPYSIZE-byte test buffers with valloc() and write
 * one byte every 64 bytes in each of them.
 *
 * Returns 0 on success.  If either allocation fails, an error is reported
 * with perror() and the program exits, because nothing can be measured
 * without the buffers.
 */
static int test_init(void)
{
	int i;

	buf1 = valloc(MAXCOPYSIZE);
	buf2 = valloc(MAXCOPYSIZE);
	if (buf1 == NULL || buf2 == NULL) {
		perror("valloc");
		exit(EXIT_FAILURE);
	}

	/*
	 * Touch one byte per 64-byte stride (presumably one per cache line,
	 * so that every page is faulted in before timing - TODO confirm).
	 */
	for (i = 0; i < MAXCOPYSIZE; i += 64) {
		buf1[i] = buf2[i] = i & 0xff;
	}

	return 0;
}

/*
 * Candidate copy routine: copy len bytes from src to dst (x86-64 only).
 *
 * Lengths of at least 0x400 (1024) bytes take the string-instruction
 * path: "rep movsq" for the whole quadwords, then "rep movsb" for the
 * remaining 0-7 bytes.  Shorter lengths take the unrolled path: 64-byte
 * blocks, then 8-byte quadwords, then single bytes.
 *
 * dst: destination buffer, at least len bytes.
 * src: source buffer, at least len bytes; must not overlap dst.
 * len: byte count; the assembly treats it as an unsigned 32-bit value.
 *
 * The routine is a single extended-asm statement, so the compiler knows
 * which registers and memory are modified and emits its own prologue and
 * epilogue; the code therefore does not depend on the optimization level.
 */
void memcpy_new(char *dst, char *src, int len)
{
	__asm__ volatile (
		/* ecx = number of whole 64-byte blocks. */
		"movl %%edx, %%ecx\n\t"
		"shrl $6, %%ecx\n\t"
		"jz 2f\n\t"

		/* Large copies go to the rep-string path. */
		"cmpl $0x400, %%edx\n\t"
		"jae 7f\n"

		/*
		 * 64-byte loop.  ecx is decremented first; mov and lea leave
		 * the flags untouched, so the jnz at the bottom still tests
		 * the result of that decrement.  Every load is from src.
		 */
		"1:\n\t"
		"decl %%ecx\n\t"

		"movq 0*8(%%rsi), %%r11\n\t"
		"movq 1*8(%%rsi), %%r8\n\t"
		"movq %%r11, 0*8(%%rdi)\n\t"
		"movq %%r8, 1*8(%%rdi)\n\t"

		"movq 2*8(%%rsi), %%r9\n\t"
		"movq 3*8(%%rsi), %%r10\n\t"
		"movq %%r9, 2*8(%%rdi)\n\t"
		"movq %%r10, 3*8(%%rdi)\n\t"

		"movq 4*8(%%rsi), %%r11\n\t"
		"movq 5*8(%%rsi), %%r8\n\t"
		"movq %%r11, 4*8(%%rdi)\n\t"
		"movq %%r8, 5*8(%%rdi)\n\t"

		"movq 6*8(%%rsi), %%r9\n\t"
		"movq 7*8(%%rsi), %%r10\n\t"
		"movq %%r9, 6*8(%%rdi)\n\t"
		"movq %%r10, 7*8(%%rdi)\n\t"

		"leaq 64(%%rsi), %%rsi\n\t"
		"leaq 64(%%rdi), %%rdi\n\t"

		"jnz 1b\n"

		/* Tail: ecx = (len % 64) / 8 remaining quadwords. */
		"2:\n\t"
		"movl %%edx, %%ecx\n\t"
		"andl $63, %%ecx\n\t"
		"shrl $3, %%ecx\n\t"
		"jz 4f\n"

		"3:\n\t"
		"decl %%ecx\n\t"
		"movq (%%rsi), %%r8\n\t"
		"movq %%r8, (%%rdi)\n\t"
		"leaq 8(%%rdi), %%rdi\n\t"
		"leaq 8(%%rsi), %%rsi\n\t"
		"jnz 3b\n"

		/* Tail: ecx = len % 8 remaining bytes. */
		"4:\n\t"
		"movl %%edx, %%ecx\n\t"
		"andl $7, %%ecx\n\t"
		"jz 6f\n"

		"5:\n\t"
		"movb (%%rsi), %%r8b\n\t"
		"movb %%r8b, (%%rdi)\n\t"
		"incq %%rdi\n\t"
		"incq %%rsi\n\t"
		"decl %%ecx\n\t"
		"jnz 5b\n\t"
		"jmp 6f\n"

		/*
		 * Rep-string path.  "rep movsq" does not modify the flags, so
		 * the jz after it tests the result of the preceding andl
		 * (whether any trailing bytes remain).
		 */
		"7:\n\t"
		"movl %%edx, %%ecx\n\t"
		"shrl $3, %%ecx\n\t"
		"andl $7, %%edx\n\t"
		"rep movsq\n\t"
		"jz 6f\n\t"
		"movl %%edx, %%ecx\n\t"
		"rep movsb\n"

		"6:\n"
		: "+D" (dst), "+S" (src), "+d" (len)
		:
		: "rcx", "r8", "r9", "r10", "r11", "cc", "memory");
}
/*
 * Baseline copy routine: copy len bytes from src to dst (x86-64 only)
 * using an unrolled 64-byte loop, then an 8-byte loop, then a byte loop.
 *
 * dst: destination buffer, at least len bytes.
 * src: source buffer, at least len bytes; must not overlap dst.
 * len: byte count; the assembly treats it as an unsigned 32-bit value.
 *
 * The routine is a single extended-asm statement, so the compiler knows
 * which registers and memory are modified.
 */
void memcpy_orig(char *dst, char *src, int len)
{
	__asm__ volatile (
		/* ecx = number of whole 64-byte blocks. */
		"movl %%edx, %%ecx\n\t"
		"shrl $6, %%ecx\n\t"
		"jz 2f\n"

		/*
		 * 64-byte loop.  ecx is decremented first; mov and lea leave
		 * the flags untouched, so the jnz at the bottom still tests
		 * the result of that decrement.  Every load is from src.
		 */
		"1:\n\t"
		"decl %%ecx\n\t"

		"movq 0*8(%%rsi), %%r11\n\t"
		"movq 1*8(%%rsi), %%r8\n\t"
		"movq %%r11, 0*8(%%rdi)\n\t"
		"movq %%r8, 1*8(%%rdi)\n\t"

		"movq 2*8(%%rsi), %%r9\n\t"
		"movq 3*8(%%rsi), %%r10\n\t"
		"movq %%r9, 2*8(%%rdi)\n\t"
		"movq %%r10, 3*8(%%rdi)\n\t"

		"movq 4*8(%%rsi), %%r11\n\t"
		"movq 5*8(%%rsi), %%r8\n\t"
		"movq %%r11, 4*8(%%rdi)\n\t"
		"movq %%r8, 5*8(%%rdi)\n\t"

		"movq 6*8(%%rsi), %%r9\n\t"
		"movq 7*8(%%rsi), %%r10\n\t"
		"movq %%r9, 6*8(%%rdi)\n\t"
		"movq %%r10, 7*8(%%rdi)\n\t"

		"leaq 64(%%rsi), %%rsi\n\t"
		"leaq 64(%%rdi), %%rdi\n\t"

		"jnz 1b\n"

		/* Tail: ecx = (len % 64) / 8 remaining quadwords. */
		"2:\n\t"
		"movl %%edx, %%ecx\n\t"
		"andl $63, %%ecx\n\t"
		"shrl $3, %%ecx\n\t"
		"jz 4f\n"

		"3:\n\t"
		"decl %%ecx\n\t"
		"movq (%%rsi), %%r8\n\t"
		"movq %%r8, (%%rdi)\n\t"
		"leaq 8(%%rdi), %%rdi\n\t"
		"leaq 8(%%rsi), %%rsi\n\t"
		"jnz 3b\n"

		/* Tail: ecx = len % 8 remaining bytes. */
		"4:\n\t"
		"movl %%edx, %%ecx\n\t"
		"andl $7, %%ecx\n\t"
		"jz 6f\n"

		"5:\n\t"
		"movb (%%rsi), %%r8b\n\t"
		"movb %%r8b, (%%rdi)\n\t"
		"incq %%rdi\n\t"
		"incq %%rsi\n\t"
		"decl %%ecx\n\t"
		"jnz 5b\n"

		"6:\n"
		: "+D" (dst), "+S" (src)
		: "d" (len)
		: "rcx", "r8", "r9", "r10", "r11", "cc", "memory");
}


/*
 * Benchmark driver: initialise the buffers, print the column header, then
 * print one timing row per length from 1 KiB to 15 KiB in 1 KiB steps,
 * with both alignment offsets set to 0.
 *
 * Returns 0.
 */
int main(void)
{
	int i;

	test_init();

	/* Pad past the row-label column, then name the two result columns. */
	printf ("%23s", "");
	printf ("\t%s\t%s\n", "memcpy_orig", "memcpy_new");

	for (i = 1024; i < 1024 * 16; i = i + 1024) {
		do_tpt_test(0, 0, i);
	}

	return 0;
}