Re: [PATCH RFC V2] [x86] Optimize small size memcpy by avoding longlatency from decode stage

From: Ling Ma
Date: Mon Oct 22 2012 - 05:23:06 EST


Attached are the memcpy micro-benchmark, CPU info, and comparison results
between rep movsq/b and memcpy on Atom and IVB (Ivy Bridge).

Thanks
Ling


2012/10/23, ling.ma.program@xxxxxxxxx <ling.ma.program@xxxxxxxxx>:
> From: Ma Ling <ling.ma.program@xxxxxxxxx>
>
> CISC code has higher instruction density, saving memory and
> improving i-cache hit rate. However, decoding becomes a challenge:
> only one multiple-uop (2~3) instruction can be decoded in one cycle,
> and instructions containing more than 4 uops (rep movsq/b) have to be
> handled by the MS-ROM;
> this process takes a long time and eats up the advantage for small sizes.
>
>
> In order to avoid this disadvantage, we make use of general instruction code
> for small size copies. The results show a 1~2x improvement
> on Core2, Nehalem, Sandy Bridge, Ivy Bridge, Atom, and Bulldozer as well.
>
> Signed-off-by: Ma Ling <ling.ma.program@xxxxxxxxx>
> ---
> In this version we decrease the warm-up distance from 512 to 256 for upcoming
> CPUs,
> which manage to reduce latency, although decoding still takes a long time.
>
> Thanks
> Ling
>
> arch/x86/lib/memcpy_64.S | 14 +++++++++++++-
> 1 files changed, 13 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index 1c273be..6a24c8c 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -5,7 +5,6 @@
> #include <asm/cpufeature.h>
> #include <asm/dwarf2.h>
> #include <asm/alternative-asm.h>
> -
> /*
> * memcpy - Copy a memory block.
> *
> @@ -19,6 +18,15 @@
> */
>
> /*
> + * memcpy_c() and memcpy_c_e() use rep movsq/movsb respectively;
> + * these instructions have to fetch their micro-ops from the Microcode
> + * Sequencer ROM, and that decode process has a long latency. To avoid it,
> + * we choose the loop-unrolling routine for small sizes.
> + * The warm-up distance could be varied.
> + */
> +
> +
> +/*
> * memcpy_c() - fast string ops (REP MOVSQ) based variant.
> *
> * This gets patched over the unrolled variant (below) via the
> @@ -26,6 +34,8 @@
> */
> .section .altinstr_replacement, "ax", @progbits
> .Lmemcpy_c:
> + cmpq $256, %rdx
> + jbe memcpy
> movq %rdi, %rax
> movq %rdx, %rcx
> shrq $3, %rcx
> @@ -46,6 +56,8 @@
> */
> .section .altinstr_replacement, "ax", @progbits
> .Lmemcpy_c_e:
> + cmpq $256, %rdx
> + jbe memcpy
> movq %rdi, %rax
> movq %rdx, %rcx
> rep movsb
> --
> 1.6.5.2
>
>
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 28
model name : Intel(R) Atom(TM) CPU N450 @ 1.66GHz
stepping : 10
microcode : 0x107
cpu MHz : 1000.000
cache size : 512 KB
physical id : 0
siblings : 2
core id : 0
cpu cores : 1
apicid : 0
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 10
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good nopl aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm movbe lahf_lm dts
bogomips : 3324.62
clflush size : 64
cache_alignment : 64
address sizes : 32 bits physical, 48 bits virtual
power management:

processor : 1
vendor_id : GenuineIntel
cpu family : 6
model : 28
model name : Intel(R) Atom(TM) CPU N450 @ 1.66GHz
stepping : 10
microcode : 0x107
cpu MHz : 1000.000
cache size : 512 KB
physical id : 0
siblings : 2
core id : 0
cpu cores : 1
apicid : 1
initial apicid : 1
fpu : yes
fpu_exception : yes
cpuid level : 10
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good nopl aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm movbe lahf_lm dts
bogomips : 3324.62
clflush size : 64
cache_alignment : 64
address sizes : 32 bits physical, 48 bits virtual
power management:

memcpy_new memcpy_c memcpy_c_e
TPT: Len 0, alignment 0/ 0: 50 90 70
TPT: Len 4, alignment 0/ 0: 60 110 80
TPT: Len 8, alignment 0/ 0: 60 100 100
TPT: Len 12, alignment 0/ 0: 50 120 110
TPT: Len 16, alignment 0/ 0: 60 100 130
TPT: Len 20, alignment 0/ 0: 60 120 140
TPT: Len 24, alignment 0/ 0: 60 100 160
TPT: Len 28, alignment 0/ 0: 60 120 180
TPT: Len 32, alignment 0/ 0: 60 100 190
TPT: Len 36, alignment 0/ 0: 70 120 200
TPT: Len 40, alignment 0/ 0: 70 100 220
TPT: Len 44, alignment 0/ 0: 70 120 240
TPT: Len 48, alignment 0/ 0: 70 110 250
TPT: Len 52, alignment 0/ 0: 70 130 270
TPT: Len 56, alignment 0/ 0: 70 110 280
TPT: Len 60, alignment 0/ 0: 70 130 290
TPT: Len 0, alignment 4/ 0: 50 90 70
TPT: Len 0, alignment 0/ 4: 50 90 70
TPT: Len 0, alignment 0/ 0: 50 90 70
TPT: Len 0, alignment 0/ 8: 50 90 70
TPT: Len 0, alignment 8/ 0: 50 90 70
TPT: Len 0, alignment 0/16: 50 90 70
TPT: Len 0, alignment 16/ 0: 50 90 70
TPT: Len 64, alignment 4/ 0: 90 120 200
TPT: Len 64, alignment 0/ 4: 90 130 300
TPT: Len 64, alignment 0/ 0: 70 110 310
TPT: Len 64, alignment 0/ 8: 80 160 200
TPT: Len 64, alignment 8/ 0: 70 110 200
TPT: Len 64, alignment 0/16: 80 130 200
TPT: Len 64, alignment 16/ 0: 70 110 200
TPT: Len 128, alignment 4/ 0: 120 150 330
TPT: Len 128, alignment 0/ 4: 130 160 540
TPT: Len 128, alignment 0/ 0: 100 130 550
TPT: Len 128, alignment 0/ 8: 100 230 330
TPT: Len 128, alignment 8/ 0: 100 120 330
TPT: Len 128, alignment 0/16: 100 170 330
TPT: Len 128, alignment 16/ 0: 90 120 330
TPT: Len 192, alignment 4/ 0: 150 180 450
TPT: Len 192, alignment 0/ 4: 160 190 780
TPT: Len 192, alignment 0/ 0: 110 140 790
TPT: Len 192, alignment 0/ 8: 110 300 450
TPT: Len 192, alignment 8/ 0: 110 140 450
TPT: Len 192, alignment 0/16: 110 220 450
TPT: Len 192, alignment 16/ 0: 110 140 450
TPT: Len 256, alignment 4/ 0: 180 210 610
TPT: Len 256, alignment 0/ 4: 190 220 1050
TPT: Len 256, alignment 0/ 0: 130 160 180
TPT: Len 256, alignment 0/ 8: 140 370 610
TPT: Len 256, alignment 8/ 0: 130 160 610
TPT: Len 256, alignment 0/16: 140 260 630
TPT: Len 256, alignment 16/ 0: 130 160 630
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 58
model name : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz
stepping : 9
microcode : 0x12
cpu MHz : 3292.525
cache size : 6144 KB
physical id : 0
siblings : 4
core id : 0
cpu cores : 4
apicid : 0
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips : 6585.05
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:

processor : 1
vendor_id : GenuineIntel
cpu family : 6
model : 58
model name : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz
stepping : 9
microcode : 0x12
cpu MHz : 3292.525
cache size : 6144 KB
physical id : 0
siblings : 4
core id : 1
cpu cores : 4
apicid : 2
initial apicid : 2
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips : 6585.05
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:

processor : 2
vendor_id : GenuineIntel
cpu family : 6
model : 58
model name : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz
stepping : 9
microcode : 0x12
cpu MHz : 3292.525
cache size : 6144 KB
physical id : 0
siblings : 4
core id : 2
cpu cores : 4
apicid : 4
initial apicid : 4
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips : 6585.05
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:

processor : 3
vendor_id : GenuineIntel
cpu family : 6
model : 58
model name : Intel(R) Core(TM) i5-3550 CPU @ 3.30GHz
stepping : 9
microcode : 0x12
cpu MHz : 3292.525
cache size : 6144 KB
physical id : 0
siblings : 4
core id : 3
cpu cores : 4
apicid : 6
initial apicid : 6
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips : 6585.05
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:

memcpy_new memcpy_c memcpy_c_e
TPT: Len 0, alignment 0/ 0: 24 92 76
TPT: Len 4, alignment 0/ 0: 24 72 44
TPT: Len 8, alignment 0/ 0: 24 92 44
TPT: Len 12, alignment 0/ 0: 28 72 48
TPT: Len 16, alignment 0/ 0: 28 92 44
TPT: Len 20, alignment 0/ 0: 24 72 48
TPT: Len 24, alignment 0/ 0: 24 92 44
TPT: Len 28, alignment 0/ 0: 24 72 48
TPT: Len 32, alignment 0/ 0: 28 92 48
TPT: Len 36, alignment 0/ 0: 28 72 48
TPT: Len 40, alignment 0/ 0: 28 92 44
TPT: Len 44, alignment 0/ 0: 24 72 44
TPT: Len 48, alignment 0/ 0: 24 92 48
TPT: Len 52, alignment 0/ 0: 24 72 44
TPT: Len 56, alignment 0/ 0: 24 92 44
TPT: Len 60, alignment 0/ 0: 24 72 48
TPT: Len 0, alignment 4/ 0: 24 92 72
TPT: Len 0, alignment 0/ 4: 24 92 72
TPT: Len 0, alignment 0/ 0: 28 92 72
TPT: Len 0, alignment 0/ 8: 24 92 76
TPT: Len 0, alignment 8/ 0: 24 92 72
TPT: Len 0, alignment 0/16: 24 92 76
TPT: Len 0, alignment 16/ 0: 24 92 76
TPT: Len 64, alignment 4/ 0: 32 92 44
TPT: Len 64, alignment 0/ 4: 28 96 44
TPT: Len 64, alignment 0/ 0: 28 92 48
TPT: Len 64, alignment 0/ 8: 28 96 44
TPT: Len 64, alignment 8/ 0: 28 92 48
TPT: Len 64, alignment 0/16: 32 92 44
TPT: Len 64, alignment 16/ 0: 28 92 44
TPT: Len 128, alignment 4/ 0: 36 96 60
TPT: Len 128, alignment 0/ 4: 36 108 56
TPT: Len 128, alignment 0/ 0: 36 96 60
TPT: Len 128, alignment 0/ 8: 36 108 56
TPT: Len 128, alignment 8/ 0: 36 96 56
TPT: Len 128, alignment 0/16: 36 104 56
TPT: Len 128, alignment 16/ 0: 36 96 60
TPT: Len 192, alignment 4/ 0: 40 108 60
TPT: Len 192, alignment 0/ 4: 40 120 60
TPT: Len 192, alignment 0/ 0: 40 108 60
TPT: Len 192, alignment 0/ 8: 40 116 60
TPT: Len 192, alignment 8/ 0: 40 104 60
TPT: Len 192, alignment 0/16: 40 116 60
TPT: Len 192, alignment 16/ 0: 40 104 60
TPT: Len 256, alignment 4/ 0: 52 116 64
TPT: Len 256, alignment 0/ 4: 56 136 56
TPT: Len 256, alignment 0/ 0: 52 112 68
TPT: Len 256, alignment 0/ 8: 56 128 64
TPT: Len 256, alignment 8/ 0: 52 112 64
TPT: Len 256, alignment 0/16: 52 128 64
TPT: Len 256, alignment 16/ 0: 52 116 64
#include<stdio.h>
#include <stdlib.h>


/* Timestamp / cycle-delta type; wide enough for the 64-bit value built from rdtsc. */
typedef unsigned long long int hp_timing_t;
/* MAXSAMPLESTPT is not referenced anywhere in this file. */
#define MAXSAMPLESTPT 1000
/* Size in bytes of each of the two test buffers (100 MiB). */
#define MAXCOPYSIZE (1024 * 1024 * 100)
/* ORIG and NEW are not referenced anywhere in this file. */
#define ORIG 0
#define NEW 1
/* Source and destination buffers; allocated by test_init(). */
static char* buf1 = NULL;
static char* buf2 = NULL;
/* Number of timed calls per measurement; the fastest one is reported. */
static int repeat_one_test = 32;

/*
 * Amount added to the start timestamp before taking a difference.
 * It is never assigned in this file, so it keeps its zero initial value.
 */
hp_timing_t _dl_hp_timing_overhead;
/*
 * Read the time-stamp counter with rdtsc and store the combined
 * EDX:EAX value in Var.  Uses a GNU statement expression.
 */
# define HP_TIMING_NOW(Var) \
({ unsigned long long _hi, _lo; \
asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
(Var) = _hi << 32 | _lo; })

/* Diff = End - Start. */
#define HP_TIMING_DIFF(Diff, Start, End) (Diff) = ((End) - (Start))
/*
 * Add the overhead-corrected interval [start, end] to total_time.
 * Not used by the code in this file.
 */
#define HP_TIMING_TOTAL(total_time, start, end) \
do \
{ \
hp_timing_t tmptime; \
HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
total_time += tmptime; \
} \
while (0)

/*
 * Lower best_time to the overhead-corrected interval [start, end]
 * when that interval is smaller than the current best_time.
 */
#define HP_TIMING_BEST(best_time, start, end) \
do \
{ \
hp_timing_t tmptime; \
HP_TIMING_DIFF (tmptime, start + _dl_hp_timing_overhead, end); \
if (best_time > tmptime) \
best_time = tmptime; \
} \
while (0)


/*
 * The three copy routines being compared; all are defined later in
 * this file with basic inline asm.
 */
/* Unrolled general-register copy. */
void memcpy_new(char *dst, char *src, int len);
/* rep movsq for the quadwords followed by rep movsb for the remainder. */
void memcpy_c(char *dst, char *src, int len);
/* A single rep movsb over the whole length. */
void memcpy_c_e(char *dst, char *src, int len);
/* Routine currently under test; set by do_test() and called by do_one_test(). */
void (*do_memcpy)(char *dst, char *src, int len);

/*
 * Time the routine currently selected in do_memcpy.
 *
 * Calls do_memcpy (dst, src, len) repeat_one_test times, bracketing each
 * call with HP_TIMING_NOW, and prints the smallest observed timestamp
 * delta preceded by a tab.
 *
 * dst, src - destination and source passed straight to do_memcpy.
 * len      - byte count; converted to the int parameter of do_memcpy.
 */
static void
do_one_test (char *dst, char *src, size_t len)
{
  hp_timing_t start __attribute ((unused));
  hp_timing_t stop __attribute ((unused));
  /* Start from the maximum value so the first sample always wins. */
  hp_timing_t best_time = ~ (hp_timing_t) 0;
  /* Same type as repeat_one_test, avoiding a signed/unsigned comparison. */
  int i;

  for (i = 0; i < repeat_one_test; ++i)
    {
      HP_TIMING_NOW (start);
      do_memcpy (dst, src, len);
      HP_TIMING_NOW (stop);
      HP_TIMING_BEST (best_time, start, stop);
    }

  /* The argument is a size_t, so the unsigned conversion is required. */
  printf ("\t%zu", (size_t) best_time);
}

/*
 * Print one result row: the length and alignments, followed by the best
 * time of memcpy_new, memcpy_c and memcpy_c_e (in that order) for the
 * same copy.
 *
 * align1 - byte offset added to buf1 to form the source pointer.
 * align2 - byte offset added to buf2 to form the destination pointer.
 * len    - number of bytes to copy.
 *
 * Leaves do_memcpy pointing at memcpy_c_e, the last routine measured.
 */
static void
do_test (size_t align1, size_t align2, size_t len)
{
  /* Measured in the order of the column headers printed by main(). */
  static void (*const variants[]) (char *, char *, int) =
    {
      memcpy_new, memcpy_c, memcpy_c_e
    };
  char *src = buf1 + align1;
  char *dst = buf2 + align2;
  size_t v;

  /* All three arguments are size_t, hence the unsigned conversions. */
  printf ("TPT: Len %4zu, alignment %2zu/%2zu:", len, align1, align2);

  for (v = 0; v < sizeof variants / sizeof variants[0]; ++v)
    {
      do_memcpy = variants[v];
      do_one_test (dst, src, len);
    }

  putchar ('\n');
}

/*
 * Allocate the two MAXCOPYSIZE-byte test buffers with valloc() and write
 * one byte in every 64-byte stride of each buffer.
 *
 * Terminates the program with EXIT_FAILURE if either allocation fails,
 * because every later test dereferences both buffers.
 *
 * Returns 0.  The return type was implicit int in the original; it is
 * now spelled out and a value is actually returned.
 */
static int
test_init (void)
{
  size_t off;

  buf1 = valloc (MAXCOPYSIZE);
  buf2 = valloc (MAXCOPYSIZE);
  if (buf1 == NULL || buf2 == NULL)
    {
      perror ("valloc");
      exit (EXIT_FAILURE);
    }

  /* Store the low byte of the offset once per 64-byte stride. */
  for (off = 0; off < MAXCOPYSIZE; off += 64)
    buf1[off] = buf2[off] = off & 0xff;

  return 0;
}

/*
 * rep stosq based fill.  Not called anywhere in this file.
 *
 * The asm reads %rdi and %rdx directly instead of using the C parameters;
 * it assumes the compiler has left the x86-64 System V argument registers
 * untouched on entry (dst in %rdi, len in %rdx) - TODO confirm for the
 * optimisation level in use.
 *
 * NOTE(review): %rax, the value rep stosq stores, is never set here, and
 * the len % 8 trailing bytes are not written.
 */
void memset_c(char *dst, char *src, int len)
{
/* rcx = len / 8: number of quadwords to store. */
__asm__("mov %rdx, %rcx");
__asm__("shr $3, %rcx");
/* Store %rax to (%rdi) %rcx times, advancing %rdi. */
__asm__("rep stosq");
}
/*
 * Fill loop that issues eight 16-byte movdqa stores of %xmm0 per
 * iteration.  Not called anywhere in this file.
 *
 * Like the other asm routines here, it reads %rdi/%rdx directly and
 * assumes they still hold dst and len on entry - TODO confirm.
 * movdqa requires (%rdi) to be 16-byte aligned.
 *
 * NOTE(review): %rdi is never advanced, so every iteration rewrites the
 * same 128 bytes - confirm whether that is intended.
 * NOTE(review): %xmm0 is never initialised by this function.
 * NOTE(review): a count below 128 makes the first sub wrap, so the loop
 * would run far longer than intended; presumably callers pass counts of
 * at least 128.
 */
void memset_2(char *dst, char *src, int len)
{
/* Bias the count by one block so the loop can exit on borrow. */
__asm__("sub $128, %rdx");
__asm__("1:");
/* Sets CF when the remaining count drops below zero; tested by jae below. */
__asm__("sub $128, %rdx");
__asm__("movdqa %xmm0, (%rdi)");
__asm__("movdqa %xmm0, 16(%rdi)");
__asm__("movdqa %xmm0, 32(%rdi)");
__asm__("movdqa %xmm0, 48(%rdi)");
__asm__("movdqa %xmm0, 64(%rdi)");
__asm__("movdqa %xmm0, 80(%rdi)");
__asm__("movdqa %xmm0, 96(%rdi)");
__asm__("movdqa %xmm0, 112(%rdi)");
/* movdqa leaves the flags alone, so this still tests the sub above. */
__asm__("jae 1b");

}

/*
 * Copy using rep movsq for the whole quadwords and rep movsb for the
 * remaining len % 8 bytes.
 *
 * The asm reads %rdi, %rsi and %edx directly instead of the C parameters;
 * it assumes the compiler has left the x86-64 System V argument registers
 * untouched on entry (dst, src, len) - TODO confirm for the optimisation
 * level in use.  No clobbers are declared for %rax, %rcx, %rdx, %rsi,
 * %rdi or memory.
 */
void memcpy_c(char *dst, char *src, int len)
{

/* rax = dst; the C prototype returns void, so this value is not used by callers. */
__asm__("mov %rdi, %rax");

/* ecx = len / 8 quadwords, edx = len % 8 trailing bytes. */
__asm__("movl %edx, %ecx");
__asm__("shrl $3, %ecx");
__asm__("andl $7, %edx");
__asm__("rep movsq");
/* Copy the trailing bytes. */
__asm__("movl %edx, %ecx");
__asm__("rep movsb");
/* This label is not referenced by any jump in this function. */
__asm__("1:");

}
/*
 * Copy the whole buffer with a single rep movsb.
 *
 * The asm reads %rdi, %rsi and %rdx directly; it assumes they still hold
 * dst, src and len on entry - TODO confirm for the optimisation level in
 * use.
 *
 * NOTE(review): len is declared int but the full 64-bit %rdx is used as
 * the count, so this relies on the upper 32 bits of %rdx being zero.
 */
void memcpy_c_e(char *dst, char *src, int len)
{

/* rax = dst; the C prototype returns void, so this value is not used by callers. */
__asm__("movq %rdi, %rax");
/* rcx = byte count for rep movsb. */
__asm__("movq %rdx, %rcx");
__asm__("rep movsb");

}
/*
 * Copy written with ordinary integer loads and stores, without rep-prefixed
 * string instructions.
 *
 * Lengths of 32 bytes or more are copied in 32-byte blocks, forward or
 * backward depending on a comparison of the low address bytes; the
 * remaining 0..31 bytes (and all short copies) go through .Lhandle_tail,
 * which uses overlapping head/tail accesses for each size class.
 *
 * The asm reads %rdi, %rsi and %rdx directly instead of the C parameters;
 * it assumes the compiler has left the x86-64 System V argument registers
 * untouched on entry (dst, src, len) - TODO confirm for the optimisation
 * level in use.  No clobbers are declared (%rax, %rcx, %rdx, %rsi, %rdi,
 * %r8-%r11, flags, memory).
 *
 * NOTE(review): len is declared int, but the first compare and the block
 * loops use the full 64-bit %rdx, so this relies on the upper 32 bits of
 * %rdx being zero.
 * NOTE(review): the .L* labels are emitted as written, so the assembler
 * would see duplicates if this body were ever emitted twice (e.g. through
 * inlining) - confirm the build prevents that.
 */
void memcpy_new(char *dst, char *src, int len)
{
/* rax = dst; the C prototype returns void, so this value is not used by callers. */
__asm__("movq %rdi, %rax");

/* Fewer than 32 bytes: skip the block loops entirely. */
__asm__("cmpq $0x20, %rdx");
__asm__("jb .Lhandle_tail");

/*
 * We check whether a memory false dependence could occur,
 * then jump to the corresponding copy mode.
 * Only the low address bytes are compared (signed): copy backward
 * when the low byte of src is less than the low byte of dst.
 */
__asm__("cmp %dil, %sil");
__asm__("jl .Lcopy_backward");
/* Bias the count by one block so the loop can exit on borrow. */
__asm__("subq $0x20, %rdx");
__asm__(".Lcopy_forward_loop:");
/* Sets CF once fewer than 32 bytes remain after this block. */
__asm__("subq $0x20, %rdx");

/*
 * Move in blocks of 4x8 bytes:
 */
__asm__("movq 0*8(%rsi), %r8");
__asm__("movq 1*8(%rsi), %r9");
__asm__("movq 2*8(%rsi), %r10");
__asm__("movq 3*8(%rsi), %r11");
__asm__("leaq 4*8(%rsi), %rsi");

__asm__("movq %r8, 0*8(%rdi)");
__asm__("movq %r9, 1*8(%rdi)");
__asm__("movq %r10, 2*8(%rdi)");
__asm__("movq %r11, 3*8(%rdi)");
__asm__("leaq 4*8(%rdi), %rdi");
/* mov and lea leave the flags alone, so this still tests the subq above. */
__asm__("jae .Lcopy_forward_loop");
/* Undo the bias: edx = remaining byte count (0..31). */
__asm__("addl $0x20, %edx");
__asm__("jmp .Lhandle_tail");

__asm__(".Lcopy_backward:");
/*
 * Calculate copy position to tail.
 */
__asm__("addq %rdx, %rsi");
__asm__("addq %rdx, %rdi");
/* Bias the count by one block so the loop can exit on borrow. */
__asm__("subq $0x20, %rdx");
/*
 * At most 3 ALU operations in one cycle,
 * so append NOPs in the same 16-byte chunk.
 */
__asm__(".p2align 4");
__asm__(".Lcopy_backward_loop:");
/* Sets CF once fewer than 32 bytes remain after this block. */
__asm__("subq $0x20, %rdx");
__asm__("movq -1*8(%rsi), %r8");
__asm__("movq -2*8(%rsi), %r9");
__asm__("movq -3*8(%rsi), %r10");
__asm__("movq -4*8(%rsi), %r11");
__asm__("leaq -4*8(%rsi), %rsi");
__asm__("movq %r8, -1*8(%rdi)");
__asm__("movq %r9, -2*8(%rdi)");
__asm__("movq %r10, -3*8(%rdi)");
__asm__("movq %r11, -4*8(%rdi)");
__asm__("leaq -4*8(%rdi), %rdi");
/* mov and lea leave the flags alone, so this still tests the subq above. */
__asm__("jae .Lcopy_backward_loop");

/*
 * Calculate copy position to head: undo the bias, then step both
 * pointers back by the remaining count so the tail code below
 * copies the first edx bytes.
 */
__asm__("addl $0x20, %edx");
__asm__("subq %rdx, %rsi");
__asm__("subq %rdx, %rdi");
/* Here edx = bytes left to copy (0..31), rsi/rdi = start of that range. */
__asm__(".Lhandle_tail:");
__asm__("cmpl $16, %edx");
__asm__("jb .Lless_16bytes");

/*
 * Move data from 16 bytes to 31 bytes:
 * first 16 bytes plus last 16 bytes; the two ranges may overlap.
 */
__asm__("movq 0*8(%rsi), %r8");
__asm__("movq 1*8(%rsi), %r9");
__asm__("movq -2*8(%rsi, %rdx), %r10");
__asm__("movq -1*8(%rsi, %rdx), %r11");
__asm__("movq %r8, 0*8(%rdi)");
__asm__("movq %r9, 1*8(%rdi)");
__asm__("movq %r10, -2*8(%rdi, %rdx)");
__asm__("movq %r11, -1*8(%rdi, %rdx)");
__asm__("jmp .Lend");
__asm__(".p2align 4");
__asm__(".Lless_16bytes:");
__asm__("cmpl $8, %edx");
__asm__("jb .Lless_8bytes");
/*
 * Move data from 8 bytes to 15 bytes:
 * first 8 bytes plus last 8 bytes; the two ranges may overlap.
 */
__asm__("movq 0*8(%rsi), %r8");
__asm__("movq -1*8(%rsi, %rdx), %r9");
__asm__("movq %r8, 0*8(%rdi)");
__asm__("movq %r9, -1*8(%rdi, %rdx)");
__asm__("jmp .Lend");
__asm__(".p2align 4");
__asm__(".Lless_8bytes:");
__asm__("cmpl $4, %edx");
__asm__("jb .Lless_3bytes");

/*
 * Move data from 4 bytes to 7 bytes:
 * first 4 bytes plus last 4 bytes; the two ranges may overlap.
 */
__asm__("movl (%rsi), %ecx");
__asm__("movl -4(%rsi, %rdx), %r8d");
__asm__("movl %ecx, (%rdi)");
__asm__("movl %r8d, -4(%rdi, %rdx)");
__asm__("jmp .Lend");
__asm__(".p2align 4");
/* Reached with 0..3 bytes left. */
__asm__(".Lless_3bytes:");
/* edx = len - 1; a borrow means len was 0 and nothing is copied. */
__asm__("subl $1, %edx");
__asm__("jb .Lend");
/*
 * Move data from 1 byte to 3 bytes.
 */
__asm__("movzbl (%rsi), %ecx");
/* movzbl leaves the flags alone: ZF from the subl means len was 1. */
__asm__("jz .Lstore_1byte");
/* len is 2 or 3: copy byte 1 and the last byte (index edx = len - 1). */
__asm__("movzbq 1(%rsi), %r8");
__asm__("movzbq (%rsi, %rdx), %r9");
__asm__("movb %r8b, 1(%rdi)");
__asm__("movb %r9b, (%rdi, %rdx)");
__asm__(".Lstore_1byte:");
__asm__("movb %cl, (%rdi)");


__asm__(".Lend:");
}


/*
 * Benchmark driver.
 *
 * Allocates the buffers, prints the column headers, then prints one
 * result row per test:
 *   - lengths 0, 4, ..., 60 with both buffers at offset 0;
 *   - lengths 0, 64, ..., 512, each with seven source/destination
 *     offset combinations (4/0, 0/4, 0/0, 0/8, 8/0, 0/16, 16/0).
 *
 * Returns 0.  The original was declared void main, which leaves the
 * process exit status undefined.
 */
int
main (void)
{
  int len;

  test_init ();

  /* Pad the header so the names line up with the timing columns. */
  printf ("%23s", "");
  printf ("\t%s\t%s\t%s\n", "memcpy_new", "memcpy_c", "memcpy_c_e");

  for (len = 0; len < 64; len += 4)
    do_test (0, 0, len);

  for (len = 0; len < 576; len += 64)
    {
      do_test (4, 0, len);
      do_test (0, 4, len);
      do_test (0, 0, len);
      do_test (0, 8, len);
      do_test (8, 0, len);
      do_test (0, 8 * 2, len);
      do_test (8 * 2, 0, len);
    }

  return 0;
}