Re: Kernel checksum routine (>=5)x86 optimization

Kurt Garloff (garloff@suse.de)
Mon, 17 May 1999 02:36:43 +0200


--/Uq4LBwYP4y1W6pO
Content-Type: multipart/mixed; boundary=H1spWtNR+x+ondvy

--H1spWtNR+x+ondvy
Content-Type: text/plain; charset=iso-8859-1
Content-Transfer-Encoding: quoted-printable

On Sun, May 16, 1999 at 12:20:53AM +0200, Ingo Molnar wrote:
> > In the arch/i386/lib/checksum.S file, line 156:
> >=20
> > 40:
> > addl -128(%esi), %eax
> > adcl -124(%esi), %eax
> > adcl -120(%esi), %eax
> > adcl -116(%esi), %eax
> > adcl -112(%esi), %eax
> > adcl -108(%esi), %eax
> > adcl -104(%esi), %eax
> > adcl -100(%esi), %eax
> > adcl -96(%esi), %eax
> > adcl -92(%esi), %eax
> > adcl -88(%esi), %eax
> > adcl -84(%esi), %eax
> > adcl -80(%esi), %eax
> > adcl -76(%esi), %eax
> >
> > this is, as I understand, a vastly unrolled loop to perform the
> > checksum. Note that on a dual pipeline machine, the second pipeline does
> > not get used at all in this routine. [...]
>
> this is wrong, the PPro and up (for what this routine is optimized) uses
> register renaming which makes the above loop as highspeed as it can get.

I think you are wrong here, Ingo!

Register renaming helps for write access to a register, but not in the case
where the modified contents of the register are needed. So, in ix86 assembly
language, register renaming helps to parallelize

movl %eax, memory    mem = a;
movl %ebx, %eax      a = b;

movl mem, %eax       a = mem;
movl %ebx, %eax      a = b;

Normally the second insn would have to wait for the first to finish, because
%eax is modified. The same holds for the third and fourth insn resp.
(although this is stupid code).

But I can't imagine any CPU being able to parallelize something like
a=mem[0]; a+=mem[1]; a+=mem[2]; a+=mem[3];
but
a=mem[0]; b=mem[1]; a+=mem[2]; b+=mem[3];
should be parallelizable.
So the original proposal to improve speed is a valid one.

Well, I hacked a test. It seems my theory fails. Can someone explain why
the PPro does not perform better using two regs?

garloff@kurt:~/C $ ./test_renaming
Summing with one reg : Res=2076732064, Time=0.410000 s
Summing with two regs: Res=2076732064, Time=0.400000 s

--=20
Kurt Garloff <garloff@suse.de> SuSE GmbH, Nürnberg, FRG
Linux kernel development; SCSI driver: DC390 (tmscsim/AM53C974)

--H1spWtNR+x+ondvy
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="test_renaming.c"
Content-Transfer-Encoding: quoted-printable

/* test_renaming.c */
/*
* tests speed of additions done using one or two registers
* to prove whether "register renaming" will help or not
*/
/* (c) Kurt Garloff <garloff@suse.de> 99/05/17 */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Benchmark input: main() fills it via init_tbl() and hands it to sum_one() and sum_two(). */
int table[32];

/*
 * sum_one - benchmark kernel: add up tbl[0..31] through a single accumulator.
 *
 * The 32-word sum is recomputed 0x400000 times.  Every pass starts over with
 * a fresh load of tbl[0], so the value returned is the result of the last
 * pass: tbl[0] + ... + tbl[31] with 32-bit wrap-around.  All 31 additions of
 * a pass go through the same register.
 *
 * tbl must point to at least 32 readable ints.
 * x86 / x86-64 only (GCC extended asm, AT&T syntax).
 */
int sum_one (int * tbl)
{
    int sum;
    /* "loop" counts in the full-width counter register, hence size_t. */
    size_t passes = 0x400000;

    __asm__ volatile (
        "1:                          \n\t"
        "movl   0(%[tbl]),%[sum]     \n\t"
        "addl   4(%[tbl]),%[sum]     \n\t"
        "addl   8(%[tbl]),%[sum]     \n\t"
        "addl  12(%[tbl]),%[sum]     \n\t"
        "addl  16(%[tbl]),%[sum]     \n\t"
        "addl  20(%[tbl]),%[sum]     \n\t"
        "addl  24(%[tbl]),%[sum]     \n\t"
        "addl  28(%[tbl]),%[sum]     \n\t"
        "addl  32(%[tbl]),%[sum]     \n\t"
        "addl  36(%[tbl]),%[sum]     \n\t"
        "addl  40(%[tbl]),%[sum]     \n\t"
        "addl  44(%[tbl]),%[sum]     \n\t"
        "addl  48(%[tbl]),%[sum]     \n\t"
        "addl  52(%[tbl]),%[sum]     \n\t"
        "addl  56(%[tbl]),%[sum]     \n\t"
        "addl  60(%[tbl]),%[sum]     \n\t"
        "addl  64(%[tbl]),%[sum]     \n\t"
        "addl  68(%[tbl]),%[sum]     \n\t"
        "addl  72(%[tbl]),%[sum]     \n\t"
        "addl  76(%[tbl]),%[sum]     \n\t"
        "addl  80(%[tbl]),%[sum]     \n\t"
        "addl  84(%[tbl]),%[sum]     \n\t"
        "addl  88(%[tbl]),%[sum]     \n\t"
        "addl  92(%[tbl]),%[sum]     \n\t"
        "addl  96(%[tbl]),%[sum]     \n\t"
        "addl 100(%[tbl]),%[sum]     \n\t"
        "addl 104(%[tbl]),%[sum]     \n\t"
        "addl 108(%[tbl]),%[sum]     \n\t"
        "addl 112(%[tbl]),%[sum]     \n\t"
        "addl 116(%[tbl]),%[sum]     \n\t"
        "addl 120(%[tbl]),%[sum]     \n\t"
        "addl 124(%[tbl]),%[sum]     \n\t"
        "loop  1b                    \n"
        /* Outputs: the accumulator is written before the asm is done with
         * its inputs (early clobber); the pass counter is consumed in place. */
        : [sum] "=&a" (sum), [passes] "+c" (passes)
        /* Input: table base; printed at pointer width, so the addressing is
         * correct on both 32- and 64-bit targets. */
        : [tbl] "d" (tbl)
        /* Operand registers must not be repeated as clobbers.  "memory"
         * tells the compiler that the asm reads tbl[]. */
        : "cc", "memory"
    );
    return sum;
}

/*
 * sum_two - benchmark kernel: add up tbl[0..31] through two accumulators.
 *
 * Same work as a plain 32-word sum, but the even-indexed words are added
 * into one register and the odd-indexed words into another; the two partial
 * sums are combined at the end of each pass.  The pass is repeated 0x400000
 * times, each one starting from fresh loads of tbl[0] and tbl[1], so the
 * value returned is tbl[0] + ... + tbl[31] with 32-bit wrap-around.
 *
 * tbl must point to at least 32 readable ints.
 * x86 / x86-64 only (GCC extended asm, AT&T syntax).
 */
int sum_two (int* tbl)
{
    int even_sum;
    int odd_sum;
    /* "loop" counts in the full-width counter register, hence size_t. */
    size_t passes = 0x400000;

    __asm__ volatile (
        "1:                          \n\t"
        "movl   0(%[tbl]),%[even]    \n\t"
        "movl   4(%[tbl]),%[odd]     \n\t"
        "addl   8(%[tbl]),%[even]    \n\t"
        "addl  12(%[tbl]),%[odd]     \n\t"
        "addl  16(%[tbl]),%[even]    \n\t"
        "addl  20(%[tbl]),%[odd]     \n\t"
        "addl  24(%[tbl]),%[even]    \n\t"
        "addl  28(%[tbl]),%[odd]     \n\t"
        "addl  32(%[tbl]),%[even]    \n\t"
        "addl  36(%[tbl]),%[odd]     \n\t"
        "addl  40(%[tbl]),%[even]    \n\t"
        "addl  44(%[tbl]),%[odd]     \n\t"
        "addl  48(%[tbl]),%[even]    \n\t"
        "addl  52(%[tbl]),%[odd]     \n\t"
        "addl  56(%[tbl]),%[even]    \n\t"
        "addl  60(%[tbl]),%[odd]     \n\t"
        "addl  64(%[tbl]),%[even]    \n\t"
        "addl  68(%[tbl]),%[odd]     \n\t"
        "addl  72(%[tbl]),%[even]    \n\t"
        "addl  76(%[tbl]),%[odd]     \n\t"
        "addl  80(%[tbl]),%[even]    \n\t"
        "addl  84(%[tbl]),%[odd]     \n\t"
        "addl  88(%[tbl]),%[even]    \n\t"
        "addl  92(%[tbl]),%[odd]     \n\t"
        "addl  96(%[tbl]),%[even]    \n\t"
        "addl 100(%[tbl]),%[odd]     \n\t"
        "addl 104(%[tbl]),%[even]    \n\t"
        "addl 108(%[tbl]),%[odd]     \n\t"
        "addl 112(%[tbl]),%[even]    \n\t"
        "addl 116(%[tbl]),%[odd]     \n\t"
        "addl 120(%[tbl]),%[even]    \n\t"
        "addl 124(%[tbl]),%[odd]     \n\t"
        "addl  %[odd],%[even]        \n\t"
        "loop  1b                    \n"
        /* Outputs: both accumulators are written before the asm is done with
         * its inputs (early clobber).  The second one lives in whatever
         * register the compiler picks instead of a hard-wired ebx. */
        : [even] "=&a" (even_sum), [odd] "=&r" (odd_sum),
          [passes] "+c" (passes)
        /* Input: table base; printed at pointer width, so the addressing is
         * correct on both 32- and 64-bit targets. */
        : [tbl] "d" (tbl)
        /* Operand registers must not be repeated as clobbers.  "memory"
         * tells the compiler that the asm reads tbl[]. */
        : "cc", "memory"
    );
    return even_sum;
}

/*
 * init_tbl - fill the 32-entry benchmark table.
 *
 * Every entry is set to the same fixed value (not rand()), so the sums
 * computed from the table are identical on every run.
 *
 * tbl must point to at least 32 writable ints.  Exactly 32 entries are
 * written: that is the size of the global table and the number of words
 * the summing routines read.
 *
 * Returns 0.
 */
int init_tbl (int* tbl)
{
    enum { TBL_ENTRIES = 32 };
    int i;

    for (i = 0; i < TBL_ENTRIES; i++) {
        tbl[i] = 333333333;
    }
    return 0;
}

/*
 * Fill the table, then time sum_one() and sum_two() on it with clock()
 * and print each result together with the CPU time it took, in seconds.
 */
int main (void)
{
    clock_t start, stop;
    int r;

    /* Seeding only has an effect if init_tbl() is changed to use rand(). */
    srand ((unsigned) time (NULL));
    init_tbl (table);

    start = clock ();
    r = sum_one (table);
    stop = clock ();
    printf ("Summing with one reg : Res=%i, Time=%f s\n",
            r, (double)(stop - start) / CLOCKS_PER_SEC);

    start = clock ();
    r = sum_two (table);
    stop = clock ();
    printf ("Summing with two regs: Res=%i, Time=%f s\n",
            r, (double)(stop - start) / CLOCKS_PER_SEC);

    return 0;
}

--H1spWtNR+x+ondvy--

--/Uq4LBwYP4y1W6pO
Content-Type: application/pgp-signature

-----BEGIN PGP SIGNATURE-----
Version: 2.6.3in

iQCVAwUBNz9kmxaQN/7O/JIVAQH0rgP/b5kGyoGquURSjEgS9WrXUBbkXSC72pP1
XL8RH4alLwNtr08iy+BG97xd5ymLxpYwcfFql1K3hGnrDgWjm87j3CDVEor/lhFz
5fgMZvwFO2dttu8ezG+Dq6tRRVscHqdmYFnbO/l8FNXM9yroYJyUSWSHTVP5Gw7f
RGixT74qAS0=
=xTuX
-----END PGP SIGNATURE-----

--/Uq4LBwYP4y1W6pO--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/