Re: The Kommunity vs. Dick Johnson

David Hinds (dhinds@zen.stanford.edu)
Thu, 19 Nov 1998 13:25:03 -0800

Messages sorted by: [ date ][ thread ][ subject ][ author ]
Next message: Eric Buddington: "2.1.127: eth0 occasionaly self-disabling"
Previous message: Noah B Beck: "2.0.36 something eating DMA buffers (sound?)"
Maybe in reply to: Rick Hohensee: "The Kommunity vs. Dick Johnson"

I've modified Richard's original benchmark a bit to make it easier to
interpret. I removed the RDTSC calls since they cause trouble on non
Pentiums. I also use clock() to measure CPU time instead of SIGALRM.
And I rewrote the outer loop of the benchmark to show memory heirarchy
effects. I also tried adding code to flush the instruction cache
prior to each checksum calculation, to measure I-cache effects.

Finally, I replaced Richard's simple and slow C code with two
variants: one is C plus inline asm, hand unrolled to a depth of 16;
the other is pure C, but using "long long" arithmetic to avoid the
partial-register stalls in Richard's 16-bit-at-a-time C code.

My C+asm checksum is significantly faster than Richard's on a 486 (due
to cache pressure, I think). It is 10-25% faster on a Pentium,
depending on where packets are in the memory heirarchy, due to better
instruction scheduling in my code. It is 1-5% slower on Pentium II
and Pentium Pro processors, due to extra loop overhead.

If I flush the instruction cache before each checksum invocation, my
code is not seriously impacted, but Richard's code runs much slower.
On a Pentium or Pentium II, my code is now twice as fast: on a PPro,
it is "only" 10-25% faster, thanks to the PPro's full-speed secondary
cache.

Richard could modify his code to use my instruction sequence to get
the Pentium scheduling benefit. I'm not familiar enough with the
PPro/PII architecture to know if there are better ways of coding this
to reduce pipeline stalls. On the 486, however, Richard's code will
always lose, because of its small cache. Richard's main "innovation"
(fully unrolling a 512-iteration loop) has a best-case performance
advantage of 5% in the case where the checksum code is always in the
primary I-cache, but is significantly slower on all platforms whenever
the code is not in primary cache. In real life, one would not expect
the TCP checksum to always be in primary cache, especially if it is
Richard's code, which occupies a full 50% (4K) of the 8K I-cache on a
PPro or PII.

My better straight-C checksum using "long long" arithmetic is on the
order of 20% slower than my C+asm code under all test conditions. It
performs better than Richard's code in the instruction-cache-flush
test on most platforms. That's not a huge win for assembly...

I've attached my replacement for Richard's chksum.c: to enable or
disable the I-cache flush, change the '#define FLUSH' line. To flip
between my C+asm or straight-C algorithms, you need to change the
'#if 1' line to '#if 0'.

-- Dave Hinds

/*
* This is free software written by Richard B. Johnson. No copyright
* is claimed. It is also not guaranteed to do anything useful.
*/

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
#include <time.h>
#if !defined(SIZE)
#error Need command line argument for SIZE
#endif
#define MTU SIZE
#define LOOP_TIME 3
#define FLUSH

unsigned char buf[MTU*1024];
extern unsigned long chk(unsigned char *buffer, size_t byte_length);
extern unsigned long long tim(void);

#ifdef FLUSH
#define OCT(x) do {x;x;x;x;x;x;x;x;} while (0)
#define NOP1(a) asm ("xor %0, %0" : "=r" (a) : "0" (a))
#define NOP2(a) OCT(NOP1(a))
#define NOP3(a) OCT(NOP2(a))
#define NOP4(a) OCT(NOP3(a))
#define NOP5(a) OCT(NOP4(a))
int flush1(void)
{
int a = 4;
NOP5(a); NOP5(a);
return a;
}
int flush2(void)
{
int a = 4;
NOP5(a); NOP5(a);
return a;
}
#endif

/*
* This is a typical 'C' checksum routine to calculate the internet-
* style checksum. It will properly checksum odd byte-counts as well
* as the normal even counts.
*
* It is not written in any special way. It is used as a benchmark
* standard.
*/

#define ADCL(a,b) asm ("adcl %2,%0" : "=r" (a) : "0" (a), "r" (b))
#define ADDL(a,b) asm ("addl %2,%0" : "=r" (a) : "0" (a), "r" (b))
#define ADCZ(a) asm ("adcl $0,%0" : "=r" (a) : "0" (a))

unsigned long chksum(unsigned char *b, size_t len)
{
unsigned long *p = (unsigned long *)b;
unsigned long sum;
size_t flag = len & 1, flag2 = len & 2;

#if 1
sum = 0;
for (len >>= 2; len & 3; len--, p++) {
ADDL(sum, *p); ADCZ(sum);
}
for (len >>= 2; len & 3; len--, p += 4) {
ADDL(sum, p[0]); ADCL(sum, p[1]);
ADCL(sum, p[2]); ADCL(sum, p[3]);
ADCZ(sum);
}
for (len >>= 2; len; len--, p += 16) {
ADDL(sum, p[0]); ADCL(sum, p[1]);
ADCL(sum, p[2]); ADCL(sum, p[3]);
ADCL(sum, p[4]); ADCL(sum, p[5]);
ADCL(sum, p[6]); ADCL(sum, p[7]);
ADCL(sum, p[8]); ADCL(sum, p[9]);
ADCL(sum, p[10]); ADCL(sum, p[11]);
ADCL(sum, p[12]); ADCL(sum, p[13]);
ADCL(sum, p[14]); ADCL(sum, p[15]);
ADCZ(sum);
}
if (flag2) {
ADDL(sum, (unsigned long)*((unsigned short *)p));
ADCZ(sum);
((unsigned short *)p)++;
}
if (flag) {
ADDL(sum, (unsigned long)*((unsigned char *)p));
ADCZ(sum);
}
#else
register unsigned long long lsum = 0;

len >>= 2;
while (len & 15) {
lsum += *p++;
len--;
}
len >>= 4;
while (len--) {
lsum += p[0]; lsum += p[1]; lsum += p[2]; lsum += p[3];
lsum += p[4]; lsum += p[5]; lsum += p[6]; lsum += p[7];
lsum += p[8]; lsum += p[9]; lsum += p[10]; lsum += p[11];
lsum += p[12]; lsum += p[13]; lsum += p[14]; lsum += p[15];
p += 16;
}
if (flag2) lsum += *((unsigned short *)p)++;
if (flag) lsum += *((unsigned char *)p);
sum = (lsum & 0xffffffff) + (lsum >> 32);
#endif
sum = (sum >> 0x10) + (sum & 0xffff);
if (sum > 0xffff)
sum++;
return (~sum) & 0xffff;
}

/*
* This tests two checksum routines, one written here and one
* written in assembly.
*/

int main()
{
size_t i, len, n, mod;
unsigned long cc, as, fl, cnt;
clock_t at;
double a, c;
for(n = 0; ; n++)
{
mod = 1 << (2 * (n % 6));
printf("Testing for %d sequential packets (%dK):\n",
mod, mod*MTU/1024);
/*
* First verify that the assembly-language routine produces the
* exact same output as the 'C' routine.
*/

printf(" Verifying ASM code...");
fflush(stdout);
for(len = 0; len < MTU; len++)
{
for(i=0; i< MTU; i++)
buf[i] = (char) rand();
cc = chksum(buf, len);
as = chk(buf, len);
if(cc != as)
{
printf("\n C = %lu\n", cc);
printf(" A = %lu\n", as);
printf(" Fail at %u ...", len);
fgets(buf, MTU, stdin);
}
}
puts("Okay!");
for (i=0; i<sizeof(buf); i++)
buf[i] = (char)rand();
/*
* Okay, that works. Now see how may times the 'C' routine will
* complete in LOOP_TIME seconds. Some time will be wasted getting the
* CPU cycles.
*/

#ifdef FLUSH
printf(" Counting I-cache flush for 1 second\n");
cnt = 0;
at = clock();
while (clock() < at+CLOCKS_PER_SEC) {
flush1();
flush2();
cnt++;
}
fl = cnt;
#endif

printf(" Counting C loops for %d seconds\n", LOOP_TIME);
fflush(stdout);
len = MTU;
cnt = 0;
at = clock();
while(clock() < at+LOOP_TIME*CLOCKS_PER_SEC)
{
#ifdef FLUSH
flush1();
#endif
(void) chksum(buf + MTU*(cnt%mod), len);
#ifdef FLUSH
flush2();
#endif
cnt++;
}
c = (double) cnt;
/*
* See how may times the 'ASM' routine will complete in
* LOOP_TIME seconds. Some time will be wasted getting the CPU
* cycles.
*/
printf(" Counting ASM loops for %d seconds\n", LOOP_TIME );
fflush(stdout);
cnt = 0;
at = clock();
while(clock() < at+LOOP_TIME*CLOCKS_PER_SEC)
{
#ifdef FLUSH
flush1();
#endif
(void) chk(buf + MTU*(cnt%mod), len);
#ifdef FLUSH
flush2();
#endif
cnt++;
}
a = (double) cnt;
#ifdef FLUSH
a = a*fl/(LOOP_TIME*fl-a);
c = c*fl/(LOOP_TIME*fl-c);
#endif
printf(" C routine : %0.0f : %.1f MB/sec\n", c, c*MTU/1000000.0);
printf(" AS routine : %0.0f : %.1f MB/sec\n", a, a*MTU/1000000.0);
printf(" Change : %-3.2f times faster\n", a/c);
}
return 0;
}

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/

Next message: Eric Buddington: "2.1.127: eth0 occasionaly self-disabling"
Previous message: Noah B Beck: "2.0.36 something eating DMA buffers (sound?)"
Maybe in reply to: Rick Hohensee: "The Kommunity vs. Dick Johnson"