Re: [PATCH 2/3] [CRYPTO] Add optimized SHA-1 implementation for i486+

From: linux
Date: Wed Jun 13 2007 - 02:46:48 EST


>> The names are the order they were written in. "One" is the lib/sha1.c
>> code (547 bytes with -Os). "Four" is a 5x unrolled C version (1106 bytes).
>
> I'd like to see your version four.

Here's the test driver wrapped around the earlier assembly code.
It's an ugly mess of copy & paste code, of course.

I suspect it could be shrunk by allocating the W[] array locally,
thereby freeing up a register. Size is -Os -fomit-frame-pointer.


/*
* SHA transform algorithm, originally taken from code written by
* Peter Gutmann, and placed in the public domain.
*/
#include <stdint.h>
#include <stdio.h>

/*
 * rol32 - rotate a 32-bit word left by @s bit positions.
 *
 * The count is reduced modulo 32, and the complementary right shift is
 * masked as well, so a count of 0 (or 32) returns @x unchanged instead
 * of performing an undefined shift by the full word width.  Being a
 * function rather than a macro, each argument is evaluated exactly once.
 */
static inline uint32_t rol32(uint32_t x, unsigned s)
{
	s &= 31;
	return (x << s) | (x >> (-s & 31));
}
/*
 * be32_to_cpu - reverse the byte order of a 32-bit word.
 *
 * Written with plain shifts and masks rather than the x86 "bswap"
 * instruction, so it builds on any target and in strict ISO C mode.
 *
 * NOTE: the swap is unconditional, exactly as in the original asm
 * version, so the result is a big-endian-to-host conversion only on
 * little-endian hosts.
 */
static inline uint32_t __attribute__((const))
be32_to_cpu(unsigned x)
{
	uint32_t v = x;

	return (v >> 24) |
	       ((v >> 8) & 0x0000ff00u) |
	       ((v << 8) & 0x00ff0000u) |
	       (v << 24);
}


/* The SHA f()-functions. */

/*
 * Bitwise round functions.  Every argument is parenthesized so that a
 * compound expression (e.g. "a | b") passed as an argument keeps its
 * own grouping inside the expansion.  Arguments may be evaluated more
 * than once, so they must be free of side effects.
 */
#define f1(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) /* x ? y : z */
#define f2(x,y,z) ((x) ^ (y) ^ (z)) /* XOR */
#define f3(x,y,z) (((x) & (y)) + ((z) & ((x) ^ (y)))) /* majority */

/* The SHA Mysterious Constants */

/*
 * Typed as 32-bit unsigned constants (UINT32_C) rather than "long":
 * with an L suffix the round sums are computed in 64-bit long on LP64
 * targets and only truncated on assignment.  The values are unchanged.
 */
#define K1 UINT32_C(0x5A827999) /* Rounds 0-19: sqrt(2) * 2^30 */
#define K2 UINT32_C(0x6ED9EBA1) /* Rounds 20-39: sqrt(3) * 2^30 */
#define K3 UINT32_C(0x8F1BBCDC) /* Rounds 40-59: sqrt(5) * 2^30 */
#define K4 UINT32_C(0xCA62C1D6) /* Rounds 60-79: sqrt(10) * 2^30 */

/**
 * sha_transform - single block SHA1 transform
 *
 * @digest: 160 bit digest to update
 * @in: 512 bits (64 bytes) of data to hash; no alignment is required
 * @W: 80 words of workspace (see note)
 *
 * This function folds a single 512-bit block into @digest.
 * Be warned, it does not handle message padding or length encoding;
 * do not confuse it with the full FIPS 180-1 digest algorithm for
 * variable length messages.
 *
 * Note: If the hash is security sensitive, the caller should be sure
 * to clear the workspace. This is left to the caller to avoid
 * unnecessary clears between chained hashing operations.
 */
void sha_transform(uint32_t digest[5], const char in[64], uint32_t W[80])
{
	const unsigned char *src = (const unsigned char *)in;
	uint32_t a, b, c, d, e, t;
	unsigned i;

	/*
	 * Assemble the 16 input words big-endian, one byte at a time.
	 * Reading the char buffer through a uint32_t pointer would need
	 * 4-byte alignment and breaks the aliasing rules; byte assembly
	 * has neither problem and is independent of host endianness.
	 */
	for (i = 0; i < 16; i++)
		W[i] = (uint32_t)src[4*i] << 24 |
		       (uint32_t)src[4*i + 1] << 16 |
		       (uint32_t)src[4*i + 2] << 8 |
		       (uint32_t)src[4*i + 3];

	/* Expand the 16 input words into the full 80-word schedule. */
	for (i = 0; i < 64; i++)
		W[i+16] = rol32(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 1);

	a = digest[0];
	b = digest[1];
	c = digest[2];
	d = digest[3];
	e = digest[4];

	/* Four groups of 20 rounds; only f() and K differ between groups. */
	for (i = 0; i < 20; i++) {
		t = f1(b, c, d) + K1 + rol32(a, 5) + e + W[i];
		e = d; d = c; c = rol32(b, 30); b = a; a = t;
	}

	for (; i < 40; i++) {
		t = f2(b, c, d) + K2 + rol32(a, 5) + e + W[i];
		e = d; d = c; c = rol32(b, 30); b = a; a = t;
	}

	for (; i < 60; i++) {
		t = f3(b, c, d) + K3 + rol32(a, 5) + e + W[i];
		e = d; d = c; c = rol32(b, 30); b = a; a = t;
	}

	/* The last group reuses the XOR function with a different constant. */
	for (; i < 80; i++) {
		t = f2(b, c, d) + K4 + rol32(a, 5) + e + W[i];
		e = d; d = c; c = rol32(b, 30); b = a; a = t;
	}

	/* Fold the working variables back into the running digest. */
	digest[0] += a;
	digest[1] += b;
	digest[2] += c;
	digest[3] += d;
	digest[4] += e;
}

/*
 * ROUND - one SHA-1 round that updates its operands in place.
 *
 * Adds @add, f(b,c,d) and rol32(a,5) into @e and rotates @b left by 30.
 * No values are shuffled between variables; the caller rotates the
 * argument list from one invocation to the next instead.
 *
 * @f is the name of a round-function macro taking three arguments.
 * Every argument is parenthesized in the expansion so that an @add
 * expression using low-precedence operators keeps its own grouping.
 * @a, @b and @e are each used more than once and must be plain lvalues.
 */
#define ROUND(a,b,c,d,e,f,add) \
	( (e) += (add) + f((b),(c),(d)), \
	  (b) = rol32((b), 30), \
	  (e) += rol32((a), 5) )

/**
 * sha_transform4 - single block SHA1 transform, rounds unrolled 5x
 *
 * @digest: 160 bit digest to update
 * @in: 512 bits (64 bytes) of data to hash; no alignment is required
 * @W: 80 words of workspace, overwritten with the message schedule
 *
 * Same interface as sha_transform().  Each loop iteration performs
 * five rounds through ROUND() with the argument list rotated by one
 * position per round, so after five rounds every variable is back in
 * its original role and no values need to be moved between them.
 */
void sha_transform4(uint32_t digest[5], const char in[64], uint32_t W[80])
{
	const unsigned char *src = (const unsigned char *)in;
	uint32_t a, b, c, d, e;
	unsigned i;

	/*
	 * Assemble the 16 input words big-endian, one byte at a time.
	 * Reading the char buffer through a uint32_t pointer would need
	 * 4-byte alignment and breaks the aliasing rules; byte assembly
	 * has neither problem and is independent of host endianness.
	 */
	for (i = 0; i < 16; i++)
		W[i] = (uint32_t)src[4*i] << 24 |
		       (uint32_t)src[4*i + 1] << 16 |
		       (uint32_t)src[4*i + 2] << 8 |
		       (uint32_t)src[4*i + 3];

	/* Expand the 16 input words into the full 80-word schedule. */
	for (i = 0; i < 64; i++)
		W[i+16] = rol32(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 1);

	a = digest[0];
	b = digest[1];
	c = digest[2];
	d = digest[3];
	e = digest[4];

	/* Rounds 0-19 */
	for (i = 0; i < 20; i += 5) {
		ROUND(a,b,c,d,e,f1,W[i  ]+K1);
		ROUND(e,a,b,c,d,f1,W[i+1]+K1);
		ROUND(d,e,a,b,c,f1,W[i+2]+K1);
		ROUND(c,d,e,a,b,f1,W[i+3]+K1);
		ROUND(b,c,d,e,a,f1,W[i+4]+K1);
	}

	/* Rounds 20-39 */
	for (; i < 40; i += 5) {
		ROUND(a,b,c,d,e,f2,W[i  ]+K2);
		ROUND(e,a,b,c,d,f2,W[i+1]+K2);
		ROUND(d,e,a,b,c,f2,W[i+2]+K2);
		ROUND(c,d,e,a,b,f2,W[i+3]+K2);
		ROUND(b,c,d,e,a,f2,W[i+4]+K2);
	}

	/* Rounds 40-59 */
	for (; i < 60; i += 5) {
		ROUND(a,b,c,d,e,f3,W[i  ]+K3);
		ROUND(e,a,b,c,d,f3,W[i+1]+K3);
		ROUND(d,e,a,b,c,f3,W[i+2]+K3);
		ROUND(c,d,e,a,b,f3,W[i+3]+K3);
		ROUND(b,c,d,e,a,f3,W[i+4]+K3);
	}

	/* Rounds 60-79: XOR function again, with the last constant */
	for (; i < 80; i += 5) {
		ROUND(a,b,c,d,e,f2,W[i  ]+K4);
		ROUND(e,a,b,c,d,f2,W[i+1]+K4);
		ROUND(d,e,a,b,c,f2,W[i+2]+K4);
		ROUND(c,d,e,a,b,f2,W[i+3]+K4);
		ROUND(b,c,d,e,a,f2,W[i+4]+K4);
	}

	/* Fold the working variables back into the running digest. */
	digest[0] += a;
	digest[1] += b;
	digest[2] += c;
	digest[3] += d;
	digest[4] += e;
}

/*
 * Transform variants "Two", "Three" and "Five", defined outside this
 * file.  NOTE(review): presumably the assembly implementations posted
 * earlier in the thread -- confirm against the accompanying sources.
 * Unlike sha_transform() they take no caller-supplied W[] workspace.
 */
extern void sha_transform2(uint32_t digest[5], const char in[64]);
extern void sha_transform3(uint32_t digest[5], const char in[64]);
extern void sha_transform5(uint32_t digest[5], const char in[64]);
/*
 * Defined outside this file.  NOTE(review): judging by the name it
 * clears stack space used by the transforms above -- confirm.
 */
extern void sha_stackwipe(void);

/*
 * sha_init - load the five-word starting state into @buf.
 *
 * Must be called before the first transform of a new message; the
 * transforms only ever add to the state they are given.
 */
void sha_init(uint32_t buf[5])
{
	static const uint32_t initial_state[5] = {
		0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
	};
	unsigned word;

	for (word = 0; word < 5; word++)
		buf[word] = initial_state[word];
}

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#if 1
/*
 * sha_stackwipe2 - zero a 90-word scratch array in this function's own
 * stack frame.
 *
 * NOTE(review): whether that frame overlaps the stack area a previous
 * transform call used depends on the compiler's frame layout -- this is
 * best-effort, not a guarantee.
 */
void sha_stackwipe2(void)
{
	uint32_t buf[90];

	memset(buf, 0, sizeof buf);
	/*
	 * Force the compiler to do the memset: the empty asm takes the
	 * buffer address as an input and the "memory" clobber declares
	 * that it may read what the pointer refers to.  Without the
	 * clobber the stores are still dead as far as the optimizer can
	 * tell and may be removed.
	 */
	__asm__ __volatile__("" : : "r" (buf) : "memory");
}
#endif


/* Size in bytes of the benchmark buffer: 10 MiB, a whole number of 64-byte blocks. */
#define TEST_SIZE (10*1024*1024)

/* Print "<label>: " and the five digest words in hex, without a newline. */
static void print_digest(const char *label, const uint32_t digest[5])
{
	printf("%s: %08x %08x %08x %08x %08x", label,
	       digest[0], digest[1], digest[2], digest[3], digest[4]);
}

/*
 * Microseconds from *start to *stop.  Returned as unsigned long so the
 * value matches the "%lu" conversion it is printed with, whatever the
 * underlying types of tv_sec and tv_usec are.
 */
static unsigned long elapsed_us(const struct timeval *start,
				const struct timeval *stop)
{
	long sec = (long)(stop->tv_sec - start->tv_sec);
	long usec = (long)(stop->tv_usec - start->tv_usec);

	return (unsigned long)(1000000L * sec + usec);
}

/* Print one benchmark line: the digest followed by the elapsed time. */
static void print_timed(const char *label, const uint32_t digest[5],
			const struct timeval *start,
			const struct timeval *stop)
{
	print_digest(label, digest);
	printf(" -- %lu us\n", elapsed_us(start, stop));
}

/*
 * Test driver.  First hashes one fixed 64-byte block with every
 * implementation so the digests can be compared by eye, then times each
 * implementation over a TEST_SIZE-byte buffer.
 *
 * Returns 0 on success, EXIT_FAILURE if the benchmark buffer cannot be
 * allocated.
 */
int main(void)
{
	uint32_t W[80];
	uint32_t out[5];
	char const text[64] = "Hello, world!\n";
	char *buf;
	uint32_t *p;
	unsigned i;
	struct timeval start, stop;

	sha_init(out);
	sha_transform(out, text, W);
	print_digest(" One", out);
	putchar('\n');

	sha_init(out);
	sha_transform4(out, text, W);
	print_digest(" Four", out);
	putchar('\n');

	sha_init(out);
	sha_transform2(out, text);
	print_digest(" Two", out);
	putchar('\n');

	sha_init(out);
	sha_transform3(out, text);
	print_digest("Three", out);
	putchar('\n');

	sha_init(out);
	sha_transform5(out, text);
	print_digest(" Five", out);
	putchar('\n');

	sha_stackwipe();
#if 1

	/* Set up a large buffer full of stuff */
	buf = malloc(TEST_SIZE);
	if (!buf) {
		perror("malloc");
		return EXIT_FAILURE;
	}
	p = (uint32_t *)buf;
	/*
	 * Seed with the last 16 schedule words left in W by the C
	 * transforms above, then keep applying the schedule recurrence
	 * until the whole buffer is filled with deterministic data.
	 */
	memcpy(p, W+80-16, 16*sizeof *p);
	for (i = 0; i < TEST_SIZE/sizeof *p - 16; i++) {
		uint32_t a = p[i+13] ^ p[i+8] ^ p[i+2] ^ p[i];
		p[i+16] = rol32(a, 1);
	}

	sha_init(out);
	gettimeofday(&start, 0);
	for (i = 0; i < TEST_SIZE; i += 64)
		sha_transform(out, buf+i, W);
	gettimeofday(&stop, 0);
	print_timed(" One", out, &start, &stop);

	sha_init(out);
	gettimeofday(&start, 0);
	for (i = 0; i < TEST_SIZE; i += 64)
		sha_transform4(out, buf+i, W);
	gettimeofday(&stop, 0);
	print_timed(" Four", out, &start, &stop);

	sha_init(out);
	gettimeofday(&start, 0);
	for (i = 0; i < TEST_SIZE; i += 64)
		sha_transform2(out, buf+i);
	gettimeofday(&stop, 0);
	print_timed(" Two", out, &start, &stop);

	sha_init(out);
	gettimeofday(&start, 0);
	for (i = 0; i < TEST_SIZE; i += 64)
		sha_transform3(out, buf+i);
	gettimeofday(&stop, 0);
	print_timed("Three", out, &start, &stop);

	sha_init(out);
	gettimeofday(&start, 0);
	for (i = 0; i < TEST_SIZE; i += 64)
		sha_transform5(out, buf+i);
	gettimeofday(&stop, 0);
	print_timed(" Five", out, &start, &stop);

	sha_stackwipe();
	free(buf);
#endif

	return 0;
}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/