[RFC PATCH] crypto: arc4: Implement a version optimized for memory usage

From: Christophe JAILLET
Date: Sun May 02 2021 - 15:29:58 EST


The S array does not need to be u32: every value it stores fits in a byte,
so u8 is enough. On machines which have efficient unaligned access, use u8
to save some memory.

So, provide a version optimized for memory usage in such a case.

Based on my testing, the size of arc4_ctx is:
   u8 version:   264 bytes (256 * 1 + 2 * 4)
   u32 version: 1032 bytes (256 * 4 + 2 * 4)

On my machine, the u8 version is about 5% faster.
It saves ~800 bytes of memory or stack, depending on how arc4_ctx is
stored, and it is likely to be more cache friendly.

It has been tested on a Core i7-3770 with the following test program:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define u8 unsigned char
#define u32 unsigned int
#define true 1

struct arc4_ctx_8 {
	u8 S[256];
	u32 x, y;
};
struct arc4_ctx_32 {
	u32 S[256];
	u32 x, y;
};

#define S_type u8
int arc4_setkey_8(struct arc4_ctx_8 *ctx, const u8 *in_key, unsigned int key_len)
{
	int i, j = 0, k = 0;

	ctx->x = 1;
	ctx->y = 0;

	for (i = 0; i < 256; i++)
		ctx->S[i] = i;

	for (i = 0; i < 256; i++) {
		S_type a = ctx->S[i];

		j = (j + in_key[k] + a) & 0xff;
		ctx->S[i] = ctx->S[j];
		ctx->S[j] = a;
		if (++k >= key_len)
			k = 0;
	}

	return 0;
}

void arc4_crypt_8(struct arc4_ctx_8 *ctx, u8 *out, const u8 *in, unsigned int len)
{
	S_type *const S = ctx->S;
	S_type a, b, ta, tb;
	u32 x, y, ty;

	if (len == 0)
		return;

	x = ctx->x;
	y = ctx->y;

	a = S[x];
	y = (y + a) & 0xff;
	b = S[y];

	do {
		S[y] = a;
		a = (a + b) & 0xff;
		S[x] = b;
		x = (x + 1) & 0xff;
		ta = S[x];
		ty = (y + ta) & 0xff;
		tb = S[ty];
		*out++ = *in++ ^ S[a];
		if (--len == 0)
			break;
		y = ty;
		a = ta;
		b = tb;
	} while (true);

	ctx->x = x;
	ctx->y = y;
}

#undef S_type
#define S_type u32
int arc4_setkey_32(struct arc4_ctx_32 *ctx, const u8 *in_key, unsigned int key_len)
{
	int i, j = 0, k = 0;

	ctx->x = 1;
	ctx->y = 0;

	for (i = 0; i < 256; i++)
		ctx->S[i] = i;

	for (i = 0; i < 256; i++) {
		S_type a = ctx->S[i];

		j = (j + in_key[k] + a) & 0xff;
		ctx->S[i] = ctx->S[j];
		ctx->S[j] = a;
		if (++k >= key_len)
			k = 0;
	}

	return 0;
}

void arc4_crypt_32(struct arc4_ctx_32 *ctx, u8 *out, const u8 *in, unsigned int len)
{
	S_type *const S = ctx->S;
	S_type a, b, ta, tb;
	u32 x, y, ty;

	if (len == 0)
		return;

	x = ctx->x;
	y = ctx->y;

	a = S[x];
	y = (y + a) & 0xff;
	b = S[y];

	do {
		S[y] = a;
		a = (a + b) & 0xff;
		S[x] = b;
		x = (x + 1) & 0xff;
		ta = S[x];
		ty = (y + ta) & 0xff;
		tb = S[ty];
		*out++ = *in++ ^ S[a];
		if (--len == 0)
			break;
		y = ty;
		a = ta;
		b = tb;
	} while (true);

	ctx->x = x;
	ctx->y = y;
}

#define KEY "AZERTY"
#define in "AZERTYUIOP_QSDFGHJKLM_WXCVBN"

int main(void)
{
	struct arc4_ctx_8 ctx_8;
	u8 out8[1024] = { 0 };
	struct arc4_ctx_32 ctx_32;
	u8 out32[1024] = { 0 };

	arc4_setkey_8(&ctx_8, (const u8 *)KEY, strlen(KEY));
	arc4_crypt_8(&ctx_8, out8, (const u8 *)in, strlen(in));

	arc4_setkey_32(&ctx_32, (const u8 *)KEY, strlen(KEY));
	arc4_crypt_32(&ctx_32, out32, (const u8 *)in, strlen(in));

	printf("%zu vs %zu\n", sizeof(ctx_8), sizeof(ctx_32));
	if (memcmp(out8, out32, 1024) == 0)
		printf("Ok\n");
	else
		printf("Broken\n");

	return 0;
}
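
Note: the program above only checks that both variants produce the same
keystream and prints the two context sizes; it does not time anything, so
the ~5% figure does not come from it. As a purely illustrative sketch (not
part of the patch; the iteration count, buffer size and clock_gettime()
harness are arbitrary choices), a helper along these lines could be pasted
just before main() and called from it to get a rough timing comparison:

#include <time.h>

#define BENCH_ITERS 200000
#define BENCH_LEN 1024

/* Illustrative only: wall-clock seconds since an arbitrary origin. */
static double now_s(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec + ts.tv_nsec / 1e9;
}

/* Encrypt the same buffer many times with each variant and print the time. */
static void bench(void)
{
	static u8 buf[BENCH_LEN], out[BENCH_LEN];
	struct arc4_ctx_8 c8;
	struct arc4_ctx_32 c32;
	double t;
	long i;

	arc4_setkey_8(&c8, (const u8 *)"AZERTY", 6);
	t = now_s();
	for (i = 0; i < BENCH_ITERS; i++)
		arc4_crypt_8(&c8, out, buf, BENCH_LEN);
	printf("u8 : %f s\n", now_s() - t);

	arc4_setkey_32(&c32, (const u8 *)"AZERTY", 6);
	t = now_s();
	for (i = 0; i < BENCH_ITERS; i++)
		arc4_crypt_32(&c32, out, buf, BENCH_LEN);
	printf("u32: %f s\n", now_s() - t);
}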

Signed-off-by: Christophe JAILLET <christophe.jaillet@xxxxxxxxxx>
---
The idea came from code found in staging/rtl8712/. See the top of:
https://elixir.bootlin.com/linux/v5.12/source/drivers/staging/rtl8712/rtl871x_security.c

More precisely, in an attempt to clean up staging/rtl8712/, I triggered a
kernel test robot report about increased stack usage:
https://lore.kernel.org/kernel-janitors/YHQUH+Nqc%2FzS14Tb@xxxxxxxxx/T/#m832a01a9d1517e7efc4f671ed46deae9993d6ae9

The above patch works for me, but should be taken as an RFC.
---
 include/crypto/arc4.h | 8 +++++++-
 lib/crypto/arc4.c     | 8 ++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/crypto/arc4.h b/include/crypto/arc4.h
index f3c22fe01704..39545ed486e2 100644
--- a/include/crypto/arc4.h
+++ b/include/crypto/arc4.h
@@ -12,8 +12,14 @@
 #define ARC4_MAX_KEY_SIZE	256
 #define ARC4_BLOCK_SIZE	1

+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define S_type u8
+#else
+#define S_type u32
+#endif
+
 struct arc4_ctx {
-	u32 S[256];
+	S_type S[256];
 	u32 x, y;
 };

diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c
index c2020f19c652..e0be0c2a08d9 100644
--- a/lib/crypto/arc4.c
+++ b/lib/crypto/arc4.c
@@ -21,7 +21,7 @@ int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
 		ctx->S[i] = i;

 	for (i = 0; i < 256; i++) {
-		u32 a = ctx->S[i];
+		S_type a = ctx->S[i];

 		j = (j + in_key[k] + a) & 0xff;
 		ctx->S[i] = ctx->S[j];
@@ -36,9 +36,9 @@ EXPORT_SYMBOL(arc4_setkey);

 void arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
 {
-	u32 *const S = ctx->S;
-	u32 x, y, a, b;
-	u32 ty, ta, tb;
+	S_type *const S = ctx->S;
+	S_type a, b, ta, tb;
+	u32 x, y, ty;

 	if (len == 0)
 		return;
--
2.30.2