Re: [PATCH v14 7/7] x86: vdso: Wire up getrandom() vDSO implementation

From: Christophe Leroy
Date: Thu Jan 12 2023 - 13:04:27 EST




Le 01/01/2023 à 17:29, Jason A. Donenfeld a écrit :
> Hook up the generic vDSO implementation to the x86 vDSO data page. Since
> the existing vDSO infrastructure is heavily based on the timekeeping
> functionality, which works over arrays of bases, a new macro is
> introduced for vvars that are not arrays.
>
> The vDSO function requires a ChaCha20 implementation that does not write
> to the stack, yet can still do an entire ChaCha20 permutation, so
> provide this using SSE2, since this is userland code that must work on
> all x86-64 processors. There's a simple test for this code as well.

As far as I understand the test is not dependent on the architecture,
can it be a separate patch ?

Also, as the chacha implementation is standalone and can be tested by
the above mentionned simple test, can it be a separate patch as well ?

Then the last patch only has the glue to wire-up getrandom VDSO to the
architecture, and can be used as a reference for other architectures.

>
> Reviewed-by: Samuel Neves <sneves@xxxxxxxxx> # for vgetrandom-chacha.S
> Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx>
> ---
> arch/x86/Kconfig | 1 +
> arch/x86/entry/vdso/Makefile | 3 +-
> arch/x86/entry/vdso/vdso.lds.S | 2 +
> arch/x86/entry/vdso/vgetrandom-chacha.S | 178 ++++++++++++++++++
> arch/x86/entry/vdso/vgetrandom.c | 17 ++
> arch/x86/include/asm/vdso/getrandom.h | 55 ++++++
> arch/x86/include/asm/vdso/vsyscall.h | 2 +
> arch/x86/include/asm/vvar.h | 16 ++
> tools/testing/selftests/vDSO/.gitignore | 1 +
> tools/testing/selftests/vDSO/Makefile | 9 +
> .../testing/selftests/vDSO/vdso_test_chacha.c | 43 +++++
> 11 files changed, 326 insertions(+), 1 deletion(-)
> create mode 100644 arch/x86/entry/vdso/vgetrandom-chacha.S
> create mode 100644 arch/x86/entry/vdso/vgetrandom.c
> create mode 100644 arch/x86/include/asm/vdso/getrandom.h
> create mode 100644 tools/testing/selftests/vDSO/vdso_test_chacha.c
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 3604074a878b..ed689d831362 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -272,6 +272,7 @@ config X86
> select HAVE_UNSTABLE_SCHED_CLOCK
> select HAVE_USER_RETURN_NOTIFIER
> select HAVE_GENERIC_VDSO
> + select VDSO_GETRANDOM if X86_64
> select HOTPLUG_SMT if SMP
> select IRQ_FORCED_THREADING
> select NEED_PER_CPU_EMBED_FIRST_CHUNK
> diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
> index 838613ac15b8..3979bb4a61ae 100644
> --- a/arch/x86/entry/vdso/Makefile
> +++ b/arch/x86/entry/vdso/Makefile
> @@ -27,7 +27,7 @@ VDSO32-$(CONFIG_X86_32) := y
> VDSO32-$(CONFIG_IA32_EMULATION) := y
>
> # files to link into the vdso
> -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
> +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o
> vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
> vobjs32-y += vdso32/vclock_gettime.o
> vobjs-$(CONFIG_X86_SGX) += vsgx.o
> @@ -105,6 +105,7 @@ CFLAGS_REMOVE_vclock_gettime.o = -pg
> CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
> CFLAGS_REMOVE_vgetcpu.o = -pg
> CFLAGS_REMOVE_vsgx.o = -pg
> +CFLAGS_REMOVE_vgetrandom.o = -pg
>
> #
> # X32 processes use x32 vDSO to access 64bit kernel data.
> diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
> index e8c60ae7a7c8..0bab5f4af6d1 100644
> --- a/arch/x86/entry/vdso/vdso.lds.S
> +++ b/arch/x86/entry/vdso/vdso.lds.S
> @@ -30,6 +30,8 @@ VERSION {
> #ifdef CONFIG_X86_SGX
> __vdso_sgx_enter_enclave;
> #endif
> + getrandom;
> + __vdso_getrandom;
> local: *;
> };
> }
> diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..d79e2bd97598
> --- /dev/null
> +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,178 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/frame.h>
> +
> +.section .rodata, "a"
> +.align 16
> +CONSTANTS: .octa 0x6b20657479622d323320646e61707865
> +.text
> +
> +/*
> + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
> + * of blocks of output with a nonce of 0, taking an input key and 8-byte
> + * counter. Importantly does not spill to the stack. Its arguments are:
> + *
> + * rdi: output bytes
> + * rsi: 32-byte key input
> + * rdx: 8-byte counter input/output

Why a 8-byte counter ? According to RFC 7539, chacha20 takes:

The inputs to ChaCha20 are:

o A 256-bit key, treated as a concatenation of eight 32-bit little-
endian integers.

o A 96-bit nonce, treated as a concatenation of three 32-bit little-
endian integers.

o A 32-bit block count parameter, treated as a 32-bit little-endian
integer.


Are you mixing up the upper part of the counter with the nonce ? In that
case you can't say you use a 0 nonce, can you ?


> + * rcx: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +
> +.set output, %rdi
> +.set key, %rsi
> +.set counter, %rdx
> +.set nblocks, %rcx
> +.set i, %al
> +/* xmm registers are *not* callee-save. */
> +.set state0, %xmm0
> +.set state1, %xmm1
> +.set state2, %xmm2
> +.set state3, %xmm3
> +.set copy0, %xmm4
> +.set copy1, %xmm5
> +.set copy2, %xmm6
> +.set copy3, %xmm7
> +.set temp, %xmm8
> +.set one, %xmm9
> +
> + /* copy0 = "expand 32-byte k" */
> + movaps CONSTANTS(%rip),copy0
> + /* copy1,copy2 = key */
> + movups 0x00(key),copy1
> + movups 0x10(key),copy2
> + /* copy3 = counter || zero nonce */
> + movq 0x00(counter),copy3
> + /* one = 1 || 0 */
> + movq $1,%rax
> + movq %rax,one
> +
> +.Lblock:
> + /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
> + movdqa copy0,state0
> + movdqa copy1,state1
> + movdqa copy2,state2
> + movdqa copy3,state3
> +
> + movb $10,i
> +.Lpermute:
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> + paddd state1,state0
> + pxor state0,state3
> + movdqa state3,temp
> + pslld $16,temp
> + psrld $16,state3
> + por temp,state3

There is a lot of similarities between all the blocks, can you use GAS
macros to avoid repetitions ?


> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> + paddd state3,state2
> + pxor state2,state1
> + movdqa state1,temp
> + pslld $12,temp
> + psrld $20,state1
> + por temp,state1
> +
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> + paddd state1,state0
> + pxor state0,state3
> + movdqa state3,temp
> + pslld $8,temp
> + psrld $24,state3
> + por temp,state3
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> + paddd state3,state2
> + pxor state2,state1
> + movdqa state1,temp
> + pslld $7,temp
> + psrld $25,state1
> + por temp,state1
> +
> + /* state1[0,1,2,3] = state1[1,2,3,0] */
> + pshufd $0x39,state1,state1
> + /* state2[0,1,2,3] = state2[2,3,0,1] */
> + pshufd $0x4e,state2,state2
> + /* state3[0,1,2,3] = state3[3,0,1,2] */
> + pshufd $0x93,state3,state3
> +
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> + paddd state1,state0
> + pxor state0,state3
> + movdqa state3,temp
> + pslld $16,temp
> + psrld $16,state3
> + por temp,state3
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> + paddd state3,state2
> + pxor state2,state1
> + movdqa state1,temp
> + pslld $12,temp
> + psrld $20,state1
> + por temp,state1
> +
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> + paddd state1,state0
> + pxor state0,state3
> + movdqa state3,temp
> + pslld $8,temp
> + psrld $24,state3
> + por temp,state3
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> + paddd state3,state2
> + pxor state2,state1
> + movdqa state1,temp
> + pslld $7,temp
> + psrld $25,state1
> + por temp,state1
> +
> + /* state1[0,1,2,3] = state1[3,0,1,2] */
> + pshufd $0x93,state1,state1
> + /* state2[0,1,2,3] = state2[2,3,0,1] */
> + pshufd $0x4e,state2,state2
> + /* state3[0,1,2,3] = state3[1,2,3,0] */
> + pshufd $0x39,state3,state3
> +
> + decb i
> + jnz .Lpermute
> +
> + /* output0 = state0 + copy0 */
> + paddd copy0,state0
> + movups state0,0x00(output)
> + /* output1 = state1 + copy1 */
> + paddd copy1,state1
> + movups state1,0x10(output)
> + /* output2 = state2 + copy2 */
> + paddd copy2,state2
> + movups state2,0x20(output)
> + /* output3 = state3 + copy3 */
> + paddd copy3,state3
> + movups state3,0x30(output)
> +
> + /* ++copy3.counter */
> + paddq one,copy3
> +
> + /* output += 64, --nblocks */
> + addq $64,output
> + decq nblocks
> + jnz .Lblock
> +
> + /* counter = copy3.counter */
> + movq copy3,0x00(counter)
> +
> + /* Zero out the potentially sensitive regs, in case nothing uses these again. */
> + pxor state0,state0
> + pxor state1,state1
> + pxor state2,state2
> + pxor state3,state3
> + pxor copy1,copy1
> + pxor copy2,copy2
> + pxor temp,temp
> +
> + ret
> +SYM_FUNC_END(__arch_chacha20_blocks_nostack)
> diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c
> new file mode 100644
> index 000000000000..6045ded5da90
> --- /dev/null
> +++ b/arch/x86/entry/vdso/vgetrandom.c
> @@ -0,0 +1,17 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + */
> +#include <linux/types.h>
> +
> +#include "../../../../lib/vdso/getrandom.c"
> +
> +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state);
> +
> +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state)
> +{
> + return __cvdso_getrandom(buffer, len, flags, state);
> +}
> +
> +ssize_t getrandom(void *, size_t, unsigned int, void *)
> + __attribute__((weak, alias("__vdso_getrandom")));
> diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h
> new file mode 100644
> index 000000000000..46f99d735ae6
> --- /dev/null
> +++ b/arch/x86/include/asm/vdso/getrandom.h
> @@ -0,0 +1,55 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + */
> +#ifndef __ASM_VDSO_GETRANDOM_H
> +#define __ASM_VDSO_GETRANDOM_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <asm/unistd.h>
> +#include <asm/vvar.h>
> +
> +/**
> + * getrandom_syscall - Invoke the getrandom() syscall.
> + * @buffer: Destination buffer to fill with random bytes.
> + * @len: Size of @buffer in bytes.
> + * @flags: Zero or more GRND_* flags.
> + * Returns the number of random bytes written to @buffer, or a negative value indicating an error.
> + */
> +static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags)
> +{
> + long ret;
> +
> + asm ("syscall" : "=a" (ret) :
> + "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) :
> + "rcx", "r11", "memory");
> +
> + return ret;
> +}
> +
> +#define __vdso_rng_data (VVAR(_vdso_rng_data))
> +
> +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
> +{
> + if (__vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
> + return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data);
> + return &__vdso_rng_data;
> +}
> +
> +/**
> + * __arch_chacha20_blocks_nostack - Generate ChaCha20 stream without using the stack.
> + * @dst_bytes: Destination buffer to hold @nblocks * 64 bytes of output.
> + * @key: 32-byte input key.
> + * @counter: 8-byte counter, read on input and updated on return.
> + * @nblocks: Number of blocks to generate.
> + *
> + * Generates a given positive number of blocks of ChaCha20 output with nonce=0, and does not write
> + * to any stack or memory outside of the parameters passed to it, in order to mitigate stack data
> + * leaking into forked child processes.
> + */
> +extern void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks);
> +
> +#endif /* !__ASSEMBLY__ */
> +
> +#endif /* __ASM_VDSO_GETRANDOM_H */
> diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h
> index be199a9b2676..71c56586a22f 100644
> --- a/arch/x86/include/asm/vdso/vsyscall.h
> +++ b/arch/x86/include/asm/vdso/vsyscall.h
> @@ -11,6 +11,8 @@
> #include <asm/vvar.h>
>
> DEFINE_VVAR(struct vdso_data, _vdso_data);
> +DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data);
> +
> /*
> * Update the vDSO data page to keep in sync with kernel timekeeping.
> */
> diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
> index 183e98e49ab9..9d9af37f7cab 100644
> --- a/arch/x86/include/asm/vvar.h
> +++ b/arch/x86/include/asm/vvar.h
> @@ -26,6 +26,8 @@
> */
> #define DECLARE_VVAR(offset, type, name) \
> EMIT_VVAR(name, offset)
> +#define DECLARE_VVAR_SINGLE(offset, type, name) \
> + EMIT_VVAR(name, offset)
>
> #else
>
> @@ -37,6 +39,10 @@ extern char __vvar_page;
> extern type timens_ ## name[CS_BASES] \
> __attribute__((visibility("hidden"))); \
>
> +#define DECLARE_VVAR_SINGLE(offset, type, name) \
> + extern type vvar_ ## name \
> + __attribute__((visibility("hidden"))); \
> +
> #define VVAR(name) (vvar_ ## name)
> #define TIMENS(name) (timens_ ## name)
>
> @@ -44,12 +50,22 @@ extern char __vvar_page;
> type name[CS_BASES] \
> __attribute__((section(".vvar_" #name), aligned(16))) __visible
>
> +#define DEFINE_VVAR_SINGLE(type, name) \
> + type name \
> + __attribute__((section(".vvar_" #name), aligned(16))) __visible
> +
> #endif
>
> /* DECLARE_VVAR(offset, type, name) */
>
> DECLARE_VVAR(128, struct vdso_data, _vdso_data)
>
> +#if !defined(_SINGLE_DATA)
> +#define _SINGLE_DATA
> +DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data)
> +#endif
> +
> #undef DECLARE_VVAR
> +#undef DECLARE_VVAR_SINGLE
>
> #endif
> diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore
> index 7dbfdec53f3d..30d5c8f0e5c7 100644
> --- a/tools/testing/selftests/vDSO/.gitignore
> +++ b/tools/testing/selftests/vDSO/.gitignore
> @@ -7,3 +7,4 @@ vdso_test_gettimeofday
> vdso_test_getcpu
> vdso_standalone_test_x86
> vdso_test_getrandom
> +vdso_test_chacha
> diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile
> index a33b4d200a32..54a015afe60c 100644
> --- a/tools/testing/selftests/vDSO/Makefile
> +++ b/tools/testing/selftests/vDSO/Makefile
> @@ -3,6 +3,7 @@ include ../lib.mk
>
> uname_M := $(shell uname -m 2>/dev/null || echo not)
> ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
> +SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null)
>
> TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu
> TEST_GEN_PROGS += $(OUTPUT)/vdso_test_abi
> @@ -12,9 +13,15 @@ TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86
> endif
> TEST_GEN_PROGS += $(OUTPUT)/vdso_test_correctness
> TEST_GEN_PROGS += $(OUTPUT)/vdso_test_getrandom
> +ifeq ($(uname_M),x86_64)
> +ifneq ($(SODIUM),)
> +TEST_GEN_PROGS += $(OUTPUT)/vdso_test_chacha
> +endif
> +endif
>
> CFLAGS := -std=gnu99
> CFLAGS_vdso_standalone_test_x86 := -nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector
> +CFLAGS_vdso_test_chacha := $(SODIUM) -idirafter $(top_srcdir)/include -idirafter $(top_srcdir)/arch/$(ARCH)/include -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 -Wa,--noexecstack
> LDFLAGS_vdso_test_correctness := -ldl
> ifeq ($(CONFIG_X86_32),y)
> LDLIBS += -lgcc_s
> @@ -35,3 +42,5 @@ $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c
> -o $@ \
> $(LDFLAGS_vdso_test_correctness)
> $(OUTPUT)/vdso_test_getrandom: parse_vdso.c
> +$(OUTPUT)/vdso_test_chacha: CFLAGS += $(CFLAGS_vdso_test_chacha)
> +$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S
> diff --git a/tools/testing/selftests/vDSO/vdso_test_chacha.c b/tools/testing/selftests/vDSO/vdso_test_chacha.c
> new file mode 100644
> index 000000000000..bce7a7752b11
> --- /dev/null
> +++ b/tools/testing/selftests/vDSO/vdso_test_chacha.c
> @@ -0,0 +1,43 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> + */
> +
> +#include <sodium/crypto_stream_chacha20.h>

Is that standard ? Every distribution has sodium ?

> +#include <sys/random.h>
> +#include <string.h>
> +#include <stdint.h>
> +#include "../kselftest.h"
> +
> +extern void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key, uint32_t *counter, size_t nblocks);
> +
> +int main(int argc, char *argv[])
> +{
> + enum { TRIALS = 1000, BLOCKS = 128, BLOCK_SIZE = 64 };
> + static const uint8_t nonce[8] = { 0 };
> + uint32_t counter[2];
> + uint8_t key[32];
> + uint8_t output1[BLOCK_SIZE * BLOCKS], output2[BLOCK_SIZE * BLOCKS];
> +
> + ksft_print_header();
> + ksft_set_plan(1);
> +
> + for (unsigned int trial = 0; trial < TRIALS; ++trial) {
> + if (getrandom(key, sizeof(key), 0) != sizeof(key)) {
> + printf("getrandom() failed!\n");
> + return KSFT_SKIP;
> + }
> + crypto_stream_chacha20(output1, sizeof(output1), nonce, key);
> + for (unsigned int split = 0; split < BLOCKS; ++split) {
> + memset(output2, 'X', sizeof(output2));
> + memset(counter, 0, sizeof(counter));
> + if (split)
> + __arch_chacha20_blocks_nostack(output2, key, counter, split);
> + __arch_chacha20_blocks_nostack(output2 + split * BLOCK_SIZE, key, counter, BLOCKS - split);
> + if (memcmp(output1, output2, sizeof(output1)))
> + return KSFT_FAIL;
> + }
> + }
> + ksft_test_result_pass("chacha: PASS\n");
> + return KSFT_PASS;
> +}

Christophe