Re: [5.2 REGRESSION] Generic vDSO breaks seccomp-enabled userspace on i386

From: Kees Cook
Date: Mon Jul 22 2019 - 19:28:30 EST


On Mon, Jul 22, 2019 at 12:17:16PM -0700, Andy Lutomirski wrote:
> On Mon, Jul 22, 2019 at 11:39 AM Kees Cook <keescook@xxxxxxxxxxxx> wrote:
> >
> > On Mon, Jul 22, 2019 at 08:31:32PM +0200, Thomas Gleixner wrote:
> > > On Mon, 22 Jul 2019, Kees Cook wrote:
> > > > Just so I'm understanding: the vDSO change introduced code to make an
> > > > actual syscall on i386, which for most seccomp filters would be rejected?
> > >
> > > No. The old x86 specific VDSO implementation had a fallback syscall as
> > > well, i.e. clock_gettime(). On 32bit clock_gettime() uses the y2038
> > > endangered timespec.
> > >
> > > So when the VDSO was made generic we changed the internal data structures
> > > to be 2038 safe right away. As a consequence the fallback syscall is not
> > > clock_gettime(), it's clock_gettime64(), which seems to surprise seccomp.
> >
> > Okay, it didn't add a syscall, it just changed it. Results are the
> > same: conservative filters suddenly start breaking due to the different
> > call. (And now I see why Andy's alias suggestion would help...)
> >
> > I'm not sure which direction to go with this. It seems like an alias
> > list is a large hammer for this case, and a "seccomp-bypass when calling
> > from vDSO" solution seems too fragile?
> >
>
> I don't like the seccomp bypass at all. If someone uses seccomp to
> disallow all clock_gettime() variants, there shouldn't be a back door
> to learn the time.
>
> Here's the restart_syscall() logic that makes me want aliases: we have
> different syscall numbers for restart_syscall() on 32-bit and 64-bit.
> The logic to decide which one to use is dubious at best. I'd like to
> introduce a restart_syscall2() that is identical to restart_syscall()
> except that it has the same number on both variants.
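
For anyone following along at home: the arch special-casing a filter
has to do today just to keep syscall restart working looks roughly
like the untested sketch below (x32 ignored for brevity; the numbers
come from the x86 syscall tables, where restart_syscall is nr 0 on
i386 but nr 219 on x86_64):

#include <stddef.h>
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

/* Everything that isn't restart_syscall falls through to KILL here
 * purely to keep the example short; a real filter would keep going. */
static struct sock_filter restart_today[] = {
	BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, arch)),
	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, AUDIT_ARCH_X86_64, 3, 0),
	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, AUDIT_ARCH_I386, 0, 5),
	BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0 /* i386 restart_syscall */, 2, 3),
	BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 219 /* x86_64 restart_syscall */, 0, 1),
	BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
};

A single restart_syscall2() number shared by both ABIs would collapse
that down to one JEQ on nr.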

I've built a straw-man for this idea... but I have to say I don't
like it. It can lead to really unexpected behavior if someone has
differing filter actions for the two syscalls. For example, let's say
someone was doing a paranoid audit of 2038-unsafe clock usage and
marked clock_gettime() with RET_KILL but clock_gettime64() with
RET_LOG. The aliasing would make clock_gettime64() trigger RET_KILL
instead...
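
Concretely, imagine a hypothetical filter like this (untested sketch,
same headers as above plus <sys/syscall.h> for the __NR_* numbers;
__NR_clock_gettime64 needs 5.1+ uapi headers):

/* Explicit -- and different -- treatment for the two syscalls:
 * kill the old y2038-unsafe call, merely log the new one. */
static struct sock_filter audit_2038[] = {
	BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_gettime, 2, 0),
	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_gettime64, 2, 0),
	BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),	/* clock_gettime */
	BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),	/* clock_gettime64 */
};

With the patch below, the RET_LOG result isn't RET_ALLOW, so we re-run
the filter with the clock_gettime() alias and take its RET_KILL --
the opposite of what the filter author wrote down for the new syscall.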

There's no sense of a "default" action in BPF, so we can't check for
"and they weren't expecting it", and we can't scan the filter for
exact matches since it might be using a balanced tree to bucket the
allowed syscalls.
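
For example, a bucketed filter can end up rejecting clock_gettime64()
without its number ever appearing in the program, so there's nothing
to pattern-match on (untested sketch, same headers as the earlier
sketches):

/* Reject anything newer than what was audited, then bisect the rest
 * instead of using a flat list of JEQs. */
static struct sock_filter bucketed[] = {
	BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
	/* clock_gettime64 (and any other new syscall) dies right here. */
	BPF_JUMP(BPF_JMP|BPF_JGE|BPF_K, __NR_clock_gettime + 1, 6, 0),
	/* Pivot: low bucket vs high bucket. */
	BPF_JUMP(BPF_JMP|BPF_JGE|BPF_K, __NR_rt_sigreturn, 2, 0),
	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 3, 0),
	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 2, 3),
	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 1, 0),
	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_gettime, 0, 1),
	BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
};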

Anyway, here it is in case it sparks some more thoughts...


diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
index 9242b28418d5..1e82bd43286c 100644
--- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -13,6 +13,7 @@
*/
#undef CONFIG_64BIT
#undef CONFIG_X86_64
+#undef CONFIG_COMPAT
#undef CONFIG_PGTABLE_LEVELS
#undef CONFIG_ILLEGAL_POINTER_VALUE
#undef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 2bd1338de236..5d0da5ee1b61 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -6,6 +6,12 @@

#ifdef CONFIG_X86_32
#define __NR_seccomp_sigreturn __NR_sigreturn
+#define seccomp_syscall_alias(arch, syscall) \
+ ({ \
+ (syscall) == __NR_clock_gettime64 ? \
+ __NR_clock_gettime : \
+ -1; \
+ })
#endif

#ifdef CONFIG_COMPAT
@@ -14,6 +20,13 @@
#define __NR_seccomp_write_32 __NR_ia32_write
#define __NR_seccomp_exit_32 __NR_ia32_exit
#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
+#define seccomp_syscall_alias(arch, syscall) \
+ ({ \
+ (arch) != AUDIT_ARCH_I386 ? -1 : \
+ (syscall) == __NR_ia32_clock_gettime64 ? \
+ __NR_ia32_clock_gettime : \
+ -1; \
+ })
#endif

#include <asm-generic/seccomp.h>
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 84868d37b35d..06f5ca81d756 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -83,6 +83,14 @@ static inline int seccomp_mode(struct seccomp *s)
#ifdef CONFIG_SECCOMP_FILTER
extern void put_seccomp_filter(struct task_struct *tsk);
extern void get_seccomp_filter(struct task_struct *tsk);
+/*
+ * Allow architectures to provide syscall aliases for special cases
+ * when attempting to make invisible transitions to new syscalls that
+ * have no arguments (e.g. clock_gettime64, restart_syscall).
+ */
+# ifndef seccomp_syscall_alias
+# define seccomp_syscall_alias(arch, syscall) (-1)
+# endif
#else /* CONFIG_SECCOMP_FILTER */
static inline void put_seccomp_filter(struct task_struct *tsk)
{
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dba52a7db5e8..5153c6155d9a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -807,6 +807,26 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
data = filter_ret & SECCOMP_RET_DATA;
action = filter_ret & SECCOMP_RET_ACTION_FULL;

+ /* Handle syscall aliases when result is not SECCOMP_RET_ALLOW. */
+ if (unlikely(action != SECCOMP_RET_ALLOW)) {
+ int alias;
+
+ alias = seccomp_syscall_alias(sd->arch, sd->nr);
+ if (unlikely(alias != -1)) {
+ /* Use sd_local for an aliased syscall. */
+ if (sd != &sd_local) {
+ sd_local = *sd;
+ sd = &sd_local;
+ }
+ sd_local.nr = alias;
+
+ /* Run again, with the alias, accepting the results. */
+ filter_ret = seccomp_run_filters(sd, &match);
+ data = filter_ret & SECCOMP_RET_DATA;
+ action = filter_ret & SECCOMP_RET_ACTION_FULL;
+ }
+ }
+
switch (action) {
case SECCOMP_RET_ERRNO:
/* Set low-order bits as an errno, capped at MAX_ERRNO. */
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 6ef7f16c4cf5..778619d145ea 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -3480,6 +3519,55 @@ TEST(seccomp_get_notif_sizes)
EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
}

+#if defined(__i386__)
+TEST(seccomp_syscall_alias)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+#ifdef __NR_sigreturn
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 6, 0),
+#endif
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 5, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 4, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 3, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 2, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 1, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_gettime, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ unsigned char buffer[128];
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel refused to install seccomp filter!");
+ }
+
+ /* We allow clock_gettime explicitly. */
+ EXPECT_EQ(0, syscall(__NR_clock_gettime, CLOCK_REALTIME, &buffer)) {
+ TH_LOG("Failed __NR_clock_gettime!?");
+ }
+
+ /* With aliases, clock_gettime64 should be allowed too. */
+ EXPECT_EQ(0, syscall(__NR_clock_gettime64, CLOCK_REALTIME, &buffer)) {
+ TH_LOG("Failed __NR_clock_gettime64!");
+ }
+
+ syscall(__NR_exit, 0);
+}
+#endif
+
/*
* TODO:
* - add microbenchmarks

--
Kees Cook