[PATCH v2 seccomp 3/6] seccomp/cache: Add "emulator" to check if filter is arg-dependent

From: YiFei Zhu
Date: Thu Sep 24 2020 - 08:49:08 EST


From: YiFei Zhu <yifeifz2@xxxxxxxxxxxx>

SECCOMP_CACHE_NR_ONLY will only operate on syscalls that do not
access any syscall arguments or instruction pointer. To facilitate
this we need a static analyser to know whether a filter will
return allow regardless of syscall arguments for a given
architecture number / syscall number pair. This is implemented
here with a pseudo-emulator, and stored in a per-filter bitmap.

Each common BPF instruction (stolen from Kees's list [1]) is
emulated. Any unrecognized instruction, or any load other than the
syscall number or architecture, will cause the emulator to bail.

The emulation is also halted if it reaches a return. In that case,
if it returns exactly SECCOMP_RET_ALLOW, the syscall is marked as good.

Filter dependency is resolved at attach time. If a filter is
attached on top of an earlier filter, we bitwise-AND its bitmap
with that of the earlier filter (its dependee); if the dependee
does not guarantee to allow the syscall, then the depender is also
marked as not guaranteeing to allow the syscall.

[1] https://lore.kernel.org/lkml/20200923232923.3142503-5-keescook@xxxxxxxxxxxx/

Signed-off-by: YiFei Zhu <yifeifz2@xxxxxxxxxxxx>
---
arch/Kconfig | 25 ++++++
kernel/seccomp.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 218 insertions(+), 1 deletion(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 6dfc5673215d..8cc3dc87f253 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -489,6 +489,31 @@ config SECCOMP_FILTER

See Documentation/userspace-api/seccomp_filter.rst for details.

+choice
+ prompt "Seccomp filter cache"
+ default SECCOMP_CACHE_NONE
+ depends on SECCOMP_FILTER
+ help
+ Seccomp filters can potentially incur large overhead for each
+ system call. This can alleviate some of the overhead.
+
+ If in doubt, select 'Syscall number only'.
+
+config SECCOMP_CACHE_NONE
+ bool "None"
+ help
+ No caching is done. Seccomp filters will be called each time
+ a system call occurs in a seccomp-guarded task.
+
+config SECCOMP_CACHE_NR_ONLY
+ bool "Syscall number only"
+ depends on !HAVE_SPARSE_SYSCALL_NR
+ help
+ For each syscall number, if the seccomp filter has a fixed
+ result, store that result in a bitmap to speed up system calls.
+
+endchoice
+
config HAVE_ARCH_STACKLEAK
bool
help
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 3ee59ce0a323..20d33378a092 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,6 +143,32 @@ struct notification {
struct list_head notifications;
};

+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * struct seccomp_cache_filter_data - container for cache's per-filter data
+ *
+ * @syscall_ok: One bitmap of NR_syscalls bits per entry of syscall_arches.
+ * A set bit means the filter was found to always allow that
+ * syscall number on that architecture.
+ */
+struct seccomp_cache_filter_data {
+ DECLARE_BITMAP(syscall_ok[ARRAY_SIZE(syscall_arches)], NR_syscalls);
+};
+
+/* NOTE(review): not referenced by any code visible in this patch. */
+#define SECCOMP_EMU_MAX_PENDING_STATES 64
+#else
+/* Empty placeholder used when CONFIG_SECCOMP_CACHE_NR_ONLY is not set. */
+struct seccomp_cache_filter_data { };
+
+/* Stub for !CONFIG_SECCOMP_CACHE_NR_ONLY: does nothing and returns 0. */
+static inline int seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+ return 0;
+}
+
+/* Stub for !CONFIG_SECCOMP_CACHE_NR_ONLY: does nothing. */
+static inline void seccomp_cache_inherit(struct seccomp_filter *sfilter,
+ const struct seccomp_filter *prev)
+{
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
/**
* struct seccomp_filter - container for seccomp BPF programs
*
@@ -185,6 +211,7 @@ struct seccomp_filter {
struct notification *notif;
struct mutex notify_lock;
wait_queue_head_t wqh;
+ struct seccomp_cache_filter_data cache;
};

/* Limit any path through the tree to 256KB worth of instructions. */
@@ -530,6 +557,139 @@ static inline void seccomp_sync_threads(unsigned long flags)
}
}

+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * struct seccomp_emu_env - container for seccomp emulator environment
+ *
+ * @filter: The cBPF filter instructions.
+ * @arch: The architecture number we are emulating.
+ * @nr: The syscall number we are emulating.
+ * @syscall_ok: Emulation result; set when emulation ends on a
+ * BPF_RET | BPF_K whose value equals SECCOMP_RET_ALLOW.
+ */
+struct seccomp_emu_env {
+ struct sock_filter *filter;
+ int arch;
+ int nr;
+ bool syscall_ok;
+};
+
+/**
+ * struct seccomp_emu_state - container for seccomp emulator state
+ *
+ * @next: The next pending state. This structure is a linked list.
+ * NOTE(review): only zero-initialized by the code visible here.
+ * @pc: The current program counter (index of the next instruction).
+ * @areg: The current value of the emulated A register.
+ */
+struct seccomp_emu_state {
+ struct seccomp_emu_state *next;
+ int pc;
+ u32 areg;
+};
+
+/**
+ * seccomp_emu_step - emulate a single cBPF instruction
+ * @env: The emulator environment (filter, arch, nr, and the result flag)
+ * @state: The emulator state (program counter and A register)
+ *
+ * Fetches the instruction at @state->pc, advances the program counter and
+ * applies the instruction's effect. Loads are only understood for the
+ * syscall number and architecture fields of struct seccomp_data; any other
+ * load, and any opcode not listed below, halts emulation without marking
+ * the syscall as ok.
+ *
+ * Returns 1 to halt emulation, 0 to continue. None of the opcodes handled
+ * here produce a negative error code.
+ */
+static int seccomp_emu_step(struct seccomp_emu_env *env,
+			    struct seccomp_emu_state *state)
+{
+	const struct sock_filter *insn = &env->filter[state->pc];
+	u32 imm = insn->k;
+	bool taken;
+
+	/* Jump offsets below are applied relative to the next instruction. */
+	state->pc++;
+
+	switch (insn->code) {
+	case BPF_LD | BPF_W | BPF_ABS:
+		if (imm == offsetof(struct seccomp_data, nr)) {
+			state->areg = env->nr;
+			return 0;
+		}
+		if (imm == offsetof(struct seccomp_data, arch)) {
+			state->areg = env->arch;
+			return 0;
+		}
+		/* Any other field makes the result depend on more than nr/arch. */
+		return 1;
+	case BPF_JMP | BPF_JA:
+		state->pc += imm;
+		return 0;
+	case BPF_JMP | BPF_JEQ | BPF_K:
+		taken = state->areg == imm;
+		break;
+	case BPF_JMP | BPF_JGT | BPF_K:
+		taken = state->areg > imm;
+		break;
+	case BPF_JMP | BPF_JGE | BPF_K:
+		taken = state->areg >= imm;
+		break;
+	case BPF_JMP | BPF_JSET | BPF_K:
+		taken = state->areg & imm;
+		break;
+	case BPF_ALU | BPF_AND | BPF_K:
+		state->areg &= imm;
+		return 0;
+	case BPF_RET | BPF_K:
+		env->syscall_ok = imm == SECCOMP_RET_ALLOW;
+		return 1;
+	default:
+		return 1;
+	}
+
+	/* Only the four conditional jumps reach this point. */
+	state->pc += taken ? insn->jt : insn->jf;
+	return 0;
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cachable syscalls
+ * @sfilter: The seccomp filter
+ *
+ * For every (architecture, syscall number) pair, runs the emulator over the
+ * filter's original cBPF program from a zeroed state and sets the matching
+ * bit in @sfilter->cache.syscall_ok when the emulation reports the syscall
+ * as ok.
+ *
+ * Returns 0 if successful or -errno if error occurred.
+ */
+static int seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+	struct sock_filter *filter = fprog->filter;
+	int arch, nr, res;
+
+	for (arch = 0; arch < ARRAY_SIZE(syscall_arches); arch++) {
+		for (nr = 0; nr < NR_syscalls; nr++) {
+			struct seccomp_emu_env env = {0};
+			struct seccomp_emu_state state = {0};
+
+			env.filter = filter;
+			env.arch = syscall_arches[arch];
+			env.nr = nr;
+
+			/* Step until the emulator halts (1) or fails (< 0). */
+			do {
+				res = seccomp_emu_step(&env, &state);
+			} while (!res);
+
+			if (res < 0)
+				return res;
+
+			if (env.syscall_ok)
+				set_bit(nr, sfilter->cache.syscall_ok[arch]);
+		}
+	}
+
+	/*
+	 * The emulator's positive "halt" code is not an error and must not
+	 * leak to the caller as the return value.
+	 */
+	return 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
/**
* seccomp_prepare_filter: Prepares a seccomp filter for use.
* @fprog: BPF program to install
@@ -540,7 +700,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ IS_ENABLED(CONFIG_SECCOMP_CACHE_NR_ONLY);

if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -571,6 +732,13 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(ret);
}

+ ret = seccomp_cache_prepare(sfilter);
+ if (ret < 0) {
+ bpf_prog_destroy(sfilter->prog);
+ kfree(sfilter);
+ return ERR_PTR(ret);
+ }
+
refcount_set(&sfilter->refs, 1);
refcount_set(&sfilter->users, 1);
init_waitqueue_head(&sfilter->wqh);
@@ -606,6 +774,29 @@ seccomp_prepare_user_filter(const char __user *user_filter)
return filter;
}

+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * seccomp_cache_inherit - mask accept bitmap against previous filter
+ * @sfilter: The seccomp filter being attached
+ * @prev: The previous seccomp filter, or NULL if there is none
+ *
+ * For each architecture, ANDs @sfilter's syscall_ok bitmap with @prev's so
+ * that a bit stays set only when both bitmaps have it set.
+ */
+static void seccomp_cache_inherit(struct seccomp_filter *sfilter,
+				  const struct seccomp_filter *prev)
+{
+	int i;
+
+	/* No previous filter: nothing to mask against. */
+	if (!prev)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(syscall_arches); i++) {
+		unsigned long *ok = sfilter->cache.syscall_ok[i];
+		const unsigned long *prev_ok = prev->cache.syscall_ok[i];
+
+		bitmap_and(ok, ok, prev_ok, NR_syscalls);
+	}
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
@@ -655,6 +846,7 @@ static long seccomp_attach_filter(unsigned int flags,
* task reference.
*/
filter->prev = current->seccomp.filter;
+ seccomp_cache_inherit(filter, filter->prev);
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);

--
2.28.0