Re: [PATCH v5 net-next 2/3] [RFC] seccomp: convert seccomp to use extended BPF

From: Alexei Starovoitov
Date: Tue Mar 04 2014 - 22:12:00 EST


On Tue, Mar 4, 2014 at 2:17 PM, Alexei Starovoitov <ast@xxxxxxxxxxxx> wrote:
> use sk_convert_filter() to convert seccomp BPF into extended BPF
>
> 05-sim-long_jumps.c of libseccomp was used as micro-benchmark:
> seccomp_rule_add_exact(ctx,...
> seccomp_rule_add_exact(ctx,...
> rc = seccomp_load(ctx);
> for (i = 0; i < 10000000; i++)
> syscall(199, 100);
>
> --x86_64--
> old BPF: 8.6 seconds
> 73.38% bench [kernel.kallsyms] [k] sk_run_filter
> 10.70% bench libc-2.15.so [.] syscall
> 5.09% bench [kernel.kallsyms] [k] seccomp_bpf_load
> 1.97% bench [kernel.kallsyms] [k] system_call
> ext BPF: 5.7 seconds
> 66.20% bench [kernel.kallsyms] [k] sk_run_filter_ext
> 16.75% bench libc-2.15.so [.] syscall
> 3.31% bench [kernel.kallsyms] [k] system_call
> 2.88% bench [kernel.kallsyms] [k] __secure_computing
>
> --i386---
> old BPF: 5.4 sec
> ext BPF: 3.8 sec
>
> BPF filters generated by seccomp are very branchy, so ext BPF
> performance is better than old BPF.
>
> Performance gains will be even higher when extended BPF JIT
> is committed.
>
> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxxxx>
>
> ---
> This patch is an RFC to use extended BPF in seccomp.
> Change it to do it conditionally with bpf_ext_enable knob ?

Kees, Will,

I've played with libseccomp to test this patch and just found
your other testsuite for bpf+seccomp:
https://github.com/redpig/seccomp
It passes as well on x86_64 and i386.

Not sure how real all 'seccomp' and libseccomp' bpf filters.
Some of them are very short, some very long.
If average number of filtered syscalls is > 10, then upcoming
'ebpf table' will give another performance boost, since single table
lookup will be faster than long sequence of 'if' conditions.

Please review.

Thanks
Alexei

> ---
> include/linux/seccomp.h | 1 -
> kernel/seccomp.c | 112 +++++++++++++++++++++--------------------------
> net/core/filter.c | 5 ---
> 3 files changed, 51 insertions(+), 67 deletions(-)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 6f19cfd1840e..4054b0994071 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -76,7 +76,6 @@ static inline int seccomp_mode(struct seccomp *s)
> #ifdef CONFIG_SECCOMP_FILTER
> extern void put_seccomp_filter(struct task_struct *tsk);
> extern void get_seccomp_filter(struct task_struct *tsk);
> -extern u32 seccomp_bpf_load(int off);
> #else /* CONFIG_SECCOMP_FILTER */
> static inline void put_seccomp_filter(struct task_struct *tsk)
> {
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index b7a10048a32c..346c597b44cc 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -55,60 +55,27 @@ struct seccomp_filter {
> atomic_t usage;
> struct seccomp_filter *prev;
> unsigned short len; /* Instruction count */
> - struct sock_filter insns[];
> + struct sock_filter_ext insns[];
> };
>
> /* Limit any path through the tree to 256KB worth of instructions. */
> #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
>
> -/**
> - * get_u32 - returns a u32 offset into data
> - * @data: a unsigned 64 bit value
> - * @index: 0 or 1 to return the first or second 32-bits
> - *
> - * This inline exists to hide the length of unsigned long. If a 32-bit
> - * unsigned long is passed in, it will be extended and the top 32-bits will be
> - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
> - * properly returned.
> - *
> +/*
> * Endianness is explicitly ignored and left for BPF program authors to manage
> * as per the specific architecture.
> */
> -static inline u32 get_u32(u64 data, int index)
> -{
> - return ((u32 *)&data)[index];
> -}
> -
> -/* Helper for bpf_load below. */
> -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
> -/**
> - * bpf_load: checks and returns a pointer to the requested offset
> - * @off: offset into struct seccomp_data to load from
> - *
> - * Returns the requested 32-bits of data.
> - * seccomp_check_filter() should assure that @off is 32-bit aligned
> - * and not out of bounds. Failure to do so is a BUG.
> - */
> -u32 seccomp_bpf_load(int off)
> +static void populate_seccomp_data(struct seccomp_data *sd)
> {
> struct pt_regs *regs = task_pt_regs(current);
> - if (off == BPF_DATA(nr))
> - return syscall_get_nr(current, regs);
> - if (off == BPF_DATA(arch))
> - return syscall_get_arch(current, regs);
> - if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
> - unsigned long value;
> - int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
> - int index = !!(off % sizeof(u64));
> - syscall_get_arguments(current, regs, arg, 1, &value);
> - return get_u32(value, index);
> - }
> - if (off == BPF_DATA(instruction_pointer))
> - return get_u32(KSTK_EIP(current), 0);
> - if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
> - return get_u32(KSTK_EIP(current), 1);
> - /* seccomp_check_filter should make this impossible. */
> - BUG();
> + int i;
> +
> + sd->nr = syscall_get_nr(current, regs);
> + sd->arch = syscall_get_arch(current, regs);
> + for (i = 0; i < 6; i++)
> + syscall_get_arguments(current, regs, i, 1,
> + (unsigned long *)&sd->args[i]);
> + sd->instruction_pointer = KSTK_EIP(current);
> }
>
> /**
> @@ -133,17 +100,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>
> switch (code) {
> case BPF_S_LD_W_ABS:
> - ftest->code = BPF_S_ANC_SECCOMP_LD_W;
> + ftest->code = BPF_LDX | BPF_W | BPF_ABS;
> /* 32-bit aligned and not out of bounds. */
> if (k >= sizeof(struct seccomp_data) || k & 3)
> return -EINVAL;
> continue;
> case BPF_S_LD_W_LEN:
> - ftest->code = BPF_S_LD_IMM;
> + ftest->code = BPF_LD | BPF_IMM;
> ftest->k = sizeof(struct seccomp_data);
> continue;
> case BPF_S_LDX_W_LEN:
> - ftest->code = BPF_S_LDX_IMM;
> + ftest->code = BPF_LDX | BPF_IMM;
> ftest->k = sizeof(struct seccomp_data);
> continue;
> /* Explicitly include allowed calls. */
> @@ -185,6 +152,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
> case BPF_S_JMP_JGT_X:
> case BPF_S_JMP_JSET_K:
> case BPF_S_JMP_JSET_X:
> + sk_decode_filter(ftest, ftest);
> continue;
> default:
> return -EINVAL;
> @@ -202,18 +170,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
> static u32 seccomp_run_filters(int syscall)
> {
> struct seccomp_filter *f;
> + struct seccomp_data sd;
> u32 ret = SECCOMP_RET_ALLOW;
>
> /* Ensure unexpected behavior doesn't result in failing open. */
> if (WARN_ON(current->seccomp.filter == NULL))
> return SECCOMP_RET_KILL;
>
> + populate_seccomp_data(&sd);
> +
> /*
> * All filters in the list are evaluated and the lowest BPF return
> * value always takes priority (ignoring the DATA).
> */
> for (f = current->seccomp.filter; f; f = f->prev) {
> - u32 cur_ret = sk_run_filter(NULL, f->insns);
> + u32 cur_ret = sk_run_filter_ext(&sd, f->insns);
> if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
> ret = cur_ret;
> }
> @@ -231,6 +202,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
> struct seccomp_filter *filter;
> unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
> unsigned long total_insns = fprog->len;
> + struct sock_filter *fp;
> + int new_len;
> long ret;
>
> if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
> @@ -252,28 +225,42 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
> CAP_SYS_ADMIN) != 0)
> return -EACCES;
>
> - /* Allocate a new seccomp_filter */
> - filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
> - GFP_KERNEL|__GFP_NOWARN);
> - if (!filter)
> + fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
> + if (!fp)
> return -ENOMEM;
> - atomic_set(&filter->usage, 1);
> - filter->len = fprog->len;
>
> /* Copy the instructions from fprog. */
> ret = -EFAULT;
> - if (copy_from_user(filter->insns, fprog->filter, fp_size))
> - goto fail;
> + if (copy_from_user(fp, fprog->filter, fp_size))
> + goto free_prog;
>
> /* Check and rewrite the fprog via the skb checker */
> - ret = sk_chk_filter(filter->insns, filter->len);
> + ret = sk_chk_filter(fp, fprog->len);
> if (ret)
> - goto fail;
> + goto free_prog;
>
> /* Check and rewrite the fprog for seccomp use */
> - ret = seccomp_check_filter(filter->insns, filter->len);
> + ret = seccomp_check_filter(fp, fprog->len);
> if (ret)
> - goto fail;
> + goto free_prog;
> +
> + /* convert 'sock_filter' insns to 'sock_filter_ext' insns */
> + ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
> + if (ret)
> + goto free_prog;
> +
> + /* Allocate a new seccomp_filter */
> + filter = kzalloc(sizeof(struct seccomp_filter) +
> + sizeof(struct sock_filter_ext) * new_len,
> + GFP_KERNEL|__GFP_NOWARN);
> + if (!filter)
> + goto free_prog;
> +
> + ret = sk_convert_filter(fp, fprog->len, filter->insns, &new_len);
> + if (ret)
> + goto free_filter;
> + atomic_set(&filter->usage, 1);
> + filter->len = new_len;
>
> /*
> * If there is an existing filter, make it the prev and don't drop its
> @@ -282,8 +269,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
> filter->prev = current->seccomp.filter;
> current->seccomp.filter = filter;
> return 0;
> -fail:
> +
> +free_filter:
> kfree(filter);
> +free_prog:
> + kfree(fp);
> return ret;
> }
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1494421486b7..f144a6a7d939 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -388,11 +388,6 @@ load_b:
> A = 0;
> continue;
> }
> -#ifdef CONFIG_SECCOMP_FILTER
> - case BPF_S_ANC_SECCOMP_LD_W:
> - A = seccomp_bpf_load(fentry->k);
> - continue;
> -#endif
> default:
> WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
> fentry->code, fentry->jt,
> --
> 1.7.9.5
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/