Re: [PATCH v2 net-next 1/3] bpf: enable non-root eBPF programs

From: Kees Cook
Date: Thu Oct 08 2015 - 13:45:36 EST


On Wed, Oct 7, 2015 at 10:23 PM, Alexei Starovoitov <ast@xxxxxxxxxxxx> wrote:
> In order to let unprivileged users load and execute eBPF programs
> teach verifier to prevent pointer leaks.
> Verifier will prevent
> - any arithmetic on pointers
> (except R10+Imm which is used to compute stack addresses)
> - comparison of pointers
> (except if (map_value_ptr == 0) ... )
> - passing pointers to helper functions
> - indirectly passing pointers in stack to helper functions
> - returning pointer from bpf program
> - storing pointers into ctx or maps
>
> Spill/fill of pointers into stack is allowed, but mangling
> of pointers stored in the stack or reading them byte by byte is not.
>
> Within bpf programs the pointers do exist, since programs need to
> be able to access maps, pass skb pointer to LD_ABS insns, etc
> but programs cannot pass such pointer values to the outside
> or obfuscate them.
>
> Only allow BPF_PROG_TYPE_SOCKET_FILTER unprivileged programs,
> so that socket filters (tcpdump), af_packet (quic acceleration)
> and future kcm can use it.
> tracing and tc cls/act program types still require root permissions,
> since tracing actually needs to be able to see all kernel pointers
> and tc is for root only.
>
> For example, the following unprivileged socket filter program is allowed:
> int bpf_prog1(struct __sk_buff *skb)
> {
> u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
> u64 *value = bpf_map_lookup_elem(&my_map, &index);
>
> if (value)
> *value += skb->len;
> return 0;
> }
>
> but the following program is not:
> int bpf_prog1(struct __sk_buff *skb)
> {
> u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
> u64 *value = bpf_map_lookup_elem(&my_map, &index);
>
> if (value)
> *value += (u64) skb;
> return 0;
> }
> since it would leak the kernel address into the map.
>
> Unprivileged socket filter bpf programs have access to the
> following helper functions:
> - map lookup/update/delete (but they cannot store kernel pointers into them)
> - get_random (it's already exposed to unprivileged user space)
> - get_smp_processor_id
> - tail_call into another socket filter program
> - ktime_get_ns
>
> The feature is controlled by sysctl kernel.unprivileged_bpf_disabled.
> This toggle defaults to off (0), but can be set true (1). Once true,
> bpf programs and maps cannot be accessed from unprivileged process,
> and the toggle cannot be set back to false.
>
> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxxxx>

Reviewed-by: Kees Cook <keescook@xxxxxxxxxxxx>

Thanks for making this safer! :)

-Kees

> ---
> v1->v2:
> - sysctl_unprivileged_bpf_disabled
> - drop bpf_trace_printk
> - split tests into separate patch to ease review
> ---
> include/linux/bpf.h | 2 +
> kernel/bpf/syscall.c | 11 ++---
> kernel/bpf/verifier.c | 106 ++++++++++++++++++++++++++++++++++++++++++++-----
> kernel/sysctl.c | 13 ++++++
> net/core/filter.c | 3 +-
> 5 files changed, 120 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 19b8a2081f88..e472b06df138 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -167,6 +167,8 @@ void bpf_prog_put_rcu(struct bpf_prog *prog);
> struct bpf_map *bpf_map_get(struct fd f);
> void bpf_map_put(struct bpf_map *map);
>
> +extern int sysctl_unprivileged_bpf_disabled;
> +
> /* verify correctness of eBPF program */
> int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
> #else
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 5f35f420c12f..9f824b0f0f5f 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -18,6 +18,8 @@
> #include <linux/filter.h>
> #include <linux/version.h>
>
> +int sysctl_unprivileged_bpf_disabled __read_mostly;
> +
> static LIST_HEAD(bpf_map_types);
>
> static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
> @@ -542,6 +544,9 @@ static int bpf_prog_load(union bpf_attr *attr)
> attr->kern_version != LINUX_VERSION_CODE)
> return -EINVAL;
>
> + if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
> /* plain bpf_prog allocation */
> prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
> if (!prog)
> @@ -597,11 +602,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
> union bpf_attr attr = {};
> int err;
>
> - /* the syscall is limited to root temporarily. This restriction will be
> - * lifted when security audit is clean. Note that eBPF+tracing must have
> - * this restriction, since it may pass kernel data to user space
> - */
> - if (!capable(CAP_SYS_ADMIN))
> + if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
> return -EPERM;
>
> if (!access_ok(VERIFY_READ, uattr, 1))
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index f8da034c2258..1d6b97be79e1 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -199,6 +199,7 @@ struct verifier_env {
> struct verifier_state_list **explored_states; /* search pruning optimization */
> struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
> u32 used_map_cnt; /* number of used maps */
> + bool allow_ptr_leaks;
> };
>
> /* verbose verifier prints what it's seeing
> @@ -538,6 +539,21 @@ static int bpf_size_to_bytes(int bpf_size)
> return -EINVAL;
> }
>
> +static bool is_spillable_regtype(enum bpf_reg_type type)
> +{
> + switch (type) {
> + case PTR_TO_MAP_VALUE:
> + case PTR_TO_MAP_VALUE_OR_NULL:
> + case PTR_TO_STACK:
> + case PTR_TO_CTX:
> + case FRAME_PTR:
> + case CONST_PTR_TO_MAP:
> + return true;
> + default:
> + return false;
> + }
> +}
> +
> /* check_stack_read/write functions track spill/fill of registers,
> * stack boundary and alignment are checked in check_mem_access()
> */
> @@ -550,9 +566,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
> */
>
> if (value_regno >= 0 &&
> - (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
> - state->regs[value_regno].type == PTR_TO_STACK ||
> - state->regs[value_regno].type == PTR_TO_CTX)) {
> + is_spillable_regtype(state->regs[value_regno].type)) {
>
> /* register containing pointer is being spilled into stack */
> if (size != BPF_REG_SIZE) {
> @@ -643,6 +657,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
> return -EACCES;
> }
>
> +static bool is_pointer_value(struct verifier_env *env, int regno)
> +{
> + if (env->allow_ptr_leaks)
> + return false;
> +
> + switch (env->cur_state.regs[regno].type) {
> + case UNKNOWN_VALUE:
> + case CONST_IMM:
> + return false;
> + default:
> + return true;
> + }
> +}
> +
> /* check whether memory at (regno + off) is accessible for t = (read | write)
> * if t==write, value_regno is a register which value is stored into memory
> * if t==read, value_regno is a register which will receive the value from memory
> @@ -669,11 +697,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
> }
>
> if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
> + if (t == BPF_WRITE && value_regno >= 0 &&
> + is_pointer_value(env, value_regno)) {
> + verbose("R%d leaks addr into map\n", value_regno);
> + return -EACCES;
> + }
> err = check_map_access(env, regno, off, size);
> if (!err && t == BPF_READ && value_regno >= 0)
> mark_reg_unknown_value(state->regs, value_regno);
>
> } else if (state->regs[regno].type == PTR_TO_CTX) {
> + if (t == BPF_WRITE && value_regno >= 0 &&
> + is_pointer_value(env, value_regno)) {
> + verbose("R%d leaks addr into ctx\n", value_regno);
> + return -EACCES;
> + }
> err = check_ctx_access(env, off, size, t);
> if (!err && t == BPF_READ && value_regno >= 0)
> mark_reg_unknown_value(state->regs, value_regno);
> @@ -684,10 +722,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
> verbose("invalid stack off=%d size=%d\n", off, size);
> return -EACCES;
> }
> - if (t == BPF_WRITE)
> + if (t == BPF_WRITE) {
> + if (!env->allow_ptr_leaks &&
> + state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
> + size != BPF_REG_SIZE) {
> + verbose("attempt to corrupt spilled pointer on stack\n");
> + return -EACCES;
> + }
> err = check_stack_write(state, off, size, value_regno);
> - else
> + } else {
> err = check_stack_read(state, off, size, value_regno);
> + }
> } else {
> verbose("R%d invalid mem access '%s'\n",
> regno, reg_type_str[state->regs[regno].type]);
> @@ -775,8 +820,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
> return -EACCES;
> }
>
> - if (arg_type == ARG_ANYTHING)
> + if (arg_type == ARG_ANYTHING) {
> + if (is_pointer_value(env, regno)) {
> + verbose("R%d leaks addr into helper function\n", regno);
> + return -EACCES;
> + }
> return 0;
> + }
>
> if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
> arg_type == ARG_PTR_TO_MAP_VALUE) {
> @@ -950,8 +1000,9 @@ static int check_call(struct verifier_env *env, int func_id)
> }
>
> /* check validity of 32-bit and 64-bit arithmetic operations */
> -static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
> +static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
> {
> + struct reg_state *regs = env->cur_state.regs;
> u8 opcode = BPF_OP(insn->code);
> int err;
>
> @@ -976,6 +1027,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
> if (err)
> return err;
>
> + if (is_pointer_value(env, insn->dst_reg)) {
> + verbose("R%d pointer arithmetic prohibited\n",
> + insn->dst_reg);
> + return -EACCES;
> + }
> +
> /* check dest operand */
> err = check_reg_arg(regs, insn->dst_reg, DST_OP);
> if (err)
> @@ -1012,6 +1069,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
> */
> regs[insn->dst_reg] = regs[insn->src_reg];
> } else {
> + if (is_pointer_value(env, insn->src_reg)) {
> + verbose("R%d partial copy of pointer\n",
> + insn->src_reg);
> + return -EACCES;
> + }
> regs[insn->dst_reg].type = UNKNOWN_VALUE;
> regs[insn->dst_reg].map_ptr = NULL;
> }
> @@ -1061,8 +1123,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
> /* pattern match 'bpf_add Rx, imm' instruction */
> if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
> regs[insn->dst_reg].type == FRAME_PTR &&
> - BPF_SRC(insn->code) == BPF_K)
> + BPF_SRC(insn->code) == BPF_K) {
> stack_relative = true;
> + } else if (is_pointer_value(env, insn->dst_reg)) {
> + verbose("R%d pointer arithmetic prohibited\n",
> + insn->dst_reg);
> + return -EACCES;
> + } else if (BPF_SRC(insn->code) == BPF_X &&
> + is_pointer_value(env, insn->src_reg)) {
> + verbose("R%d pointer arithmetic prohibited\n",
> + insn->src_reg);
> + return -EACCES;
> + }
>
> /* check dest operand */
> err = check_reg_arg(regs, insn->dst_reg, DST_OP);
> @@ -1101,6 +1173,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
> err = check_reg_arg(regs, insn->src_reg, SRC_OP);
> if (err)
> return err;
> +
> + if (is_pointer_value(env, insn->src_reg)) {
> + verbose("R%d pointer comparison prohibited\n",
> + insn->src_reg);
> + return -EACCES;
> + }
> } else {
> if (insn->src_reg != BPF_REG_0) {
> verbose("BPF_JMP uses reserved fields\n");
> @@ -1155,6 +1233,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
> regs[insn->dst_reg].type = CONST_IMM;
> regs[insn->dst_reg].imm = 0;
> }
> + } else if (is_pointer_value(env, insn->dst_reg)) {
> + verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
> + return -EACCES;
> } else if (BPF_SRC(insn->code) == BPF_K &&
> (opcode == BPF_JEQ || opcode == BPF_JNE)) {
>
> @@ -1658,7 +1739,7 @@ static int do_check(struct verifier_env *env)
> }
>
> if (class == BPF_ALU || class == BPF_ALU64) {
> - err = check_alu_op(regs, insn);
> + err = check_alu_op(env, insn);
> if (err)
> return err;
>
> @@ -1816,6 +1897,11 @@ static int do_check(struct verifier_env *env)
> if (err)
> return err;
>
> + if (is_pointer_value(env, BPF_REG_0)) {
> + verbose("R0 leaks addr as return value\n");
> + return -EACCES;
> + }
> +
> process_bpf_exit:
> insn_idx = pop_stack(env, &prev_insn_idx);
> if (insn_idx < 0) {
> @@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
> if (ret < 0)
> goto skip_full_check;
>
> + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
> +
> ret = do_check(env);
>
> skip_full_check:
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e69201d8094e..96c856b04081 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -64,6 +64,7 @@
> #include <linux/binfmts.h>
> #include <linux/sched/sysctl.h>
> #include <linux/kexec.h>
> +#include <linux/bpf.h>
>
> #include <asm/uaccess.h>
> #include <asm/processor.h>
> @@ -1139,6 +1140,18 @@ static struct ctl_table kern_table[] = {
> .proc_handler = timer_migration_handler,
> },
> #endif
> +#ifdef CONFIG_BPF_SYSCALL
> + {
> + .procname = "unprivileged_bpf_disabled",
> + .data = &sysctl_unprivileged_bpf_disabled,
> + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
> + .mode = 0644,
> + /* only handle a transition from default "0" to "1" */
> + .proc_handler = proc_dointvec_minmax,
> + .extra1 = &one,
> + .extra2 = &one,
> + },
> +#endif
> { }
> };
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index cbaa23c3fb30..8fb3acbbe4cb 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -1644,7 +1644,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
> case BPF_FUNC_ktime_get_ns:
> return &bpf_ktime_get_ns_proto;
> case BPF_FUNC_trace_printk:
> - return bpf_get_trace_printk_proto();
> + if (capable(CAP_SYS_ADMIN))
> + return bpf_get_trace_printk_proto();
> default:
> return NULL;
> }
> --
> 1.7.9.5
>



--
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/