Re: [RFC PATCH RESEND bpf-next 3/4] riscv, bpf: Add bpf_arch_text_poke support for RV64

From: Pu Lehui
Date: Thu Jan 05 2023 - 20:58:12 EST




On 2023/1/4 2:12, Björn Töpel wrote:
Pu Lehui <pulehui@xxxxxxxxxxxxxxx> writes:

From: Pu Lehui <pulehui@xxxxxxxxxx>

Implement bpf_arch_text_poke for RV64. For call scenario,
ftrace framework reserve 4 nops for RV64 kernel function
as function entry, and use auipc+jalr instructions to call
kernel or module functions. However, since the auipc+jalr
call instructions is non-atomic operation, we need to use
stop-machine to make sure instruction patching in atomic
context. As for jump scenario, since we only jump inside
the trampoline, a jal instruction is sufficient.

Hmm, is that really true? More below!


Signed-off-by: Pu Lehui <pulehui@xxxxxxxxxx>
---
arch/riscv/net/bpf_jit.h | 5 ++
arch/riscv/net/bpf_jit_comp64.c | 131 +++++++++++++++++++++++++++++++-
2 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index d926e0f7ef57..bf9802a63061 100644
--- a/arch/riscv/net/bpf_jit.h
+++ b/arch/riscv/net/bpf_jit.h
@@ -573,6 +573,11 @@ static inline u32 rv_fence(u8 pred, u8 succ)
return rv_i_insn(imm11_0, 0, 0, 0, 0xf);
}
+static inline u32 rv_nop(void)
+{
+ return rv_i_insn(0, 0, 0, 0, 0x13);
+}
+
/* RVC instrutions. */
static inline u16 rvc_addi4spn(u8 rd, u32 imm10)
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index bf4721a99a09..fa8b03c52463 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -8,6 +8,8 @@
#include <linux/bitfield.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/memory.h>
+#include <linux/stop_machine.h>
#include "bpf_jit.h"
#define RV_REG_TCC RV_REG_A6
@@ -238,7 +240,7 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
if (!is_tail_call)
emit_mv(RV_REG_A0, RV_REG_A5, ctx);
emit_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA,
- is_tail_call ? 4 : 0, /* skip TCC init */
+ is_tail_call ? 20 : 0, /* skip reserved nops and TCC init */
ctx);
}
@@ -615,6 +617,127 @@ static int add_exception_handler(const struct bpf_insn *insn,
return 0;
}
+struct text_poke_args {
+ void *addr;
+ const void *insns;
+ size_t len;
+ atomic_t cpu_count;
+};
+
+static int do_text_poke(void *data)
+{
+ int ret = 0;
+ struct text_poke_args *patch = data;
+
+ if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) {
+ ret = patch_text_nosync(patch->addr, patch->insns, patch->len);
+ atomic_inc(&patch->cpu_count);
+ } else {
+ while (atomic_read(&patch->cpu_count) <= num_online_cpus())
+ cpu_relax();
+ smp_mb();
+ }
+
+ return ret;
+}
+
+static int bpf_text_poke_stop_machine(void *addr, const void *insns, size_t len)
+{
+ struct text_poke_args patch = {
+ .addr = addr,
+ .insns = insns,
+ .len = len,
+ .cpu_count = ATOMIC_INIT(0),
+ };
+
+ return stop_machine(do_text_poke, &patch, cpu_online_mask);
+}
+
+static int gen_call_or_nops(void *target, void *ip, u32 *insns)
+{
+ int i, ret;
+ s64 rvoff;
+ struct rv_jit_context ctx;
+
+ ctx.ninsns = 0;
+ ctx.insns = (u16 *)insns;
+
+ if (!target) {
+ for (i = 0; i < 4; i++)
+ emit(rv_nop(), &ctx);
+ return 0;
+ }
+
+ rvoff = (s64)(target - ip);
+ emit(rv_sd(RV_REG_SP, -8, RV_REG_RA), &ctx);
+ ret = emit_jump_and_link(RV_REG_RA, rvoff, false, &ctx);
+ if (ret)
+ return ret;
+ emit(rv_ld(RV_REG_RA, -8, RV_REG_SP), &ctx);
+
+ return 0;
+
+}
+
+static int bpf_text_poke_call(void *ip, void *old_addr, void *new_addr)
+{
+ int ret;
+ u32 old_insns[4], new_insns[4];
+
+ ret = gen_call_or_nops(old_addr, ip + 4, old_insns);
+ if (ret)
+ return ret;
+
+ ret = gen_call_or_nops(new_addr, ip + 4, new_insns);
+ if (ret)
+ return ret;
+
+ mutex_lock(&text_mutex);
+ if (memcmp(ip, old_insns, sizeof(old_insns))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (memcmp(ip, new_insns, sizeof(new_insns)))
+ ret = bpf_text_poke_stop_machine(ip, new_insns,
sizeof(new_insns));

I'd rather see that you added a patch_text variant to
arch/riscv/kernel/patch.c (something like your
bpf_text_poke_stop_machine()), and use that here. Might be other users
of that as well -- Andy's ftrace patch maybe? :-)


Good idea.

+out:
+ mutex_unlock(&text_mutex);
+ return ret;
+}
+
+static int bpf_text_poke_jump(void *ip, void *old_addr, void *new_addr)
+{
+ int ret;
+ u32 old_insn, new_insn;
+
+ old_insn = old_addr ? rv_jal(RV_REG_ZERO, (s64)(old_addr - ip) >> 1) : rv_nop();
+ new_insn = new_addr ? rv_jal(RV_REG_ZERO, (s64)(new_addr - ip) >> 1) : rv_nop();
+
+ mutex_lock(&text_mutex);
+ if (memcmp(ip, &old_insn, sizeof(old_insn))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (memcmp(ip, &new_insn, sizeof(new_insn)))
+ ret = patch_text_nosync(ip, &new_insn, sizeof(new_insn));
+out:
+ mutex_unlock(&text_mutex);
+ return ret;
+}
+
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
+ void *old_addr, void *new_addr)

AFAIU there's nothing in the bpf_arch_text_poke() API that say that
BPF_MOD_JUMP is jumps within the trampoline. That is one usage, but not
the only one. In general, the jal might not have enough reach.

I believe that this needs to be an auipc/jalr pair similar to
BPF_MOD_CALL (w/o linked register).


The initial idea was that currently BPF_MOD_JUMP only serves for bpf_tramp_image_put, and jal, which range is +/- 1MB, is sufficient for the distance between im->ip_after_call and im->ip_epilogue, and try to not use not-atomic auipc/jalr pair. But take deep consideration, this might be extended to other uses, such as tailcall optimization. So agree with your suggestion.


And again, thanks for working on the RV trampoline!
Björn