[PATCH] RISC-V: store percpu offset in CSR_SCRATCH
From: Radim Krčmář
Date: Mon Jul 07 2025 - 08:50:49 EST
2025-07-04T16:45:00+08:00, Yunhui Cui <cuiyunhui@xxxxxxxxxxxxx>:
> The following data was collected from tests conducted on the
> Spacemit(R) X60 using the fixed register method:
> [...]
> The fixed register method reduced performance by 5.29%.
> The per-CPU offset optimization improved performance by 2.52%.
What is the performance if you use the scratch register?
The patch below is completely unoptimized as I didn't want to shuffle
code around too much, but it could give a rough idea.
Thanks.
---8<---
The scratch register currently only denotes the mode the CPU was in before
the exception, but we can just as well use two different exception entry
points to provide the same information, which frees the scratch register
for the percpu offset.
The user/kernel entry paths need a more thorough rewrite, because they are
terribly wasteful right now.
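For illustration only (not part of the patch): once asm-generic/percpu.h
picks up the CSR-based __my_cpu_offset, a per-CPU read ends up as roughly
the sketch below, assuming the usual csr_read() helper from asm/csr.h;
example_this_cpu_read() is a made-up name, not a real kernel interface.

static inline unsigned long example_this_cpu_read(unsigned long *ptr)
{
	/* the per-CPU offset now comes from the scratch CSR ... */
	unsigned long off = csr_read(CSR_SCRATCH);

	/* ... so the access is a csrr + add + load */
	return *(unsigned long *)((unsigned long)ptr + off);
}

i.e. no more indexing __per_cpu_offset[] through smp_processor_id() in
memory on every access.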
---
Applies on top of d7b8f8e20813f0179d8ef519541a3527e7661d3a (v6.16-rc5)
 arch/riscv/include/asm/percpu.h | 13 ++++++++++
 arch/riscv/kernel/entry.S       | 46 ++++++++++++++++++++-------------
 arch/riscv/kernel/head.S        |  7 +----
 arch/riscv/kernel/smpboot.c     |  7 +++++
 arch/riscv/kernel/stacktrace.c  |  4 +--
5 files changed, 51 insertions(+), 26 deletions(-)
create mode 100644 arch/riscv/include/asm/percpu.h
diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
new file mode 100644
index 000000000000..2c838514e3ea
--- /dev/null
+++ b/arch/riscv/include/asm/percpu.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_PERCPU_H
+#define __ASM_PERCPU_H
+
+static inline void set_my_cpu_offset(unsigned long off)
+{
+ csr_write(CSR_SCRATCH, off);
+}
+
+#define __my_cpu_offset csr_read(CSR_SCRATCH)
+
+#include <asm-generic/percpu.h>
+
+#endif
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 75656afa2d6b..e48c553d6779 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -91,18 +91,8 @@
REG_L a0, TASK_TI_A0(tp)
.endm
-
-SYM_CODE_START(handle_exception)
- /*
- * If coming from userspace, preserve the user thread pointer and load
- * the kernel thread pointer. If we came from the kernel, the scratch
- * register will contain 0, and we should continue on the current TP.
- */
- csrrw tp, CSR_SCRATCH, tp
- bnez tp, .Lsave_context
-
-.Lrestore_kernel_tpsp:
- csrr tp, CSR_SCRATCH
+SYM_CODE_START(handle_kernel_exception)
+ csrw CSR_SCRATCH, tp
#ifdef CONFIG_64BIT
/*
@@ -126,8 +116,22 @@ SYM_CODE_START(handle_exception)
bnez sp, handle_kernel_stack_overflow
REG_L sp, TASK_TI_KERNEL_SP(tp)
#endif
+ j handle_exception
+ASM_NOKPROBE(handle_kernel_exception)
+SYM_CODE_END(handle_kernel_exception)
-.Lsave_context:
+SYM_CODE_START(handle_user_exception)
+ /*
+ * If coming from userspace, preserve the user thread pointer and load
+ * the kernel thread pointer.
+ */
+ csrrw tp, CSR_SCRATCH, tp
+ j handle_exception
+
+SYM_CODE_END(handle_user_exception)
+ASM_NOKPROBE(handle_user_exception)
+
+SYM_CODE_START_NOALIGN(handle_exception)
REG_S sp, TASK_TI_USER_SP(tp)
REG_L sp, TASK_TI_KERNEL_SP(tp)
addi sp, sp, -(PT_SIZE_ON_STACK)
@@ -158,11 +162,15 @@ SYM_CODE_START(handle_exception)
REG_S s4, PT_CAUSE(sp)
REG_S s5, PT_TP(sp)
- /*
- * Set the scratch register to 0, so that if a recursive exception
- * occurs, the exception vector knows it came from the kernel
- */
- csrw CSR_SCRATCH, x0
+ REG_L s0, TASK_TI_CPU(tp)
+ slli s0, s0, 3
+ la s1, __per_cpu_offset
+ add s1, s1, s0
+ REG_L s1, 0(s1)
+
+ csrw CSR_SCRATCH, s1
+ la s1, handle_kernel_exception
+ csrw CSR_TVEC, s1
/* Load the global pointer */
load_global_pointer
@@ -236,6 +244,8 @@ SYM_CODE_START_NOALIGN(ret_from_exception)
* structures again.
*/
csrw CSR_SCRATCH, tp
+ la a0, handle_user_exception
+ csrw CSR_TVEC, a0
1:
#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE
move a0, sp
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index bdf3352acf4c..d8858334af2d 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -188,14 +188,9 @@ secondary_start_sbi:
.align 2
.Lsetup_trap_vector:
/* Set trap vector to exception handler */
- la a0, handle_exception
+ la a0, handle_kernel_exception
csrw CSR_TVEC, a0
- /*
- * Set sup0 scratch register to 0, indicating to exception vector that
- * we are presently executing in kernel.
- */
- csrw CSR_SCRATCH, zero
ret
SYM_CODE_END(_start)
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 601a321e0f17..2db44b10bedb 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -41,6 +41,11 @@
static DECLARE_COMPLETION(cpu_running);
+void __init smp_prepare_boot_cpu(void)
+{
+ set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
+}
+
void __init smp_prepare_cpus(unsigned int max_cpus)
{
int cpuid;
@@ -225,6 +230,8 @@ asmlinkage __visible void smp_callin(void)
mmgrab(mm);
current->active_mm = mm;
+ set_my_cpu_offset(per_cpu_offset(curr_cpuid));
+
store_cpu_topology(curr_cpuid);
notify_cpu_starting(curr_cpuid);
diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c
index 3fe9e6edef8f..69b2f390a2d4 100644
--- a/arch/riscv/kernel/stacktrace.c
+++ b/arch/riscv/kernel/stacktrace.c
@@ -16,7 +16,7 @@
#ifdef CONFIG_FRAME_POINTER
-extern asmlinkage void handle_exception(void);
+extern asmlinkage void handle_kernel_exception(void);
extern unsigned long ret_from_exception_end;
static inline int fp_is_valid(unsigned long fp, unsigned long sp)
@@ -72,7 +72,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs,
fp = frame->fp;
pc = ftrace_graph_ret_addr(current, &graph_idx, frame->ra,
&frame->ra);
- if (pc >= (unsigned long)handle_exception &&
+ if (pc >= (unsigned long)handle_kernel_exception &&
pc < (unsigned long)&ret_from_exception_end) {
if (unlikely(!fn(arg, pc)))
break;