Re: [RFC v2 3/4] modules, x86: use vmalloc_exec for module core

From: Aaron Lu
Date: Thu Oct 13 2022 - 23:49:12 EST


On Fri, Oct 07, 2022 at 04:43:14PM -0700, Song Liu wrote:
> This is a prototype that allows modules to share 2MB text pages with other
> modules and BPF programs.
>
> Current version only covers core_layout.
> ---
> arch/x86/Kconfig | 1 +
> arch/x86/kernel/alternative.c | 30 ++++++++++++++++++++++++------
> arch/x86/kernel/module.c | 1 +
> kernel/module/main.c | 23 +++++++++++++----------
> kernel/module/strict_rwx.c | 3 ---
> kernel/trace/ftrace.c | 3 ++-
> 6 files changed, 41 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index f9920f1341c8..0b1ea05a1da6 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -91,6 +91,7 @@ config X86
> select ARCH_HAS_SET_DIRECT_MAP
> select ARCH_HAS_STRICT_KERNEL_RWX
> select ARCH_HAS_STRICT_MODULE_RWX
> + select ARCH_WANTS_MODULES_DATA_IN_VMALLOC if X86_64
> select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
> select ARCH_HAS_SYSCALL_WRAPPER
> select ARCH_HAS_UBSAN_SANITIZE_ALL
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 4f3204364caa..0e47a558c5bc 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -332,7 +332,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
>
> DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
>
> - text_poke_early(instr, insn_buff, insn_buff_sz);
> + if (system_state < SYSTEM_RUNNING) {
> + text_poke_early(instr, insn_buff, insn_buff_sz);
> + } else {
> + mutex_lock(&text_mutex);
> + text_poke(instr, insn_buff, insn_buff_sz);
> + mutex_unlock(&text_mutex);
> + }
>
> next:
> optimize_nops(instr, a->instrlen);
> @@ -503,7 +509,13 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
> optimize_nops(bytes, len);
> DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
> DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
> - text_poke_early(addr, bytes, len);
> + if (system_state == SYSTEM_BOOTING) {
> + text_poke_early(addr, bytes, len);
> + } else {
> + mutex_lock(&text_mutex);
> + text_poke(addr, bytes, len);
> + mutex_unlock(&text_mutex);
> + }
> }
> }
> }
> @@ -568,7 +580,13 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
> if (len == insn.length) {
> DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
> DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
> - text_poke_early(addr, bytes, len);
> + if (unlikely(system_state == SYSTEM_BOOTING)) {
> + text_poke_early(addr, bytes, len);
> + } else {
> + mutex_lock(&text_mutex);
> + text_poke(addr, bytes, len);
> + mutex_unlock(&text_mutex);
> + }
> }
> }
> }
> @@ -609,7 +627,7 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
> */
> DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
> DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
> - text_poke_early(addr, &poison, 4);
> + text_poke(addr, &poison, 4);
> }
> }
>
> @@ -791,7 +809,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
>
> /* Pad the rest with nops */
> add_nops(insn_buff + used, p->len - used);
> - text_poke_early(p->instr, insn_buff, p->len);
> + text_poke(p->instr, insn_buff, p->len);

Got below warning when booting a VM:

[ 0.190098] ------------[ cut here ]------------
[ 0.190377] WARNING: CPU: 0 PID: 0 at /home/aaron/linux/src/arch/x86/kernel/alternative.c:1224 text_poke+0x53/0x60
[ 0.191083] Modules linked in:
[ 0.191269] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.0.0-00004-gc49d19177d78 #5
[ 0.191721] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
[ 0.192083] RIP: 0010:text_poke+0x53/0x60
[ 0.192326] Code: c7 c7 20 e7 02 81 5b 5d e9 2a f8 ff ff be ff ff ff ff 48 c7 c7 b0 6d 06 83 48 89 14 24 e8 75 fd bf 00 85 c0 48 8b 14 24 75 c8 <0f> 0b eb c4 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 57 41 56
[ 0.193083] RSP: 0000:ffffffff83003d60 EFLAGS: 00010246
[ 0.194083] RAX: 0000000000000000 RBX: ffffffff810295b7 RCX: 0000000000000001
[ 0.194506] RDX: 0000000000000006 RSI: ffffffff828b01c5 RDI: ffffffff8293898e
[ 0.195083] RBP: ffffffff83003d82 R08: ffffffff82206520 R09: 0000000000000001
[ 0.195506] R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff8a9949c0
[ 0.195929] R13: ffffffff8a95f400 R14: 00000000ffffffff R15: 00000000ffffffff
[ 0.196083] FS: 0000000000000000(0000) GS:ffff88842de00000(0000) knlGS:0000000000000000
[ 0.196562] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.197083] CR2: ffff88843ffff000 CR3: 0000000003012001 CR4: 0000000000770ef0
[ 0.197508] PKRU: 55555554
[ 0.197673] Call Trace:
[ 0.197822] <TASK>
[ 0.198084] apply_paravirt+0xaf/0x150
[ 0.198313] ? __might_resched+0x3f/0x280
[ 0.198557] ? synchronize_rcu+0xe0/0x1c0
[ 0.198799] ? lock_release+0x230/0x450
[ 0.199030] ? _raw_spin_unlock_irqrestore+0x30/0x60
[ 0.199083] ? lockdep_hardirqs_on+0x79/0x100
[ 0.199345] ? _raw_spin_unlock_irqrestore+0x3b/0x60
[ 0.199643] ? atomic_notifier_chain_unregister+0x51/0x80
[ 0.200084] alternative_instructions+0x27/0xfa
[ 0.200357] check_bugs+0xe08/0xe82
[ 0.200570] start_kernel+0x692/0x6cc
[ 0.200797] secondary_startup_64_no_verify+0xe0/0xeb
[ 0.201088] </TASK>
[ 0.201223] irq event stamp: 13575
[ 0.201428] hardirqs last enabled at (13583): [<ffffffff811193c2>] __up_console_sem+0x52/0x60
[ 0.202083] hardirqs last disabled at (13592): [<ffffffff811193a7>] __up_console_sem+0x37/0x60
[ 0.202594] softirqs last enabled at (12762): [<ffffffff8117e169>] cgroup_idr_alloc.constprop.60+0x59/0x100
[ 0.203083] softirqs last disabled at (12750): [<ffffffff8117e13d>] cgroup_idr_alloc.constprop.60+0x2d/0x100
[ 0.203665] ---[ end trace 0000000000000000 ]---

Looks like it is also necessary to differentiate system_state in
apply_paravirt() like you did in the other apply_XXX() functions.