Re: [PATCH RFC v7 06/64] KVM: x86: Add platform hooks for private memory invalidations

From: Tom Dohrmann
Date: Sun Jan 22 2023 - 07:45:50 EST


On Wed, Dec 14, 2022 at 01:39:58PM -0600, Michael Roth wrote:
> In some cases, like with SEV-SNP, guest memory needs to be updated in a
> platform-specific manner before it can be safely freed back to the host.
> Add hooks to wire up handling of this sort to the invalidation notifiers
> for restricted memory.
>
> Also issue invalidations of all allocated pages during notifier
> unregistration so that the pages are not left in an unusable state when
> they eventually get freed back to the host upon FD release.
>
> Signed-off-by: Michael Roth <michael.roth@xxxxxxx>
> ---
> arch/x86/include/asm/kvm-x86-ops.h | 1 +
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/kvm/mmu/mmu.c | 5 +++++
> include/linux/kvm_host.h | 2 ++
> mm/restrictedmem.c | 16 ++++++++++++++++
> virt/kvm/kvm_main.c | 5 +++++
> 6 files changed, 30 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 52f94a0ba5e9..c71df44b0f02 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -134,6 +134,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
> KVM_X86_OP_OPTIONAL_RET0(private_mem_enabled);
> KVM_X86_OP_OPTIONAL_RET0(fault_is_private);
> KVM_X86_OP_OPTIONAL_RET0(update_mem_attr)
> +KVM_X86_OP_OPTIONAL(invalidate_restricted_mem)
>
> #undef KVM_X86_OP
> #undef KVM_X86_OP_OPTIONAL
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 13802389f0f9..9ef8d73455d9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1639,6 +1639,7 @@ struct kvm_x86_ops {
> int (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code, bool *private_fault);
> int (*update_mem_attr)(struct kvm_memory_slot *slot, unsigned int attr,
> gfn_t start, gfn_t end);
> + void (*invalidate_restricted_mem)(struct kvm_memory_slot *slot, gfn_t start, gfn_t end);
>
> bool (*has_wbinvd_exit)(void);
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index a0c41d391547..2713632e5061 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7183,3 +7183,8 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm,
> kvm_update_lpage_private_shared_mixed(kvm, slot, attrs,
> start, end);
> }
> +
> +void kvm_arch_invalidate_restricted_mem(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> +{
> + static_call_cond(kvm_x86_invalidate_restricted_mem)(slot, start, end);
> +}
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f032d878e034..f72a2e0b8699 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2327,6 +2327,7 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm,
> struct kvm_memory_slot *slot,
> unsigned long attrs,
> gfn_t start, gfn_t end);
> +
> #else
> static inline void kvm_arch_set_memory_attributes(struct kvm *kvm,
> struct kvm_memory_slot *slot,
> @@ -2366,6 +2367,7 @@ static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot,
> }
>
> void kvm_arch_memory_mce(struct kvm *kvm);
> +void kvm_arch_invalidate_restricted_mem(struct kvm_memory_slot *slot, gfn_t start, gfn_t end);
> #endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */
>
> #endif
> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> index 56953c204e5c..74fa2cfb8618 100644
> --- a/mm/restrictedmem.c
> +++ b/mm/restrictedmem.c
> @@ -54,6 +54,11 @@ static int restrictedmem_release(struct inode *inode, struct file *file)
> {
> struct restrictedmem_data *data = inode->i_mapping->private_data;
>
> + pr_debug("%s: releasing memfd, invalidating page offsets 0x0-0x%llx\n",
> + __func__, inode->i_size >> PAGE_SHIFT);
> + restrictedmem_invalidate_start(data, 0, inode->i_size >> PAGE_SHIFT);
> + restrictedmem_invalidate_end(data, 0, inode->i_size >> PAGE_SHIFT);
> +
> fput(data->memfd);
> kfree(data);
> return 0;
> @@ -258,6 +263,17 @@ void restrictedmem_unregister_notifier(struct file *file,
> struct restrictedmem_notifier *notifier)
> {
> struct restrictedmem_data *data = file->f_mapping->private_data;
> + struct inode *inode = file_inode(data->memfd);
> +
> + /* TODO: this will issue notifications to all registered notifiers,
> + * but it's only the one being unregistered that needs to process
> + * invalidations for any ranges still allocated at this point in
> + * time. For now this relies on KVM currently being the only notifier.
> + */
> + pr_debug("%s: unregistering notifier, invalidating page offsets 0x0-0x%llx\n",
> + __func__, inode->i_size >> PAGE_SHIFT);
> + restrictedmem_invalidate_start(data, 0, inode->i_size >> PAGE_SHIFT);
> + restrictedmem_invalidate_end(data, 0, inode->i_size >> PAGE_SHIFT);
>
> mutex_lock(&data->lock);
> list_del(&notifier->list);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d2d829d23442..d2daa049e94a 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -974,6 +974,9 @@ static void kvm_restrictedmem_invalidate_begin(struct restrictedmem_notifier *no
> &gfn_start, &gfn_end))
> return;
>
> + pr_debug("%s: start: 0x%lx, end: 0x%lx, roffset: 0x%llx, gfn_start: 0x%llx, gfn_end: 0x%llx\n",
> + __func__, start, end, slot->restricted_offset, gfn_start, gfn_end);
> +
> gfn_range.start = gfn_start;
> gfn_range.end = gfn_end;
> gfn_range.slot = slot;
> @@ -988,6 +991,8 @@ static void kvm_restrictedmem_invalidate_begin(struct restrictedmem_notifier *no
> if (kvm_unmap_gfn_range(kvm, &gfn_range))
> kvm_flush_remote_tlbs(kvm);
>
> + kvm_arch_invalidate_restricted_mem(slot, gfn_start, gfn_end);

Calling kvm_arch_invalidate_restricted_mem while the KVM MMU lock is taken
causes problems, because taking said lock disables preemption. Within
kvm_arch_invalidate_restricted_mem a few calls down, eventually
vm_unmap_aliases is called which tries to lock a mutex, which shouldn't happen
with preemption disabled. This causes a "scheduling while atomic" bug:

[ 152.846596] BUG: scheduling while atomic: enarx/8302/0x00000002
[ 152.846599] Modules linked in: nf_conntrack_netlink(E) xfrm_user(E) xfrm_algo(E) xt_addrtype(E) br_netfilter(E) xt_CHECKSUM(E) xt_MASQUERADE(E) xt_conntrack(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_tcpudp(E) nft_compat(E) nft_chain_nat(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) nf_tables(E) libcrc32c(E) nfnetlink(E) bridge(E) stp(E) llc(E) bonding(E) intel_rapl_msr(E) intel_rapl_common(E) amd64_edac(E) edac_mce_amd(E) kvm_amd(E) tun(E) ipmi_ssif(E) rfkill(E) overlay(E) ghash_clmulni_intel(E) sha512_ssse3(E) sha512_generic(E) aesni_intel(E) libaes(E) crypto_simd(E) cryptd(E) rapl(E) wmi_bmof(E) binfmt_misc(E) kvm(E) irqbypass(E) nls_ascii(E) nls_cp437(E) vfat(E) fat(E) snd_usb_audio(E) snd_usbmidi_lib(E) snd_hwdep(E) mc(E) snd_pcm(E) snd_seq_midi(E) snd_seq_midi_event(E) snd_rawmidi(E) snd_seq(E) ast(E) snd_seq_device(E) drm_vram_helper(E) drm_ttm_helper(E) snd_timer(E) ttm(E) joydev(E) snd(E) ccp(E) drm_kms_helper(E) soundcore(E) sg(E) i2c_algo_bit(E) rng_core(E)
[ 152.846629] k10temp(E) evdev(E) acpi_ipmi(E) ipmi_si(E) ipmi_devintf(E) ipmi_msghandler(E) acpi_cpufreq(E) button(E) squashfs(E) loop(E) sch_fq_codel(E) msr(E) parport_pc(E) ppdev(E) lp(E) ramoops(E) parport(E) reed_solomon(E) fuse(E) drm(E) efi_pstore(E) configfs(E) efivarfs(E) ip_tables(E) x_tables(E) autofs4(E) ext4(E) crc32c_generic(E) crc16(E) mbcache(E) jbd2(E) rndis_host(E) cdc_ether(E) usbnet(E) mii(E) hid_generic(E) usbhid(E) hid(E) sd_mod(E) t10_pi(E) crc64_rocksoft(E) crc64(E) crc_t10dif(E) crct10dif_generic(E) crct10dif_pclmul(E) crct10dif_common(E) crc32_pclmul(E) crc32c_intel(E) ahci(E) libahci(E) xhci_pci(E) libata(E) bnxt_en(E) xhci_hcd(E) scsi_mod(E) ptp(E) scsi_common(E) pps_core(E) usbcore(E) i2c_piix4(E) usb_common(E) wmi(E)
[ 152.846657] Preemption disabled at:
[ 152.846657] [<ffffffffc146a09a>] kvm_restrictedmem_invalidate_begin+0xba/0x1c0 [kvm]
[ 152.846688] CPU: 108 PID: 8302 Comm: enarx Tainted: G W E 6.1.0-rc4+ #30
[ 152.846690] Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.4 04/14/2022
[ 152.846691] Call Trace:
[ 152.846692] <TASK>
[ 152.846694] dump_stack_lvl+0x49/0x63
[ 152.846695] ? kvm_restrictedmem_invalidate_begin+0xba/0x1c0 [kvm]
[ 152.846723] dump_stack+0x10/0x16
[ 152.846725] __schedule_bug.cold+0x81/0x92
[ 152.846727] __schedule+0x809/0xa00
[ 152.846729] ? asm_sysvec_call_function+0x1b/0x20
[ 152.846731] schedule+0x6b/0xf0
[ 152.846733] schedule_preempt_disabled+0x18/0x30
[ 152.846735] __mutex_lock.constprop.0+0x723/0x750
[ 152.846738] ? smp_call_function_many_cond+0xc1/0x2e0
[ 152.846740] __mutex_lock_slowpath+0x13/0x20
[ 152.846742] mutex_lock+0x49/0x60
[ 152.846744] _vm_unmap_aliases+0x10e/0x160
[ 152.846746] vm_unmap_aliases+0x19/0x20
[ 152.846748] change_page_attr_set_clr+0xb7/0x1c0
[ 152.846751] set_memory_p+0x29/0x30
[ 152.846753] rmpupdate+0xd5/0x110
[ 152.846756] rmp_make_shared+0xb7/0xc0
[ 152.846758] snp_make_page_shared.constprop.0+0x4c/0x90 [kvm_amd]
[ 152.846765] sev_invalidate_private_range+0x156/0x330 [kvm_amd]
[ 152.846770] ? kvm_unmap_gfn_range+0xef/0x100 [kvm]
[ 152.846801] kvm_arch_invalidate_restricted_mem+0xe/0x20 [kvm]
[ 152.846829] kvm_restrictedmem_invalidate_begin+0x106/0x1c0 [kvm]
[ 152.846856] restrictedmem_unregister_notifier+0x74/0x150
[ 152.846859] kvm_free_memslot+0x6b/0x80 [kvm]
[ 152.846885] kvm_free_memslots.part.0+0x47/0x70 [kvm]
[ 152.846911] kvm_destroy_vm+0x222/0x320 [kvm]
[ 152.846937] kvm_put_kvm+0x2a/0x50 [kvm]
[ 152.846964] kvm_vm_release+0x22/0x30 [kvm]
[ 152.846990] __fput+0xa8/0x280
[ 152.846992] ____fput+0xe/0x20
[ 152.846994] task_work_run+0x61/0xb0
[ 152.846996] do_exit+0x362/0xb30
[ 152.846998] ? tomoyo_path_number_perm+0x6f/0x200
[ 152.847001] do_group_exit+0x38/0xa0
[ 152.847003] get_signal+0x999/0x9c0
[ 152.847005] arch_do_signal_or_restart+0x37/0x7e0
[ 152.847008] ? __might_fault+0x26/0x30
[ 152.847010] ? __rseq_handle_notify_resume+0xd5/0x4f0
[ 152.847013] exit_to_user_mode_prepare+0xd3/0x170
[ 152.847016] syscall_exit_to_user_mode+0x26/0x50
[ 152.847019] do_syscall_64+0x48/0x90
[ 152.847020] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 152.847022] RIP: 0033:0x7fa345f1aaff
[ 152.847023] Code: Unable to access opcode bytes at 0x7fa345f1aad5.
[ 152.847024] RSP: 002b:00007fff99d6c050 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 152.847026] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 00007fa345f1aaff
[ 152.847027] RDX: 00007fff99d6c188 RSI: 00000000c008aeba RDI: 0000000000000006
[ 152.847028] RBP: 00007fff99576000 R08: 0000000000000000 R09: 0000000000000000
[ 152.847029] R10: 0000000001680000 R11: 0000000000000246 R12: 00007fff99d752c0
[ 152.847030] R13: 00007fff99d75270 R14: 0000000000000000 R15: 00007fff99577000
[ 152.847032] </TASK>

This bug can be triggered by destroying multiple SNP VMs at the same time.

> +
> KVM_MMU_UNLOCK(kvm);
> srcu_read_unlock(&kvm->srcu, idx);
> }
> --
> 2.25.1
>
>

Regards, Tom