Re: [PATCH v1 08/11] KVM: s390: pv: lazy destroy for reboot

From: Janosch Frank
Date: Thu May 27 2021 - 05:43:42 EST


On 5/17/21 10:07 PM, Claudio Imbrenda wrote:
> Until now, destroying a protected guest was an entirely synchronous
> operation that could potentially take a very long time, depending on
> the size of the guest, due to the time needed to clean up the address
> space from protected pages.
>
> This patch implements a lazy destroy mechanism, that allows a protected
> guest to reboot significantly faster than previously.
>
> This is achieved by clearing the pages of the old guest in background.
> In case of reboot, the new guest will be able to run in the same
> address space almost immediately.
>
> The old protected guest is then only destroyed when all of its memory has
> been destroyed or otherwise made non protected.

LGTM some comments below

>
> Signed-off-by: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
> ---
> arch/s390/kvm/kvm-s390.c | 6 +-
> arch/s390/kvm/kvm-s390.h | 2 +-
> arch/s390/kvm/pv.c | 118 ++++++++++++++++++++++++++++++++++++++-
> 3 files changed, 120 insertions(+), 6 deletions(-)
>
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 2f09e9d7dc95..db25aa1fb6a6 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -2248,7 +2248,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
>
> r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc);
> if (r)
> - kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
> + kvm_s390_pv_deinit_vm_deferred(kvm, &dummy, &dummy);
>
> /* we need to block service interrupts from now on */
> set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
> @@ -2267,7 +2267,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
> */
> if (r)
> break;
> - r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc);
> + r = kvm_s390_pv_deinit_vm_deferred(kvm, &cmd->rc, &cmd->rrc);
>
> /* no need to block service interrupts any more */
> clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
> @@ -2796,7 +2796,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
> * complaining we do not use kvm_s390_pv_is_protected.
> */
> if (kvm_s390_pv_get_handle(kvm))
> - kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
> + kvm_s390_pv_deinit_vm_deferred(kvm, &rc, &rrc);
> debug_unregister(kvm->arch.dbf);
> free_page((unsigned long)kvm->arch.sie_page2);
> if (!kvm_is_ucontrol(kvm))
> diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
> index 79dcd647b378..b3c0796a3cc1 100644
> --- a/arch/s390/kvm/kvm-s390.h
> +++ b/arch/s390/kvm/kvm-s390.h
> @@ -211,7 +211,7 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
> /* implemented in pv.c */
> int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
> int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
> -int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
> +int kvm_s390_pv_deinit_vm_deferred(struct kvm *kvm, u16 *rc, u16 *rrc);
> int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
> int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
> u16 *rrc);
> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> index 59039b8a7be7..9a3547966e18 100644
> --- a/arch/s390/kvm/pv.c
> +++ b/arch/s390/kvm/pv.c
> @@ -14,8 +14,17 @@
> #include <asm/mman.h>
> #include <linux/pagewalk.h>
> #include <linux/sched/mm.h>
> +#include <linux/kthread.h>
> #include "kvm-s390.h"
>
> +struct deferred_priv {
> + struct mm_struct *mm;
> + unsigned long old_table;
> + u64 handle;
> + void *virt;
> + unsigned long real;
> +};
> +
> int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
> {
> int cc = 0;
> @@ -202,7 +211,7 @@ static int kvm_s390_pv_replace_asce(struct kvm *kvm)
> }
>
> /* this should not fail, but if it does, we must not free the donated memory */
> -int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> +static int kvm_s390_pv_deinit_vm_now(struct kvm *kvm, u16 *rc, u16 *rrc)
> {
> int cc;
>
> @@ -230,6 +239,111 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> return cc ? -EIO : 0;
> }
>
> +static int kvm_s390_pv_destroy_vm_thread(void *priv)
> +{
> + struct deferred_priv *p = priv;
> + u16 rc, rrc;
> + int r = 1;
> +
> + /* Exit early if we end up being the only users of the mm */
> + s390_uv_destroy_range(p->mm, 1, 0, TASK_SIZE_MAX);
> + mmput(p->mm);

Where do we exit early?

> +
> + r = uv_cmd_nodata(p->handle, UVC_CMD_DESTROY_SEC_CONF, &rc, &rrc);
> + WARN_ONCE(r, "protvirt destroy vm failed rc %x rrc %x", rc, rrc);
> + if (r)
> + return r;
> + atomic_dec(&p->mm->context.is_protected);
> +
> + /*
> + * Intentional leak in case the destroy secure VM call fails. The
> + * call should never fail if the hardware is not broken.
> + */
> + free_pages(p->real, get_order(uv_info.guest_base_stor_len));
> + free_pages(p->old_table, CRST_ALLOC_ORDER);
> + vfree(p->virt);
> + kfree(p);
> + return 0;
> +}
> +
> +static int deferred_destroy(struct kvm *kvm, struct deferred_priv *priv, u16 *rc, u16 *rrc)
> +{
> + struct task_struct *t;
> +
> + priv->virt = kvm->arch.pv.stor_var;
> + priv->real = kvm->arch.pv.stor_base;

Now I know what the struct members actually mean...
Is there a reson you didn't like config_stor_* as a name or something
similar?

> + priv->handle = kvm_s390_pv_get_handle(kvm);
> + priv->old_table = (unsigned long)kvm->arch.gmap->table;
> + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
> +
> + if (kvm_s390_pv_replace_asce(kvm))
> + goto fail;
> +
> + t = kthread_create(kvm_s390_pv_destroy_vm_thread, priv,
> + "kvm_s390_pv_destroy_vm_thread");
> + if (IS_ERR_OR_NULL(t))
> + goto fail;
> +
> + memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
> + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM DEFERRED %d", t->pid);
> + wake_up_process(t);
> + /*
> + * no actual UVC is performed at this point, just return a successful
> + * rc value to make userspace happy, and an arbitrary rrc
> + */
> + *rc = 1;
> + *rrc = 42;
> +
> + return 0;
> +
> +fail:
> + kfree(priv);
> + return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);
> +}
> +
> +/* Clear the first 2GB of guest memory, to avoid prefix issues after reboot */
> +static void kvm_s390_clear_2g(struct kvm *kvm)
> +{
> + struct kvm_memory_slot *slot;
> + struct kvm_memslots *slots;
> + unsigned long lim;
> + int idx;
> +
> + idx = srcu_read_lock(&kvm->srcu);
> +
> + slots = kvm_memslots(kvm);
> + kvm_for_each_memslot(slot, slots) {
> + if (slot->base_gfn >= (SZ_2G / PAGE_SIZE))
> + continue;
> + if (slot->base_gfn + slot->npages > (SZ_2G / PAGE_SIZE))
> + lim = slot->userspace_addr + SZ_2G - slot->base_gfn * PAGE_SIZE;
> + else
> + lim = slot->userspace_addr + slot->npages * PAGE_SIZE;
> + s390_uv_destroy_range(kvm->mm, 1, slot->userspace_addr, lim);
> + }
> +
> + srcu_read_unlock(&kvm->srcu, idx);
> +}
> +
> +int kvm_s390_pv_deinit_vm_deferred(struct kvm *kvm, u16 *rc, u16 *rrc)
> +{
> + struct deferred_priv *priv;
> +
> + priv = kmalloc(sizeof(*priv), GFP_KERNEL | __GFP_ZERO);
> + if (!priv)
> + return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);
> +
> + if (mmget_not_zero(kvm->mm)) {
> + kvm_s390_clear_2g(kvm);
> + } else {
> + /* No deferred work to do */
> + kfree(priv);
> + return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);
> + }
> + priv->mm = kvm->mm;
> + return deferred_destroy(kvm, priv, rc, rrc);
> +}
> +
> int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> {
> struct uv_cb_cgc uvcb = {
> @@ -263,7 +377,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> atomic_inc(&kvm->mm->context.is_protected);
> if (cc) {
> if (uvcb.header.rc & UVC_RC_NEED_DESTROY) {
> - kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
> + kvm_s390_pv_deinit_vm_now(kvm, &dummy, &dummy);
> } else {
> atomic_dec(&kvm->mm->context.is_protected);
> kvm_s390_pv_dealloc_vm(kvm);
>