Re: [PATCH 3/3] virtio_balloon: Introduce memory recover

From: David Hildenbrand
Date: Mon May 30 2022 - 03:48:54 EST



> +
> struct virtio_balloon {
> struct virtio_device *vdev;
> struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> @@ -126,6 +133,16 @@ struct virtio_balloon {
> /* Free page reporting device */
> struct virtqueue *reporting_vq;
> struct page_reporting_dev_info pr_dev_info;
> +
> + /* Memory recover VQ - VIRTIO_BALLOON_F_RECOVER */
> + struct virtqueue *recover_vq;
> + spinlock_t recover_vq_lock;
> + struct notifier_block memory_failure_nb;
> + struct list_head corrupted_page_list;
> + struct list_head recovered_page_list;
> + spinlock_t recover_page_list_lock;
> + struct __virtio_balloon_recover in_vbr;
> + struct work_struct unpoison_memory_work;

I assume we want all that only with CONFIG_MEMORY_FAILURE.

> };
>
> static const struct virtio_device_id id_table[] = {
> @@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
> queue_work(system_freezable_wq, work);
> }
>
> +/*
> + * virtballoon_memory_failure - notified by memory failure, try to fix the
> + * corrupted page.
> + * The memory failure notifier is designed to call back when the kernel handled
> + * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
> + * error(memory error handling is a best effort, not 100% coverd).
> + */
> +static int virtballoon_memory_failure(struct notifier_block *notifier,
> + unsigned long pfn, void *parm)
> +{
> + struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
> + memory_failure_nb);
> + struct page *page;
> + struct __virtio_balloon_recover *out_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + page = pfn_to_online_page(pfn);
> + if (WARN_ON_ONCE(!page))
> + return NOTIFY_DONE;
> +
> + if (PageHuge(page))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(!PageHWPoison(page)))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(page_count(page) != 1))
> + return NOTIFY_DONE;

Relying on the page_count to be 1 for correctness is usually a bit
shaky, for example, when racing against isolate_movable_page() that
might temporarily bump up the refcount.

> +
> + get_page(page); /* balloon reference */
> +
> + out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);

Are we always guaranteed to be able to use GFP_KERNEL out of MCE
context? (IOW, are we never atomic?)

> + if (WARN_ON_ONCE(!out_vbr))
> + return NOTIFY_BAD;
> +
> + spin_lock(&vb->recover_page_list_lock);
> + balloon_page_push(&vb->corrupted_page_list, page);
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + out_vbr->vbr.cmd = VIRTIO_BALLOON_R_CMD_RECOVER;

This makes me wonder if we should have a more generic guest->host
request queue, similar to what e.g., virtio-mem uses, instead of adding
a separate VIRTIO_BALLOON_VQ_RECOVER vq.

> + set_page_pfns(vb, out_vbr->pfns, page);
> + sg_init_one(&sg, out_vbr, sizeof(*out_vbr));
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + err = virtqueue_add_outbuf(vb->recover_vq, &sg, 1, out_vbr, GFP_KERNEL);
> + if (unlikely(err)) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + return NOTIFY_DONE;
> + }
> + virtqueue_kick(vb->recover_vq);
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +
> + return NOTIFY_OK;
> +}
> +
> +static int recover_vq_get_response(struct virtio_balloon *vb)
> +{
> + struct __virtio_balloon_recover *in_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + in_vbr = &vb->in_vbr;
> + memset(in_vbr, 0x00, sizeof(*in_vbr));
> + sg_init_one(&sg, in_vbr, sizeof(*in_vbr));
> + err = virtqueue_add_inbuf(vb->recover_vq, &sg, 1, in_vbr, GFP_KERNEL);
> + if (unlikely(err)) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + return err;
> + }
> +
> + virtqueue_kick(vb->recover_vq);
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +
> + return 0;
> +}
> +
> +static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
> +{
> + struct __virtio_balloon_recover *in_vbr;
> + struct virtio_balloon_recover *vbr;
> + struct page *page;
> + unsigned int pfns;
> + u32 pfn0, pfn1;
> + __u8 status;
> +
> + /* the response is not expected */
> + if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
> + return;
> +
> + in_vbr = &vb->in_vbr;
> + vbr = &in_vbr->vbr;
> + if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
> + return;
> +
> + /* to make sure the contiguous balloon PFNs */
> + for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
> + pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
> + if (pfn1 - pfn0 != 1)
> + return;

Yeah, we really shouldn't be dealing with (legacy) 4k PFNs here, but
instead with proper ranges, I guess.

> + }
> +
> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
> + if (!pfn_valid(pfn0))
> + return;
> +
> + pfn1 = -1;
> + spin_lock(&vb->recover_page_list_lock);
> + list_for_each_entry(page, &vb->corrupted_page_list, lru) {
> + pfn1 = page_to_pfn(page);
> + if (pfn1 == pfn0)
> + break;
> + }
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + status = vbr->status;
> + switch (status) {
> + case VIRTIO_BALLOON_R_STATUS_RECOVERED:
> + if (pfn1 == pfn0) {
> + spin_lock(&vb->recover_page_list_lock);
> + list_del(&page->lru);
> + balloon_page_push(&vb->recovered_page_list, page);

We'd rather not reuse actual balloon functions in a !balloon context.
Just move the page to the proper list directly.

> + spin_unlock(&vb->recover_page_list_lock);
> + queue_work(system_freezable_wq, &vb->unpoison_memory_work);
> + dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);

Well, not yet. Shouldn't this go into unpoison_memory_func() ?

> + }
> + break;
> + case VIRTIO_BALLOON_R_STATUS_FAILED:
> + /* the hypervisor can't fix this corrupted page, balloon puts page */
> + if (pfn1 == pfn0) {
> + spin_lock(&vb->recover_page_list_lock);
> + list_del(&page->lru);
> + spin_unlock(&vb->recover_page_list_lock);
> + put_page(page);
> + dev_info_ratelimited(&vb->vdev->dev, "failed to recover pfn 0x%x", pfn0);
> + }
> + default:
> + break;
> + };
> +
> + /* continue to get response from host side if the response gets handled successfully */
> + recover_vq_get_response(vb);
> +}
> +
> +static void unpoison_memory_func(struct work_struct *work)
> +{
> + struct virtio_balloon *vb;
> + struct page *page;
> +
> + vb = container_of(work, struct virtio_balloon, unpoison_memory_work);
> +
> + do {
> + spin_lock(&vb->recover_page_list_lock);
> + page = list_first_entry_or_null(&vb->recovered_page_list,
> + struct page, lru);
> + if (page)
> + list_del(&page->lru);
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + if (page) {
> + put_page(page);
> + unpoison_memory(page_to_pfn(page), true, "virtio-balloon");
> + }
> + } while (page);
> +}
> +
> +static void recover_vq_cb(struct virtqueue *vq)
> +{
> + struct virtio_balloon *vb = vq->vdev->priv;
> + struct __virtio_balloon_recover *vbr;
> + unsigned long flags;
> + unsigned int len;
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + do {
> + virtqueue_disable_cb(vq);
> + while ((vbr = virtqueue_get_buf(vq, &len)) != NULL) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + if (vbr == &vb->in_vbr)
> + recover_vq_handle_response(vb, len);
> + else
> + kfree(vbr); /* just free the memory for out vbr request */
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + }
> + } while (!virtqueue_enable_cb(vq));
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +}
> +


[...]

>
> +out_unregister_reporting:
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> + page_reporting_unregister(&vb->pr_dev_info);
> out_unregister_oom:
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> unregister_oom_notifier(&vb->oom_nb);
> @@ -1082,6 +1319,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
> destroy_workqueue(vb->balloon_wq);
> }
>
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {

Could the notifier already have been triggered, so that we might be
using the device from the notifier before it is fully initialized, and
might end up leaking memory here that we allocated?

> + unregister_memory_failure_notifier(&vb->memory_failure_nb);
> + cancel_work_sync(&vb->unpoison_memory_work);
> + }
> +

Could we be leaking memory from the virtballoon_remove() path?

--
Thanks,

David / dhildenb