Re: WARNING in __mmdrop

From: Michael S. Tsirkin
Date: Sun Jul 21 2019 - 08:19:25 EST


On Sun, Jul 21, 2019 at 06:02:52AM -0400, Michael S. Tsirkin wrote:
> On Sat, Jul 20, 2019 at 03:08:00AM -0700, syzbot wrote:
> > syzbot has bisected this bug to:
> >
> > commit 7f466032dc9e5a61217f22ea34b2df932786bbfc
> > Author: Jason Wang <jasowang@xxxxxxxxxx>
> > Date: Fri May 24 08:12:18 2019 +0000
> >
> > vhost: access vq metadata through kernel virtual address
> >
> > bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=149a8a20600000
> > start commit: 6d21a41b Add linux-next specific files for 20190718
> > git tree: linux-next
> > final crash: https://syzkaller.appspot.com/x/report.txt?x=169a8a20600000
> > console output: https://syzkaller.appspot.com/x/log.txt?x=129a8a20600000
> > kernel config: https://syzkaller.appspot.com/x/.config?x=3430a151e1452331
> > dashboard link: https://syzkaller.appspot.com/bug?extid=e58112d71f77113ddb7b
> > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=10139e68600000
> >
> > Reported-by: syzbot+e58112d71f77113ddb7b@xxxxxxxxxxxxxxxxxxxxxxxxx
> > Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
> >
> > For information about bisection process see: https://goo.gl/tpsmEJ#bisection
>
>
> OK, I poked at this for a bit and I see several things that
> we need to fix, though I'm not yet sure they're the reason for
> the failures:
>
>
> 1. mmu_notifier_register shouldn't be called from vhost_vring_set_num_addr.
> That's just a bad hack; in particular, I don't think the device
> mutex is taken there, so poking at two VQs in parallel can corrupt
> memory.
> So what to do? How about a per-VQ notifier?
> Of course we also have synchronize_rcu
> in the notifier, which is slow and is now going to be called twice.
> I think call_rcu would be more appropriate here.
> We then need an rcu_barrier on module unload.
> OTOH if we make the page array linear with the map, then we are good
> with kfree_rcu, which is even nicer.
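>
> A minimal sketch of that direction (names are hypothetical, this is
> roughly what the patch below does): embed the rcu_head and the page
> array in the map so a single kfree_rcu frees everything, and make
> module unload wait for pending callbacks:
>
> struct vhost_map {
> 	int npages;
> 	void *addr;
> 	struct rcu_head head;
> 	struct page *pages[];	/* tail-allocated together with the map */
> };
>
> static void vhost_drop_map(struct vhost_virtqueue *vq, int index)
> {
> 	struct vhost_map *map;
>
> 	map = rcu_dereference_protected(vq->maps[index],
> 					lockdep_is_held(&vq->mmu_lock));
> 	if (!map)
> 		return;
> 	rcu_assign_pointer(vq->maps[index], NULL);
> 	/* map and pages[] are freed together after a grace period */
> 	kfree_rcu(map, head);
> }
>
> static void __exit vhost_exit(void)
> {
> 	rcu_barrier();	/* flush pending kfree_rcu callbacks on unload */
> }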
>
> 2. Doesn't the map leak after vhost_map_unprefetch?
> And why does it poke at the contents of the map?
> No one should be using it at that point, right?
>
> 3. notifier unregister happens last in vhost_dev_cleanup,
> but register happens first. This looks wrong to me.
>
> 4. OK, so we use the invalidate count to try to detect that
> some invalidation is in progress.
> I am not 100% sure why we care.
> Assuming we do, uaddr can change between the start and end
> callbacks, and then the counter can go negative, or generally
> get out of sync.
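>
> Concretely (hypothetical addresses): a _start covering 0x1000-0x2000
> overlaps uaddr and increments the counter; VHOST_SET_VRING_ADDR then
> moves uaddr away; the matching _end sees no overlap, skips the
> decrement, and the counter stays elevated. With the moves reversed,
> _end fires without a matching _start and the counter goes negative.
> Counting unconditionally keeps every _start paired with its _end;
> roughly (a sketch, not the exact callbacks):
>
> static void vq_invalidate_start(struct vhost_virtqueue *vq)
> {
> 	spin_lock(&vq->mmu_lock);
> 	++vq->invalidate_count;	/* no overlap check: always count */
> 	spin_unlock(&vq->mmu_lock);
> }
>
> static void vq_invalidate_end(struct vhost_virtqueue *vq)
> {
> 	spin_lock(&vq->mmu_lock);
> 	--vq->invalidate_count;	/* always balances the start above */
> 	spin_unlock(&vq->mmu_lock);
> }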
>
> So what to do about all this?
> I am inclined to say let's just drop the uaddr overlap optimization
> for now; KVM, e.g., invalidates unconditionally.
> Point 3 should be fixed independently.


The patch below implements this, but it is only build-tested.
Jason, pls take a look. If you like the approach, feel
free to take it from here.

One thing the patch below does not have is any kind of rate-limiting.
Given how easy it is to trigger a restart, I think it makes sense
to add generic infrastructure for this.
That can be a separate patch, I guess.
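
Roughly the kind of thing I mean, as a sketch only (reusing
include/linux/ratelimit.h; the placement, names and numbers here are
hypothetical):

static DEFINE_RATELIMIT_STATE(vhost_prefetch_rs, HZ, 100);

static void vhost_vq_maybe_prefetch(struct vhost_virtqueue *vq)
{
	/* allow at most 100 prefetch restarts per second */
	if (__ratelimit(&vhost_prefetch_rs))
		vhost_vq_map_prefetch(vq);
	/* otherwise stay on the copy_{to,from}_user() fallback path */
}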

Signed-off-by: Michael S. Tsirkin <mst@xxxxxxxxxx>


diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0536f8526359..1d89715af89d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -299,53 +299,30 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
}

#if VHOST_ARCH_CAN_ACCEL_UACCESS
-static void vhost_map_unprefetch(struct vhost_map *map)
-{
- kfree(map->pages);
- map->pages = NULL;
- map->npages = 0;
- map->addr = NULL;
-}
-
-static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
+static void __vhost_cleanup_vq_maps(struct vhost_virtqueue *vq)
{
struct vhost_map *map[VHOST_NUM_ADDRS];
int i;

- spin_lock(&vq->mmu_lock);
for (i = 0; i < VHOST_NUM_ADDRS; i++) {
map[i] = rcu_dereference_protected(vq->maps[i],
lockdep_is_held(&vq->mmu_lock));
- if (map[i])
+ if (map[i]) {
+ if (vq->uaddrs[i].write) {
+ int j;
+
+ for (j = 0; j < map[i]->npages; j++)
+ set_page_dirty(map[i]->pages[j]);
+ }
rcu_assign_pointer(vq->maps[i], NULL);
+ kfree_rcu(map[i], head);
+ }
}
+}
+
+static void vhost_cleanup_vq_maps(struct vhost_virtqueue *vq)
+{
+ spin_lock(&vq->mmu_lock);
+ __vhost_cleanup_vq_maps(vq);
spin_unlock(&vq->mmu_lock);
-
- synchronize_rcu();
-
- for (i = 0; i < VHOST_NUM_ADDRS; i++)
- if (map[i])
- vhost_map_unprefetch(map[i]);
-
-}
-
-static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
-{
- int i;
-
- vhost_uninit_vq_maps(vq);
- for (i = 0; i < VHOST_NUM_ADDRS; i++)
- vq->uaddrs[i].size = 0;
-}
-
-static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
- unsigned long start,
- unsigned long end)
-{
- if (unlikely(!uaddr->size))
- return false;
-
- return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
}

static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
@@ -353,31 +330,11 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
unsigned long start,
unsigned long end)
{
- struct vhost_uaddr *uaddr = &vq->uaddrs[index];
- struct vhost_map *map;
- int i;
-
- if (!vhost_map_range_overlap(uaddr, start, end))
- return;
-
spin_lock(&vq->mmu_lock);
++vq->invalidate_count;

- map = rcu_dereference_protected(vq->maps[index],
- lockdep_is_held(&vq->mmu_lock));
- if (map) {
- if (uaddr->write) {
- for (i = 0; i < map->npages; i++)
- set_page_dirty(map->pages[i]);
- }
- rcu_assign_pointer(vq->maps[index], NULL);
- }
+ __vhost_cleanup_vq_maps(vq);
spin_unlock(&vq->mmu_lock);
-
- if (map) {
- synchronize_rcu();
- vhost_map_unprefetch(map);
- }
}

static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
@@ -385,9 +342,6 @@ static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
unsigned long start,
unsigned long end)
{
- if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
- return;
-
spin_lock(&vq->mmu_lock);
--vq->invalidate_count;
spin_unlock(&vq->mmu_lock);
@@ -483,7 +437,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->invalidate_count = 0;
__vhost_vq_meta_reset(vq);
#if VHOST_ARCH_CAN_ACCEL_UACCESS
- vhost_reset_vq_maps(vq);
+ vhost_cleanup_vq_maps(vq);
#endif
}

@@ -833,6 +787,7 @@ static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
size_t size, bool write)
{
struct vhost_uaddr *addr = &vq->uaddrs[index];
+ lockdep_assert_held(&vq->mmu_lock);

addr->uaddr = uaddr;
addr->size = size;
@@ -841,6 +796,8 @@ static void vhost_setup_uaddr(struct vhost_virtqueue *vq,

static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
{
+ spin_lock(&vq->mmu_lock);
+
vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
(unsigned long)vq->desc,
vhost_get_desc_size(vq, vq->num),
@@ -853,6 +810,8 @@ static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
(unsigned long)vq->used,
vhost_get_used_size(vq, vq->num),
true);
+
+ spin_unlock(&vq->mmu_lock);
}

static int vhost_map_prefetch(struct vhost_virtqueue *vq,
@@ -874,13 +833,11 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq,
goto err;

err = -ENOMEM;
- map = kmalloc(sizeof(*map), GFP_ATOMIC);
+ map = kmalloc(sizeof(*map) + sizeof(*map->pages) * npages, GFP_ATOMIC);
if (!map)
goto err;

- pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
- if (!pages)
- goto err_pages;
+ pages = map->pages;

err = EFAULT;
npinned = __get_user_pages_fast(uaddr->uaddr, npages,
@@ -907,7 +864,6 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq,

map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
map->npages = npages;
- map->pages = pages;

rcu_assign_pointer(vq->maps[index], map);
/* No need for a synchronize_rcu(). This function should be
@@ -919,8 +875,6 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq,
return 0;

err_gup:
- kfree(pages);
-err_pages:
kfree(map);
err:
spin_unlock(&vq->mmu_lock);
@@ -942,6 +896,10 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
vhost_vq_reset(dev, dev->vqs[i]);
}
vhost_dev_free_iovecs(dev);
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+ if (dev->mm)
+ mmu_notifier_unregister(&dev->mmu_notifier, dev->mm);
+#endif
if (dev->log_ctx)
eventfd_ctx_put(dev->log_ctx);
dev->log_ctx = NULL;
@@ -957,16 +915,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
kthread_stop(dev->worker);
dev->worker = NULL;
}
- if (dev->mm) {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
- mmu_notifier_unregister(&dev->mmu_notifier, dev->mm);
-#endif
+ if (dev->mm)
mmput(dev->mm);
- }
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
- for (i = 0; i < dev->nvqs; i++)
- vhost_uninit_vq_maps(dev->vqs[i]);
-#endif
dev->mm = NULL;
}
EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
@@ -1426,7 +1376,7 @@ static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
if (likely(map)) {
avail = map->addr;
- *event = (__virtio16)avail->ring[vq->num];
+ *event = avail->ring[vq->num];
rcu_read_unlock();
return 0;
}
@@ -1830,6 +1780,8 @@ static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
struct vhost_map __rcu *map;
int i;

+ vhost_setup_vq_uaddr(vq);
+
for (i = 0; i < VHOST_NUM_ADDRS; i++) {
rcu_read_lock();
map = rcu_dereference(vq->maps[i]);
@@ -1838,6 +1790,10 @@ static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
vhost_map_prefetch(vq, i);
}
}
+#else
+static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
+{
+}
#endif

int vq_meta_prefetch(struct vhost_virtqueue *vq)
@@ -1845,9 +1801,7 @@ int vq_meta_prefetch(struct vhost_virtqueue *vq)
unsigned int num = vq->num;

if (!vq->iotlb) {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
vhost_vq_map_prefetch(vq);
-#endif
return 1;
}

@@ -2060,16 +2014,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,

mutex_lock(&vq->mutex);

-#if VHOST_ARCH_CAN_ACCEL_UACCESS
- /* Unregister MMU notifer to allow invalidation callback
- * can access vq->uaddrs[] without holding a lock.
- */
- if (d->mm)
- mmu_notifier_unregister(&d->mmu_notifier, d->mm);
-
- vhost_uninit_vq_maps(vq);
-#endif
-
switch (ioctl) {
case VHOST_SET_VRING_NUM:
r = vhost_vring_set_num(d, vq, argp);
@@ -2081,13 +2025,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
BUG();
}

-#if VHOST_ARCH_CAN_ACCEL_UACCESS
- vhost_setup_vq_uaddr(vq);
-
- if (d->mm)
- mmu_notifier_register(&d->mmu_notifier, d->mm);
-#endif
-
mutex_unlock(&vq->mutex);

return r;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 819296332913..584bb13c4d6d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -86,7 +86,8 @@ enum vhost_uaddr_type {
struct vhost_map {
int npages;
void *addr;
- struct page **pages;
+ struct rcu_head head;
+ struct page *pages[];
};

struct vhost_uaddr {