Re: [PATCH net V2] vhost: log dirty page correctly

From: Jason Wang
Date: Thu Jan 10 2019 - 07:37:28 EST



On 2019/1/9 äå10:25, Michael S. Tsirkin wrote:
On Wed, Jan 09, 2019 at 03:29:47PM +0800, Jason Wang wrote:
Vhost dirty page logging API is designed to sync through GPA. But we
try to log GIOVA when device IOTLB is enabled. This is wrong and may
lead to missing data after migration.

To solve this issue, when logging with device IOTLB enabled, we will:

1) reuse the device IOTLB translation result of GIOVA->HVA mapping to
get HVA, for writable descriptor, get HVA through iovec. For used
ring update, translate its GIOVA to HVA
2) traverse the GPA->HVA mapping to get the possible GPA and log
through GPA. Pay attention this reverse mapping is not guaranteed
to be unique, so we should log each possible GPA in this case.

This fix the failure of scp to guest during migration. In -next, we
will probably support passing GIOVA->GPA instead of GIOVA->HVA.

Fixes: 6b1e6cc7855b ("vhost: new device IOTLB API")
Reported-by: Jintack Lim <jintack@xxxxxxxxxxxxxxx>
Cc: Jintack Lim <jintack@xxxxxxxxxxxxxxx>
Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
---
The patch is needed for stable.
Changes from V1:
- return error instead of warn
---
drivers/vhost/net.c | 3 +-
drivers/vhost/vhost.c | 82 +++++++++++++++++++++++++++++++++++--------
drivers/vhost/vhost.h | 3 +-
3 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 36f3d0f49e60..bca86bf7189f 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1236,7 +1236,8 @@ static void handle_rx(struct vhost_net *net)
if (nvq->done_idx > VHOST_NET_BATCH)
vhost_net_signal_used(nvq);
if (unlikely(vq_log))
- vhost_log_write(vq, vq_log, log, vhost_len);
+ vhost_log_write(vq, vq_log, log, vhost_len,
+ vq->iov, in);
total_len += vhost_len;
if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 9f7942cbcbb2..ee095f08ffd4 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1733,11 +1733,70 @@ static int log_write(void __user *log_base,
return r;
}
+static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
+{
+ struct vhost_umem *umem = vq->umem;
+ struct vhost_umem_node *u;
+ u64 gpa;
+ int r;
+ bool hit = false;
+
+ list_for_each_entry(u, &umem->umem_list, link) {
+ if (u->userspace_addr < hva &&
+ u->userspace_addr + u->size >=
+ hva + len) {
So this tries to see that the GPA range is completely within
the GVA region. Does this have to be the case?


You mean e.g a buffer that crosses the boundary of two memory regions?


And if yes why not return 0 below instead of hit = true?


I think it's safe but not sure for the case like two GPAs can map to same HVA?


I'm also a bit concerned about overflow when addr + len is on a 64 bit
boundary. Why not check add + size - 1 and hva + len - 1 instead?


Let me fix this.




+ gpa = u->start + hva - u->userspace_addr;
+ r = log_write(vq->log_base, gpa, len);
+ if (r < 0)
+ return r;
+ hit = true;
+ }
+ }
+
+ if (!hit)
+ return -EFAULT;
+
+ return 0;
+}
+
+static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
+{
+ struct iovec iov[64];
+ int i, ret;
+
+ if (!vq->iotlb)
+ return log_write(vq->log_base, vq->log_addr + used_offset, len);
+
+ ret = translate_desc(vq, (u64)(uintptr_t)vq->used + used_offset,
+ len, iov, 64, VHOST_ACCESS_WO);

We don't need the cast to u64 here do we?

+ if (ret)
+ return ret;
+
+ for (i = 0; i < ret; i++) {
+ ret = log_write_hva(vq, (u64)(uintptr_t)iov[i].iov_base,

We don't need the cast to u64 here do we?

+ iov[i].iov_len);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
- unsigned int log_num, u64 len)
+ unsigned int log_num, u64 len, struct iovec *iov, int count)
{
int i, r;
+ if (vq->iotlb) {
+ for (i = 0; i < count; i++) {
+ r = log_write_hva(vq, (u64)(uintptr_t)iov[i].iov_base,
+ iov[i].iov_len);

We don't need the cast to u64 here do we?


Let me remove the unnecessary u64 cast.



+ if (r < 0)
+ return r;
+ }
+ return 0;
+ }
+
/* Make sure data written is seen before log. */
smp_wmb();
Shouldn't the wmb be before log_write_hva too?


Yes.

Thanks


for (i = 0; i < log_num; ++i) {
@@ -1769,9 +1828,8 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
smp_wmb();
/* Log used flag write. */
used = &vq->used->flags;
- log_write(vq->log_base, vq->log_addr +
- (used - (void __user *)vq->used),
- sizeof vq->used->flags);
+ log_used(vq, (used - (void __user *)vq->used),
+ sizeof vq->used->flags);
if (vq->log_ctx)
eventfd_signal(vq->log_ctx, 1);
}
@@ -1789,9 +1847,8 @@ static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
smp_wmb();
/* Log avail event write */
used = vhost_avail_event(vq);
- log_write(vq->log_base, vq->log_addr +
- (used - (void __user *)vq->used),
- sizeof *vhost_avail_event(vq));
+ log_used(vq, (used - (void __user *)vq->used),
+ sizeof *vhost_avail_event(vq));
if (vq->log_ctx)
eventfd_signal(vq->log_ctx, 1);
}
@@ -2191,10 +2248,8 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
/* Make sure data is seen before log. */
smp_wmb();
/* Log used ring entry write. */
- log_write(vq->log_base,
- vq->log_addr +
- ((void __user *)used - (void __user *)vq->used),
- count * sizeof *used);
+ log_used(vq, ((void __user *)used - (void __user *)vq->used),
+ count * sizeof *used);
}
old = vq->last_used_idx;
new = (vq->last_used_idx += count);
@@ -2236,9 +2291,8 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
/* Make sure used idx is seen before log. */
smp_wmb();
/* Log used index update. */
- log_write(vq->log_base,
- vq->log_addr + offsetof(struct vring_used, idx),
- sizeof vq->used->idx);
+ log_used(vq, offsetof(struct vring_used, idx),
+ sizeof vq->used->idx);
if (vq->log_ctx)
eventfd_signal(vq->log_ctx, 1);
}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 466ef7542291..1b675dad5e05 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -205,7 +205,8 @@ bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
- unsigned int log_num, u64 len);
+ unsigned int log_num, u64 len,
+ struct iovec *iov, int count);
int vq_iotlb_prefetch(struct vhost_virtqueue *vq);
struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type);
--
2.17.1