[PATCH RFC] vhost: basic device IOTLB support

From: Jason Wang
Date: Thu Dec 31 2015 - 02:14:06 EST


This patch tries to implement an device IOTLB for vhost. This could be
used with for co-operation with userspace(qemu) implementation of
iommu for a secure DMA environment in guest.

The idea is simple. When vhost meets an IOTLB miss, it will request
the assistance of userspace to do the translation, this is done
through:

- Fill the translation request in a preset userspace address (This
address is set through ioctl VHOST_SET_IOTLB_REQUEST_ENTRY).
- Notify userspace through eventfd (This eventfd was set through ioctl
VHOST_SET_IOTLB_FD).

When userspace finishes the translation, it will update the vhost
IOTLB through VHOST_UPDATE_IOTLB ioctl. Userspace is also in charge of
snooping the IOTLB invalidation of IOMMU IOTLB and use
VHOST_UPDATE_IOTLB to invalidate the possible entry in vhost.

For simplicity, IOTLB was implemented with a simple hash array. The
index were calculated from IOVA page frame number which can only works
at PAGE_SIZE level.

An qemu implementation (for reference) is available at:
git@xxxxxxxxxx:jasowang/qemu.git iommu

TODO & Known issues:

- read/write permission validation was not implemented.
- no feature negotiation.
- VHOST_SET_MEM_TABLE is not reused (maybe there's a chance).
- working at PAGE_SIZE level, don't support large mappings.
- better data structure for IOTLB instead of simple hash array.
- better API, e.g using mmap() instead of preset userspace address.

Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
---
drivers/vhost/net.c | 2 +-
drivers/vhost/vhost.c | 190 ++++++++++++++++++++++++++++++++++++++++++++-
drivers/vhost/vhost.h | 13 ++++
include/uapi/linux/vhost.h | 26 +++++++
4 files changed, 229 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9eda69e..a172be9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1083,7 +1083,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
r = vhost_dev_ioctl(&n->dev, ioctl, argp);
if (r == -ENOIOCTLCMD)
r = vhost_vring_ioctl(&n->dev, ioctl, argp);
- else
+ else if (ioctl != VHOST_UPDATE_IOTLB)
vhost_net_flush(n);
mutex_unlock(&n->dev.mutex);
return r;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index eec2f11..729fe05 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -113,6 +113,11 @@ static void vhost_init_is_le(struct vhost_virtqueue *vq)
}
#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */

+static inline int vhost_iotlb_hash(u64 iova)
+{
+ return (iova >> PAGE_SHIFT) & (VHOST_IOTLB_SIZE - 1);
+}
+
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
@@ -384,8 +389,14 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->memory = NULL;
dev->mm = NULL;
spin_lock_init(&dev->work_lock);
+ spin_lock_init(&dev->iotlb_lock);
+ mutex_init(&dev->iotlb_req_mutex);
INIT_LIST_HEAD(&dev->work_list);
dev->worker = NULL;
+ dev->iotlb_request = NULL;
+ dev->iotlb_ctx = NULL;
+ dev->iotlb_file = NULL;
+ dev->pending_request.flags.type = VHOST_IOTLB_INVALIDATE;

for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
@@ -393,12 +404,17 @@ void vhost_dev_init(struct vhost_dev *dev,
vq->indirect = NULL;
vq->heads = NULL;
vq->dev = dev;
+ vq->iotlb_request = NULL;
mutex_init(&vq->mutex);
vhost_vq_reset(dev, vq);
if (vq->handle_kick)
vhost_poll_init(&vq->poll, vq->handle_kick,
POLLIN, dev);
}
+
+ init_completion(&dev->iotlb_completion);
+ for (i = 0; i < VHOST_IOTLB_SIZE; i++)
+ dev->iotlb[i].flags.valid = VHOST_IOTLB_INVALID;
}
EXPORT_SYMBOL_GPL(vhost_dev_init);

@@ -940,9 +956,10 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
struct file *eventfp, *filep = NULL;
struct eventfd_ctx *ctx = NULL;
+ struct vhost_iotlb_entry entry;
u64 p;
long r;
- int i, fd;
+ int index, i, fd;

/* If you are not the owner, you can become one */
if (ioctl == VHOST_SET_OWNER) {
@@ -1008,6 +1025,80 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
if (filep)
fput(filep);
break;
+ case VHOST_SET_IOTLB_FD:
+ r = get_user(fd, (int __user *)argp);
+ if (r < 0)
+ break;
+ eventfp = fd == -1 ? NULL : eventfd_fget(fd);
+ if (IS_ERR(eventfp)) {
+ r = PTR_ERR(eventfp);
+ break;
+ }
+ if (eventfp != d->iotlb_file) {
+ filep = d->iotlb_file;
+ d->iotlb_file = eventfp;
+ ctx = d->iotlb_ctx;
+ d->iotlb_ctx = eventfp ?
+ eventfd_ctx_fileget(eventfp) : NULL;
+ } else
+ filep = eventfp;
+ for (i = 0; i < d->nvqs; ++i) {
+ mutex_lock(&d->vqs[i]->mutex);
+ d->vqs[i]->iotlb_ctx = d->iotlb_ctx;
+ mutex_unlock(&d->vqs[i]->mutex);
+ }
+ if (ctx)
+ eventfd_ctx_put(ctx);
+ if (filep)
+ fput(filep);
+ break;
+ case VHOST_SET_IOTLB_REQUEST_ENTRY:
+ if (!access_ok(VERIFY_READ, argp, sizeof(*d->iotlb_request)))
+ return -EFAULT;
+ if (!access_ok(VERIFY_WRITE, argp, sizeof(*d->iotlb_request)))
+ return -EFAULT;
+ d->iotlb_request = argp;
+ for (i = 0; i < d->nvqs; ++i) {
+ mutex_lock(&d->vqs[i]->mutex);
+ d->vqs[i]->iotlb_request = argp;
+ mutex_unlock(&d->vqs[i]->mutex);
+ }
+ break;
+ case VHOST_UPDATE_IOTLB:
+ r = copy_from_user(&entry, argp, sizeof(entry));
+ if (r < 0) {
+ r = -EFAULT;
+ goto done;
+ }
+
+ index = vhost_iotlb_hash(entry.iova);
+
+ spin_lock(&d->iotlb_lock);
+ switch (entry.flags.type) {
+ case VHOST_IOTLB_UPDATE:
+ d->iotlb[index] = entry;
+ break;
+ case VHOST_IOTLB_INVALIDATE:
+ if (d->iotlb[index].iova == entry.iova)
+ d->iotlb[index] = entry;
+ break;
+ default:
+ r = -EINVAL;
+ }
+ spin_unlock(&d->iotlb_lock);
+
+ if (!r && entry.flags.type != VHOST_IOTLB_INVALIDATE) {
+ mutex_lock(&d->iotlb_req_mutex);
+ if (entry.iova == d->pending_request.iova &&
+ d->pending_request.flags.type ==
+ VHOST_IOTLB_MISS) {
+ d->pending_request = entry;
+ complete(&d->iotlb_completion);
+ }
+ mutex_unlock(&d->iotlb_req_mutex);
+ }
+
+ break;
default:
r = -ENOIOCTLCMD;
break;
@@ -1177,9 +1268,104 @@ int vhost_init_used(struct vhost_virtqueue *vq)
}
EXPORT_SYMBOL_GPL(vhost_init_used);

+static struct vhost_iotlb_entry vhost_iotlb_miss(struct vhost_virtqueue *vq,
+ u64 iova)
+{
+ struct completion *c = &vq->dev->iotlb_completion;
+ struct vhost_iotlb_entry *pending = &vq->dev->pending_request;
+ struct vhost_iotlb_entry entry = {
+ .flags.valid = VHOST_IOTLB_INVALID,
+ };
+
+ mutex_lock(&vq->dev->iotlb_req_mutex);
+
+ if (!vq->iotlb_ctx)
+ goto err;
+
+ if (!vq->dev->iotlb_request)
+ goto err;
+
+ if (pending->flags.type == VHOST_IOTLB_MISS)
+ goto err;
+
+ pending->iova = iova & PAGE_MASK;
+ pending->flags.type = VHOST_IOTLB_MISS;
+
+ if (copy_to_user(vq->dev->iotlb_request, pending,
+ sizeof(struct vhost_iotlb_entry))) {
+ goto err;
+ }
+
+ mutex_unlock(&vq->dev->iotlb_req_mutex);
+
+ eventfd_signal(vq->iotlb_ctx, 1);
+ wait_for_completion_interruptible(c);
+
+ mutex_lock(&vq->dev->iotlb_req_mutex);
+ entry = vq->dev->pending_request;
+ mutex_unlock(&vq->dev->iotlb_req_mutex);
+
+ return entry;
+err:
+ mutex_unlock(&vq->dev->iotlb_req_mutex);
+ return entry;
+}
+
+static int translate_iotlb(struct vhost_virtqueue *vq, u64 iova, u32 len,
+ struct iovec iov[], int iov_size)
+{
+ struct vhost_iotlb_entry *entry;
+ struct vhost_iotlb_entry miss;
+ struct vhost_dev *dev = vq->dev;
+ int ret = 0;
+ u64 s = 0, size;
+
+ spin_lock(&dev->iotlb_lock);
+
+ while ((u64) len > s) {
+ if (unlikely(ret >= iov_size)) {
+ ret = -ENOBUFS;
+ break;
+ }
+ entry = &vq->dev->iotlb[vhost_iotlb_hash(iova)];
+ if ((entry->iova != (iova & PAGE_MASK)) ||
+ (entry->flags.valid != VHOST_IOTLB_VALID)) {
+
+ spin_unlock(&dev->iotlb_lock);
+ miss = vhost_iotlb_miss(vq, iova);
+ spin_lock(&dev->iotlb_lock);
+
+ if (miss.flags.valid != VHOST_IOTLB_VALID ||
+ miss.iova != (iova & PAGE_MASK)) {
+ ret = -EFAULT;
+ goto err;
+ }
+ entry = &miss;
+ }
+
+ if (entry->iova == (iova & PAGE_MASK)) {
+ size = entry->userspace_addr + entry->size - iova;
+ iov[ret].iov_base =
+ (void __user *)(entry->userspace_addr +
+ (iova & (PAGE_SIZE - 1)));
+ iov[ret].iov_len = min((u64)len - s, size);
+ s += size;
+ iova += size;
+ ret++;
+ } else {
+ BUG();
+ }
+ }
+
+err:
+ spin_unlock(&dev->iotlb_lock);
+ return ret;
+}
+
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
struct iovec iov[], int iov_size)
{
+#if 0
const struct vhost_memory_region *reg;
struct vhost_memory *mem;
struct iovec *_iov;
@@ -1209,6 +1395,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
}

return ret;
+#endif
+ return translate_iotlb(vq, addr, len, iov, iov_size);
}

/* Each buffer in the virtqueues is actually a chain of descriptors. This
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d3f7674..d254efc 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -68,6 +68,8 @@ struct vhost_virtqueue {
struct eventfd_ctx *call_ctx;
struct eventfd_ctx *error_ctx;
struct eventfd_ctx *log_ctx;
+ struct eventfd_ctx *iotlb_ctx;
+ struct vhost_iotlb __user *iotlb_request;

struct vhost_poll poll;

@@ -116,6 +118,8 @@ struct vhost_virtqueue {
#endif
};

+#define VHOST_IOTLB_SIZE 1024
+
struct vhost_dev {
struct vhost_memory *memory;
struct mm_struct *mm;
@@ -124,9 +128,18 @@ struct vhost_dev {
int nvqs;
struct file *log_file;
struct eventfd_ctx *log_ctx;
+ struct file *iotlb_file;
+ struct eventfd_ctx *iotlb_ctx;
+ struct mutex iotlb_req_mutex;
+ struct vhost_iotlb_entry __user *iotlb_request;
+ struct vhost_iotlb_entry pending_request;
+ struct completion iotlb_completion;
+ struct vhost_iotlb_entry request;
spinlock_t work_lock;
struct list_head work_list;
struct task_struct *worker;
+ spinlock_t iotlb_lock;
+ struct vhost_iotlb_entry iotlb[VHOST_IOTLB_SIZE];
};

void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index ab373191..400e513 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -63,6 +63,26 @@ struct vhost_memory {
struct vhost_memory_region regions[0];
};

+struct vhost_iotlb_entry {
+ __u64 iova;
+ __u64 size;
+ __u64 userspace_addr;
+ struct {
+#define VHOST_IOTLB_PERM_READ 0x1
+#define VHOST_IOTLB_PERM_WRITE 0x10
+ __u8 perm;
+#define VHOST_IOTLB_MISS 1
+#define VHOST_IOTLB_UPDATE 2
+#define VHOST_IOTLB_INVALIDATE 3
+ __u8 type;
+#define VHOST_IOTLB_INVALID 0x1
+#define VHOST_IOTLB_VALID 0x2
+ __u8 valid;
+ __u8 u8_padding;
+ __u32 padding;
+ } flags;
+};
+
/* ioctls */

#define VHOST_VIRTIO 0xAF
@@ -127,6 +147,12 @@ struct vhost_memory {
/* Set eventfd to signal an error */
#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)

+/* IOTLB */
+/* Specify an eventfd file descriptor to signle on IOTLB miss */
+#define VHOST_SET_IOTLB_FD _IOW(VHOST_VIRTIO, 0x23, int)
+#define VHOST_UPDATE_IOTLB _IOW(VHOST_VIRTIO, 0x24, struct vhost_iotlb_entry)
+#define VHOST_SET_IOTLB_REQUEST_ENTRY _IOW(VHOST_VIRTIO, 0x25, struct vhost_iotlb_entry)
+
/* VHOST_NET specific defines */

/* Attach virtio net ring to a raw socket, or tap device.
--
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/