[PATCH 1/1] Add vhost_blk driver

From: Vitaly Mayatskikh
Date: Fri Nov 02 2018 - 14:21:49 EST


This driver accelerates host side of virtio-blk.

Signed-off-by: Vitaly Mayatskikh <v.mayatskih@xxxxxxxxx>
---
drivers/vhost/Kconfig | 13 ++
drivers/vhost/Makefile | 3 +
drivers/vhost/blk.c | 510 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 526 insertions(+)
create mode 100644 drivers/vhost/blk.c

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index b580885243f7..c4980d6af0ea 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -53,3 +53,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
adds some overhead, it is disabled by default.

If unsure, say "N".
+
+config VHOST_BLK
+ tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
+ depends on BLOCK && EVENTFD
+ select VHOST
+ default n
+ help
+ This kernel module can be loaded in host kernel to accelerate
+ guest block with virtio_blk. Not to be confused with virtio_blk
+ module itself which needs to be loaded in guest kernel.
+
+ To compile this driver as a module, choose M here: the module will
+ be called vhost_blk.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 6c6df24f770c..c8be36cd9214 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -8,6 +8,9 @@ vhost_scsi-y := scsi.o
obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o
vhost_vsock-y := vsock.o

+obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
+vhost_blk-y := blk.o
+
obj-$(CONFIG_VHOST_RING) += vringh.o

obj-$(CONFIG_VHOST) += vhost.o
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
new file mode 100644
index 000000000000..aefb9a61fa0f
--- /dev/null
+++ b/drivers/vhost/blk.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 IBM Corporation
+ * Author: Vitaly Mayatskikh <v.mayatskih@xxxxxxxxx>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * virtio-blk server in host kernel.
+ */
+
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/virtio_blk.h>
+#include <linux/vhost.h>
+#include <linux/fs.h>
+#include "vhost.h"
+
+enum {
+ VHOST_BLK_FEATURES =
+ VHOST_FEATURES |
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+ (1ULL << VIRTIO_RING_F_EVENT_IDX) |
+ (1ULL << VIRTIO_BLK_F_MQ)
+};
+
+#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0x50, int)
+
+enum {
+ VHOST_BLK_VQ_MAX = 16,
+ VHOST_BLK_VQ_MAX_REQS = 128,
+};
+
+struct vhost_blk_req {
+ struct llist_node list;
+ int index;
+ struct vhost_blk_queue *q;
+ struct virtio_blk_outhdr hdr;
+ struct iovec *out_iov;
+ struct iovec *in_iov;
+ u8 out_num;
+ u8 in_num;
+ long len;
+ struct kiocb iocb;
+ struct iov_iter i;
+ int res;
+ void __user *status;
+};
+
+struct vhost_blk_queue {
+ int index;
+ struct vhost_blk *blk;
+ struct vhost_virtqueue vq;
+ struct vhost_work w;
+ struct llist_head wl;
+ struct vhost_blk_req req[VHOST_BLK_VQ_MAX_REQS];
+};
+
+struct vhost_blk {
+ struct vhost_dev dev;
+ struct file *backend;
+ int num_queues;
+ struct vhost_virtqueue *vqs[VHOST_BLK_VQ_MAX];
+ struct vhost_blk_queue queue[VHOST_BLK_VQ_MAX];
+};
+
+static void vhost_blk_flush(struct vhost_blk *blk)
+{
+ int i;
+
+ for (i = 0; i < blk->num_queues; i++)
+ vhost_poll_flush(&blk->queue[i].vq.poll);
+}
+
+
+static void vhost_blk_stop(struct vhost_blk *blk)
+{
+ struct vhost_virtqueue *vq;
+ int i;
+
+ for (i = 0; i < blk->num_queues; i++) {
+ vq = &blk->queue[i].vq;
+ mutex_lock(&vq->mutex);
+ rcu_assign_pointer(vq->private_data, NULL);
+ mutex_unlock(&vq->mutex);
+ }
+}
+
+static int vhost_blk_req_done(struct vhost_blk_req *req, unsigned char status)
+{
+ int ret;
+ int len = req->len;
+
+ pr_debug("%s vq[%d] req->index %d status %d len %d\n", __func__,
+ req->q->index, req->index, status, len);
+ ret = put_user(status, (unsigned char __user *)req->status);
+
+ WARN(ret, "%s: vq[%d] req->index %d failed to write status\n", __func__,
+ req->q->index, req->index);
+
+ vhost_add_used(&req->q->vq, req->index, len);
+
+ return ret;
+}
+
+static void vhost_blk_io_done_work(struct vhost_work *w)
+{
+ struct vhost_blk_queue *q = container_of(w, struct vhost_blk_queue, w);
+ struct llist_node *node;
+ struct vhost_blk_req *req, *tmp;
+
+ node = llist_del_all(&q->wl);
+ llist_for_each_entry_safe(req, tmp, node, list) {
+ vhost_blk_req_done(req, req->res);
+ }
+ vhost_signal(&q->blk->dev, &q->vq);
+}
+
+static void vhost_blk_iocb_complete(struct kiocb *iocb, long ret, long ret2)
+{
+ struct vhost_blk_req *req = container_of(iocb, struct vhost_blk_req,
+ iocb);
+
+ pr_debug("%s vq[%d] req->index %d ret %ld ret2 %ld\n", __func__,
+ req->q->index, req->index, ret, ret2);
+
+ req->res = (ret == req->len) ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
+ llist_add(&req->list, &req->q->wl);
+ vhost_vq_work_queue(&req->q->vq, &req->q->w);
+}
+
+static int vhost_blk_req_handle(struct vhost_blk_req *req)
+{
+ struct vhost_blk *blk = req->q->blk;
+ struct vhost_virtqueue *vq = &req->q->vq;
+ int type = le32_to_cpu(req->hdr.type);
+ int ret;
+ u8 status;
+
+ if ((type == VIRTIO_BLK_T_IN) || (type == VIRTIO_BLK_T_OUT)) {
+ bool write = (type == VIRTIO_BLK_T_OUT);
+ int nr_seg = (write ? req->out_num : req->in_num) - 1;
+ unsigned long sector = le64_to_cpu(req->hdr.sector);
+ ssize_t len, rem_len;
+
+ if (!req->q->blk->backend) {
+ vq_err(vq, "blk %p no backend!\n", req->q->blk);
+ ret = -EINVAL;
+ goto out_err;
+ }
+
+ len = iov_length(&vq->iov[1], nr_seg);
+ pr_debug("%s: [pid:%d %s] %s sector %lld, len %ld\n",
+ __func__, current->pid, current->comm,
+ write ? "WRITE" : "READ", req->hdr.sector, len);
+
+ req->len = len;
+ rem_len = len;
+ iov_iter_init(&req->i, (write ? WRITE : READ),
+ write ? &req->out_iov[0] : &req->in_iov[0],
+ nr_seg, len);
+
+ req->iocb.ki_pos = sector << 9;
+ req->iocb.ki_filp = blk->backend;
+ req->iocb.ki_complete = vhost_blk_iocb_complete;
+ req->iocb.ki_flags = IOCB_DIRECT;
+
+ if (write)
+ ret = call_write_iter(blk->backend, &req->iocb,
+ &req->i);
+ else
+ ret = call_read_iter(blk->backend, &req->iocb,
+ &req->i);
+
+ if (ret != -EIOCBQUEUED)
+ vhost_blk_iocb_complete(&req->iocb, ret, 0);
+
+ ret = 0;
+ goto out;
+ }
+
+ if (type == VIRTIO_BLK_T_GET_ID) {
+ char s[] = "vhost_blk";
+ size_t len = min_t(size_t, req->in_iov[0].iov_len,
+ strlen(s));
+
+ ret = copy_to_user(req->in_iov[0].iov_base, s, len);
+ status = ret ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+ if (put_user(status, (unsigned char __user *)req->status)) {
+ ret = -EFAULT;
+ goto out_err;
+ }
+ vhost_add_used_and_signal(&blk->dev, vq, req->index, 1);
+ ret = 0;
+ goto out;
+ } else {
+ pr_warn("Unsupported request type %d\n", type);
+ vhost_discard_vq_desc(vq, 1);
+ ret = -EINVAL;
+ return ret;
+ }
+out_err:
+ vhost_discard_vq_desc(vq, 1);
+out:
+ return ret;
+}
+
+static void vhost_blk_handle_guest_kick(struct vhost_work *work)
+{
+ struct vhost_virtqueue *vq;
+ struct vhost_blk_queue *q;
+ struct vhost_blk *blk;
+ struct vhost_blk_req *req;
+ int in, out;
+ int head;
+
+ vq = container_of(work, struct vhost_virtqueue, poll.work);
+ q = container_of(vq, struct vhost_blk_queue, vq);
+ blk = container_of(vq->dev, struct vhost_blk, dev);
+
+ vhost_disable_notify(&blk->dev, vq);
+ for (;;) {
+ in = out = -1;
+
+ head = vhost_get_vq_desc(vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+
+ if (head < 0)
+ break;
+
+ if (head == vq->num) {
+ if (vhost_enable_notify(&blk->dev, vq)) {
+ vhost_disable_notify(&blk->dev, vq);
+ continue;
+ }
+ break;
+ }
+
+ req = &q->req[head];
+ req->index = head;
+ req->out_num = out;
+ req->in_num = in;
+ req->out_iov = &vq->iov[1];
+ req->in_iov = &vq->iov[out];
+ req->status = vq->iov[out + in - 1].iov_base;
+
+ if (copy_from_user(&req->hdr, vq->iov[0].iov_base,
+ sizeof(req->hdr))) {
+ vq_err(vq, "Failed to get block header!\n");
+ vhost_discard_vq_desc(vq, 1);
+ continue;
+ }
+ if (vhost_blk_req_handle(req) < 0)
+ break;
+ }
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *file)
+{
+ struct vhost_blk *blk;
+ struct vhost_blk_queue *q;
+ int i, j;
+
+ blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
+ if (!blk)
+ return -ENOMEM;
+
+ for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
+ q = &blk->queue[i];
+ q->index = i;
+ q->blk = blk;
+ q->vq.handle_kick = vhost_blk_handle_guest_kick;
+ vhost_work_init(&q->w, vhost_blk_io_done_work);
+ blk->vqs[i] = &q->vq;
+ for (j = 0; j < VHOST_BLK_VQ_MAX_REQS; j++) {
+ q->req[j].index = j;
+ q->req[j].q = q;
+ }
+ }
+ vhost_dev_init(&blk->dev, (struct vhost_virtqueue **)&blk->vqs,
+ VHOST_BLK_VQ_MAX);
+ file->private_data = blk;
+
+ return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+ struct vhost_blk *blk = f->private_data;
+
+ vhost_blk_stop(blk);
+ mutex_lock(&blk->dev.mutex);
+ vhost_blk_flush(blk);
+ vhost_dev_stop(&blk->dev);
+ vhost_dev_cleanup(&blk->dev);
+ vhost_blk_flush(blk);
+
+ if (blk->backend) {
+ fput(blk->backend);
+ blk->backend = NULL;
+ }
+
+ mutex_unlock(&blk->dev.mutex);
+ kvfree(blk);
+
+ return 0;
+}
+
+static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
+{
+ int i;
+ int ret = -EFAULT;
+
+ mutex_lock(&blk->dev.mutex);
+ if ((features & (1 << VHOST_F_LOG_ALL)) &&
+ !vhost_log_access_ok(&blk->dev))
+ goto out_unlock;
+
+ if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
+ if (vhost_init_device_iotlb(&blk->dev, true))
+ goto out_unlock;
+ }
+
+ for (i = 0; i < VHOST_BLK_VQ_MAX; ++i) {
+ struct vhost_virtqueue *vq = blk->vqs[i];
+
+ mutex_lock(&vq->mutex);
+ vq->acked_features = features & VHOST_BLK_FEATURES;
+ mutex_unlock(&vq->mutex);
+ }
+ ret = 0;
+out_unlock:
+ mutex_unlock(&blk->dev.mutex);
+
+ return ret;
+}
+
+static long vhost_blk_reset_owner(struct vhost_blk *blk)
+{
+ long err;
+ struct vhost_umem *umem;
+
+ mutex_lock(&blk->dev.mutex);
+ err = vhost_dev_check_owner(&blk->dev);
+ if (err)
+ goto done;
+ umem = vhost_dev_reset_owner_prepare();
+ if (!umem) {
+ err = -ENOMEM;
+ goto done;
+ }
+ vhost_blk_stop(blk);
+ vhost_blk_flush(blk);
+ vhost_dev_reset_owner(&blk->dev, umem);
+done:
+ mutex_unlock(&blk->dev.mutex);
+ return err;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
+{
+ struct file *backend;
+ int ret, i;
+ struct vhost_virtqueue *vq;
+
+ mutex_lock(&blk->dev.mutex);
+ ret = vhost_dev_check_owner(&blk->dev);
+ if (ret)
+ goto out_dev;
+
+ backend = fget(fd);
+ if (IS_ERR(backend)) {
+ ret = PTR_ERR(backend);
+ goto out_dev;
+ }
+
+ if (backend == blk->backend) {
+ ret = 0;
+ goto out_file;
+ }
+
+ if (blk->backend)
+ fput(blk->backend);
+ blk->backend = backend;
+ for (i = 0; i < blk->num_queues; i++) {
+ vq = &blk->queue[i].vq;
+ if (!vhost_vq_access_ok(vq)) {
+ ret = -EFAULT;
+ goto out_file;
+ }
+ mutex_lock(&vq->mutex);
+ rcu_assign_pointer(vq->private_data, backend);
+ ret = vhost_vq_init_access(vq);
+ mutex_unlock(&vq->mutex);
+ if (ret) {
+ pr_err("vhost_vq_init_access failed: %d\n", ret);
+ goto out_file;
+ }
+
+ }
+ ret = 0;
+ goto out_dev;
+out_file:
+ fput(backend);
+ blk->backend = NULL;
+out_dev:
+ mutex_unlock(&blk->dev.mutex);
+ vhost_blk_flush(blk);
+ return ret;
+}
+
+static long vhost_blk_pass_ioctl(struct vhost_blk *blk, unsigned int ioctl,
+ void __user *argp)
+{
+ long ret;
+
+ mutex_lock(&blk->dev.mutex);
+ ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
+ if (ret == -ENOIOCTLCMD)
+ ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
+ else
+ vhost_blk_flush(blk);
+ mutex_unlock(&blk->dev.mutex);
+ return ret;
+}
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct vhost_blk *blk = f->private_data;
+ void __user *argp = (void __user *)arg;
+ int fd;
+ u64 __user *featurep = argp;
+ u64 features;
+ long ret;
+ struct vhost_vring_state s;
+
+ switch (ioctl) {
+ case VHOST_SET_MEM_TABLE:
+ vhost_blk_stop(blk);
+ ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
+ break;
+ case VHOST_SET_VRING_NUM:
+ if (copy_from_user(&s, argp, sizeof(s)))
+ return -EFAULT;
+ ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
+ if (!ret)
+ blk->num_queues = s.index + 1;
+ break;
+ case VHOST_BLK_SET_BACKEND:
+ if (copy_from_user(&fd, argp, sizeof(fd)))
+ return -EFAULT;
+ ret = vhost_blk_set_backend(blk, fd);
+ break;
+ case VHOST_GET_FEATURES:
+ features = VHOST_BLK_FEATURES;
+ if (copy_to_user(featurep, &features, sizeof(features)))
+ return -EFAULT;
+ ret = 0;
+ break;
+ case VHOST_SET_FEATURES:
+ if (copy_from_user(&features, featurep, sizeof(features)))
+ return -EFAULT;
+ if (features & ~VHOST_BLK_FEATURES)
+ return -EOPNOTSUPP;
+ ret = vhost_blk_set_features(blk, features);
+ break;
+ case VHOST_RESET_OWNER:
+ ret = vhost_blk_reset_owner(blk);
+ break;
+ default:
+ ret = vhost_blk_pass_ioctl(blk, ioctl, argp);
+ break;
+ }
+ return ret;
+}
+
+static const struct file_operations vhost_blk_fops = {
+ .owner = THIS_MODULE,
+ .open = vhost_blk_open,
+ .release = vhost_blk_release,
+ .llseek = noop_llseek,
+ .unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+ MISC_DYNAMIC_MINOR,
+ "vhost-blk",
+ &vhost_blk_fops,
+};
+
+static int vhost_blk_init(void)
+{
+ return misc_register(&vhost_blk_misc);
+}
+module_init(vhost_blk_init);
+
+static void vhost_blk_exit(void)
+{
+ misc_deregister(&vhost_blk_misc);
+}
+
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("1.0");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Vitaly Mayatskikh");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");
+MODULE_ALIAS("devname:vhost-blk");
--
2.17.1