Re: [PATCH 2/2] virtio_blk: blk-mq support

From: Rusty Russell
Date: Sun Oct 27 2013 - 23:36:04 EST


Christoph Hellwig <hch@xxxxxxxxxxxxx> writes:
> Switch virtio-blk from the dual support for old-style requests and bios
> over to blk-mq. For now, pretend to have 4 issue queues, as Jens pulled
> that number out of his hat and it worked.

Let's pretend I'm stupid.

We don't actually have multiple queues through to the host, but we're
pretending to, because it makes the block layer go faster?
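
(If I'm reading it right, the bit I'm puzzling over is this declaration,
which advertises four hardware queues to the block layer:

> +static struct blk_mq_reg virtio_mq_reg = {
> + .ops = &virtio_mq_ops,
> + .nr_hw_queues = 4,
> + .queue_depth = 64,
> + .numa_node = NUMA_NO_NODE,
> + .flags = BLK_MQ_F_SHOULD_MERGE,
> +};

while init_vq() still sets up the single vblk->vq, so all four hctxs end
up funnelling into that one virtqueue under vq_lock.)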

Do I want to know *why* it's faster? Or should I look the other way?

Thanks,
Rusty.

> Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> ---
> drivers/block/virtio_blk.c | 312 +++++++++-----------------------------------
> 1 file changed, 65 insertions(+), 247 deletions(-)
>
> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> index 5cdf88b..87f099d 100644
> --- a/drivers/block/virtio_blk.c
> +++ b/drivers/block/virtio_blk.c
> @@ -11,12 +11,11 @@
> #include <linux/string_helpers.h>
> #include <scsi/scsi_cmnd.h>
> #include <linux/idr.h>
> +#include <linux/blk-mq.h>
> +#include <linux/numa.h>
>
> #define PART_BITS 4
>
> -static bool use_bio;
> -module_param(use_bio, bool, S_IRUGO);
> -
> static int major;
> static DEFINE_IDA(vd_index_ida);
>
> @@ -26,13 +25,11 @@ struct virtio_blk
> {
> struct virtio_device *vdev;
> struct virtqueue *vq;
> - wait_queue_head_t queue_wait;
> + spinlock_t vq_lock;
>
> /* The disk structure for the kernel. */
> struct gendisk *disk;
>
> - mempool_t *pool;
> -
> /* Process context for config space updates */
> struct work_struct config_work;
>
> @@ -47,15 +44,11 @@ struct virtio_blk
>
> /* Ida index - used to track minor number allocations. */
> int index;
> -
> - /* Scatterlist: can be too big for stack. */
> - struct scatterlist sg[/*sg_elems*/];
> };
>
> struct virtblk_req
> {
> struct request *req;
> - struct bio *bio;
> struct virtio_blk_outhdr out_hdr;
> struct virtio_scsi_inhdr in_hdr;
> struct work_struct work;
> @@ -84,22 +77,6 @@ static inline int virtblk_result(struct virtblk_req *vbr)
> }
> }
>
> -static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
> - gfp_t gfp_mask)
> -{
> - struct virtblk_req *vbr;
> -
> - vbr = mempool_alloc(vblk->pool, gfp_mask);
> - if (!vbr)
> - return NULL;
> -
> - vbr->vblk = vblk;
> - if (use_bio)
> - sg_init_table(vbr->sg, vblk->sg_elems);
> -
> - return vbr;
> -}
> -
> static int __virtblk_add_req(struct virtqueue *vq,
> struct virtblk_req *vbr,
> struct scatterlist *data_sg,
> @@ -143,83 +120,8 @@ static int __virtblk_add_req(struct virtqueue *vq,
> return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
> }
>
> -static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
> -{
> - struct virtio_blk *vblk = vbr->vblk;
> - DEFINE_WAIT(wait);
> - int ret;
> -
> - spin_lock_irq(vblk->disk->queue->queue_lock);
> - while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
> - have_data)) < 0)) {
> - prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
> - TASK_UNINTERRUPTIBLE);
> -
> - spin_unlock_irq(vblk->disk->queue->queue_lock);
> - io_schedule();
> - spin_lock_irq(vblk->disk->queue->queue_lock);
> -
> - finish_wait(&vblk->queue_wait, &wait);
> - }
> -
> - virtqueue_kick(vblk->vq);
> - spin_unlock_irq(vblk->disk->queue->queue_lock);
> -}
> -
> -static void virtblk_bio_send_flush(struct virtblk_req *vbr)
> -{
> - vbr->flags |= VBLK_IS_FLUSH;
> - vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
> - vbr->out_hdr.sector = 0;
> - vbr->out_hdr.ioprio = 0;
> -
> - virtblk_add_req(vbr, false);
> -}
> -
> -static void virtblk_bio_send_data(struct virtblk_req *vbr)
> -{
> - struct virtio_blk *vblk = vbr->vblk;
> - struct bio *bio = vbr->bio;
> - bool have_data;
> -
> - vbr->flags &= ~VBLK_IS_FLUSH;
> - vbr->out_hdr.type = 0;
> - vbr->out_hdr.sector = bio->bi_sector;
> - vbr->out_hdr.ioprio = bio_prio(bio);
> -
> - if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
> - have_data = true;
> - if (bio->bi_rw & REQ_WRITE)
> - vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
> - else
> - vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
> - } else
> - have_data = false;
> -
> - virtblk_add_req(vbr, have_data);
> -}
> -
> -static void virtblk_bio_send_data_work(struct work_struct *work)
> -{
> - struct virtblk_req *vbr;
> -
> - vbr = container_of(work, struct virtblk_req, work);
> -
> - virtblk_bio_send_data(vbr);
> -}
> -
> -static void virtblk_bio_send_flush_work(struct work_struct *work)
> -{
> - struct virtblk_req *vbr;
> -
> - vbr = container_of(work, struct virtblk_req, work);
> -
> - virtblk_bio_send_flush(vbr);
> -}
> -
> static inline void virtblk_request_done(struct virtblk_req *vbr)
> {
> - struct virtio_blk *vblk = vbr->vblk;
> struct request *req = vbr->req;
> int error = virtblk_result(vbr);
>
> @@ -231,90 +133,43 @@ static inline void virtblk_request_done(struct virtblk_req *vbr)
> req->errors = (error != 0);
> }
>
> - __blk_end_request_all(req, error);
> - mempool_free(vbr, vblk->pool);
> -}
> -
> -static inline void virtblk_bio_flush_done(struct virtblk_req *vbr)
> -{
> - struct virtio_blk *vblk = vbr->vblk;
> -
> - if (vbr->flags & VBLK_REQ_DATA) {
> - /* Send out the actual write data */
> - INIT_WORK(&vbr->work, virtblk_bio_send_data_work);
> - queue_work(virtblk_wq, &vbr->work);
> - } else {
> - bio_endio(vbr->bio, virtblk_result(vbr));
> - mempool_free(vbr, vblk->pool);
> - }
> -}
> -
> -static inline void virtblk_bio_data_done(struct virtblk_req *vbr)
> -{
> - struct virtio_blk *vblk = vbr->vblk;
> -
> - if (unlikely(vbr->flags & VBLK_REQ_FUA)) {
> - /* Send out a flush before end the bio */
> - vbr->flags &= ~VBLK_REQ_DATA;
> - INIT_WORK(&vbr->work, virtblk_bio_send_flush_work);
> - queue_work(virtblk_wq, &vbr->work);
> - } else {
> - bio_endio(vbr->bio, virtblk_result(vbr));
> - mempool_free(vbr, vblk->pool);
> - }
> -}
> -
> -static inline void virtblk_bio_done(struct virtblk_req *vbr)
> -{
> - if (unlikely(vbr->flags & VBLK_IS_FLUSH))
> - virtblk_bio_flush_done(vbr);
> - else
> - virtblk_bio_data_done(vbr);
> + blk_mq_end_io(req, error);
> }
>
> static void virtblk_done(struct virtqueue *vq)
> {
> struct virtio_blk *vblk = vq->vdev->priv;
> - bool bio_done = false, req_done = false;
> + bool req_done = false;
> struct virtblk_req *vbr;
> unsigned long flags;
> unsigned int len;
>
> - spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
> + spin_lock_irqsave(&vblk->vq_lock, flags);
> do {
> virtqueue_disable_cb(vq);
> while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
> - if (vbr->bio) {
> - virtblk_bio_done(vbr);
> - bio_done = true;
> - } else {
> - virtblk_request_done(vbr);
> - req_done = true;
> - }
> + virtblk_request_done(vbr);
> + req_done = true;
> }
> } while (!virtqueue_enable_cb(vq));
> + spin_unlock_irqrestore(&vblk->vq_lock, flags);
> +
> /* In case queue is stopped waiting for more buffers. */
> if (req_done)
> - blk_start_queue(vblk->disk->queue);
> - spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
> -
> - if (bio_done)
> - wake_up(&vblk->queue_wait);
> + blk_mq_start_stopped_hw_queues(vblk->disk->queue);
> }
>
> -static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
> - struct request *req)
> +static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
> {
> + struct virtio_blk *vblk = hctx->queue->queuedata;
> + struct virtblk_req *vbr = req->special;
> + unsigned long flags;
> unsigned int num;
> - struct virtblk_req *vbr;
> + const bool last = (req->cmd_flags & REQ_END) != 0;
>
> - vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
> - if (!vbr)
> - /* When another request finishes we'll try again. */
> - return false;
> + BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
>
> vbr->req = req;
> - vbr->bio = NULL;
> if (req->cmd_flags & REQ_FLUSH) {
> vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
> vbr->out_hdr.sector = 0;
> @@ -342,7 +197,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
> }
> }
>
> - num = blk_rq_map_sg(q, vbr->req, vblk->sg);
> + num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
> if (num) {
> if (rq_data_dir(vbr->req) == WRITE)
> vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
> @@ -350,63 +205,18 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
> vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
> }
>
> - if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
> - mempool_free(vbr, vblk->pool);
> - return false;
> - }
> -
> - return true;
> -}
> -
> -static void virtblk_request(struct request_queue *q)
> -{
> - struct virtio_blk *vblk = q->queuedata;
> - struct request *req;
> - unsigned int issued = 0;
> -
> - while ((req = blk_peek_request(q)) != NULL) {
> - BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
> -
> - /* If this request fails, stop queue and wait for something to
> - finish to restart it. */
> - if (!do_req(q, vblk, req)) {
> - blk_stop_queue(q);
> - break;
> - }
> - blk_start_request(req);
> - issued++;
> - }
> -
> - if (issued)
> + spin_lock_irqsave(&vblk->vq_lock, flags);
> + if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) {
> + spin_unlock_irqrestore(&vblk->vq_lock, flags);
> + blk_mq_stop_hw_queue(hctx);
> virtqueue_kick(vblk->vq);
> -}
> -
> -static void virtblk_make_request(struct request_queue *q, struct bio *bio)
> -{
> - struct virtio_blk *vblk = q->queuedata;
> - struct virtblk_req *vbr;
> -
> - BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
> -
> - vbr = virtblk_alloc_req(vblk, GFP_NOIO);
> - if (!vbr) {
> - bio_endio(bio, -ENOMEM);
> - return;
> + return BLK_MQ_RQ_QUEUE_BUSY;
> }
> + spin_unlock_irqrestore(&vblk->vq_lock, flags);
>
> - vbr->bio = bio;
> - vbr->flags = 0;
> - if (bio->bi_rw & REQ_FLUSH)
> - vbr->flags |= VBLK_REQ_FLUSH;
> - if (bio->bi_rw & REQ_FUA)
> - vbr->flags |= VBLK_REQ_FUA;
> - if (bio->bi_size)
> - vbr->flags |= VBLK_REQ_DATA;
> -
> - if (unlikely(vbr->flags & VBLK_REQ_FLUSH))
> - virtblk_bio_send_flush(vbr);
> - else
> - virtblk_bio_send_data(vbr);
> + if (last)
> + virtqueue_kick(vblk->vq);
> + return BLK_MQ_RQ_QUEUE_OK;
> }
>
> /* return id (s/n) string for *disk to *id_str
> @@ -680,12 +490,35 @@ static const struct device_attribute dev_attr_cache_type_rw =
> __ATTR(cache_type, S_IRUGO|S_IWUSR,
> virtblk_cache_type_show, virtblk_cache_type_store);
>
> +static struct blk_mq_ops virtio_mq_ops = {
> + .queue_rq = virtio_queue_rq,
> + .map_queue = blk_mq_map_queue,
> + .alloc_hctx = blk_mq_alloc_single_hw_queue,
> + .free_hctx = blk_mq_free_single_hw_queue,
> +};
> +
> +static struct blk_mq_reg virtio_mq_reg = {
> + .ops = &virtio_mq_ops,
> + .nr_hw_queues = 4,
> + .queue_depth = 64,
> + .numa_node = NUMA_NO_NODE,
> + .flags = BLK_MQ_F_SHOULD_MERGE,
> +};
> +
> +static void virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
> + struct request *rq, unsigned int nr)
> +{
> + struct virtio_blk *vblk = data;
> + struct virtblk_req *vbr = rq->special;
> +
> + sg_init_table(vbr->sg, vblk->sg_elems);
> +}
> +
> static int virtblk_probe(struct virtio_device *vdev)
> {
> struct virtio_blk *vblk;
> struct request_queue *q;
> int err, index;
> - int pool_size;
>
> u64 cap;
> u32 v, blk_size, sg_elems, opt_io_size;
> @@ -709,17 +542,14 @@ static int virtblk_probe(struct virtio_device *vdev)
>
> /* We need extra sg elements at head and tail. */
> sg_elems += 2;
> - vdev->priv = vblk = kmalloc(sizeof(*vblk) +
> - sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL);
> + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
> if (!vblk) {
> err = -ENOMEM;
> goto out_free_index;
> }
>
> - init_waitqueue_head(&vblk->queue_wait);
> vblk->vdev = vdev;
> vblk->sg_elems = sg_elems;
> - sg_init_table(vblk->sg, vblk->sg_elems);
> mutex_init(&vblk->config_lock);
>
> INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
> @@ -728,31 +558,27 @@ static int virtblk_probe(struct virtio_device *vdev)
> err = init_vq(vblk);
> if (err)
> goto out_free_vblk;
> -
> - pool_size = sizeof(struct virtblk_req);
> - if (use_bio)
> - pool_size += sizeof(struct scatterlist) * sg_elems;
> - vblk->pool = mempool_create_kmalloc_pool(1, pool_size);
> - if (!vblk->pool) {
> - err = -ENOMEM;
> - goto out_free_vq;
> - }
> + spin_lock_init(&vblk->vq_lock);
>
> /* FIXME: How many partitions? How long is a piece of string? */
> vblk->disk = alloc_disk(1 << PART_BITS);
> if (!vblk->disk) {
> err = -ENOMEM;
> - goto out_mempool;
> + goto out_free_vq;
> }
>
> - q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL);
> + virtio_mq_reg.cmd_size =
> + sizeof(struct virtblk_req) +
> + sizeof(struct scatterlist) * sg_elems;
> +
> + q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk);
> if (!q) {
> err = -ENOMEM;
> goto out_put_disk;
> }
>
> - if (use_bio)
> - blk_queue_make_request(q, virtblk_make_request);
> + blk_mq_init_commands(q, virtblk_init_vbr, vblk);
> +
> q->queuedata = vblk;
>
> virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
> @@ -857,8 +683,6 @@ out_del_disk:
> blk_cleanup_queue(vblk->disk->queue);
> out_put_disk:
> put_disk(vblk->disk);
> -out_mempool:
> - mempool_destroy(vblk->pool);
> out_free_vq:
> vdev->config->del_vqs(vdev);
> out_free_vblk:
> @@ -890,7 +714,6 @@ static void virtblk_remove(struct virtio_device *vdev)
>
> refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
> put_disk(vblk->disk);
> - mempool_destroy(vblk->pool);
> vdev->config->del_vqs(vdev);
> kfree(vblk);
>
> @@ -914,10 +737,7 @@ static int virtblk_freeze(struct virtio_device *vdev)
>
> flush_work(&vblk->config_work);
>
> - spin_lock_irq(vblk->disk->queue->queue_lock);
> - blk_stop_queue(vblk->disk->queue);
> - spin_unlock_irq(vblk->disk->queue->queue_lock);
> - blk_sync_queue(vblk->disk->queue);
> + blk_mq_stop_hw_queues(vblk->disk->queue);
>
> vdev->config->del_vqs(vdev);
> return 0;
> @@ -930,11 +750,9 @@ static int virtblk_restore(struct virtio_device *vdev)
>
> vblk->config_enable = true;
> ret = init_vq(vdev->priv);
> - if (!ret) {
> - spin_lock_irq(vblk->disk->queue->queue_lock);
> - blk_start_queue(vblk->disk->queue);
> - spin_unlock_irq(vblk->disk->queue->queue_lock);
> - }
> + if (!ret)
> + blk_mq_start_stopped_hw_queues(vblk->disk->queue);
> +
> return ret;
> }
> #endif
> --
> 1.7.10.4