[PATCH v5 03/10] xen/blkfront: pseudo support for multi hardware queues/rings

From: Bob Liu
Date: Fri Nov 13 2015 - 22:14:57 EST


Preparatory patch for multiple hardware queues (rings). The number of
rings is unconditionally set to 1 for now; a larger number will be
enabled in the next patch ("xen/blkfront: negotiate number of
queues/rings to be used with backend") so as to keep every single patch
small and readable. The per-ring state is moved out of struct
blkfront_info into a dynamically allocated array of struct
blkfront_ring_info, and all code paths are converted to iterate over it.

Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
---
v2:
* Fix memory leak.
* Address other review comments from Konrad.
---
drivers/block/xen-blkfront.c | 341 ++++++++++++++++++++++++------------------
1 file changed, 195 insertions(+), 146 deletions(-)
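
For review convenience, here is a minimal sketch of the structural core
of this patch, condensed from the hunks below. The helper name
example_alloc_rings, the use of kcalloc, and the bare -ENOMEM return are
illustrative shorthand only, not verbatim patch code:

	/*
	 * Sketch only: per-ring state moves from a single embedded
	 * struct blkfront_ring_info into a dynamically allocated
	 * array, and every code path iterates over nr_rings entries.
	 */
	static int example_alloc_rings(struct blkfront_info *info)
	{
		unsigned int r_index;

		info->nr_rings = 1;	/* hardcoded until the negotiation patch */
		info->rinfo = kcalloc(info->nr_rings, sizeof(*info->rinfo),
				      GFP_KERNEL);
		if (!info->rinfo)
			return -ENOMEM;

		for (r_index = 0; r_index < info->nr_rings; r_index++) {
			struct blkfront_ring_info *rinfo = &info->rinfo[r_index];

			INIT_LIST_HEAD(&rinfo->indirect_pages);
			rinfo->dev_info = info;
			INIT_WORK(&rinfo->work, blkif_restart_queue);
		}
		return 0;
	}

Each blk-mq hardware context is then bound 1:1 to a ring in
blk_mq_init_hctx() via hctx->driver_data = &info->rinfo[index], and the
teardown path frees the array in blkif_free().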

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0c3ad21..d73734f 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -150,6 +150,7 @@ struct blkfront_info
int vdevice;
blkif_vdev_t handle;
enum blkif_state connected;
+ /* Number of pages per ring buffer. */
unsigned int nr_ring_pages;
struct request_queue *rq;
struct list_head grants;
@@ -164,7 +165,8 @@ struct blkfront_info
unsigned int max_indirect_segments;
int is_ready;
struct blk_mq_tag_set tag_set;
- struct blkfront_ring_info rinfo;
+ struct blkfront_ring_info *rinfo;
+ unsigned int nr_rings;
};

static unsigned int nr_minors;
@@ -209,7 +211,7 @@ static DEFINE_SPINLOCK(minor_lock);
#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG)

static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
-static int blkfront_gather_backend_features(struct blkfront_info *info);
+static void blkfront_gather_backend_features(struct blkfront_info *info);

static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
{
@@ -338,8 +340,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
struct page *indirect_page;

/* Fetch a pre-allocated page to use for indirect grefs */
- BUG_ON(list_empty(&info->rinfo.indirect_pages));
- indirect_page = list_first_entry(&info->rinfo.indirect_pages,
+ BUG_ON(list_empty(&info->rinfo->indirect_pages));
+ indirect_page = list_first_entry(&info->rinfo->indirect_pages,
struct page, lru);
list_del(&indirect_page->lru);
gnt_list_entry->page = indirect_page;
@@ -597,7 +599,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
* existing persistent grants, or if we have to get new grants,
* as there are not sufficiently many free.
*/
- bool new_persistent_gnts;
struct scatterlist *sg;
int num_sg, max_grefs, num_grant;

@@ -609,12 +610,12 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
*/
max_grefs += INDIRECT_GREFS(max_grefs);

- /* Check if we have enough grants to allocate a requests */
- if (info->persistent_gnts_c < max_grefs) {
- new_persistent_gnts = 1;
- if (gnttab_alloc_grant_references(
- max_grefs - info->persistent_gnts_c,
- &setup.gref_head) < 0) {
+ /*
+ * We have to reserve 'max_grefs' grants at first because persistent
+ * grants are shared by all rings.
+ */
+ if (max_grefs > 0)
+ if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) {
gnttab_request_free_callback(
&rinfo->callback,
blkif_restart_queue_callback,
@@ -622,8 +623,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
max_grefs);
return 1;
}
- } else
- new_persistent_gnts = 0;

/* Fill out a communications ring structure. */
ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
@@ -712,7 +711,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
/* Keep a private copy so we can reissue requests when recovering. */
rinfo->shadow[id].req = *ring_req;

- if (new_persistent_gnts)
+ if (max_grefs > 0)
gnttab_free_grant_references(setup.gref_head);

return 0;
@@ -791,7 +790,8 @@ static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
{
struct blkfront_info *info = (struct blkfront_info *)data;

- hctx->driver_data = &info->rinfo;
+ BUG_ON(info->nr_rings <= index);
+ hctx->driver_data = &info->rinfo[index];
return 0;
}

@@ -1050,8 +1050,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,

static void xlvbd_release_gendisk(struct blkfront_info *info)
{
- unsigned int minor, nr_minors;
- struct blkfront_ring_info *rinfo = &info->rinfo;
+ unsigned int minor, nr_minors, i;

if (info->rq == NULL)
return;
@@ -1059,11 +1058,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
/* No more blkif_request(). */
blk_mq_stop_hw_queues(info->rq);

- /* No more gnttab callback work. */
- gnttab_cancel_free_callback(&rinfo->callback);
+ for (i = 0; i < info->nr_rings; i++) {
+ struct blkfront_ring_info *rinfo = &info->rinfo[i];

- /* Flush gnttab callback work. Must be done with no locks held. */
- flush_work(&rinfo->work);
+ /* No more gnttab callback work. */
+ gnttab_cancel_free_callback(&rinfo->callback);
+
+ /* Flush gnttab callback work. Must be done with no locks held. */
+ flush_work(&rinfo->work);
+ }

del_gendisk(info->gd);

@@ -1096,37 +1099,11 @@ static void blkif_restart_queue(struct work_struct *work)
spin_unlock_irq(&rinfo->dev_info->io_lock);
}

-static void blkif_free(struct blkfront_info *info, int suspend)
+static void blkif_free_ring(struct blkfront_ring_info *rinfo)
{
struct grant *persistent_gnt;
- struct grant *n;
+ struct blkfront_info *info = rinfo->dev_info;
int i, j, segs;
- struct blkfront_ring_info *rinfo = &info->rinfo;
-
- /* Prevent new requests being issued until we fix things up. */
- spin_lock_irq(&info->io_lock);
- info->connected = suspend ?
- BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
- /* No more blkif_request(). */
- if (info->rq)
- blk_mq_stop_hw_queues(info->rq);
-
- /* Remove all persistent grants */
- if (!list_empty(&info->grants)) {
- list_for_each_entry_safe(persistent_gnt, n,
- &info->grants, node) {
- list_del(&persistent_gnt->node);
- if (persistent_gnt->gref != GRANT_INVALID_REF) {
- gnttab_end_foreign_access(persistent_gnt->gref,
- 0, 0UL);
- info->persistent_gnts_c--;
- }
- if (info->feature_persistent)
- __free_page(persistent_gnt->page);
- kfree(persistent_gnt);
- }
- }
- BUG_ON(info->persistent_gnts_c != 0);

/*
* Remove indirect pages, this only happens when using indirect
@@ -1186,7 +1163,6 @@ free_shadow:

/* No more gnttab callback work. */
gnttab_cancel_free_callback(&rinfo->callback);
- spin_unlock_irq(&info->io_lock);

/* Flush gnttab callback work. Must be done with no locks held. */
flush_work(&rinfo->work);
@@ -1204,7 +1180,43 @@ free_shadow:
if (rinfo->irq)
unbind_from_irqhandler(rinfo->irq, rinfo);
rinfo->evtchn = rinfo->irq = 0;
+}

+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+ struct grant *persistent_gnt, *n;
+ unsigned int i;
+
+ /* Prevent new requests being issued until we fix things up. */
+ spin_lock_irq(&info->io_lock);
+ info->connected = suspend ?
+ BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+ /* No more blkif_request(). */
+ if (info->rq)
+ blk_mq_stop_hw_queues(info->rq);
+
+ /* Remove all persistent grants */
+ if (!list_empty(&info->grants)) {
+ list_for_each_entry_safe(persistent_gnt, n,
+ &info->grants, node) {
+ list_del(&persistent_gnt->node);
+ if (persistent_gnt->gref != GRANT_INVALID_REF) {
+ gnttab_end_foreign_access(persistent_gnt->gref,
+ 0, 0UL);
+ info->persistent_gnts_c--;
+ }
+ if (info->feature_persistent)
+ __free_page(persistent_gnt->page);
+ kfree(persistent_gnt);
+ }
+ }
+ BUG_ON(info->persistent_gnts_c != 0);
+
+ for (i = 0; i < info->nr_rings; i++)
+ blkif_free_ring(&info->rinfo[i]);
+ kfree(info->rinfo);
+ info->nr_rings = 0;
+ spin_unlock_irq(&info->io_lock);
}

struct copy_from_grant {
@@ -1492,7 +1504,7 @@ static int talk_to_blkback(struct xenbus_device *dev,
int err, i;
unsigned int max_page_order = 0;
unsigned int ring_page_order = 0;
- struct blkfront_ring_info *rinfo = &info->rinfo;
+ struct blkfront_ring_info *rinfo;

err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
"max-ring-page-order", "%u", &max_page_order);
@@ -1503,10 +1515,13 @@ static int talk_to_blkback(struct xenbus_device *dev,
info->nr_ring_pages = 1 << ring_page_order;
}

- /* Create shared ring, alloc event channel. */
- err = setup_blkring(dev, rinfo);
- if (err)
- goto out;
+ for (i = 0; i < info->nr_rings; i++) {
+ rinfo = &info->rinfo[i];
+ /* Create shared ring, alloc event channel. */
+ err = setup_blkring(dev, rinfo);
+ if (err)
+ goto destroy_blkring;
+ }

again:
err = xenbus_transaction_start(&xbt);
@@ -1515,37 +1530,43 @@ again:
goto destroy_blkring;
}

- if (info->nr_ring_pages == 1) {
- err = xenbus_printf(xbt, dev->nodename,
- "ring-ref", "%u", rinfo->ring_ref[0]);
- if (err) {
- message = "writing ring-ref";
- goto abort_transaction;
- }
- } else {
- err = xenbus_printf(xbt, dev->nodename,
- "ring-page-order", "%u", ring_page_order);
- if (err) {
- message = "writing ring-page-order";
- goto abort_transaction;
- }
-
- for (i = 0; i < info->nr_ring_pages; i++) {
- char ring_ref_name[RINGREF_NAME_LEN];
-
- snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
- err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
- "%u", rinfo->ring_ref[i]);
+ if (info->nr_rings == 1) {
+ rinfo = &info->rinfo[0];
+ if (info->nr_ring_pages == 1) {
+ err = xenbus_printf(xbt, dev->nodename,
+ "ring-ref", "%u", rinfo->ring_ref[0]);
if (err) {
message = "writing ring-ref";
goto abort_transaction;
}
+ } else {
+ err = xenbus_printf(xbt, dev->nodename,
+ "ring-page-order", "%u", ring_page_order);
+ if (err) {
+ message = "writing ring-page-order";
+ goto abort_transaction;
+ }
+
+ for (i = 0; i < info->nr_ring_pages; i++) {
+ char ring_ref_name[RINGREF_NAME_LEN];
+
+ snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+ err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
+ "%u", rinfo->ring_ref[i]);
+ if (err) {
+ message = "writing ring-ref";
+ goto abort_transaction;
+ }
+ }
}
- }
- err = xenbus_printf(xbt, dev->nodename,
- "event-channel", "%u", rinfo->evtchn);
- if (err) {
- message = "writing event-channel";
+ err = xenbus_printf(xbt, dev->nodename,
+ "event-channel", "%u", rinfo->evtchn);
+ if (err) {
+ message = "writing event-channel";
+ goto abort_transaction;
+ }
+ } else {
+ /* Not supported at this stage. */
goto abort_transaction;
}
err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
@@ -1568,9 +1589,14 @@ again:
goto destroy_blkring;
}

- for (i = 0; i < BLK_RING_SIZE(info); i++)
- rinfo->shadow[i].req.u.rw.id = i+1;
- rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+ for (i = 0; i < info->nr_rings; i++) {
+ int j;
+ rinfo = &info->rinfo[i];
+
+ for (j = 0; j < BLK_RING_SIZE(info); j++)
+ rinfo->shadow[j].req.u.rw.id = j + 1;
+ rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+ }
xenbus_switch_state(dev, XenbusStateInitialised);

return 0;
@@ -1581,7 +1607,7 @@ again:
xenbus_dev_fatal(dev, err, "%s", message);
destroy_blkring:
blkif_free(info, 0);
- out:
+
return err;
}

@@ -1594,9 +1620,8 @@ again:
static int blkfront_probe(struct xenbus_device *dev,
const struct xenbus_device_id *id)
{
- int err, vdevice;
+ int err, vdevice, r_index;
struct blkfront_info *info;
- struct blkfront_ring_info *rinfo;

/* FIXME: Use dynamic device id if this is not set. */
err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1646,10 +1671,22 @@ static int blkfront_probe(struct xenbus_device *dev,
return -ENOMEM;
}

- rinfo = &info->rinfo;
- INIT_LIST_HEAD(&rinfo->indirect_pages);
- rinfo->dev_info = info;
- INIT_WORK(&rinfo->work, blkif_restart_queue);
+ info->nr_rings = 1;
+ info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
+ if (!info->rinfo) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
+ kfree(info);
+ return -ENOMEM;
+ }
+
+ for (r_index = 0; r_index < info->nr_rings; r_index++) {
+ struct blkfront_ring_info *rinfo;
+
+ rinfo = &info->rinfo[r_index];
+ INIT_LIST_HEAD(&rinfo->indirect_pages);
+ rinfo->dev_info = info;
+ INIT_WORK(&rinfo->work, blkif_restart_queue);
+ }

mutex_init(&info->mutex);
spin_lock_init(&info->io_lock);
@@ -1681,7 +1718,7 @@ static void split_bio_end(struct bio *bio)

static int blkif_recover(struct blkfront_info *info)
{
- int i;
+ int i, r_index;
struct request *req, *n;
struct blk_shadow *copy;
int rc;
@@ -1691,57 +1728,62 @@ static int blkif_recover(struct blkfront_info *info)
int pending, size;
struct split_bio *split_bio;
struct list_head requests;
- struct blkfront_ring_info *rinfo = &info->rinfo;
-
- /* Stage 1: Make a safe copy of the shadow state. */
- copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
- GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
- if (!copy)
- return -ENOMEM;
-
- /* Stage 2: Set up free list. */
- memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
- for (i = 0; i < BLK_RING_SIZE(info); i++)
- rinfo->shadow[i].req.u.rw.id = i+1;
- rinfo->shadow_free = rinfo->ring.req_prod_pvt;
- rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
-
- rc = blkfront_gather_backend_features(info);
- if (rc) {
- kfree(copy);
- return rc;
- }

+ blkfront_gather_backend_features(info);
segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
blk_queue_max_segments(info->rq, segs);
bio_list_init(&bio_list);
INIT_LIST_HEAD(&requests);
- for (i = 0; i < BLK_RING_SIZE(info); i++) {
- /* Not in use? */
- if (!copy[i].request)
- continue;

- /*
- * Get the bios in the request so we can re-queue them.
- */
- if (copy[i].request->cmd_flags &
- (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+ for (r_index = 0; r_index < info->nr_rings; r_index++) {
+ struct blkfront_ring_info *rinfo;
+
+ rinfo = &info->rinfo[r_index];
+ /* Stage 1: Make a safe copy of the shadow state. */
+ copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
+ GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
+ if (!copy)
+ return -ENOMEM;
+
+ /* Stage 2: Set up free list. */
+ memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
+ for (i = 0; i < BLK_RING_SIZE(info); i++)
+ rinfo->shadow[i].req.u.rw.id = i+1;
+ rinfo->shadow_free = rinfo->ring.req_prod_pvt;
+ rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+
+ rc = blkfront_setup_indirect(rinfo);
+ if (rc) {
+ kfree(copy);
+ return rc;
+ }
+
+ for (i = 0; i < BLK_RING_SIZE(info); i++) {
+ /* Not in use? */
+ if (!copy[i].request)
+ continue;
+
/*
- * Flush operations don't contain bios, so
- * we need to requeue the whole request
+ * Get the bios in the request so we can re-queue them.
*/
- list_add(&copy[i].request->queuelist, &requests);
- continue;
+ if (copy[i].request->cmd_flags &
+ (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+ /*
+ * Flush operations don't contain bios, so
+ * we need to requeue the whole request
+ */
+ list_add(&copy[i].request->queuelist, &requests);
+ continue;
+ }
+ merge_bio.head = copy[i].request->bio;
+ merge_bio.tail = copy[i].request->biotail;
+ bio_list_merge(&bio_list, &merge_bio);
+ copy[i].request->bio = NULL;
+ blk_end_request_all(copy[i].request, 0);
}
- merge_bio.head = copy[i].request->bio;
- merge_bio.tail = copy[i].request->biotail;
- bio_list_merge(&bio_list, &merge_bio);
- copy[i].request->bio = NULL;
- blk_end_request_all(copy[i].request, 0);
- }
-
- kfree(copy);

+ kfree(copy);
+ }
xenbus_switch_state(info->xbdev, XenbusStateConnected);

spin_lock_irq(&info->io_lock);
@@ -1749,8 +1791,13 @@ static int blkif_recover(struct blkfront_info *info)
/* Now safe for us to use the shared ring */
info->connected = BLKIF_STATE_CONNECTED;

- /* Kick any other new requests queued since we resumed */
- kick_pending_request_queues(rinfo);
+ for (r_index = 0; r_index < info->nr_rings; r_index++) {
+ struct blkfront_ring_info *rinfo;
+
+ rinfo = &info->rinfo[r_index];
+ /* Kick any other new requests queued since we resumed */
+ kick_pending_request_queues(rinfo);
+ }

list_for_each_entry_safe(req, n, &requests, queuelist) {
/* Requeue pending requests (flush or discard) */
@@ -1961,7 +2008,7 @@ out_of_memory:
/*
* Gather all backend feature-*
*/
-static int blkfront_gather_backend_features(struct blkfront_info *info)
+static void blkfront_gather_backend_features(struct blkfront_info *info)
{
int err;
int barrier, flush, discard, persistent;
@@ -2016,8 +2063,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info)
else
info->max_indirect_segments = min(indirect_segments,
xen_blkif_max_segments);
-
- return blkfront_setup_indirect(&info->rinfo);
}

/*
@@ -2030,8 +2075,7 @@ static void blkfront_connect(struct blkfront_info *info)
unsigned long sector_size;
unsigned int physical_sector_size;
unsigned int binfo;
- int err;
- struct blkfront_ring_info *rinfo = &info->rinfo;
+ int err, i;

switch (info->connected) {
case BLKIF_STATE_CONNECTED:
@@ -2088,11 +2132,15 @@ static void blkfront_connect(struct blkfront_info *info)
if (err != 1)
physical_sector_size = sector_size;

- err = blkfront_gather_backend_features(info);
- if (err) {
- xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
- info->xbdev->otherend);
- return;
+ blkfront_gather_backend_features(info);
+ for (i = 0; i < info->nr_rings; i++) {
+ err = blkfront_setup_indirect(&info->rinfo[i]);
+ if (err) {
+ xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+ info->xbdev->otherend);
+ blkif_free(info, 0);
+ break;
+ }
}

err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
@@ -2108,7 +2156,8 @@ static void blkfront_connect(struct blkfront_info *info)
/* Kick pending requests. */
spin_lock_irq(&info->io_lock);
info->connected = BLKIF_STATE_CONNECTED;
- kick_pending_request_queues(rinfo);
+ for (i = 0; i < info->nr_rings; i++)
+ kick_pending_request_queues(&info->rinfo[i]);
spin_unlock_irq(&info->io_lock);

add_disk(info->gd);
--
1.7.10.4
