Re: [PATCH 3/3] xen-blkfront: dynamic configuration of per-vbd resources

From: Roger Pau Monné
Date: Thu Jul 21 2016 - 04:58:22 EST


On Fri, Jul 15, 2016 at 05:31:49PM +0800, Bob Liu wrote:
> The current VBD layer reserves buffer space for each attached device based on
> three statically configured settings which are read at boot time.
> * max_indirect_segs: Maximum amount of segments.
> * max_ring_page_order: Maximum order of pages to be used for the shared ring.
> * max_queues: Maximum of queues(rings) to be used.
>
> But the storage backend, workload, and guest memory result in very different
> tuning requirements. It's impossible to centrally predict application
> characteristics so it's best to leave allow the settings can be dynamiclly
> adjusted based on workload inside the Guest.
>
> Usage:
> Show current values:
> cat /sys/devices/vbd-xxx/max_indirect_segs
> cat /sys/devices/vbd-xxx/max_ring_page_order
> cat /sys/devices/vbd-xxx/max_queues
>
> Write new values:
> echo <new value> > /sys/devices/vbd-xxx/max_indirect_segs
> echo <new value> > /sys/devices/vbd-xxx/max_ring_page_order
> echo <new value> > /sys/devices/vbd-xxx/max_queues
>
> Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
> --
> v2: Add device lock and other comments from Konrad.
> ---
> drivers/block/xen-blkfront.c | 285 ++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 283 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 10f46a8..9a5ed22 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -46,6 +46,7 @@
> #include <linux/scatterlist.h>
> #include <linux/bitmap.h>
> #include <linux/list.h>
> +#include <linux/delay.h>
>
> #include <xen/xen.h>
> #include <xen/xenbus.h>
> @@ -212,6 +213,11 @@ struct blkfront_info
> /* Save uncomplete reqs and bios for migration. */
> struct list_head requests;
> struct bio_list bio_list;
> + /* For dynamic configuration. */
> + unsigned int reconfiguring:1;
> + int new_max_indirect_segments;
> + int new_max_ring_page_order;
> + int new_max_queues;
> };
>
> static unsigned int nr_minors;
> @@ -1350,6 +1356,31 @@ static void blkif_free(struct blkfront_info *info, int suspend)
> for (i = 0; i < info->nr_rings; i++)
> blkif_free_ring(&info->rinfo[i]);
>
> + /* Remove old xenstore nodes. */
> + if (info->nr_ring_pages > 1)
> + xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-page-order");
> +
> + if (info->nr_rings == 1) {
> + if (info->nr_ring_pages == 1) {
> + xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-ref");
> + } else {
> + for (i = 0; i < info->nr_ring_pages; i++) {
> + char ring_ref_name[RINGREF_NAME_LEN];
> +
> + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
> + xenbus_rm(XBT_NIL, info->xbdev->nodename, ring_ref_name);
> + }
> + }
> + } else {
> + xenbus_rm(XBT_NIL, info->xbdev->nodename, "multi-queue-num-queues");
> +
> + for (i = 0; i < info->nr_rings; i++) {
> + char queuename[QUEUE_NAME_LEN];
> +
> + snprintf(queuename, QUEUE_NAME_LEN, "queue-%u", i);
> + xenbus_rm(XBT_NIL, info->xbdev->nodename, queuename);
> + }
> + }
> kfree(info->rinfo);
> info->rinfo = NULL;
> info->nr_rings = 0;
> @@ -1772,6 +1803,10 @@ static int talk_to_blkback(struct xenbus_device *dev,
> info->nr_ring_pages = 1;
> else {
> ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
> + if (info->new_max_ring_page_order) {

Instead of calling this "new_max_ring_page_order", could you just call it
max_ring_page_order, iniitalize it to xen_blkif_max_ring_order by default
and use it everywhere instead of xen_blkif_max_ring_order?

> + BUG_ON(info->new_max_ring_page_order > max_page_order);
> + ring_page_order = info->new_max_ring_page_order;
> + }
> info->nr_ring_pages = 1 << ring_page_order;
> }
>
> @@ -1895,6 +1930,10 @@ static int negotiate_mq(struct blkfront_info *info)
> backend_max_queues = 1;
>
> info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
> + if (info->new_max_queues) {

Same here IMHO, this is going to make the code flow slightly easier to
understand.

> + BUG_ON(info->new_max_queues > backend_max_queues);
> + info->nr_rings = info->new_max_queues;
> + }
> /* We need at least one ring. */
> if (!info->nr_rings)
> info->nr_rings = 1;
> @@ -2352,11 +2391,227 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
> NULL);
> if (err)
> info->max_indirect_segments = 0;
> - else
> + else {
> info->max_indirect_segments = min(indirect_segments,
> xen_blkif_max_segments);
> + if (info->new_max_indirect_segments) {
> + BUG_ON(info->new_max_indirect_segments > indirect_segments);
> + info->max_indirect_segments = info->new_max_indirect_segments;
> + }
> + }
> +}
> +
> +static ssize_t max_ring_page_order_show(struct device *dev,
> + struct device_attribute *attr, char *page)
> +{
> + struct blkfront_info *info = dev_get_drvdata(dev);
> +
> + return sprintf(page, "%u\n", get_order(info->nr_ring_pages * XEN_PAGE_SIZE));
> +}
> +
> +static ssize_t max_indirect_segs_show(struct device *dev,
> + struct device_attribute *attr, char *page)
> +{
> + struct blkfront_info *info = dev_get_drvdata(dev);
> +
> + return sprintf(page, "%u\n", info->max_indirect_segments);
> +}
> +
> +static ssize_t max_queues_show(struct device *dev,
> + struct device_attribute *attr, char *page)
> +{
> + struct blkfront_info *info = dev_get_drvdata(dev);
> +
> + return sprintf(page, "%u\n", info->nr_rings);
> +}
> +
> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
> +{
> + unsigned int i;
> + int err = -EBUSY;
> +
> + /*
> + * Make sure no migration in parallel, device lock is actually a
> + * mutex.
> + */
> + if (!device_trylock(&info->xbdev->dev)) {
> + pr_err("Fail to acquire dev:%s lock, may be in migration.\n",
> + dev_name(&info->xbdev->dev));
> + return err;
> + }
> +
> + /*
> + * Prevent new requests and guarantee no uncompleted reqs.
> + */
> + blk_mq_freeze_queue(info->rq);
> + if (part_in_flight(&info->gd->part0))
> + goto out;
> +
> + /*
> + * Front Backend
> + * Switch to XenbusStateClosed
> + * frontend_changed():
> + * case XenbusStateClosed:
> + * xen_blkif_disconnect()
> + * Switch to XenbusStateClosed
> + * blkfront_resume():
> + * frontend_changed():
> + * reconnect
> + * Wait until XenbusStateConnected
> + */
> + info->reconfiguring = true;
> + xenbus_switch_state(info->xbdev, XenbusStateClosed);
> +
> + /* Poll every 100ms, 1 minute timeout. */
> + for (i = 0; i < 600; i++) {
> + /*
> + * Wait backend enter XenbusStateClosed, blkback_changed()
> + * will clear reconfiguring.
> + */
> + if (!info->reconfiguring)
> + goto resume;
> + schedule_timeout_interruptible(msecs_to_jiffies(100));
> + }

Instead of having this wait, could you just set info->reconfiguring = 1, set
the frontend state to XenbusStateClosed and mimic exactly what a resume from
suspension does? blkback_changed would have to set the frontend state to
InitWait when it detects that the backend has switched to Closed, and call
blkfront_resume.

> + goto out;
> +
> +resume:
> + if (blkfront_resume(info->xbdev))
> + goto out;
> +
> + /* Poll every 100ms, 1 minute timeout. */
> + for (i = 0; i < 600; i++) {
> + /* Wait blkfront enter StateConnected which is done by blkif_recover(). */
> + if (info->xbdev->state == XenbusStateConnected) {
> + err = count;
> + goto out;
> + }
> + schedule_timeout_interruptible(msecs_to_jiffies(100));
> + }

Same here, IMHO all this should be much more similar to a resume, and you
shouldn't need all this wait loops.

Roger.