Re: [PATCH RFC 06/12] xen-blkback: implement LRU mechanism for persistent grants

From: Konrad Rzeszutek Wilk
Date: Mon Mar 04 2013 - 15:11:32 EST


On Thu, Feb 28, 2013 at 11:28:49AM +0100, Roger Pau Monne wrote:
> This mechanism allows blkback to change the number of grants
> persistently mapped at run time.
>
> The algorithm uses a simple LRU mechanism that removes (if needed) the
> persistent grants that have not been used since the last LRU run, or
> if all grants have been used it removes the first grants in the list
> (that are not in use).
>
> The algorithm has several parameters that can be tuned by the user
> from sysfs:
>
> * max_persistent_grants: maximum number of grants that will be
> persistently mapped.
> * lru_interval: minimum interval (in ms) at which the LRU should be
> run
> * lru_num_clean: number of persistent grants to remove when executing
> the LRU.
>
> Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
> Cc: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
> Cc: xen-devel@xxxxxxxxxxxxx
> ---
> drivers/block/xen-blkback/blkback.c | 207 +++++++++++++++++++++++++++--------
> drivers/block/xen-blkback/common.h | 4 +
> drivers/block/xen-blkback/xenbus.c | 1 +
> 3 files changed, 166 insertions(+), 46 deletions(-)

You also should add an entry to Documentation/ABI/stable/sysfs-bus-xen-backend

>
> diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
> index 415a0c7..c14b736 100644
> --- a/drivers/block/xen-blkback/blkback.c
> +++ b/drivers/block/xen-blkback/blkback.c
> @@ -63,6 +63,44 @@ static int xen_blkif_reqs = 64;
> module_param_named(reqs, xen_blkif_reqs, int, 0);
> MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
>
> +/*
> + * Maximum number of grants to map persistently in blkback. For maximum
> + * performance this should be the total numbers of grants that can be used
> + * to fill the ring, but since this might become too high, specially with
> + * the use of indirect descriptors, we set it to a value that provides good
> + * performance without using too much memory.
> + *
> + * When the list of persistent grants is full we clean it using a LRU
> + * algorithm.
> + */
> +
> +static int xen_blkif_max_pgrants = 352;

And a little blurb saying why 352.

> +module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
> +MODULE_PARM_DESC(max_persistent_grants,
> + "Maximum number of grants to map persistently");
> +
> +/*
> + * The LRU mechanism to clean the lists of persistent grants needs to
> + * be executed periodically. The time interval between consecutive executions
> + * of the purge mechanism is set in ms.
> + */
> +
> +static int xen_blkif_lru_interval = 100;

So ten times a second? What is the benefit of having the user modify this? Would
it be better if there was a watermark system in xen-blkfront to automatically
figure this out? (This could be a TODO of course)

> +module_param_named(lru_interval, xen_blkif_lru_interval, int, 0644);
> +MODULE_PARM_DESC(lru_interval,
> +"Execution interval (in ms) of the LRU mechanism to clean the list of persistent grants");
> +
> +/*
> + * When the persistent grants list is full we will remove unused grants
> + * from the list. The number of grants to be removed at each LRU execution
> + * can be set dynamically.
> + */
> +
> +static int xen_blkif_lru_num_clean = BLKIF_MAX_SEGMENTS_PER_REQUEST;
> +module_param_named(lru_num_clean, xen_blkif_lru_num_clean, int, 0644);
> +MODULE_PARM_DESC(lru_num_clean,
> +"Number of persistent grants to unmap when the list is full");

Again, what does that mean to the system admin? Why would they need to modify
the contents of that value?


Now if this is a debug related one for developing, then this could all be
done in DebugFS.

> +
> /* Run-time switchable: /sys/module/blkback/parameters/ */
> static unsigned int log_stats;
> module_param(log_stats, int, 0644);
> @@ -81,7 +119,7 @@ struct pending_req {
> unsigned short operation;
> int status;
> struct list_head free_list;
> - DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
> + struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> };
>
> #define BLKBACK_INVALID_HANDLE (~0)
> @@ -102,36 +140,6 @@ struct xen_blkbk {
> static struct xen_blkbk *blkbk;
>
> /*
> - * Maximum number of grant pages that can be mapped in blkback.
> - * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of
> - * pages that blkback will persistently map.
> - * Currently, this is:
> - * RING_SIZE = 32 (for all known ring types)
> - * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
> - * sizeof(struct persistent_gnt) = 48
> - * So the maximum memory used to store the grants is:
> - * 32 * 11 * 48 = 16896 bytes
> - */
> -static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol)
> -{
> - switch (protocol) {
> - case BLKIF_PROTOCOL_NATIVE:
> - return __CONST_RING_SIZE(blkif, PAGE_SIZE) *
> - BLKIF_MAX_SEGMENTS_PER_REQUEST;
> - case BLKIF_PROTOCOL_X86_32:
> - return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) *
> - BLKIF_MAX_SEGMENTS_PER_REQUEST;
> - case BLKIF_PROTOCOL_X86_64:
> - return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
> - BLKIF_MAX_SEGMENTS_PER_REQUEST;
> - default:
> - BUG();
> - }
> - return 0;
> -}
> -
> -
> -/*
> * Little helpful macro to figure out the index and virtual address of the
> * pending_pages[..]. For each 'pending_req' we have have up to
> * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
> @@ -251,6 +259,76 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
> BUG_ON(num != 0);
> }
>
> +static int purge_persistent_gnt(struct rb_root *root, int num)
> +{
> + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> + struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> + struct persistent_gnt *persistent_gnt;
> + struct rb_node *n;
> + int ret, segs_to_unmap = 0;
> + int requested_num = num;
> + int preserve_used = 1;

Boolean? And perhaps 'scan_dirty' ?


> +
> + pr_debug("Requested the purge of %d persistent grants\n", num);
> +
> +purge_list:

This could be written a bit differently to also run outside the xen_blkif_schedule
(so a new thread). This would require using the lock mechanism and converting
this big loop to two smaller loops:
1) - one quick that holds the lock - to take the items of the list,
2) second one to do the grant_set_unmap_op operations and all the heavy
free_xenballooned_pages call.

.. As this function ends up (presumably?) causing xen_blkif_schedule to be doing
this for some time every second. Regardless of how utilized the ring is - so
if we are 100% busy - we should not need to call this function. But if we do,
then we end up walking the persistent_gnt tree twice - once with preserve_used set
to true, and the other with it set to false.

We don't really want that - so is there a way for xen_blkif_schedule to
do a quick determination of this caliber:


if (RING_HAS_UNCONSUMED_REQUESTS(x) >= some_value)
	wake_up(blkif->purgatory)

And the thread would just sit there until kicked in action?


> + foreach_grant_safe(persistent_gnt, n, root, node) {
> + BUG_ON(persistent_gnt->handle ==
> + BLKBACK_INVALID_HANDLE);
> +
> + if (persistent_gnt->flags & PERSISTENT_GNT_ACTIVE)
> + continue;
> + if (preserve_used &&
> + (persistent_gnt->flags & PERSISTENT_GNT_USED))

Is that similar to DIRTY on pagetables?

> + continue;
> +
> + gnttab_set_unmap_op(&unmap[segs_to_unmap],
> + (unsigned long) pfn_to_kaddr(page_to_pfn(
> + persistent_gnt->page)),
> + GNTMAP_host_map,
> + persistent_gnt->handle);
> +
> + pages[segs_to_unmap] = persistent_gnt->page;
> +
> + if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
> + ret = gnttab_unmap_refs(unmap, NULL, pages,
> + segs_to_unmap);
> + BUG_ON(ret);
> + free_xenballooned_pages(segs_to_unmap, pages);
> + segs_to_unmap = 0;
> + }
> +
> + rb_erase(&persistent_gnt->node, root);
> + kfree(persistent_gnt);
> + if (--num == 0)
> + goto finished;
> + }
> + /*
> + * If we get here it means we also need to start cleaning
> + * grants that were used since last purge in order to cope
> + * with the requested num
> + */
> + if (preserve_used) {
> + pr_debug("Still missing %d purged frames\n", num);
> + preserve_used = 0;
> + goto purge_list;
> + }
> +finished:
> + if (segs_to_unmap > 0) {
> + ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap);
> + BUG_ON(ret);
> + free_xenballooned_pages(segs_to_unmap, pages);
> + }
> + /* Finally remove the "used" flag from all the persistent grants */
> + foreach_grant_safe(persistent_gnt, n, root, node) {
> + BUG_ON(persistent_gnt->handle ==
> + BLKBACK_INVALID_HANDLE);
> + persistent_gnt->flags &= ~PERSISTENT_GNT_USED;
> + }
> + pr_debug("Purged %d/%d\n", (requested_num - num), requested_num);
> + return (requested_num - num);
> +}
> +
> /*
> * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
> */
> @@ -397,6 +475,8 @@ int xen_blkif_schedule(void *arg)
> {
> struct xen_blkif *blkif = arg;
> struct xen_vbd *vbd = &blkif->vbd;
> + int rq_purge, purged;
> + unsigned long timeout;
>
> xen_blkif_get(blkif);
>
> @@ -406,13 +486,21 @@ int xen_blkif_schedule(void *arg)
> if (unlikely(vbd->size != vbd_sz(vbd)))
> xen_vbd_resize(blkif);
>
> - wait_event_interruptible(
> + timeout = msecs_to_jiffies(xen_blkif_lru_interval);
> +
> + timeout = wait_event_interruptible_timeout(
> blkif->wq,
> - blkif->waiting_reqs || kthread_should_stop());
> - wait_event_interruptible(
> + blkif->waiting_reqs || kthread_should_stop(),
> + timeout);
> + if (timeout == 0)
> + goto purge_gnt_list;
> + timeout = wait_event_interruptible_timeout(
> blkbk->pending_free_wq,
> !list_empty(&blkbk->pending_free) ||
> - kthread_should_stop());
> + kthread_should_stop(),
> + timeout);
> + if (timeout == 0)
> + goto purge_gnt_list;
>
> blkif->waiting_reqs = 0;
> smp_mb(); /* clear flag *before* checking for work */
> @@ -420,6 +508,32 @@ int xen_blkif_schedule(void *arg)
> if (do_block_io_op(blkif))
> blkif->waiting_reqs = 1;
>
> +purge_gnt_list:
> + if (blkif->vbd.feature_gnt_persistent &&
> + time_after(jiffies, blkif->next_lru)) {
> + /* Clean the list of persistent grants */
> + if (blkif->persistent_gnt_c > xen_blkif_max_pgrants ||
> + (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
> + blkif->vbd.overflow_max_grants)) {
> + rq_purge = blkif->persistent_gnt_c -
> + xen_blkif_max_pgrants +
> + xen_blkif_lru_num_clean;

You can make this line longer than 80 characters.
> + rq_purge = rq_purge > blkif->persistent_gnt_c ?
> + blkif->persistent_gnt_c : rq_purge;
> + purged = purge_persistent_gnt(
> + &blkif->persistent_gnts, rq_purge);
> + if (purged != rq_purge)
> + pr_debug(DRV_PFX " unable to meet persistent grants purge requirements for device %#x, domain %u, requested %d done %d\n",
> + blkif->domid,
> + blkif->vbd.handle,
> + rq_purge, purged);
> + blkif->persistent_gnt_c -= purged;
> + blkif->vbd.overflow_max_grants = 0;
> + }
> + blkif->next_lru = jiffies +
> + msecs_to_jiffies(xen_blkif_lru_interval);
> + }
> +
> if (log_stats && time_after(jiffies, blkif->st_print))
> print_stats(blkif);
> }
> @@ -453,13 +567,18 @@ static void xen_blkbk_unmap(struct pending_req *req)
> {
> struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> + struct persistent_gnt *persistent_gnt;
> unsigned int i, invcount = 0;
> grant_handle_t handle;
> int ret;
>
> for (i = 0; i < req->nr_pages; i++) {
> - if (!test_bit(i, req->unmap_seg))
> + if (req->persistent_gnts[i] != NULL) {
> + persistent_gnt = req->persistent_gnts[i];
> + persistent_gnt->flags |= PERSISTENT_GNT_USED;
> + persistent_gnt->flags &= ~PERSISTENT_GNT_ACTIVE;
> continue;
> + }
> handle = pending_handle(req, i);
> if (handle == BLKBACK_INVALID_HANDLE)
> continue;
> @@ -480,8 +599,8 @@ static int xen_blkbk_map(struct blkif_request *req,
> struct page *pages[])
> {
> struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> - struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> + struct persistent_gnt **persistent_gnts = pending_req->persistent_gnts;
> struct persistent_gnt *persistent_gnt = NULL;
> struct xen_blkif *blkif = pending_req->blkif;
> phys_addr_t addr = 0;
> @@ -494,9 +613,6 @@ static int xen_blkbk_map(struct blkif_request *req,
>
> use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
>
> - BUG_ON(blkif->persistent_gnt_c >
> - max_mapped_grant_pages(pending_req->blkif->blk_protocol));
> -
> /*
> * Fill out preq.nr_sects with proper amount of sectors, and setup
> * assign map[..] with the PFN of the page in our domain with the
> @@ -516,9 +632,9 @@ static int xen_blkbk_map(struct blkif_request *req,
> * the grant is already mapped
> */
> new_map = false;
> + persistent_gnt->flags |= PERSISTENT_GNT_ACTIVE;
> } else if (use_persistent_gnts &&
> - blkif->persistent_gnt_c <
> - max_mapped_grant_pages(blkif->blk_protocol)) {
> + blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
> /*
> * We are using persistent grants, the grant is
> * not mapped but we have room for it
> @@ -536,6 +652,7 @@ static int xen_blkbk_map(struct blkif_request *req,
> }
> persistent_gnt->gnt = req->u.rw.seg[i].gref;
> persistent_gnt->handle = BLKBACK_INVALID_HANDLE;
> + persistent_gnt->flags = PERSISTENT_GNT_ACTIVE;
>
> pages_to_gnt[segs_to_map] =
> persistent_gnt->page;
> @@ -547,7 +664,7 @@ static int xen_blkbk_map(struct blkif_request *req,
> blkif->persistent_gnt_c++;
> pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
> persistent_gnt->gnt, blkif->persistent_gnt_c,
> - max_mapped_grant_pages(blkif->blk_protocol));
> + xen_blkif_max_pgrants);
> } else {
> /*
> * We are either using persistent grants and
> @@ -557,7 +674,7 @@ static int xen_blkbk_map(struct blkif_request *req,
> if (use_persistent_gnts &&
> !blkif->vbd.overflow_max_grants) {
> blkif->vbd.overflow_max_grants = 1;
> - pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
> + pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
> blkif->domid, blkif->vbd.handle);
> }
> new_map = true;
> @@ -595,7 +712,6 @@ static int xen_blkbk_map(struct blkif_request *req,
> * so that when we access vaddr(pending_req,i) it has the contents of
> * the page from the other domain.
> */
> - bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
> for (i = 0, j = 0; i < nseg; i++) {
> if (!persistent_gnts[i] ||
> persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
> @@ -634,7 +750,6 @@ static int xen_blkbk_map(struct blkif_request *req,
> (req->u.rw.seg[i].first_sect << 9);
> } else {
> pending_handle(pending_req, i) = map[j].handle;
> - bitmap_set(pending_req->unmap_seg, i, 1);
>
> if (ret) {
> j++;
> diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
> index f338f8a..bd44d75 100644
> --- a/drivers/block/xen-blkback/common.h
> +++ b/drivers/block/xen-blkback/common.h
> @@ -167,11 +167,14 @@ struct xen_vbd {
>
> struct backend_info;
>
> +#define PERSISTENT_GNT_ACTIVE 0x1
> +#define PERSISTENT_GNT_USED 0x2
>
> struct persistent_gnt {
> struct page *page;
> grant_ref_t gnt;
> grant_handle_t handle;
> + uint8_t flags;
> struct rb_node node;
> };
>
> @@ -204,6 +207,7 @@ struct xen_blkif {
> /* tree to store persistent grants */
> struct rb_root persistent_gnts;
> unsigned int persistent_gnt_c;
> + unsigned long next_lru;
>
> /* statistics */
> unsigned long st_print;
> diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
> index 5e237f6..abb399a 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -116,6 +116,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
> init_completion(&blkif->drain_complete);
> atomic_set(&blkif->drain, 0);
> blkif->st_print = jiffies;
> + blkif->next_lru = jiffies;
> init_waitqueue_head(&blkif->waiting_to_free);
> blkif->persistent_gnts.rb_node = NULL;
>
> --
> 1.7.7.5 (Apple Git-26)
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/