[PATCH 01/03] block: prepare I/O context code for BFQ

From: Fabio Checconi
Date: Tue Nov 11 2008 - 07:51:38 EST


BFQ uses struct cfq_io_context to store its per-process per-device data,
reusing the same code for cic handling of CFQ. The code is not shared
ATM to minimize the impact of these patches.

This patch introduces a new hlist to each io_context to store all the
cic's allocated by BFQ to allow calling the right destructor on module
unload; the radix tree used for cic lookup needs to be duplicated
because it can contain dead keys inserted by a scheduler and later
retrieved by the other one.

Update the io_context exit and free paths to take care also of
the BFQ cic's.

Change the type of cfqq inside struct cfq_io_context to void *
to use it also for BFQ per-queue data.

A new bfq-specific ioprio_changed field is necessary, too, to avoid
clobbering cfq's one, so switch ioprio_changed to a bitmap, with one
element per scheduler.

Signed-off-by: Fabio Checconi <fabio@xxxxxxxxxxxxxxxx>
Signed-off-by: Paolo Valente <paolo.valente@xxxxxxxxxx>
---
block/blk-ioc.c | 27 ++++++++++++++++-----------
block/cfq-iosched.c | 10 +++++++---
fs/ioprio.c | 9 +++++++--
include/linux/iocontext.h | 18 +++++++++++++++---
4 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 012f065..c18bb82 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
+#include <linux/bitmap.h>
#include <linux/blkdev.h>
#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */

@@ -15,13 +16,12 @@
*/
static struct kmem_cache *iocontext_cachep;

-static void cfq_dtor(struct io_context *ioc)
+static void hlist_sched_dtor(struct io_context *ioc, struct hlist_head *list)
{
- if (!hlist_empty(&ioc->cic_list)) {
+ if (!hlist_empty(list)) {
struct cfq_io_context *cic;

- cic = list_entry(ioc->cic_list.first, struct cfq_io_context,
- cic_list);
+ cic = list_entry(list->first, struct cfq_io_context, cic_list);
cic->dtor(ioc);
}
}
@@ -41,7 +41,9 @@ int put_io_context(struct io_context *ioc)
rcu_read_lock();
if (ioc->aic && ioc->aic->dtor)
ioc->aic->dtor(ioc->aic);
- cfq_dtor(ioc);
+
+ hlist_sched_dtor(ioc, &ioc->cic_list);
+ hlist_sched_dtor(ioc, &ioc->bfq_cic_list);
rcu_read_unlock();

kmem_cache_free(iocontext_cachep, ioc);
@@ -51,18 +53,18 @@ int put_io_context(struct io_context *ioc)
}
EXPORT_SYMBOL(put_io_context);

-static void cfq_exit(struct io_context *ioc)
+static void hlist_sched_exit(struct io_context *ioc, struct hlist_head *list)
{
rcu_read_lock();

- if (!hlist_empty(&ioc->cic_list)) {
+ if (!hlist_empty(list)) {
struct cfq_io_context *cic;

- cic = list_entry(ioc->cic_list.first, struct cfq_io_context,
- cic_list);
+ cic = list_entry(list->first, struct cfq_io_context, cic_list);
cic->exit(ioc);
}
rcu_read_unlock();
+
}

/* Called by the exitting task */
@@ -78,7 +80,8 @@ void exit_io_context(void)
if (atomic_dec_and_test(&ioc->nr_tasks)) {
if (ioc->aic && ioc->aic->exit)
ioc->aic->exit(ioc->aic);
- cfq_exit(ioc);
+ hlist_sched_exit(ioc, &ioc->cic_list);
+ hlist_sched_exit(ioc, &ioc->bfq_cic_list);

put_io_context(ioc);
}
@@ -93,13 +96,15 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
atomic_set(&ret->refcount, 1);
atomic_set(&ret->nr_tasks, 1);
spin_lock_init(&ret->lock);
- ret->ioprio_changed = 0;
+ bitmap_zero(ret->ioprio_changed, IOC_IOPRIO_CHANGED_BITS);
ret->ioprio = 0;
ret->last_waited = jiffies; /* doesn't matter... */
ret->nr_batch_requests = 0; /* because this is 0 */
ret->aic = NULL;
INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
INIT_HLIST_HEAD(&ret->cic_list);
+ INIT_RADIX_TREE(&ret->bfq_radix_root, GFP_ATOMIC | __GFP_HIGH);
+ INIT_HLIST_HEAD(&ret->bfq_cic_list);
ret->ioc_data = NULL;
}

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1e2aff8..ab503ec 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1417,7 +1417,6 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
static void cfq_ioc_set_ioprio(struct io_context *ioc)
{
call_for_each_cic(ioc, changed_ioprio);
- ioc->ioprio_changed = 0;
}

static struct cfq_queue *
@@ -1664,8 +1663,13 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
goto err_free;

out:
- smp_read_barrier_depends();
- if (unlikely(ioc->ioprio_changed))
+ /*
+ * test_and_clear_bit() implies a memory barrier, paired with
+ * the wmb() in fs/ioprio.c, so the value seen for ioprio is the
+ * new one.
+ */
+ if (unlikely(test_and_clear_bit(IOC_CFQ_IOPRIO_CHANGED,
+ ioc->ioprio_changed)))
cfq_ioc_set_ioprio(ioc);

return cic;
diff --git a/fs/ioprio.c b/fs/ioprio.c
index da3cc46..9296ac3 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -29,7 +29,7 @@

static int set_task_ioprio(struct task_struct *task, int ioprio)
{
- int err;
+ int err, i;
struct io_context *ioc;

if (task->uid != current->euid &&
@@ -53,12 +53,17 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
err = -ENOMEM;
break;
}
+ /* let other ioc users see the new values */
+ smp_wmb();
task->io_context = ioc;
} while (1);

if (!err) {
ioc->ioprio = ioprio;
- ioc->ioprio_changed = 1;
+ /* make sure schedulers see the new ioprio value */
+ wmb();
+ for (i = 0; i < IOC_IOPRIO_CHANGED_BITS; i++)
+ set_bit(i, ioc->ioprio_changed);
}

task_unlock(task);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 08b987b..335bbb9 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -1,6 +1,7 @@
#ifndef IOCONTEXT_H
#define IOCONTEXT_H

+#include <linux/bitmap.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

@@ -30,12 +31,11 @@ struct as_io_context {
sector_t seek_mean;
};

-struct cfq_queue;
struct cfq_io_context {
void *key;
unsigned long dead_key;

- struct cfq_queue *cfqq[2];
+ void *cfqq[2];

struct io_context *ioc;

@@ -60,6 +60,16 @@ struct cfq_io_context {
};

/*
+ * Indexes into the ioprio_changed bitmap. A bit set indicates that
+ * the corresponding I/O scheduler needs to see a ioprio update.
+ */
+enum {
+ IOC_CFQ_IOPRIO_CHANGED,
+ IOC_BFQ_IOPRIO_CHANGED,
+ IOC_IOPRIO_CHANGED_BITS
+};
+
+/*
* I/O subsystem state of the associated processes. It is refcounted
* and kmalloc'ed. These could be shared between processes.
*/
@@ -71,7 +81,7 @@ struct io_context {
spinlock_t lock;

unsigned short ioprio;
- unsigned short ioprio_changed;
+ DECLARE_BITMAP(ioprio_changed, IOC_IOPRIO_CHANGED_BITS);

/*
* For request batching
@@ -82,6 +92,8 @@ struct io_context {
struct as_io_context *aic;
struct radix_tree_root radix_root;
struct hlist_head cic_list;
+ struct radix_tree_root bfq_radix_root;
+ struct hlist_head bfq_cic_list;
void *ioc_data;
};

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/