[RFC PATCH 1/2] block: strict rq_affinity

From: Dan Williams
Date: Fri Jul 22 2011 - 16:59:46 EST


Some storage controllers benefit from completions always being steered
to the strict requester cpu rather than the looser "per-socket" steering
that blk_cpu_to_group() attempts by default.

echo 2 > /sys/block/<bdev>/queue/rq_affinity

Cc: Christoph Hellwig <hch@xxxxxxxxxxxxx>
Cc: Roland Dreier <roland@xxxxxxxxxxxxxxx>
Tested-by: Dave Jiang <dave.jiang@xxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
Documentation/block/queue-sysfs.txt | 10 +++++++---
block/blk-core.c | 6 ++----
block/blk-softirq.c | 11 +++++++----
block/blk-sysfs.c | 13 +++++++++----
include/linux/blkdev.h | 3 ++-
5 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index f652740..d8147b3 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -45,9 +45,13 @@ device.

rq_affinity (RW)
----------------
-If this option is enabled, the block layer will migrate request completions
-to the CPU that originally submitted the request. For some workloads
-this provides a significant reduction in CPU cycles due to caching effects.
+If this option is '1', the block layer will migrate request completions to the
+cpu "group" that originally submitted the request. For some workloads this
+provides a significant reduction in CPU cycles due to caching effects.
+
+For storage configurations that need to maximize distribution of completion
+processing setting this option to '2' forces the completion to run on the
+requesting cpu (bypassing the "group" aggregation logic).

scheduler (RW)
--------------
diff --git a/block/blk-core.c b/block/blk-core.c
index d2f8f40..9c7ba87 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1279,10 +1279,8 @@ get_rq:
init_request_from_bio(req, bio);

if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
- bio_flagged(bio, BIO_CPU_AFFINE)) {
- req->cpu = blk_cpu_to_group(get_cpu());
- put_cpu();
- }
+ bio_flagged(bio, BIO_CPU_AFFINE))
+ req->cpu = smp_processor_id();

plug = current->plug;
if (plug) {
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ee9c216..475fab8 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -103,22 +103,25 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {

void __blk_complete_request(struct request *req)
{
+ int ccpu, cpu, group_cpu = NR_CPUS;
struct request_queue *q = req->q;
unsigned long flags;
- int ccpu, cpu, group_cpu;

BUG_ON(!q->softirq_done_fn);

local_irq_save(flags);
cpu = smp_processor_id();
- group_cpu = blk_cpu_to_group(cpu);

/*
* Select completion CPU
*/
- if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
+ if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
ccpu = req->cpu;
- else
+ if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
+ ccpu = blk_cpu_to_group(ccpu);
+ group_cpu = blk_cpu_to_group(cpu);
+ }
+ } else
ccpu = cpu;

if (ccpu == cpu || ccpu == group_cpu) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d935bd8..0ee17b5 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -244,8 +244,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
{
bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
+ bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);

- return queue_var_show(set, page);
+ return queue_var_show(set << force, page);
}

static ssize_t
@@ -257,10 +258,14 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)

ret = queue_var_store(&val, page, count);
spin_lock_irq(q->queue_lock);
- if (val)
+ if (val) {
queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
- else
- queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+ if (val == 2)
+ queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
+ } else {
+ queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+ queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+ }
spin_unlock_irq(q->queue_lock);
#endif
return ret;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1a23722..b228825 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -393,7 +393,7 @@ struct request_queue
#define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */
#define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */
#define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */
-#define QUEUE_FLAG_SAME_COMP 9 /* force complete on same CPU */
+#define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */
#define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */
#define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */
#define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */
@@ -403,6 +403,7 @@ struct request_queue
#define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */
#define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */
#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */
+#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */

#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_STACKABLE) | \

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/