[PATCH] block: Limit number of items taken from the I/O scheduler in one go

From: Salman Qazi
Date: Mon Feb 03 2020 - 15:59:59 EST


We observed that it is possible for a flush to bypass the I/O
scheduler and get added to hctx->dispatch in blk_mq_sched_bypass_insert.
This can happen while a kworker is running blk_mq_do_dispatch_sched call
in blk_mq_sched_dispatch_requests.

However, the blk_mq_do_dispatch_sched call doesn't end in bounded time.
As a result, the flush can sit there indefinitely, as the I/O scheduler
feeds an arbitrary number of requests to the hardware.

The solution is to periodically poll hctx->dispatch in
blk_mq_do_dispatch_sched, to put a bound on the latency of the commands
sitting there.

Signed-off-by: Salman Qazi <sqazi@xxxxxxxxxx>
---
block/blk-mq-sched.c | 6 ++++++
block/blk-mq.c | 4 ++++
block/blk-sysfs.c | 33 +++++++++++++++++++++++++++++++++
include/linux/blkdev.h | 2 ++
4 files changed, 45 insertions(+)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ca22afd47b3d..75cdec64b9c7 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -90,6 +90,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
LIST_HEAD(rq_list);
+ int count = 0;

do {
struct request *rq;
@@ -97,6 +98,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
break;

+ if (count > 0 && count % q->max_sched_batch == 0 &&
+ !list_empty_careful(&hctx->dispatch))
+ break;
+
if (!blk_mq_get_dispatch_budget(hctx))
break;

@@ -112,6 +117,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
* in blk_mq_dispatch_rq_list().
*/
list_add(&rq->queuelist, &rq_list);
+ count++;
} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
}

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a12b1763508d..7cb13aa72a94 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -40,6 +40,8 @@
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

+#define BLK_MQ_DEFAULT_MAX_SCHED_BATCH 100
+
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

@@ -2934,6 +2936,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
*/
q->poll_nsec = BLK_MQ_POLL_CLASSIC;

+ q->max_sched_batch = BLK_MQ_DEFAULT_MAX_SCHED_BATCH;
+
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index fca9b158f4a0..dd7b58a1bd35 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -390,6 +390,32 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
return count;
}

+static ssize_t queue_max_sched_batch_show(struct request_queue *q, char *page)
+{
+ return sprintf(page, "%d\n", q->max_sched_batch);
+}
+
+static ssize_t queue_max_sched_batch_store(struct request_queue *q,
+ const char *page,
+ size_t count)
+{
+ int err, val;
+
+ if (!q->mq_ops)
+ return -EINVAL;
+
+ err = kstrtoint(page, 10, &val);
+ if (err < 0)
+ return err;
+
+ if (val <= 0)
+ return -EINVAL;
+
+ q->max_sched_batch = val;
+
+ return count;
+}
+
static ssize_t queue_poll_show(struct request_queue *q, char *page)
{
return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
@@ -691,6 +717,12 @@ static struct queue_sysfs_entry queue_poll_delay_entry = {
.store = queue_poll_delay_store,
};

+static struct queue_sysfs_entry queue_max_sched_batch_entry = {
+ .attr = {.name = "max_sched_batch", .mode = 0644 },
+ .show = queue_max_sched_batch_show,
+ .store = queue_max_sched_batch_store,
+};
+
static struct queue_sysfs_entry queue_wc_entry = {
.attr = {.name = "write_cache", .mode = 0644 },
.show = queue_wc_show,
@@ -763,6 +795,7 @@ static struct attribute *queue_attrs[] = {
&queue_wb_lat_entry.attr,
&queue_poll_delay_entry.attr,
&queue_io_timeout_entry.attr,
+ &queue_max_sched_batch_entry.attr,
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
&throtl_sample_time_entry.attr,
#endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 053ea4b51988..68e7d29d4dd4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -477,6 +477,8 @@ struct request_queue {
unsigned int rq_timeout;
int poll_nsec;

+ int max_sched_batch;
+
struct blk_stat_callback *poll_cb;
struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS];

--
2.25.0.341.g760bfbb309-goog