[PATCH RFC 3/5] blk-mq: Facilitate a shared tags per tagset

From: John Garry
Date: Wed Nov 13 2019 - 08:40:48 EST


Some SCSI HBAs (such as HPSA, megaraid, mpt3sas, hisi_sas_v3 ..) support
multiple reply queues with single hostwide tags.

In addition, these drivers want to use interrupt assignment in
pci_alloc_irq_vectors(PCI_IRQ_AFFINITY). However, as discussed in [0],
CPU hotplug may cause in-flight IO completion to not be serviced when an
interrupt is shutdown.

To solve that problem, Ming's patchset to drain hctx's should ensure no
IOs are missed in-flight [1].

However, to take advantage of that patchset, we need to map the HBA HW
queues to blk mq hctx's; to do that, we need to expose the HBA HW queues.

In making that transition, the per-SCSI command request tags are no
longer unique per Scsi host - they are just unique per hctx. As such, the
HBA LLDD would have to generate this tag internally, which has a certain
performance overhead.

However another problem is that blk mq assumes the host may accept
(Scsi_host.can_queue * #hw queue) commands. In [2], we removed the Scsi
host busy counter, which would stop the LLDD being sent more than
.can_queue commands; however, we should still ensure that the block layer
does not issue more than .can_queue commands to the Scsi host.

To solve this problem, introduce a shared tags per blk_mq_tag_set, which
may be requested when allocating the tagset.

New flag BLK_MQ_F_TAG_HCTX_SHARED should be set when requesting the
tagset.

This is based on work originally from Ming Lei in [3].

[0] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@xxxxxxxxxxxxxxxxxxxxxxx/
[1] https://lore.kernel.org/linux-block/20191014015043.25029-1-ming.lei@xxxxxxxxxx/
[2] https://lore.kernel.org/linux-scsi/20191025065855.6309-1-ming.lei@xxxxxxxxxx/
[3] https://lore.kernel.org/linux-block/20190531022801.10003-1-ming.lei@xxxxxxxxxx/

Signed-off-by: John Garry <john.garry@xxxxxxxxxx>
---
block/blk-core.c | 1 +
block/blk-flush.c | 2 +
block/blk-mq-debugfs.c | 2 +-
block/blk-mq-tag.c | 85 ++++++++++++++++++++++++++++++++++++++++++
block/blk-mq-tag.h | 1 +
block/blk-mq.c | 61 +++++++++++++++++++++++++-----
block/blk-mq.h | 9 +++++
include/linux/blk-mq.h | 3 ++
include/linux/blkdev.h | 1 +
9 files changed, 155 insertions(+), 10 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d5e668ec751b..79eb8983ed45 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -116,6 +116,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = -1;
+ rq->shared_tag = -1;
rq->internal_tag = -1;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 1eec9cbe5a0a..b9ad9a5978f5 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -228,6 +228,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
if (!q->elevator) {
blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
flush_rq->tag = -1;
+ flush_rq->shared_tag = -1;
} else {
blk_mq_put_driver_tag(flush_rq);
flush_rq->internal_tag = -1;
@@ -309,6 +310,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
if (!q->elevator) {
fq->orig_rq = first_rq;
flush_rq->tag = first_rq->tag;
+ flush_rq->shared_tag = first_rq->shared_tag;
blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
} else {
flush_rq->internal_tag = first_rq->internal_tag;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 33a40ae1d60f..dc90c42aeb9a 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -339,7 +339,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
ARRAY_SIZE(rqf_name));
seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq)));
- seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
+ seq_printf(m, ", .tag=%d, .shared_tag=%d, .internal_tag=%d", rq->tag, rq->shared_tag,
rq->internal_tag);
if (mq_ops->show_rq)
mq_ops->show_rq(m, rq);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index d7aa23c82dbf..0a6c8a6b05dd 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -191,6 +191,91 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
return tag + tag_offset;
}

+ /* We could factor this out */
+unsigned int blk_mq_get_shared_tag(struct blk_mq_alloc_data *data)
+{
+ struct blk_mq_tags *tags = blk_mq_shared_tags_from_data(data);
+ struct sbitmap_queue *bt;
+ struct sbq_wait_state *ws;
+ DEFINE_SBQ_WAIT(wait);
+ unsigned int tag_offset;
+ int tag;
+
+ if (data->flags & BLK_MQ_REQ_RESERVED) {
+ if (unlikely(!tags->nr_reserved_tags)) {
+ WARN_ON_ONCE(1);
+ return BLK_MQ_TAG_FAIL;
+ }
+ bt = &tags->breserved_tags;
+ tag_offset = 0;
+ } else {
+ bt = &tags->bitmap_tags;
+ tag_offset = tags->nr_reserved_tags;
+ }
+
+ tag = __blk_mq_get_tag(data, bt);
+ if (tag != -1)
+ goto found_tag;
+
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
+ return BLK_MQ_TAG_FAIL;
+
+ ws = bt_wait_ptr(bt, data->hctx);
+ do {
+ struct sbitmap_queue *bt_prev;
+
+ /*
+ * We're out of tags on this hardware queue, kick any
+ * pending IO submits before going to sleep waiting for
+ * some to complete.
+ */
+ blk_mq_run_hw_queues(data->q, false);
+
+ /*
+ * Retry tag allocation after running the hardware queue,
+ * as running the queue may also have found completions.
+ */
+ tag = __blk_mq_get_tag(data, bt);
+ if (tag != -1)
+ break;
+
+ sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
+
+ tag = __blk_mq_get_tag(data, bt);
+ if (tag != -1)
+ break;
+
+
+ bt_prev = bt;
+ io_schedule();
+
+ sbitmap_finish_wait(bt, ws, &wait);
+
+ data->ctx = blk_mq_get_ctx(data->q);
+ data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
+ data->ctx);
+ tags = blk_mq_tags_from_data(data);
+ if (data->flags & BLK_MQ_REQ_RESERVED)
+ bt = &tags->breserved_tags;
+ else
+ bt = &tags->bitmap_tags;
+
+ /*
+ * If destination hw queue is changed, fake wake up on
+ * previous queue for compensating the wake up miss, so
+ * other allocations on previous queue won't be starved.
+ */
+ if (bt != bt_prev)
+ sbitmap_queue_wake_up(bt_prev);
+
+ ws = bt_wait_ptr(bt, data->hctx);
+ } while (1);
+
+ sbitmap_finish_wait(bt, ws, &wait);
+
+found_tag:
+ return tag + tag_offset;
+}
+
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag)
{
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 88b85daa4976..82ff8faa70f7 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -26,6 +26,7 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r
extern void blk_mq_free_tags(struct blk_mq_tags *tags);

extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
+extern unsigned int blk_mq_get_shared_tag(struct blk_mq_alloc_data *data);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag);
extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6b39cf0efdcd..792eae37dc44 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -292,7 +292,7 @@ static inline bool blk_mq_need_time_stamp(struct request *rq)
}

static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
- unsigned int tag, unsigned int op, u64 alloc_time_ns)
+ unsigned int tag, unsigned int shared_tag, unsigned int op, u64 alloc_time_ns)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
@@ -300,6 +300,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,

if (data->flags & BLK_MQ_REQ_INTERNAL) {
rq->tag = -1;
+ rq->shared_tag = -1;
rq->internal_tag = tag;
} else {
if (data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) {
@@ -307,6 +308,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
atomic_inc(&data->hctx->nr_active);
}
rq->tag = tag;
+ rq->shared_tag = shared_tag;
rq->internal_tag = -1;
data->hctx->tags->rqs[rq->tag] = rq;
}
@@ -359,7 +361,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
{
struct elevator_queue *e = q->elevator;
struct request *rq;
- unsigned int tag;
+ unsigned int tag, shared_tag = BLK_MQ_TAG_FAIL;
bool clear_ctx_on_error = false;
u64 alloc_time_ns = 0;

@@ -396,15 +398,17 @@ static struct request *blk_mq_get_request(struct request_queue *q,
blk_mq_tag_busy(data->hctx);
}

- tag = blk_mq_get_tag(data);
- if (tag == BLK_MQ_TAG_FAIL) {
- if (clear_ctx_on_error)
- data->ctx = NULL;
- blk_queue_exit(q);
- return NULL;
+ if (data->hctx->shared_tags) {
+ shared_tag = blk_mq_get_shared_tag(data);
+ if (shared_tag == BLK_MQ_TAG_FAIL)
+ goto err_shared_tag;
}

- rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
+ tag = blk_mq_get_tag(data);
+ if (tag == BLK_MQ_TAG_FAIL)
+ goto err_tag;
+
+ rq = blk_mq_rq_ctx_init(data, tag, shared_tag, data->cmd_flags, alloc_time_ns);
if (!op_is_flush(data->cmd_flags)) {
rq->elv.icq = NULL;
if (e && e->type->ops.prepare_request) {
@@ -417,6 +421,15 @@ static struct request *blk_mq_get_request(struct request_queue *q,
}
data->hctx->queued++;
return rq;
+
+err_tag:
+ if (shared_tag != BLK_MQ_TAG_FAIL)
+ blk_mq_put_tag(data->hctx->shared_tags, data->ctx, shared_tag);
+err_shared_tag:
+ if (clear_ctx_on_error)
+ data->ctx = NULL;
+ blk_queue_exit(q);
+ return NULL;
}

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
@@ -498,6 +511,8 @@ static void __blk_mq_free_request(struct request *rq)

blk_pm_mark_last_busy(rq);
rq->mq_hctx = NULL;
+ if (rq->shared_tag != -1)
+ blk_mq_put_tag(hctx->shared_tags, ctx, rq->shared_tag);
if (rq->tag != -1)
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
if (sched_tag != -1)
@@ -1070,6 +1085,14 @@ bool blk_mq_get_driver_tag(struct request *rq)
data.flags |= BLK_MQ_REQ_RESERVED;

shared = blk_mq_tag_busy(data.hctx);
+ if (rq && rq->mq_hctx && rq->mq_hctx->shared_tags) {
+ rq->shared_tag = blk_mq_get_shared_tag(&data);
+ if (rq->shared_tag == BLK_MQ_TAG_FAIL) {
+ blk_mq_put_tag(rq->mq_hctx->tags, rq->mq_ctx, rq->tag);
+ rq->tag = -1;
+ goto done;
+ }
+ }
rq->tag = blk_mq_get_tag(&data);
if (rq->tag >= 0) {
if (shared) {
@@ -1077,6 +1100,9 @@ bool blk_mq_get_driver_tag(struct request *rq)
atomic_inc(&data.hctx->nr_active);
}
data.hctx->tags->rqs[rq->tag] = rq;
+ } else if (rq->shared_tag >= 0) {
+ blk_mq_put_tag(rq->mq_hctx->tags, rq->mq_ctx, rq->tag);
+ rq->shared_tag = -1;
}

done:
@@ -2317,6 +2343,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);

hctx->tags = set->tags[hctx_idx];
+ hctx->shared_tags = set->shared_tags;

if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
@@ -3100,6 +3127,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (ret)
goto out_free_mq_map;

+ if (set->flags & BLK_MQ_F_TAG_HCTX_SHARED) {
+ int node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], 0);
+ int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
+
+ if (node == NUMA_NO_NODE)
+ node = set->numa_node;
+
+ set->shared_tags = blk_mq_init_tags(set->queue_depth,
+ set->reserved_tags,
+ node, alloc_policy);
+ if (!set->shared_tags) {
+ ret = -ENOMEM;
+ goto out_free_mq_map;
+ }
+ }
+
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);

diff --git a/block/blk-mq.h b/block/blk-mq.h
index 78d38b5f2793..c328d335de7d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -174,6 +174,14 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
return data->hctx->tags;
}

+static inline struct blk_mq_tags *blk_mq_shared_tags_from_data(struct blk_mq_alloc_data *data)
+{
+ if (data->flags & BLK_MQ_REQ_INTERNAL)
+ return data->hctx->sched_tags;
+
+ return data->hctx->shared_tags;
+}
+
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
{
return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
@@ -210,6 +218,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
{
blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
rq->tag = -1;
+ rq->shared_tag = -1;

if (rq->rq_flags & RQF_MQ_INFLIGHT) {
rq->rq_flags &= ~RQF_MQ_INFLIGHT;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 147185394a25..d3b402bd01a9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -46,6 +46,7 @@ struct blk_mq_hw_ctx {
atomic_t wait_index;

struct blk_mq_tags *tags;
+ struct blk_mq_tags *shared_tags;
struct blk_mq_tags *sched_tags;

unsigned long queued;
@@ -109,6 +110,7 @@ struct blk_mq_tag_set {
unsigned int flags; /* BLK_MQ_F_* */
void *driver_data;

+ struct blk_mq_tags *shared_tags;
struct blk_mq_tags **tags;

struct mutex tag_list_lock;
@@ -226,6 +228,7 @@ struct blk_mq_ops {
enum {
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
+ BLK_MQ_F_TAG_HCTX_SHARED = 1 << 2,
BLK_MQ_F_BLOCKING = 1 << 5,
BLK_MQ_F_NO_SCHED = 1 << 6,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3ea78b0c91c..a4caa6407b3a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -138,6 +138,7 @@ struct request {
req_flags_t rq_flags;

int tag;
+ int shared_tag;
int internal_tag;

/* the following two fields are internal, NEVER access directly */
--
2.17.1