[RFC PATCH 4/9] ublk_drv: refactor __ublk_rq_task_work() and aborting machenism

From: ZiyangZhang
Date: Wed Aug 24 2022 - 01:49:25 EST


If one rq is handled by io_uring_cmd_complete_in_task(), after a crash
this rq is actually handled by an io_uring fallback wq. We have to
end(abort) this rq since this fallback wq is a task other than the
crashed task. However, current code does not call io_uring_cmd_done()
at the same time but do it in ublk_cancel_queue(). With current design,
this does work because ublk_cancel_queue() is called AFTER del_gendisk(),
which waits for the rq ended(aborted) in fallback wq. This implies that
fallback wq on this rq is scheduled BEFORE calling io_uring_cmd_done()
on the corresponding ioucmd in ublk_cancel_queue().

However, while considering recovery feature, we cannot rely on
del_gendisk() or blk_mq_freeze_queue() to wait for completion of all
rqs because we may not want any aborted rq. Besides, io_uring does not
provide "flush fallback" machenism so we cannot trace this ioucmd.

The recovery machenism needs to complete all ioucmds of a dying ubq
to avoid leaking io_uring ctx. But as talked above, we are unsafe
to call io_uring_cmd_done() in the recovery task if fallback wq happens
to run simultaneously. This is a UAF case because io_uring ctx may be
freed. Actually a similar case happens in
(5804987b7272f437299011c76b7363b8df6f8515: ublk_drv: do not add a
re-issued request aborted previously to ioucmd's task_work).

Besides, in order to implement recovery machenism, in ublk_queue_rq()
and __ublk_rq_task_work(), we should not end(abort) current rq while
ubq_daemon is dying. Instead, we should wait for new ubq_daemon getting
ready and requeue it.

In summary, We refactor some code to avoid the UAF problem and prepare
for recovery machenism:

(1) Refactor monitor_work
Monitor_work is only used without recovery feature which aborts rqs and
stops the device. Now ublk_abort_queue() is the only function end(abort)
inflight rqs with a dying ubq_daemon. While for a not inflight(idle)
rq, its ioucmd is completed by io_uring_cmd_done() safely. We do not
rely on UBLK_IO_FLAG_ACTIVE anymore. monitor_work also sets 'force_abort'
for a dying ubq.

(2) Refactor ublk_queue_rq()
Check 'force_abort' before blk_mq_start_request(). If 'force_abort' is
true, end(abort) current rq immediately. 'force_abort' is set by
monitor_work for a dying ubq. Aborting rqs ASAP ensures that all rqs'
status do not change while we traverse rqs in monitor_work.

(3) Refactor __ublk_rq_task_work().
No matter we use task_work_add() or io_uring_cmd_complete_in_task(),
now __ublk_rq_task_work() only accepts one arg: ioucmd, which is set in
ublk_queue_rq() eariler. And no matter ubq_daemon is dying or not,
we always call io_uring_cmd_done(ioucmd). Note that ioucmd might not be
the same as io->cmd because a new ubq_daemon may set new ioucmds before
old fallback wq or exit_task_work runs. In this way the old ioucmd
can be safely freed eventually and io->cmd can be updated without race.

(4) Refactor ublk_cancel_dev()
ublk_cancel_dev() cannot complete ioucmds for a dying ubq because we
have done this in monitor_work.

Signed-off-by: ZiyangZhang <ZiyangZhang@xxxxxxxxxxxxxxxxx>
---
drivers/block/ublk_drv.c | 216 +++++++++++++++++++++------------------
1 file changed, 115 insertions(+), 101 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 8add6e3ae15f..1b1c0190bba4 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -54,12 +54,10 @@
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)

-struct ublk_rq_data {
- struct callback_head work;
-};
-
struct ublk_uring_cmd_pdu {
- struct request *req;
+ int q_id;
+ int tag;
+ struct callback_head work;
};

/*
@@ -122,6 +120,7 @@ struct ublk_queue {
bool abort_work_pending;
unsigned short nr_io_ready; /* how many ios setup */
struct ublk_device *dev;
+ bool force_abort;
struct ublk_io ios[0];
};

@@ -610,24 +609,6 @@ static void ublk_complete_rq(struct request *req)
__blk_mq_end_request(req, BLK_STS_OK);
}

-/*
- * Since __ublk_rq_task_work always fails requests immediately during
- * exiting, __ublk_fail_req() is only called from abort context during
- * exiting. So lock is unnecessary.
- *
- * Also aborting may not be started yet, keep in mind that one failed
- * request may be issued by block layer again.
- */
-static void __ublk_fail_req(struct ublk_io *io, struct request *req)
-{
- WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
-
- if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
- io->flags |= UBLK_IO_FLAG_ABORTED;
- blk_mq_end_request(req, BLK_STS_IOERR);
- }
-}
-
static void ubq_complete_io_cmd(struct ublk_io *io, int res)
{
/* mark this cmd owned by ublksrv */
@@ -645,18 +626,14 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res)

#define UBLK_REQUEUE_DELAY_MS 3

-static inline void __ublk_rq_task_work(struct request *req)
+static inline void __ublk_rq_task_work(struct io_uring_cmd *cmd)
{
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
- struct ublk_device *ub = ubq->dev;
- int tag = req->tag;
- struct ublk_io *io = &ubq->ios[tag];
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ struct ublk_device *ub = cmd->file->private_data;
+ struct ublk_queue *ubq = ublk_get_queue(ub, pdu->q_id);
+ struct ublk_io *io;
+ struct request *req;
unsigned int mapped_bytes;
-
- pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
- ublk_get_iod(ubq, req->tag)->addr);
-
/*
* Task is exiting if either:
*
@@ -667,11 +644,22 @@ static inline void __ublk_rq_task_work(struct request *req)
* (2) current->flags & PF_EXITING.
*/
if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
- blk_mq_end_request(req, BLK_STS_IOERR);
- mod_delayed_work(system_wq, &ub->monitor_work, 0);
+ pr_devel("%s: (%s) done old cmd: qid %d tag %d\n",
+ __func__,
+ current != ubq->ubq_daemon ? "fallback wq" : "exit_task_work",
+ pdu->q_id, pdu->tag);
+ io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, 0);
return;
}

+ io = &ubq->ios[pdu->tag];
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], pdu->tag);
+ WARN_ON_ONCE(!req);
+
+ pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
+ __func__, cmd->cmd_op, ubq->q_id, req->tag, io->flags,
+ ublk_get_iod(ubq, req->tag)->addr);
+
if (ublk_need_get_data(ubq) &&
(req_op(req) == REQ_OP_WRITE ||
req_op(req) == REQ_OP_FLUSH)) {
@@ -728,18 +716,16 @@ static inline void __ublk_rq_task_work(struct request *req)

static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
{
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
- __ublk_rq_task_work(pdu->req);
+ __ublk_rq_task_work(cmd);
}

static void ublk_rq_task_work_fn(struct callback_head *work)
{
- struct ublk_rq_data *data = container_of(work,
- struct ublk_rq_data, work);
- struct request *req = blk_mq_rq_from_pdu(data);
+ struct ublk_uring_cmd_pdu *pdu = container_of(work,
+ struct ublk_uring_cmd_pdu, work);
+ struct io_uring_cmd *cmd = ublk_uring_cmd_from_pdu(pdu);

- __ublk_rq_task_work(req);
+ __ublk_rq_task_work(cmd);
}

static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -747,6 +733,8 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
{
struct ublk_queue *ubq = hctx->driver_data;
struct request *rq = bd->rq;
+ struct ublk_io *io = &ubq->ios[rq->tag];
+ struct ublk_uring_cmd_pdu *pdu;
blk_status_t res;

/* fill iod to slot in io cmd buffer */
@@ -754,43 +742,43 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
if (unlikely(res != BLK_STS_OK))
return BLK_STS_IOERR;

- blk_mq_start_request(bd->rq);
-
- if (unlikely(ubq_daemon_is_dying(ubq))) {
- fail:
- mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
+ /*
+ * We set force_abort because we want to abort this rq ASAP.
+ *
+ * NOTE: At this moment, rq is NOT inflight. So after the ubq is marked
+ * as force_abort, no new rq can be inflight and we are safe to check
+ * all rqs' status in monitor_work and choose whether:
+ *
+ * (1) if inflight, end(abort) this rq;
+ * (2) if not inflight, io_uring_cmd_done() the ioucmd.
+ */
+ if (ubq->force_abort) {
+ pr_devel("%s: force abort: qid %d tag %d io_flag %d\n",
+ __func__, ubq->q_id, rq->tag, io->flags);
return BLK_STS_IOERR;
}

+ blk_mq_start_request(bd->rq);
+ pr_devel("%s: start req: q_id %d tag %d io_flags %d\n",
+ __func__, ubq->q_id, rq->tag, io->flags);
+ pdu = ublk_get_uring_cmd_pdu(io->cmd);
+ pdu->q_id = ubq->q_id;
+ pdu->tag = rq->tag;
+
if (ublk_can_use_task_work(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
enum task_work_notify_mode notify_mode = bd->last ?
TWA_SIGNAL_NO_IPI : TWA_NONE;

- if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode))
- goto fail;
- } else {
- struct ublk_io *io = &ubq->ios[rq->tag];
- struct io_uring_cmd *cmd = io->cmd;
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ init_task_work(&pdu->work, ublk_rq_task_work_fn);

- /*
- * If the check pass, we know that this is a re-issued request aborted
- * previously in monitor_work because the ubq_daemon(cmd's task) is
- * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
- * because this ioucmd's io_uring context may be freed now if no inflight
- * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
- *
- * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
- * the tag). Then the request is re-started(allocating the tag) and we are here.
- * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
- * guarantees that here is a re-issued request aborted previously.
- */
- if ((io->flags & UBLK_IO_FLAG_ABORTED))
- goto fail;
+ /* If task_work_add() fails, we know that ubq_daemon(cmd's task) is PF_EXITING. */
+ if (task_work_add(ubq->ubq_daemon, &pdu->work, notify_mode))
+ pr_devel("%s: done old cmd: qid %d tag %d\n",
+ __func__, ubq->q_id, rq->tag);
+ io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);

- pdu->req = rq;
- io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+ } else {
+ io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
}

return BLK_STS_OK;
@@ -814,20 +802,10 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
return 0;
}

-static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
- unsigned int hctx_idx, unsigned int numa_node)
-{
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- init_task_work(&data->work, ublk_rq_task_work_fn);
- return 0;
-}
-
static const struct blk_mq_ops ublk_mq_ops = {
.queue_rq = ublk_queue_rq,
.commit_rqs = ublk_commit_rqs,
.init_hctx = ublk_init_hctx,
- .init_request = ublk_init_rq,
};

static int ublk_ch_open(struct inode *inode, struct file *filp)
@@ -906,31 +884,46 @@ static void ublk_commit_completion(struct ublk_device *ub,
ublk_complete_rq(req);
}

-/*
- * When ->ubq_daemon is exiting, either new request is ended immediately,
- * or any queued io command is drained, so it is safe to abort queue
- * lockless
+/* do cleanup work for a dying(PF_EXITING) ubq_daemon:
+ * (1) end(abort) all 'inflight' rqs here.
+ * (2) complete ioucmd of all not 'inflight' rqs here.
+ *
+ * Note: we have set ubq->force_abort before. So ublk_queue_rq() must fail
+ * a rq immediately before it calls blk_mq_start_request() which will
+ * set a rq's status as 'inflight'. Therefore, set of 'inflight'
+ * rqs must not change for a dying ubq.
+ *
+ * Note: If we fail one rq in ublk_queue_rq(), we cannot fail it here again.
+ * This is because checking 'force_abort' happens before blk_mq_start_request()
+ * so its status is always 'idle' and never changes to 'inflight'.
+ *
+ * Also aborting may not be started yet, keep in mind that one failed
+ * request may be issued by block layer again.
*/
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
int i;

+ WARN_ON_ONCE(!ubq->force_abort);
+
if (!ublk_get_device(ub))
return;

for (i = 0; i < ubq->q_depth; i++) {
- struct ublk_io *io = &ubq->ios[i];
-
- if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
- struct request *rq;
+ struct request *rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);

+ if (rq && blk_mq_request_started(rq)) {
/*
- * Either we fail the request or ublk_rq_task_work_fn
+ * Either we fail the request or ublk_queue_rq
* will do it
*/
- rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
- if (rq)
- __ublk_fail_req(io, rq);
+ pr_devel("%s: abort request: qid %d tag %d\n",
+ __func__, ubq->q_id, i);
+ blk_mq_end_request(rq, BLK_STS_IOERR);
+ } else {
+ pr_devel("%s: done old cmd: qid %d tag %d\n",
+ __func__, ubq->q_id, i);
+ io_uring_cmd_done(ubq->ios[i].cmd, UBLK_IO_RES_ABORT, 0);
}
}
ublk_put_device(ub);
@@ -945,7 +938,18 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
struct ublk_queue *ubq = ublk_get_queue(ub, i);

- if (ubq_daemon_is_dying(ubq)) {
+ /* check force_abort so we do not abort a queue two times! */
+ if (ubq->ubq_daemon && ubq_daemon_is_dying(ubq) && !ubq->force_abort) {
+ struct request_queue *q = ub->ub_disk->queue;
+
+ ubq->force_abort = true;
+
+ /* ensure that all ublk_queue_rq() calls see force_abort */
+ if (blk_queue_has_srcu(q))
+ synchronize_srcu(q->srcu);
+ else
+ synchronize_rcu();
+
schedule_work(&ub->stop_work);

/* abort queue is for making forward progress */
@@ -997,8 +1001,18 @@ static void ublk_cancel_dev(struct ublk_device *ub)
{
int i;

- for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+ /*
+ * for ubq with a dying daemon, we have io_uring_cmd_done() all cmd in
+ * ublk_abort_queue(). Do not io_uring_cmd_done() cmd two times!
+ */
+ if (ubq->ubq_daemon && ubq_daemon_is_dying(ubq))
+ continue;
+
ublk_cancel_queue(ublk_get_queue(ub, i));
+ }
}

static void ublk_stop_dev(struct ublk_device *ub)
@@ -1037,17 +1051,17 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
int tag, struct io_uring_cmd *cmd)
{
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
- struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ pdu->q_id = q_id;
+ pdu->tag = tag;

if (ublk_can_use_task_work(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);

+ init_task_work(&pdu->work, ublk_rq_task_work_fn);
/* should not fail since we call it just in ubq->ubq_daemon */
- task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI);
+ task_work_add(ubq->ubq_daemon, &pdu->work, TWA_SIGNAL_NO_IPI);
} else {
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
- pdu->req = req;
io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
}
}
@@ -1320,7 +1334,7 @@ static int ublk_add_tag_set(struct ublk_device *ub)
ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
ub->tag_set.numa_node = NUMA_NO_NODE;
- ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
+ ub->tag_set.cmd_size = 0;
ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ub->tag_set.driver_data = ub;
return blk_mq_alloc_tag_set(&ub->tag_set);
--
2.27.0