[PATCH] blk-ioprio: Introduce promote-to-rt policy

From: Hou Tao
Date: Tue Jan 31 2023 - 23:24:13 EST


From: Hou Tao <houtao1@xxxxxxxxxx>

Since commit a78418e6a04c ("block: Always initialize bio IO priority on
submit"), bio->bi_ioprio will never be IOPRIO_CLASS_NONE when calling
blkcg_set_ioprio(), so there is no way to promote the io-priority
of one cgroup to IOPRIO_CLASS_RT, because bi_ioprio will always be
greater than or equal to IOPRIO_CLASS_RT.

It seems possible to call blkcg_set_ioprio() first and then initialize
bi_ioprio later in bio_set_ioprio(), but this doesn't work for a bio
whose bi_ioprio is already initialized (e.g., direct-io), so introduce
a new ioprio policy to promote the I/O priority of a bio to
IOPRIO_CLASS_RT if the ioprio is not already RT.

To distinguish between the demotion policy and the promotion policy,
use a bit in the upper 16 bits of the policy value and handle that bit
accordingly in blkcg_set_ioprio().

Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
---
Documentation/admin-guide/cgroup-v2.rst | 38 ++++++----
block/blk-ioprio.c | 94 +++++++++++++++++--------
2 files changed, 92 insertions(+), 40 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index c8ae7c897f14..e0b9f73ef62a 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2038,17 +2038,27 @@ that attribute:
Change the I/O priority class of all requests into IDLE, the lowest
I/O priority class.

+ promote-to-rt
+ For requests that have I/O priority class BE or that have I/O priority
+ class IDLE, change it into RT. Do not modify the I/O priority class
+ of requests that have priority class RT.
+
The following numerical values are associated with the I/O priority policies:

-+-------------+---+
-| no-change | 0 |
-+-------------+---+
-| none-to-rt | 1 |
-+-------------+---+
-| rt-to-be | 2 |
-+-------------+---+
-| all-to-idle | 3 |
-+-------------+---+
+
++---------------+---------+-----+
+| policy        | inst    | num |
++---------------+---------+-----+
+| no-change     | demote  | 0   |
++---------------+---------+-----+
+| none-to-rt    | demote  | 1   |
++---------------+---------+-----+
+| rt-to-be      | demote  | 2   |
++---------------+---------+-----+
+| idle          | demote  | 3   |
++---------------+---------+-----+
+| promote-to-rt | promote | 1   |
++---------------+---------+-----+

The numerical value that corresponds to each I/O priority class is as follows:

@@ -2064,9 +2074,13 @@ The numerical value that corresponds to each I/O priority class is as follows:

The algorithm to set the I/O priority class for a request is as follows:

-- Translate the I/O priority class policy into a number.
-- Change the request I/O priority class into the maximum of the I/O priority
- class policy number and the numerical I/O priority class.
+- Translate the I/O priority class policy into an instruction and a number.
+- If the instruction is demotion, change the request I/O priority class
+  into the maximum of the I/O priority class policy number and the numerical
+  I/O priority class.
+- If the instruction is promotion, change the request I/O priority class
+  into the minimum of the I/O priority class policy number and the numerical
+  I/O priority class.

PID
---
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 8bb6b8eba4ce..0d400bee9c72 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -20,6 +20,13 @@
#include "blk-ioprio.h"
#include "blk-rq-qos.h"

+/*
+ * Upper 16-bits are reserved for special flags.
+ *
+ * @IOPRIO_POL_PROMOTION: Promote bi_ioprio instead of demoting it.
+ */
+#define IOPRIO_POL_PROMOTION (1U << 17)
+
/**
* enum prio_policy - I/O priority class policy.
* @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
@@ -27,21 +34,30 @@
* @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
* IOPRIO_CLASS_BE.
* @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
- *
+ * @POLICY_PROMOTE_TO_RT: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_BE into
+ * IOPRIO_CLASS_RT.
* See also <linux/ioprio.h>.
*/
enum prio_policy {
- POLICY_NO_CHANGE = 0,
- POLICY_NONE_TO_RT = 1,
- POLICY_RESTRICT_TO_BE = 2,
- POLICY_ALL_TO_IDLE = 3,
+ POLICY_NO_CHANGE = IOPRIO_CLASS_NONE,
+ POLICY_NONE_TO_RT = IOPRIO_CLASS_RT,
+ POLICY_RESTRICT_TO_BE = IOPRIO_CLASS_BE,
+ POLICY_ALL_TO_IDLE = IOPRIO_CLASS_IDLE,
+ POLICY_PROMOTE_TO_RT = IOPRIO_CLASS_RT | IOPRIO_POL_PROMOTION,
+};
+
+struct ioprio_policy_tuple {
+ const char *name;
+ enum prio_policy policy;
};

-static const char *policy_name[] = {
- [POLICY_NO_CHANGE] = "no-change",
- [POLICY_NONE_TO_RT] = "none-to-rt",
- [POLICY_RESTRICT_TO_BE] = "restrict-to-be",
- [POLICY_ALL_TO_IDLE] = "idle",
+/* ioprio_alloc_cpd() needs POLICY_NO_CHANGE to be the first policy */
+static const struct ioprio_policy_tuple ioprio_policies[] = {
+ { "no-change", POLICY_NO_CHANGE },
+ { "none-to-rt", POLICY_NONE_TO_RT },
+ { "restrict-to-be", POLICY_RESTRICT_TO_BE },
+ { "idle", POLICY_ALL_TO_IDLE },
+ { "promote-to-rt", POLICY_PROMOTE_TO_RT }
};

static struct blkcg_policy ioprio_policy;
@@ -57,11 +73,11 @@ struct ioprio_blkg {
/**
* struct ioprio_blkcg - Per cgroup data.
* @cpd: blkcg_policy_data structure.
- * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>.
+ * @ioprio: Policy name and definition.
*/
struct ioprio_blkcg {
struct blkcg_policy_data cpd;
- enum prio_policy prio_policy;
+ const struct ioprio_policy_tuple *ioprio;
};

static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
@@ -95,23 +111,35 @@ static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));

- seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]);
+ seq_printf(sf, "%s\n", blkcg->ioprio->name);
return 0;
}

+static const struct ioprio_policy_tuple *ioprio_match_policy(const char *buf)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ioprio_policies); i++) {
+ if (sysfs_streq(ioprio_policies[i].name, buf))
+ return &ioprio_policies[i];
+ }
+
+ return NULL;
+}
+
static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of));
- int ret;
+ const struct ioprio_policy_tuple *ioprio;

if (off != 0)
return -EIO;
/* kernfs_fop_write_iter() terminates 'buf' with '\0'. */
- ret = sysfs_match_string(policy_name, buf);
- if (ret < 0)
- return ret;
- blkcg->prio_policy = ret;
+ ioprio = ioprio_match_policy(buf);
+ if (!ioprio)
+ return -EINVAL;
+ blkcg->ioprio = ioprio;
return nbytes;
}

@@ -141,7 +169,7 @@ static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
blkcg = kzalloc(sizeof(*blkcg), gfp);
if (!blkcg)
return NULL;
- blkcg->prio_policy = POLICY_NO_CHANGE;
+ blkcg->ioprio = &ioprio_policies[0];
return &blkcg->cpd;
}

@@ -186,20 +214,30 @@ void blkcg_set_ioprio(struct bio *bio)
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
u16 prio;

- if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
+ if (!blkcg || blkcg->ioprio->policy == POLICY_NO_CHANGE)
return;

+ WARN_ON_ONCE(bio->bi_ioprio == IOPRIO_CLASS_NONE);
+
/*
* Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
- * correspond to a lower priority. Hence, the max_t() below selects
- * the lower priority of bi_ioprio and the cgroup I/O priority class.
- * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O
- * priority is assigned to the bio.
+ * correspond to a lower priority.
+ *
+ * When IOPRIO_POL_PROMOTION is enabled, the min_t() below selects
+ * the higher priority of bi_ioprio and the cgroup I/O priority class,
+ * otherwise the lower priority is selected.
*/
- prio = max_t(u16, bio->bi_ioprio,
- IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
- if (prio > bio->bi_ioprio)
- bio->bi_ioprio = prio;
+ if (blkcg->ioprio->policy & IOPRIO_POL_PROMOTION) {
+ prio = min_t(u16, bio->bi_ioprio,
+ IOPRIO_PRIO_VALUE(blkcg->ioprio->policy, 0));
+ if (prio < bio->bi_ioprio)
+ bio->bi_ioprio = prio;
+ } else {
+ prio = max_t(u16, bio->bi_ioprio,
+ IOPRIO_PRIO_VALUE(blkcg->ioprio->policy, 0));
+ if (prio > bio->bi_ioprio)
+ bio->bi_ioprio = prio;
+ }
}

void blk_ioprio_exit(struct gendisk *disk)
--
2.29.2