[RFC PATCH] mm: count swap_writepage into PSI_IO STALL

From: Huangzhaoyang
Date: Fri Dec 03 2021 - 04:20:03 EST


From: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>

We would like to count swap_writepage into PSI_IO STALL time. There are
three reasons for doing so:
1. swap_writepage introduces non-productive time, especially with a
RAM-based swap device.
2. A high swappiness value leads to more anon pages being swapped out.
3. IO pressure is otherwise inconsistent with PGSWPOUT.

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>
---
include/linux/psi.h | 6 ++++++
kernel/sched/psi.c | 15 +++++++++++++++
mm/vmscan.c | 10 ++++++++++
3 files changed, 31 insertions(+)

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 65eb147..6eb3a6f 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -23,6 +23,9 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);

+void psi_iostall_enter(void);
+void psi_iostall_leave(void);
+
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);

#ifdef CONFIG_CGROUPS
@@ -45,6 +48,9 @@ static inline void psi_init(void) {}
static inline void psi_memstall_enter(unsigned long *flags) {}
static inline void psi_memstall_leave(unsigned long *flags) {}

+static inline void psi_iostall_enter(void) {} /* empty stub: does nothing */
+static inline void psi_iostall_leave(void) {} /* empty stub: does nothing */
+
#ifdef CONFIG_CGROUPS
static inline int psi_cgroup_alloc(struct cgroup *cgrp)
{
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 923a0d6..643b48c 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -958,6 +958,21 @@ void psi_memstall_leave(unsigned long *flags)
rq_unlock_irq(rq, &rf);
}

+void psi_iostall_enter(void) /* tag current with TSK_IOWAIT via psi_task_change() */
+{
+ if (static_branch_likely(&psi_disabled)) /* psi_disabled key set: skip accounting */
+ return;
+ /* NOTE(review): no rq lock is taken here, unlike psi_memstall_leave() - confirm it is safe */
+ psi_task_change(current, 0, TSK_IOWAIT); /* presumably (task, clear, set) - verify order */
+}
+
+void psi_iostall_leave(void) /* counterpart of psi_iostall_enter(): swaps its arguments */
+{
+ if (static_branch_likely(&psi_disabled)) /* psi_disabled key set: skip accounting */
+ return;
+ /* NOTE(review): no rq lock is taken here, unlike psi_memstall_leave() - confirm it is safe */
+ psi_task_change(current, TSK_IOWAIT, 0); /* presumably clears TSK_IOWAIT - verify order */
+}
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 74296c2..798907b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1072,7 +1072,17 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
};

SetPageReclaim(page);
+
+ /*
+ * With a RAM-based swap device the reclaim context never sleeps on
+ * congested IO, yet the write still introduces non-productive
+ * time, so count this period into PSI_IO.
+ * File pages are counted as well; they are less likely to reach
+ * this point.
+ */
+ psi_iostall_enter();
res = mapping->a_ops->writepage(page, &wbc);
+ psi_iostall_leave();
if (res < 0)
handle_write_error(mapping, page, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
--
1.9.1