Re: [RFC PATCH] psi: introduce PSI UNINTERRUPTIBLE

From: Chengming Zhou
Date: Mon Aug 08 2022 - 05:12:56 EST


On 2022/8/8 14:13, zhaoyang.huang wrote:
> From: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>
>
> Uninterruptible sleep has not been monitored as an important system status yet.
> Imagin that a set of psi triggers are created for monitoring a special group, while
> get nothing high for none of the pressures, which could be the processes within
> are stock in some given resources and turn to be UN status. Introduce PSI_UN as
> a sub-type among PSI system here.

Hello,

The problem is that not all TASK_UNINTERRUPTIBLE task means stalled on some
shared resource, like many schedule_timeout() paths.

Thanks.

>
> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>
> ---
> include/linux/psi_types.h | 11 ++++++++---
> kernel/sched/psi.c | 10 ++++++++++
> kernel/sched/stats.h | 6 +++++-
> 3 files changed, 23 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> index c7fe7c0..8cc1979 100644
> --- a/include/linux/psi_types.h
> +++ b/include/linux/psi_types.h
> @@ -15,6 +15,7 @@ enum psi_task_count {
> NR_IOWAIT,
> NR_MEMSTALL,
> NR_RUNNING,
> + NR_UNINTERRUPTIBLE,
> /*
> * This can't have values other than 0 or 1 and could be
> * implemented as a bit flag. But for now we still have room
> @@ -32,7 +33,7 @@ enum psi_task_count {
> * threads and memstall ones.
> */
> NR_MEMSTALL_RUNNING,
> - NR_PSI_TASK_COUNTS = 5,
> + NR_PSI_TASK_COUNTS = 6,
> };
>
> /* Task state bitmasks */
> @@ -41,13 +42,15 @@ enum psi_task_count {
> #define TSK_RUNNING (1 << NR_RUNNING)
> #define TSK_ONCPU (1 << NR_ONCPU)
> #define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
> +#define TSK_UNINTERRUPTIBLE (1 << NR_UNINTERRUPTIBLE)
>
> /* Resources that workloads could be stalled on */
> enum psi_res {
> PSI_IO,
> PSI_MEM,
> PSI_CPU,
> - NR_PSI_RESOURCES = 3,
> + PSI_UN,
> + NR_PSI_RESOURCES = 4,
> };
>
> /*
> @@ -63,9 +66,11 @@ enum psi_states {
> PSI_MEM_FULL,
> PSI_CPU_SOME,
> PSI_CPU_FULL,
> + PSI_UN_SOME,
> + PSI_UN_FULL,
> /* Only per-CPU, to weigh the CPU in the global average: */
> PSI_NONIDLE,
> - NR_PSI_STATES = 7,
> + NR_PSI_STATES = 9,
> };
>
> enum psi_aggregators {
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index a337f3e..a37b4a4 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -231,6 +231,10 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
> return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
> case PSI_CPU_FULL:
> return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
> + case PSI_UN_SOME:
> + return unlikely(tasks[NR_UNINTERRUPTIBLE]);
> + case PSI_UN_FULL:
> + return unlikely(tasks[NR_UNINTERRUPTIBLE] && !tasks[NR_RUNNING]);
> case PSI_NONIDLE:
> return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
> tasks[NR_RUNNING];
> @@ -683,6 +687,12 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
> groupc->times[PSI_CPU_FULL] += delta;
> }
>
> + if (groupc->state_mask & (1 << PSI_UN_SOME)) {
> + groupc->times[PSI_UN_SOME] += delta;
> + if (groupc->state_mask & (1 << PSI_UN_FULL))
> + groupc->times[PSI_UN_FULL] += delta;
> + }
> +
> if (groupc->state_mask & (1 << PSI_NONIDLE))
> groupc->times[PSI_NONIDLE] += delta;
> }
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index baa839c..bf98829 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -132,6 +132,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
> if (p->in_iowait)
> clear |= TSK_IOWAIT;
> }
> + clear |= TSK_UNINTERRUPTIBLE;
>
> psi_task_change(p, clear, set);
> }
> @@ -139,6 +140,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
> static inline void psi_dequeue(struct task_struct *p, bool sleep)
> {
> int clear = TSK_RUNNING;
> + int set = 0;
>
> if (static_branch_likely(&psi_disabled))
> return;
> @@ -154,8 +156,10 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
>
> if (p->in_memstall)
> clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
> + if (READ_ONCE(p->__state) & TASK_UNINTERRUPTIBLE)
> + set = TSK_UNINTERRUPTIBLE;
>
> - psi_task_change(p, clear, 0);
> + psi_task_change(p, clear, set);
> }
>
> static inline void psi_ttwu_dequeue(struct task_struct *p)