[PATCH v2 09/14] Keep nr_iowait per cgroup

From: Glauber Costa
Date: Tue Nov 01 2011 - 17:21:10 EST


Since we can now tell precisely which processes are waiting for I/O,
keep nr_iowait per cgroup. This is used by the idle tick to decide
whether the system should be considered idle or waiting for I/O.

When only the root cgroup is in use, this should behave much as it
did before.

Signed-off-by: Glauber Costa <glommer@xxxxxxxxxxxxx>
---
 include/linux/kernel_stat.h |    1 +
 kernel/sched.c              |   83 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index a0f1182..77e91f6 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -35,6 +35,7 @@ enum cpu_usage_stat {

struct kernel_cpustat {
u64 cpustat[NR_STATS];
+ atomic_t nr_iowait;
};

struct kernel_stat {
diff --git a/kernel/sched.c b/kernel/sched.c
index c7ac150..800728e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -639,8 +639,6 @@ struct rq {
u64 clock;
u64 clock_task;

- atomic_t nr_iowait;
-
#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain *sd;
@@ -817,6 +815,7 @@ static inline void task_group_account_field(struct task_struct *p,
* get ourselves ahead and touch it first. If the root cgroup
* is the only cgroup, then nothing else should be necessary.
*
+ * The same applies to the iowait-related functions.
*/
__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;

@@ -837,6 +836,50 @@ static inline void task_group_account_field(struct task_struct *p,
#endif
}

+static inline void task_group_nr_iowait_inc(struct task_struct *p, int cpu)
+{
+
+ atomic_inc(&per_cpu(kernel_cpustat, cpu).nr_iowait);
+
+#ifdef CONFIG_CGROUP_SCHED
+ if (static_branch(&sched_cgroup_enabled)) {
+ struct kernel_cpustat *kcpustat;
+ struct task_group *tg;
+
+ rcu_read_lock();
+ tg = task_group(p);
+ while (tg && (tg != &root_task_group)) {
+ kcpustat = per_cpu_ptr(tg->cpustat, cpu);
+ atomic_inc(&kcpustat->nr_iowait);
+ tg = tg->parent;
+ }
+ rcu_read_unlock();
+ }
+#endif
+}
+
+static inline void task_group_nr_iowait_dec(struct task_struct *p, int cpu)
+{
+
+ atomic_dec(&per_cpu(kernel_cpustat, cpu).nr_iowait);
+
+#ifdef CONFIG_CGROUP_SCHED
+ if (static_branch(&sched_cgroup_enabled)) {
+ struct kernel_cpustat *kcpustat;
+ struct task_group *tg;
+
+ rcu_read_lock();
+ tg = task_group(p);
+ while (tg && (tg != &root_task_group)) {
+ kcpustat = per_cpu_ptr(tg->cpustat, cpu);
+ atomic_dec(&kcpustat->nr_iowait);
+ tg = tg->parent;
+ }
+ rcu_read_unlock();
+ }
+#endif
+}
+
void task_group_new_fork(struct task_struct *p)
{
task_group_account_field(p, 1, TOTAL_FORKS);
@@ -3442,16 +3485,24 @@ unsigned long nr_iowait(void)
{
unsigned long i, sum = 0;

- for_each_possible_cpu(i)
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
+ for_each_possible_cpu(i) {
+ kstat_lock();
+ sum += atomic_read(&per_cpu(kernel_cpustat, i).nr_iowait);
+ kstat_unlock();
+ }

return sum;
}

unsigned long nr_iowait_cpu(int cpu)
{
- struct rq *this = cpu_rq(cpu);
- return atomic_read(&this->nr_iowait);
+ unsigned long ret;
+
+ kstat_lock();
+ ret = atomic_read(&per_cpu(kernel_cpustat, cpu).nr_iowait);
+ kstat_unlock();
+
+ return ret;
}

unsigned long this_cpu_load(void)
@@ -4043,12 +4094,11 @@ void account_idle_time(cputime_t cputime)
{
struct kernel_cpustat *kcpustat;
u64 cputime64 = cputime_to_cputime64(cputime);
- struct rq *rq = this_rq();

kstat_lock();
kcpustat = kcpustat_this_cpu;

- if (atomic_read(&rq->nr_iowait) > 0)
+ if (atomic_read(&kcpustat->nr_iowait) > 0)
kcpustat->cpustat[IOWAIT] += cputime64;
else
/* idle is always accounted to the root cgroup */
@@ -5915,14 +5965,15 @@ EXPORT_SYMBOL_GPL(yield_to);
void __sched io_schedule(void)
{
struct rq *rq = raw_rq();
+ int cpu = cpu_of(rq);

delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
+ task_group_nr_iowait_inc(current, cpu);
blk_flush_plug(current);
current->in_iowait = 1;
schedule();
current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
+ task_group_nr_iowait_dec(current, cpu);
delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);
@@ -5930,15 +5981,16 @@ EXPORT_SYMBOL(io_schedule);
long __sched io_schedule_timeout(long timeout)
{
struct rq *rq = raw_rq();
+ int cpu = cpu_of(rq);
long ret;

delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
+ task_group_nr_iowait_inc(current, cpu);
blk_flush_plug(current);
current->in_iowait = 1;
ret = schedule_timeout(timeout);
current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
+ task_group_nr_iowait_dec(current, cpu);
delayacct_blkio_end();
return ret;
}
@@ -8363,7 +8415,6 @@ void __init sched_init(void)
#endif
#endif
init_rq_hrtick(rq);
- atomic_set(&rq->nr_iowait, 0);
}

set_load_weight(&init_task);
@@ -8766,6 +8817,7 @@ struct task_group *sched_create_group(struct task_group *parent)
root_kstat = per_cpu_ptr(root_task_group.cpustat, i);
kcpustat->cpustat[IDLE_BASE] = root_kstat->cpustat[IDLE];
kcpustat->cpustat[STEAL_BASE] = root_kstat->cpustat[STEAL];
+ atomic_set(&kcpustat->nr_iowait, 0);
kstat_unlock();
}

@@ -9660,6 +9712,7 @@ int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
u64 total_forks = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec boottime;
+ unsigned long tg_iowait = 0;
#ifdef CONFIG_CGROUP_SCHED
struct task_group *tg;
struct task_group *sib;
@@ -9701,6 +9754,8 @@ int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
guest += kcpustat->cpustat[GUEST];
guest_nice += kcpustat->cpustat[GUEST_NICE];
total_forks += kcpustat->cpustat[TOTAL_FORKS];
+ tg_iowait += atomic_read(&kcpustat->nr_iowait);
+
#ifdef CONFIG_CGROUP_SCHED
if (static_branch(&sched_cgroup_enabled)) {
list_for_each_entry(sib, &tg->siblings, siblings) {
@@ -9807,7 +9862,7 @@ int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
(unsigned long)jif,
total_forks,
nr_running(),
- nr_iowait());
+ tg_iowait);

seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);

--
1.7.6.4
