Re: 2.6.39-rc4+: Kernel leaking memory during FS scanning, regression?

From: Bruno Prémont
Date: Thu Apr 28 2011 - 16:23:24 EST


On Thu, 28 April 2011 Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote:
> On Thu, 28 Apr 2011, Sedat Dilek wrote:
> > On Thu, Apr 28, 2011 at 3:30 PM, Mike Galbraith <efault@xxxxxx> wrote:
> > rt_rq[0]:
> > .rt_nr_running : 0
> > .rt_throttled : 0
>
> > .rt_time : 888.893877
>
> > .rt_time : 950.005460
>
> So rt_time is constantly accumulated, but never decreased. The
> decrease happens in the timer callback. Looks like the timer is not
> running for whatever reason.
>
> Can you add the following patch as well ?
>
> Thanks,
>
> tglx
>
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -172,7 +172,7 @@ static enum hrtimer_restart sched_rt_per
> idle = do_sched_rt_period_timer(rt_b, overrun);
> }
>
> - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
> + return HRTIMER_RESTART;

This doesn't help here, whether applied on top of the other patches
(full diff attached) or applied alone (together with the throttling
printk).

Could it be that NO_HZ=y plays a role here?
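
In case it helps narrow things down, one quick check would be whether the
period timer is still armed at all, by looking for sched_rt_period_timer
in /proc/timer_list. A throwaway checker (just a sketch, not from this
thread; `grep sched_rt_period_timer /proc/timer_list` does the same):

/*
 * Sketch: scan /proc/timer_list for the sched_rt_period_timer callback
 * to see whether the RT period hrtimer is queued on any CPU.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/timer_list", "r");
	char line[512];
	int hits = 0;

	if (!f) {
		perror("fopen /proc/timer_list");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "sched_rt_period_timer")) {
			fputs(line, stdout);
			hits++;
		}
	fclose(f);
	if (!hits)
		puts("sched_rt_period_timer: not armed on any CPU");
	return 0;
}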


Extended throttling printk (Linus asked what the exact values looked
like):
[ 401.000119] sched: RT throttling activated 950012539 > 950000000
(950000000 ns is the default per-period RT budget, i.e.
sched_rt_runtime_us = 950000 out of a 1000000 us period, so rt_time
overran the budget by about 12.5 us.)


Equivalent to what Sedat sees (/proc/sched_debug):
rt_rq[0]:
.rt_nr_running : 2
.rt_throttled : 1
.rt_time : 950.012539
.rt_runtime : 950.000000
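
Which fits the mechanism Thomas described: rt_time is charged while RT
tasks run and is only paid back from the period timer callback, so if
that timer stops firing, rt_time can only grow until it crosses
rt_runtime and the runqueue stays throttled. A toy user-space model of
that accounting (not the kernel code; names and structure are simplified
for illustration only):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC	1000000ULL
#define RT_PERIOD_NS	(1000 * NSEC_PER_MSEC)	/* sched_rt_period_us  = 1000000 */
#define RT_RUNTIME_NS	(950 * NSEC_PER_MSEC)	/* sched_rt_runtime_us =  950000 */

struct rt_rq_model {
	uint64_t rt_time;	/* ns of RT execution charged in this period */
	int rt_throttled;
};

/* Charged whenever an RT task has run for 'delta' ns. */
static void account_rt_runtime(struct rt_rq_model *rq, uint64_t delta)
{
	rq->rt_time += delta;
	if (!rq->rt_throttled && rq->rt_time > RT_RUNTIME_NS) {
		rq->rt_throttled = 1;
		printf("RT throttling activated %llu > %llu\n",
		       (unsigned long long)rq->rt_time,
		       (unsigned long long)RT_RUNTIME_NS);
	}
}

/* The pay-back that only ever happens from the period timer callback. */
static void period_timer_fires(struct rt_rq_model *rq)
{
	uint64_t decay = rq->rt_time < RT_RUNTIME_NS ? rq->rt_time : RT_RUNTIME_NS;

	rq->rt_time -= decay;
	if (rq->rt_throttled && rq->rt_time < RT_RUNTIME_NS) {
		rq->rt_throttled = 0;
		printf("unthrottled, rt_time back to %llu\n",
		       (unsigned long long)rq->rt_time);
	}
}

int main(void)
{
	struct rt_rq_model rq = { 0, 0 };
	int period;

	/* An RT task consumes ~100 ms per 1000 ms period. */
	for (period = 0; period < 10; period++) {
		account_rt_runtime(&rq, 100 * NSEC_PER_MSEC);
		/* Comment this call out to model the timer never firing:
		 * rt_time then only grows and the throttle never clears. */
		period_timer_fires(&rq);
	}
	printf("after %d periods: rt_time=%llu throttled=%d\n",
	       period, (unsigned long long)rq.rt_time, rq.rt_throttled);
	return 0;
}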


/proc/$(pidof rcu_kthread)/sched captured at regular intervals (note
that sum_exec_runtime and nr_switches stop increasing after the 21:34:11
sample, i.e. rcu_kthread no longer gets scheduled once throttling kicks
in):
Thu Apr 28 21:33:41 CEST 2011
rcu_kthread (6, #threads: 1)
---------------------------------------------------------
se.exec_start : 0.000000
se.vruntime : 0.000703
se.sum_exec_runtime : 903.067982
nr_switches : 23752
nr_voluntary_switches : 23751
nr_involuntary_switches : 1
se.load.weight : 1024
policy : 1
prio : 98
clock-delta : 912
Thu Apr 28 21:34:11 CEST 2011
rcu_kthread (6, #threads: 1)
---------------------------------------------------------
se.exec_start : 0.000000
se.vruntime : 0.000703
se.sum_exec_runtime : 974.899495
nr_switches : 25721
nr_voluntary_switches : 25720
nr_involuntary_switches : 1
se.load.weight : 1024
policy : 1
prio : 98
clock-delta : 1098
Thu Apr 28 21:34:41 CEST 2011
rcu_kthread (6, #threads: 1)
---------------------------------------------------------
se.exec_start : 0.000000
se.vruntime : 0.000703
se.sum_exec_runtime : 974.899495
nr_switches : 25721
nr_voluntary_switches : 25720
nr_involuntary_switches : 1
se.load.weight : 1024
policy : 1
prio : 98
clock-delta : 1126
Thu Apr 28 21:35:11 CEST 2011
rcu_kthread (6, #threads: 1)



> }
>
> static
diff --git a/kernel/sched.c b/kernel/sched.c
index 312f8b9..aad1b88 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -172,7 +172,7 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
idle = do_sched_rt_period_timer(rt_b, overrun);
}

- return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+ return /* idle ? HRTIMER_NORESTART : */ HRTIMER_RESTART;
}

static
@@ -460,7 +460,7 @@ struct rq {
u64 nohz_stamp;
unsigned char nohz_balance_kick;
#endif
- unsigned int skip_clock_update;
+ int skip_clock_update;

/* capture load from *all* tasks on this cpu: */
struct load_weight load;
@@ -642,8 +642,8 @@ static void update_rq_clock(struct rq *rq)
{
s64 delta;

- if (rq->skip_clock_update)
- return;
+/* if (rq->skip_clock_update > 0)
+ return; */

delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
rq->clock += delta;
@@ -4035,7 +4035,7 @@ static inline void schedule_debug(struct task_struct *prev)

static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- if (prev->se.on_rq)
+ if (prev->se.on_rq || rq->skip_clock_update < 0)
update_rq_clock(rq);
prev->sched_class->put_prev_task(rq, prev);
}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e7cebdc..2feae93 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -572,8 +572,15 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
enqueue = 1;
}

- if (enqueue)
+ if (enqueue) {
+ /*
+ * Tag a forced clock update if we're coming out of idle
+ * so rq->clock_task will be updated when we schedule().
+ */
+ if (rq->curr == rq->idle)
+ rq->skip_clock_update = -1;
sched_rt_rq_enqueue(rt_rq);
+ }
raw_spin_unlock(&rq->lock);
}

@@ -608,6 +615,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;

if (rt_rq->rt_time > runtime) {
+ printk_once(KERN_WARNING "sched: RT throttling activated %llu > %llu\n", rt_rq->rt_time, runtime);
rt_rq->rt_throttled = 1;
if (rt_rq_throttled(rt_rq)) {
sched_rt_rq_dequeue(rt_rq);