Re: [BUG nohz]: wrong user and system time accounting

From: Wanpeng Li
Date: Mon May 15 2017 - 04:21:14 EST


Ping,
2017-05-02 18:01 GMT+08:00 Wanpeng Li <kernellwp@xxxxxxxxx>:
> Cc Paolo,
> 2017-04-13 21:32 GMT+08:00 Frederic Weisbecker <fweisbec@xxxxxxxxx>:
>> On Thu, Apr 13, 2017 at 12:31:12PM +0800, Wanpeng Li wrote:
>>> 2017-04-12 22:57 GMT+08:00 Thomas Gleixner <tglx@xxxxxxxxxxxxx>:
>>> > On Wed, 12 Apr 2017, Frederic Weisbecker wrote:
>>> >> On Tue, Apr 11, 2017 at 04:22:48PM +0200, Thomas Gleixner wrote:
>>> >> > It's not different from the current jiffies based stuff at all. Same
>>> >> > failure mode.
>>> >>
>>> >> Yes you're right, I got confused again. So to fix this we could do our snapshots
>>> >> at a frequency lower than HZ but still high enough to avoid overhead.
>>> >>
>>> >> Something like TICK_NSEC / 2 ?
>>> >
>>> > If you are using TSC anyway then you can do proper accumulation for both
>>> > system and user and only account the data when the accumulation is more
>>> > than a jiffie.
>>>
>>> So I implement it as below:
>>>
>>> - HZ=1000.
>>> 1) two cpu hogs on cpu in nohz_full mode, 100% user time
>>> 2) Luzi's testcase, ~95% user time, ~5% idle time (as we expected)
>>> - HZ=250
>>> 1) two cpu hogs on cpu in nohz_full mode, 100% user time
>>> 2) Luzi's testcase, 100% idle
>>>
>>> So the codes below still not work correctly for HZ=250, any suggestions?
>>
>> Right, so first lets reorder that code a bit so we can see clear inside :-)
>>
>>>
>>> -------------------------------------->8-----------------------------------------------------
>>>
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index d67eee8..6a11771 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -668,6 +668,8 @@ struct task_struct {
>>> #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
>>> seqcount_t vtime_seqcount;
>>> unsigned long long vtime_snap;
>>> + u64 vtime_acct_utime;
>>> + u64 vtime_acct_stime;
>>
>> You need to accumulate guest and steal time as well.
>>
>
> Hi Frederic,
>
> Sorry for the delay since I'm too busy recently, I just add guest time
> and idle time accumulations as below, the code work as we expected for
> native kernel, however, the testcase fails when it runs in kvm guest.
> Top shows ~99% sys for Luzi's testcase "./acct-bug 1 995" which we
> expect 95% user and %5 idle. In addition, what's the design idea of
> steal time accumluation in your mind? Pass the tsk parameter in the
> function get_vtime_delta() down to the function
> steal_account_process_time()?
>
> -------------------------------------->8-----------------------------------------------------
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 4cf9a59..56815cd 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -672,6 +672,10 @@ struct task_struct {
> #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> seqcount_t vtime_seqcount;
> unsigned long long vtime_snap;
> + u64 vtime_acct_utime;
> + u64 vtime_acct_stime;
> + u64 vtime_acct_idle_time;
> + u64 vtime_acct_guest_time;
> enum {
> /* Task is sleeping or running in a CPU with VTIME inactive: */
> VTIME_INACTIVE = 0,
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index f3778e2b..2d950c6 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -676,18 +676,19 @@ void thread_group_cputime_adjusted(struct
> task_struct *p, u64 *ut, u64 *st)
> #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
> static u64 vtime_delta(struct task_struct *tsk)
> {
> - unsigned long now = READ_ONCE(jiffies);
> + unsigned long long clock;
>
> - if (time_before(now, (unsigned long)tsk->vtime_snap))
> + clock = sched_clock();
> + if (clock < tsk->vtime_snap)
> return 0;
>
> - return jiffies_to_nsecs(now - tsk->vtime_snap);
> + return clock - tsk->vtime_snap;
> }
>
> static u64 get_vtime_delta(struct task_struct *tsk)
> {
> - unsigned long now = READ_ONCE(jiffies);
> - u64 delta, other;
> + u64 delta = vtime_delta(tsk);
> + u64 other;
>
> /*
> * Unlike tick based timing, vtime based timing never has lost
> @@ -696,17 +697,16 @@ static u64 get_vtime_delta(struct task_struct *tsk)
> * elapsed time. Limit account_other_time to prevent rounding
> * errors from causing elapsed vtime to go negative.
> */
> - delta = jiffies_to_nsecs(now - tsk->vtime_snap);
> other = account_other_time(delta);
> WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
> - tsk->vtime_snap = now;
> + tsk->vtime_snap += delta;
>
> return delta - other;
> }
>
> static void __vtime_account_system(struct task_struct *tsk)
> {
> - account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
> + account_system_time(tsk, irq_count(), tsk->vtime_acct_stime);
> }
>
> void vtime_account_system(struct task_struct *tsk)
> @@ -715,7 +715,11 @@ void vtime_account_system(struct task_struct *tsk)
> return;
>
> write_seqcount_begin(&tsk->vtime_seqcount);
> - __vtime_account_system(tsk);
> + tsk->vtime_acct_stime += get_vtime_delta(tsk);
> + if (tsk->vtime_acct_stime >= TICK_NSEC) {
> + __vtime_account_system(tsk);
> + tsk->vtime_acct_stime = 0;
> + }
> write_seqcount_end(&tsk->vtime_seqcount);
> }
>
> @@ -723,16 +727,22 @@ void vtime_account_user(struct task_struct *tsk)
> {
> write_seqcount_begin(&tsk->vtime_seqcount);
> tsk->vtime_snap_whence = VTIME_SYS;
> - if (vtime_delta(tsk))
> - account_user_time(tsk, get_vtime_delta(tsk));
> + tsk->vtime_acct_utime += get_vtime_delta(tsk);
> + if (tsk->vtime_acct_utime >= TICK_NSEC) {
> + account_user_time(tsk, tsk->vtime_acct_utime);
> + tsk->vtime_acct_utime = 0;
> + }
> write_seqcount_end(&tsk->vtime_seqcount);
> }
>
> void vtime_user_enter(struct task_struct *tsk)
> {
> write_seqcount_begin(&tsk->vtime_seqcount);
> - if (vtime_delta(tsk))
> + tsk->vtime_acct_stime += get_vtime_delta(tsk);
> + if (tsk->vtime_acct_stime >= TICK_NSEC) {
> __vtime_account_system(tsk);
> + tsk->vtime_acct_stime = 0;
> + }
> tsk->vtime_snap_whence = VTIME_USER;
> write_seqcount_end(&tsk->vtime_seqcount);
> }
> @@ -747,8 +757,11 @@ void vtime_guest_enter(struct task_struct *tsk)
> * that can thus safely catch up with a tickless delta.
> */
> write_seqcount_begin(&tsk->vtime_seqcount);
> - if (vtime_delta(tsk))
> + tsk->vtime_acct_stime += get_vtime_delta(tsk);
> + if (tsk->vtime_acct_stime >= TICK_NSEC) {
> __vtime_account_system(tsk);
> + tsk->vtime_acct_stime = 0;
> + }
> current->flags |= PF_VCPU;
> write_seqcount_end(&tsk->vtime_seqcount);
> }
> @@ -757,7 +770,11 @@ EXPORT_SYMBOL_GPL(vtime_guest_enter);
> void vtime_guest_exit(struct task_struct *tsk)
> {
> write_seqcount_begin(&tsk->vtime_seqcount);
> - __vtime_account_system(tsk);
> + tsk->vtime_acct_stime += get_vtime_delta(tsk);
> + if (tsk->vtime_acct_stime >= TICK_NSEC) {
> + __vtime_account_system(tsk);
> + tsk->vtime_acct_stime = 0;
> + }
> current->flags &= ~PF_VCPU;
> write_seqcount_end(&tsk->vtime_seqcount);
> }
> @@ -765,7 +782,11 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit);
>
> void vtime_account_idle(struct task_struct *tsk)
> {
> - account_idle_time(get_vtime_delta(tsk));
> + tsk->vtime_acct_idle_time += get_vtime_delta(tsk);
> + if (tsk->vtime_acct_idle_time >= TICK_NSEC) {
> + account_idle_time(tsk->vtime_acct_idle_time);
> + tsk->vtime_acct_idle_time = 0;
> + }
> }
>
> void arch_vtime_task_switch(struct task_struct *prev)
> @@ -776,7 +797,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
>
> write_seqcount_begin(&current->vtime_seqcount);
> current->vtime_snap_whence = VTIME_SYS;
> - current->vtime_snap = jiffies;
> + current->vtime_snap = sched_clock_cpu(smp_processor_id());
> write_seqcount_end(&current->vtime_seqcount);
> }
>
> @@ -787,7 +808,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
> local_irq_save(flags);
> write_seqcount_begin(&t->vtime_seqcount);
> t->vtime_snap_whence = VTIME_SYS;
> - t->vtime_snap = jiffies;
> + t->vtime_snap = sched_clock_cpu(cpu);
> write_seqcount_end(&t->vtime_seqcount);
> local_irq_restore(flags);
> }
>
> Regards,
> Wanpeng Li