Re: [PATCH v8 clocksource 2/5] clocksource: Retry clock read if long delays detected

From: Paul E. McKenney
Date: Sat Apr 17 2021 - 19:01:17 EST


On Sat, Apr 17, 2021 at 02:24:23PM +0200, Thomas Gleixner wrote:
> On Tue, Apr 13 2021 at 21:35, Paul E. McKenney wrote:
> > #define WATCHDOG_INTERVAL (HZ >> 1)
> > #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
> > +#define WATCHDOG_MAX_SKEW (NSEC_PER_SEC >> 6)
>
> That's ~15ms which is a tad large I'd say...

And I have a separate patch queued to crank this down to 125us.

I ran 2,664 hours of rcutorture on this patch, and saw only 18 instances
of delay detected. So 125us works well, but I am not inclined to decrease
it much.

I will try 50us this evening though, just for grins. After all, only
those who have gone too far can possibly tell you how far you can go.

> > static void clocksource_watchdog_work(struct work_struct *work)
> > {
> > @@ -213,9 +214,10 @@ static void clocksource_watchdog_inject_delay(void)
> > static void clocksource_watchdog(struct timer_list *unused)
> > {
> > struct clocksource *cs;
> > - u64 csnow, wdnow, cslast, wdlast, delta;
> > - int64_t wd_nsec, cs_nsec;
> > + u64 csnow, wdnow, wdagain, cslast, wdlast, delta;
> > + int64_t wd_nsec, wdagain_delta, wderr_nsec = 0, cs_nsec;
> > int next_cpu, reset_pending;
> > + int nretries;
> >
> > spin_lock(&watchdog_lock);
> > if (!watchdog_running)
> > @@ -224,6 +226,7 @@ static void clocksource_watchdog(struct timer_list *unused)
> > reset_pending = atomic_read(&watchdog_reset_pending);
> >
> > list_for_each_entry(cs, &watchdog_list, wd_list) {
> > + nretries = 0;
> >
> > /* Clocksource already marked unstable? */
> > if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
> > @@ -232,11 +235,24 @@ static void clocksource_watchdog(struct timer_list *unused)
> > continue;
> > }
> >
> > +retry:
> > local_irq_disable();
> > - csnow = cs->read(cs);
> > - clocksource_watchdog_inject_delay();
> > wdnow = watchdog->read(watchdog);
> > + clocksource_watchdog_inject_delay();
> > + csnow = cs->read(cs);
> > + wdagain = watchdog->read(watchdog);
> > local_irq_enable();
> > + delta = clocksource_delta(wdagain, wdnow, watchdog->mask);
> > + wdagain_delta = clocksource_cyc2ns(delta, watchdog->mult, watchdog->shift);
> > + if (wdagain_delta > WATCHDOG_MAX_SKEW) {
> > + wderr_nsec = wdagain_delta;
> > + if (nretries++ < max_read_retries)
> > + goto retry;
> > + }
> > + if (nretries) {
> > + pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d\n",
> > + smp_processor_id(), watchdog->name, wderr_nsec, nretries);
> > + }
> >
> > /* Clocksource initialized ? */
> > if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
>
> This can nicely be split out into a read function which avoids brain
> overload when reading. Something like the uncompiled below.

Good point, done, with minor adjustments.

> I so wish we could just delete all of this horror instead of making it
> more horrible.

Revisit deleting it in five years if there are no issues, whatever
"issue" might mean in this context? Five years should give time for
this to propagate to the enterprise distros.

Thanx, Paul

> Thanks,
>
> tglx
> ---
> --- a/kernel/time/clocksource.c
> +++ b/kernel/time/clocksource.c
> @@ -124,6 +124,12 @@ static void __clocksource_change_rating(
> #define WATCHDOG_INTERVAL (HZ >> 1)
> #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
>
> +/*
> + * The maximum delay between two consecutive readouts of the watchdog
> + * clocksource to detect SMI,NMI,vCPU preemption.
> + */
> +#define WATCHDOG_MAX_DELAY (100 * NSEC_PER_USEC)
> +
> static void clocksource_watchdog_work(struct work_struct *work)
> {
> /*
> @@ -184,12 +190,37 @@ void clocksource_mark_unstable(struct cl
> spin_unlock_irqrestore(&watchdog_lock, flags);
> }
>
> +static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
> +{
> + unsigned int nretries;
> + u64 wd_end, wd_delta;
> + int64_t wd_delay;
> +
> + for (nretries = 0; nretries < max_read_retries; nretries++) {
> + local_irq_disable();
> + *wdnow = watchdog->read(watchdog);
> + clocksource_watchdog_inject_delay();
> + *csnow = cs->read(cs);
> + wd_end = watchdog->read(watchdog);
> + local_irq_enable();
> +
> + wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
> + wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
> + if (wd_delay < WATCHDOG_MAX_DELAY)
> + return true;
> + }
> +
> + pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, %d attempts\n",
> + smp_processor_id(), watchdog->name, wd_delay, nretries);
> + return false;
> +}
> +
> static void clocksource_watchdog(struct timer_list *unused)
> {
> - struct clocksource *cs;
> u64 csnow, wdnow, cslast, wdlast, delta;
> - int64_t wd_nsec, cs_nsec;
> int next_cpu, reset_pending;
> + int64_t wd_nsec, cs_nsec;
> + struct clocksource *cs;
>
> spin_lock(&watchdog_lock);
> if (!watchdog_running)
> @@ -206,10 +237,14 @@ static void clocksource_watchdog(struct
> continue;
> }
>
> - local_irq_disable();
> - csnow = cs->read(cs);
> - wdnow = watchdog->read(watchdog);
> - local_irq_enable();
> + if (!cs_watchdog_read(cs, &csnow, &wdnow)) {
> + /*
> + * No point to continue if the watchdog readout is
> + * unreliable.
> + */
> + __clocksource_unstable(cs);
> + continue;
> + }
>
> /* Clocksource initialized ? */
> if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||