Re: [RFC PATCH v2 3/3] sched: introduce synchronized idle injection

From: Peter Zijlstra
Date: Tue Nov 10 2015 - 08:23:35 EST


On Mon, Nov 09, 2015 at 04:21:23PM -0800, Jacob Pan wrote:
> +++ b/include/trace/events/sched.h

> +/*
> + * Tracepoint for idle injection
> + */
> +TRACE_EVENT(sched_cfs_idle_inject,
> +
> + TP_PROTO(char *msg, int throttled),
> +
> + TP_ARGS(msg, throttled),
> +
> + TP_STRUCT__entry(
> + __string(msg, msg)
> + __field(int, throttled)
> + ),
> +
> + TP_fast_assign(
> + __assign_str(msg, msg);
> + __entry->throttled = throttled;
> + ),
> +
> + TP_printk("%s: throttled=%d", __get_str(msg), __entry->throttled)
> +);

So I hate tracepoints.. and I'd rather not see them. But at the very
least kill that @msg field and replace it with an enum or so.


> +/*
> + * Knobs for controlling percentage of time when idle is forced across all
> + * CPUs. This is a power management feature intended for achieving deepest
> + * and broadest idle without lower CPU frequencies to less optimal level.
> + * No action is taken if CPUs are natually idle.
> + */
> +#ifdef CONFIG_CFS_IDLE_INJECT
> +unsigned int sysctl_sched_cfs_idle_inject_pct;
> +unsigned int sysctl_sched_cfs_idle_inject_duration = 10UL;

since you're playing the ifdef game, you might as well also do:

static inline void cfs_rq_nr_running_inc(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
cfs_rq->runnable = true;
}

static inline bool cfs_rq_runnable(struct cfs_rq *cfs_rq)
{
return cfs_rq->runnable;
}

#else

static inline void cfs_rq_nr_running_inc(struct cfs_rq *cfs_rq)
{
cfs_rq->nr_running++;
}

static inline bool cfs_rq_runnable(struct cfs_rq *cfs_rq)
{
return !!cfs_rq->nr_running;
}

> +#endif
> +
> static inline void update_load_add(struct load_weight *lw, unsigned long inc)
> {
> lw->weight += inc;
> @@ -2334,7 +2346,9 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> list_add(&se->group_node, &rq->cfs_tasks);
> }
> #endif
> - cfs_rq->nr_running++;
> +
> + if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
> + cfs_rq->runnable = true;

which makes that:
cfs_rq_nr_running_inc();

> }
>
> static void
> @@ -2347,7 +2361,9 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> account_numa_dequeue(rq_of(cfs_rq), task_of(se));
> list_del_init(&se->group_node);
> }
> - cfs_rq->nr_running--;
> +
> + if (!--cfs_rq->nr_running && !cfs_rq->forced_idle)
> + cfs_rq->runnable = false;

cfs_rq_nr_running_dec();

> }
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -5139,7 +5155,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
>
> again:
> #ifdef CONFIG_FAIR_GROUP_SCHED
> - if (!cfs_rq->nr_running)
> + if (!cfs_rq->runnable)

if (!cfs_rq_runnable(cfs_rq))
> goto idle;
>
> if (prev->sched_class != &fair_sched_class)

> idle:
> + if ((cfs_rq->forced_idle)) {
> + if (unlikely(local_softirq_pending())) {
> + trace_sched_cfs_idle_inject("softirq pending", 1);

> + cfs_rq->forced_idle = false;
> + cfs_rq->runnable = cfs_rq->nr_running;

maybe:
__unthrottle_cfs_rq(cfs_rq); ?

> + goto again;
> + }
> + trace_sched_cfs_idle_inject("forced idle", 1);
> + return NULL;
> + }
> /*
> * This is OK, because current is on_cpu, which avoids it being picked
> * for load-balance and preemption/IRQs are still disabled avoiding
> @@ -8318,3 +8344,350 @@ __init void init_sched_fair_class(void)
> #endif /* SMP */
>
> }
> +
> +#ifdef CONFIG_CFS_IDLE_INJECT

> +static atomic_t idle_inject_active;

You only use atomic_{read,set} on this, therefore atomic_t is pointless.

> +static DEFINE_PER_CPU(struct hrtimer, idle_inject_timer);
> +static DEFINE_PER_CPU(bool, idle_injected);

I tend to prefer to not use bool as a storage class; it's ill-defined.

> +/* protect injection parameters from runtime changes */
> +static DEFINE_SPINLOCK(idle_inject_lock);

A global lock, yay :-), I think you want this to be a RAW_SPINLOCK
though. As on -RT this would want to actually run from IRQ context too.

> +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *hrtimer)
> +{
> + struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
> + int cpu = smp_processor_id();
> + ktime_t now, delta, period;
> + bool status;
> +
> + now = hrtimer_cb_get_time(hrt);

You're not interested in the current time.

> +
> + status = raw_cpu_read(idle_injected);
> + if (status) {
> + /*
> + * We were injecting idle in the last phase, let's forward the
> + * timer to the next period
> + *
> + * status: 1 0 1 0
> + * ____ ____________________ _______
> + * |________| |_________|
> + *
> + * |duration| interval |
> + *
> + * ^ we are here
> + * forward to here: ^
> + */
> + delta = ktime_sub(now, inject_start_time);
> + period = ktime_add(ms_to_ktime(duration),
> + ms_to_ktime(inject_interval));
> + delta = ktime_roundup(delta, period);
> + hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));

This doesn't make any sense. Who cares what the current time is.

> + } else {
> + /*
> + * We were not injecting idle in the last phase, let's forward
> + * timer after forced idle duration
> + * ____ ____________________ _______
> + * |________| |_________|
> + *
> + * |duration| interval |
> + *
> + * ^ we are here
> + * ^ forward timer to here
> + */
> + hrtimer_set_expires(hrt, ktime_add(ms_to_ktime(duration), now));

Same here, we don't care about the current time. The timer was at the
previous start of injection, just forward it a whole period to find the
next injection slot.

> + }

It looks like what you want is:

hrtimer_forward(hrt, period);

unconditionally.

> + raw_cpu_write(idle_injected, !status);
> + trace_sched_cfs_idle_inject("idle sync timer", !status);
> + if (status)
> + unthrottle_rq(cpu);
> + else
> + throttle_rq(cpu);
> +
> + return HRTIMER_RESTART;
> +}
> +
> +static void idle_inject_timer_start(void *info)
> +{
> + int cpu = smp_processor_id();
> + struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
> +
> + this_cpu_write(idle_injected, true);
> + set_bit(cpu, idle_inject_cpumask);
> + hrtimer_start(hrt, ms_to_ktime(duration), HRTIMER_MODE_ABS_PINNED);
> + hrtimer_set_expires(hrt, *(ktime_t *)info);

This is broken, _first_ set an expiration time, then start the timer.

Now you insert the timer into the RB tree on a previous expiration time,
then you modify the expiration time under it, effectively wrecking the
RB tree.

> +}

> +static void stop_idle_inject(void)
> +{
> + int i;
> + struct hrtimer *hrt;
> +
> + if (bitmap_weight(idle_inject_cpumask, num_possible_cpus())) {

I don't get the point of this bitmap; with the cpu notifier you
basically ensure this is equal to online_mask.

Also, this weight test is pointless, if the bitmap is empty the
for_each_set_bit() should be of equal cost -- and afaict nothing calling
this is performance critical in the first place.

> + for_each_set_bit(i, idle_inject_cpumask, num_possible_cpus()) {

> + hrt = &per_cpu(idle_inject_timer, i);
> + hrtimer_cancel(hrt);
> + unthrottle_rq(i);
> + }
> + }
> +}
> +
> +static int idle_inject_cpu_callback(struct notifier_block *nfb,
> + unsigned long action, void *hcpu)
> +{
> + unsigned long cpu = (unsigned long)hcpu;
> + struct hrtimer *hrt = &per_cpu(idle_inject_timer, cpu);
> + ktime_t now, delta, period;
> +
> + if (!atomic_read(&idle_inject_active))
> + goto exit_ok;

We should never get here if that weren't set, right? I mean you
register/unregister these callbacks around setting that variable.

> +
> + switch (action) {
> + case CPU_STARTING:
> + raw_cpu_write(idle_injected, true);
> +
> + hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
> + hrt->function = idle_inject_timer_fn;
> + set_bit(cpu, idle_inject_cpumask);
> +
> + now = hrtimer_cb_get_time(hrt);
> + hrtimer_start(hrt, ms_to_ktime(duration),
> + HRTIMER_MODE_ABS_PINNED);
> + /*
> + * When a new CPU comes online, we need to make sure it aligns
> + * its phase with the rest of the CPUs. So we set the
> + * timer to the next period based on the common starting time,
> + * then start injecting idle time.
> + */
> + spin_lock_irq(&idle_inject_lock);
> + delta = ktime_sub(now, inject_start_time);
> + period = ktime_add(ms_to_ktime(duration),
> + ms_to_ktime(inject_interval));
> + delta = ktime_roundup(delta, period);
> + spin_unlock_irq(&idle_inject_lock);
> + hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));

Same breakage here: you cannot call that on a timer you've already started.

> + break;
> + case CPU_DYING:
> + clear_bit(cpu, idle_inject_cpumask);
> + hrtimer_cancel(hrt);
> + raw_cpu_write(idle_injected, false);
> + unthrottle_rq(cpu);
> + break;
> + default:
> + return NOTIFY_DONE;
> + }
> +exit_ok:
> + return NOTIFY_OK;
> +}
> +
> +static int idle_inject_pm_callback(struct notifier_block *self,
> + unsigned long action, void *hcpu)
> +{
> + switch (action) {
> + case PM_HIBERNATION_PREPARE:
> + case PM_SUSPEND_PREPARE:
> + if (atomic_read(&idle_inject_active))
> + stop_idle_inject();

As with the above, if that were false, this whole callback would not be
called, seeing how you unregister before actually clearing that
idle_inject_active thing.

> + break;
> + case PM_POST_HIBERNATION:
> + case PM_POST_SUSPEND:
> + printk("POST SUSPEND restart idle injection\n");

Seems a tad inconsistent, printing here but not when stopping it.

> + start_idle_inject();
> + break;
> + default:
> + break;
> + }
> + return NOTIFY_OK;
> +}
> +
> +static struct notifier_block idle_inject_pm_notifier = {
> + .notifier_call = idle_inject_pm_callback,
> +};
> +
> +static struct notifier_block idle_inject_cpu_notifier = {
> + .notifier_call = idle_inject_cpu_callback,
> +};
> +
> +static void end_idle_inject(void) {
> + unregister_hotcpu_notifier(&idle_inject_cpu_notifier);
> + unregister_pm_notifier(&idle_inject_pm_notifier);

As per the above, these callbacks will not happen hereafter, and will
thus never see:

> + atomic_set(&idle_inject_active, 0);
> + kfree(idle_inject_cpumask);
> +}
> +
> +static int prepare_idle_inject(void)
> +{
> + int retval = 0;
> + int bitmap_size;
> + int cpu;
> + struct hrtimer *hrt;
> +
> + bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);

This is incorrect, you want nr_cpu_ids. There is no guarantee the CPU
space does not contain holes. But seeing I still don't see the point of
the mask, this might all fix itself by killing it altogether.

> + idle_inject_cpumask = kzalloc(bitmap_size, GFP_KERNEL);
> + if (!idle_inject_cpumask)
> + return -ENOMEM;
> +
> + retval = register_pm_notifier(&idle_inject_pm_notifier);
> + if (retval)
> + goto exit_free;
> + retval = register_hotcpu_notifier(&idle_inject_cpu_notifier);
> + if (retval)
> + goto exit_unregister_pm;
> + get_online_cpus();
> + for_each_online_cpu(cpu) {
> + hrt = &per_cpu(idle_inject_timer, cpu);
> + hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
> + hrt->function = idle_inject_timer_fn;
> + }
> + put_online_cpus();
> +
> + if (!duration)
> + duration = DEFAULT_DURATION_MSECS;
> +
> + return 0;
> +exit_unregister_pm:
> + unregister_pm_notifier(&idle_inject_pm_notifier);
> +exit_free:
> + kfree(idle_inject_cpumask);
> + return retval;
> +}
> +
> +int proc_sched_cfs_idle_inject_pct_handler(struct ctl_table *table,
> + int write,
> + void __user *buffer,
> + size_t *length, loff_t *ppos)
> +{
> + int ret;
> +
> + ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> + if (ret)
> + goto out;
> +
> + if (idle_pct != sysctl_sched_cfs_idle_inject_pct) {
> + if (!idle_pct)
> + start_idle_inject();
> + else if (!sysctl_sched_cfs_idle_inject_pct) {
> + stop_idle_inject();
> + end_idle_inject();
> + }
> +
> + /* recompute injection parameters */
> + spin_lock_irq(&idle_inject_lock);
> + idle_pct = sysctl_sched_cfs_idle_inject_pct;
> + /*
> + * duration is fixed for each injection period, we adjust
> + * non idle interval to satisfy the idle percentage set
> + * by the user. e.g. if duration is 10 and we want 33% idle
> + * then interval is 20.
> + * 33% idle
> + * ____ ___________________ _________
> + * |________| |________| 33% idle
> + * ____ ________ _______
> + * |________| |________| 50% idle
> + *
> + * |duration|interval|
> + */
> + if (idle_pct)
> + inject_interval = (duration * (100 - idle_pct))
> + / idle_pct;

This needs {} (or just exceed the 80 char thing).

> + spin_unlock_irq(&idle_inject_lock);
> +
> + }
> +out:
> + return ret;
> +}
> +
> +int proc_sched_cfs_idle_inject_duration_handler(struct ctl_table *table,
> + int write,
> + void __user *buffer,
> + size_t *length, loff_t *ppos)
> +{
> + int ret;
> +
> + ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> + if (ret)
> + goto out;
> +
> + if (duration == sysctl_sched_cfs_idle_inject_duration)
> + goto out;
> + /* recompute injection parameters */
> + spin_lock_irq(&idle_inject_lock);
> + duration = jiffies_to_msecs(sysctl_sched_cfs_idle_inject_duration);
> + if (idle_pct)
> + inject_interval = (duration * (100 - idle_pct)) / idle_pct;
> +
> + spin_unlock_irq(&idle_inject_lock);
> +out:
> + return ret;
> +}

And since you have proc handlers for both these, why not convert to
ktime here and avoid the endless ms_to_ktime() calls?

Also, maybe precompute the period, since that is what you really need.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/