[PATCH] sched/idle: Make idle poll dynamic per-cpu

From: Daniel Bristot de Oliveira
Date: Thu Jan 12 2023 - 11:27:47 EST


idle=poll is frequently used on ultra-low-latency systems. Examples of
such systems are high-performance trading and 5G RAN. The performance
gain comes from avoiding the idle driver machinery and from keeping the
CPU always in an active state - avoiding (odd) hardware heuristics that
are out of the control of the OS.

Currently, idle=poll is an all-or-nothing static option defined at
boot time. The motivations for making this option dynamic and per-cpu
are twofold:

1) Reduce the power usage/heat by allowing only selected CPUs to
do idle polling;
2) Allow multi-tenant systems (e.g., Kubernetes) to enable idle
polling only when ultra-low-latency applications are present
on specific CPUs.

Joe Mario did some experiments with this option enabled, and the results
were significant. For example, with dynamic idle polling enabled on
selected CPUs, cyclictest results are as good as with system-wide
idle=poll, while CPU power consumption drops from 381 to 233 watts.

Also, limiting idle=poll to the set of CPUs that benefit from
it allows other CPUs to benefit from frequency boosts. Joe also
showed that the result can be a round-trip improvement on the
order of 80 nsec when system-wide idle=poll is not used.

The user can enable idle polling with this command:
# echo 1 > /sys/devices/system/cpu/cpu{CPU_ID}/idle_poll

And disable it via:
# echo 0 > /sys/devices/system/cpu/cpu{CPU_ID}/idle_poll
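
For example (the CPU numbers are illustrative), idle polling can be
enabled for a set of CPUs and the per-cpu state read back with:

# for cpu in 2 3; do echo 1 > /sys/devices/system/cpu/cpu$cpu/idle_poll; done
# grep . /sys/devices/system/cpu/cpu*/idle_poll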

By default, idle polling is disabled on all CPUs, preserving the
current behavior. A static key avoids the overhead of the cpumask
check when idle polling is not enabled on any CPU.
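
As an illustration only, a minimal userspace helper to toggle the knob
could look like the sketch below; it assumes nothing beyond the sysfs
file added by this patch (the file name idle_poll.c and the helper are
hypothetical; e.g., "./idle_poll 3 1" would enable polling on CPU 3):

/* idle_poll.c: toggle per-cpu idle polling via sysfs (sketch). */
#include <stdio.h>
#include <stdlib.h>

static int set_idle_poll(int cpu, int enable)
{
	char path[64];
	FILE *f;

	/* Path of the sysfs attribute added by this patch. */
	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/idle_poll", cpu);
	f = fopen(path, "w");
	if (!f)
		return -1;

	/* The kernel parses the value with kstrtobool(), so "0"/"1" work. */
	fprintf(f, "%d\n", enable);
	return fclose(f);
}

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <cpu> <0|1>\n", argv[0]);
		return 1;
	}
	return set_idle_poll(atoi(argv[1]), atoi(argv[2])) ? 1 : 0;
}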

Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Ben Segall <bsegall@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
Cc: Valentin Schneider <vschneid@xxxxxxxxxx>
Cc: Joe Mario <jmario@xxxxxxxxxx>
Signed-off-by: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
---
kernel/sched/idle.c | 97 +++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 93 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f26ab2675f7d..c6ef1322d549 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,91 @@
 /* Linker adds these: start and end of __cpuidle functions */
 extern char __cpuidle_text_start[], __cpuidle_text_end[];
 
+/*
+ * per-cpu idle polling selector.
+ */
+static struct cpumask cpu_poll_mask;
+DEFINE_STATIC_KEY_FALSE(cpu_poll_enabled);
+
+/*
+ * Protects the mask/static key relation.
+ */
+static DEFINE_MUTEX(cpu_poll_mutex);
+
+static ssize_t idle_poll_store(struct device *dev, struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	int cpu = dev->id;
+	int retval, set;
+	bool val;
+
+	retval = kstrtobool(buf, &val);
+	if (retval)
+		return retval;
+
+	mutex_lock(&cpu_poll_mutex);
+
+	if (val) {
+		set = cpumask_test_and_set_cpu(cpu, &cpu_poll_mask);
+
+		/*
+		 * If the CPU was already on, do not increase the static key usage.
+		 */
+		if (!set)
+			static_branch_inc(&cpu_poll_enabled);
+	} else {
+		set = cpumask_test_and_clear_cpu(cpu, &cpu_poll_mask);
+
+		/*
+		 * If the CPU was already off, do not decrease the static key usage.
+		 */
+		if (set)
+			static_branch_dec(&cpu_poll_enabled);
+	}
+
+	mutex_unlock(&cpu_poll_mutex);
+
+	return count;
+}
+
+static ssize_t idle_poll_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%d\n", cpumask_test_cpu(dev->id, &cpu_poll_mask));
+}
+
+static DEVICE_ATTR_RW(idle_poll);
+
+static const struct attribute *idle_poll_attrs[] = {
+	&dev_attr_idle_poll.attr,
+	NULL
+};
+
+static int __init idle_poll_sysfs_init(void)
+{
+	int cpu, retval;
+
+	for_each_possible_cpu(cpu) {
+		struct device *dev = get_cpu_device(cpu);
+
+		if (!dev)
+			continue;
+		retval = sysfs_create_files(&dev->kobj, idle_poll_attrs);
+		if (retval)
+			return retval;
+	}
+
+	return 0;
+}
+device_initcall(idle_poll_sysfs_init);
+
+static int is_cpu_idle_poll(int cpu)
+{
+	if (static_branch_unlikely(&cpu_poll_enabled))
+		return cpumask_test_cpu(cpu, &cpu_poll_mask);
+
+	return 0;
+}
+
 /**
  * sched_idle_set_state - Record idle state for the current CPU.
  * @idle_state: State to record.
@@ -51,18 +136,21 @@ __setup("hlt", cpu_idle_nopoll_setup);

 static noinline int __cpuidle cpu_idle_poll(void)
 {
-	trace_cpu_idle(0, smp_processor_id());
+	int cpu = smp_processor_id();
+
+	trace_cpu_idle(0, cpu);
 	stop_critical_timings();
 	ct_idle_enter();
 	local_irq_enable();
 
 	while (!tif_need_resched() &&
-	       (cpu_idle_force_poll || tick_check_broadcast_expired()))
+	       (cpu_idle_force_poll || tick_check_broadcast_expired() ||
+		is_cpu_idle_poll(cpu)))
 		cpu_relax();
 
 	ct_idle_exit();
 	start_critical_timings();
-	trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+	trace_cpu_idle(PWR_EVENT_EXIT, cpu);
 
 	return 1;
 }
@@ -296,7 +384,8 @@ static void do_idle(void)
 		 * broadcast device expired for us, we don't want to go deep
 		 * idle as we know that the IPI is going to arrive right away.
 		 */
-		if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+		if (cpu_idle_force_poll || tick_check_broadcast_expired() ||
+		    is_cpu_idle_poll(cpu)) {
 			tick_nohz_idle_restart_tick();
 			cpu_idle_poll();
 		} else {
--
2.38.1