[RFC PATCH 3/3] sched/fair: add debugfs knob for yield throttling

From: Kuba Piecuch
Date: Fri Aug 08 2025 - 16:04:23 EST


Add a debugfs knob, yield_interval_ns, which specifies the interval within
which any given thread is allowed to yield at most once. Subsequent calls
to sched_yield() within the interval simply return without calling
schedule().

Allowing unthrottled calls to sched_yield() invites DoS-like behavior:
threads can continually call into schedule(), which results in various
types of contention.

For example, if a process has a profiling timer enabled, every call to
update_curr() results in an atomic add to a shared, process-wide counter,
p->signal->cputimer.cputime_atomic.sum_exec_runtime, performed in
account_group_exec_runtime().
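
For reference, the contended path boils down to a single atomic add on a
process-shared counter; roughly paraphrased from account_group_exec_runtime()
in kernel/sched/stats.h (not part of this patch):

/* Roughly paraphrased from kernel/sched/stats.h; not part of this patch. */
static inline void account_group_exec_runtime(struct task_struct *tsk,
					      unsigned long long ns)
{
	/* NULL unless a process-wide CPU timer (e.g. ITIMER_PROF) is armed. */
	struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);

	if (!cputimer)
		return;

	/*
	 * Every thread in the process updates the same cache line here;
	 * this is the atomic add discussed above.
	 */
	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
}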

In a synthetic benchmark consisting of 80 threads (2 per core) calling
sched_yield() in a busy loop with a profiling timer enabled, we observed
that ~80% of CPU time is spent in that single atomic add instruction.
Setting yield_interval_ns to 10000 lowers that percentage to 1-2%, at the
cost of reducing the total number of yields that end up calling schedule()
by ~60%. The benchmark was run on an Intel Emerald Rapids CPU with
60 physical cores.
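
A minimal userspace sketch of such a benchmark might look like this
(illustrative only: thread pinning and measurement are omitted, and the
timer period is an arbitrary choice):

/* Illustrative sketch of the benchmark, not the exact harness used above. */
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <sys/time.h>
#include <unistd.h>

#define NTHREADS 80

static void *yielder(void *arg)
{
	(void)arg;
	for (;;)
		sched_yield();	/* busy loop of yields */
	return NULL;
}

static void prof_handler(int sig)
{
	(void)sig;	/* we only need SIGPROF not to kill the process */
}

int main(void)
{
	/* Arm a process-wide profiling timer; the 10ms period is arbitrary. */
	struct itimerval it = {
		.it_interval = { .tv_usec = 10000 },
		.it_value    = { .tv_usec = 10000 },
	};
	pthread_t tids[NTHREADS];
	int i;

	signal(SIGPROF, prof_handler);
	setitimer(ITIMER_PROF, &it, NULL);

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tids[i], NULL, yielder, NULL);

	pause();	/* run until interrupted; profile externally, e.g. with perf */
	return 0;
}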

Signed-off-by: Kuba Piecuch <jpiecuch@xxxxxxxxxx>
---
 include/linux/sched.h |  2 ++
 kernel/sched/core.c   |  1 +
 kernel/sched/debug.c  |  2 ++
 kernel/sched/fair.c   | 29 +++++++++++++++++++++++++++++
 kernel/sched/sched.h  |  2 ++
 5 files changed, 36 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index aa9c5be7a6325..c637025792fc6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -946,6 +946,8 @@ struct task_struct {
 
 	struct sched_info sched_info;
 
+	ktime_t last_yield;
+
 	struct list_head tasks;
 #ifdef CONFIG_SMP
 	struct plist_node pushable_tasks;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 81c6df746df17..acc87c9ff5681 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4493,6 +4493,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
 	p->on_rq = 0;
 
+	p->last_yield = ktime_set(0, 0);
 	p->se.on_rq = 0;
 	p->se.exec_start = 0;
 	p->se.sum_exec_runtime = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 557246880a7e0..93d2c988d491d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -512,6 +512,8 @@ static __init int sched_init_debug(void)
 	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
 	debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
 
+	debugfs_create_u32("yield_interval_ns", 0644, debugfs_sched, &sysctl_sched_yield_interval);
+
 #ifdef CONFIG_SMP
 	debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
 	debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3f9bfc64e0bc5..39ca52128f502 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -81,6 +81,18 @@ static unsigned int normalized_sysctl_sched_base_slice = 700000ULL;
 
 __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL;
 
+/*
+ * This interval controls how often a given CFS thread can yield.
+ * A given thread can only yield once within this interval.
+ * The throttling is accomplished by making calls to sched_yield() return
+ * without actually calling schedule().
+ * A value of 0 means yields are not throttled.
+ *
+ * (default: 0, units: nanoseconds)
+ */
+__read_mostly unsigned int sysctl_sched_yield_interval;
+
+
 static int __init setup_sched_thermal_decay_shift(char *str)
 {
 	pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
@@ -9015,6 +9027,7 @@ static bool yield_task_fair(struct rq *rq)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se;
+	ktime_t now, throttle_end_time;
 
 	/*
 	 * Are we the only task in the tree?
@@ -9024,6 +9037,22 @@ static bool yield_task_fair(struct rq *rq)
 
 	clear_buddies(cfs_rq, se);
 
+	if (unlikely(sysctl_sched_yield_interval)) {
+		/*
+		 * Limit how often a given thread can call schedule() via
+		 * sched_yield() to once every sysctl_sched_yield_interval
+		 * nanoseconds.
+		 */
+		now = ktime_get();
+		throttle_end_time = ktime_add_ns(curr->last_yield,
+						 sysctl_sched_yield_interval);
+
+		if (unlikely(ktime_before(now, throttle_end_time)))
+			return false;
+
+		curr->last_yield = now;
+	}
+
 	update_rq_clock(rq);
 	/*
 	 * Update run-time statistics of the 'current'.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8b2cd54a09942..14e3d90b0df0e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2827,6 +2827,8 @@ extern __read_mostly unsigned int sysctl_sched_migration_cost;
 
 extern unsigned int sysctl_sched_base_slice;
 
+extern __read_mostly unsigned int sysctl_sched_yield_interval;
+
 extern int sysctl_resched_latency_warn_ms;
 extern int sysctl_resched_latency_warn_once;

--
2.51.0.rc0.155.g4a0f42376b-goog