[PATCH] sched: Add a new version sysctl to control child runs first

From: cgel . zte
Date: Sun Sep 12 2021 - 00:44:07 EST


From: Yang Yang <yang.yang29@xxxxxxxxxx>

The old version of this sysctl has some problems. First, it allows
setting a value bigger than 1, which is unnecessary. Second, it does not
follow the capability rules: writes are not restricted to CAP_SYS_ADMIN.
Third, it does not use a static key. This new version fixes all of these
problems.

Signed-off-by: Yang Yang <yang.yang29@xxxxxxxxxx>
Reported-by: Zeal Robot <zealci@xxxxxxxxxx>
---
include/linux/sched/sysctl.h | 2 ++
kernel/sched/core.c | 35 +++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 3 ++-
kernel/sched/sched.h | 1 +
kernel/sysctl.c | 6 ++++--
5 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 304f431178fd..0a194d0cf692 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -74,6 +74,8 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
+int sysctl_child_runs_first(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
extern unsigned int sysctl_sched_energy_aware;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c4462c454ab9..bfea7ecf3b83 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4323,6 +4323,41 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
#endif /* CONFIG_PROC_SYSCTL */
#endif /* CONFIG_SCHEDSTATS */

+DEFINE_STATIC_KEY_FALSE(child_runs_first);
+
+static void set_child_runs_first(bool enabled) /* flip the static key and mirror the state in the sysctl variable */
+{
+ if (enabled) {
+ static_branch_enable(&child_runs_first);
+ sysctl_sched_child_runs_first = 1; /* NOTE(review): no reader of this variable is visible in this patch once fair.c tests the key - confirm it is still needed */
+ } else {
+ static_branch_disable(&child_runs_first);
+ sysctl_sched_child_runs_first = 0;
+ }
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_child_runs_first(struct ctl_table *table, int write, /* .proc_handler for the "sched_child_runs_first" kern_table entry */
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&child_runs_first); /* snapshot of the current key state */
+
+ if (write && !capable(CAP_SYS_ADMIN)) /* only writes are gated on CAP_SYS_ADMIN here */
+ return -EPERM;
+
+ t = *table; /* work on a local copy so *table itself is not modified */
+ t.data = &state; /* the table entry has .data = NULL; point the copy at the local snapshot */
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); /* extra1/extra2 (SYSCTL_ZERO/SYSCTL_ONE) come along with the copy */
+ if (err < 0)
+ return err;
+ if (write) /* apply the new value only after proc_dointvec_minmax() succeeded */
+ set_child_runs_first(state);
+ return err;
+}
+#endif /* CONFIG_PROC_SYSCTL */
+
/*
* fork()/clone()-time setup:
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f245b939..f6d4307bd654 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11099,7 +11099,8 @@ static void task_fork_fair(struct task_struct *p)
}
place_entity(cfs_rq, se, 1);

- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
+ if (static_branch_unlikely(&child_runs_first) &&
+ curr && entity_before(curr, se)) {
/*
* Upon rescheduling, sched_class::put_prev_task() will place
* 'current' within the tree based on its new key value.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e117..89ac11e48173 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2002,6 +2002,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =

extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
+DECLARE_STATIC_KEY_FALSE(child_runs_first);

static inline u64 global_rt_period(void)
{
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 083be6af29d7..72063cffc565 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1773,10 +1773,12 @@ int proc_do_static_key(struct ctl_table *table, int write,
static struct ctl_table kern_table[] = {
{
.procname = "sched_child_runs_first",
- .data = &sysctl_sched_child_runs_first,
+ .data = NULL,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = sysctl_child_runs_first,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#ifdef CONFIG_SCHEDSTATS
{
--
2.25.1