[RFC PATCH v7 20/23] sched/coresched: config option for kernel protection

From: Julien Desfossez
Date: Fri Aug 28 2020 - 15:54:16 EST


From: Vineeth Pillai <viremana@xxxxxxxxxxxxxxxxxxx>

There are use cases where the kernel protection is not needed. One
example is using core scheduling for non-security purposes, such as
dynamically isolating a core for a particular process. Another is
testing or benchmarking the overhead of the kernel protection itself.

Add a compile-time and a boot-time option to disable the feature.
CONFIG_SCHED_CORE_KERNEL_PROTECTION enables the feature at compile
time and defaults to y when CONFIG_SCHED_CORE=y. The
sched_core_kernel_protection= boot parameter controls it at boot
time: a value of 0 disables the feature. An example command line is
shown below.
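
For example, booting with:

	sched_core_kernel_protection=0

on the kernel command line keeps core scheduling itself available
while skipping the pause/wait logic, even when the feature is
compiled in.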

Signed-off-by: Vineeth Pillai <viremana@xxxxxxxxxxxxxxxxxxx>
---
 .../admin-guide/kernel-parameters.txt |  9 +++++
 include/linux/sched.h                 |  2 +-
 kernel/Kconfig.preempt                | 13 +++++++
 kernel/sched/core.c                   | 39 ++++++++++++++++++-
 kernel/sched/sched.h                  |  2 +
5 files changed, 63 insertions(+), 2 deletions(-)
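
Note for reviewers: a rough sketch of how the entry paths in this
series are expected to use the API gated here (illustrative only; the
actual call sites are added by the earlier entry-code patches in this
series, and ti_check stands in for whatever thread_info flags the
caller wants rechecked while waiting):

	/* entering the kernel from user/guest mode */
	sched_core_unsafe_enter();

	/* ... syscall/interrupt work ... */

	/*
	 * Before returning to user/guest mode: mark this CPU as having
	 * exited the kernel, then wait until the whole core has left
	 * the core-wide unsafe state.
	 */
	sched_core_unsafe_exit_wait(ti_check);

With sched_core_kernel_protection=0 on the command line, both calls
bail out early on the disabled static key.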

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a1068742a6df..01e442388e4a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4611,6 +4611,15 @@

sbni= [NET] Granch SBNI12 leased line adapter

+	sched_core_kernel_protection=
+			[SCHED_CORE, SCHED_CORE_KERNEL_PROTECTION]
+			Pause SMT siblings of a core running in user mode
+			if at least one sibling of the core is running in
+			the kernel. This guarantees that kernel data is not
+			leaked to tasks that are not trusted by the kernel.
+			This feature is valid only when core scheduling is
+			enabled (CONFIG_SCHED_CORE).
+
sched_debug [KNL] Enables verbose scheduler debug messages.

schedstats= [KNL,X86] Enable or disable scheduled statistics.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e04ffe689cb..4d9ae6b4dcc9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2055,7 +2055,7 @@ int sched_trace_rq_nr_running(struct rq *rq);

const struct cpumask *sched_trace_rd_span(struct root_domain *rd);

-#ifdef CONFIG_SCHED_CORE
+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
void sched_core_unsafe_enter(void);
void sched_core_unsafe_exit(void);
void sched_core_unsafe_exit_wait(unsigned long ti_check);
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 4488fbf4d3a8..52f86739f910 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -86,3 +86,16 @@ config SCHED_CORE
default y
depends on SCHED_SMT

+config SCHED_CORE_KERNEL_PROTECTION
+	bool "Kernel protection for core scheduling"
+	default y
+	depends on SCHED_CORE
+	help
+	  This option enables pausing all SMT siblings of a core running in
+	  user mode when at least one sibling of the core is in the kernel.
+	  This prevents kernel data from being leaked to untrusted tasks
+	  running on the siblings. This option is valid only if core
+	  scheduling (CONFIG_SCHED_CORE) is enabled.
+
+	  If in doubt, select 'Y' when CONFIG_SCHED_CORE=y.
+
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0dc9172be04d..34238fd67f31 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -75,6 +75,24 @@ __read_mostly int scheduler_running;

#ifdef CONFIG_SCHED_CORE

+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
+
+DEFINE_STATIC_KEY_TRUE(sched_core_kernel_protection);
+static int __init set_sched_core_kernel_protection(char *str)
+{
+	unsigned long val = 0;
+
+	if (!str)
+		return 0;
+
+	if (!kstrtoul(str, 0, &val) && !val)
+		static_branch_disable(&sched_core_kernel_protection);
+
+	return 1;
+}
+__setup("sched_core_kernel_protection=", set_sched_core_kernel_protection);
+#endif
+
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);

/* kernel prio, less is more */
@@ -4600,6 +4618,8 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}

+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
+
/*
* Handler to attempt to enter kernel. It does nothing because the exit to
* usermode or guest mode will do the actual work (of waiting if needed).
@@ -4609,6 +4629,11 @@ static void sched_core_irq_work(struct irq_work *work)
return;
}

+static inline void init_sched_core_irq_work(struct rq *rq)
+{
+	init_irq_work(&rq->core_irq_work, sched_core_irq_work);
+}
+
/*
* sched_core_wait_till_safe - Pause the caller's hyperthread until the core
* exits the core-wide unsafe state. Obviously the CPU calling this function
@@ -4684,6 +4709,9 @@ void sched_core_unsafe_enter(void)
struct rq *rq;
int i, cpu;

+	if (!static_branch_likely(&sched_core_kernel_protection))
+		return;
+
/* Ensure that on return to user/guest, we check whether to wait. */
if (current->core_cookie)
set_tsk_thread_flag(current, TIF_UNSAFE_RET);
@@ -4769,6 +4797,9 @@ void sched_core_unsafe_exit(void)
struct rq *rq;
int cpu;

+	if (!static_branch_likely(&sched_core_kernel_protection))
+		return;
+
local_irq_save(flags);
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -4807,9 +4838,15 @@ void sched_core_unsafe_exit(void)

void sched_core_unsafe_exit_wait(unsigned long ti_check)
{
+	if (!static_branch_likely(&sched_core_kernel_protection))
+		return;
+
sched_core_unsafe_exit();
sched_core_wait_till_safe(ti_check);
}
+#else
+static inline void init_sched_core_irq_work(struct rq *rq) {}
+#endif /* CONFIG_SCHED_CORE_KERNEL_PROTECTION */

// XXX fairness/fwd progress conditions
/*
@@ -7795,7 +7832,7 @@ int sched_cpu_starting(unsigned int cpu)
rq = cpu_rq(i);
if (rq->core && rq->core == rq)
core_rq = rq;
-		init_irq_work(&rq->core_irq_work, sched_core_irq_work);
+		init_sched_core_irq_work(rq);
}

if (!core_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dbd8416ddaba..676818bdb9df 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1058,8 +1058,10 @@ struct rq {
unsigned int core_sched_seq;
struct rb_root core_tree;
unsigned char core_forceidle;
+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
struct irq_work core_irq_work; /* To force HT into kernel */
unsigned int core_this_unsafe_nest;
+#endif

/* shared state */
unsigned int core_task_seq;
--
2.17.1