[PATCH 1/7] x86/fpu: Simplify the fpu->last_cpu logic and rename it to fpu->fpregs_cached

From: Ingo Molnar
Date: Thu Jan 26 2017 - 06:26:57 EST


fpu->last_cpu records the last CPU a given FPU context structure was used on.
This enables an important optimization: if a task schedules out to a kernel
thread and then gets scheduled back after only FPU-inactive kernel threads
have executed, the FPU state in the registers is still intact and the FPU
restore can be skipped, speeding up the context switch.

The same logic can be implemented in a slightly simpler way, by using a
single boolean flag: fpu->fpregs_cached tells us whether the context's FPU
registers are cached in the CPU.

The only difference is that this flag has to be invalidated when a task is
migrated away from its CPU - but that is a slow path compared to context
switches.

Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: Fenghua Yu <fenghua.yu@xxxxxxxxx>
Cc: H. Peter Anvin <hpa@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Oleg Nesterov <oleg@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Yu-cheng Yu <yu-cheng.yu@xxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
arch/x86/include/asm/fpu/internal.h | 15 ++++++++-------
arch/x86/include/asm/fpu/types.h | 24 ++++++++++--------------
arch/x86/include/asm/switch_to.h | 10 ++++++++++
arch/x86/kernel/fpu/core.c | 2 +-
kernel/sched/core.c | 2 ++
kernel/sched/sched.h | 8 ++++++++
6 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 255645f60ca2..2eaf93cf11cc 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -490,7 +490,7 @@ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
* The in-register FPU state for an FPU context on a CPU is assumed to be
- * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
+ * valid if fpu->fpregs_cached is still set, and if the fpu_fpregs_owner_ctx
* matches the FPU.
*
* If the FPU register state is valid, the kernel can skip restoring the
@@ -512,12 +512,12 @@ static inline void __cpu_invalidate_fpregs_state(void)

static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
- fpu->last_cpu = -1;
+ fpu->fpregs_cached = 0;
}

static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
- return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+ return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && fpu->fpregs_cached;
}

/*
@@ -573,15 +573,16 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
if (old_fpu->fpregs_active) {
if (!copy_fpregs_to_fpstate(old_fpu))
- old_fpu->last_cpu = -1;
+ old_fpu->fpregs_cached = 0;
else
- old_fpu->last_cpu = cpu;
+ old_fpu->fpregs_cached = 1;

/* But leave fpu_fpregs_owner_ctx! */
old_fpu->fpregs_active = 0;
trace_x86_fpu_regs_deactivated(old_fpu);
- } else
- old_fpu->last_cpu = -1;
+ } else {
+ old_fpu->fpregs_cached = 0;
+ }
}

/*
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 3c80f5b9c09d..3090b0d7b232 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -276,20 +276,6 @@ union fpregs_state {
*/
struct fpu {
/*
- * @last_cpu:
- *
- * Records the last CPU on which this context was loaded into
- * FPU registers. (In the lazy-restore case we might be
- * able to reuse FPU registers across multiple context switches
- * this way, if no intermediate task used the FPU.)
- *
- * A value of -1 is used to indicate that the FPU state in context
- * memory is newer than the FPU state in registers, and that the
- * FPU state should be reloaded next time the task is run.
- */
- unsigned int last_cpu;
-
- /*
* @fpstate_active:
*
* This flag indicates whether this context is active: if the task
@@ -322,6 +308,16 @@ struct fpu {
unsigned char fpregs_active;

/*
+ * @fpregs_cached:
+ *
+ * This flag tells us whether this context's FPU registers may still
+ * be cached in the CPU the task was last scheduled out on.
+ *
+ * This is set to 0 if a task is migrated to another CPU.
+ */
+ unsigned char fpregs_cached;
+
+ /*
* @state:
*
* In-memory copy of all FPU registers that we save/restore
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index fcc5cd387fd1..a7146dadb31d 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -72,4 +72,14 @@ do { \
((last) = __switch_to_asm((prev), (next))); \
} while (0)

+
+/*
+ * The task-migration arch callback clears the FPU registers cache:
+ */
+static inline void arch_task_migrate(struct task_struct *p)
+{
+ p->thread.fpu.fpregs_cached = 0;
+}
+#define arch_task_migrate arch_task_migrate
+
#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index e1114f070c2d..287f1cb32b59 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -190,7 +190,7 @@ EXPORT_SYMBOL_GPL(fpstate_init);
int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
{
dst_fpu->fpregs_active = 0;
- dst_fpu->last_cpu = -1;
+ dst_fpu->fpregs_cached = 0;

if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU))
return 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c56fb57f2991..7eb2f3041fde 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1253,6 +1253,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p);
p->se.nr_migrations++;
perf_event_task_migrate(p);
+
+ arch_task_migrate(p);
}

__set_task_cpu(p, new_cpu);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7b34c7826ca5..ff8a894132e4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1824,3 +1824,11 @@ static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
+
+/*
+ * Default task-migration arch callback:
+ */
+#ifndef arch_task_migrate
+static inline void arch_task_migrate(struct task_struct *p) { }
+#endif
+
--
2.7.4