[ABOMINATION] x86: Fast interrupt return to userspace

From: Andy Lutomirski
Date: Tue May 06 2014 - 16:29:21 EST


This could be even faster if it were written in assembler :)

The only reason it's Signed-off-by is that I agree to the DCO.
That should not be construed to mean that anyone should apply
this patch. It's an abomination and it will do terrible,
terrible things.
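
For anyone who wants to follow along, the trick is: on the interrupt-return
path, if we interrupted __USER_CS, install_sysret_trampoline() copies the
user's RIP, R11, and RCX onto the user stack just below the red zone,
rewrites the saved RIP to point at a small stub at offset 0xc00 of the
vsyscall page (hence the 0xffffffffff600c00 constant) and the saved RSP to
point at those copies, and the exit then goes out through sysretq instead
of iretq.  The stub pops RCX and R11 back and does retq $128 to land on
the original RIP with the original RSP.  The resulting user stack, derived
from the patch below ("old RSP" is the userspace RSP at interrupt time):

  old RSP             <- restored exactly by the stub's retq $128
  old RSP - 128       <- bottom of the red zone, left untouched
  old RSP - 136          user RIP  (consumed by retq $128)
  old RSP - 144          user R11  (popq %r11)
  old RSP - 152          user RCX  (popq %rcx)  <- RSP handed to sysretq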

It boots, though :) I haven't tested it beyond that.
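
If anyone wants to poke at it beyond booting, the obvious failure mode is
user RCX/R11 getting eaten on an interrupt return that takes the sysret
path.  A quick userspace check along these lines should catch that (an
untested sketch, not part of the patch; the file name, loop count, and
magic values are arbitrary):

/* rcx_r11_check.c: hypothetical test, not part of this patch.  Spin with
 * known values in rcx/r11 so timer interrupts land in the pause window;
 * if a kernel exit clobbers them, the compare fails.  Build with gcc -O2. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t rcx_out, r11_out;
	long i;

	for (i = 0; i < 100000000L; i++) {
		asm volatile ("movabs $0x1234567890abcdef, %%rcx\n\t"
			      "movabs $0xfedcba0987654321, %%r11\n\t"
			      "pause\n\t"	/* window for an interrupt */
			      "movq %%rcx, %0\n\t"
			      "movq %%r11, %1"
			      : "=r" (rcx_out), "=r" (r11_out)
			      : : "rcx", "r11");
		if (rcx_out != 0x1234567890abcdefUL ||
		    r11_out != 0xfedcba0987654321UL) {
			printf("clobbered: rcx=0x%llx r11=0x%llx\n",
			       (unsigned long long)rcx_out,
			       (unsigned long long)r11_out);
			return 1;
		}
	}
	printf("rcx/r11 survived\n");
	return 0;
}

Without the trampoline, a raw sysretq exit would hand userspace back
RCX = return RIP and R11 = RFLAGS, so this should fail quickly if the stub
or the stack copies are wrong.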

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
---
 arch/x86/include/asm/calling.h    | 10 ++++++++++
 arch/x86/kernel/entry_64.S        | 14 ++++++++++++++
 arch/x86/kernel/process_64.c      | 37 +++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/vsyscall_64.c     |  2 +-
 arch/x86/kernel/vsyscall_emu_64.S |  5 +++++
 5 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73b..ead0345 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -46,7 +46,9 @@ For 32-bit we have the following conventions - kernel is built with

*/

+#ifdef __ASSEMBLY__
#include <asm/dwarf2.h>
+#endif

#ifdef CONFIG_X86_64

@@ -85,6 +87,8 @@ For 32-bit we have the following conventions - kernel is built with
#define ARGOFFSET R11
#define SWFRAME ORIG_RAX

+#ifdef __ASSEMBLY__
+
.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
subq $9*8+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET 9*8+\addskip
@@ -195,8 +199,12 @@ For 32-bit we have the following conventions - kernel is built with
.byte 0xf1
.endm

+#endif /* __ASSEMBLY__ */
+
#else /* CONFIG_X86_64 */

+#ifdef __ASSEMBLY__
+
/*
* For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
* are different from the entry_32.S versions in not changing the segment
@@ -240,5 +248,7 @@ For 32-bit we have the following conventions - kernel is built with
CFI_RESTORE eax
.endm

+#endif /* __ASSEMBLY__ */
+
#endif /* CONFIG_X86_64 */

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c36..7e3eae1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1027,6 +1027,9 @@ retint_swapgs: /* return to user-space */
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ
+	call install_sysret_trampoline
+	test %rax,%rax
+	jnz iret_via_sysret
	SWAPGS
	jmp restore_args

@@ -1036,6 +1039,7 @@ retint_restore_args: /* return to kernel space */
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ
+
restore_args:
	RESTORE_ARGS 1,8,1

@@ -1043,6 +1047,16 @@ irq_return:
	INTERRUPT_RETURN
	_ASM_EXTABLE(irq_return, bad_iret)

+iret_via_sysret:
+	SWAPGS
+	RESTORE_ARGS 1,8,1
+	popq %rcx		/* RIP (rewritten to the stub); sysretq jumps via %rcx */
+	popq %r11		/* CS: discarded, overwritten by the next pop */
+	popq %r11		/* RFLAGS: sysretq restores flags from %r11 */
+	popq %rsp		/* RSP (rewritten to point at the saved RCX/R11/RIP) */
+	/* ignore SS */
+	sysretq
+
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iretq
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9c0280f..e48aced 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -562,3 +562,40 @@ unsigned long KSTK_ESP(struct task_struct *task)
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
+
+#include <asm/calling.h>
+
+unsigned long notrace install_sysret_trampoline(void)
+{
+	unsigned long *here = __builtin_frame_address(0);
+	unsigned long *asmframe = here + 2;	/* past our saved %rbp and return address */
+	unsigned long __user *newrsp;
+
+#define FRAMEVAL(x) asmframe[((x)-ARGOFFSET) / 8]
+	newrsp = (unsigned long __user * __force)(FRAMEVAL(RSP) - 128 - 3*8);	/* 3 slots below the red zone */
+
+	if (FRAMEVAL(CS) != __USER_CS)
+		return 0;
+
+	/*
+	 * A real implementation would do:
+	 * if (!access_ok(VERIFY_WRITE, newrsp, 3*8))
+	 *	return 0;
+	 */
+
+	if (__put_user(FRAMEVAL(RIP), newrsp + 2))
+		return 0;
+
+	if (__put_user(FRAMEVAL(R11), newrsp + 1))
+		return 0;
+
+	if (__put_user(FRAMEVAL(RCX), newrsp))
+		return 0;
+
+	/* Hi there, optimizer. */
+	ACCESS_ONCE(FRAMEVAL(RIP)) = 0xffffffffff600c00;	/* sysret stub in the vsyscall page */
+	ACCESS_ONCE(FRAMEVAL(RSP)) = (unsigned long)newrsp;
+	return 1;
+
+#undef FRAMEVAL
+}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8b3b3eb..77a5ef3 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -54,7 +54,7 @@

DEFINE_VVAR(int, vgetcpu_mode);

-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;

static int __init vsyscall_setup(char *str)
{
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
index c9596a9..a54a780 100644
--- a/arch/x86/kernel/vsyscall_emu_64.S
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -32,6 +32,11 @@ __vsyscall_page:
	syscall
	ret

+	.balign 1024, 0xcc	/* stub lands at offset 0xc00, i.e. 0xffffffffff600c00 */
+	popq %rcx		/* user RCX, saved by install_sysret_trampoline() */
+	popq %r11		/* user R11 */
+	retq $128		/* pop the saved user RIP, then skip back over the red zone */
+
	.balign 4096, 0xcc

	.size __vsyscall_page, 4096
--
1.9.0
