Re: [PATCH 2/3 v3] x86: entry_64.S: always allocate complete "struct pt_regs"

From: Andy Lutomirski
Date: Wed Feb 18 2015 - 15:23:21 EST


On Thu, Feb 12, 2015 at 1:54 PM, Denys Vlasenko <dvlasenk@xxxxxxxxxx> wrote:
> 64-bit code was using six fewer stack slots by not saving/restoring
> registers which are callee-preserved according to the C ABI,
> and by not allocating space for them.
> Only when a syscall needed a complete "struct pt_regs"
> was the complete area allocated and filled in.
> As an additional twist, on interrupt entry a "slightly less truncated pt_regs"
> trick is used, to make nested interrupt stacks easier to unwind.
>
> This proved to be a source of significant obfuscation and subtle bugs.
> For example, stub_fork had to pop the return address,
> extend the struct, save registers, and push the return address back. Ugly.
> ia32_ptregs_common pops the return address and "returns" via a jmp insn,
> throwing a wrench into the CPU's return stack cache.
>
> This patch changes the code to always allocate a complete "struct pt_regs".
> The saving of registers is still done lazily.
>
> "Partial pt_regs" trick on interrupt stack is retained.
>
> Macros which manipulate "struct pt_regs" on stack are reworked:
> ALLOC_PT_GPREGS_ON_STACK allocates the structure.
> SAVE_C_REGS saves to it those registers which are clobbered by C code.
> SAVE_EXTRA_REGS saves to it all other registers.
> Corresponding RESTORE_* and REMOVE_PT_GPREGS_FROM_STACK macros reverse it.
>
> ia32_ptregs_common, stub_fork and friends lost their ugly dance with
> the return address.
>
> LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets
> instead of magic numbers.
>
> error_entry and save_paranoid now use SAVE_C_REGS + SAVE_EXTRA_REGS
> instead of having it open-coded yet again.
>
> Patch was run-tested: 64-bit executables, 32-bit executables,
> strace works.
> Timing tests did not show measurable difference in 32-bit
> and 64-bit syscalls.

This patch scares me, because it changes a lot of hairy code. That
being said, I don't see anything wrong with it, and the end result is
much nicer than the status quo. So I applied it, and I'll let the
kbuild bot have fun with it. I confirmed that I can boot a 64-bit and
a 32-bit system with it, at least in my configuration.

Further reviews are encouraged :)

--Andy

>
> Signed-off-by: Denys Vlasenko <dvlasenk@xxxxxxxxxx>
> CC: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> CC: Oleg Nesterov <oleg@xxxxxxxxxx>
> CC: Borislav Petkov <bp@xxxxxxxxx>
> CC: "H. Peter Anvin" <hpa@xxxxxxxxx>
> CC: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> CC: Frederic Weisbecker <fweisbec@xxxxxxxxx>
> CC: X86 ML <x86@xxxxxxxxxx>
> CC: Alexei Starovoitov <ast@xxxxxxxxxxxx>
> CC: Will Drewry <wad@xxxxxxxxxxxx>
> CC: Kees Cook <keescook@xxxxxxxxxxxx>
> CC: linux-kernel@xxxxxxxxxxxxxxx
> ---
> arch/x86/ia32/ia32entry.S | 47 +++----
> arch/x86/include/asm/calling.h | 222 ++++++++++++++++-----------------
> arch/x86/include/asm/irqflags.h | 4 +-
> arch/x86/include/uapi/asm/ptrace-abi.h | 1 -
> arch/x86/kernel/entry_64.S | 195 +++++++++++------------------
> 5 files changed, 209 insertions(+), 260 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 156ebca..f4bed49 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -62,12 +62,12 @@
> */
> .macro LOAD_ARGS32 offset, _r9=0
> .if \_r9
> - movl \offset+16(%rsp),%r9d
> + movl \offset+R9(%rsp),%r9d
> .endif
> - movl \offset+40(%rsp),%ecx
> - movl \offset+48(%rsp),%edx
> - movl \offset+56(%rsp),%esi
> - movl \offset+64(%rsp),%edi
> + movl \offset+RCX(%rsp),%ecx
> + movl \offset+RDX(%rsp),%edx
> + movl \offset+RSI(%rsp),%esi
> + movl \offset+RDI(%rsp),%edi
> movl %eax,%eax /* zero extension */
> .endm
>
> @@ -144,7 +144,8 @@ ENTRY(ia32_sysenter_target)
> CFI_REL_OFFSET rip,0
> pushq_cfi %rax
> cld
> - SAVE_ARGS 0,1,0
> + ALLOC_PT_GPREGS_ON_STACK
> + SAVE_C_REGS_EXCEPT_R891011
> /* no need to do an access_ok check here because rbp has been
> 32bit zero extended */
> ASM_STAC
> @@ -182,7 +183,8 @@ sysexit_from_sys_call:
> andl $~0x200,EFLAGS-ARGOFFSET(%rsp)
> movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */
> CFI_REGISTER rip,rdx
> - RESTORE_ARGS 0,24,0,0,0,0
> + RESTORE_RSI_RDI
> + REMOVE_PT_GPREGS_FROM_STACK 3*8
> xorq %r8,%r8
> xorq %r9,%r9
> xorq %r10,%r10
> @@ -256,13 +258,13 @@ sysenter_tracesys:
> testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> jz sysenter_auditsys
> #endif
> - SAVE_REST
> + SAVE_EXTRA_REGS
> CLEAR_RREGS
> movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
> movq %rsp,%rdi /* &pt_regs -> arg1 */
> call syscall_trace_enter
> LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> cmpq $(IA32_NR_syscalls-1),%rax
> ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
> jmp sysenter_do_call
> @@ -304,7 +306,8 @@ ENTRY(ia32_cstar_target)
> * disabled irqs and here we enable it straight after entry:
> */
> ENABLE_INTERRUPTS(CLBR_NONE)
> - SAVE_ARGS 8,0,0
> + ALLOC_PT_GPREGS_ON_STACK 8
> + SAVE_C_REGS_EXCEPT_RCX_R891011
> movl %eax,%eax /* zero extension */
> movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
> movq %rcx,RIP-ARGOFFSET(%rsp)
> @@ -341,7 +344,7 @@ cstar_dispatch:
> jnz sysretl_audit
> sysretl_from_sys_call:
> andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> - RESTORE_ARGS 0,-ARG_SKIP,0,0,0
> + RESTORE_RSI_RDI_RDX
> movl RIP-ARGOFFSET(%rsp),%ecx
> CFI_REGISTER rip,rcx
> movl EFLAGS-ARGOFFSET(%rsp),%r11d
> @@ -372,13 +375,13 @@ cstar_tracesys:
> jz cstar_auditsys
> #endif
> xchgl %r9d,%ebp
> - SAVE_REST
> + SAVE_EXTRA_REGS
> CLEAR_RREGS 0, r9
> movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
> movq %rsp,%rdi /* &pt_regs -> arg1 */
> call syscall_trace_enter
> LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> xchgl %ebp,%r9d
> cmpq $(IA32_NR_syscalls-1),%rax
> ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
> @@ -433,7 +436,8 @@ ENTRY(ia32_syscall)
> cld
> /* note the registers are not zero extended to the sf.
> this could be a problem. */
> - SAVE_ARGS 0,1,0
> + ALLOC_PT_GPREGS_ON_STACK
> + SAVE_C_REGS_EXCEPT_R891011
> orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> jnz ia32_tracesys
> @@ -446,16 +450,16 @@ ia32_sysret:
> movq %rax,RAX-ARGOFFSET(%rsp)
> ia32_ret_from_sys_call:
> CLEAR_RREGS -ARGOFFSET
> - jmp int_ret_from_sys_call
> + jmp int_ret_from_sys_call
>
> -ia32_tracesys:
> - SAVE_REST
> +ia32_tracesys:
> + SAVE_EXTRA_REGS
> CLEAR_RREGS
> movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
> movq %rsp,%rdi /* &pt_regs -> arg1 */
> call syscall_trace_enter
> LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> cmpq $(IA32_NR_syscalls-1),%rax
> ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
> jmp ia32_do_call
> @@ -492,7 +496,6 @@ GLOBAL(stub32_clone)
>
> ALIGN
> ia32_ptregs_common:
> - popq %r11
> CFI_ENDPROC
> CFI_STARTPROC32 simple
> CFI_SIGNAL_FRAME
> @@ -507,9 +510,9 @@ ia32_ptregs_common:
> /* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
> CFI_REL_OFFSET rsp,RSP-ARGOFFSET
> /* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
> - SAVE_REST
> + SAVE_EXTRA_REGS 8
> call *%rax
> - RESTORE_REST
> - jmp ia32_sysret /* misbalances the return cache */
> + RESTORE_EXTRA_REGS 8
> + ret
> CFI_ENDPROC
> END(ia32_ptregs_common)
> diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
> index 3c711f2a..3835647 100644
> --- a/arch/x86/include/asm/calling.h
> +++ b/arch/x86/include/asm/calling.h
> @@ -55,143 +55,137 @@ For 32-bit we have the following conventions - kernel is built with
> * for assembly code:
> */
>
> -#define R15 0
> -#define R14 8
> -#define R13 16
> -#define R12 24
> -#define RBP 32
> -#define RBX 40
> -
> -/* arguments: interrupts/non tracing syscalls only save up to here: */
> -#define R11 48
> -#define R10 56
> -#define R9 64
> -#define R8 72
> -#define RAX 80
> -#define RCX 88
> -#define RDX 96
> -#define RSI 104
> -#define RDI 112
> -#define ORIG_RAX 120 /* + error_code */
> -/* end of arguments */
> -
> -/* cpu exception frame or undefined in case of fast syscall: */
> -#define RIP 128
> -#define CS 136
> -#define EFLAGS 144
> -#define RSP 152
> -#define SS 160
> -
> -#define ARGOFFSET R11
> -
> - .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
> - subq $9*8+\addskip, %rsp
> - CFI_ADJUST_CFA_OFFSET 9*8+\addskip
> - movq_cfi rdi, 8*8
> - movq_cfi rsi, 7*8
> - movq_cfi rdx, 6*8
> -
> - .if \save_rcx
> - movq_cfi rcx, 5*8
> - .endif
> +/* The layout forms the "struct pt_regs" on the stack: */
> +/*
> + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
> + * unless syscall needs a complete, fully filled "struct pt_regs".
> + */
> +#define R15 0*8
> +#define R14 1*8
> +#define R13 2*8
> +#define R12 3*8
> +#define RBP 4*8
> +#define RBX 5*8
> +/* These regs are callee-clobbered. Always saved on kernel entry. */
> +#define R11 6*8
> +#define R10 7*8
> +#define R9 8*8
> +#define R8 9*8
> +#define RAX 10*8
> +#define RCX 11*8
> +#define RDX 12*8
> +#define RSI 13*8
> +#define RDI 14*8
> +/*
> + * On syscall entry, this is syscall#. On CPU exception, this is error code.
> + * On hw interrupt, it's IRQ number:
> + */
> +#define ORIG_RAX 15*8
> +/* Return frame for iretq */
> +#define RIP 16*8
> +#define CS 17*8
> +#define EFLAGS 18*8
> +#define RSP 19*8
> +#define SS 20*8
> +
> +#define ARGOFFSET 0
> +
> + .macro ALLOC_PT_GPREGS_ON_STACK addskip=0
> + subq $15*8+\addskip, %rsp
> + CFI_ADJUST_CFA_OFFSET 15*8+\addskip
> + .endm
>
> - .if \rax_enosys
> - movq $-ENOSYS, 4*8(%rsp)
> - .else
> - movq_cfi rax, 4*8
> + .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8plus=1
> + .if \r8plus
> + movq_cfi r11, 6*8+\offset
> + movq_cfi r10, 7*8+\offset
> + movq_cfi r9, 8*8+\offset
> + movq_cfi r8, 9*8+\offset
> .endif
> -
> - .if \save_r891011
> - movq_cfi r8, 3*8
> - movq_cfi r9, 2*8
> - movq_cfi r10, 1*8
> - movq_cfi r11, 0*8
> + .if \rax
> + movq_cfi rax, 10*8+\offset
> + .endif
> + .if \rcx
> + movq_cfi rcx, 11*8+\offset
> .endif
> + movq_cfi rdx, 12*8+\offset
> + movq_cfi rsi, 13*8+\offset
> + movq_cfi rdi, 14*8+\offset
> + .endm
> + .macro SAVE_C_REGS offset=0
> + SAVE_C_REGS_HELPER \offset, 1, 1, 1
> + .endm
> + .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
> + SAVE_C_REGS_HELPER \offset, 0, 0, 1
> + .endm
> + .macro SAVE_C_REGS_EXCEPT_R891011
> + SAVE_C_REGS_HELPER 0, 1, 1, 0
> + .endm
> + .macro SAVE_C_REGS_EXCEPT_RCX_R891011
> + SAVE_C_REGS_HELPER 0, 1, 0, 0
> + .endm
>
> + .macro SAVE_EXTRA_REGS offset=0
> + movq_cfi r15, 0*8+\offset
> + movq_cfi r14, 1*8+\offset
> + movq_cfi r13, 2*8+\offset
> + movq_cfi r12, 3*8+\offset
> + movq_cfi rbp, 4*8+\offset
> + movq_cfi rbx, 5*8+\offset
> + .endm
> + .macro SAVE_EXTRA_REGS_RBP offset=0
> + movq_cfi rbp, 4*8+\offset
> .endm
>
> -#define ARG_SKIP (9*8)
> + .macro RESTORE_EXTRA_REGS offset=0
> + movq_cfi_restore 0*8+\offset, r15
> + movq_cfi_restore 1*8+\offset, r14
> + movq_cfi_restore 2*8+\offset, r13
> + movq_cfi_restore 3*8+\offset, r12
> + movq_cfi_restore 4*8+\offset, rbp
> + movq_cfi_restore 5*8+\offset, rbx
> + .endm
>
> - .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
> - rstor_r8910=1, rstor_rdx=1
> + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
> .if \rstor_r11
> - movq_cfi_restore 0*8, r11
> + movq_cfi_restore 6*8, r11
> .endif
> -
> .if \rstor_r8910
> - movq_cfi_restore 1*8, r10
> - movq_cfi_restore 2*8, r9
> - movq_cfi_restore 3*8, r8
> + movq_cfi_restore 7*8, r10
> + movq_cfi_restore 8*8, r9
> + movq_cfi_restore 9*8, r8
> .endif
> -
> .if \rstor_rax
> - movq_cfi_restore 4*8, rax
> + movq_cfi_restore 10*8, rax
> .endif
> -
> .if \rstor_rcx
> - movq_cfi_restore 5*8, rcx
> + movq_cfi_restore 11*8, rcx
> .endif
> -
> .if \rstor_rdx
> - movq_cfi_restore 6*8, rdx
> - .endif
> -
> - movq_cfi_restore 7*8, rsi
> - movq_cfi_restore 8*8, rdi
> -
> - .if ARG_SKIP+\addskip > 0
> - addq $ARG_SKIP+\addskip, %rsp
> - CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
> + movq_cfi_restore 12*8, rdx
> .endif
> + movq_cfi_restore 13*8, rsi
> + movq_cfi_restore 14*8, rdi
> .endm
> -
> - .macro LOAD_ARGS offset, skiprax=0
> - movq \offset(%rsp), %r11
> - movq \offset+8(%rsp), %r10
> - movq \offset+16(%rsp), %r9
> - movq \offset+24(%rsp), %r8
> - movq \offset+40(%rsp), %rcx
> - movq \offset+48(%rsp), %rdx
> - movq \offset+56(%rsp), %rsi
> - movq \offset+64(%rsp), %rdi
> - .if \skiprax
> - .else
> - movq \offset+72(%rsp), %rax
> - .endif
> + .macro RESTORE_C_REGS
> + RESTORE_C_REGS_HELPER 1,1,1,1,1
> .endm
> -
> -#define REST_SKIP (6*8)
> -
> - .macro SAVE_REST
> - subq $REST_SKIP, %rsp
> - CFI_ADJUST_CFA_OFFSET REST_SKIP
> - movq_cfi rbx, 5*8
> - movq_cfi rbp, 4*8
> - movq_cfi r12, 3*8
> - movq_cfi r13, 2*8
> - movq_cfi r14, 1*8
> - movq_cfi r15, 0*8
> + .macro RESTORE_C_REGS_EXCEPT_RAX
> + RESTORE_C_REGS_HELPER 0,1,1,1,1
> .endm
> -
> - .macro RESTORE_REST
> - movq_cfi_restore 0*8, r15
> - movq_cfi_restore 1*8, r14
> - movq_cfi_restore 2*8, r13
> - movq_cfi_restore 3*8, r12
> - movq_cfi_restore 4*8, rbp
> - movq_cfi_restore 5*8, rbx
> - addq $REST_SKIP, %rsp
> - CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
> + .macro RESTORE_C_REGS_EXCEPT_RCX
> + RESTORE_C_REGS_HELPER 1,0,1,1,1
> .endm
> -
> - .macro SAVE_ALL
> - SAVE_ARGS
> - SAVE_REST
> + .macro RESTORE_RSI_RDI
> + RESTORE_C_REGS_HELPER 0,0,0,0,0
> + .endm
> + .macro RESTORE_RSI_RDI_RDX
> + RESTORE_C_REGS_HELPER 0,0,0,0,1
> .endm
>
> - .macro RESTORE_ALL addskip=0
> - RESTORE_REST
> - RESTORE_ARGS 1, \addskip
> + .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
> + addq $15*8+\addskip, %rsp
> + CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
> .endm
>
> .macro icebp
> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
> index 0a8b519..021bee9 100644
> --- a/arch/x86/include/asm/irqflags.h
> +++ b/arch/x86/include/asm/irqflags.h
> @@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void)
> #define ARCH_LOCKDEP_SYS_EXIT_IRQ \
> TRACE_IRQS_ON; \
> sti; \
> - SAVE_REST; \
> + SAVE_EXTRA_REGS; \
> LOCKDEP_SYS_EXIT; \
> - RESTORE_REST; \
> + RESTORE_EXTRA_REGS; \
> cli; \
> TRACE_IRQS_OFF;
>
> diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
> index 7b0a55a..ad115bf 100644
> --- a/arch/x86/include/uapi/asm/ptrace-abi.h
> +++ b/arch/x86/include/uapi/asm/ptrace-abi.h
> @@ -49,7 +49,6 @@
> #define EFLAGS 144
> #define RSP 152
> #define SS 160
> -#define ARGOFFSET R11
> #endif /* __ASSEMBLY__ */
>
> /* top of stack page */
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index ac542ac..45bdd26 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -26,12 +26,6 @@
> * Some macro usage:
> * - CFI macros are used to generate dwarf2 unwind information for better
> * backtraces. They don't change any code.
> - * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
> - * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
> - * There are unfortunately lots of special cases where some registers
> - * not touched. The macro is a big mess that should be cleaned up.
> - * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
> - * Gives a full stack frame.
> * - ENTRY/END Define functions in the symbol table.
> * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
> * frame that is otherwise undefined after a SYSCALL
> @@ -190,9 +184,9 @@ ENDPROC(native_usergs_sysret64)
> .endm
>
> /*
> - * frame that enables calling into C.
> + * frame that enables passing a complete pt_regs to a C function.
> */
> - .macro PARTIAL_FRAME start=1 offset=0
> + .macro DEFAULT_FRAME start=1 offset=0
> XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
> CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
> CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
> @@ -203,13 +197,6 @@ ENDPROC(native_usergs_sysret64)
> CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
> CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
> CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
> - .endm
> -
> -/*
> - * frame that enables passing a complete pt_regs to a C function.
> - */
> - .macro DEFAULT_FRAME start=1 offset=0
> - PARTIAL_FRAME \start, R11+\offset-R15
> CFI_REL_OFFSET rbx, RBX+\offset
> CFI_REL_OFFSET rbp, RBP+\offset
> CFI_REL_OFFSET r12, R12+\offset
> @@ -221,21 +208,8 @@ ENDPROC(native_usergs_sysret64)
> ENTRY(save_paranoid)
> XCPT_FRAME 1 RDI+8
> cld
> - movq %rdi, RDI+8(%rsp)
> - movq %rsi, RSI+8(%rsp)
> - movq_cfi rdx, RDX+8
> - movq_cfi rcx, RCX+8
> - movq_cfi rax, RAX+8
> - movq %r8, R8+8(%rsp)
> - movq %r9, R9+8(%rsp)
> - movq %r10, R10+8(%rsp)
> - movq %r11, R11+8(%rsp)
> - movq_cfi rbx, RBX+8
> - movq %rbp, RBP+8(%rsp)
> - movq %r12, R12+8(%rsp)
> - movq %r13, R13+8(%rsp)
> - movq %r14, R14+8(%rsp)
> - movq %r15, R15+8(%rsp)
> + SAVE_C_REGS 8
> + SAVE_EXTRA_REGS 8
> movl $1,%ebx
> movl $MSR_GS_BASE,%ecx
> rdmsr
> @@ -264,7 +238,7 @@ ENTRY(ret_from_fork)
>
> GET_THREAD_INFO(%rcx)
>
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
>
> testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
> jz 1f
> @@ -276,12 +250,10 @@ ENTRY(ret_from_fork)
> jmp ret_from_sys_call # go to the SYSRET fastpath
>
> 1:
> - subq $REST_SKIP, %rsp # leave space for volatiles
> - CFI_ADJUST_CFA_OFFSET REST_SKIP
> movq %rbp, %rdi
> call *%rbx
> movl $0, RAX(%rsp)
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(ret_from_fork)
> @@ -339,9 +311,11 @@ GLOBAL(system_call_after_swapgs)
> * and short:
> */
> ENABLE_INTERRUPTS(CLBR_NONE)
> - SAVE_ARGS 8, 0, rax_enosys=1
> + ALLOC_PT_GPREGS_ON_STACK 8
> + SAVE_C_REGS_EXCEPT_RAX_RCX
> + movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
> movq_cfi rax,(ORIG_RAX-ARGOFFSET)
> - movq %rcx,RIP-ARGOFFSET(%rsp)
> + movq %rcx,RIP-ARGOFFSET(%rsp)
> CFI_REL_OFFSET rip,RIP-ARGOFFSET
> testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> jnz tracesys
> @@ -372,9 +346,9 @@ ret_from_sys_call:
> * sysretq will re-enable interrupts:
> */
> TRACE_IRQS_ON
> + RESTORE_C_REGS_EXCEPT_RCX
> movq RIP-ARGOFFSET(%rsp),%rcx
> CFI_REGISTER rip,rcx
> - RESTORE_ARGS 1,-ARG_SKIP,0
> /*CFI_REGISTER rflags,r11*/
> movq PER_CPU_VAR(old_rsp), %rsp
> USERGS_SYSRET64
> @@ -387,16 +361,16 @@ int_ret_from_sys_call_fixup:
>
> /* Do syscall tracing */
> tracesys:
> - leaq -REST_SKIP(%rsp), %rdi
> + movq %rsp, %rdi
> movq $AUDIT_ARCH_X86_64, %rsi
> call syscall_trace_enter_phase1
> test %rax, %rax
> jnz tracesys_phase2 /* if needed, run the slow path */
> - LOAD_ARGS 0 /* else restore clobbered regs */
> + RESTORE_C_REGS /* else restore clobbered regs */
> jmp system_call_fastpath /* and return to the fast path */
>
> tracesys_phase2:
> - SAVE_REST
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %rdi
> movq %rsp, %rdi
> movq $AUDIT_ARCH_X86_64, %rsi
> @@ -408,8 +382,8 @@ tracesys_phase2:
> * We don't reload %rax because syscall_trace_entry_phase2() returned
> * the value it wants us to use in the table lookup.
> */
> - LOAD_ARGS ARGOFFSET, 1
> - RESTORE_REST
> + RESTORE_C_REGS_EXCEPT_RAX
> + RESTORE_EXTRA_REGS
> #if __SYSCALL_MASK == ~0
> cmpq $__NR_syscall_max,%rax
> #else
> @@ -460,7 +434,7 @@ int_very_careful:
> TRACE_IRQS_ON
> ENABLE_INTERRUPTS(CLBR_NONE)
> int_check_syscall_exit_work:
> - SAVE_REST
> + SAVE_EXTRA_REGS
> /* Check for syscall exit trace */
> testl $_TIF_WORK_SYSCALL_EXIT,%edx
> jz int_signal
> @@ -479,7 +453,7 @@ int_signal:
> call do_notify_resume
> 1: movl $_TIF_WORK_MASK,%edi
> int_restore_rest:
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> DISABLE_INTERRUPTS(CLBR_NONE)
> TRACE_IRQS_OFF
> jmp int_with_check
> @@ -489,15 +463,12 @@ END(system_call)
> .macro FORK_LIKE func
> ENTRY(stub_\func)
> CFI_STARTPROC
> - popq %r11 /* save return address */
> - PARTIAL_FRAME 0
> - SAVE_REST
> - pushq %r11 /* put it back on stack */
> + DEFAULT_FRAME 0, 8 /* offset 8: return address */
> + SAVE_EXTRA_REGS 8
> FIXUP_TOP_OF_STACK %r11, 8
> - DEFAULT_FRAME 0 8 /* offset 8: return address */
> call sys_\func
> RESTORE_TOP_OF_STACK %r11, 8
> - ret $REST_SKIP /* pop extended registers */
> + ret
> CFI_ENDPROC
> END(stub_\func)
> .endm
> @@ -505,7 +476,7 @@ END(stub_\func)
> .macro FIXED_FRAME label,func
> ENTRY(\label)
> CFI_STARTPROC
> - PARTIAL_FRAME 0 8 /* offset 8: return address */
> + DEFAULT_FRAME 0, 8 /* offset 8: return address */
> FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
> call \func
> RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
> @@ -522,12 +493,12 @@ END(\label)
> ENTRY(stub_execve)
> CFI_STARTPROC
> addq $8, %rsp
> - PARTIAL_FRAME 0
> - SAVE_REST
> + DEFAULT_FRAME 0
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %r11
> call sys_execve
> movq %rax,RAX(%rsp)
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(stub_execve)
> @@ -535,13 +506,13 @@ END(stub_execve)
> ENTRY(stub_execveat)
> CFI_STARTPROC
> addq $8, %rsp
> - PARTIAL_FRAME 0
> - SAVE_REST
> + DEFAULT_FRAME 0
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %r11
> call sys_execveat
> RESTORE_TOP_OF_STACK %r11
> movq %rax,RAX(%rsp)
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(stub_execveat)
> @@ -553,12 +524,12 @@ END(stub_execveat)
> ENTRY(stub_rt_sigreturn)
> CFI_STARTPROC
> addq $8, %rsp
> - PARTIAL_FRAME 0
> - SAVE_REST
> + DEFAULT_FRAME 0
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %r11
> call sys_rt_sigreturn
> movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(stub_rt_sigreturn)
> @@ -567,12 +538,12 @@ END(stub_rt_sigreturn)
> ENTRY(stub_x32_rt_sigreturn)
> CFI_STARTPROC
> addq $8, %rsp
> - PARTIAL_FRAME 0
> - SAVE_REST
> + DEFAULT_FRAME 0
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %r11
> call sys32_x32_rt_sigreturn
> movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(stub_x32_rt_sigreturn)
> @@ -580,13 +551,13 @@ END(stub_x32_rt_sigreturn)
> ENTRY(stub_x32_execve)
> CFI_STARTPROC
> addq $8, %rsp
> - PARTIAL_FRAME 0
> - SAVE_REST
> + DEFAULT_FRAME 0
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %r11
> call compat_sys_execve
> RESTORE_TOP_OF_STACK %r11
> movq %rax,RAX(%rsp)
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(stub_x32_execve)
> @@ -594,13 +565,13 @@ END(stub_x32_execve)
> ENTRY(stub_x32_execveat)
> CFI_STARTPROC
> addq $8, %rsp
> - PARTIAL_FRAME 0
> - SAVE_REST
> + DEFAULT_FRAME 0
> + SAVE_EXTRA_REGS
> FIXUP_TOP_OF_STACK %r11
> call compat_sys_execveat
> RESTORE_TOP_OF_STACK %r11
> movq %rax,RAX(%rsp)
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> jmp int_ret_from_sys_call
> CFI_ENDPROC
> END(stub_x32_execveat)
> @@ -656,42 +627,28 @@ END(interrupt)
>
> /* 0(%rsp): ~(interrupt number) */
> .macro interrupt func
> - /* reserve pt_regs for scratch regs and rbp */
> - subq $ORIG_RAX-RBP, %rsp
> - CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
> cld
> - /* start from rbp in pt_regs and jump over */
> - movq_cfi rdi, (RDI-RBP)
> - movq_cfi rsi, (RSI-RBP)
> - movq_cfi rdx, (RDX-RBP)
> - movq_cfi rcx, (RCX-RBP)
> - movq_cfi rax, (RAX-RBP)
> - movq_cfi r8, (R8-RBP)
> - movq_cfi r9, (R9-RBP)
> - movq_cfi r10, (R10-RBP)
> - movq_cfi r11, (R11-RBP)
> -
> - /* Save rbp so that we can unwind from get_irq_regs() */
> - movq_cfi rbp, 0
> -
> - /* Save previous stack value */
> - movq %rsp, %rsi
> + ALLOC_PT_GPREGS_ON_STACK -RBP
> + SAVE_C_REGS -RBP
> + /* this goes to 0(%rsp) for unwinder, not for saving the value: */
> + SAVE_EXTRA_REGS_RBP -RBP
> +
> + leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
>
> - leaq -RBP(%rsp),%rdi /* arg1 for handler */
> - testl $3, CS-RBP(%rsi)
> + testl $3, CS-RBP(%rsp)
> je 1f
> SWAPGS
> +1:
> /*
> * irq_count is used to check if a CPU is already on an interrupt stack
> * or not. While this is essentially redundant with preempt_count it is
> * a little cheaper to use a separate counter in the PDA (short of
> * moving irq_enter into assembly, which would be too much work)
> */
> -1: incl PER_CPU_VAR(irq_count)
> + movq %rsp, %rsi
> + incl PER_CPU_VAR(irq_count)
> cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
> CFI_DEF_CFA_REGISTER rsi
> -
> - /* Store previous stack value */
> pushq %rsi
> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
> 0x77 /* DW_OP_breg7 */, 0, \
> @@ -800,7 +757,8 @@ retint_swapgs: /* return to user-space */
> */
> irq_return_via_sysret:
> CFI_REMEMBER_STATE
> - RESTORE_ARGS 1,8,1
> + RESTORE_C_REGS
> + REMOVE_PT_GPREGS_FROM_STACK 8
> movq (RSP-RIP)(%rsp),%rsp
> USERGS_SYSRET64
> CFI_RESTORE_STATE
> @@ -816,7 +774,8 @@ retint_restore_args: /* return to kernel space */
> */
> TRACE_IRQS_IRETQ
> restore_args:
> - RESTORE_ARGS 1,8,1
> + RESTORE_C_REGS
> + REMOVE_PT_GPREGS_FROM_STACK 8
>
> irq_return:
> INTERRUPT_RETURN
> @@ -887,12 +846,12 @@ retint_signal:
> jz retint_swapgs
> TRACE_IRQS_ON
> ENABLE_INTERRUPTS(CLBR_NONE)
> - SAVE_REST
> + SAVE_EXTRA_REGS
> movq $-1,ORIG_RAX(%rsp)
> xorl %esi,%esi # oldset
> movq %rsp,%rdi # &pt_regs
> call do_notify_resume
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> DISABLE_INTERRUPTS(CLBR_NONE)
> TRACE_IRQS_OFF
> GET_THREAD_INFO(%rcx)
> @@ -1019,8 +978,7 @@ ENTRY(\sym)
> pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
> .endif
>
> - subq $ORIG_RAX-R15, %rsp
> - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
> + ALLOC_PT_GPREGS_ON_STACK
>
> .if \paranoid
> .if \paranoid == 1
> @@ -1266,7 +1224,9 @@ ENTRY(xen_failsafe_callback)
> addq $0x30,%rsp
> CFI_ADJUST_CFA_OFFSET -0x30
> pushq_cfi $-1 /* orig_ax = -1 => not a system call */
> - SAVE_ALL
> + ALLOC_PT_GPREGS_ON_STACK
> + SAVE_C_REGS
> + SAVE_EXTRA_REGS
> jmp error_exit
> CFI_ENDPROC
> END(xen_failsafe_callback)
> @@ -1318,11 +1278,15 @@ ENTRY(paranoid_exit)
> jnz paranoid_restore
> TRACE_IRQS_IRETQ 0
> SWAPGS_UNSAFE_STACK
> - RESTORE_ALL 8
> + RESTORE_EXTRA_REGS
> + RESTORE_C_REGS
> + REMOVE_PT_GPREGS_FROM_STACK 8
> INTERRUPT_RETURN
> paranoid_restore:
> TRACE_IRQS_IRETQ_DEBUG 0
> - RESTORE_ALL 8
> + RESTORE_EXTRA_REGS
> + RESTORE_C_REGS
> + REMOVE_PT_GPREGS_FROM_STACK 8
> INTERRUPT_RETURN
> CFI_ENDPROC
> END(paranoid_exit)
> @@ -1336,21 +1300,8 @@ ENTRY(error_entry)
> CFI_ADJUST_CFA_OFFSET 15*8
> /* oldrax contains error code */
> cld
> - movq %rdi, RDI+8(%rsp)
> - movq %rsi, RSI+8(%rsp)
> - movq %rdx, RDX+8(%rsp)
> - movq %rcx, RCX+8(%rsp)
> - movq %rax, RAX+8(%rsp)
> - movq %r8, R8+8(%rsp)
> - movq %r9, R9+8(%rsp)
> - movq %r10, R10+8(%rsp)
> - movq %r11, R11+8(%rsp)
> - movq_cfi rbx, RBX+8
> - movq %rbp, RBP+8(%rsp)
> - movq %r12, R12+8(%rsp)
> - movq %r13, R13+8(%rsp)
> - movq %r14, R14+8(%rsp)
> - movq %r15, R15+8(%rsp)
> + SAVE_C_REGS 8
> + SAVE_EXTRA_REGS 8
> xorl %ebx,%ebx
> testl $3,CS+8(%rsp)
> je error_kernelspace
> @@ -1399,7 +1350,7 @@ END(error_entry)
> ENTRY(error_exit)
> DEFAULT_FRAME
> movl %ebx,%eax
> - RESTORE_REST
> + RESTORE_EXTRA_REGS
> DISABLE_INTERRUPTS(CLBR_NONE)
> TRACE_IRQS_OFF
> GET_THREAD_INFO(%rcx)
> @@ -1618,8 +1569,8 @@ end_repeat_nmi:
> * so that we repeat another NMI.
> */
> pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
> - subq $ORIG_RAX-R15, %rsp
> - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
> + ALLOC_PT_GPREGS_ON_STACK
> +
> /*
> * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
> * as we should not be calling schedule in NMI context.
> @@ -1658,8 +1609,10 @@ end_repeat_nmi:
> nmi_swapgs:
> SWAPGS_UNSAFE_STACK
> nmi_restore:
> + RESTORE_EXTRA_REGS
> + RESTORE_C_REGS
> /* Pop the extra iret frame at once */
> - RESTORE_ALL 6*8
> + REMOVE_PT_GPREGS_FROM_STACK 6*8
>
> /* Clear the NMI executing stack variable */
> movq $0, 5*8(%rsp)
> --
> 1.8.1.4
>



--
Andy Lutomirski
AMA Capital Management, LLC
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/