[PATCH] x86/entry/64: Context-track syscalls before enabling interrupts

From: Andy Lutomirski
Date: Tue Aug 18 2015 - 15:12:22 EST


This fixes a couple minor holes if we took an IRQ very early in syscall
processing:

- We could enter the IRQ with CONTEXT_USER. Everything worked (RCU
was fine), but we could warn if all the debugging options were
set.

- We could have the IRQ regs overlap task_pt_regs. I'm not aware
of anything important that would break, but some of the /proc
stuff could plausibly have gotten confused.

Fix it the straightforward way: finish filling in pt_regs and call
enter_from_user_mode before enabling interrupts if _TIF_NOHZ is set.

This should be the last piece of the puzzle needed to get rid of most
remaining exception_enter calls. (vmalloc faults are still tricky,
but they're mostly fatal in the syscall prologue already.)

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
---

This is the last significant functionality change I send for 4.3, I
hope. With this applied, context tracking for all non-NMI, non-debug
entries should be exact.

There's probably some (minor) performance regression on
CONFIG_CONTEXT_TRACKING=y kernels that aren't using nohz. If so
(I'll benchmark it later this week), I'll try to rig up a simple
patch to NOP out the hooks if nohz is off.

Sasha, this should fix the intermittent DEBUG_LOCKS splat you're
seeing.

I don't intend to send v2 of the #BP stuff for 4.3. The pile is plenty
big already.

arch/x86/entry/common.c | 12 +-------
arch/x86/entry/entry_64.S | 32 ++++++++++++++------
arch/x86/entry/entry_64_compat.S | 60 +++++++++++++++++++++++++++++---------
arch/x86/include/asm/thread_info.h | 3 +-
4 files changed, 71 insertions(+), 36 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 80dcc9261ca3..b570cea2f469 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -70,21 +70,11 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
u32 work;

BUG_ON(regs != task_pt_regs(current));
+ CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

work = ACCESS_ONCE(current_thread_info()->flags) &
_TIF_WORK_SYSCALL_ENTRY;

-#ifdef CONFIG_CONTEXT_TRACKING
- /*
- * If TIF_NOHZ is set, we are required to call user_exit() before
- * doing anything that could touch RCU.
- */
- if (work & _TIF_NOHZ) {
- enter_from_user_mode();
- work &= ~_TIF_NOHZ;
- }
-#endif
-
#ifdef CONFIG_SECCOMP
/*
* Do seccomp first -- it should minimize exposure of other
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e2d078c9dfe4..6bf0c7ecf399 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,20 +142,16 @@ ENTRY(entry_SYSCALL_64)
*/
GLOBAL(entry_SYSCALL_64_after_swapgs)

+ /*
+ * IRQs must be off while we use rsp_scratch to keep it from
+ * being clobbered by a different task.
+ */
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
- /*
- * Re-enable interrupts.
- * We use 'rsp_scratch' as a scratch space, hence irq-off block above
- * must execute atomically in the face of possible interrupt-driven
- * task preemption. We must enable interrupts only after we're done
- * with using rsp_scratch:
- */
- ENABLE_INTERRUPTS(CLBR_NONE)
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
@@ -171,8 +167,17 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq %r11 /* pt_regs->r11 */
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */

- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ testl $(_TIF_WORK_SYSCALL_ENTRY | _TIF_NOHZ), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz tracesys
+
+ /*
+ * Re-enable interrupts. IRQ tracing already thinks that IRQs are
+ * on (since we treat user mode as having IRQs on), and the
+ * prologue above is too short for it to be worth adding a
+ * tracing round trip.
+ */
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
entry_SYSCALL_64_fastpath:
#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max, %rax
@@ -235,6 +240,15 @@ GLOBAL(int_ret_from_sys_call_irqs_off)

/* Do syscall entry tracing */
tracesys:
+#ifdef CONFIG_CONTEXT_TRACKING
+ /* This is slow enough that it's worth tracing. */
+ TRACE_IRQS_OFF
+ call enter_from_user_mode
+ TRACE_IRQS_ON
+#endif
+
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
movq %rsp, %rdi
movl $AUDIT_ARCH_X86_64, %esi
call syscall_trace_enter_phase1
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index ff32a289b5d1..099ec1174ff9 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -103,11 +103,19 @@ ENTRY(entry_SYSENTER_compat)
jnz sysenter_fix_flags
sysenter_flags_fixed:

+#ifdef CONFIG_CONTEXT_TRACKING
+ /* This is slow enough that it's worth tracing. */
+ TRACE_IRQS_OFF
+ call enter_from_user_mode
+ TRACE_IRQS_ON
+#endif
+
/*
* Re-enable interrupts. IRQ tracing already thinks that IRQs are
* on (since we treat user mode as having IRQs on), and the
* prologue above is too short for it to be worth adding a
- * tracing round trip.
+ * tracing round trip except in the CONFIG_CONTEXT_TRACKING
+ * case.
*/
ENABLE_INTERRUPTS(CLBR_NONE)

@@ -318,15 +326,10 @@ ENDPROC(entry_SYSENTER_compat)
* with the int 0x80 path.
*/
ENTRY(entry_SYSCALL_compat)
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
+ /* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
movl %esp, %r8d
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- ENABLE_INTERRUPTS(CLBR_NONE)

/* Zero-extending 32-bit regs, do not remove */
movl %eax, %eax
@@ -346,6 +349,22 @@ ENTRY(entry_SYSCALL_compat)
pushq $-ENOSYS /* pt_regs->ax */
sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */

+#ifdef CONFIG_CONTEXT_TRACKING
+ /* This is slow enough that it's worth tracing. */
+ TRACE_IRQS_OFF
+ call enter_from_user_mode
+ TRACE_IRQS_ON
+#endif
+
+ /*
+ * Re-enable interrupts. IRQ tracing already thinks that IRQs are
+ * on (since we treat user mode as having IRQs on), and the
+ * prologue above is too short for it to be worth adding a
+ * tracing round trip except in the CONFIG_CONTEXT_TRACKING
+ * case.
+ */
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
/*
* No need to do an access_ok check here because r8 has been
* 32-bit zero extended:
@@ -354,6 +373,7 @@ ENTRY(entry_SYSCALL_compat)
1: movl (%r8), %r9d
_ASM_EXTABLE(1b, ia32_badarg)
ASM_CLAC
+
orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz cstar_tracesys
@@ -518,14 +538,9 @@ ia32_ret_from_sys_call:
*/

ENTRY(entry_INT80_compat)
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
+ /* Interrupts are off on entry. */
PARAVIRT_ADJUST_EXCEPTION_FRAME
SWAPGS
- ENABLE_INTERRUPTS(CLBR_NONE)

/* Zero-extending 32-bit regs, do not remove */
movl %eax, %eax
@@ -545,9 +560,17 @@ ENTRY(entry_INT80_compat)
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */

orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ testl $(_TIF_WORK_SYSCALL_ENTRY | _TIF_NOHZ), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz ia32_tracesys

+ /*
+ * Re-enable interrupts. IRQ tracing already thinks that IRQs are
+ * on (since we treat user mode as having IRQs on), and the
+ * prologue above is too short for it to be worth adding a
+ * tracing round trip.
+ */
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
ia32_do_call:
/* 32-bit syscall -> 64-bit C ABI argument conversion */
movl %edi, %r8d /* arg5 */
@@ -564,6 +587,15 @@ ia32_do_call:
jmp int_ret_from_sys_call

ia32_tracesys:
+#ifdef CONFIG_CONTEXT_TRACKING
+ /* This is slow enough that it's worth tracing. */
+ TRACE_IRQS_OFF
+ call enter_from_user_mode
+ TRACE_IRQS_ON
+#endif
+
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
SAVE_EXTRA_REGS
movq %rsp, %rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8afdc3e44247..3c5a96815dec 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -140,8 +140,7 @@ struct thread_info {
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
- _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \
- _TIF_NOHZ)
+ _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)

/* work to do on any return to user space */
#define _TIF_ALLWORK_MASK \
--
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/