save_xstate_sig (Re: frequent lockups in 3.18rc4)

From: Andy Lutomirski
Date: Thu Dec 18 2014 - 16:18:13 EST


On 12/14/2014 09:47 PM, Linus Torvalds wrote:
On Sun, Dec 14, 2014 at 4:38 PM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:

Can anybody make sense of that backtrace, keeping in mind that we're
looking for some kind of endless loop where we don't make progress?

So looking at all the backtraces, which is kind of messy because
there's some missing data (presumably buffers overflowed from all the
CPU's printing at the same time), it looks like:

- CPU 0 is missing. No idea why.
- CPU's 1-3 all have the same trace for

int_signal ->
do_notify_resume ->
do_signal ->
....
page_fault ->
do_page_fault

and "save_xstate_sig+0x81" shows up on all stacks, although only on
CPU1 does it show up as a "guaranteed" part of the stack chain (ie it
matches frame pointer data too). CPU1 also has that __clear_user show
up (which is called from save_xstate_sig), but not other CPU's. CPU2
and CPU3 have "save_xstate_sig+0x98" in addition to that +0x81 thing.

My guess is that "save_xstate_sig+0x81" is the instruction after the
__clear_user call, and that CPU1 took the fault in __clear_user(),
while CPU2 and CPU3 took the fault at "save_xstate_sig+0x98" instead,
which I'd guess is the

xsave64 (%rdi)

I admit that my understanding of the disaster that is x86's FPU handling is limited, but I'm moderately confident that save_xstate_sig is broken.

The code is:

if (user_has_fpu()) {
/* Save the live register state to the user directly. */
if (save_user_xstate(buf_fx))
return -1;
/* Update the thread's fxstate to save the fsave header. */
if (ia32_fxstate)
fpu_fxsave(&tsk->thread.fpu);
} else {
sanitize_i387_state(tsk);
if (__copy_to_user(buf_fx, xsave, xstate_size))
return -1;
}

Suppose that user_has_fpu() returns true, we call save_user_xstate, and the xsave instruction (or anything else in there, for that matter) causes a page fault.

The page fault handler is well within its rights to schedule. At that point, *we might not own the FPU any more*, depending on the vagaries of eager vs lazy mode. So, when we schedule back in and resume from the page fault, we are in the wrong branch of the if statement.

At this point, we're going to write garbage (possibly sensitive garbage) to the userspace signal frame. I don't see why this would cause an infinite loop, but I don't think it's healthy.

FWIW, if xsave traps with cr2 value, then there would indeed be an infinite loop in here. It seems to work right on my machine. Dave, want to run the attached little test?

--Andy
#define _GNU_SOURCE
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/ucontext.h>

static volatile unsigned char *buf, *xsave_addr;
static volatile int nfailures = 0;

static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
int flags)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = handler;
sa.sa_flags = SA_SIGINFO | flags;
sigemptyset(&sa.sa_mask);
if (sigaction(sig, &sa, 0))
err(1, "sigaction");
}

static void clearhandler(int sig)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = SIG_DFL;
sigemptyset(&sa.sa_mask);
if (sigaction(sig, &sa, 0))
err(1, "sigaction");
}

static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
{
ucontext_t *ctx = (ucontext_t*)ctx_void;

unsigned long cr2 = (unsigned long)ctx->uc_mcontext.gregs[REG_CR2];
unsigned long start = (unsigned long)buf;

extern unsigned char xsave_insn[], after_xsave_insn[];

if (ctx->uc_mcontext.gregs[REG_RIP] != (unsigned long)xsave_insn) {
printf("Uncorrectable segfault\n");
clearhandler(SIGSEGV);
return;
}

if (si->si_code != SEGV_ACCERR) {
printf("Segfault was %d (trap %d), not SEGV_ACCERR\n",
si->si_code, ctx->uc_mcontext.gregs[REG_TRAPNO]);
clearhandler(SIGSEGV);
return;
}

if (cr2 != (unsigned long)si->si_addr) {
printf("CR2 (0x%lx) != si_addr (0x%lx)\n",
cr2, (unsigned long)si->si_addr);
clearhandler(SIGSEGV);
return;
}

if (cr2 >= start && cr2 <= (start + 4095)) {
printf("[OK]\txsave offset = %d, cr2 offset = %d\n",
(int)(xsave_addr - buf), (int)(cr2 - start));
} else if (cr2 >= start + 4096 && cr2 <= start + 8191) {
printf("[FAIL]\txsave offset = %d, cr2 offset = %d\n",
(int)(xsave_addr - buf), (int)(cr2 - start));

nfailures++;
} else if (cr2 >= start + 8192 && cr2 <= start + 12287) {
printf("[OK]\txsave offset = %d, cr2 offset = %d\n",
(int)(xsave_addr - buf), (int)(cr2 - start));
} else {
printf("[FAIL]\tcr2 is completely out of range\n");
abort();
}

ctx->uc_mcontext.gregs[REG_RIP] = (unsigned long)after_xsave_insn;
}

int main()
{
int i;

buf = mmap(NULL, 4096*3, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
if (buf == MAP_FAILED)
err(1, "mmap");

if (mmap((unsigned char *)buf + 4096, 4096, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED)
err(1, "mmap");

sethandler(SIGSEGV, sigsegv, 0);

for (i = 0; i < 8193; i += 64) {
xsave_addr = buf + i;
printf("XSAVE to offset %d\n", i);
asm volatile ("xsave_insn: xsaveq %0 ; after_xsave_insn:"
: "=m" (*xsave_addr)
: "a" (0xffffffff), "d" (0xffffffff));
}

if (nfailures)
printf("%d failures\n", nfailures);
else
printf("PASS!\n");

return 0;
}