Re: git commit 9fd67b4ed0714ab718f1f9bd14c344af336a6df7 (x86-64: Give vvars their own page) breaks Xen PV guests (64-bit).

From: Andrew Lutomirski
Date: Tue Jul 26 2011 - 15:01:50 EST


On Tue, Jul 26, 2011 at 12:18 PM, Konrad Rzeszutek Wilk
<konrad.wilk@xxxxxxxxxx> wrote:
>> > However, this is what I get later on, any ideas?
>>
>> > [    0.585880] init[1] illegal int 0xcc from 32-bit mode ip:ffffffffff600400 cs:e033 sp:7fff230ca088 ax:ffffffffff600400 si:7faee3e822bf di:7fff230ca158
>>
>> That will, indeed, crash your system.
>>
>> 0xe033 is FLAT_RING3_CS64
>>
>> Jeremy / other Xen people:  I'm trying to implement a lightweight
>> check to distinguish a trap from a sane (i.e. allowable for syscalls)
>> 64-bit user context from anything else.  There seems to be precedent
>> for using ->cs == __USER_CS to detect 64-bitness; for example, step.c
>> contains:
>>
>> #ifdef CONFIG_X86_64
>>                 case 0x40 ... 0x4f:
>>                         if (regs->cs != __USER_CS)
>>                                 /* 32-bit mode: register increment */
>>                                 return 0;
>>                         /* 64-bit mode: REX prefix */
>>                         continue;
>> #endif
>>
>> The prefetch opcode checker in mm/fault.c does something similar.
>>
>> Even the sysret code in xen/xen-asm_64.S does:
>>
>>         pushq %r11
>>         pushq $__USER_CS
>>         pushq %rcx
>>
>> So I'm at a bit of a loss.
>>
>> You could probably hack it up and get your kernel to boot by allowing
>> __USER_CS and 0xe033 in that check, but I'd rather understand it
>
> Did this little hack:
>
>
> diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
> index dda7dff..5d0cf37 100644
> --- a/arch/x86/kernel/vsyscall_64.c
> +++ b/arch/x86/kernel/vsyscall_64.c
> @@ -131,7 +131,7 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
>         * Real 64-bit user mode code has cs == __USER_CS.  Anything else
>         * is bogus.
>         */
> -       if (regs->cs != __USER_CS) {
> +       if ((regs->cs != __USER_CS) && (regs->cs != FLAT_RING3_CS64)) {
>                /*
>                 * If we trapped from kernel mode, we might as well OOPS now
>                 * instead of returning to some random address and OOPSing
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index f987bde..0e4c13c 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -1916,6 +1916,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
>  # endif
>  #else
>        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
> +       case VVAR_PAGE:
>  #endif
>        case FIX_TEXT_POKE0:
>        case FIX_TEXT_POKE1:
>
> And getting this on 64-bit:
>
>  started: BusyBox v1.14.3 (2011-07-26 11:43:49 EDT)
> [    0.578603] rcS[1128]: segfault at ffffffffff5ff0a0 ip 00007fff40b7380a sp 00007fff40b5c0f0 error 4
> [    0.578847] rcS used greatest stack depth: 5024 bytes left
> [    0.581897] sh[1131]: segfault at ffffffffff5ff0a0 ip 00007fffb93ff80a sp 00007fffb92bbd70 error 4
> [    1.587637] sh[1137]: segfault at ffffffffff5ff0a0 ip 00007ffffa5ff80a sp 00007ffffa522560 error 4
> [    2.592295] sh[1141]: segfault at ffffffffff5ff0a0 ip 00007ffffcb3f80a sp 00007ffffca98af0 error 4
> [    3.596344] sh[1145]: segfault at ffffffffff5ff0a0 ip 00007fff2e3ff80a sp 00007fff2e3e3370 error 4
> [    4.599812] sh[1149]: segfault at ffffffffff5ff0a0 ip 00007fff62dff80a sp 00007fff62ca9f10 error 4
> [    5.605835] sh[1153]: segfault at ffffffffff5ff0a0 ip 00007fff117ff80a sp 00007fff1175e7f0 error 4
> [    6.609438] sh[1157]: segfault at ffffffffff5ff0a0 ip 00007fff91bff80a sp 00007fff91bd71c0 error 4
> [    7.614714] sh[1161]: segfault at ffffffffff5ff0a0 ip 00007fff396b280a sp 00007fff3968ede0 error 4
> [    8.620374] sh[1165]: segfault at ffffffffff5ff0a0 ip 00007fffd398b80a sp 00007fffd38ecd70 error 4
> [    9.625512] sh[1169]: segfault at ffffffffff5ff0a0 ip 00007fff617d980a sp 00007fff61776070 error 4
> [   10.630246] sh[1173]: segfault at ffffffffff5ff0a0 ip 00007fff89fff80a sp 00007fff89f7f3b0 error 4
> [   11.635588] sh[1177]: segfault at ffffffffff5ff0a0 ip 00007fffa95ff80a sp 00007fffa95ea7c0 error 4
> [   12.640491] sh[1181]: segfault at ffffffffff5ff0a0 ip 00007fff28cd180a sp 00007fff28c524f0 error 4

That one means that the vvar fixmap isn't working. Can you try the
attached patch?

--Andy

>
> ..
>
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca6f7ab..b1f3f53 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -638,6 +638,25 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
break;
}

+ case 1000: {
+ kernel_fpu_begin();
+ kernel_fpu_end();
+ ret = 0;
+ break;
+ }
+
+ case 1001: {
+ int i;
+ kernel_fpu_begin();
+ for (i = 0; i < 999; i++) {
+ stts();
+ clts();
+ }
+ kernel_fpu_end();
+ ret = 0;
+ break;
+ }
+
default:
ret = -EINVAL;
break;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0ccccb6..5888579 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1829,6 +1829,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
# endif
#else
case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+ case VVAR_PAGE:
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
@@ -1869,7 +1870,8 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#ifdef CONFIG_X86_64
/* Replicate changes to map the vsyscall page into the user
pagetable vsyscall mapping. */
- if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+ if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE ||
+ idx == VVAR_PAGE) {
unsigned long vaddr = __fix_to_virt(idx);
set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
}