Re: New pentium bug workaround - please test..

Hans Lermen (lermen@elserv.ffm.fgan.de)
Wed, 19 Nov 1997 15:09:48 +0100 (MET)


On Tue, 18 Nov 1997, Linus Torvalds wrote:

> Now, that implies that we can actually _keep_ the IDT mapped, and instead
> of marking it not present we can mark it read-only.

Ah, this one is _much_ better than the Intel solution.
Congratulations, Linus!

> Anyway, this approach still needs to be debugged, but I'm told by intel
> that it should indeed work, and I have a patch for people to try out.
> Right now the patch is only for 2.1.x (relative to 2.1.65, in fact), but
^^^^^^^^^^^^^^
Nope, I just made a 2.0.32 backport; look at the appended patch ;-)
^^^^^^
Works fine here, also with SMP. No f00f crashes at all.

For 2.0.x, however, the kernel needs to write to the IDT, hence the patch
below maps the IDT page at two aliased addresses:

- one write-protected, used for LIDT
- one writeable, used by the kernel.

Hans
<lermen@fgan.de>

--- linux-2.0.32/arch/i386/kernel/traps.c Tue Nov 18 10:25:34 1997
+++ linux-2.0.32-newf00f/arch/i386/kernel/traps.c Wed Nov 19 14:45:53 1997
@@ -348,14 +348,17 @@
pgd_t * pgd;
pmd_t * pmd;
pte_t * pte;
+ pte_t * alias_pte;
unsigned long twopage;
+ unsigned long page;
struct desc_struct *new_idt;

printk("moving IDT ... ");

twopage = (unsigned long) vmalloc (2*PAGE_SIZE);

- new_idt = (void *)(twopage + 4096-7*8);
+ page = twopage + PAGE_SIZE;
+ new_idt = (void *)page;

memcpy(new_idt,idt,256*8);

@@ -363,17 +366,29 @@
idt_descriptor.addr = VMALLOC_VMADDR(new_idt);

__asm__ __volatile__("\tlidt %0": "=m" (idt_descriptor));
- idt = new_idt;

/*
- * Unmap lower page:
+ * Write protect the IDT containing page:
*/
- twopage = VMALLOC_VMADDR(twopage);
- pgd = pgd_offset(current->mm, twopage);
- pmd = pmd_offset(pgd, twopage);
- pte = pte_offset(pmd, twopage);
+ page = VMALLOC_VMADDR(page);
+ pgd = pgd_offset(&init_mm, page);
+ pmd = pmd_offset(pgd, page);
+ pte = pte_offset(pmd, page);
+ *pte = pte_wrprotect(*pte);
+
+
+ /*
+ * Alias the IDT writeable to the lower page, so the kernel can
+ * write to it.
+ */
+ page = VMALLOC_VMADDR(twopage);
+ pgd = pgd_offset(&init_mm, page);
+ pmd = pmd_offset(pgd, page);
+ alias_pte = pte_offset(pmd, page);
+ *alias_pte = pte_mkwrite(*pte);
+ idt = (void *)twopage;
+

- pte_clear(pte);
flush_tlb_all();

printk(" ... done\n");
--- linux-2.0.32/arch/i386/mm/fault.c Tue Nov 18 10:25:34 1997
+++ linux-2.0.32-newf00f/arch/i386/mm/fault.c Wed Nov 19 14:37:11 1997
@@ -21,127 +21,10 @@

extern void die_if_kernel(const char *,struct pt_regs *,long);

-asmlinkage void do_divide_error (struct pt_regs *, unsigned long);
-asmlinkage void do_debug (struct pt_regs *, unsigned long);
-asmlinkage void do_nmi (struct pt_regs *, unsigned long);
-asmlinkage void do_int3 (struct pt_regs *, unsigned long);
-asmlinkage void do_overflow (struct pt_regs *, unsigned long);
-asmlinkage void do_bounds (struct pt_regs *, unsigned long);
asmlinkage void do_invalid_op (struct pt_regs *, unsigned long);
-asmlinkage void do_general_protection (struct pt_regs *, unsigned long);

extern int pentium_f00f_bug;

-static int handle_intx_eip_adjust(struct pt_regs *regs)
-{
- unsigned char *addr, *csp = 0;
- int wrap = 0;
- int count = 8; /* only check for reasonable number of bytes
- * else we do it the save 'simple way' */
- unsigned long _eip;
-#define XX_WRAP(x) (wrap ? *((unsigned short *)&x) : x)
-
- /* We rely on being able to access the memory pointed to by cs:eip
- * and the bytes behind it up to the faulting instruction,
- * because we just got an exception for this instruction and
- * hence the memory should just be successfully accessed.
- * In case of crossing a page boundary or when accessing kernel space
- * we just do the simple fix (increase eip by one).
- * This assumption also obsoletes checking of segment limit.
- * ( should be veryfied, however, if this assumption is true )
- */
-
- if (regs->cs == KERNEL_CS) {
- /* not what we expect */
- regs->eip++;
- return 0;
- }
-
- if (regs->eflags & VM_MASK) {
- /* we have real mode type selector */
- wrap = 1;
- csp = (unsigned char *)((unsigned long)regs->cs << 4);
- }
- else if (regs->cs & 4) {
- /* we have a LDT selector */
- struct desc_struct *p, *ldt = current->ldt;
- if (!ldt)
- ldt = (struct desc_struct*) &default_ldt;
- p = ldt + (regs->cs >> 3);
- csp = (unsigned char *)((p->a >> 16) | ((p->b & 0xff) << 16) | (p->b & 0xFF000000));
- if (!(p->b & 0x400000))
- wrap = 1; /* 16-bit segment */
- }
-
- _eip = regs->eip;
- addr = csp+XX_WRAP(_eip);
- while (count-- > 0) {
- if ((unsigned long)addr >= TASK_SIZE) {
- /* accessing kernel space, do the simple case */
- regs->eip++;
- return 0;
- }
- switch (get_user(addr)) {
-
- case 0xCC: /* single byte INT3 */
- XX_WRAP(_eip)++;
- regs->eip = _eip;
- return 0;
-
- case 0xCD: /* two byte INT 3 */
- XX_WRAP(_eip)++;
- /* fall through */
- case 0xCE: /* INTO, single byte */
- XX_WRAP(_eip)++;
- if ( (regs->eflags & VM_MASK)
- && ((regs->eflags & IOPL_MASK) != IOPL_MASK)) {
- /* not allowed, do GP0 fault */
- do_general_protection(regs, 0);
- return -1;
- }
- regs->eip = _eip;
- return 0;
-
- /* the prefixes from the Intel patch */
- case 0xF2 ... 0xF3:
- case 0x2E:
- case 0x36:
- case 0x3E:
- case 0x26:
- case 0x64 ... 0x67:
- break; /* just skipping them */
-
- default:
- /* not what we handle here,
- * just doing the simple fix
- */
- regs->eip++;
- return 0;
- }
-
- if ( !(++XX_WRAP(_eip)) ) {
- /* we wrapped around */
- regs->eip++;
- return 0;
- }
-
- addr = csp+XX_WRAP(_eip);
- if ( !((unsigned long)addr & ~(PAGE_SIZE -1)) ) {
- /* we would cross page boundary, not good,
- * doing the simple fix
- */
- regs->eip++;
- return 0;
- }
- }
-
- /* if we come here something weird happened,
- * just doing the simple fix
- */
- regs->eip++;
- return 0;
-}
-

/*
* This routine handles page faults. It determines the address,
@@ -246,21 +129,10 @@
if ( pentium_f00f_bug ) {
unsigned long nr;

- nr = (address - TASK_SIZE - (unsigned long) idt) >> 3;
+ nr = (address - PAGE_SIZE - TASK_SIZE - (unsigned long) idt) >> 3;

- if (nr < 7) {
- static void (*handler[])(struct pt_regs *, unsigned long) = {
- do_divide_error, /* 0 - divide overflow */
- do_debug, /* 1 - debug trap */
- do_nmi, /* 2 - NMI */
- do_int3, /* 3 - int 3 */
- do_overflow, /* 4 - overflow */
- do_bounds, /* 5 - bound range */
- do_invalid_op }; /* 6 - invalid opcode */
- if ((nr == 3) || (nr == 4))
- if (handle_intx_eip_adjust(regs))
- return;
- handler[nr](regs, error_code);
+ if (nr == 6) {
+ do_invalid_op(regs, 0);
return;
}
}