[PATCH] Athlon Prefetch workaround for 2.6.0test6

From: Andi Kleen
Date: Mon Sep 29 2003 - 07:58:08 EST



Here is a new version of the Athlon/Opteron prefetch issue workaround
for 2.6.0test6. The issue was hit regularly by the 2.6 kernel
while doing prefetches on NULL terminated hlists.

These CPUs sometimes generate an illegal exception for prefetch
instructions. The operating system can work around this by checking
if the faulting instruction is a prefetch and ignoring it in
that case.

The code is structured carefully to ensure that the page fault
will never recurse more than once. Also unmapped EIPs are special
cased to give more friendly oopses when the kernel jumps to
unmapped addresses.

It removes the previous dumb in-kernel workaround for this and shrinks the
kernel by >10k.

A small behaviour change is that a SIGBUS fault for a *_user access will
now cause an EFAULT instead of a SIGBUS.

This version addresses all criticism that I got for previous versions.

- Only checks on AMD K7+ CPUs.
- Computes linear address for VM86 mode or code segments
with non-zero base.
- Some cleanup
- No pointer comparisons
- More comments

Please consider applying,

-Andi

diff -u linux-2.6.0test6-work/include/asm-i386/processor.h-PRE linux-2.6.0test6-work/include/asm-i386/processor.h
--- linux-2.6.0test6-work/include/asm-i386/processor.h-PRE 2003-09-11 04:12:39.000000000 +0200
+++ linux-2.6.0test6-work/include/asm-i386/processor.h 2003-09-28 10:52:55.000000000 +0200
@@ -578,8 +589,6 @@
#define ARCH_HAS_PREFETCH
extern inline void prefetch(const void *x)
{
- if (cpu_data[0].x86_vendor == X86_VENDOR_AMD)
- return; /* Some athlons fault if the address is bad */
alternative_input(ASM_NOP4,
"prefetchnta (%1)",
X86_FEATURE_XMM,
diff -u linux-2.6.0test6-work/arch/i386/mm/fault.c-PRE linux-2.6.0test6-work/arch/i386/mm/fault.c
--- linux-2.6.0test6-work/arch/i386/mm/fault.c-PRE 2003-05-27 03:00:20.000000000 +0200
+++ linux-2.6.0test6-work/arch/i386/mm/fault.c 2003-09-29 14:48:44.000000000 +0200
@@ -55,6 +55,104 @@
console_loglevel = loglevel_save;
}

+/*
+ * Find a segment base in the LDT/GDT.
+ * Don't need to do any boundary checking because the CPU did that already
+ * when the instruction was executed
+ */
+static unsigned long segment_base(unsigned seg)
+{
+ u32 *desc;
+ /*
+ * No need to use get/put_cpu here because when we switch CPUs the
+ * segment base is always switched too.
+ */
+ if (seg & (1<<2))
+ desc = current->mm->context.ldt;
+ else
+ desc = (u32 *)&cpu_gdt_table[smp_processor_id()];
+ desc = (void *)desc + (seg & ~7);
+ return (desc[0] >> 16) |
+ ((desc[1] & 0xFF) << 16) |
+ (desc[1] & 0xFF000000);
+}
+
+/*
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ */
+static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
+{
+ unsigned char *instr = (unsigned char *)(regs->eip);
+ int scan_more = 1;
+ int prefetch = 0;
+ int i;
+
+ /*
+ * Avoid recursive faults. This catches the kernel jumping to nirvana.
+ * More complicated races with unmapped EIP are handled elsewhere for
+ * user space.
+ */
+ if (regs->eip == addr)
+ return 0;
+
+ if (unlikely(regs->eflags & VM_MASK))
+ addr += regs->xcs << 4;
+ else if (unlikely(regs->xcs != __USER_CS &&regs->xcs != __KERNEL_CS))
+ addr += segment_base(regs->xcs);
+
+ for (i = 0; scan_more && i < 15; i++) {
+ unsigned char opcode;
+ unsigned char instr_hi;
+ unsigned char instr_lo;
+
+ if (__get_user(opcode, instr))
+ break;
+
+ instr_hi = opcode & 0xf0;
+ instr_lo = opcode & 0x0f;
+ instr++;
+
+ switch (instr_hi) {
+ case 0x20:
+ case 0x30:
+ /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
+ scan_more = ((instr_lo & 7) == 0x6);
+ break;
+
+ case 0x60:
+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
+ scan_more = (instr_lo & 0xC) == 0x4;
+ break;
+ case 0xF0:
+ /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
+ scan_more = !instr_lo || (instr_lo>>1) == 1;
+ break;
+ case 0x00:
+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
+ scan_more = 0;
+ if (__get_user(opcode, instr))
+ break;
+ prefetch = (instr_lo == 0xF) &&
+ (opcode == 0x0D || opcode == 0x18);
+ break;
+ default:
+ scan_more = 0;
+ break;
+ }
+ }
+
+ return prefetch;
+}
+
+static inline int is_prefetch(struct pt_regs *regs, unsigned long addr)
+{
+ if (likely(boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+ boot_cpu_data.x86 < 6))
+ return 0;
+ return __is_prefetch(regs, addr);
+}
+
asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);

/*
@@ -110,7 +208,7 @@
* atomic region then we must not take the fault..
*/
if (in_atomic() || !mm)
- goto no_context;
+ goto bad_area_nosemaphore;

down_read(&mm->mmap_sem);

@@ -198,8 +296,16 @@
bad_area:
up_read(&mm->mmap_sem);

+bad_area_nosemaphore:
/* User mode accesses just cause a SIGSEGV */
if (error_code & 4) {
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space.
+ */
+ if (is_prefetch(regs, address))
+ return;
+
tsk->thread.cr2 = address;
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 14;
@@ -232,6 +338,14 @@
if (fixup_exception(regs))
return;

+ /*
+ * Valid to do another page fault here, because if this fault
+ * had been triggered by is_prefetch fixup_exception would have
+ * handled it.
+ */
+ if (is_prefetch(regs, address))
+ return;
+
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice.
@@ -286,10 +400,14 @@
do_sigbus:
up_read(&mm->mmap_sem);

- /*
- * Send a sigbus, regardless of whether we were in kernel
- * or user mode.
- */
+ /* Kernel mode? Handle exceptions or die */
+ if (!(error_code & 4))
+ goto no_context;
+
+ /* User space => ok to do another page fault */
+ if (is_prefetch(regs, address))
+ return;
+
tsk->thread.cr2 = address;
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 14;
@@ -298,10 +416,6 @@
info.si_code = BUS_ADRERR;
info.si_addr = (void *)address;
force_sig_info(SIGBUS, &info, tsk);
-
- /* Kernel mode? Handle exceptions or die */
- if (!(error_code & 4))
- goto no_context;
return;

vmalloc_fault:

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/