Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

From: H. Peter Anvin
Date: Thu Dec 13 2012 - 00:13:34 EST


Here is a version that compiles. It doesn't *boot* yet, because the switchover from dynamic mode to the real page tables doesn't happen correctly, so we end up on an uninitialized set of page tables.

The new page table setup in tip:x86/mm2 should make that easier to achieve, however. I won't have time to test this out tonight.
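
For reference, the allocate-on-demand scheme below boils down to a small fixed pool of page table pages: the early #PF handler calls early_make_pgtable(), which carves PUD/PMD pages out of early_dynamic_pgts[], and when the pool runs dry everything except the kernel mapping is thrown away and rebuilt. Here is a minimal userspace sketch of just the pool logic -- EARLY_DYNAMIC_PAGE_TABLES, next_early_pgt, early_pgt_resets and reset_early_page_tables() mirror the patch, while alloc_early_pgt() and the driver loop are purely illustrative:

/*
 * Userspace model of the early dynamic page table pool.  In the
 * kernel, early_make_pgtable() runs from the early #PF handler and
 * consumes pages from early_dynamic_pgts[]; when the pool is
 * exhausted, the mappings are discarded and rebuilt on demand.
 */
#include <stdio.h>
#include <string.h>

#define EARLY_DYNAMIC_PAGE_TABLES	64
#define PTRS_PER_TABLE			512

static unsigned long early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_TABLE];
static unsigned int next_early_pgt;
static unsigned int early_pgt_resets;

/*
 * Throw the pool away and start over.  (The kernel version clears
 * the live PGD entries instead of the pool pages themselves.)
 */
static void reset_early_page_tables(void)
{
	memset(early_dynamic_pgts, 0, sizeof(early_dynamic_pgts));
	next_early_pgt = 0;
	early_pgt_resets++;
}

/* Hand out the next table from the pool, recycling when it runs dry */
static unsigned long *alloc_early_pgt(void)
{
	if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES)
		reset_early_page_tables();
	return early_dynamic_pgts[next_early_pgt++];
}

int main(void)
{
	int i;

	/* Pretend we take 100 early faults, each needing a PUD and a PMD page */
	for (i = 0; i < 100; i++) {
		alloc_early_pgt();	/* would become the PUD page */
		alloc_early_pgt();	/* would become the PMD page */
	}
	printf("pool in use: %u tables, resets so far: %u\n",
	       next_early_pgt, early_pgt_resets);
	return 0;
}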

-hpa

diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16..2d88344 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_PGTABLE_64_DEFS_H
#define _ASM_X86_PGTABLE_64_DEFS_H

+#include <asm/sparsemem.h>
+
#ifndef __ASSEMBLY__
#include <linux/types.h>

@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)

+#define EARLY_DYNAMIC_PAGE_TABLES 64
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 037df57..9443c77 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,11 +26,73 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>

-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2, early_pgt_resets = 0;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
{
- pgd_t *pgd = pgd_offset_k(0UL);
- pgd_clear(pgd);
- __flush_tlb_all();
+ unsigned long i;
+
+ for (i = 0; i < PTRS_PER_PGD-1; i++)
+ early_level4_pgt[i].pgd = 0;
+
+ next_early_pgt = 0;
+ early_pgt_resets++;
+
+ __native_flush_tlb();
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ unsigned long i;
+ pgdval_t pgd, *pgd_p;
+ pudval_t *pud_p;
+ pmdval_t pmd, *pmd_p;
+
+ if (physaddr >= MAXMEM)
+ return -1; /* Invalid address - puke */
+
+ i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
+ pgd_p = &early_level4_pgt[i].pgd;
+ pgd = *pgd_p;
+
+ /*
+ * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+ * critical -- __PAGE_OFFSET would point us back into the dynamic
+ * range and we might end up looping forever...
+ */
+ if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) {
+ pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map);
+ } else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1)
+ reset_early_page_tables();
+
+ pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ pud_p[i] = 0;
+
+ *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + _KERNPG_TABLE;
+ }
+ i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+ pud_p += i;
+
+ pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+ pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_p[i] = pmd;
+ pmd += PMD_SIZE;
+ }
+
+ *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + _KERNPG_TABLE;
+
+ return 0;
}

/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -70,12 +132,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);

+ /* Kill off the identity-map trampoline */
+ reset_early_page_tables();
+
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();

- /* Make NULL pointers segfault */
- zap_identity_mappings();
-
+ /* XXX - this is wrong... we need to build page tables from scratch */
max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;

for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 94bf9cc..0e040b3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.code64
.globl startup_64
startup_64:
-
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
* and someone has loaded an identity mapped page table
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %rsi holds a physical pointer to real_mode_data.
*
* We come here either directly from a 64bit bootloader, or from
* arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
* tables and then reload them.
*/

- /* Compute the delta between the address I am compiled to run at and the
+ /*
+ * Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
*/
leaq _text(%rip), %rbp
@@ -78,53 +78,66 @@ startup_64:
testl %eax, %eax
jnz bad_address

- /* Is the address too large? */
- leaq _text(%rip), %rdx
- movq $PGDIR_SIZE, %rax
- cmpq %rax, %rdx
- jae bad_address
-
- /* Fixup the physical addresses in the page table
+ /*
+ * Is the address too large?
*/
- addq %rbp, init_level4_pgt + 0(%rip)
- addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
- addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+ leaq _text(%rip), %rax
+ shrq $MAX_PHYSMEM_BITS, %rax
+ jnz bad_address

- addq %rbp, level3_ident_pgt + 0(%rip)
+ /*
+ * Fixup the physical addresses in the page table
+ */
+ addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)

addq %rbp, level3_kernel_pgt + (510*8)(%rip)
addq %rbp, level3_kernel_pgt + (511*8)(%rip)

addq %rbp, level2_fixmap_pgt + (506*8)(%rip)

- /* Add an Identity mapping if I am above 1G */
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
leaq _text(%rip), %rdi
- andq $PMD_PAGE_MASK, %rdi
+ leaq early_level4_pgt(%rip), %rbx

movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andq $(PTRS_PER_PUD - 1), %rax
- jz ident_complete
+ shrq $PGDIR_SHIFT, %rax

- leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
- leaq level3_ident_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
+ leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
+ movq %rdx, 0(%rbx,%rax,8)
+ movq %rdx, 8(%rbx,%rax,8)
+
+ addq $4096, %rdx
+ movq %rdi, %rax
+ shrq $PUD_SHIFT, %rax
+ andl $(PTRS_PER_PUD-1), %eax
+ movq %rdx, (4096+0)(%rbx,%rax,8)
+ movq %rdx, (4096+8)(%rbx,%rax,8)

+ addq $8192, %rbx
movq %rdi, %rax
- shrq $PMD_SHIFT, %rax
- andq $(PTRS_PER_PMD - 1), %rax
- leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
- leaq level2_spare_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
-ident_complete:
+ shrq $PMD_SHIFT, %rdi
+ addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+ movl $PTRS_PER_PMD, %ecx

+1:
+ andq $(PTRS_PER_PMD - 1), %rdi
+ movq %rax, (%rbx,%rdi,8)
+ incq %rdi
+ addq $PMD_SIZE, %rax
+ decl %ecx
+ jnz 1b
+
/*
* Fixup the kernel text+data virtual addresses. Note that
* we might write invalid pmds, when the kernel is relocated
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
*/
-
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
@@ -149,7 +162,7 @@ ENTRY(secondary_startup_64)
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
* and someone has loaded a mapped page table.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %rsi holds a physical pointer to real_mode_data.
*
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
@@ -196,7 +209,7 @@ ENTRY(secondary_startup_64)
movq %rax, %cr0

/* Setup a boot time stack */
- movq stack_start(%rip),%rsp
+ movq stack_start(%rip), %rsp

/* zero EFLAGS after setting rsp */
pushq $0
@@ -236,31 +249,31 @@ ENTRY(secondary_startup_64)
movl initial_gs+4(%rip),%edx
wrmsr

- /* esi is pointer to real mode structure with interesting info.
+ /* rsi is pointer to real mode structure with interesting info.
pass it to C */
- movl %esi, %edi
+ movq %rsi, %rdi

/* Finally jump to run C code and to be on real kernel address
* Since we are running on identity-mapped space we have to jump
* to the full 64bit address, this is only possible as indirect
* jump. In addition we need to ensure %cs is set so we make this
- * a far return.
+ * a far jump.
*/
- movq initial_code(%rip),%rax
pushq $0 # fake return address to stop unwinder
- pushq $__KERNEL_CS # set correct cs
- pushq %rax # target address in negative space
- lretq
+ /* gas 2.22 is buggy and mis-assembles ljmpq */
+ rex64 ljmp *initial_code(%rip)

/* SMP bootup changes these two */
__REFDATA
- .align 8
- ENTRY(initial_code)
+ .balign 8
+ GLOBAL(initial_code)
.quad x86_64_start_kernel
- ENTRY(initial_gs)
+ .word __KERNEL_CS
+ .balign 8
+ GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)

- ENTRY(stack_start)
+ GLOBAL(stack_start)
.quad init_thread_union+THREAD_SIZE-8
.word 0
__FINITDATA
@@ -268,7 +281,7 @@ ENTRY(secondary_startup_64)
bad_address:
jmp bad_address

- .section ".init.text","ax"
+ __INIT
.globl early_idt_handlers
early_idt_handlers:
# 104(%rsp) %rflags
@@ -305,14 +318,22 @@ ENTRY(early_idt_handler)
pushq %r11 # 0(%rsp)

cmpl $__KERNEL_CS,96(%rsp)
- jne 10f
+ jne 11f

+ cmpl $14,72(%rsp) # Page fault?
+ jnz 10f
+ GET_CR2_INTO(%rdi) # can clobber any volatile register if pv
+ call early_make_pgtable
+ andl %eax,%eax
+ jz 20f # All good
+
+10:
leaq 88(%rsp),%rdi # Pointer to %rip
call early_fixup_exception
andl %eax,%eax
jnz 20f # Found an exception entry

-10:
+11:
#ifdef CONFIG_EARLY_PRINTK
GET_CR2_INTO(%r9) # can clobber any volatile register if pv
movl 80(%rsp),%r8d # error code
@@ -334,7 +355,7 @@ ENTRY(early_idt_handler)
1: hlt
jmp 1b

-20: # Exception table entry found
+20: # Exception table entry found or page table generated
popq %r11
popq %r10
popq %r9
@@ -348,6 +369,8 @@ ENTRY(early_idt_handler)
decl early_recursion_flag(%rip)
INTERRUPT_RETURN

+ __INITDATA
+
.balign 4
early_recursion_flag:
.long 0
@@ -358,11 +381,10 @@ early_idt_msg:
early_idt_ripmsg:
.asciz "RIP %s\n"
#endif /* CONFIG_EARLY_PRINTK */
- .previous

#define NEXT_PAGE(name) \
.balign PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)

/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
@@ -372,46 +394,21 @@ ENTRY(name)
i = i + 1 ; \
.endr

- .data
- /*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
-NEXT_PAGE(init_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
- /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ __INITDATA
+NEXT_PAGE(early_level4_pgt)
+ .fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE

-NEXT_PAGE(level3_ident_pgt)
- .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 511,8,0
+NEXT_PAGE(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0

+ .data
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE

-NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
- .fill 512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
- /* Since I easily can, map the first 1G.
- * Don't set NX because code runs from these pages.
- */
- PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
NEXT_PAGE(level2_kernel_pgt)
/*
* 512 MB kernel mapping. We spend a full page on this pagetable
@@ -426,11 +423,16 @@ NEXT_PAGE(level2_kernel_pgt)
PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
KERNEL_IMAGE_SIZE/PMD_SIZE)

-NEXT_PAGE(level2_spare_pgt)
- .fill 512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+ .fill 506,8,0
+ .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+ /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+ .fill 5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+ .fill 512,8,0

#undef PMDS
-#undef NEXT_PAGE

.data
.align 16
@@ -456,6 +458,7 @@ ENTRY(nmi_idt_table)
.skip IDT_ENTRIES * 16

__PAGE_ALIGNED_BSS
- .align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
+ .skip PAGE_SIZE
+NEXT_PAGE(init_level4_pgt)
.skip PAGE_SIZE
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ca45696..e383050 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -911,7 +911,6 @@ void __init setup_arch(char **cmdline_p)
(max_pfn_mapped<<PAGE_SHIFT) - 1);

setup_real_mode();
-
init_gbpages();

/* max_pfn_mapped is updated here */
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index cbca565..1650bf4 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -21,7 +21,7 @@ void __init setup_real_mode(void)
struct trampoline_header *trampoline_header;
size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
#ifdef CONFIG_X86_64
- u64 *trampoline_pgd;
+ pgd_t *trampoline_pgd;
u64 efer;
#endif

@@ -77,9 +77,17 @@ void __init setup_real_mode(void)
trampoline_cr4_features = &trampoline_header->cr4;
*trampoline_cr4_features = read_cr4();

- trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
- trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
- trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
+ trampoline_pgd = (pgd_t *) __va(real_mode_header->trampoline_pgd);
+
+ /* Set up the identity map */
+ clone_pgd_range(trampoline_pgd,
+ init_level4_pgt + KERNEL_PGD_BOUNDARY,
+ MAXMEM >> PGDIR_SHIFT);
+
+ /* Set up the kernel map */
+ clone_pgd_range(trampoline_pgd + KERNEL_PGD_BOUNDARY,
+ init_level4_pgt + KERNEL_PGD_BOUNDARY,
+ PTRS_PER_PGD - KERNEL_PGD_BOUNDARY);
#endif
}