Re: [RFC PATCH v4 27/28] x86: Add support to encrypt the kernel in-place

From: Borislav Petkov
Date: Wed Mar 01 2017 - 12:42:24 EST

Next message: Fengguang Wu: "Re: [locking/ww_mutex] 2a0c112828 WARNING: CPU: 0 PID: 18 at kernel/locking/mutex.c:305 __ww_mutex_wakeup_for_backoff"
Previous message: Michal Hocko: "Re: rsync: page allocation stalls in kernel 4.9.10 to a VessRAID NAS"
Next in thread: Tom Lendacky: "Re: [RFC PATCH v4 27/28] x86: Add support to encrypt the kernel in-place"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On Thu, Feb 16, 2017 at 09:48:08AM -0600, Tom Lendacky wrote:
> This patch adds the support to encrypt the kernel in-place. This is
> done by creating new page mappings for the kernel - a decrypted
> write-protected mapping and an encrypted mapping. The kernel is encyrpted

s/encyrpted/encrypted/

> by copying the kernel through a temporary buffer.

"... by copying it... "

>
> Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
> ---

...

> +ENTRY(sme_encrypt_execute)
> +
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> + /*
> + * Entry parameters:
> + * RDI - virtual address for the encrypted kernel mapping
> + * RSI - virtual address for the decrypted kernel mapping
> + * RDX - length of kernel
> + * RCX - address of the encryption workarea

, including:

> + * - stack page (PAGE_SIZE)
> + * - encryption routine page (PAGE_SIZE)
> + * - intermediate copy buffer (PMD_PAGE_SIZE)
> + * R8 - address of the pagetables to use for encryption
> + */
> +
> + /* Set up a one page stack in the non-encrypted memory area */
> + movq %rcx, %rax
> + addq $PAGE_SIZE, %rax
> + movq %rsp, %rbp

%rbp is callee-saved and you're overwriting it here. You need to push it
first.

> + movq %rax, %rsp
> + push %rbp
> +
> + push %r12
> + push %r13

In general, just do all pushes on function entry and the pops on exit,
like the compiler does.

> + movq %rdi, %r10
> + movq %rsi, %r11
> + movq %rdx, %r12
> + movq %rcx, %r13
> +
> + /* Copy encryption routine into the workarea */
> + movq %rax, %rdi
> + leaq .Lencrypt_start(%rip), %rsi
> + movq $(.Lencrypt_stop - .Lencrypt_start), %rcx
> + rep movsb
> +
> + /* Setup registers for call */
> + movq %r10, %rdi
> + movq %r11, %rsi
> + movq %r8, %rdx
> + movq %r12, %rcx
> + movq %rax, %r8
> + addq $PAGE_SIZE, %r8
> +
> + /* Call the encryption routine */
> + call *%rax
> +
> + pop %r13
> + pop %r12
> +
> + pop %rsp /* Restore original stack pointer */
> +.Lencrypt_exit:

Please put side comments like this here:

ENTRY(sme_encrypt_execute)

#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
* Entry parameters:
* RDI - virtual address for the encrypted kernel mapping
* RSI - virtual address for the decrypted kernel mapping
* RDX - length of kernel
* RCX - address of the encryption workarea
* - stack page (PAGE_SIZE)
* - encryption routine page (PAGE_SIZE)
* - intermediate copy buffer (PMD_PAGE_SIZE)
* R8 - address of the pagetables to use for encryption
*/

/* Set up a one page stack in the non-encrypted memory area */
movq %rcx, %rax # %rax = workarea
addq $PAGE_SIZE, %rax # %rax += 4096
movq %rsp, %rbp # stash stack ptr
movq %rax, %rsp # set new stack
push %rbp # needs to happen before the mov %rsp, %rbp

push %r12
push %r13

movq %rdi, %r10 # encrypted kernel
movq %rsi, %r11 # decrypted kernel
movq %rdx, %r12 # kernel length
movq %rcx, %r13 # workarea
...

and so on.

...

> diff --git a/arch/x86/kernel/mem_encrypt_init.c b/arch/x86/kernel/mem_encrypt_init.c
> index 25af15d..07cbb90 100644
> --- a/arch/x86/kernel/mem_encrypt_init.c
> +++ b/arch/x86/kernel/mem_encrypt_init.c
> @@ -16,9 +16,200 @@
> #ifdef CONFIG_AMD_MEM_ENCRYPT
>
> #include <linux/mem_encrypt.h>
> +#include <linux/mm.h>
> +
> +#include <asm/sections.h>
> +
> +extern void sme_encrypt_execute(unsigned long, unsigned long, unsigned long,
> + void *, pgd_t *);

This belongs into mem_encrypt.h. And I think it already came up: please
use names for those params.

> +
> +#define PGD_FLAGS _KERNPG_TABLE_NOENC
> +#define PUD_FLAGS _KERNPG_TABLE_NOENC
> +#define PMD_FLAGS __PAGE_KERNEL_LARGE_EXEC
> +
> +static void __init *sme_pgtable_entry(pgd_t *pgd, void *next_page,
> + void *vaddr, pmdval_t pmd_val)
> +{

sme_populate() or so sounds better.

> + pud_t *pud;
> + pmd_t *pmd;
> +
> + pgd += pgd_index((unsigned long)vaddr);
> + if (pgd_none(*pgd)) {
> + pud = next_page;
> + memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
> + native_set_pgd(pgd,
> + native_make_pgd((unsigned long)pud + PGD_FLAGS));

Let it stick out, no need for those "stairs" in the vertical alignment :)

> + next_page += sizeof(*pud) * PTRS_PER_PUD;
> + } else {
> + pud = (pud_t *)(native_pgd_val(*pgd) & ~PTE_FLAGS_MASK);
> + }
> +
> + pud += pud_index((unsigned long)vaddr);
> + if (pud_none(*pud)) {
> + pmd = next_page;
> + memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
> + native_set_pud(pud,
> + native_make_pud((unsigned long)pmd + PUD_FLAGS));
> + next_page += sizeof(*pmd) * PTRS_PER_PMD;
> + } else {
> + pmd = (pmd_t *)(native_pud_val(*pud) & ~PTE_FLAGS_MASK);
> + }
> +
> + pmd += pmd_index((unsigned long)vaddr);
> + if (pmd_none(*pmd) || !pmd_large(*pmd))
> + native_set_pmd(pmd, native_make_pmd(pmd_val));
> +
> + return next_page;
> +}
> +
> +static unsigned long __init sme_pgtable_calc(unsigned long start,
> + unsigned long end)
> +{
> + unsigned long addr, total;
> +
> + total = 0;
> + addr = start;
> + while (addr < end) {
> + unsigned long pgd_end;
> +
> + pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
> + if (pgd_end > end)
> + pgd_end = end;
> +
> + total += sizeof(pud_t) * PTRS_PER_PUD * 2;
> +
> + while (addr < pgd_end) {
> + unsigned long pud_end;
> +
> + pud_end = (addr & PUD_MASK) + PUD_SIZE;
> + if (pud_end > end)
> + pud_end = end;
> +
> + total += sizeof(pmd_t) * PTRS_PER_PMD * 2;

That "* 2" is?

> +
> + addr = pud_end;

So addr += PUD_SIZE;

?

> + }
> +
> + addr = pgd_end;

So addr += PGD_SIZE;

?

> + total += sizeof(pgd_t) * PTRS_PER_PGD;
> +
> + return total;
> +}
>
> void __init sme_encrypt_kernel(void)
> {
> + pgd_t *pgd;
> + void *workarea, *next_page, *vaddr;
> + unsigned long kern_start, kern_end, kern_len;
> + unsigned long index, paddr, pmd_flags;
> + unsigned long exec_size, full_size;
> +
> + /* If SME is not active then no need to prepare */

That comment is obvious.

> + if (!sme_active())
> + return;
> +
> + /* Set the workarea to be after the kernel */
> + workarea = (void *)ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
> +
> + /*
> + * Prepare for encrypting the kernel by building new pagetables with
> + * the necessary attributes needed to encrypt the kernel in place.
> + *
> + * One range of virtual addresses will map the memory occupied
> + * by the kernel as encrypted.
> + *
> + * Another range of virtual addresses will map the memory occupied
> + * by the kernel as decrypted and write-protected.
> + *
> + * The use of write-protect attribute will prevent any of the
> + * memory from being cached.
> + */
> +
> + /* Physical address gives us the identity mapped virtual address */
> + kern_start = __pa_symbol(_text);
> + kern_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE) - 1;

So
kern_end = (unsigned long)workarea - 1;

?

Also, you can make that workarea be unsigned long and cast it to void *
only when needed so that you don't need to cast it in here for the
calculations.

> + kern_len = kern_end - kern_start + 1;
> +
> + /*
> + * Calculate required number of workarea bytes needed:
> + * executable encryption area size:
> + * stack page (PAGE_SIZE)
> + * encryption routine page (PAGE_SIZE)
> + * intermediate copy buffer (PMD_PAGE_SIZE)
> + * pagetable structures for workarea (in case not currently mapped)
> + * pagetable structures for the encryption of the kernel
> + */
> + exec_size = (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
> +
> + full_size = exec_size;
> + full_size += ALIGN(exec_size, PMD_PAGE_SIZE) / PMD_PAGE_SIZE *
> + sizeof(pmd_t) * PTRS_PER_PMD;
> + full_size += sme_pgtable_calc(kern_start, kern_end + exec_size);
> +
> + next_page = workarea + exec_size;

So next_page is the next free page after the workarea, correct? Because
of all things, *that* certainly needs a comment. It took me a while to
decipher what's going on here and I'm still not 100% clear.

> + /* Make sure the current pagetables have entries for the workarea */
> + pgd = (pgd_t *)native_read_cr3();
> + paddr = (unsigned long)workarea;
> + while (paddr < (unsigned long)workarea + full_size) {
> + vaddr = (void *)paddr;
> + next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> + paddr + PMD_FLAGS);
> +
> + paddr += PMD_PAGE_SIZE;
> + }
> + native_write_cr3(native_read_cr3());

Why not

native_write_cr3((unsigned long)pgd);

?

Now you can actually acknowledge that the code block in between changed
the hierarchy in pgd and you're reloading it.

> + /* Calculate a PGD index to be used for the decrypted mapping */
> + index = (pgd_index(kern_end + full_size) + 1) & (PTRS_PER_PGD - 1);
> + index <<= PGDIR_SHIFT;

So call it decrypt_mapping_pgd or so. index doesn't say anything. Also,
move it right above where it is being used. This function is very hard
to follow as it is.

> + /* Set and clear the PGD */

This needs more text: we're building a new temporary pagetable which
will have A, B and C mapped into it and blablabla...

> + pgd = next_page;
> + memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
> + next_page += sizeof(*pgd) * PTRS_PER_PGD;
> +
> + /* Add encrypted (identity) mappings for the kernel */
> + pmd_flags = PMD_FLAGS | _PAGE_ENC;
> + paddr = kern_start;
> + while (paddr < kern_end) {
> + vaddr = (void *)paddr;
> + next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> + paddr + pmd_flags);
> +
> + paddr += PMD_PAGE_SIZE;
> + }
> +
> + /* Add decrypted (non-identity) mappings for the kernel */
> + pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
> + paddr = kern_start;
> + while (paddr < kern_end) {
> + vaddr = (void *)(paddr + index);
> + next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> + paddr + pmd_flags);
> +
> + paddr += PMD_PAGE_SIZE;
> + }
> +
> + /* Add the workarea to both mappings */
> + paddr = kern_end + 1;

paddr = (unsigned long)workarea;

Now this makes sense when I read the comment above it.

> + while (paddr < (kern_end + exec_size)) {

... which actually wants that exec_size to be called workarea_size. Then
it'll make more sense.

And then the thing above:

next_page = workarea + exec_size;

would look like:

next_page = workarea + workarea_size;

which would make even more sense. And since you have stuff called _start
and _end, you can do:

next_page = workarea_start + workarea_size;

and not it would make most sense. Eva! :-)

> + vaddr = (void *)paddr;
> + next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> + paddr + PMD_FLAGS);
> +
> + vaddr = (void *)(paddr + index);
> + next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> + paddr + PMD_FLAGS);
> +
> + paddr += PMD_PAGE_SIZE;
> + }
> +
> + /* Perform the encryption */
> + sme_encrypt_execute(kern_start, kern_start + index, kern_len,
> + workarea, pgd);
> +

Phew, that's one tough patch to review. I'd like to review it again in
your next submission.

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

Next message: Fengguang Wu: "Re: [locking/ww_mutex] 2a0c112828 WARNING: CPU: 0 PID: 18 at kernel/locking/mutex.c:305 __ww_mutex_wakeup_for_backoff"
Previous message: Michal Hocko: "Re: rsync: page allocation stalls in kernel 4.9.10 to a VessRAID NAS"
Next in thread: Tom Lendacky: "Re: [RFC PATCH v4 27/28] x86: Add support to encrypt the kernel in-place"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]