[PATCH v5 5/5] x86/kvm: Avoid dynamic allocation of pvclock data when SEV is active

From: Brijesh Singh
Date: Thu Sep 06 2018 - 07:44:00 EST


Currently, the per-cpu pvclock data is allocated dynamically when
cpu >= HVC_BOOT_ARRAY_SIZE. The physical address of this data is
shared between the guest and the hypervisor, hence it must be mapped
unencrypted (i.e. C=0) when SEV is active.
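
For reference, a rough sketch of what the dynamic path would otherwise
have to do under SEV (set_memory_decrypted() is the existing API; the
surrounding lines are illustrative only and not part of this patch):

        struct pvclock_vsyscall_time_info *p;

        /* A whole zeroed page per CPU: the C-bit can only be
         * cleared at page granularity.
         */
        p = (void *)get_zeroed_page(GFP_KERNEL);
        if (p && sev_active())
                /* Share the page (C=0) with the hypervisor so
                 * it can write clock updates into it.
                 */
                set_memory_decrypted((unsigned long)p, 1);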

When SEV is active, this wastes a fairly sizeable amount of memory,
since each CPU ends up doing a separate 4K allocation just so it can
clear the C-bit on a whole page. Instead, define an extra static array
of pvclock data, sized to a few whole pages, and in the CPU hotplug
prepare stage hand out elements of this array to avoid the dynamic
allocation. The array is placed in a new .data..decrypted_hvclock
section, which lives inside the .data..decrypted range and is therefore
mapped with C=0 during boot.
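
The sizing works out as follows, assuming 4K pages and the 64-byte
(cacheline-aligned) struct pvclock_vsyscall_time_info:

        HVC_BOOT_ARRAY_SIZE      = PAGE_SIZE / 64       =  64 entries
        HVC_DECRYPTED_ARRAY_SIZE = (PAGE_SIZE * 7) / 64 = 448 entries
                                                  total =  512 vCPUs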

In the non-SEV case these static pages are unused, and they are freed
by free_decrypted_mem().
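
With the vmlinux.lds.S change below, the decrypted region is laid out
roughly like this (illustrative; alignment as in the linker script):

        __start_data_decrypted           <- ALIGN(PMD_SIZE), 2M
          .data..decrypted               (hv_clock_boot, wall_clock, ...)
        __start_data_decrypted_hvclock   <- ALIGN(PAGE_SIZE)
          .data..decrypted_hvclock       (hv_clock_dec[], 7 pages)
        __end_data_decrypted             <- ALIGN(PMD_SIZE)

Everything from __start_data_decrypted_hvclock up to
__end_data_decrypted (the array plus the 2M-alignment padding) is the
range that free_decrypted_mem() gives back to the page allocator when
memory encryption is not active.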

Signed-off-by: Brijesh Singh <brijesh.singh@xxxxxxx>
Suggested-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
Cc: Tom Lendacky <thomas.lendacky@xxxxxxx>
Cc: kvm@xxxxxxxxxxxxxxx
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxx>
Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
Cc: "Radim KrÄmÃÅ" <rkrcmar@xxxxxxxxxx>
---
arch/x86/include/asm/mem_encrypt.h |  4 ++++
arch/x86/kernel/kvmclock.c         | 22 +++++++++++++++++++---
arch/x86/kernel/vmlinux.lds.S      |  3 +++
arch/x86/mm/init.c                 |  3 +++
arch/x86/mm/mem_encrypt.c          | 10 ++++++++++
5 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 802b2eb..aa204af 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -48,11 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);

/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void);
+void __init free_decrypted_mem(void);

bool sme_active(void);
bool sev_active(void);

#define __decrypted __attribute__((__section__(".data..decrypted")))
+#define __decrypted_hvclock __attribute__((__section__(".data..decrypted_hvclock")))

#else /* !CONFIG_AMD_MEM_ENCRYPT */

@@ -80,6 +82,7 @@ static inline int __init
early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }

#define __decrypted
+#define __decrypted_hvclock

#endif /* CONFIG_AMD_MEM_ENCRYPT */

@@ -93,6 +96,7 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask)

extern char __start_data_decrypted[], __end_data_decrypted[];
+extern char __start_data_decrypted_hvclock[];

#endif /* __ASSEMBLY__ */

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 376fd3a..5b88773 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -65,6 +65,13 @@ static struct pvclock_vsyscall_time_info
static struct pvclock_wall_clock wall_clock __decrypted;
static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);

+
+/* This should cover up to 512 vCPUs (the first 64 are covered by hv_clock_boot[]). */
+#define HVC_DECRYPTED_ARRAY_SIZE \
+ ((PAGE_SIZE * 7) / sizeof(struct pvclock_vsyscall_time_info))
+static struct pvclock_vsyscall_time_info
+ hv_clock_dec[HVC_DECRYPTED_ARRAY_SIZE] __decrypted_hvclock;
+
static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
{
return &this_cpu_read(hv_clock_per_cpu)->pvti;
@@ -267,10 +274,19 @@ static int kvmclock_setup_percpu(unsigned int cpu)
return 0;

/* Use the static page for the first CPUs, allocate otherwise */
- if (cpu < HVC_BOOT_ARRAY_SIZE)
+ if (cpu < HVC_BOOT_ARRAY_SIZE) {
p = &hv_clock_boot[cpu];
- else
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ } else {
+ /*
+ * When SEV is active, use the static pages from the
+ * .data..decrypted_hvclock section. The pages are already
+ * mapped with C=0.
+ */
+ if (sev_active())
+ p = &hv_clock_dec[cpu - HVC_BOOT_ARRAY_SIZE];
+ else
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ }

per_cpu(hv_clock_per_cpu, cpu) = p;
return p ? 0 : -ENOMEM;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 78d3169..1aec291 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -101,6 +101,9 @@ PHDRS {
. = ALIGN(PMD_SIZE); \
__start_data_decrypted = .; \
*(.data..decrypted); \
+ . = ALIGN(PAGE_SIZE); \
+ __start_data_decrypted_hvclock = .; \
+ *(.data..decrypted_hvclock); \
. = ALIGN(PMD_SIZE); \
__end_data_decrypted = .; \

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7a8fc26..052b279 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -815,9 +815,12 @@ void free_kernel_image_pages(void *begin, void *end)
set_memory_np_noalias(begin_ul, len_pages);
}

+void __weak free_decrypted_mem(void) { }
+
void __ref free_initmem(void)
{
e820__reallocate_tables();
+ free_decrypted_mem();

free_kernel_image_pages(&__init_begin, &__init_end);
}
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index b2de398..865b1ad 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -348,6 +348,16 @@ bool sev_active(void)
EXPORT_SYMBOL(sev_active);

/* Architecture __weak replacement functions */
+void __init free_decrypted_mem(void)
+{
+ if (mem_encrypt_active())
+ return;
+
+ free_init_pages("unused decrypted",
+ (unsigned long)__start_data_decrypted_hvclock,
+ (unsigned long)__end_data_decrypted);
+}
+
void __init mem_encrypt_init(void)
{
if (!sme_me_mask)
--
2.7.4