[PATCH 3/3] KVM: VMX: make MSR bitmaps per-VCPU

From: Paolo Bonzini
Date: Tue Jan 16 2018 - 11:53:56 EST


Place the MSR bitmap in struct loaded_vmcs, and update it in place
every time the x2apic or APICv state can change. This may not sound
very efficient, but (with the exception of nested virt) such state
changes are rare, and the update loop handles 64 MSRs per iteration,
in a similar fashion to nested_vmx_prepare_msr_bitmap.

Suggested-by: Jim Mattson <jmattson@xxxxxxxxxx>
Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
---
arch/x86/kvm/vmx.c | 200 ++++++++++++++++++++++++++++-------------------------
1 file changed, 106 insertions(+), 94 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d8057fac336a..a540f172d032 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -208,6 +208,7 @@ struct loaded_vmcs {
int soft_vnmi_blocked;
ktime_t entry_time;
s64 vnmi_blocked_time;
+ unsigned long *msr_bitmap;
struct list_head loaded_vmcss_on_cpu_link;
};

@@ -455,8 +456,6 @@ struct nested_vmx {
bool pi_pending;
u16 posted_intr_nv;

- unsigned long *msr_bitmap;
-
struct hrtimer preemption_timer;
bool preemption_timer_expired;

@@ -896,6 +895,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
u16 error_code);
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -913,9 +913,6 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);

enum {
- VMX_MSR_BITMAP,
- VMX_MSR_BITMAP_X2APIC_APICV,
- VMX_MSR_BITMAP_X2APIC,
VMX_VMREAD_BITMAP,
VMX_VMWRITE_BITMAP,
VMX_BITMAP_NR
@@ -923,9 +920,6 @@ enum {

static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

-#define vmx_msr_bitmap (vmx_bitmap[VMX_MSR_BITMAP])
-#define vmx_msr_bitmap_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_X2APIC_APICV])
-#define vmx_msr_bitmap_x2apic (vmx_bitmap[VMX_MSR_BITMAP_X2APIC])
#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])

@@ -2524,26 +2518,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
vmx->guest_msrs[from] = tmp;
}

-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
-{
- unsigned long *msr_bitmap;
-
- if (is_guest_mode(vcpu))
- msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
- else if (cpu_has_secondary_exec_ctrls() &&
- (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
- msr_bitmap = vmx_msr_bitmap_x2apic_apicv;
- else
- msr_bitmap = vmx_msr_bitmap_x2apic;
- } else {
- msr_bitmap = vmx_msr_bitmap;
- }
-
- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
-}
-
/*
* Set up the vmcs to automatically save and restore system
* msrs. Don't touch the 64-bit msrs if the guest is in legacy
@@ -2584,7 +2558,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx->save_nmsrs = save_nmsrs;

if (cpu_has_vmx_msr_bitmap())
- vmx_set_msr_bitmap(&vmx->vcpu);
+ vmx_update_msr_bitmap(&vmx->vcpu);
}

/*
@@ -3796,6 +3770,8 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
loaded_vmcs_clear(loaded_vmcs);
free_vmcs(loaded_vmcs->vmcs);
loaded_vmcs->vmcs = NULL;
+ if (loaded_vmcs->msr_bitmap)
+ free_page((unsigned long)loaded_vmcs->msr_bitmap);
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}

@@ -3825,7 +3801,18 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)

loaded_vmcs->shadow_vmcs = NULL;
loaded_vmcs_init(loaded_vmcs);
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!loaded_vmcs->msr_bitmap)
+ goto out_vmcs;
+ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+ }
return 0;
+
+out_vmcs:
+ free_loaded_vmcs(loaded_vmcs);
+ return -ENOMEM;
}

static void free_kvm_area(void)
@@ -4920,8 +4907,8 @@ static void free_vpid(int vpid)

#define MSR_TYPE_R 1
#define MSR_TYPE_W 2
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type)
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
{
int f = sizeof(unsigned long);

@@ -4955,6 +4942,41 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
}
}

+static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __set_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __set_bit(msr, msr_bitmap + 0xc00 / f);
+
+ }
+}
+
/*
* If a msr is allowed by L0, we should check whether it is allowed by L1.
* The corresponding bit will be cleared unless both of L0 and L1 allow it.
@@ -4996,28 +5018,48 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
}
}

-static void vmx_disable_intercept_for_msr(u32 msr)
-{
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap,
- msr, MSR_TYPE_R | MSR_TYPE_W);
-}
-
-static void vmx_enable_intercept_for_msr(u32 msr)
-{
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap,
- msr, MSR_TYPE_R | MSR_TYPE_W);
-}
-
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))

-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_only)
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
{
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_x2apic_apicv, msr, type);
- if (!apicv_only) {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_x2apic, msr, type);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ bool x2apic, apicv;
+ int msr;
+
+ x2apic = (cpu_has_secondary_exec_ctrls() &&
+ (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE));
+ apicv = x2apic && enable_apicv && kvm_vcpu_apicv_active(vcpu);
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+ msr_bitmap[word] = apicv ? 0 : ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
}
+
+ if (!x2apic)
+ return;
+
+ /*
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
+ */
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI),
+ MSR_TYPE_R | MSR_TYPE_W);
+ if (!apicv)
+ return;
+
+ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT),
+ MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI),
+ MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI),
+ MSR_TYPE_W);
}

+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
{
return enable_apicv;
@@ -5269,7 +5311,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
}

if (cpu_has_vmx_msr_bitmap())
- vmx_set_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap(vcpu);
}

static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@@ -5458,7 +5500,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
}
if (cpu_has_vmx_msr_bitmap())
- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap));
+ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));

vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

@@ -6748,7 +6790,7 @@ void vmx_enable_tdp(void)

static __init int hardware_setup(void)
{
- int r = -ENOMEM, i, msr;
+ int r = -ENOMEM, i;

rdmsrl_safe(MSR_EFER, &host_efer);

@@ -6764,8 +6806,6 @@ static __init int hardware_setup(void)
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

- memset(vmx_msr_bitmap, 0xff, PAGE_SIZE);
-
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
goto out;
@@ -6829,32 +6869,8 @@ static __init int hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 48;
}

- vmx_disable_intercept_for_msr(MSR_FS_BASE);
- vmx_disable_intercept_for_msr(MSR_GS_BASE);
- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP);
-
- memcpy(vmx_msr_bitmap_x2apic_apicv, vmx_msr_bitmap, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_x2apic, vmx_msr_bitmap, PAGE_SIZE);
-
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

- for (msr = 0x800; msr <= 0x8ff; msr++) {
- if (msr == X2APIC_MSR(APIC_TMCCT))
- continue;
- vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
- }
-
- /*
- * TPR reads and writes can be virtualized even if virtual interrupt
- * delivery is not in use.
- */
- vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W, false);
- vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_EOI), MSR_TYPE_W, true);
- vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W, true);
-
if (enable_ept)
vmx_enable_tdp();
else
@@ -7148,14 +7164,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (r < 0)
goto out_vmcs02;

- if (cpu_has_vmx_msr_bitmap()) {
- vmx->nested.msr_bitmap =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx->nested.msr_bitmap)
- goto out_msr_bitmap;
- memset(vmx->nested.msr_bitmap, 0xff, PAGE_SIZE);
- }
-
vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
@@ -7182,9 +7190,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
kfree(vmx->nested.cached_vmcs12);

out_cached_vmcs12:
- free_page((unsigned long)vmx->nested.msr_bitmap);
-
-out_msr_bitmap:
vmx_nested_free_vmcs02(vmx);

out_vmcs02:
@@ -7330,10 +7335,6 @@ static void free_nested(struct vcpu_vmx *vmx)
free_vpid(vmx->nested.vpid02);
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
- if (vmx->nested.msr_bitmap) {
- free_page((unsigned long)vmx->nested.msr_bitmap);
- vmx->nested.msr_bitmap = NULL;
- }
if (enable_shadow_vmcs) {
vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
@@ -8835,7 +8836,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
}
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);

- vmx_set_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap(vcpu);
}

static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
@@ -9487,6 +9488,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ unsigned long *msr_bitmap;
int cpu;

if (!vmx)
@@ -9523,6 +9525,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (err < 0)
goto free_msrs;

+ msr_bitmap = vmx->vmcs01.msr_bitmap;
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_R | MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_R | MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_R | MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_R | MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_R | MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_R | MSR_TYPE_W);
+
vmx->loaded_vmcs = &vmx->vmcs01;
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -9983,7 +9993,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
int msr;
struct page *page;
unsigned long *msr_bitmap_l1;
- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
+ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;

/* Nothing to do if the MSR bitmap is not in use. */
if (!cpu_has_vmx_msr_bitmap() ||
@@ -10416,6 +10426,8 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
}
+
+ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
}

/*
@@ -11393,7 +11405,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

if (cpu_has_vmx_msr_bitmap())
- vmx_set_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap(vcpu);

if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
--
1.8.3.1