Re: [PATCH v3 02/34] KVM: x86: hyper-v: Introduce TLB flush ring

From: Sean Christopherson
Date: Mon May 16 2022 - 15:34:33 EST


On Thu, Apr 14, 2022, Vitaly Kuznetsov wrote:
> To allow flushing individual GVAs instead of always flushing the whole
> VPID a per-vCPU structure to pass the requests is needed. Introduce a
> simple ring write-locked structure to hold two types of entries:
> individual GVA (GFN + up to 4095 following GFNs in the lower 12 bits)
> and 'flush all'.
>
> The queuing rule is: if there's not enough space on the ring to put
> the request and leave at least 1 entry for 'flush all' - put 'flush
> all' entry.
>
> The size of the ring is arbitrarily set to '16'.
>
> Note, kvm_hv_flush_tlb() only queues 'flush all' entries for now so
> there's very small functional change but the infrastructure is
> prepared to handle individual GVA flush requests.
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
> ---
> arch/x86/include/asm/kvm_host.h | 16 +++++++
> arch/x86/kvm/hyperv.c | 83 +++++++++++++++++++++++++++++++++
> arch/x86/kvm/hyperv.h | 13 ++++++
> arch/x86/kvm/x86.c | 5 +-
> arch/x86/kvm/x86.h | 1 +
> 5 files changed, 116 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 1de3ad9308d8..b4dd2ff61658 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -578,6 +578,20 @@ struct kvm_vcpu_hv_synic {
> bool dont_zero_synic_pages;
> };
>
> +#define KVM_HV_TLB_FLUSH_RING_SIZE (16)
> +
> +struct kvm_vcpu_hv_tlb_flush_entry {
> + u64 addr;

"addr" is misleading; it is overloaded to hold both the virtual address and the count.
I think we make it a moot point, but it led me astray in thinking we could use the
lower 12 bits for flags... until I realized those bits are already in use.

> + u64 flush_all:1;
> + u64 pad:63;

This is rather odd, why not just use a bool? But why even have a "flush_all"
field, can't we just use a magic value for write_idx to indicate "flush_all"?
E.g. either an explicit #define or -1.

Writers set write_idx to -1 to indicate "flush all", vCPU/reader goes straight
to "flush all" if write_idx is -1/invalid. That way, future writes can simply do
nothing until read_idx == write_idx, and the vCPU/reader avoids unnecessary flushes
if there's a "flush all" pending and other valid entries in the ring.

And it allows deferring the "flush all" until the ring is truly full (unless there's
an off-by-one / wraparound edge case I'm missing, which is likely...).

---
arch/x86/include/asm/kvm_host.h | 8 +-----
arch/x86/kvm/hyperv.c | 47 +++++++++++++--------------------
2 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b6b9a71a4591..bb45cc383ce4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -605,16 +605,10 @@ enum hv_tlb_flush_rings {
HV_NR_TLB_FLUSH_RINGS,
};

-struct kvm_vcpu_hv_tlb_flush_entry {
- u64 addr;
- u64 flush_all:1;
- u64 pad:63;
-};
-
struct kvm_vcpu_hv_tlb_flush_ring {
int read_idx, write_idx;
spinlock_t write_lock;
- struct kvm_vcpu_hv_tlb_flush_entry entries[KVM_HV_TLB_FLUSH_RING_SIZE];
+ u64 entries[KVM_HV_TLB_FLUSH_RING_SIZE];
};

/* Hyper-V per vcpu emulation context */
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 1d6927538bc7..56f06cf85282 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1837,10 +1837,13 @@ static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc
static inline int hv_tlb_flush_ring_free(struct kvm_vcpu_hv *hv_vcpu,
int read_idx, int write_idx)
{
+ if (write_idx < 0)
+ return 0;
+
if (write_idx >= read_idx)
- return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx) - 1;
+ return KVM_HV_TLB_FLUSH_RING_SIZE - (write_idx - read_idx);

- return read_idx - write_idx - 1;
+ return read_idx - write_idx;
}

static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu,
@@ -1869,6 +1872,9 @@ static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu,
*/
write_idx = tlb_flush_ring->write_idx;

+ if (write_idx < 0 && read_idx == write_idx)
+ read_idx = write_idx = 0;
+
ring_free = hv_tlb_flush_ring_free(hv_vcpu, read_idx, write_idx);
/* Full ring always contains 'flush all' entry */
if (!ring_free)
@@ -1879,21 +1885,13 @@ static void hv_tlb_flush_ring_enqueue(struct kvm_vcpu *vcpu,
* entry in case another request comes in. In case there's not enough
* space, just put 'flush all' entry there.
*/
- if (!count || count >= ring_free - 1 || !entries) {
- tlb_flush_ring->entries[write_idx].addr = 0;
- tlb_flush_ring->entries[write_idx].flush_all = 1;
- /*
- * Advance write index only after filling in the entry to
- * synchronize with lockless reader.
- */
- smp_wmb();
- tlb_flush_ring->write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
+ if (!count || count > ring_free - 1 || !entries) {
+ tlb_flush_ring->write_idx = -1;
goto out_unlock;
}

for (i = 0; i < count; i++) {
- tlb_flush_ring->entries[write_idx].addr = entries[i];
- tlb_flush_ring->entries[write_idx].flush_all = 0;
+ tlb_flush_ring->entries[write_idx] = entries[i];
write_idx = (write_idx + 1) % KVM_HV_TLB_FLUSH_RING_SIZE;
}
/*
@@ -1911,7 +1909,6 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu_hv_tlb_flush_ring *tlb_flush_ring;
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
- struct kvm_vcpu_hv_tlb_flush_entry *entry;
int read_idx, write_idx;
u64 address;
u32 count;
@@ -1940,26 +1937,18 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
/* Pairs with smp_wmb() in hv_tlb_flush_ring_enqueue() */
smp_rmb();

+ if (write_idx < 0) {
+ kvm_vcpu_flush_tlb_guest(vcpu);
+ goto out_empty_ring;
+ }
+
for (i = read_idx; i != write_idx; i = (i + 1) % KVM_HV_TLB_FLUSH_RING_SIZE) {
- entry = &tlb_flush_ring->entries[i];
-
- if (entry->flush_all)
- goto out_flush_all;
-
- /*
- * Lower 12 bits of 'address' encode the number of additional
- * pages to flush.
- */
- address = entry->addr & PAGE_MASK;
- count = (entry->addr & ~PAGE_MASK) + 1;
+ address = tlb_flush_ring->entries[i] & PAGE_MASK;
+ count = (tlb_flush_ring->entries[i] & ~PAGE_MASK) + 1;
for (j = 0; j < count; j++)
static_call(kvm_x86_flush_tlb_gva)(vcpu, address + j * PAGE_SIZE);
}
++vcpu->stat.tlb_flush;
- goto out_empty_ring;
-
-out_flush_all:
- kvm_vcpu_flush_tlb_guest(vcpu);

out_empty_ring:
tlb_flush_ring->read_idx = write_idx;

base-commit: 62592c7c742ae78eb1f1005a63965ece19e6effe
--