Re: [PATCHv3 7/8] kvm: host side for eoi optimization

From: Michael S. Tsirkin
Date: Wed May 16 2012 - 02:16:48 EST


On Tue, May 15, 2012 at 10:55:19PM -0300, Marcelo Tosatti wrote:
> On Tue, May 15, 2012 at 05:36:19PM +0300, Michael S. Tsirkin wrote:
> > Implementation of PV EOI using shared memory.
> > This reduces the number of exits an interrupt
> > causes by as much as half.
> >
> > The idea is simple: there's a bit, per APIC, in guest memory,
> > that tells the guest that it does not need EOI.
> > We set it before injecting an interrupt and clear
> > before injecting a nested one. The guest tests it using
> > a test-and-clear operation - this is necessary
> > so that the host can detect interrupt nesting -
> > and if it was set, the guest can skip the EOI MSR write.
> >
> > There's a new MSR to set the address of said register
> > in guest memory. Otherwise not much changed:
> > - Guest EOI is not required
> > - Register is tested & ISR is automatically cleared on exit
> > For testing results, see the description of patch 7 of the series.
> >
> > Signed-off-by: Michael S. Tsirkin <mst@xxxxxxxxxx>
> > ---
> > arch/x86/include/asm/kvm_host.h | 6 ++
> > arch/x86/include/asm/kvm_para.h | 2 +
> > arch/x86/kvm/cpuid.c | 1 +
> > arch/x86/kvm/irq.c | 2 +-
> > arch/x86/kvm/lapic.c | 110 +++++++++++++++++++++++++++++++++++++--
> > arch/x86/kvm/lapic.h | 2 +
> > arch/x86/kvm/trace.h | 34 ++++++++++++
> > arch/x86/kvm/x86.c | 8 +++-
> > 8 files changed, 158 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 32297f2..7673f4d 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -174,6 +174,7 @@ enum {
> >
> > /* apic attention bits */
> > #define KVM_APIC_CHECK_VAPIC 0
> > +#define KVM_APIC_EOI_PENDING 1
> >
> > /*
> > * We don't want allocation failures within the mmu code, so we preallocate
> > @@ -485,6 +486,11 @@ struct kvm_vcpu_arch {
> > u64 length;
> > u64 status;
> > } osvw;
> > +
> > + struct {
> > + u64 msr_val;
> > + struct gfn_to_hva_cache data;
> > + } eoi;
> > };
> >
> > struct kvm_lpage_info {
> > diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> > index 734c376..195840d 100644
> > --- a/arch/x86/include/asm/kvm_para.h
> > +++ b/arch/x86/include/asm/kvm_para.h
> > @@ -22,6 +22,7 @@
> > #define KVM_FEATURE_CLOCKSOURCE2 3
> > #define KVM_FEATURE_ASYNC_PF 4
> > #define KVM_FEATURE_STEAL_TIME 5
> > +#define KVM_FEATURE_EOI 6
>
> SKIP_EOI?
>
> >
> > /* The last 8 bits are used to indicate how to interpret the flags field
> > * in pvclock structure. If no bits are set, all flags are ignored.
> > @@ -37,6 +38,7 @@
> > #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
> > #define MSR_KVM_ASYNC_PF_EN 0x4b564d02
> > #define MSR_KVM_STEAL_TIME 0x4b564d03
> > +#define MSR_KVM_EOI_EN 0x4b564d04
> >
> > struct kvm_steal_time {
> > __u64 steal;
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > index eba8345..bda4877 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -408,6 +408,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
> > (1 << KVM_FEATURE_NOP_IO_DELAY) |
> > (1 << KVM_FEATURE_CLOCKSOURCE2) |
> > (1 << KVM_FEATURE_ASYNC_PF) |
> > + (1 << KVM_FEATURE_EOI) |
> > (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
> >
> > if (sched_info_on())
> > diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> > index 7e06ba1..07bdfab 100644
> > --- a/arch/x86/kvm/irq.c
> > +++ b/arch/x86/kvm/irq.c
> > @@ -48,7 +48,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
> > if (!irqchip_in_kernel(v->kvm))
> > return v->arch.interrupt.pending;
> >
> > - if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
> > + if (kvm_apic_has_interrupt(v) < 0) { /* LAPIC */
> > if (kvm_apic_accept_pic_intr(v)) {
> > s = pic_irqchip(v->kvm); /* PIC */
> > return s->output;
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 93c1574..c7e6ffb 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -270,6 +270,54 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
> > irq->level, irq->trig_mode);
> > }
> >
> > +static int eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
> > +{
> > +
> > + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.eoi.data, &val,
> > + sizeof(val));
> > +}
> > +
> > +static int eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
> > +{
> > +
> > + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.eoi.data, val,
> > + sizeof(*val));
> > +}
> > +
> > +static inline bool eoi_enabled(struct kvm_vcpu *vcpu)
> > +{
> > + return vcpu->arch.eoi.msr_val & KVM_MSR_ENABLED;
> > +}
>
> Please find a more descriptive name, such as pv_eoi_* (or skip_eoi,
> or...).
>
> > +
> > +static bool eoi_get_pending(struct kvm_vcpu *vcpu)
> > +{
> > + u8 val;
> > + if (eoi_get_user(vcpu, &val) < 0)
> > + apic_debug("Can't read EOI MSR value: 0x%llx\n",
> > + (unsigned long long)vcpu->arch.eoi.msr_val);
> > + return val & 0x1;
> > +}
> > +
> > +static void eoi_set_pending(struct kvm_vcpu *vcpu)
> > +{
> > + if (eoi_put_user(vcpu, 0x1) < 0) {
> > + apic_debug("Can't set EOI MSR value: 0x%llx\n",
> > + (unsigned long long)vcpu->arch.eoi.msr_val);
> > + return;
> > + }
> > + __set_bit(KVM_APIC_EOI_PENDING, &vcpu->arch.apic_attention);
> > +}
> > +
> > +static void eoi_clr_pending(struct kvm_vcpu *vcpu)
> > +{
> > + if (eoi_put_user(vcpu, 0x0) < 0) {
> > + apic_debug("Can't clear EOI MSR value: 0x%llx\n",
> > + (unsigned long long)vcpu->arch.eoi.msr_val);
> > + return;
> > + }
> > + __clear_bit(KVM_APIC_EOI_PENDING, &vcpu->arch.apic_attention);
> > +}
> > +
> > static inline int apic_find_highest_isr(struct kvm_lapic *apic)
> > {
> > int result;
> > @@ -482,16 +530,21 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
> > return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
> > }
> >
> > -static void apic_set_eoi(struct kvm_lapic *apic)
> > +static int apic_set_eoi(struct kvm_lapic *apic)
> > {
> > int vector = apic_find_highest_isr(apic);
> > +
> > + trace_kvm_eoi(apic, vector);
> > +
> > /*
> > * Not every write EOI will has corresponding ISR,
> > * one example is when Kernel check timer on setup_IO_APIC
> > */
> > if (vector == -1)
> > - return;
> > + return vector;
> >
> > + if (eoi_enabled(apic->vcpu))
> > + eoi_clr_pending(apic->vcpu);
>
> Why is this here again? If you got here, the bit should be cleared
> already?

It's here in case the guest invokes the EOI MSR instead of
clearing the bit. In that case we clear the bit on the guest's behalf.
The idea is that PV EOI is always optional, never mandatory.
I'll add a comment.
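
Something like this above the eoi_clr_pending() call (draft wording):

	/*
	 * PV EOI is optional, never mandatory: the guest may do a
	 * regular EOI write even while the PV EOI bit is still set.
	 * In that case, clear the bit on the guest's behalf so host
	 * and guest state stay in sync.
	 */

For reference, the guest side boils down to a test-and-clear on EOI,
roughly like this (a simplified sketch with made-up names, not the
exact code from the guest patch):

	static DEFINE_PER_CPU(unsigned long, kvm_pv_eoi);

	static void kvm_pv_eoi_write(void)
	{
		/* Host set the bit: clearing it replaces the EOI write. */
		if (__test_and_clear_bit(0, &__get_cpu_var(kvm_pv_eoi)))
			return;
		/* Bit was already clear (nesting): do a real EOI. */
		apic_write(APIC_EOI, APIC_EOI_ACK);
	}

The guest opts in by writing the address of that per-CPU word, with
KVM_MSR_ENABLED set, to MSR_KVM_EOI_EN.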

> > apic_clear_vector(vector, apic->regs + APIC_ISR);
> > apic_update_ppr(apic);
> >
> > @@ -505,6 +558,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
> > kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
> > }
> > kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
> > + return vector;
> > }
> >
> > static void apic_send_ipi(struct kvm_lapic *apic)
> > @@ -1085,6 +1139,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
> > atomic_set(&apic->lapic_timer.pending, 0);
> > if (kvm_vcpu_is_bsp(vcpu))
> > vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
> > + vcpu->arch.eoi.msr_val = 0;
> > apic_update_ppr(apic);
> >
> > vcpu->arch.apic_arb_prio = 0;
> > @@ -1211,9 +1266,10 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
> >
> > apic_update_ppr(apic);
> > highest_irr = apic_find_highest_irr(apic);
> > - if ((highest_irr == -1) ||
> > - ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
> > + if (highest_irr == -1)
> > return -1;
> > + if (((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
> > + return -2;
> > return highest_irr;
> > }
> >
> > @@ -1245,9 +1301,20 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
> > int vector = kvm_apic_has_interrupt(vcpu);
> > struct kvm_lapic *apic = vcpu->arch.apic;
> >
> > - if (vector == -1)
> > + /* Detect interrupt nesting and disable EOI optimization */
> > + if (eoi_enabled(vcpu) && vector == -2)
> > + eoi_clr_pending(vcpu);
> > +
> > + if (vector < 0)
> > return -1;
> >
> > + if (eoi_enabled(vcpu)) {
> > + if (kvm_ioapic_handles_vector(vcpu->kvm, vector))
> > + eoi_clr_pending(vcpu);
> > + else
> > + eoi_set_pending(vcpu);
> > + }
> > +
> > apic_set_vector(vector, apic->regs + APIC_ISR);
> > apic_update_ppr(apic);
> > apic_clear_irr(vector, apic);
> > @@ -1262,6 +1329,14 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
> > MSR_IA32_APICBASE_BASE;
> > kvm_apic_set_version(vcpu);
> >
> > + if (eoi_enabled(apic->vcpu)) {
> > + if (eoi_get_pending(apic->vcpu))
> > + __set_bit(KVM_APIC_EOI_PENDING,
> > + &vcpu->arch.apic_attention);
> > + else
> > + __clear_bit(KVM_APIC_EOI_PENDING,
> > + &vcpu->arch.apic_attention);
> > + }
>
>
> Why is this necessary? kvm_apic_post_state_restore cannot change the
> per-CPU in-memory value.

Exactly. Guest memory is migrated, but the internal flag
in apic_attention isn't. So we make sure apic_attention
is in sync with the per-CPU value in guest memory.
I'll add a comment.
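
Draft for that comment (wording to be polished):

	/*
	 * apic_attention is not migrated, but the PV EOI value in
	 * guest memory is: resync the KVM_APIC_EOI_PENDING software
	 * bit with guest memory after restore.
	 */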

> > apic_update_ppr(apic);
> > hrtimer_cancel(&apic->lapic_timer.timer);
> > update_divide_count(apic);
> > @@ -1283,11 +1358,25 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
> > hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
> > }
> >
> > +static void apic_update_eoi(struct kvm_lapic *apic)
> > +{
> > + int vector;
> > + BUG_ON(!eoi_enabled(apic->vcpu));
> > + if (eoi_get_pending(apic->vcpu))
> > + return;
> > + __clear_bit(KVM_APIC_EOI_PENDING, &apic->vcpu->arch.apic_attention);
> > + vector = apic_set_eoi(apic);
> > + trace_kvm_pv_eoi(apic, vector);
> > +}
>
> KVM_APIC_EOI_PENDING appears to be a shadow of the per-vcpu value,
> to speed up processing. If it is, please make that clear.

Exactly. I'll add a comment where this bit is defined.
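
Something along these lines next to the #define (draft wording):

	/*
	 * KVM_APIC_EOI_PENDING is a host-side shadow of the PV EOI
	 * bit in guest memory, kept in sync with the guest-visible
	 * value so that the hot path can test apic_attention instead
	 * of reading guest memory on every exit.
	 */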

> > void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
> > {
> > u32 data;
> > void *vapic;
> >
> > + if (test_bit(KVM_APIC_EOI_PENDING, &vcpu->arch.apic_attention))
> > + apic_update_eoi(vcpu->arch.apic);
> > +
> > if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
> > return;
> >
> > @@ -1394,3 +1483,14 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
> >
> > return 0;
> > }
> > +
> > +int kvm_pv_enable_apic_eoi(struct kvm_vcpu *vcpu, u64 data)
> > +{
> > + u64 addr = data & ~KVM_MSR_ENABLED;
> > + if (eoi_enabled(vcpu))
> > + eoi_clr_pending(vcpu);
> > + vcpu->arch.eoi.msr_val = data;
> > + if (!eoi_enabled(vcpu))
> > + return 0;
> > + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.eoi.data, addr);
> > +}
> > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> > index 6f4ce25..042dace 100644
> > --- a/arch/x86/kvm/lapic.h
> > +++ b/arch/x86/kvm/lapic.h
> > @@ -60,4 +60,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
> > {
> > return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
> > }
> > +
> > +int kvm_pv_enable_apic_eoi(struct kvm_vcpu *vcpu, u64 data);
> > #endif
> > diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> > index 911d264..851914e 100644
> > --- a/arch/x86/kvm/trace.h
> > +++ b/arch/x86/kvm/trace.h
> > @@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq,
> > __entry->coalesced ? " (coalesced)" : "")
> > );
> >
> > +TRACE_EVENT(kvm_eoi,
> > + TP_PROTO(struct kvm_lapic *apic, int vector),
> > + TP_ARGS(apic, vector),
> > +
> > + TP_STRUCT__entry(
> > + __field( __u32, apicid )
> > + __field( int, vector )
> > + ),
> > +
> > + TP_fast_assign(
> > + __entry->apicid = apic->vcpu->vcpu_id;
> > + __entry->vector = vector;
> > + ),
> > +
> > + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
> > +);
> > +
> > +TRACE_EVENT(kvm_pv_eoi,
> > + TP_PROTO(struct kvm_lapic *apic, int vector),
> > + TP_ARGS(apic, vector),
> > +
> > + TP_STRUCT__entry(
> > + __field( __u32, apicid )
> > + __field( int, vector )
> > + ),
> > +
> > + TP_fast_assign(
> > + __entry->apicid = apic->vcpu->vcpu_id;
> > + __entry->vector = vector;
> > + ),
> > +
> > + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
> > +);
> > +
> > /*
> > * Tracepoint for nested VMRUN
> > */
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 185a2b8..bfcd97d 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -795,6 +795,7 @@ static u32 msrs_to_save[] = {
> > MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
> > HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
> > HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
> > + MSR_KVM_EOI_EN,
> > MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
> > MSR_STAR,
> > #ifdef CONFIG_X86_64
> > @@ -1653,6 +1654,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> > kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
> >
> > break;
> > + case MSR_KVM_EOI_EN:
> > + if (kvm_pv_enable_apic_eoi(vcpu, data))
> > + return 1;
> > + break;
> >
> > case MSR_IA32_MCG_CTL:
> > case MSR_IA32_MCG_STATUS:
> > @@ -5370,7 +5375,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > if (unlikely(vcpu->arch.tsc_always_catchup))
> > kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> >
> > - kvm_lapic_sync_from_vapic(vcpu);
> > + if (unlikely(vcpu->arch.apic_attention))
> > + kvm_lapic_sync_from_vapic(vcpu);
>
> This apic_attention check is unrelated, please have in a separate patch.

OK.

> Also please document that this is only possible because of interrupt window
> exiting.

Not sure I understand what you are saying here.

When we try to inject an interrupt, this causes the
destination vcpu to exit. At that point we check whether the guest
has finished handling the previous interrupt.
That's one reason for apic_attention to be set.
Another is the use of the vapic.

So if we add a comment, it belongs at kvm_lapic_sync_from_vapic,
not here in the generic code.
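
Draft for a comment at kvm_lapic_sync_from_vapic (my wording,
assuming the apic_attention check stays in vcpu_enter_guest):

	/*
	 * Interrupt injection forces the target vcpu to exit, so we
	 * are guaranteed to pass through here before the guest runs
	 * with a newly injected interrupt. That is what allows a
	 * still-pending PV EOI to be completed lazily on this path.
	 */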

> The Hyper-V spec, does it provide similar interface ? It mentions
> that "Most EOI intercepts can be eliminated and done lazily if the guest
> OS leaves a marker when it performs an EOI".

I think the kvm data path for that will be able to reuse our code.

--
MST