Re: [RFC PATCH] x86, kvm: use kvmclock to compute TSC deadline value

From: David Matlack
Date: Wed Jul 06 2016 - 13:17:38 EST


On Tue, Jul 5, 2016 at 10:36 AM, Paolo Bonzini <pbonzini@xxxxxxxxxx> wrote:
> Bad things happen if a guest using the TSC deadline timer is migrated.
> The guest doesn't re-calibrate the TSC after migration, and the
> TSC frequency can and will change unless your processor supports TSC
> scaling (on Intel this is only Skylake) or your data center is perfectly
> homogeneous.
>
> The solution in this patch is to skip tsc_khz, and instead derive the
> frequency from kvmclock's (mult, shift) pair. Because kvmclock
> parameters convert from tsc to nanoseconds, this needs a division
> but that's the least of our problems when the TSC_DEADLINE_MSR write
> costs 2000 clock cycles. Luckily tsc_khz is really used by very little
> outside the tsc clocksource (which kvmclock replaces) and the TSC
> deadline timer.

Two other ways to solve the problem, I don't know if you've considered:
* Constrain the set of hosts a given VM can run on based on the TSC
rate. (So don't need a perfectly homogeneous fleet, just need each VM
to be constrained to a homogeneous subset).
* Disable the TSC deadline timer from QEMU by assigning a CPUID with
the TSC capability zeroed (at least among VMs which could migrate to
hosts with different TSC rates). These VMs will use the APIC timer
which runs at a nice fixed rate.

>
> This patch does not handle the very first deadline, hoping that it
> is masked by the migration downtime (i.e. that the timer fires late
> anyway). I'd like a remark on the approach in general and ideas on how
> to handle the first deadline. It's also possible to just blacklist the
> TSC deadline timer of course, and it's probably the best thing to do for
> stable kernels.

> It would require extending to other modes the
> implementation of preemption-timer based APIC timer.

This would be nice to have.

> It'd be a pity to
> lose the nice latency boost that the preemption timer offers.
>
> The patch is also quite ugly in the way it arranges for kvmclock to
> replace only a small part of setup_apic_timer; better ideas are welcome.
>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> ---
> arch/x86/include/asm/apic.h | 2 ++
> arch/x86/include/asm/x86_init.h | 5 +++
> arch/x86/kernel/apic/apic.c | 15 +++++---
> arch/x86/kernel/kvmclock.c | 78 +++++++++++++++++++++++++++++++++++++++++
> arch/x86/kernel/x86_init.c | 1 +
> 5 files changed, 96 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> index bc27611fa58f..cc4f45f66218 100644
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -135,6 +135,7 @@ extern void init_apic_mappings(void);
> void register_lapic_address(unsigned long address);
> extern void setup_boot_APIC_clock(void);
> extern void setup_secondary_APIC_clock(void);
> +extern void setup_APIC_clockev(struct clock_event_device *levt);
> extern int APIC_init_uniprocessor(void);
>
> #ifdef CONFIG_X86_64
> @@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { }
> static inline void disable_local_APIC(void) { }
> # define setup_boot_APIC_clock x86_init_noop
> # define setup_secondary_APIC_clock x86_init_noop
> +# define setup_APIC_clockev NULL
> #endif /* !CONFIG_X86_LOCAL_APIC */
>
> #ifdef CONFIG_X86_X2APIC
> diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
> index 4dcdf74dfed8..d0f099ab4dba 100644
> --- a/arch/x86/include/asm/x86_init.h
> +++ b/arch/x86/include/asm/x86_init.h
> @@ -7,6 +7,7 @@ struct mpc_bus;
> struct mpc_cpu;
> struct mpc_table;
> struct cpuinfo_x86;
> +struct clock_event_device;
>
> /**
> * struct x86_init_mpparse - platform specific mpparse ops
> @@ -84,11 +85,15 @@ struct x86_init_paging {
> * boot cpu
> * @timer_init: initialize the platform timer (default PIT/HPET)
> * @wallclock_init: init the wallclock device
> + * @setup_APIC_clockev: tweak the clock_event_device for the LAPIC timer,
> + * if setup_boot_APIC_clock and/or
> + * setup_secondary_APIC_clock are in use
> */
> struct x86_init_timers {
> void (*setup_percpu_clockev)(void);
> void (*timer_init)(void);
> void (*wallclock_init)(void);
> + void (*setup_APIC_clockev)(struct clock_event_device *levt);
> };
>
> /**
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index 60078a67d7e3..b7a331f329d0 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -558,15 +558,20 @@ static void setup_APIC_timer(void)
> memcpy(levt, &lapic_clockevent, sizeof(*levt));
> levt->cpumask = cpumask_of(smp_processor_id());
>
> + x86_init.timers.setup_APIC_clockev(levt);
> + clockevents_register_device(levt);
> +}
> +
> +void setup_APIC_clockev(struct clock_event_device *levt)
> +{
> if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
> levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
> CLOCK_EVT_FEAT_DUMMY);
> + levt->min_delta_ticks = 0xF;
> + levt->max_delta_ticks = ~0UL;
> levt->set_next_event = lapic_next_deadline;
> - clockevents_config_and_register(levt,
> - (tsc_khz / TSC_DIVISOR) * 1000,
> - 0xF, ~0UL);
> - } else
> - clockevents_register_device(levt);
> + clockevents_config(levt, (tsc_khz / TSC_DIVISOR) * 1000);
> + }
> }
>
> /*
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 1d39bfbd26bb..61e094207339 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -17,6 +17,7 @@
> */
>
> #include <linux/clocksource.h>
> +#include <linux/clockchips.h>
> #include <linux/kvm_para.h>
> #include <asm/pvclock.h>
> #include <asm/msr.h>
> @@ -245,6 +246,82 @@ static void kvm_shutdown(void)
> native_machine_shutdown();
> }
>
> +/*
> + * We already have the inverse of the (mult,shift) pair, though this means
> + * we need a division. To avoid it we could compute a multiplicative inverse
> + * every time src->version changes.
> + */
> +#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38
> +#define KVMCLOCK_TSC_DEADLINE_MAX ((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1)
> +
> +static int lapic_next_ktime(ktime_t expires,
> + struct clock_event_device *evt)
> +{
> + u64 ns, tsc;
> + u32 version;
> + int cpu;
> + struct pvclock_vcpu_time_info *src;
> +
> + cpu = smp_processor_id();
> + src = &hv_clock[cpu].pvti;
> + ns = ktime_to_ns(expires);
> + do {
> + u64 delta_ns;
> + int shift;
> +
> + version = src->version;
> + virt_rmb();
> + if (unlikely(ns < src->system_time)) {
> + tsc = src->tsc_timestamp;
> + virt_rmb();
> + continue;
> + }
> +
> + delta_ns = ns - src->system_time;
> +
> + /* Cap the wait to avoid overflow. */
> + if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX))
> + delta_ns = KVMCLOCK_TSC_DEADLINE_MAX;
> +
> + /*
> + * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul.
> + * The shift is split in two steps so that a 38 bits (275 s)
> + * deadline fits into the 64-bit dividend.
> + */
> + shift = 32 - src->tsc_shift;
> +
> + /* First shift step... */
> + delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
> + shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
> +
> + /* ... division... */
> + tsc = div_u64(delta_ns, src->tsc_to_system_mul);
> +
> + /* ... and second shift step for the remaining bits. */
> + if (shift >= 0)
> + tsc <<= shift;
> + else
> + tsc >>= -shift;
> +
> + tsc += src->tsc_timestamp;
> + virt_rmb();
> + } while((src->version & 1) || version != src->version);
> +
> + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc);
> + return 0;
> +}
> +
> +
> +static void kvm_setup_tsc_deadline_timer(struct clock_event_device *levt)
> +{
> + if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
> + levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
> + CLOCK_EVT_FEAT_DUMMY);
> + levt->features |= CLOCK_EVT_FEAT_KTIME;
> + levt->set_next_ktime = lapic_next_ktime;
> + }
> +}
> +
> void __init kvmclock_init(void)
> {
> struct pvclock_vcpu_time_info *vcpu_time;
> @@ -288,6 +365,7 @@ void __init kvmclock_init(void)
> kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
> put_cpu();
>
> + x86_init.timers.setup_APIC_clockev = kvm_setup_tsc_deadline_timer;
> x86_platform.calibrate_tsc = kvm_get_tsc_khz;
> x86_platform.get_wallclock = kvm_get_wallclock;
> x86_platform.set_wallclock = kvm_set_wallclock;
> diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
> index dad5fe9633a3..b85a323208dc 100644
> --- a/arch/x86/kernel/x86_init.c
> +++ b/arch/x86/kernel/x86_init.c
> @@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = {
> .setup_percpu_clockev = setup_boot_APIC_clock,
> .timer_init = hpet_time_init,
> .wallclock_init = x86_init_noop,
> + .setup_APIC_clockev = setup_APIC_clockev,
> },
>
> .iommu = {
> --
> 1.8.3.1
>