Re: Stolen and degraded time and schedulers

From: Jeremy Fitzhardinge
Date: Wed Mar 14 2007 - 16:32:51 EST


Dan Hecht wrote:
> Sounds good. I don't see this in your patchset you sent yesterday
> though; did you add it after sending out those patches?

Yes.

> if so, could you forward the new patch? does it explicitly prevent
> stolen time from getting accounted as user/system time or does it
> just rely on NO_HZ mode sort of happening to work that way (since the
> one shot timer is skipped ahead for missed ticks)?

Hm, not sure. It doesn't care how often it gets called; it just
accumulates results up to that point, but I'm not sure if the time would
get double accounted. Perhaps it doesn't matter when using
xen_sched_clock().

Did the get_scheduled_time -> sched_clock make sense to you?

J
Signed-off-by: Jeremy Fitzhardinge <jeremy@xxxxxxxxxxxxx>
Cc: john stultz <johnstul@xxxxxxxxxx>

---
arch/i386/xen/time.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 101 insertions(+)

===================================================================
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -2,6 +2,7 @@
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
+#include <linux/kernel_stat.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
@@ -14,6 +15,7 @@

#define XEN_SHIFT 22
#define TIMER_SLOP 100000 /* Xen may fire a timer up to this many ns early */
+#define NS_PER_TICK (1000000000ll / HZ)

static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

@@ -28,6 +30,99 @@ struct shadow_time_info {

static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

+/* runstate info updated by Xen */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+
+/* unused ns of stolen and blocked time */
+static DEFINE_PER_CPU(u64, residual_stolen);
+static DEFINE_PER_CPU(u64, residual_blocked);
+
+/*
+ Runstate accounting
+ */
+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+ u64 state_time;
+ struct vcpu_runstate_info *state;
+
+ preempt_disable();
+
+ state = &__get_cpu_var(runstate);
+
+ do {
+ state_time = state->state_entry_time;
+ barrier();
+ *res = *state;
+ barrier();
+ } while(state->state_entry_time != state_time);
+
+ preempt_enable();
+}
+
+static void setup_runstate_info(void)
+{
+ struct vcpu_register_runstate_memory_area area;
+
+ area.addr.v = &__get_cpu_var(runstate);
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
+ smp_processor_id(), &area))
+ BUG();
+
+ get_runstate_snapshot(&__get_cpu_var(runstate_snapshot));
+}
+
+static void do_stolen_accounting(void)
+{
+ struct vcpu_runstate_info state;
+ struct vcpu_runstate_info *snap;
+ u64 blocked, runnable, offline, stolen;
+ cputime_t ticks;
+
+ get_runstate_snapshot(&state);
+
+ WARN_ON(state.state != RUNSTATE_running);
+
+ snap = &__get_cpu_var(runstate_snapshot);
+
+ /* work out how much time the VCPU has not been runn*ing* */
+ blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
+ runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
+ offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
+
+ *snap = state;
+
+ /* Add the appropriate number of ticks of stolen time,
+ including any left-overs from last time. Passing NULL to
+ account_steal_time accounts the time as stolen. */
+ stolen = runnable + offline + __get_cpu_var(residual_stolen);
+ ticks = 0;
+ while(stolen >= NS_PER_TICK) {
+ ticks++;
+ stolen -= NS_PER_TICK;
+ }
+ __get_cpu_var(residual_stolen) = stolen;
+ account_steal_time(NULL, ticks);
+
+ /* Add the appropriate number of ticks of blocked time,
+ including any left-overs from last time. Passing idle to
+ account_steal_time accounts the time as idle/wait. */
+ blocked += __get_cpu_var(residual_blocked);
+ ticks = 0;
+ while(blocked >= NS_PER_TICK) {
+ ticks++;
+ blocked -= NS_PER_TICK;
+ }
+ __get_cpu_var(residual_blocked) = blocked;
+ account_steal_time(idle_task(smp_processor_id()), ticks);
+}
+
+
+
+/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
u64 cpu_khz = 1000000ULL << 32;
@@ -264,6 +359,8 @@ static irqreturn_t xen_timerop_timer_int
ret = IRQ_HANDLED;
}

+ do_stolen_accounting();
+
return ret;
}

@@ -338,6 +435,8 @@ static irqreturn_t xen_vcpuop_timer_inte
ret = IRQ_HANDLED;
}

+ do_stolen_accounting();
+
return ret;
}

@@ -380,6 +479,8 @@ static void xen_setup_timer(int cpu)
evt->cpumask = cpumask_of_cpu(cpu);
evt->irq = irq;
clockevents_register_device(evt);
+
+ setup_runstate_info();

put_cpu_var(xen_clock_events);
}
Subject: Implement xen_sched_clock

Implement xen_sched_clock, which returns the number of ns the current
vcpu has been actually in the running state (vs blocked,
runnable-but-not-running, or offline) since boot.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xxxxxxxxxxxxx>
Cc: john stultz <johnstul@xxxxxxxxxx>

---
arch/i386/xen/enlighten.c | 2 +-
arch/i386/xen/time.c | 14 ++++++++++++++
arch/i386/xen/xen-ops.h | 1 +
3 files changed, 16 insertions(+), 1 deletion(-)

===================================================================
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -664,7 +664,7 @@ static const struct paravirt_ops xen_par
.set_wallclock = xen_set_wallclock,
.get_wallclock = xen_get_wallclock,
.get_cpu_khz = xen_cpu_khz,
- .get_scheduled_cycles = native_read_tsc,
+ .sched_clock = xen_sched_clock,

#ifdef CONFIG_X86_LOCAL_APIC
.apic_write = paravirt_nop,
===================================================================
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -16,6 +16,8 @@
#define XEN_SHIFT 22
#define TIMER_SLOP 100000 /* Xen may fire a timer up to this many ns early */
#define NS_PER_TICK (1000000000ll / HZ)
+
+static cycle_t xen_clocksource_read(void);

static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

@@ -120,6 +122,18 @@ static void do_stolen_accounting(void)
account_steal_time(idle_task(smp_processor_id()), ticks);
}

+/* Xen sched_clock implementation. Returns the number of RUNNING ns */
+unsigned long long xen_sched_clock(void)
+{
+ struct vcpu_runstate_info state;
+ cycle_t now = xen_clocksource_read();
+
+ get_runstate_snapshot(&state);
+
+ WARN_ON(state.state != RUNSTATE_running);
+
+ return state.time[RUNSTATE_running] + (now - state.state_entry_time);
+}


/* Get the CPU speed from Xen */
===================================================================
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -14,6 +14,7 @@ void __init xen_time_init(void);
void __init xen_time_init(void);
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
+unsigned long long xen_sched_clock(void);

void xen_mark_init_mm_pinned(void);