This patch accounts for the time Xen steals from our VCPUs.  Stolen time
is time during which a VCPU is runnable and could be running, but all
available physical CPUs are being used for something else.

This accounting is run on each timer interrupt, simply as a way to run
it relatively often, and at times when interesting things are going on.
Stolen time is not used for much in the kernel; it is reported in
/proc/stat, and that's about it.

Signed-off-by: Jeremy Fitzhardinge
Acked-by: Chris Wright
Cc: john stultz
Cc: Rik van Riel

---
 arch/i386/xen/time.c |  163 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 153 insertions(+), 10 deletions(-)

===================================================================
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -11,6 +11,7 @@
 #include <linux/interrupt.h>
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
+#include <linux/kernel_stat.h>

 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
@@ -23,6 +24,7 @@

 #define XEN_SHIFT 22
 #define TIMER_SLOP	100000	/* Xen may fire a timer up to this many ns early */
+#define NS_PER_TICK	(1000000000ll / HZ)

 /* These are periodically updated in shared_info, and then copied here. */
 struct shadow_time_info {
@@ -35,6 +37,141 @@ struct shadow_time_info {

 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

+/* runstate info updated by Xen */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+
+/* unused ns of stolen and blocked time */
+static DEFINE_PER_CPU(u64, residual_stolen);
+static DEFINE_PER_CPU(u64, residual_blocked);
+
+/* return a consistent snapshot of a 64-bit time/counter value */
+static u64 get64(const u64 *p)
+{
+	u64 ret;
+
+	if (BITS_PER_LONG < 64) {
+		u32 *p32 = (u32 *)p;
+		u32 h, l;
+
+		/*
+		 * Read high then low, and then make sure high is
+		 * still the same; this will only loop if low wraps
+		 * and carries into high.
+		 * XXX some clean way to make this endian-proof?
+		 */
+		do {
+			h = p32[1];
+			barrier();
+			l = p32[0];
+			barrier();
+		} while (p32[1] != h);
+
+		ret = (((u64)h) << 32) | l;
+	} else
+		ret = *p;
+
+	return ret;
+}
+
+/*
+ * Runstate accounting
+ */
+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+	u64 state_time;
+	struct vcpu_runstate_info *state;
+
+	preempt_disable();
+
+	state = &__get_cpu_var(runstate);
+
+	/*
+	 * The runstate info is always updated by the hypervisor on
+	 * the current CPU, so there's no need to use anything
+	 * stronger than a compiler barrier when fetching it.
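+	 *
+	 * state_entry_time changes on every runstate update, so
+	 * re-reading it after the copy below acts as a version
+	 * check: if it moved, the snapshot may be torn and we
+	 * retry.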
+	 */
+	do {
+		state_time = get64(&state->state_entry_time);
+		barrier();
+		*res = *state;
+		barrier();
+	} while (get64(&state->state_entry_time) != state_time);
+
+	preempt_enable();
+}
+
+static void setup_runstate_info(void)
+{
+	struct vcpu_register_runstate_memory_area area;
+
+	area.addr.v = &__get_cpu_var(runstate);
+
+	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
+			       smp_processor_id(), &area))
+		BUG();
+
+	get_runstate_snapshot(&__get_cpu_var(runstate_snapshot));
+}
+
+static void do_stolen_accounting(void)
+{
+	struct vcpu_runstate_info state;
+	struct vcpu_runstate_info *snap;
+	s64 blocked, runnable, offline, stolen;
+	cputime_t ticks;
+
+	get_runstate_snapshot(&state);
+
+	WARN_ON(state.state != RUNSTATE_running);
+
+	snap = &__get_cpu_var(runstate_snapshot);
+
+	/* work out how much time the VCPU has not been runn*ing* */
+	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
+	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
+	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
+
+	*snap = state;
+
+	/* Add the appropriate number of ticks of stolen time,
+	   including any left-overs from last time.  Passing NULL to
+	   account_steal_time accounts the time as stolen. */
+	stolen = runnable + offline + __get_cpu_var(residual_stolen);
+
+	if (stolen < 0)
+		stolen = 0;
+
+	ticks = 0;
+	while (stolen >= NS_PER_TICK) {
+		ticks++;
+		stolen -= NS_PER_TICK;
+	}
+	__get_cpu_var(residual_stolen) = stolen;
+	account_steal_time(NULL, ticks);
+
+	/* Add the appropriate number of ticks of blocked time,
+	   including any left-overs from last time.  Passing idle to
+	   account_steal_time accounts the time as idle/wait. */
+	blocked += __get_cpu_var(residual_blocked);
+
+	if (blocked < 0)
+		blocked = 0;
+
+	ticks = 0;
+	while (blocked >= NS_PER_TICK) {
+		ticks++;
+		blocked -= NS_PER_TICK;
+	}
+	__get_cpu_var(residual_blocked) = blocked;
+	account_steal_time(idle_task(smp_processor_id()), ticks);
+}
+
+
+
+/* Get the CPU speed from Xen */
 unsigned long xen_cpu_khz(void)
 {
 	u64 cpu_khz = 1000000ULL << 32;
@@ -54,12 +191,10 @@ unsigned long xen_cpu_khz(void)
  * Reads a consistent set of time-base values from Xen, into a shadow data
  * area.
  */
-static void get_time_values_from_xen(void)
+static unsigned get_time_values_from_xen(void)
 {
 	struct vcpu_time_info *src;
 	struct shadow_time_info *dst;
-
-	preempt_disable();

 	src = &__get_cpu_var(xen_vcpu)->time;
 	dst = &__get_cpu_var(shadow_time);
@@ -74,7 +209,7 @@ static void get_time_values_from_xen(voi
 		rmb();
 	} while ((src->version & 1) | (dst->version ^ src->version));

-	preempt_enable();
+	return dst->version;
 }

 /*
@@ -118,7 +253,7 @@ static u64 get_nsec_offset(struct shadow
 static u64 get_nsec_offset(struct shadow_time_info *shadow)
 {
 	u64 now, delta;
-	rdtscll(now);
+	now = native_read_tsc();
 	delta = now - shadow->tsc_timestamp;
 	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
 }
@@ -127,10 +262,14 @@ cycle_t xen_clocksource_read(void)
 {
 	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
 	cycle_t ret;
-
-	get_time_values_from_xen();
-
-	ret = shadow->system_timestamp + get_nsec_offset(shadow);
+	unsigned version;
+
+	do {
+		version = get_time_values_from_xen();
+		barrier();
+		ret = shadow->system_timestamp + get_nsec_offset(shadow);
+		barrier();
+	} while (version != __get_cpu_var(xen_vcpu)->time.version);

 	put_cpu_var(shadow_time);
@@ -347,6 +486,8 @@ static irqreturn_t xen_timer_interrupt(i
 		ret = IRQ_HANDLED;
 	}

+	do_stolen_accounting();
+
 	return ret;
 }
@@ -373,6 +514,8 @@ static void xen_setup_timer(int cpu)
 	evt->irq = irq;
 	clockevents_register_device(evt);

+	setup_runstate_info();
+
 	put_cpu_var(xen_clock_events);
 }
@@ -385,7 +528,7 @@ __init void xen_time_init(void)
 	clocksource_register(&xen_clocksource);

 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
-		/* Successfully turned off 100hz tick, so we have the
+		/* Successfully turned off 100Hz tick, so we have the
 		   vcpuop-based timer interface */
 		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
 		xen_clockevent = &xen_vcpuop_clockevent;
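
A note on the tick conversion in do_stolen_accounting(): stolen and
blocked nanoseconds are paid out in whole ticks, and the sub-tick
remainder is carried in residual_stolen/residual_blocked so that no
time is lost to rounding.  Below is a minimal stand-alone user-space
sketch of that remainder-carrying loop; the HZ value and the
account_stolen() helper are illustrative stand-ins, not kernel code.

#include <stdio.h>
#include <stdint.h>

#define HZ 250				/* stand-in tick rate, not the kernel's */
#define NS_PER_TICK (1000000000LL / HZ)

static int64_t residual_stolen;		/* ns left over from the last pass */

/* Convert a stolen-time delta (ns) into whole ticks, carrying the
   sub-tick remainder forward the way the patch does. */
static unsigned account_stolen(int64_t stolen_ns)
{
	int64_t stolen = stolen_ns + residual_stolen;
	unsigned ticks = 0;

	if (stolen < 0)
		stolen = 0;

	while (stolen >= NS_PER_TICK) {
		ticks++;
		stolen -= NS_PER_TICK;
	}
	residual_stolen = stolen;
	return ticks;
}

int main(void)
{
	/* 2.5 ticks stolen now, 0.6 later: the two sub-tick
	   remainders combine into a whole tick on the second pass */
	printf("%u ticks\n", account_stolen(NS_PER_TICK * 5 / 2));	/* 2 */
	printf("%u ticks\n", account_stolen(NS_PER_TICK * 3 / 5));	/* 1 */
	return 0;
}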
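
Similarly, get_runstate_snapshot() and the reworked
xen_clocksource_read() share one lock-free pattern: read a version
field (state_entry_time plays that role for the runstate), copy the
data, and retry if the version moved in the meantime.  Below is a
stand-alone C11 model of that reader/writer protocol, assuming a
writer that increments the version before and after each update; all
names here are illustrative, not from the patch.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct shared_time {
	atomic_uint version;		/* odd while an update is in flight */
	uint64_t timestamp;
	uint64_t offset;
};

static void writer_update(struct shared_time *s, uint64_t ts, uint64_t off)
{
	atomic_fetch_add(&s->version, 1);	/* now odd: update started */
	s->timestamp = ts;
	s->offset = off;
	atomic_fetch_add(&s->version, 1);	/* even again: update done */
}

static uint64_t reader_snapshot(struct shared_time *s)
{
	unsigned v;
	uint64_t ts, off;

	do {
		v = atomic_load(&s->version);
		ts = s->timestamp;
		off = s->offset;
		atomic_thread_fence(memory_order_acquire);
		/* retry if an update was in flight or hit us mid-copy */
	} while ((v & 1) || atomic_load(&s->version) != v);

	return ts + off;
}

int main(void)
{
	struct shared_time s = { .version = 0 };

	writer_update(&s, 1000, 42);
	printf("consistent read: %llu\n",
	       (unsigned long long)reader_snapshot(&s));
	return 0;
}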