linux-kernel - Re: [PATCHv3 7/8] kvm: host side for eoi optimization

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20120516085832.GE3183@redhat.com>
Date:	Wed, 16 May 2012 11:58:32 +0300
From:	"Michael S. Tsirkin" <mst@...hat.com>
To:	Marcelo Tosatti <mtosatti@...hat.com>
Cc:	x86@...nel.org, kvm@...r.kernel.org,
	Ingo Molnar <mingo@...hat.com>,
	"H. Peter Anvin" <hpa@...or.com>, Avi Kivity <avi@...hat.com>,
	gleb@...hat.com, Linus Torvalds <torvalds@...ux-foundation.org>,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCHv3 7/8] kvm: host side for eoi optimization

On Tue, May 15, 2012 at 10:55:19PM -0300, Marcelo Tosatti wrote:
> On Tue, May 15, 2012 at 05:36:19PM +0300, Michael S. Tsirkin wrote:
> > Implementation of PV EOI using shared memory.
> > This reduces the number of exits an interrupt
> > causes as much as by half.
> > 
> > The idea is simple: there's a bit, per APIC, in guest memory,
> > that tells the guest that it does not need EOI.
> > We set it before injecting an interrupt and clear
> > before injecting a nested one. Guest tests it using
> > a test and clear operation - this is necessary
> > so that host can detect interrupt nesting -
> > and if set, it can skip the EOI MSR.
> > 
> > There's a new MSR to set the address of said register
> > in guest memory. Otherwise not much changed:
> > - Guest EOI is not required
> > - Register is tested & ISR is automatically cleared on exit
> > For tetsing results see description of patch 7 of the series.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@...hat.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |    6 ++
> >  arch/x86/include/asm/kvm_para.h |    2 +
> >  arch/x86/kvm/cpuid.c            |    1 +
> >  arch/x86/kvm/irq.c              |    2 +-
> >  arch/x86/kvm/lapic.c            |  110 +++++++++++++++++++++++++++++++++++++--
> >  arch/x86/kvm/lapic.h            |    2 +
> >  arch/x86/kvm/trace.h            |   34 ++++++++++++
> >  arch/x86/kvm/x86.c              |    8 +++-
> >  8 files changed, 158 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 32297f2..7673f4d 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -174,6 +174,7 @@ enum {
> >  
> >  /* apic attention bits */
> >  #define KVM_APIC_CHECK_VAPIC	0
> > +#define KVM_APIC_EOI_PENDING	1
> >  
> >  /*
> >   * We don't want allocation failures within the mmu code, so we preallocate
> > @@ -485,6 +486,11 @@ struct kvm_vcpu_arch {
> >  		u64 length;
> >  		u64 status;
> >  	} osvw;
> > +
> > +	struct {
> > +		u64 msr_val;
> > +		struct gfn_to_hva_cache data;
> > +	} eoi;
> >  };
> >  
> >  struct kvm_lpage_info {
> > diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> > index 734c376..195840d 100644
> > --- a/arch/x86/include/asm/kvm_para.h
> > +++ b/arch/x86/include/asm/kvm_para.h
> > @@ -22,6 +22,7 @@
> >  #define KVM_FEATURE_CLOCKSOURCE2        3
> >  #define KVM_FEATURE_ASYNC_PF		4
> >  #define KVM_FEATURE_STEAL_TIME		5
> > +#define KVM_FEATURE_EOI			6
> 
> SKIP_EOI?
> 
> >  
> >  /* The last 8 bits are used to indicate how to interpret the flags field
> >   * in pvclock structure. If no bits are set, all flags are ignored.
> > @@ -37,6 +38,7 @@
> >  #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
> >  #define MSR_KVM_ASYNC_PF_EN 0x4b564d02
> >  #define MSR_KVM_STEAL_TIME  0x4b564d03
> > +#define MSR_KVM_EOI_EN      0x4b564d04
> >  
> >  struct kvm_steal_time {
> >  	__u64 steal;
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > index eba8345..bda4877 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -408,6 +408,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
> >  			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
> >  			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
> >  			     (1 << KVM_FEATURE_ASYNC_PF) |
> > +			     (1 << KVM_FEATURE_EOI) |
> >  			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
> >  
> >  		if (sched_info_on())
> > diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
> > index 7e06ba1..07bdfab 100644
> > --- a/arch/x86/kvm/irq.c
> > +++ b/arch/x86/kvm/irq.c
> > @@ -48,7 +48,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
> >  	if (!irqchip_in_kernel(v->kvm))
> >  		return v->arch.interrupt.pending;
> >  
> > -	if (kvm_apic_has_interrupt(v) == -1) {	/* LAPIC */
> > +	if (kvm_apic_has_interrupt(v) < 0) {	/* LAPIC */
> >  		if (kvm_apic_accept_pic_intr(v)) {
> >  			s = pic_irqchip(v->kvm);	/* PIC */
> >  			return s->output;
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 93c1574..c7e6ffb 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -270,6 +270,54 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
> >  			irq->level, irq->trig_mode);
> >  }
> >  
> > +static int eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
> > +{
> > +
> > +	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.eoi.data, &val,
> > +				      sizeof(val));
> > +}
> > +
> > +static int eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
> > +{
> > +
> > +	return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.eoi.data, val,
> > +				      sizeof(*val));
> > +}
> > +
> > +static inline bool eoi_enabled(struct kvm_vcpu *vcpu)
> > +{
> > +	return vcpu->arch.eoi.msr_val & KVM_MSR_ENABLED;
> > +}
> 
> Please find a more descriptive name, such as pv_eoi_* (or skip_eoi,
> or...).
> 
> > +
> > +static bool eoi_get_pending(struct kvm_vcpu *vcpu)
> > +{
> > +	u8 val;
> > +	if (eoi_get_user(vcpu, &val) < 0)
> > +		apic_debug("Can't read EOI MSR value: 0x%llx\n",
> > +			   (unsigned long long)vcpi->arch.eoi.msr_val);
> > +	return val & 0x1;
> > +}
> > +
> > +static void eoi_set_pending(struct kvm_vcpu *vcpu)
> > +{
> > +	if (eoi_put_user(vcpu, 0x1) < 0) {
> > +		apic_debug("Can't set EOI MSR value: 0x%llx\n",
> > +			   (unsigned long long)vcpi->arch.eoi.msr_val);
> > +		return;
> > +	}
> > +	__set_bit(KVM_APIC_EOI_PENDING, &vcpu->arch.apic_attention);
> > +}
> > +
> > +static void eoi_clr_pending(struct kvm_vcpu *vcpu)
> > +{
> > +	if (eoi_put_user(vcpu, 0x0) < 0) {
> > +		apic_debug("Can't clear EOI MSR value: 0x%llx\n",
> > +			   (unsigned long long)vcpi->arch.eoi.msr_val);
> > +		return;
> > +	}
> > +	__clear_bit(KVM_APIC_EOI_PENDING, &vcpu->arch.apic_attention);
> > +}
> > +
> >  static inline int apic_find_highest_isr(struct kvm_lapic *apic)
> >  {
> >  	int result;
> > @@ -482,16 +530,21 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
> >  	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
> >  }
> >  
> > -static void apic_set_eoi(struct kvm_lapic *apic)
> > +static int apic_set_eoi(struct kvm_lapic *apic)
> >  {
> >  	int vector = apic_find_highest_isr(apic);
> > +
> > +	trace_kvm_eoi(apic, vector);
> > +
> >  	/*
> >  	 * Not every write EOI will has corresponding ISR,
> >  	 * one example is when Kernel check timer on setup_IO_APIC
> >  	 */
> >  	if (vector == -1)
> > -		return;
> > +		return vector;
> >  
> > +	if (eoi_enabled(apic->vcpu))
> > +		eoi_clr_pending(apic->vcpu);
> 
> Why is this here again? If you got here, the bit should be cleared 
> already?
> 
> >  	apic_clear_vector(vector, apic->regs + APIC_ISR);
> >  	apic_update_ppr(apic);
> >  
> > @@ -505,6 +558,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
> >  		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
> >  	}
> >  	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
> > +	return vector;
> >  }
> >  
> >  static void apic_send_ipi(struct kvm_lapic *apic)
> > @@ -1085,6 +1139,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
> >  	atomic_set(&apic->lapic_timer.pending, 0);
> >  	if (kvm_vcpu_is_bsp(vcpu))
> >  		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
> > +	vcpu->arch.eoi.msr_val = 0;
> >  	apic_update_ppr(apic);
> >  
> >  	vcpu->arch.apic_arb_prio = 0;
> > @@ -1211,9 +1266,10 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
> >  
> >  	apic_update_ppr(apic);
> >  	highest_irr = apic_find_highest_irr(apic);
> > -	if ((highest_irr == -1) ||
> > -	    ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
> > +	if (highest_irr == -1)
> >  		return -1;
> > +	if (((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
> > +		return -2;
> >  	return highest_irr;
> >  }
> >  
> > @@ -1245,9 +1301,20 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
> >  	int vector = kvm_apic_has_interrupt(vcpu);
> >  	struct kvm_lapic *apic = vcpu->arch.apic;
> >  
> > -	if (vector == -1)
> > +	/* Detect interrupt nesting and disable EOI optimization */
> > +	if (eoi_enabled(vcpu) && vector == -2)
> > +		eoi_clr_pending(vcpu);
> > +
> > +	if (vector < 0)
> >  		return -1;
> >  
> > +	if (eoi_enabled(vcpu)) {
> > +		if (kvm_ioapic_handles_vector(vcpu->kvm, vector))
> > +			eoi_clr_pending(vcpu);
> > +		else
> > +			eoi_set_pending(vcpu);
> > +	}
> > +
> >  	apic_set_vector(vector, apic->regs + APIC_ISR);
> >  	apic_update_ppr(apic);
> >  	apic_clear_irr(vector, apic);
> > @@ -1262,6 +1329,14 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
> >  			     MSR_IA32_APICBASE_BASE;
> >  	kvm_apic_set_version(vcpu);
> >  
> > +	if (eoi_enabled(apic->vcpu)) {
> > +		if (eoi_get_pending(apic->vcpu))
> > +			__set_bit(KVM_APIC_EOI_PENDING,
> > +				  &vcpu->arch.apic_attention);
> > +		else
> > +			__clear_bit(KVM_APIC_EOI_PENDING,
> > +				    &vcpu->arch.apic_attention);
> > +	}
> 
> 
> Why is this necessary? kvm_apic_post_state_restore cannot change the 
> per cpu in memory value.
> 
> >  	apic_update_ppr(apic);
> >  	hrtimer_cancel(&apic->lapic_timer.timer);
> >  	update_divide_count(apic);
> > @@ -1283,11 +1358,25 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
> >  		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
> >  }
> >  
> > +static void apic_update_eoi(struct kvm_lapic *apic)
> > +{
> > +	int vector;
> > +	BUG_ON(!eoi_enabled(apic->vcpu));
> > +	if (eoi_get_pending(apic->vcpu))
> > +		return;
> > +	__clear_bit(KVM_APIC_EOI_PENDING, &apic->vcpu->arch.apic_attention);
> > +	vector = apic_set_eoi(apic);
> > +	trace_kvm_pv_eoi(apic, vector);
> > +}
> 
> KVM_APIC_EOI_PENDING appears to be a shadow of the pervcpu value,
> to speed up processing. If it is, please make that clear. 

BTW yes it is a shadow but it's not just for speed:
without it there would be no easy way to distinguish between
two states:
	- we decided to not use pv eoi for a specific interrupt
	- we enabled pv eoi for an interrupt, and guest has cleared it

This bit let us distinguish: in 1st case it is cleared in second
case it is set.

> >  void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
> >  {
> >  	u32 data;
> >  	void *vapic;
> >  
> > +	if (test_bit(KVM_APIC_EOI_PENDING, &vcpu->arch.apic_attention))
> > +		apic_update_eoi(vcpu->arch.apic);
> > +
> >  	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
> >  		return;
> >  
> > @@ -1394,3 +1483,14 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
> >  
> >  	return 0;
> >  }
> > +
> > +int kvm_pv_enable_apic_eoi(struct kvm_vcpu *vcpu, u64 data)
> > +{
> > +	u64 addr = data & ~KVM_MSR_ENABLED;
> > +	if (eoi_enabled(vcpu))
> > +		eoi_clr_pending(vcpu);
> > +	vcpu->arch.eoi.msr_val = data;
> > +	if (!eoi_enabled(vcpu))
> > +		return 0;
> > +	return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.eoi.data, addr);
> > +}
> > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> > index 6f4ce25..042dace 100644
> > --- a/arch/x86/kvm/lapic.h
> > +++ b/arch/x86/kvm/lapic.h
> > @@ -60,4 +60,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
> >  {
> >  	return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
> >  }
> > +
> > +int kvm_pv_enable_apic_eoi(struct kvm_vcpu *vcpu, u64 data);
> >  #endif
> > diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> > index 911d264..851914e 100644
> > --- a/arch/x86/kvm/trace.h
> > +++ b/arch/x86/kvm/trace.h
> > @@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq,
> >  		  __entry->coalesced ? " (coalesced)" : "")
> >  );
> >  
> > +TRACE_EVENT(kvm_eoi,
> > +	    TP_PROTO(struct kvm_lapic *apic, int vector),
> > +	    TP_ARGS(apic, vector),
> > +
> > +	TP_STRUCT__entry(
> > +		__field(	__u32,		apicid		)
> > +		__field(	int,		vector		)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__entry->apicid		= apic->vcpu->vcpu_id;
> > +		__entry->vector		= vector;
> > +	),
> > +
> > +	TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
> > +);
> > +
> > +TRACE_EVENT(kvm_pv_eoi,
> > +	    TP_PROTO(struct kvm_lapic *apic, int vector),
> > +	    TP_ARGS(apic, vector),
> > +
> > +	TP_STRUCT__entry(
> > +		__field(	__u32,		apicid		)
> > +		__field(	int,		vector		)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__entry->apicid		= apic->vcpu->vcpu_id;
> > +		__entry->vector		= vector;
> > +	),
> > +
> > +	TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
> > +);
> > +
> >  /*
> >   * Tracepoint for nested VMRUN
> >   */
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 185a2b8..bfcd97d 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -795,6 +795,7 @@ static u32 msrs_to_save[] = {
> >  	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
> >  	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
> >  	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
> > +	MSR_KVM_EOI_EN,
> >  	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
> >  	MSR_STAR,
> >  #ifdef CONFIG_X86_64
> > @@ -1653,6 +1654,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> >  		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
> >  
> >  		break;
> > +	case MSR_KVM_EOI_EN:
> > +		if (kvm_pv_enable_apic_eoi(vcpu, data))
> > +			return 1;
> > +		break;
> >  
> >  	case MSR_IA32_MCG_CTL:
> >  	case MSR_IA32_MCG_STATUS:
> > @@ -5370,7 +5375,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >  	if (unlikely(vcpu->arch.tsc_always_catchup))
> >  		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> >  
> > -	kvm_lapic_sync_from_vapic(vcpu);
> > +	if (unlikely(vcpu->arch.apic_attention))
> > +		kvm_lapic_sync_from_vapic(vcpu);
> 
> This apic_attention check is unrelated, please have in a separate patch.
> 
> Also please document that this is only possible because of interrupt window 
> exiting. 
> 
> The Hyper-V spec, does it provide similar interface ? It mentions
> that "Most EOI intercepts can be eliminated and done lazily if the guest
> OS leaves a marker when it performs an EOI".
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/