lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 5 Aug 2010 11:38:50 -0300
From:	Marcelo Tosatti <mtosatti@...hat.com>
To:	Xiao Guangrong <xiaoguangrong@...fujitsu.com>
Cc:	Avi Kivity <avi@...hat.com>, LKML <linux-kernel@...r.kernel.org>,
	KVM list <kvm@...r.kernel.org>
Subject: Re: [PATCH v6 3/3] KVM: MMU: prefetch ptes when intercepted guest #PF

On Wed, Aug 04, 2010 at 11:33:29AM +0800, Xiao Guangrong wrote:
> Support prefetching ptes when intercepting a guest #PF, to avoid
> further #PFs on later access
> 
> If we meet any failure in the prefetch path, we will exit it and
> not try other ptes, to avoid turning this into a heavy path
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@...fujitsu.com>
> ---
>  arch/x86/kvm/mmu.c         |  107 +++++++++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/paging_tmpl.h |   81 ++++++++++++++++++++++++++++++++-
>  2 files changed, 184 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9c69725..4501734 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
>  	}
>  #endif
>  
> +#define PTE_PREFETCH_NUM		8
> +
>  #define PT_FIRST_AVAIL_BITS_SHIFT 9
>  #define PT64_SECOND_AVAIL_BITS_SHIFT 52
>  
> @@ -403,7 +405,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
>  	if (r)
>  		goto out;
>  	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
> -				   rmap_desc_cache, 4);
> +				   rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
>  	if (r)
>  		goto out;
>  	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
> @@ -2090,6 +2092,108 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
>  {
>  }
>  
> +static struct kvm_memory_slot *
> +pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
> +{
> +	struct kvm_memory_slot *slot;
> +
> +	slot = gfn_to_memslot(vcpu->kvm, gfn);
> +	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
> +	      (no_dirty_log && slot->dirty_bitmap))
> +		slot = NULL;

Why is this no_dirty_log optimization worthwhile?

> +
> +	return slot;
> +}
> +
> +static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
> +				     bool no_dirty_log)
> +{
> +	struct kvm_memory_slot *slot;
> +	unsigned long hva;
> +
> +	slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
> +	if (!slot) {
> +		get_page(bad_page);
> +		return page_to_pfn(bad_page);
> +	}
> +
> +	hva = gfn_to_hva_memslot(slot, gfn);
> +
> +	return hva_to_pfn_atomic(vcpu->kvm, hva);
> +}
> +
> +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
> +				    struct kvm_mmu_page *sp,
> +				    u64 *start, u64 *end)
> +{
> +	struct page *pages[PTE_PREFETCH_NUM];
> +	struct kvm_memory_slot *slot;
> +	unsigned hva, access = sp->role.access;
> +	int i, ret, npages = end - start;
> +	gfn_t gfn;
> +
> +	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
> +	slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK);
> +	if (!slot || slot->npages - (gfn - slot->base_gfn) != npages)
> +		return -1;
> +
> +	hva = gfn_to_hva_memslot(slot, gfn);
> +	ret = __get_user_pages_fast(hva, npages, 1, pages);
> +	if (ret <= 0)
> +		return -1;

Better to do one at a time with hva_to_pfn_atomic. Or, if you measure
that it's worthwhile, do it in a separate patch (using a helper as
discussed previously).

> +
> +	for (i = 0; i < ret; i++, gfn++, start++)
> +		mmu_set_spte(vcpu, start, ACC_ALL,
> +			     access, 0, 0, 1, NULL,
> +			     sp->role.level, gfn,
> +			     page_to_pfn(pages[i]), true, true);
> +
> +	return ret == npages ? 0 : -1;
> +}
> +
> +static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
> +				  struct kvm_mmu_page *sp, u64 *sptep)
> +{
> +	u64 *spte, *start = NULL;
> +	int i;
> +
> +	WARN_ON(!sp->role.direct);
> +
> +	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
> +	spte = sp->spt + i;
> +
> +	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
> +		if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
> +			if (!start)
> +				continue;
> +			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
> +				break;
> +			start = NULL;
> +		} else if (!start)
> +			start = spte;
> +	}
> +}
> +
> +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> +	struct kvm_mmu_page *sp;
> +
> +	/*
> +	 * Since there is no accessed bit on EPT, there is no way to
> +	 * distinguish between actually accessed translations
> +	 * and prefetched ones, so disable pte prefetch if EPT is
> +	 * enabled.
> +	 */
> +	if (!shadow_accessed_mask)
> +		return;
> +
> +	sp = page_header(__pa(sptep));
> +	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
> +		return;
> +
> +	__direct_pte_prefetch(vcpu, sp, sptep);
> +}
> +
>  static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
>  			int level, gfn_t gfn, pfn_t pfn)
>  {
> @@ -2103,6 +2207,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
>  			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
>  				     0, write, 1, &pt_write,
>  				     level, gfn, pfn, false, true);
> +			direct_pte_prefetch(vcpu, iterator.sptep);
>  			++vcpu->stat.pf_fixed;
>  			break;
>  		}
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 51ef909..114c17d 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -67,6 +67,7 @@ struct guest_walker {
>  	int level;
>  	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
>  	pt_element_t ptes[PT_MAX_FULL_LEVELS];
> +	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
>  	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
>  	unsigned pt_access;
>  	unsigned pte_access;
> @@ -302,14 +303,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>  static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
>  				struct guest_walker *gw, int level)
>  {
> -	int r;
>  	pt_element_t curr_pte;
> -
> -	r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
> +	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
> +	u64 mask;
> +	int r, index;
> +
> +	if (level == PT_PAGE_TABLE_LEVEL) {
> +		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
> +		base_gpa = pte_gpa & ~mask;
> +		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
> +
> +		r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
> +				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
> +		curr_pte = gw->prefetch_ptes[index];

This can slow down a single non-prefetchable pte fault. Maybe it's
irrelevant, but please have kvm_read_guest_atomic in the first patch and
then optimize later; that is easier to review and bisectable.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ