linux-kernel - Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20120426234535.GA5057@amt.cnet>
Date:	Thu, 26 Apr 2012 20:45:35 -0300
From:	Marcelo Tosatti <mtosatti@...hat.com>
To:	Xiao Guangrong <xiaoguangrong@...ux.vnet.ibm.com>
Cc:	Avi Kivity <avi@...hat.com>, LKML <linux-kernel@...r.kernel.org>,
	KVM <kvm@...r.kernel.org>
Subject: Re: [PATCH v4 06/10] KVM: MMU: fast path of handling guest page fault

On Wed, Apr 25, 2012 at 12:03:48PM +0800, Xiao Guangrong wrote:
> If the the present bit of page fault error code is set, it indicates
> the shadow page is populated on all levels, it means what we do is
> only modify the access bit which can be done out of mmu-lock
> 
> Currently, in order to simplify the code, we only fix the page fault
> caused by write-protect on the fast path
> 
> In order to better review, we hold mmu-lock to update spte in this
> patch, the concurrent update will be introduced in the later patch
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@...ux.vnet.ibm.com>
> ---
>  arch/x86/kvm/mmu.c         |  113 ++++++++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/paging_tmpl.h |    3 +
>  2 files changed, 112 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e7d8ffe..96a9d5b 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2683,18 +2683,117 @@ exit:
>  	return ret;
>  }
> 
> +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
> +				   u32 error_code)
> +{
> +	/*
> +	 * #PF can be fast only if the shadow page table is present and it
> +	 * is caused by write-protect, that means we just need change the
> +	 * W bit of the spte which can be done out of mmu-lock.
> +	 */
> +	if (!(error_code & PFERR_PRESENT_MASK) ||
> +	      !(error_code & PFERR_WRITE_MASK))
> +		return false;
> +
> +	return true;
> +}
> +
> +static bool
> +fast_pf_fix_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> +		  u64 *sptep, u64 spte)
> +{
> +	gfn_t gfn;
> +
> +	spin_lock(&vcpu->kvm->mmu_lock);
> +
> +	/* The spte has been changed. */
> +	if (*sptep != spte)
> +		goto exit;
> +
> +	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +
> +	*sptep = spte | PT_WRITABLE_MASK;
> +	mark_page_dirty(vcpu->kvm, gfn);
> +
> +exit:
> +	spin_unlock(&vcpu->kvm->mmu_lock);

There was a misunderstanding. The suggestion is to change codepaths that
today assume that a side effect of holding mmu_lock is that sptes are
not updated or read before attempting to introduce any lockless spte
updates.

example)

        u64 mask, old_spte = *sptep;

        if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
                __update_clear_spte_fast(sptep, new_spte);
        else
                old_spte = __update_clear_spte_slow(sptep, new_spte);

The local old_spte copy might be stale by the
time "spte_has_volatile_bits(old_spte)" reads the writable bit.

example)


VCPU0                                       VCPU1
set_spte
mmu_spte_update decides fast write 
mov newspte to sptep
(non-locked write instruction)
newspte in store buffer

                                            lockless spte read path reads stale value

spin_unlock(mmu_lock) 

Depending on the what stale value CPU1 read and what decision it took
this can be a problem, say the new bits (and we do not want to verify
every single case). The spte write from inside mmu_lock should always be
atomic now?

So these details must be exposed to the reader, they are hidden now
(note mmu_spte_update is already a maze, its getting too deep).

> +
> +	return true;
> +}
> +
> +/*
> + * Return value:
> + * - true: let the vcpu to access on the same address again.
> + * - false: let the real page fault path to fix it.
> + */
> +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
> +			    int level, u32 error_code)
> +{
> +	struct kvm_shadow_walk_iterator iterator;
> +	struct kvm_mmu_page *sp;
> +	bool ret = false;
> +	u64 spte = 0ull;
> +
> +	if (!page_fault_can_be_fast(vcpu, gfn, error_code))
> +		return false;
> +
> +	walk_shadow_page_lockless_begin(vcpu);
> +	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
> +		if (!is_shadow_present_pte(spte) || iterator.level < level)
> +			break;
> +
> +	/*
> +	 * If the mapping has been changed, let the vcpu fault on the
> +	 * same address again.
> +	 */
> +	if (!is_rmap_spte(spte)) {
> +		ret = true;
> +		goto exit;
> +	}
> +
> +	if (!is_last_spte(spte, level))
> +		goto exit;
> +
> +	/*
> +	 * Check if it is a spurious fault caused by TLB lazily flushed.
> +	 *
> +	 * Need not check the access of upper level table entries since
> +	 * they are always ACC_ALL.
> +	 */
> +	 if (is_writable_pte(spte)) {
> +		ret = true;
> +		goto exit;
> +	}
> +
> +	/*
> +	 * Currently, to simplify the code, only the spte write-protected
> +	 * by dirty-log can be fast fixed.
> +	 */
> +	if (!spte_wp_by_dirty_log(spte))
> +		goto exit;
> +
> +	sp = page_header(__pa(iterator.sptep));
> +
> +	ret = fast_pf_fix_spte(vcpu, sp, iterator.sptep, spte);
> +
> +exit:
> +	walk_shadow_page_lockless_end(vcpu);
> +
> +	return ret;
> +}
> +
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>  			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
> 
> -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
> -			 bool prefault)
> +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
> +			 gfn_t gfn, bool prefault)
>  {
>  	int r;
>  	int level;
>  	int force_pt_level;
>  	pfn_t pfn;
>  	unsigned long mmu_seq;
> -	bool map_writable;
> +	bool map_writable, write = error_code & PFERR_WRITE_MASK;
> 
>  	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
>  	if (likely(!force_pt_level)) {
> @@ -2711,6 +2810,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
>  	} else
>  		level = PT_PAGE_TABLE_LEVEL;
> 
> +	if (fast_page_fault(vcpu, v, gfn, level, error_code))
> +		return 0;
> +
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> 
> @@ -3099,7 +3201,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
>  	gfn = gva >> PAGE_SHIFT;
> 
>  	return nonpaging_map(vcpu, gva & PAGE_MASK,
> -			     error_code & PFERR_WRITE_MASK, gfn, prefault);
> +			     error_code, gfn, prefault);
>  }
> 
>  static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> @@ -3179,6 +3281,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
>  	} else
>  		level = PT_PAGE_TABLE_LEVEL;
> 
> +	if (fast_page_fault(vcpu, gpa, gfn, level, error_code))
> +		return 0;
> +
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> 
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index df5a703..80493fb 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -617,6 +617,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>  		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
>  	}
> 
> +	if (fast_page_fault(vcpu, addr, walker.gfn, level, error_code))
> +		return 0;
> +
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> 
> -- 
> 1.7.7.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/