Message-ID: <YtJSalFfPPoQs4Dj@google.com>
Date:   Sat, 16 Jul 2022 05:53:46 +0000
From:   Mingwei Zhang <mizhang@...gle.com>
To:     Sean Christopherson <seanjc@...gle.com>
Cc:     Paolo Bonzini <pbonzini@...hat.com>, kvm@...r.kernel.org,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/4] KVM: x86/mmu: Don't require refcounted "struct page"
 to create huge SPTEs

On Fri, Jul 15, 2022, Sean Christopherson wrote:
> Drop the requirement that a pfn be backed by a refcounted struct page
> that is compound or ZONE_DEVICE, and instead rely solely on the host
> page tables to identify huge pages.  The PageCompound() check is a
> remnant of an old implementation that identified (well, attempted to
> identify) huge pages without walking the host page tables.  The
> ZONE_DEVICE check was added as an exception to the PageCompound()
> requirement.  In other words, neither check is actually a hard
> requirement: if the primary MMU has a pfn backed with a huge page, then
> KVM can back the pfn with a huge page regardless of the backing store.
> 
> Dropping the @pfn parameter will also allow KVM to query the max host
> mapping level without having to first get the pfn, which is advantageous
> for use outside of the page fault path where KVM wants to take action if
> and only if a page can be mapped huge, i.e. avoids the pfn lookup for
> gfns that can't be backed with a huge page.
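
For readers following along, the walk that host_pfn_mapping_level() is
left with after this change looks roughly like the sketch below.  This
is a simplified sketch, not the exact mmu.c code: the helper name is
made up, and the real function additionally resolves the hva via
__gfn_to_hva_memslot() and disables IRQs so the lockless walk can't race
with the primary MMU promoting or tearing down the page tables.

/*
 * Simplified sketch only: given the host virtual address backing a gfn,
 * walk the host page tables and report the mapping level.  Each entry
 * is read once with READ_ONCE() so a concurrent promotion/teardown by
 * the primary MMU can't send the walk into a stale table.
 */
static int host_mapping_level_sketch(struct kvm *kvm, unsigned long hva)
{
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
	if (pgd_none(pgd))
		return PG_LEVEL_4K;

	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
	if (p4d_none(p4d) || !p4d_present(p4d))
		return PG_LEVEL_4K;

	pud = READ_ONCE(*pud_offset(&p4d, hva));
	if (pud_none(pud) || !pud_present(pud))
		return PG_LEVEL_4K;

	/* A leaf entry at the PUD level means the host maps a 1GiB page. */
	if (pud_large(pud))
		return PG_LEVEL_1G;

	pmd = READ_ONCE(*pmd_offset(&pud, hva));
	if (pmd_none(pmd) || !pmd_present(pmd))
		return PG_LEVEL_4K;

	/* A leaf entry at the PMD level means the host maps a 2MiB page. */
	if (pmd_large(pmd))
		return PG_LEVEL_2M;

	return PG_LEVEL_4K;
}

The key point is that the level comes entirely from leaf entries in the
host tables, so no refcounted struct page, PageCompound(), or
ZONE_DEVICE check is needed.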

Out of curiosity: when the host maps huge pages under a VMA with
VM_PFNMAP, they are basically out of the control of the MM (I presume).
So when the drivers that own those pages remove them from the host page
tables, do we (KVM) get mmu_notifier callbacks?

> 
> Cc: Mingwei Zhang <mizhang@...gle.com>
> Signed-off-by: Sean Christopherson <seanjc@...gle.com>

Reviewed-by: Mingwei Zhang <mizhang@...gle.com>
> ---
>  arch/x86/kvm/mmu/mmu.c          | 23 +++++------------------
>  arch/x86/kvm/mmu/mmu_internal.h |  2 +-
>  arch/x86/kvm/mmu/tdp_mmu.c      |  8 +-------
>  3 files changed, 7 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 52664c3caaab..bebff1d5acd4 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2919,11 +2919,10 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
>  	__direct_pte_prefetch(vcpu, sp, sptep);
>  }
>  
> -static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
>  				  const struct kvm_memory_slot *slot)
>  {
>  	int level = PG_LEVEL_4K;
> -	struct page *page;
>  	unsigned long hva;
>  	unsigned long flags;
>  	pgd_t pgd;
> @@ -2931,17 +2930,6 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>  	pud_t pud;
>  	pmd_t pmd;
>  
> -	/*
> -	 * Note, @slot must be non-NULL, i.e. the caller is responsible for
> -	 * ensuring @pfn isn't garbage and is backed by a memslot.
> -	 */
> -	page = kvm_pfn_to_refcounted_page(pfn);
> -	if (!page)
> -		return PG_LEVEL_4K;
> -
> -	if (!PageCompound(page) && !kvm_is_zone_device_page(page))
> -		return PG_LEVEL_4K;
> -
>  	/*
>  	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
>  	 * is not solely for performance, it's also necessary to avoid the
> @@ -2994,7 +2982,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>  
>  int kvm_mmu_max_mapping_level(struct kvm *kvm,
>  			      const struct kvm_memory_slot *slot, gfn_t gfn,
> -			      kvm_pfn_t pfn, int max_level)
> +			      int max_level)
>  {
>  	struct kvm_lpage_info *linfo;
>  	int host_level;
> @@ -3009,7 +2997,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;
>  
> -	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
> +	host_level = host_pfn_mapping_level(kvm, gfn, slot);
>  	return min(host_level, max_level);
>  }
>  
> @@ -3034,8 +3022,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	 * level, which will be used to do precise, accurate accounting.
>  	 */
>  	fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
> -						     fault->gfn, fault->pfn,
> -						     fault->max_level);
> +						     fault->gfn, fault->max_level);
>  	if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
>  		return;
>  
> @@ -6406,7 +6393,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
>  		 */
>  		if (sp->role.direct &&
>  		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
> -							       pfn, PG_LEVEL_NUM)) {
> +							       PG_LEVEL_NUM)) {
>  			pte_list_remove(kvm, rmap_head, sptep);
>  
>  			if (kvm_available_flush_tlb_with_range())
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index ae2d660e2dab..582def531d4d 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -309,7 +309,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  
>  int kvm_mmu_max_mapping_level(struct kvm *kvm,
>  			      const struct kvm_memory_slot *slot, gfn_t gfn,
> -			      kvm_pfn_t pfn, int max_level);
> +			      int max_level);
>  void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>  void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
>  
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f3a430d64975..d75d93edc40a 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1733,7 +1733,6 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>  	gfn_t end = start + slot->npages;
>  	struct tdp_iter iter;
>  	int max_mapping_level;
> -	kvm_pfn_t pfn;
>  
>  	rcu_read_lock();
>  
> @@ -1745,13 +1744,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>  		    !is_last_spte(iter.old_spte, iter.level))
>  			continue;
>  
> -		/*
> -		 * This is a leaf SPTE. Check if the PFN it maps can
> -		 * be mapped at a higher level.
> -		 */
> -		pfn = spte_to_pfn(iter.old_spte);
>  		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
> -				iter.gfn, pfn, PG_LEVEL_NUM);
> +							      iter.gfn, PG_LEVEL_NUM);
>  
>  		WARN_ON(max_mapping_level < iter.level);
>  
> -- 
> 2.37.0.170.g444d1eabd0-goog
> 
