[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210624035749.4054934-4-stevensd@google.com>
Date: Thu, 24 Jun 2021 12:57:46 +0900
From: David Stevens <stevensd@...omium.org>
To: Marc Zyngier <maz@...nel.org>, Huacai Chen <chenhuacai@...nel.org>,
Aleksandar Markovic <aleksandar.qemu.devel@...il.com>,
Paul Mackerras <paulus@...abs.org>,
Paolo Bonzini <pbonzini@...hat.com>,
Zhenyu Wang <zhenyuw@...ux.intel.com>,
Zhi Wang <zhi.a.wang@...el.com>
Cc: James Morse <james.morse@....com>,
Alexandru Elisei <alexandru.elisei@....com>,
Suzuki K Poulose <suzuki.poulose@....com>,
Will Deacon <will@...nel.org>,
Sean Christopherson <seanjc@...gle.com>,
Vitaly Kuznetsov <vkuznets@...hat.com>,
Wanpeng Li <wanpengli@...cent.com>,
Jim Mattson <jmattson@...gle.com>,
Joerg Roedel <joro@...tes.org>,
linux-arm-kernel@...ts.infradead.org, kvmarm@...ts.cs.columbia.edu,
linux-kernel@...r.kernel.org, linux-mips@...r.kernel.org,
kvm@...r.kernel.org, kvm-ppc@...r.kernel.org,
linuxppc-dev@...ts.ozlabs.org, intel-gvt-dev@...ts.freedesktop.org,
intel-gfx@...ts.freedesktop.org, dri-devel@...ts.freedesktop.org,
David Stevens <stevensd@...omium.org>
Subject: [PATCH 3/6] KVM: x86/mmu: avoid struct page in MMU
From: David Stevens <stevensd@...omium.org>
Avoid converting pfns returned by follow_fault_pfn to struct pages to
transiently take a reference. The reference was originally taken to
match the reference taken by gup. However, pfns returned by
follow_fault_pfn may not have a struct page set up for reference
counting.
Signed-off-by: David Stevens <stevensd@...omium.org>
---
arch/x86/kvm/mmu/mmu.c | 56 +++++++++++++++++++--------------
arch/x86/kvm/mmu/mmu_audit.c | 13 ++++----
arch/x86/kvm/mmu/mmu_internal.h | 3 +-
arch/x86/kvm/mmu/paging_tmpl.h | 36 ++++++++++++---------
arch/x86/kvm/mmu/tdp_mmu.c | 7 +++--
arch/x86/kvm/mmu/tdp_mmu.h | 4 +--
arch/x86/kvm/x86.c | 9 +++---
7 files changed, 73 insertions(+), 55 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 84913677c404..8fa4a4a411ba 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2610,16 +2610,16 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
return ret;
}
-static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool no_dirty_log)
+static struct kvm_pfn_page pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu,
+ gfn_t gfn, bool no_dirty_log)
{
struct kvm_memory_slot *slot;
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
if (!slot)
- return KVM_PFN_ERR_FAULT;
+ return KVM_PFN_PAGE_ERR(KVM_PFN_ERR_FAULT);
- return kvm_pfn_page_unwrap(gfn_to_pfn_memslot_atomic(slot, gfn));
+ return gfn_to_pfn_memslot_atomic(slot, gfn);
}
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
@@ -2748,7 +2748,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
- bool huge_page_disallowed, int *req_level)
+ struct page *page, bool huge_page_disallowed,
+ int *req_level)
{
struct kvm_memory_slot *slot;
kvm_pfn_t pfn = *pfnp;
@@ -2760,6 +2761,9 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
if (unlikely(max_level == PG_LEVEL_4K))
return PG_LEVEL_4K;
+ if (!page)
+ return PG_LEVEL_4K;
+
if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
return PG_LEVEL_4K;
@@ -2814,7 +2818,8 @@ void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
+ int map_writable, int max_level,
+ const struct kvm_pfn_page *pfnpg,
bool prefault, bool is_tdp)
{
bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
@@ -2826,11 +2831,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int level, req_level, ret;
gfn_t gfn = gpa >> PAGE_SHIFT;
gfn_t base_gfn = gfn;
+ kvm_pfn_t pfn = pfnpg->pfn;
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
+ level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, pfnpg->page,
huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(gpa, level, pfn);
@@ -3672,8 +3678,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
- bool write, bool *writable)
+ gpa_t cr2_or_gpa, struct kvm_pfn_page *pfnpg,
+ hva_t *hva, bool write, bool *writable)
{
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
@@ -3688,17 +3694,16 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
/* Don't expose private memslots to L2. */
if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
- *pfn = KVM_PFN_NOSLOT;
+ *pfnpg = KVM_PFN_PAGE_ERR(KVM_PFN_NOSLOT);
*writable = false;
return false;
}
async = false;
- *pfn = kvm_pfn_page_unwrap(__gfn_to_pfn_memslot(slot, gfn, false,
- &async, write,
- writable, hva));
+ *pfnpg = __gfn_to_pfn_memslot(slot, gfn, false, &async,
+ write, writable, hva);
if (!async)
- return false; /* *pfn has correct page already */
+ return false; /* *pfnpg has correct page already */
if (!prefault && kvm_can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
@@ -3710,8 +3715,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return true;
}
- *pfn = kvm_pfn_page_unwrap(__gfn_to_pfn_memslot(slot, gfn, false, NULL,
- write, writable, hva));
+ *pfnpg = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
+ write, writable, hva);
return false;
}
@@ -3723,7 +3728,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
- kvm_pfn_t pfn;
+ struct kvm_pfn_page pfnpg;
hva_t hva;
int r;
@@ -3743,11 +3748,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfnpg, &hva,
write, &map_writable))
return RET_PF_RETRY;
- if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+ if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa,
+ gfn, pfnpg.pfn, ACC_ALL, &r))
return r;
r = RET_PF_RETRY;
@@ -3757,7 +3763,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
else
write_lock(&vcpu->kvm->mmu_lock);
- if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
+ if (!is_noslot_pfn(pfnpg.pfn) &&
+ mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
if (r)
@@ -3765,17 +3772,18 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
- pfn, prefault);
+ &pfnpg, prefault);
else
- r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
- prefault, is_tdp);
+ r = __direct_map(vcpu, gpa, error_code, map_writable, max_level,
+ &pfnpg, prefault, is_tdp);
out_unlock:
if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
read_unlock(&vcpu->kvm->mmu_lock);
else
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
+ if (pfnpg.page)
+ put_page(pfnpg.page);
return r;
}
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index 3f983dc6e0f1..72b470b892da 100644
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ b/arch/x86/kvm/mmu/mmu_audit.c
@@ -94,7 +94,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
struct kvm_mmu_page *sp;
gfn_t gfn;
- kvm_pfn_t pfn;
+ struct kvm_pfn_page pfnpg;
hpa_t hpa;
sp = sptep_to_sp(sptep);
@@ -111,18 +111,19 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
return;
gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = kvm_pfn_page_unwrap(kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn));
+ pfnpg = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
- if (is_error_pfn(pfn))
+ if (is_error_pfn(pfnpg.pfn))
return;
- hpa = pfn << PAGE_SHIFT;
+ hpa = pfnpg.pfn << PAGE_SHIFT;
if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
- "ent %llxn", vcpu->arch.mmu->root_level, pfn,
+ "ent %llxn", vcpu->arch.mmu->root_level, pfnpg.pfn,
hpa, *sptep);
- kvm_release_pfn_clean(pfn);
+ if (pfnpg.page)
+ put_page(pfnpg.page);
}
static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index d64ccb417c60..db4d878fde4e 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -154,7 +154,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
kvm_pfn_t pfn, int max_level);
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
- bool huge_page_disallowed, int *req_level);
+ struct page *page, bool huge_page_disallowed,
+ int *req_level);
void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
kvm_pfn_t *pfnp, int *goal_levelp);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 823a5919f9fa..db13efd4b62d 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -535,7 +535,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
{
unsigned pte_access;
gfn_t gfn;
- kvm_pfn_t pfn;
+ struct kvm_pfn_page pfnpg;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
@@ -545,19 +545,20 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+ pfnpg = pte_prefetch_gfn_to_pfn(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
- if (is_error_pfn(pfn))
+ if (is_error_pfn(pfnpg.pfn))
return false;
/*
* we call mmu_set_spte() with host_writable = true because
* pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
- mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn,
+ mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfnpg.pfn,
true, true);
- kvm_release_pfn_clean(pfn);
+ if (pfnpg.page)
+ put_page(pfnpg.page);
return true;
}
@@ -637,8 +638,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
*/
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
struct guest_walker *gw, u32 error_code,
- int max_level, kvm_pfn_t pfn, bool map_writable,
- bool prefault)
+ int max_level, struct kvm_pfn_page *pfnpg,
+ bool map_writable, bool prefault)
{
bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
bool write_fault = error_code & PFERR_WRITE_MASK;
@@ -649,6 +650,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
unsigned int direct_access, access;
int top_level, level, req_level, ret;
gfn_t base_gfn = gw->gfn;
+ kvm_pfn_t pfn = pfnpg->pfn;
direct_access = gw->pte_access;
@@ -695,7 +697,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
}
level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn,
- huge_page_disallowed, &req_level);
+ pfnpg->page, huge_page_disallowed,
+ &req_level);
trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
@@ -801,7 +804,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
bool user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
int r;
- kvm_pfn_t pfn;
+ struct kvm_pfn_page pfnpg;
hva_t hva;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
@@ -853,11 +856,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfnpg, &hva,
write_fault, &map_writable))
return RET_PF_RETRY;
- if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
+ if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfnpg.pfn,
+ walker.pte_access, &r))
return r;
/*
@@ -866,7 +870,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
*/
if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
!is_write_protection(vcpu) && !user_fault &&
- !is_noslot_pfn(pfn)) {
+ !is_noslot_pfn(pfnpg.pfn)) {
walker.pte_access |= ACC_WRITE_MASK;
walker.pte_access &= ~ACC_USER_MASK;
@@ -882,20 +886,22 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
- if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
+ if (!is_noslot_pfn(pfnpg.pfn) &&
+ mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn,
+ r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, &pfnpg,
map_writable, prefault);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
+ if (pfnpg.page)
+ put_page(pfnpg.page);
return r;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 237317b1eddd..b0e6d63f0fe1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -960,8 +960,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
* page tables and SPTEs to translate the faulting guest physical address.
*/
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault)
+ int map_writable, int max_level,
+ const struct kvm_pfn_page *pfnpg, bool prefault)
{
bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
bool write = error_code & PFERR_WRITE_MASK;
@@ -976,13 +976,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
gfn_t gfn = gpa >> PAGE_SHIFT;
int level;
int req_level;
+ kvm_pfn_t pfn = pfnpg->pfn;
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
+ level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, pfnpg->page,
huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(gpa, level, pfn);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 5fdf63090451..f78681b9dcb7 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -52,8 +52,8 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault);
+ int map_writable, int max_level,
+ const struct kvm_pfn_page *pfnpg, bool prefault);
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d31797e0cb6e..86d66c765190 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7311,7 +7311,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type)
{
gpa_t gpa = cr2_or_gpa;
- kvm_pfn_t pfn;
+ struct kvm_pfn_page pfnpg;
if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
return false;
@@ -7341,16 +7341,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* retry instruction -> write #PF -> emulation fail -> retry
* instruction -> ...
*/
- pfn = kvm_pfn_page_unwrap(gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)));
+ pfnpg = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
/*
* If the instruction failed on the error pfn, it can not be fixed,
* report the error to userspace.
*/
- if (is_error_noslot_pfn(pfn))
+ if (is_error_noslot_pfn(pfnpg.pfn))
return false;
- kvm_release_pfn_clean(pfn);
+ if (pfnpg.page)
+ put_page(pfnpg.page);
/* The instructions are well-emulated on direct mmu. */
if (vcpu->arch.mmu->direct_map) {
--
2.32.0.93.g670b81a890-goog
Powered by blists - more mailing lists