Message-ID: <73fa2e31-58bd-4461-a6cb-a269f22ba7a1@os.amperecomputing.com>
Date: Fri, 5 Sep 2025 17:43:19 +0530
From: Ganapatrao Kulkarni <gankulkarni@...amperecomputing.com>
To: kvm@...r.kernel.org, linux-arm-kernel@...ts.infradead.org,
linux-kernel@...r.kernel.org, kvmarm@...ts.linux.dev
Cc: maz@...nel.org, oliver.upton@...ux.dev, darren@...amperecomputing.com,
scott@...amperecomputing.com, cl@...two.org, gklkml16@...il.com
Subject: Re: [PATCH] KVM: arm64: nv: Optimize unmapping of shadow S2-MMU
tables
[My apologies for using the old kvmarm mailing list ID.]
On 9/5/2025 11:59 AM, Ganapatrao Kulkarni wrote:
> As of commit ec14c272408a ("KVM: arm64: nv: Unmap/flush shadow
> stage 2 page tables"), an unmap of a canonical IPA range mapped at L1
> triggers invalidation in the L1 S2-MMU and in all active shadow (L2)
> S2-MMU tables. Because there is no reverse mapping to locate the
> corresponding shadow IPAs, the code falls back to a full S2-MMU
> page-table walk and invalidation across the entire L1 address space.
>
> For 4K pages this causes roughly 256K loop iterations (about 8M for
> 64K pages) per unmap, which can severely impact performance on large
> systems and even cause soft lockups during NV (L1/L2) boots with many
> CPUs and large memory. It also causes long delays during L1 reboot.
>
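
For reference, the pre-patch behaviour boils down to walking every valid
shadow S2-MMU over its whole IPA space, roughly as in the condensed
sketch below (the wrapper function and its name are only illustrative;
the body mirrors the lines this patch removes from kvm_nested_s2_unmap()
further down):

    /*
     * Pre-patch: each valid shadow S2-MMU is unmapped over its entire
     * IPA space, no matter how little of it is actually mapped.
     */
    static void nested_unmap_everything(struct kvm *kvm, bool may_block)
    {
        int i;

        for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
            struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];

            if (kvm_s2_mmu_valid(mmu))
                kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu),
                                       may_block);
        }
    }
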
> This patch adds a maple-tree-based lookup that records canonical-IPA to
> shadow-IPA mappings whenever a page is mapped into any shadow (L2)
> table. On unmap, the lookup is used to target only those shadow IPAs
> which are fully or partially mapped in shadow S2-MMU tables, avoiding
> a full-address-space walk and unnecessary unmap/flush operations.
>
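
Condensed, the bookkeeping added below looks like the fragment that
follows (the NULL check and the field comments are my additions for this
sketch; the declarations and locking context are as in the patch itself):

    /* One record per canonical IPA range mapped into a shadow S2 table */
    struct shadow_ipa_map {
        u64 shadow_ipa; /* IPA at which the range sits in the shadow table */
        u64 ipa;        /* canonical (L1) IPA */
        u64 size;
    };

    /* Map side: the canonical IPA range is the maple tree index */
    entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);
    if (!entry)
        return -ENOMEM;
    entry->ipa = ipa;
    entry->shadow_ipa = shadow_ipa;
    entry->size = size;
    mtree_store_range(&mmu->nested_mmu_mt, ipa, ipa + size - 1, entry,
                      GFP_KERNEL_ACCOUNT);

    /* Unmap side: only records overlapping canonical [ipa, ipa + size) */
    start = ipa;
    while ((entry = mt_find(&mmu->nested_mmu_mt, &start, ipa + size - 1))) {
        kvm_stage2_unmap_range(mmu, entry->shadow_ipa, entry->size,
                               may_block);
        start = entry->ipa + entry->size;
        /* ...erase the record from the tree and kfree() it... */
    }
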
> The lookup is updated on map/unmap operations so that its entries remain
> consistent with the shadow table state. It is then used during unmap to
> invalidate only the affected shadow IPAs, avoiding unnecessary CPU work
> and reducing latency when shadow mappings are sparse.
>
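
As a concrete (made-up) example: if a shadow S2-MMU has only a single
2MiB canonical range, say IPA 0x80000000 - 0x80200000, recorded in its
maple tree, an unmap of that canonical range now walks and invalidates
just those 2MiB of the shadow table, and shadow MMUs with no overlapping
record are skipped entirely, instead of every shadow MMU being walked
over its whole IPA space.
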
> Reviewed-by: Christoph Lameter (Ampere) <cl@...two.org>
> Signed-off-by: Ganapatrao Kulkarni <gankulkarni@...amperecomputing.com>
> ---
>
> Changes since RFC v1:
> Added a maple-tree-based lookup and addressed the review
> comments from [1].
>
> [1] https://lkml.indiana.edu/2403.0/03801.html
>
> arch/arm64/include/asm/kvm_host.h | 3 +
> arch/arm64/include/asm/kvm_nested.h | 9 +++
> arch/arm64/kvm/mmu.c | 18 +++--
> arch/arm64/kvm/nested.c | 102 ++++++++++++++++++++++++++--
> 4 files changed, 121 insertions(+), 11 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 2f2394cce24e..eac9405aee48 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -227,6 +227,9 @@ struct kvm_s2_mmu {
> * >0: Somebody is actively using this.
> */
> atomic_t refcnt;
> +
> + /* For IPA to shadow IPA lookup */
> + struct maple_tree nested_mmu_mt;
> };
>
> struct kvm_arch_memory_slot {
> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
> index 7fd76f41c296..89f91164bc4c 100644
> --- a/arch/arm64/include/asm/kvm_nested.h
> +++ b/arch/arm64/include/asm/kvm_nested.h
> @@ -69,6 +69,8 @@ extern void kvm_init_nested(struct kvm *kvm);
> extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
> extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
> extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
> +extern int add_to_ipa_shadow_ipa_lookup(struct kvm_pgtable *pgt, u64 shadow_ipa, u64 ipa,
> + u64 size);
>
> union tlbi_info;
>
> @@ -93,6 +95,12 @@ struct kvm_s2_trans {
> u64 desc;
> };
>
> +struct shadow_ipa_map {
> + u64 shadow_ipa;
> + u64 ipa;
> + u64 size;
> +};
> +
> static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
> {
> return trans->output;
> @@ -130,6 +138,7 @@ extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
> extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
> extern void kvm_nested_s2_wp(struct kvm *kvm);
> extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block);
> +extern void kvm_nested_s2_unmap_range(struct kvm *kvm, u64 ipa, u64 size, bool may_block);
> extern void kvm_nested_s2_flush(struct kvm *kvm);
>
> unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val);
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 1c78864767c5..e9bbc8275a51 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1784,6 +1784,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
> __pfn_to_phys(pfn), prot,
> memcache, flags);
> +
> + /* Record the canonical IPA range in the lookup if it maps into a shadow MMU */
> + if (nested)
> + add_to_ipa_shadow_ipa_lookup(pgt, ALIGN_DOWN(fault_ipa, PAGE_SIZE),
> + ipa, vma_pagesize);
> }
>
> out_unlock:
> @@ -1995,14 +2000,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>
> bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> {
> + gpa_t start = range->start << PAGE_SHIFT;
> + gpa_t size = (range->end - range->start) << PAGE_SHIFT;
> + bool may_block = range->may_block;
> +
> if (!kvm->arch.mmu.pgt)
> return false;
>
> - __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
> - (range->end - range->start) << PAGE_SHIFT,
> - range->may_block);
> -
> - kvm_nested_s2_unmap(kvm, range->may_block);
> + __unmap_stage2_range(&kvm->arch.mmu, start, size, may_block);
> + kvm_nested_s2_unmap_range(kvm, start, size, may_block);
> return false;
> }
>
> @@ -2280,7 +2286,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>
> write_lock(&kvm->mmu_lock);
> kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
> - kvm_nested_s2_unmap(kvm, true);
> + kvm_nested_s2_unmap_range(kvm, gpa, size, true);
> write_unlock(&kvm->mmu_lock);
> }
>
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 153b3e11b115..07b7bd3f66fc 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -7,6 +7,7 @@
> #include <linux/bitfield.h>
> #include <linux/kvm.h>
> #include <linux/kvm_host.h>
> +#include <linux/maple_tree.h>
>
> #include <asm/fixmap.h>
> #include <asm/kvm_arm.h>
> @@ -725,6 +726,7 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
> mmu->tlb_vttbr = VTTBR_CNP_BIT;
> mmu->nested_stage2_enabled = false;
> atomic_set(&mmu->refcnt, 0);
> + mt_init_flags(&mmu->nested_mmu_mt, MM_MT_FLAGS);
> }
>
> void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
> @@ -1067,17 +1069,94 @@ void kvm_nested_s2_wp(struct kvm *kvm)
> kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
> }
>
> +/*
> + * Record a range of canonical IPA that is mapped into a nested stage 2
> + * MMU table. The canonical IPA is used as the maple tree index so the
> + * range can be looked up later on IPA unmap/flush.
> + */
> +int add_to_ipa_shadow_ipa_lookup(struct kvm_pgtable *pgt, u64 shadow_ipa,
> + u64 ipa, u64 size)
> +{
> + struct kvm_s2_mmu *mmu;
> + struct shadow_ipa_map *entry;
> + unsigned long start, end;
> +
> + start = ipa;
> + end = ipa + size;
> + mmu = pgt->mmu;
> +
> + entry = kzalloc(sizeof(struct shadow_ipa_map), GFP_KERNEL_ACCOUNT);
> + entry->ipa = ipa;
> + entry->shadow_ipa = shadow_ipa;
> + entry->size = size;
> + mtree_store_range(&mmu->nested_mmu_mt, start, end - 1, entry,
> + GFP_KERNEL_ACCOUNT);
> + return 0;
> +}
> +
> +static void mtree_erase_nested(struct maple_tree *mt, unsigned long start,
> + unsigned long size)
> +{
> + void *entry = NULL;
> +
> + MA_STATE(mas, mt, start, start + size - 1);
> +
> + mtree_lock(mt);
> + entry = mas_erase(&mas);
> + mtree_unlock(mt);
> + kfree(entry);
> +}
> +
> +void kvm_nested_s2_unmap_range(struct kvm *kvm, u64 ipa, u64 size,
> + bool may_block)
> +{
> + int i;
> + struct shadow_ipa_map *entry;
> + unsigned long start = ipa;
> + unsigned long end = ipa + size;
> +
> + lockdep_assert_held_write(&kvm->mmu_lock);
> +
> + for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> +
> + if (!kvm_s2_mmu_valid(mmu))
> + continue;
> + start = ipa;
> + do {
> + entry = mt_find(&mmu->nested_mmu_mt, &start, end - 1);
> + if (!entry)
> + break;
> +
> + kvm_stage2_unmap_range(mmu, entry->shadow_ipa,
> + entry->size, may_block);
> + start = entry->ipa + entry->size;
> + mtree_erase_nested(&mmu->nested_mmu_mt, entry->ipa,
> + entry->size);
> + } while (start < end);
> + }
> +}
> +
> void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
> {
> int i;
> + unsigned long start = 0;
>
> lockdep_assert_held_write(&kvm->mmu_lock);
>
> for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> + struct shadow_ipa_map *entry;
>
> - if (kvm_s2_mmu_valid(mmu))
> - kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
> + if (!kvm_s2_mmu_valid(mmu))
> + continue;
> + start = 0;
> + mt_for_each(&mmu->nested_mmu_mt, entry, start, kvm_phys_size(mmu)) {
> + kvm_stage2_unmap_range(mmu, entry->shadow_ipa, entry->size,
> + may_block);
> + kfree(entry);
> + }
> + mtree_destroy(&mmu->nested_mmu_mt);
> }
>
> kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
> @@ -1086,14 +1165,19 @@ void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
> void kvm_nested_s2_flush(struct kvm *kvm)
> {
> int i;
> + unsigned long start = 0;
>
> lockdep_assert_held_write(&kvm->mmu_lock);
>
> for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> + struct shadow_ipa_map *entry;
>
> - if (kvm_s2_mmu_valid(mmu))
> - kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu));
> + if (!kvm_s2_mmu_valid(mmu))
> + continue;
> + start = 0;
> + mt_for_each(&mmu->nested_mmu_mt, entry, start, kvm_phys_size(mmu))
> + kvm_stage2_flush_range(mmu, entry->shadow_ipa, entry->size);
> }
> }
>
> @@ -1737,10 +1821,18 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
> {
> if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) {
> struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
> + unsigned long start = 0;
>
> write_lock(&vcpu->kvm->mmu_lock);
> if (mmu->pending_unmap) {
> - kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
> + struct shadow_ipa_map *entry;
> +
> + mt_for_each(&mmu->nested_mmu_mt, entry, start, kvm_phys_size(mmu)) {
> + kvm_stage2_unmap_range(mmu, entry->shadow_ipa, entry->size,
> + true);
> + kfree(entry);
> + }
> + mtree_destroy(&mmu->nested_mmu_mt);
> mmu->pending_unmap = false;
> }
> write_unlock(&vcpu->kvm->mmu_lock);
--
Thanks,
Gk