Message-ID: <73fa2e31-58bd-4461-a6cb-a269f22ba7a1@os.amperecomputing.com>
Date: Fri, 5 Sep 2025 17:43:19 +0530
From: Ganapatrao Kulkarni <gankulkarni@...amperecomputing.com>
To: kvm@...r.kernel.org, linux-arm-kernel@...ts.infradead.org,
linux-kernel@...r.kernel.org, kvmarm@...ts.linux.dev
Cc: maz@...nel.org, oliver.upton@...ux.dev, darren@...amperecomputing.com,
scott@...amperecomputing.com, cl@...two.org, gklkml16@...il.com
Subject: Re: [PATCH] KVM: arm64: nv: Optimize unmapping of shadow S2-MMU
tables
[My apologies for using the old kvmarm mailing list ID.]
On 9/5/2025 11:59 AM, Ganapatrao Kulkarni wrote:
> As of commit ec14c272408a ("KVM: arm64: nv: Unmap/flush shadow
> stage 2 page tables"), an unmap of a canonical IPA range mapped at L1
> triggers invalidation in the L1 S2-MMU and in all active shadow (L2)
> S2-MMU tables. Because there is no reverse mapping to locate the
> corresponding shadow IPAs, the code falls back to a full S2-MMU
> page-table walk and invalidation across the entire L1 address space.
>
> For 4K pages this causes roughly 256K loop iterations (about 8M for
> 64K pages) per unmap, which can severely impact performance on large
> systems and even cause soft lockups during NV (L1/L2) boots with many
> CPUs and large memory. It also causes long delays during L1 reboot.
>
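
For reference, the pre-patch behaviour boils down to walking every valid
shadow S2-MMU over its whole IPA space, roughly as in the condensed
sketch below (the wrapper function and its name are only illustrative;
the body mirrors the lines this patch removes from kvm_nested_s2_unmap()
further down):

    /*
     * Pre-patch: each valid shadow S2-MMU is unmapped over its entire
     * IPA space, no matter how little of it is actually mapped.
     */
    static void nested_unmap_everything(struct kvm *kvm, bool may_block)
    {
        int i;

        for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
            struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];

            if (kvm_s2_mmu_valid(mmu))
                kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu),
                                       may_block);
        }
    }
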
> This patch adds a maple-tree-based lookup that records canonical-IPA to
> shadow-IPA mappings whenever a page is mapped into any shadow (L2)
> table. On unmap, the lookup is used to target only those shadow IPAs
> which are fully or partially mapped in shadow S2-MMU tables, avoiding
> a full-address-space walk and unnecessary unmap/flush operations.
>
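
Condensed, the bookkeeping added below looks like the fragment that
follows (the NULL check and the field comments are my additions for this
sketch; the declarations and locking context are as in the patch itself):

    /* One record per canonical IPA range mapped into a shadow S2 table */
    struct shadow_ipa_map {
        u64 shadow_ipa; /* IPA at which the range sits in the shadow table */
        u64 ipa;        /* canonical (L1) IPA */
        u64 size;
    };

    /* Map side: the canonical IPA range is the maple tree index */
    entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);
    if (!entry)
        return -ENOMEM;
    entry->ipa = ipa;
    entry->shadow_ipa = shadow_ipa;
    entry->size = size;
    mtree_store_range(&mmu->nested_mmu_mt, ipa, ipa + size - 1, entry,
                      GFP_KERNEL_ACCOUNT);

    /* Unmap side: only records overlapping canonical [ipa, ipa + size) */
    start = ipa;
    while ((entry = mt_find(&mmu->nested_mmu_mt, &start, ipa + size - 1))) {
        kvm_stage2_unmap_range(mmu, entry->shadow_ipa, entry->size,
                               may_block);
        start = entry->ipa + entry->size;
        /* ...erase the record from the tree and kfree() it... */
    }
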
> The lookup is updated on map/unmap operations so that its entries remain
> consistent with the shadow table state. It is then used during unmap to
> invalidate only the affected shadow IPAs, avoiding unnecessary CPU work
> and reducing latency when shadow mappings are sparse.
>
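
As a concrete (made-up) example: if a shadow S2-MMU has only a single
2MiB canonical range, say IPA 0x80000000 - 0x80200000, recorded in its
maple tree, an unmap of that canonical range now walks and invalidates
just those 2MiB of the shadow table, and shadow MMUs with no overlapping
record are skipped entirely, instead of every shadow MMU being walked
over its whole IPA space.
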
> Reviewed-by: Christoph Lameter (Ampere) <cl@...two.org>
> Signed-off-by: Ganapatrao Kulkarni <gankulkarni@...amperecomputing.com>
> ---
>
> Changes since RFC v1:
> Added a maple-tree-based lookup and addressed the review
> comments from [1].
>
> [1] https://lkml.indiana.edu/2403.0/03801.html
>
> arch/arm64/include/asm/kvm_host.h | 3 +
> arch/arm64/include/asm/kvm_nested.h | 9 +++
> arch/arm64/kvm/mmu.c | 18 +++--
> arch/arm64/kvm/nested.c | 102 ++++++++++++++++++++++++++--
> 4 files changed, 121 insertions(+), 11 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 2f2394cce24e..eac9405aee48 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -227,6 +227,9 @@ struct kvm_s2_mmu {
> * >0: Somebody is actively using this.
> */
> atomic_t refcnt;
> +
> + /* For IPA to shadow IPA lookup */
> + struct maple_tree nested_mmu_mt;
> };
>
> struct kvm_arch_memory_slot {
> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
> index 7fd76f41c296..89f91164bc4c 100644
> --- a/arch/arm64/include/asm/kvm_nested.h
> +++ b/arch/arm64/include/asm/kvm_nested.h
> @@ -69,6 +69,8 @@ extern void kvm_init_nested(struct kvm *kvm);
> extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
> extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
> extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
> +extern int add_to_ipa_shadow_ipa_lookup(struct kvm_pgtable *pgt, u64 shadow_ipa, u64 ipa,
> + u64 size);
>
> union tlbi_info;
>
> @@ -93,6 +95,12 @@ struct kvm_s2_trans {
> u64 desc;
> };
>
> +struct shadow_ipa_map {
> + u64 shadow_ipa;
> + u64 ipa;
> + u64 size;
> +};
> +
> static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
> {
> return trans->output;
> @@ -130,6 +138,7 @@ extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
> extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
> extern void kvm_nested_s2_wp(struct kvm *kvm);
> extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block);
> +extern void kvm_nested_s2_unmap_range(struct kvm *kvm, u64 ipa, u64 size, bool may_block);
> extern void kvm_nested_s2_flush(struct kvm *kvm);
>
> unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val);
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 1c78864767c5..e9bbc8275a51 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1784,6 +1784,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
> __pfn_to_phys(pfn), prot,
> memcache, flags);
> +
> + /* Record the canonical IPA range in the lookup if it maps into a shadow MMU */
> + if (nested)
> + add_to_ipa_shadow_ipa_lookup(pgt, ALIGN_DOWN(fault_ipa, PAGE_SIZE),
> + ipa, vma_pagesize);
> }
>
> out_unlock:
> @@ -1995,14 +2000,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>
> bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> {
> + gpa_t start = range->start << PAGE_SHIFT;
> + gpa_t size = (range->end - range->start) << PAGE_SHIFT;
> + bool may_block = range->may_block;
> +
> if (!kvm->arch.mmu.pgt)
> return false;
>
> - __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
> - (range->end - range->start) << PAGE_SHIFT,
> - range->may_block);
> -
> - kvm_nested_s2_unmap(kvm, range->may_block);
> + __unmap_stage2_range(&kvm->arch.mmu, start, size, may_block);
> + kvm_nested_s2_unmap_range(kvm, start, size, may_block);
> return false;
> }
>
> @@ -2280,7 +2286,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>
> write_lock(&kvm->mmu_lock);
> kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
> - kvm_nested_s2_unmap(kvm, true);
> + kvm_nested_s2_unmap_range(kvm, gpa, size, true);
> write_unlock(&kvm->mmu_lock);
> }
>
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 153b3e11b115..07b7bd3f66fc 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -7,6 +7,7 @@
> #include <linux/bitfield.h>
> #include <linux/kvm.h>
> #include <linux/kvm_host.h>
> +#include <linux/maple_tree.h>
>
> #include <asm/fixmap.h>
> #include <asm/kvm_arm.h>
> @@ -725,6 +726,7 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
> mmu->tlb_vttbr = VTTBR_CNP_BIT;
> mmu->nested_stage2_enabled = false;
> atomic_set(&mmu->refcnt, 0);
> + mt_init_flags(&mmu->nested_mmu_mt, MM_MT_FLAGS);
> }
>
> void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
> @@ -1067,17 +1069,94 @@ void kvm_nested_s2_wp(struct kvm *kvm)
> kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
> }
>
> +/*
> + * Record a range of canonical IPA that is mapped into a nested stage 2
> + * MMU table. The canonical IPA is used as the maple tree index so the
> + * range can be looked up later on IPA unmap/flush.
> + */
> +int add_to_ipa_shadow_ipa_lookup(struct kvm_pgtable *pgt, u64 shadow_ipa,
> + u64 ipa, u64 size)
> +{
> + struct kvm_s2_mmu *mmu;
> + struct shadow_ipa_map *entry;
> + unsigned long start, end;
> +
> + start = ipa;
> + end = ipa + size;
> + mmu = pgt->mmu;
> +
> + entry = kzalloc(sizeof(struct shadow_ipa_map), GFP_KERNEL_ACCOUNT);
> + entry->ipa = ipa;
> + entry->shadow_ipa = shadow_ipa;
> + entry->size = size;
> + mtree_store_range(&mmu->nested_mmu_mt, start, end - 1, entry,
> + GFP_KERNEL_ACCOUNT);
> + return 0;
> +}
> +
> +static void mtree_erase_nested(struct maple_tree *mt, unsigned long start,
> + unsigned long size)
> +{
> + void *entry = NULL;
> +
> + MA_STATE(mas, mt, start, start + size - 1);
> +
> + mtree_lock(mt);
> + entry = mas_erase(&mas);
> + mtree_unlock(mt);
> + kfree(entry);
> +}
> +
> +void kvm_nested_s2_unmap_range(struct kvm *kvm, u64 ipa, u64 size,
> + bool may_block)
> +{
> + int i;
> + struct shadow_ipa_map *entry;
> + unsigned long start = ipa;
> + unsigned long end = ipa + size;
> +
> + lockdep_assert_held_write(&kvm->mmu_lock);
> +
> + for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> +
> + if (!kvm_s2_mmu_valid(mmu))
> + continue;
> + start = ipa;
> + do {
> + entry = mt_find(&mmu->nested_mmu_mt, &start, end - 1);
> + if (!entry)
> + break;
> +
> + kvm_stage2_unmap_range(mmu, entry->shadow_ipa,
> + entry->size, may_block);
> + start = entry->ipa + entry->size;
> + mtree_erase_nested(&mmu->nested_mmu_mt, entry->ipa,
> + entry->size);
> + } while (start < end);
> + }
> +}
> +
> void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
> {
> int i;
> + unsigned long start = 0;
>
> lockdep_assert_held_write(&kvm->mmu_lock);
>
> for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> + struct shadow_ipa_map *entry;
>
> - if (kvm_s2_mmu_valid(mmu))
> - kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
> + if (!kvm_s2_mmu_valid(mmu))
> + continue;
> + start = 0;
> + mt_for_each(&mmu->nested_mmu_mt, entry, start, kvm_phys_size(mmu)) {
> + kvm_stage2_unmap_range(mmu, entry->shadow_ipa, entry->size,
> + may_block);
> + kfree(entry);
> + }
> + mtree_destroy(&mmu->nested_mmu_mt);
> }
>
> kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
> @@ -1086,14 +1165,19 @@ void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
> void kvm_nested_s2_flush(struct kvm *kvm)
> {
> int i;
> + unsigned long start = 0;
>
> lockdep_assert_held_write(&kvm->mmu_lock);
>
> for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> + struct shadow_ipa_map *entry;
>
> - if (kvm_s2_mmu_valid(mmu))
> - kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu));
> + if (!kvm_s2_mmu_valid(mmu))
> + continue;
> + start = 0;
> + mt_for_each(&mmu->nested_mmu_mt, entry, start, kvm_phys_size(mmu))
> + kvm_stage2_flush_range(mmu, entry->shadow_ipa, entry->size);
> }
> }
>
> @@ -1737,10 +1821,18 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
> {
> if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) {
> struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
> + unsigned long start = 0;
>
> write_lock(&vcpu->kvm->mmu_lock);
> if (mmu->pending_unmap) {
> - kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
> + struct shadow_ipa_map *entry;
> +
> + mt_for_each(&mmu->nested_mmu_mt, entry, start, kvm_phys_size(mmu)) {
> + kvm_stage2_unmap_range(mmu, entry->shadow_ipa, entry->size,
> + true);
> + kfree(entry);
> + }
> + mtree_destroy(&mmu->nested_mmu_mt);
> mmu->pending_unmap = false;
> }
> write_unlock(&vcpu->kvm->mmu_lock);
--
Thanks,
Gk