linux-kernel - [PATCH RFC v3 33/35] KVM: arm64: mte: Introduce VM_MTE

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240125164256.4147-34-alexandru.elisei@arm.com>
Date: Thu, 25 Jan 2024 16:42:54 +0000
From: Alexandru Elisei <alexandru.elisei@....com>
To: catalin.marinas@....com,
	will@...nel.org,
	oliver.upton@...ux.dev,
	maz@...nel.org,
	james.morse@....com,
	suzuki.poulose@....com,
	yuzenghui@...wei.com,
	arnd@...db.de,
	akpm@...ux-foundation.org,
	mingo@...hat.com,
	peterz@...radead.org,
	juri.lelli@...hat.com,
	vincent.guittot@...aro.org,
	dietmar.eggemann@....com,
	rostedt@...dmis.org,
	bsegall@...gle.com,
	mgorman@...e.de,
	bristot@...hat.com,
	vschneid@...hat.com,
	mhiramat@...nel.org,
	rppt@...nel.org,
	hughd@...gle.com
Cc: pcc@...gle.com,
	steven.price@....com,
	anshuman.khandual@....com,
	vincenzo.frascino@....com,
	david@...hat.com,
	eugenis@...gle.com,
	kcc@...gle.com,
	hyesoo.yu@...sung.com,
	linux-arm-kernel@...ts.infradead.org,
	linux-kernel@...r.kernel.org,
	kvmarm@...ts.linux.dev,
	linux-fsdevel@...r.kernel.org,
	linux-arch@...r.kernel.org,
	linux-mm@...ck.org,
	linux-trace-kernel@...r.kernel.org
Subject: [PATCH RFC v3 33/35] KVM: arm64: mte: Introduce VM_MTE_KVM VMA flag

Tag storage pages mapped by the host in a VM with MTE enabled are migrated
when they are first accessed by the guest. This introduces latency spikes
for memory accesses made by the guest.

Tag storage pages can be mapped in the guest memory when the VM_MTE VMA
flag is not set. Introduce a new VMA flag, VM_MTE_KVM, to stop tag storage
pages from being mapped in a VM with MTE enabled.

The flag is different from VM_MTE, because the pages from the VMA won't be
mapped as tagged in the host, and the host's userspace can continue to access
the guest memory as untagged. The flag's only function is to instruct the
page allocator to treat the allocation as tagged, so tag storage pages
aren't used. The page allocator will also try to reserve tag storage for
the new page, which can speed up stage 2 aborts further if the VMM has
accessed the memory before the guest. For example, qemu and kvmtool will
benefit from this change because the guest image is copied after the
memslot is created.

Signed-off-by: Alexandru Elisei <alexandru.elisei@....com>
---

Changes since rfc v2:

* New patch.

 arch/arm64/kvm/mmu.c  | 77 ++++++++++++++++++++++++++++++++++++++++++-
 arch/arm64/mm/fault.c |  2 +-
 include/linux/mm.h    |  2 ++
 3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 986a9544228d..45c57c4b9fe2 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1420,7 +1420,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	unsigned long mmu_seq;
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
-	struct vm_area_struct *vma;
+	struct vm_area_struct *vma, *old_vma;
 	short vma_shift;
 	gfn_t gfn;
 	kvm_pfn_t pfn;
@@ -1428,6 +1428,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	long vma_pagesize, fault_granule;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
+	bool vma_has_kvm_mte = false;
 
 	if (fault_is_perm)
 		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
@@ -1506,6 +1507,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 	gfn = fault_ipa >> PAGE_SHIFT;
 	mte_allowed = kvm_vma_mte_allowed(vma);
+	vma_has_kvm_mte = !!(vma->vm_flags & VM_MTE_KVM);
+	old_vma = vma;
 
 	/* Don't use the VMA after the unlock -- it may have vanished */
 	vma = NULL;
@@ -1521,6 +1524,27 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	mmap_read_unlock(current->mm);
 
+	/*
+	 * If the VMA was created after the memslot, it doesn't have the
+	 * VM_MTE_KVM flag set.
+	 */
+	if (unlikely(tag_storage_enabled() && !fault_is_perm &&
+	    kvm_has_mte(kvm) && mte_allowed && !vma_has_kvm_mte)) {
+		mmap_write_lock(current->mm);
+		vma = vma_lookup(current->mm, hva);
+		/* The VMA was changed, replay the fault. */
+		if (vma != old_vma) {
+			mmap_write_unlock(current->mm);
+			return 0;
+		}
+		if (!(vma->vm_flags & VM_MTE_KVM)) {
+			vma_start_write(vma);
+			vm_flags_reset(vma, vma->vm_flags | VM_MTE_KVM);
+		}
+		vma = NULL;
+		mmap_write_unlock(current->mm);
+	}
+
 	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
 				   write_fault, &writable, NULL);
 
@@ -1986,6 +2010,40 @@ int __init kvm_mmu_init(u32 *hyp_va_bits)
 	return err;
 }
 
+/* Set/clear VM_MTE_KVM on kvm_vma_mte_allowed() VMAs overlapping @memslot. */
+static int kvm_set_clear_kvm_mte_vma(const struct kvm_memory_slot *memslot, bool set)
+{
+	struct vm_area_struct *vma;
+	hva_t hva, memslot_end;
+
+	hva = memslot->userspace_addr;
+	memslot_end = hva + (memslot->npages << PAGE_SHIFT);
+
+	mmap_write_lock(current->mm);
+
+	do {
+		vma = find_vma_intersection(current->mm, hva, memslot_end);
+		if (!vma)
+			break;
+		/* Advance first so a skipped VMA cannot be looked up again. */
+		hva = min(memslot_end, vma->vm_end);
+		if (!kvm_vma_mte_allowed(vma))
+			continue;
+		if (set) {
+			if (!(vma->vm_flags & VM_MTE_KVM)) {
+				vma_start_write(vma);
+				vm_flags_reset(vma, vma->vm_flags | VM_MTE_KVM);
+			}
+		} else if (vma->vm_flags & VM_MTE_KVM) {
+			vma_start_write(vma);
+			vm_flags_reset(vma, vma->vm_flags & ~VM_MTE_KVM);
+		}
+	} while (hva < memslot_end);
+	mmap_write_unlock(current->mm);
+
+	return 0;
+}
+
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *old,
 				   const struct kvm_memory_slot *new,
@@ -1993,6 +2051,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 {
 	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
 
+	if (kvm_has_mte(kvm) && change != KVM_MR_FLAGS_ONLY) {
+		switch (change) {
+		case KVM_MR_CREATE:
+			kvm_set_clear_kvm_mte_vma(new, true);
+			break;
+		case KVM_MR_DELETE:
+			kvm_set_clear_kvm_mte_vma(old, false);
+			break;
+		case KVM_MR_MOVE:
+			kvm_set_clear_kvm_mte_vma(old, false);
+			kvm_set_clear_kvm_mte_vma(new, true);
+			break;
+		default:
+			WARN(true, "Unknown memslot change");
+		}
+	}
+
 	/*
 	 * At this point memslot has been committed and there is an
 	 * allocated dirty_bitmap[], dirty pages will be tracked while the
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 5c12232bdf0b..f4ca3ba8dde7 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -947,7 +947,7 @@ NOKPROBE_SYMBOL(do_debug_exception);
  */
 gfp_t arch_calc_vma_gfp(struct vm_area_struct *vma, gfp_t gfp)
 {
-	if (vma->vm_flags & VM_MTE)
+	if (vma->vm_flags & (VM_MTE | VM_MTE_KVM))
 		return __GFP_TAGGED;
 	return 0;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..924aa7c26ec9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -375,9 +375,11 @@ extern unsigned int kobjsize(const void *objp);
 #if defined(CONFIG_ARM64_MTE)
 # define VM_MTE		VM_HIGH_ARCH_0	/* Use Tagged memory for access control */
 # define VM_MTE_ALLOWED	VM_HIGH_ARCH_1	/* Tagged memory permitted */
+# define VM_MTE_KVM	VM_HIGH_ARCH_2	/* VMA is mapped in a virtual machine with MTE */
 #else
 # define VM_MTE		VM_NONE
 # define VM_MTE_ALLOWED	VM_NONE
+# define VM_MTE_KVM	VM_NONE
 #endif
 
 #ifndef VM_GROWSUP
-- 
2.43.0