lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Wed, 25 Mar 2020 12:24:23 +0800
From:   Keqian Zhu <zhukeqian1@...wei.com>
To:     <kvm@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
        <linux-arm-kernel@...ts.infradead.org>,
        <kvmarm@...ts.cs.columbia.edu>
CC:     Marc Zyngier <maz@...nel.org>, Paolo Bonzini <pbonzini@...hat.com>,
        "James Morse" <james.morse@....com>,
        Julien Thierry <julien.thierry.kdev@...il.com>,
        Will Deacon <will@...nel.org>,
        Suzuki K Poulose <suzuki.poulose@....com>,
        Sean Christopherson <sean.j.christopherson@...el.com>,
        Jay Zhou <jianjay.zhou@...wei.com>,
        <wanghaibin.wang@...wei.com>, Keqian Zhu <zhukeqian1@...wei.com>
Subject: [PATCH 3/3] KVM/arm64: Only set bits of dirty bitmap with valid translation entries

When KVM_DIRTY_LOG_INITIALLY_SET is enabled, we can report only those
pages that have valid translation entries to userspace, so userspace
doesn't need to zero-check the other pages during VM migration.

On the Huawei Kunpeng 920 2.6GHz platform, I ran some tests on 128G
Linux VMs with different page sizes.

Time taken to enable dirty logging (the memory pressure is 127GB):
Page size   Before      After
   4K        1.8ms      341ms
   2M        1.8ms       4ms
   1G        1.8ms       2ms

Time taken by migration (the memory pressure is 3GB and the migration
bandwidth is 500MB/s):
Page size   Before    After
   4K        21s       6s
   2M        21s       6s
   1G        21s       7s

Signed-off-by: Keqian Zhu <zhukeqian1@...wei.com>
---
 virt/kvm/arm/mmu.c | 161 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 137 insertions(+), 24 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 6c84de442a0e..0c7a5faf8609 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1413,34 +1413,85 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
 	return false;
 }
 
+enum s2_operation {
+	S2_OP_WP,  /* write protect page tables */
+	S2_OP_MD,  /* mark dirty bitmap in memslot */
+};
+
 /**
- * stage2_wp_ptes - write protect PMD range
+ * mark_range_dirty - mark a range of dirty bitmap
+ * @kvm:	kvm instance for the VM
+ * @addr:	range start address
+ * @end:	range end address
+ *
+ * note: addr and end should belong to the same memslot.
+ */
+static void mark_range_dirty(struct kvm *kvm,
+			     phys_addr_t addr,
+			     phys_addr_t end)
+{
+	gfn_t gfn;
+	unsigned int start, nbits;
+	struct kvm_memory_slot *memslot = NULL;
+
+	gfn = addr >> PAGE_SHIFT;
+	memslot = gfn_to_memslot(kvm, gfn);
+
+	if (memslot && memslot->dirty_bitmap) {
+		start = gfn - memslot->base_gfn;
+		nbits = DIV_ROUND_UP(end, PAGE_SIZE) - gfn;
+		bitmap_set(memslot->dirty_bitmap, start, nbits);
+	}
+}
+
+/**
+ * stage2_op_ptes - do an operation on PMD range
+ * @kvm:	kvm instance for the VM
+ * @op: 	the operation wanted
  * @pmd:	pointer to pmd entry
  * @addr:	range start address
  * @end:	range end address
  */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+static void stage2_op_ptes(struct kvm *kvm,
+			   enum s2_operation op,
+			   pmd_t *pmd,
+			   phys_addr_t addr,
+			   phys_addr_t end)
 {
 	pte_t *pte;
 
 	pte = pte_offset_kernel(pmd, addr);
 	do {
-		if (!pte_none(*pte)) {
+		if (pte_none(*pte))
+			continue;
+
+		switch (op) {
+		case S2_OP_WP:
 			if (!kvm_s2pte_readonly(pte))
 				kvm_set_s2pte_readonly(pte);
+			break;
+		case S2_OP_MD:
+			mark_range_dirty(kvm, addr, addr + PAGE_SIZE);
+			break;
+		default:
+			break;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 }
 
 /**
- * stage2_wp_pmds - write protect PUD range
- * kvm:		kvm instance for the VM
+ * stage2_op_pmds - do an operation on PUD range
+ * @kvm:	kvm instance for the VM
+ * @op: 	the operation wanted
  * @pud:	pointer to pud entry
  * @addr:	range start address
  * @end:	range end address
  */
-static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
-			   phys_addr_t addr, phys_addr_t end)
+static void stage2_op_pmds(struct kvm *kvm,
+			   enum s2_operation op,
+			   pud_t *pud,
+			   phys_addr_t addr,
+			   phys_addr_t end)
 {
 	pmd_t *pmd;
 	phys_addr_t next;
@@ -1449,25 +1500,40 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
 
 	do {
 		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd)) {
+		if (pmd_none(*pmd))
+			continue;
+
+		if (pmd_thp_or_huge(*pmd)) {
+			switch (op) {
+			case S2_OP_WP:
 				if (!kvm_s2pmd_readonly(pmd))
 					kvm_set_s2pmd_readonly(pmd);
-			} else {
-				stage2_wp_ptes(pmd, addr, next);
+				break;
+			case S2_OP_MD:
+				mark_range_dirty(kvm, addr, next);
+				break;
+			default:
+				break;
 			}
+		} else {
+			stage2_op_ptes(kvm, op, pmd, addr, next);
 		}
 	} while (pmd++, addr = next, addr != end);
 }
 
 /**
- * stage2_wp_puds - write protect PGD range
+ * stage2_op_puds - do an operation on PGD range
+ * @kvm:	kvm instance for the VM
+ * @op: 	the operation wanted
  * @pgd:	pointer to pgd entry
  * @addr:	range start address
  * @end:	range end address
  */
-static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
-			    phys_addr_t addr, phys_addr_t end)
+static void  stage2_op_puds(struct kvm *kvm,
+			    enum s2_operation op,
+			    pgd_t *pgd,
+			    phys_addr_t addr,
+			    phys_addr_t end)
 {
 	pud_t *pud;
 	phys_addr_t next;
@@ -1475,24 +1541,38 @@ static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
 	pud = stage2_pud_offset(kvm, pgd, addr);
 	do {
 		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud)) {
+		if (stage2_pud_none(kvm, *pud))
+			continue;
+
+		if (stage2_pud_huge(kvm, *pud)) {
+			switch (op) {
+			case S2_OP_WP:
 				if (!kvm_s2pud_readonly(pud))
 					kvm_set_s2pud_readonly(pud);
-			} else {
-				stage2_wp_pmds(kvm, pud, addr, next);
+				break;
+			case S2_OP_MD:
+				mark_range_dirty(kvm, addr, next);
+				break;
+			default:
+				break;
 			}
+		} else {
+			stage2_op_pmds(kvm, op, pud, addr, next);
 		}
 	} while (pud++, addr = next, addr != end);
 }
 
 /**
- * stage2_wp_range() - write protect stage2 memory region range
+ * stage2_op_range() - do an operation on stage2 memory region range
  * @kvm:	The KVM pointer
+ * @op: 	The operation wanted
  * @addr:	Start address of range
  * @end:	End address of range
  */
-static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+static void stage2_op_range(struct kvm *kvm,
+			    enum s2_operation op,
+			    phys_addr_t addr,
+			    phys_addr_t end)
 {
 	pgd_t *pgd;
 	phys_addr_t next;
@@ -1513,7 +1593,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 			break;
 		next = stage2_pgd_addr_end(kvm, addr, end);
 		if (stage2_pgd_present(kvm, *pgd))
-			stage2_wp_puds(kvm, pgd, addr, next);
+			stage2_op_puds(kvm, op, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
 
@@ -1543,11 +1623,44 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
 
 	spin_lock(&kvm->mmu_lock);
-	stage2_wp_range(kvm, start, end);
+	stage2_op_range(kvm, S2_OP_WP, start, end);
 	spin_unlock(&kvm->mmu_lock);
 	kvm_flush_remote_tlbs(kvm);
 }
 
+/**
+ * kvm_mmu_md_memory_region() - mark dirty bitmap for memory slot
+ * @kvm:	The KVM pointer
+ * @slot:	The memory slot to mark dirty
+ *
+ * Called to mark dirty bitmap after memory region KVM_MEM_LOG_DIRTY_PAGES
+ * operation is called and kvm_dirty_log_manual_protect_and_init_set is
+ * true. After this function returns, a bit of dirty bitmap is set if its
+ * corresponding page table (including PUD, PMD and PTEs) is present.
+ *
+ * Afterwards read of dirty page log can be called and present PUD, PMD and
+ * PTEs can be write protected by userspace manually.
+ *
+ * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
+ */
+static void kvm_mmu_md_memory_region(struct kvm *kvm, int slot)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
+	phys_addr_t start, end;
+
+	if (WARN_ON_ONCE(!memslot))
+		return;
+
+	start = memslot->base_gfn << PAGE_SHIFT;
+	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+	spin_lock(&kvm->mmu_lock);
+	stage2_op_range(kvm, S2_OP_MD, start, end);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 /**
  * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
  * @kvm:	The KVM pointer
@@ -1567,7 +1680,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
 	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
 
-	stage2_wp_range(kvm, start, end);
+	stage2_op_range(kvm, S2_OP_WP, start, end);
 }
 
 /*
@@ -2274,7 +2387,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 			 * write protect any pages because they're reported
 			 * as dirty here.
 			 */
-			bitmap_set(new->dirty_bitmap, 0, new->npages);
+			kvm_mmu_md_memory_region(kvm, mem->slot);
 		}
 	}
 }
-- 
2.19.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ