Date:   Tue, 12 Jan 2021 10:10:22 -0800
From:   Ben Gardon <bgardon@...gle.com>
To:     linux-kernel@...r.kernel.org, kvm@...r.kernel.org
Cc:     Paolo Bonzini <pbonzini@...hat.com>, Peter Xu <peterx@...hat.com>,
        Sean Christopherson <seanjc@...gle.com>,
        Peter Shier <pshier@...gle.com>,
        Peter Feiner <pfeiner@...gle.com>,
        Junaid Shahid <junaids@...gle.com>,
        Jim Mattson <jmattson@...gle.com>,
        Yulei Zhang <yulei.kernel@...il.com>,
        Wanpeng Li <kernellwp@...il.com>,
        Vitaly Kuznetsov <vkuznets@...hat.com>,
        Xiao Guangrong <xiaoguangrong.eric@...il.com>,
        Ben Gardon <bgardon@...gle.com>
Subject: [PATCH 05/24] kvm: x86/mmu: Fix yielding in TDP MMU

There are two problems with the way the TDP MMU yields in long-running
functions. 1.) Under certain conditions, a function may not yield
reliably or frequently enough. 2.) In some functions the TDP iterator
risks making no forward progress if two threads livelock by yielding to
one another.

Case 1 is possible if, for example, a paging structure is very large
but has few, if any, writable entries. wrprot_gfn_range could then
traverse many entries before finding a writable entry and yielding.

Case 2 is possible if two threads are executing wrprot_gfn_range
concurrently. Each could write protect an entry and then yield. Yielding
resets the tdp_iter's walk over the paging structure, so the loop ends
up repeating the same entry over and over, preventing either thread from
making forward progress.
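
Roughly, the pre-patch loop shape (wrprot_gfn_range shown, body elided)
that allows both problems looks like:

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		/* ... write protect the entry ... */
		spte_set = true;

		/*
		 * Old placement: yield after every processed entry. The
		 * yield resets the tdp_iter's walk to the root, so two
		 * threads can keep re-processing the same entry forever,
		 * and a sparse range may go a long time without yielding.
		 */
		tdp_mmu_iter_cond_resched(kvm, &iter);
	}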

Fix both issues by moving the yield to the beginning of the loop, before
the other checks, and only yielding if the loop has made forward
progress since the last yield.
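
As a simplified sketch (details elided; see the diff below for the real
change), the loops now take the following shape, yielding only once the
iterator's goal_gfn has advanced past the point of the last yield:

	gfn_t last_goal_gfn = start;

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
		/* Ensure forward progress has been made before yielding. */
		if (iter.goal_gfn != last_goal_gfn &&
		    tdp_mmu_iter_cond_resched(kvm, &iter)) {
			last_goal_gfn = iter.goal_gfn;
			/* The walk restarted from the root; keep going. */
			continue;
		}

		/* ... process the entry as before ... */
	}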

Fixes: a6a0b05da9f3 ("kvm: x86/mmu: Support dirty logging for the TDP MMU")
Reviewed-by: Peter Feiner <pfeiner@...gle.com>

Signed-off-by: Ben Gardon <bgardon@...gle.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 83 +++++++++++++++++++++++++++++++-------
 1 file changed, 69 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index b2784514ca2d..1987da0da66e 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -470,9 +470,23 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 			  gfn_t start, gfn_t end, bool can_yield)
 {
 	struct tdp_iter iter;
+	gfn_t last_goal_gfn = start;
 	bool flush_needed = false;
 
 	tdp_root_for_each_pte(iter, root, start, end) {
+		/* Ensure forward progress has been made before yielding. */
+		if (can_yield && iter.goal_gfn != last_goal_gfn &&
+		    tdp_mmu_iter_flush_cond_resched(kvm, &iter)) {
+			last_goal_gfn = iter.goal_gfn;
+			flush_needed = false;
+			/*
+			 * Yielding caused the paging structure walk to be
+			 * reset so skip to the next iteration to continue the
+			 * walk from the root.
+			 */
+			continue;
+		}
+
 		if (!is_shadow_present_pte(iter.old_spte))
 			continue;
 
@@ -487,12 +501,7 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 			continue;
 
 		tdp_mmu_set_spte(kvm, &iter, 0);
-
-		if (can_yield)
-			flush_needed = !tdp_mmu_iter_flush_cond_resched(kvm,
-									&iter);
-		else
-			flush_needed = true;
+		flush_needed = true;
 	}
 	return flush_needed;
 }
@@ -850,12 +859,25 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 {
 	struct tdp_iter iter;
 	u64 new_spte;
+	gfn_t last_goal_gfn = start;
 	bool spte_set = false;
 
 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
 
 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
 				   min_level, start, end) {
+		/* Ensure forward progress has been made before yielding. */
+		if (iter.goal_gfn != last_goal_gfn &&
+		    tdp_mmu_iter_cond_resched(kvm, &iter)) {
+			last_goal_gfn = iter.goal_gfn;
+			/*
+			 * Yielding caused the paging structure walk to be
+			 * reset so skip to the next iteration to continue the
+			 * walk from the root.
+			 */
+			continue;
+		}
+
 		if (!is_shadow_present_pte(iter.old_spte) ||
 		    !is_last_spte(iter.old_spte, iter.level))
 			continue;
@@ -864,8 +886,6 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
 		spte_set = true;
-
-		tdp_mmu_iter_cond_resched(kvm, &iter);
 	}
 	return spte_set;
 }
@@ -906,9 +926,22 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 {
 	struct tdp_iter iter;
 	u64 new_spte;
+	gfn_t last_goal_gfn = start;
 	bool spte_set = false;
 
 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
+		/* Ensure forward progress has been made before yielding. */
+		if (iter.goal_gfn != last_goal_gfn &&
+		    tdp_mmu_iter_cond_resched(kvm, &iter)) {
+			last_goal_gfn = iter.goal_gfn;
+			/*
+			 * Yielding caused the paging structure walk to be
+			 * reset so skip to the next iteration to continue the
+			 * walk from the root.
+			 */
+			continue;
+		}
+
 		if (spte_ad_need_write_protect(iter.old_spte)) {
 			if (is_writable_pte(iter.old_spte))
 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -923,8 +956,6 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
 		spte_set = true;
-
-		tdp_mmu_iter_cond_resched(kvm, &iter);
 	}
 	return spte_set;
 }
@@ -1029,9 +1060,22 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 {
 	struct tdp_iter iter;
 	u64 new_spte;
+	gfn_t last_goal_gfn = start;
 	bool spte_set = false;
 
 	tdp_root_for_each_pte(iter, root, start, end) {
+		/* Ensure forward progress has been made before yielding. */
+		if (iter.goal_gfn != last_goal_gfn &&
+		    tdp_mmu_iter_cond_resched(kvm, &iter)) {
+			last_goal_gfn = iter.goal_gfn;
+			/*
+			 * Yielding caused the paging structure walk to be
+			 * reset so skip to the next iteration to continue the
+			 * walk from the root.
+			 */
+			continue;
+		}
+
 		if (!is_shadow_present_pte(iter.old_spte))
 			continue;
 
@@ -1039,8 +1083,6 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
 		tdp_mmu_set_spte(kvm, &iter, new_spte);
 		spte_set = true;
-
-		tdp_mmu_iter_cond_resched(kvm, &iter);
 	}
 
 	return spte_set;
@@ -1078,9 +1120,23 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
 {
 	struct tdp_iter iter;
 	kvm_pfn_t pfn;
+	gfn_t last_goal_gfn = start;
 	bool spte_set = false;
 
 	tdp_root_for_each_pte(iter, root, start, end) {
+		/* Ensure forward progress has been made before yielding. */
+		if (iter.goal_gfn != last_goal_gfn &&
+		    tdp_mmu_iter_flush_cond_resched(kvm, &iter)) {
+			last_goal_gfn = iter.goal_gfn;
+			spte_set = false;
+			/*
+			 * Yielding caused the paging structure walk to be
+			 * reset so skip to the next iteration to continue the
+			 * walk from the root.
+			 */
+			continue;
+		}
+
 		if (!is_shadow_present_pte(iter.old_spte) ||
 		    is_last_spte(iter.old_spte, iter.level))
 			continue;
@@ -1091,8 +1147,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
 			continue;
 
 		tdp_mmu_set_spte(kvm, &iter, 0);
-
-		spte_set = !tdp_mmu_iter_flush_cond_resched(kvm, &iter);
+		spte_set = true;
 	}
 
 	if (spte_set)
-- 
2.30.0.284.gd98b1dd5eaa7-goog
