Message-ID: <20250821151655.3051386-1-vkuznets@redhat.com>
Date: Thu, 21 Aug 2025 17:16:55 +0200
From: Vitaly Kuznetsov <vkuznets@...hat.com>
To: linux-hyperv@...r.kernel.org,
	Michael Kelley <mhklinux@...look.com>
Cc: "K. Y. Srinivasan" <kys@...rosoft.com>,
	Haiyang Zhang <haiyangz@...rosoft.com>,
	Wei Liu <wei.liu@...nel.org>,
	Dexuan Cui <decui@...rosoft.com>,
	x86@...nel.org,
	linux-kernel@...r.kernel.org,
	Nuno Das Neves <nunodasneves@...ux.microsoft.com>,
	Tianyu Lan <tiala@...rosoft.com>,
	Li Tian <litian@...hat.com>,
	Philipp Rudo <prudo@...hat.com>
Subject: [PATCH v3] x86/hyperv: Fix kdump on Azure CVMs

Azure CVM instance types featuring a paravisor hang upon kdump. The
investigation shows that makedumpfile causes a hang when it steps on a page
which was previously shared with the host
(HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY). The new kernel has no
knowledge of these 'special' regions (which are Vmbus connection pages,
GPADL buffers, ...). There are several ways to approach the issue:
- Convey the knowledge about these regions to the new kernel somehow.
- Unshare these regions before accessing them in the new kernel (it is
unclear if there's a way to query the status for a given GPA range).
- Unshare these regions before jumping to the new kernel (which this patch
implements).

To make the procedure as robust as possible, store PFN ranges of shared
regions in a linked list instead of storing GVAs and re-using
hv_vtom_set_host_visibility(). This also avoids memory allocation
on the kdump/kexec path.
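
For illustration, the add path amounts to the following (a minimal
userspace sketch of the same logic with hypothetical names, not the
kernel code itself): a PFN already covered by a region is ignored, a
PFN adjacent to a region grows it, and anything else starts a new
region.

  /* Minimal userspace model of the shared-PFN region list. */
  #include <stdio.h>
  #include <stdlib.h>

  struct region {
      struct region *next;
      unsigned long long pfn; /* first PFN of the region */
      int count;              /* number of consecutive PFNs */
  };

  static struct region *head;

  /* Track one shared PFN, growing an adjacent region when possible. */
  static int region_add(unsigned long long pfn)
  {
      struct region *r;

      /* Already covered by an existing region? Nothing to do. */
      for (r = head; r; r = r->next)
          if (pfn >= r->pfn && pfn < r->pfn + r->count)
              return 0;

      /* Adjacent to an existing region? Grow it up or down. */
      for (r = head; r; r = r->next) {
          if (r->pfn + r->count == pfn) {
              r->count++;
              return 0;
          }
          if (pfn + 1 == r->pfn) {
              r->pfn--;
              r->count++;
              return 0;
          }
      }

      /* No adjacent region found -- create a new one. */
      r = calloc(1, sizeof(*r));
      if (!r)
          return -1;
      r->pfn = pfn;
      r->count = 1;
      r->next = head;
      head = r;
      return 0;
  }

  int main(void)
  {
      /* 0x100..0x102 collapse into one region; 0x200 starts another. */
      region_add(0x100);
      region_add(0x101);
      region_add(0x102);
      region_add(0x200);
      for (struct region *r = head; r; r = r->next)
          printf("region: pfn 0x%llx, count %d\n", r->pfn, r->count);
      return 0;
  }

As in the kernel version, growing a region can make it touch another
one; merging is intentionally left out, and a PFN never ends up in two
regions.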

hv_list_enc_remove() also covers the corner case when a PFN in the
middle of a region is unshared: the region is split in two. Such
requests are unlikely to happen, but handling them keeps the PFN
accounting exact, so correctness does not depend on the idempotency of
the HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY hypercall or on extra
hypercalls doing nothing upon kexec/kdump.
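
The split arithmetic for that case, as a self-contained userspace
sketch (hypothetical names; the kernel does the same math under
hv_list_enc_lock): removing 'pfn' from [start, start + count) leaves
[start, pfn) and [pfn + 1, start + count).

  #include <assert.h>

  struct range { unsigned long long pfn; int count; };

  /* Cut 'pfn' out of *r; *tail receives the upper remainder. */
  static void range_split(struct range *r, unsigned long long pfn,
                          struct range *tail)
  {
      /* Caller guarantees r->pfn < pfn < r->pfn + r->count - 1. */
      tail->pfn = pfn + 1;
      tail->count = r->count - (int)(pfn - r->pfn + 1);
      r->count = (int)(pfn - r->pfn);
  }

  int main(void)
  {
      struct range r = { 0x100, 10 }, tail; /* covers 0x100..0x109 */

      range_split(&r, 0x105, &tail);
      assert(r.pfn == 0x100 && r.count == 5);       /* 0x100..0x104 */
      assert(tail.pfn == 0x106 && tail.count == 4); /* 0x106..0x109 */
      return 0;
  }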

Signed-off-by: Vitaly Kuznetsov <vkuznets@...hat.com>
---
Changes since v2 [Michael Kelley]:
 - Rebase to hyperv-next.
 - Move the hv_ivm_clear_host_access() call to hyperv_cleanup(). This
   also makes the ARM stub unneeded.
 - Implement the missing corner case in hv_list_enc_remove(). With this,
   the math should (hopefully!) always be correct so we don't rely on
   the idempotency of the HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY
   hypercall. As the case is not something we see in practice, I tested
   the code with a few synthetic tests.
 - Fix the math in hv_list_enc_remove() (count -> ent->count).
 - Typos.
---
 arch/x86/hyperv/hv_init.c       |   3 +
 arch/x86/hyperv/ivm.c           | 213 ++++++++++++++++++++++++++++++--
 arch/x86/include/asm/mshyperv.h |   2 +
 3 files changed, 210 insertions(+), 8 deletions(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 2979d15223cf..4bb1578237eb 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -596,6 +596,9 @@ void hyperv_cleanup(void)
 	union hv_x64_msr_hypercall_contents hypercall_msr;
 	union hv_reference_tsc_msr tsc_msr;
 
+	/* Retract host access to shared memory on isolation VMs */
+	hv_ivm_clear_host_access();
+
 	/* Reset our OS id */
 	wrmsrq(HV_X64_MSR_GUEST_OS_ID, 0);
 	hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 3084ae8a3eed..0d74156ad6a7 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -462,6 +462,188 @@ void hv_ivm_msr_read(u64 msr, u64 *value)
 		hv_ghcb_msr_read(msr, value);
 }
 
+/*
+ * Keep track of the PFN regions which were shared with the host. The access
+ * must be revoked upon kexec/kdump (see hv_ivm_clear_host_access()).
+ */
+struct hv_enc_pfn_region {
+	struct list_head list;
+	u64 pfn;
+	int count;
+};
+
+static LIST_HEAD(hv_list_enc);
+static DEFINE_RAW_SPINLOCK(hv_list_enc_lock);
+
+static int hv_list_enc_add(const u64 *pfn_list, int count)
+{
+	struct hv_enc_pfn_region *ent;
+	unsigned long flags;
+	u64 pfn;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		pfn = pfn_list[i];
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		/* Check if the PFN already exists in some region first */
+		list_for_each_entry(ent, &hv_list_enc, list) {
+			if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn))
+				/* Nothing to do - pfn is already in the list */
+				goto unlock_done;
+		}
+
+		/*
+		 * Check if the PFN is adjacent to an existing region. Growing
+		 * a region can make it adjacent to another one but merging is
+		 * not (yet) implemented for simplicity. A PFN cannot be added
+		 * to two regions to keep the logic in hv_list_enc_remove()
+		 * correct.
+		 */
+		list_for_each_entry(ent, &hv_list_enc, list) {
+			if (ent->pfn + ent->count == pfn) {
+				/* Grow existing region up */
+				ent->count++;
+				goto unlock_done;
+			} else if (pfn + 1 == ent->pfn) {
+				/* Grow existing region down */
+				ent->pfn--;
+				ent->count++;
+				goto unlock_done;
+			}
+		}
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+		/* No adjacent region found -- create a new one */
+		ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+		if (!ent)
+			return -ENOMEM;
+
+		ent->pfn = pfn;
+		ent->count = 1;
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		list_add(&ent->list, &hv_list_enc);
+
+unlock_done:
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+	}
+
+	return 0;
+}
+
+static void hv_list_enc_remove(const u64 *pfn_list, int count)
+{
+	struct hv_enc_pfn_region *ent, *t;
+	struct hv_enc_pfn_region new_region;
+	unsigned long flags;
+	u64 pfn;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		pfn = pfn_list[i];
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		list_for_each_entry_safe(ent, t, &hv_list_enc, list) {
+			if (pfn == ent->pfn + ent->count - 1) {
+				/* Removing tail pfn */
+				ent->count--;
+				if (!ent->count) {
+					list_del(&ent->list);
+					kfree(ent);
+				}
+				goto unlock_done;
+			} else if (pfn == ent->pfn) {
+				/* Removing head pfn */
+				ent->count--;
+				ent->pfn++;
+				if (!ent->count) {
+					list_del(&ent->list);
+					kfree(ent);
+				}
+				goto unlock_done;
+			} else if (pfn > ent->pfn && pfn < ent->pfn + ent->count - 1) {
+				/*
+				 * Removing a pfn in the middle. Cut off the tail
+				 * of the existing region and create a template for
+				 * the new one.
+				 */
+				new_region.pfn = pfn + 1;
+				new_region.count = ent->count - (pfn - ent->pfn + 1);
+				ent->count = pfn - ent->pfn;
+				goto unlock_split;
+			}
+
+		}
+unlock_done:
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+		continue;
+
+unlock_split:
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+		ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+		/*
+		 * There is no apparent good way to recover from an -ENOMEM
+		 * situation; the accounting is going to be wrong either way.
+		 * Proceed with the rest of the list to make it 'less wrong'.
+		 */
+		if (WARN_ON_ONCE(!ent))
+			continue;
+
+		ent->pfn = new_region.pfn;
+		ent->count = new_region.count;
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		list_add(&ent->list, &hv_list_enc);
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+	}
+}
+
+void hv_ivm_clear_host_access(void)
+{
+	struct hv_gpa_range_for_visibility *input;
+	struct hv_enc_pfn_region *ent;
+	unsigned long flags;
+	u64 hv_status;
+	int batch_size, cur, i;
+
+	if (!hv_is_isolation_supported())
+		return;
+
+	raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+
+	batch_size = MIN(hv_setup_in_array(&input, sizeof(*input),
+					   sizeof(input->gpa_page_list[0])),
+			 HV_MAX_MODIFY_GPA_REP_COUNT);
+	if (unlikely(!input))
+		goto unlock;
+
+	list_for_each_entry(ent, &hv_list_enc, list) {
+		for (i = 0, cur = 0; i < ent->count; i++) {
+			input->gpa_page_list[cur] = ent->pfn + i;
+			cur++;
+
+			if (cur == batch_size || i == ent->count - 1) {
+				input->partition_id = HV_PARTITION_ID_SELF;
+				input->host_visibility = VMBUS_PAGE_NOT_VISIBLE;
+				input->reserved0 = 0;
+				input->reserved1 = 0;
+				hv_status = hv_do_rep_hypercall(
+					HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY,
+					cur, 0, input, NULL);
+				WARN_ON_ONCE(!hv_result_success(hv_status));
+				cur = 0;
+			}
+		}
+
+	}
+
+unlock:
+	raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+}
+EXPORT_SYMBOL_GPL(hv_ivm_clear_host_access);
+
 /*
  * hv_mark_gpa_visibility - Set pages visible to host via hvcall.
  *
@@ -476,24 +658,33 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 	u64 hv_status;
 	int batch_size;
 	unsigned long flags;
+	int ret;
 
 	/* no-op if partition isolation is not enabled */
 	if (!hv_is_isolation_supported())
 		return 0;
 
+	if (visibility == VMBUS_PAGE_NOT_VISIBLE) {
+		hv_list_enc_remove(pfn, count);
+	} else {
+		ret = hv_list_enc_add(pfn, count);
+		if (ret)
+			return ret;
+	}
+
 	local_irq_save(flags);
 	batch_size = hv_setup_in_array(&input, sizeof(*input),
 					sizeof(input->gpa_page_list[0]));
 	if (unlikely(!input)) {
-		local_irq_restore(flags);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto unlock;
 	}
 
 	if (count > batch_size) {
 		pr_err("Hyper-V: GPA count:%d exceeds supported:%u\n", count,
 		       batch_size);
-		local_irq_restore(flags);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto unlock;
 	}
 
 	input->partition_id = HV_PARTITION_ID_SELF;
@@ -502,12 +693,18 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 	hv_status = hv_do_rep_hypercall(
 			HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, count,
 			0, input, NULL);
-	local_irq_restore(flags);
-
 	if (hv_result_success(hv_status))
-		return 0;
+		ret = 0;
 	else
-		return -EFAULT;
+		ret = -EFAULT;
+
+unlock:
+	local_irq_restore(flags);
+
+	if (ret)
+		hv_list_enc_remove(pfn, count);
+
+	return ret;
 }
 
 /*
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index abc4659f5809..6a988001e46f 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -263,10 +263,12 @@ static inline int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip,
 void hv_vtom_init(void);
 void hv_ivm_msr_write(u64 msr, u64 value);
 void hv_ivm_msr_read(u64 msr, u64 *value);
+void hv_ivm_clear_host_access(void);
 #else
 static inline void hv_vtom_init(void) {}
 static inline void hv_ivm_msr_write(u64 msr, u64 value) {}
 static inline void hv_ivm_msr_read(u64 msr, u64 *value) {}
+static inline void hv_ivm_clear_host_access(void) {}
 #endif
 
 static inline bool hv_is_synic_msr(unsigned int reg)
-- 
2.50.1

