Message-ID: <20250815133725.1591863-1-vkuznets@redhat.com>
Date: Fri, 15 Aug 2025 15:37:25 +0200
From: Vitaly Kuznetsov <vkuznets@...hat.com>
To: linux-hyperv@...r.kernel.org
Cc: "K. Y. Srinivasan" <kys@...rosoft.com>,
Haiyang Zhang <haiyangz@...rosoft.com>,
Wei Liu <wei.liu@...nel.org>,
Dexuan Cui <decui@...rosoft.com>,
x86@...nel.org,
linux-kernel@...r.kernel.org,
Nuno Das Neves <nunodasneves@...ux.microsoft.com>,
Tianyu Lan <tiala@...rosoft.com>,
Michael Kelley <mhklinux@...look.com>,
Li Tian <litian@...hat.com>,
Philipp Rudo <prudo@...hat.com>
Subject: [PATCH] x86/hyperv: Fix kdump on Azure CVMs
Azure CVM instance types featuring a paravisor hang upon kdump. The
investigation shows that makedumpfile causes a hang when it steps on a page
which was previously shared with the host
(HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY). The new kernel has no
knowledge of these 'special' regions (which are Vmbus connection pages,
GPADL buffers, ...). There are several ways to approach the issue:
- Convey the knowledge about these regions to the new kernel somehow.
- Unshare these regions before accessing them in the new kernel (it is unclear
if there's a way to query the status for a given GPA range).
- Unshare these regions before jumping to the new kernel (which this patch
implements).
To make the procedure as robust as possible, store PFN ranges of shared
regions in a linked list instead of storing GVAs and re-using
hv_vtom_set_host_visibility(). This also avoids memory allocation
on the kdump/kexec path.
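To illustrate just the range-coalescing idea, here is a standalone userspace
sketch (not kernel code; 'struct region' and 'track_pfn' are made-up names,
and locking, removal and the actual hypercalls are omitted -- the real
implementation is hv_list_enc_add() in the diff below):
  /*
   * Each shared PFN is folded into a contiguous [pfn, pfn + count) region,
   * so revoking access later only needs a list walk.
   */
  #include <stdio.h>
  #include <stdlib.h>

  struct region {
          struct region *next;
          unsigned long long pfn;
          int count;
  };

  static struct region *head;

  static int track_pfn(unsigned long long pfn)
  {
          struct region *r;

          for (r = head; r; r = r->next) {
                  if (pfn >= r->pfn && pfn < r->pfn + r->count)
                          return 0;                /* already tracked */
                  if (pfn == r->pfn + r->count) {  /* grow region up */
                          r->count++;
                          return 0;
                  }
                  if (pfn + 1 == r->pfn) {         /* grow region down */
                          r->pfn--;
                          r->count++;
                          return 0;
                  }
          }

          r = calloc(1, sizeof(*r));               /* no adjacent region found */
          if (!r)
                  return -1;
          r->pfn = pfn;
          r->count = 1;
          r->next = head;
          head = r;
          return 0;
  }

  int main(void)
  {
          unsigned long long pfns[] = { 100, 101, 99, 500 };
          struct region *r;
          size_t i;

          for (i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++)
                  track_pfn(pfns[i]);
          /* prints one region covering 99..101 and one covering 500 */
          for (r = head; r; r = r->next)
                  printf("region: pfn %llu, count %d\n", r->pfn, r->count);
          return 0;
  }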
The patch skips implementing a weird corner case in hv_list_enc_remove()
when a PFN in the middle of a region is unshared. First, it is unlikely
that such requests happen. Second, it is not a big problem if
hv_list_enc_remove() doesn't actually remove some regions as this will
only result in an extra hypercall doing nothing upon kexec/kdump; there's
no need to be perfect.
Signed-off-by: Vitaly Kuznetsov <vkuznets@...hat.com>
---
arch/x86/hyperv/ivm.c | 153 ++++++++++++++++++++++++++++++++
arch/x86/include/asm/mshyperv.h | 2 +
drivers/hv/vmbus_drv.c | 2 +
3 files changed, 157 insertions(+)
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index ade6c665c97e..a6e614672836 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -462,6 +462,150 @@ void hv_ivm_msr_read(u64 msr, u64 *value)
hv_ghcb_msr_read(msr, value);
}
+/*
+ * Keep track of the PFN regions which were shared with the host. The access
+ * must be revoked upon kexec/kdump (see hv_ivm_clear_host_access()).
+ */
+struct hv_enc_pfn_region {
+ struct list_head list;
+ u64 pfn;
+ int count;
+};
+
+static LIST_HEAD(hv_list_enc);
+static DEFINE_RAW_SPINLOCK(hv_list_enc_lock);
+
+static int hv_list_enc_add(const u64 *pfn_list, int count)
+{
+ struct hv_enc_pfn_region *ent;
+ unsigned long flags;
+ bool found = false;
+ u64 pfn;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ pfn = pfn_list[i];
+
+ found = false;
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn)) {
+ /* Nothing to do - pfn is already in the list */
+ found = true;
+ } else if (ent->pfn + ent->count == pfn) {
+ /* Grow existing region up */
+ found = true;
+ ent->count++;
+ } else if (pfn + 1 == ent->pfn) {
+ /* Grow existing region down */
+ found = true;
+ ent->pfn--;
+ ent->count++;
+ }
+ }
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+ if (found)
+ continue;
+
+ /* No adjacent region found -- create a new one */
+ ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->pfn = pfn;
+ ent->count = 1;
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_add(&ent->list, &hv_list_enc);
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ }
+
+ return 0;
+}
+
+static void hv_list_enc_remove(const u64 *pfn_list, int count)
+{
+ struct hv_enc_pfn_region *ent, *t;
+ unsigned long flags;
+ u64 pfn;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ pfn = pfn_list[i];
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_for_each_entry_safe(ent, t, &hv_list_enc, list) {
+ if (pfn == ent->pfn + ent->count - 1) {
+ /* Removing tail pfn */
+ ent->count--;
+ if (!ent->count) {
+ list_del(&ent->list);
+ kfree(ent);
+ }
+ } else if (pfn == ent->pfn) {
+ /* Removing head pfn */
+ ent->count--;
+ ent->pfn++;
+ if (!ent->count) {
+ list_del(&ent->list);
+ kfree(ent);
+ }
+ }
+
+ /*
+ * Removing PFNs in the middle of a region is not implemented; the
+ * list is currently only used for cleanup upon kexec and there's
+ * no harm done if we issue an extra unneeded hypercall making some
+ * region encrypted when it already is.
+ */
+ }
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ }
+}
+
+void hv_ivm_clear_host_access(void)
+{
+ struct hv_gpa_range_for_visibility *input;
+ struct hv_enc_pfn_region *ent;
+ unsigned long flags;
+ u64 hv_status;
+ int cur, i;
+
+ if (!hv_is_isolation_supported())
+ return;
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ if (!input)
+ goto unlock;
+
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ for (i = 0, cur = 0; i < ent->count; i++) {
+ input->gpa_page_list[cur] = ent->pfn + i;
+ cur++;
+
+ if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) {
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->host_visibility = VMBUS_PAGE_NOT_VISIBLE;
+ input->reserved0 = 0;
+ input->reserved1 = 0;
+ hv_status = hv_do_rep_hypercall(
+ HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY,
+ cur, 0, input, NULL);
+ WARN_ON_ONCE(!hv_result_success(hv_status));
+ cur = 0;
+ }
+ }
+
+ }
+
+unlock:
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+}
+EXPORT_SYMBOL_GPL(hv_ivm_clear_host_access);
+
/*
* hv_mark_gpa_visibility - Set pages visible to host via hvcall.
*
@@ -475,6 +619,7 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
struct hv_gpa_range_for_visibility *input;
u64 hv_status;
unsigned long flags;
+ int ret;
/* no-op if partition isolation is not enabled */
if (!hv_is_isolation_supported())
@@ -486,6 +631,14 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
return -EINVAL;
}
+ if (visibility == VMBUS_PAGE_NOT_VISIBLE) {
+ hv_list_enc_remove(pfn, count);
+ } else {
+ ret = hv_list_enc_add(pfn, count);
+ if (ret)
+ return ret;
+ }
+
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index abc4659f5809..6a988001e46f 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -263,10 +263,12 @@ static inline int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip,
void hv_vtom_init(void);
void hv_ivm_msr_write(u64 msr, u64 value);
void hv_ivm_msr_read(u64 msr, u64 *value);
+void hv_ivm_clear_host_access(void);
#else
static inline void hv_vtom_init(void) {}
static inline void hv_ivm_msr_write(u64 msr, u64 value) {}
static inline void hv_ivm_msr_read(u64 msr, u64 *value) {}
+static inline void hv_ivm_clear_host_access(void) {}
#endif
static inline bool hv_is_synic_msr(unsigned int reg)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 2ed5a1e89d69..2e891e108218 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2784,6 +2784,7 @@ static void hv_kexec_handler(void)
/* Make sure conn_state is set as hv_synic_cleanup checks for it */
mb();
cpuhp_remove_state(hyperv_cpuhp_online);
+ hv_ivm_clear_host_access();
};
static void hv_crash_handler(struct pt_regs *regs)
@@ -2799,6 +2800,7 @@ static void hv_crash_handler(struct pt_regs *regs)
cpu = smp_processor_id();
hv_stimer_cleanup(cpu);
hv_synic_disable_regs(cpu);
+ hv_ivm_clear_host_access();
};
static int hv_synic_suspend(void)
--
2.50.0