[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260120064230.3602565-14-mrathor@linux.microsoft.com>
Date: Mon, 19 Jan 2026 22:42:28 -0800
From: Mukesh R <mrathor@...ux.microsoft.com>
To: linux-kernel@...r.kernel.org,
linux-hyperv@...r.kernel.org,
linux-arm-kernel@...ts.infradead.org,
iommu@...ts.linux.dev,
linux-pci@...r.kernel.org,
linux-arch@...r.kernel.org
Cc: kys@...rosoft.com,
haiyangz@...rosoft.com,
wei.liu@...nel.org,
decui@...rosoft.com,
longli@...rosoft.com,
catalin.marinas@....com,
will@...nel.org,
tglx@...utronix.de,
mingo@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
hpa@...or.com,
joro@...tes.org,
lpieralisi@...nel.org,
kwilczynski@...nel.org,
mani@...nel.org,
robh@...nel.org,
bhelgaas@...gle.com,
arnd@...db.de,
nunodasneves@...ux.microsoft.com,
mhklinux@...look.com,
romank@...ux.microsoft.com
Subject: [PATCH v0 13/15] x86/hyperv: Basic interrupt support for direct attached devices
From: Mukesh Rathor <mrathor@...ux.microsoft.com>
As mentioned previously, a direct attached device must be referenced
via logical device id which is formed in the initial attach hypercall.
Interrupt mapping paths for direct attached devices are almost same,
except we must use logical device ids instead of the PCI device ids.
L1VH only supports direct attaches for passing thru devices to its guests,
and devices on L1VH are VMBus based. However, the interrupts are mapped
via the map interrupt hypercall and not the traditional method of VMBus
messages.
Partition id for the relevant hypercalls is tricky. This because a device
could be moving from root to guest and then back to the root. In case
of L1VH, it could be moving from system host to L1VH root to a guest,
then back to the L1VH root. So, it is carefully crafted by keeping
track of whether the call is on behalf of a VMM process, whether the
device is attached device (as opposed to mapped), and whether we are in
an L1VH root/parent. If VMM process, we assume it is on behalf of a
guest. Otherwise, the device is being attached or detached during boot
or shutdown of the privileged partition.
Lastly, a dummy cpu and vector is used to map interrupt for a direct
attached device. This because, once a device is marked for direct attach,
hypervisor will not let any interrupts be mapped to host. So it is mapped
to guest dummy cpu and dummy vector. This is then correctly mapped during
guest boot via the retarget paths.
Signed-off-by: Mukesh Rathor <mrathor@...ux.microsoft.com>
---
arch/arm64/include/asm/mshyperv.h | 15 +++++
arch/x86/hyperv/irqdomain.c | 57 +++++++++++++-----
arch/x86/include/asm/mshyperv.h | 4 ++
drivers/pci/controller/pci-hyperv.c | 91 +++++++++++++++++++++++++----
4 files changed, 142 insertions(+), 25 deletions(-)
diff --git a/arch/arm64/include/asm/mshyperv.h b/arch/arm64/include/asm/mshyperv.h
index b721d3134ab6..27da480f94f6 100644
--- a/arch/arm64/include/asm/mshyperv.h
+++ b/arch/arm64/include/asm/mshyperv.h
@@ -53,6 +53,21 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg)
return hv_get_msr(reg);
}
+struct irq_data;
+struct msi_msg;
+struct pci_dev;
+static inline void hv_irq_compose_msi_msg(struct irq_data *data,
+ struct msi_msg *msg) {};
+static inline int hv_unmap_msi_interrupt(struct pci_dev *pdev,
+ struct hv_interrupt_entry *hvirqe)
+{
+ return -EOPNOTSUPP;
+}
+static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
+{
+ return false;
+}
+
/* SMCCC hypercall parameters */
#define HV_SMCCC_FUNC_NUMBER 1
#define HV_FUNC_ID ARM_SMCCC_CALL_VAL( \
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 33017aa0caa4..e6eb457f791e 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -13,6 +13,16 @@
#include <linux/irqchip/irq-msi-lib.h>
#include <asm/mshyperv.h>
+/*
+ * For direct attached devices (which use logical device ids), hypervisor will
+ * not allow mappings to host. But VFIO needs to bind the interrupt at the very
+ * start before the guest cpu/vector is known. So we use dummy cpu and vector
+ * to bind in such case, and later when the guest starts, retarget will move it
+ * to correct guest cpu and vector.
+ */
+#define HV_DDA_DUMMY_CPU 0
+#define HV_DDA_DUMMY_VECTOR 32
+
static u64 hv_map_interrupt_hcall(u64 ptid, union hv_device_id hv_devid,
bool level, int cpu, int vector,
struct hv_interrupt_entry *ret_entry)
@@ -24,6 +34,11 @@ static u64 hv_map_interrupt_hcall(u64 ptid, union hv_device_id hv_devid,
u64 status;
int nr_bank, var_size;
+ if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL) {
+ cpu = HV_DDA_DUMMY_CPU;
+ vector = HV_DDA_DUMMY_VECTOR;
+ }
+
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -95,7 +110,8 @@ static int hv_map_interrupt(u64 ptid, union hv_device_id device_id, bool level,
return hv_result_to_errno(status);
}
-static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *irq_entry)
+static int hv_unmap_interrupt(union hv_device_id hv_devid,
+ struct hv_interrupt_entry *irq_entry)
{
unsigned long flags;
struct hv_input_unmap_device_interrupt *input;
@@ -103,10 +119,14 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *irq_entry)
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
-
memset(input, 0, sizeof(*input));
- input->partition_id = hv_current_partition_id;
- input->device_id = id;
+
+ if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL)
+ input->partition_id = hv_iommu_get_curr_partid();
+ else
+ input->partition_id = hv_current_partition_id;
+
+ input->device_id = hv_devid.as_uint64;
input->interrupt_entry = *irq_entry;
status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
@@ -263,6 +283,7 @@ static u64 hv_build_irq_devid(struct pci_dev *pdev)
int hv_map_msi_interrupt(struct irq_data *data,
struct hv_interrupt_entry *out_entry)
{
+ u64 ptid;
struct irq_cfg *cfg = irqd_cfg(data);
struct hv_interrupt_entry dummy;
union hv_device_id hv_devid;
@@ -275,8 +296,17 @@ int hv_map_msi_interrupt(struct irq_data *data,
hv_devid.as_uint64 = hv_build_irq_devid(pdev);
cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
- return hv_map_interrupt(hv_current_partition_id, hv_devid, false, cpu,
- cfg->vector, out_entry ? out_entry : &dummy);
+ if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL)
+ if (hv_pcidev_is_attached_dev(pdev))
+ ptid = hv_iommu_get_curr_partid();
+ else
+ /* Device actually on l1vh root, not passthru'd to vm */
+ ptid = hv_current_partition_id;
+ else
+ ptid = hv_current_partition_id;
+
+ return hv_map_interrupt(ptid, hv_devid, false, cpu, cfg->vector,
+ out_entry ? out_entry : &dummy);
}
EXPORT_SYMBOL_GPL(hv_map_msi_interrupt);
@@ -289,10 +319,7 @@ static void entry_to_msi_msg(struct hv_interrupt_entry *entry,
msg->data = entry->msi_entry.data.as_uint32;
}
-static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
- struct hv_interrupt_entry *irq_entry);
-
-static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
struct hv_interrupt_entry *stored_entry;
struct irq_cfg *cfg = irqd_cfg(data);
@@ -341,16 +368,18 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
data->chip_data = stored_entry;
entry_to_msi_msg(data->chip_data, msg);
}
+EXPORT_SYMBOL_GPL(hv_irq_compose_msi_msg);
-static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
- struct hv_interrupt_entry *irq_entry)
+int hv_unmap_msi_interrupt(struct pci_dev *pdev,
+ struct hv_interrupt_entry *irq_entry)
{
union hv_device_id hv_devid;
hv_devid.as_uint64 = hv_build_irq_devid(pdev);
- return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
+ return hv_unmap_interrupt(hv_devid, irq_entry);
}
+EXPORT_SYMBOL_GPL(hv_unmap_msi_interrupt);
/* NB: during map, hv_interrupt_entry is saved via data->chip_data */
static void hv_teardown_msi_irq(struct pci_dev *pdev, struct irq_data *irqd)
@@ -486,7 +515,7 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
hv_devid.device_type = HV_DEVICE_TYPE_IOAPIC;
hv_devid.ioapic.ioapic_id = (u8)ioapic_id;
- return hv_unmap_interrupt(hv_devid.as_uint64, entry);
+ return hv_unmap_interrupt(hv_devid, entry);
}
EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index e4ccdbbf1d12..b6facd3a0f5e 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -204,11 +204,15 @@ static inline u64 hv_iommu_get_curr_partid(void)
#endif /* CONFIG_HYPERV_IOMMU */
u64 hv_pci_vmbus_device_id(struct pci_dev *pdev);
+void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg);
+extern bool hv_no_attdev;
struct irq_domain *hv_create_pci_msi_domain(void);
int hv_map_msi_interrupt(struct irq_data *data,
struct hv_interrupt_entry *out_entry);
+int hv_unmap_msi_interrupt(struct pci_dev *dev,
+ struct hv_interrupt_entry *hvirqe);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
struct hv_interrupt_entry *entry);
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 40f0b06bb966..71d1599dc4a8 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -660,15 +660,17 @@ static void hv_irq_retarget_interrupt(struct irq_data *data)
params = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(params, 0, sizeof(*params));
- params->partition_id = HV_PARTITION_ID_SELF;
+
+ if (hv_pcidev_is_attached_dev(pdev))
+ params->partition_id = hv_iommu_get_curr_partid();
+ else
+ params->partition_id = HV_PARTITION_ID_SELF;
+
params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
- params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
+ params->int_entry.msi_entry.address.as_uint32 =
+ int_desc->address & 0xffffffff;
params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
- params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
- (hbus->hdev->dev_instance.b[4] << 16) |
- (hbus->hdev->dev_instance.b[7] << 8) |
- (hbus->hdev->dev_instance.b[6] & 0xf8) |
- PCI_FUNC(pdev->devfn);
+ params->device_id = hv_pci_vmbus_device_id(pdev);
params->int_target.vector = hv_msi_get_int_vector(data);
if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
@@ -1263,6 +1265,15 @@ static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
mb();
}
spin_unlock_irqrestore(&hbus->config_lock, flags);
+ /*
+ * Make sure PCI_INTERRUPT_PIN is hard-wired to 0 since it may
+ * be read using a 32bit read which is skipped by the above
+ * emulation.
+ */
+ if (PCI_INTERRUPT_PIN >= where &&
+ PCI_INTERRUPT_PIN <= (where + size)) {
+ *((char *)val + PCI_INTERRUPT_PIN - where) = 0;
+ }
} else {
dev_err(dev, "Attempt to read beyond a function's config space.\n");
}
@@ -1731,14 +1742,22 @@ static void hv_msi_free(struct irq_domain *domain, unsigned int irq)
if (!int_desc)
return;
- irq_data->chip_data = NULL;
hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
if (!hpdev) {
+ irq_data->chip_data = NULL;
kfree(int_desc);
return;
}
- hv_int_desc_free(hpdev, int_desc);
+ if (hv_pcidev_is_attached_dev(pdev)) {
+ hv_unmap_msi_interrupt(pdev, irq_data->chip_data);
+ kfree(irq_data->chip_data);
+ irq_data->chip_data = NULL;
+ } else {
+ irq_data->chip_data = NULL;
+ hv_int_desc_free(hpdev, int_desc);
+ }
+
put_pcichild(hpdev);
}
@@ -2139,6 +2158,56 @@ static void hv_vmbus_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
msg->data = 0;
}
+/* Compose an msi message for a directly attached device */
+static void hv_dda_compose_msi_msg(struct irq_data *irq_data,
+ struct msi_desc *msi_desc,
+ struct msi_msg *msg)
+{
+ bool multi_msi;
+ struct hv_pcibus_device *hbus;
+ struct hv_pci_dev *hpdev;
+ struct pci_dev *pdev = msi_desc_to_pci_dev(msi_desc);
+
+ multi_msi = !msi_desc->pci.msi_attrib.is_msix &&
+ msi_desc->nvec_used > 1;
+
+ if (multi_msi) {
+ dev_err(&hbus->hdev->device,
+ "Passthru direct attach does not support multi msi\n");
+ goto outerr;
+ }
+
+ hbus = container_of(pdev->bus->sysdata, struct hv_pcibus_device,
+ sysdata);
+
+ hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
+ if (!hpdev)
+ goto outerr;
+
+ /* will unmap if needed and also update irq_data->chip_data */
+ hv_irq_compose_msi_msg(irq_data, msg);
+
+ put_pcichild(hpdev);
+ return;
+
+outerr:
+ memset(msg, 0, sizeof(*msg));
+}
+
+static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct pci_dev *pdev;
+ struct msi_desc *msi_desc;
+
+ msi_desc = irq_data_get_msi_desc(data);
+ pdev = msi_desc_to_pci_dev(msi_desc);
+
+ if (hv_pcidev_is_attached_dev(pdev))
+ hv_dda_compose_msi_msg(data, msi_desc, msg);
+ else
+ hv_vmbus_compose_msi_msg(data, msg);
+}
+
static bool hv_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
struct irq_domain *real_parent, struct msi_domain_info *info)
{
@@ -2177,7 +2246,7 @@ static const struct msi_parent_ops hv_pcie_msi_parent_ops = {
/* HW Interrupt Chip Descriptor */
static struct irq_chip hv_msi_irq_chip = {
.name = "Hyper-V PCIe MSI",
- .irq_compose_msi_msg = hv_vmbus_compose_msi_msg,
+ .irq_compose_msi_msg = hv_compose_msi_msg,
.irq_set_affinity = irq_chip_set_affinity_parent,
.irq_ack = irq_chip_ack_parent,
.irq_eoi = irq_chip_eoi_parent,
@@ -4096,7 +4165,7 @@ static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
irq_data = irq_get_irq_data(entry->irq);
if (WARN_ON_ONCE(!irq_data))
return -EINVAL;
- hv_vmbus_compose_msi_msg(irq_data, &entry->msg);
+ hv_compose_msi_msg(irq_data, &entry->msg);
}
return 0;
}
--
2.51.2.vfs.0.1
Powered by blists - more mailing lists