[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aXAiCw9Dk7GDfagy@skinsburskii.localdomain>
Date: Tue, 20 Jan 2026 16:47:07 -0800
From: Stanislav Kinsburskii <skinsburskii@...ux.microsoft.com>
To: Mukesh R <mrathor@...ux.microsoft.com>
Cc: linux-kernel@...r.kernel.org, linux-hyperv@...r.kernel.org,
linux-arm-kernel@...ts.infradead.org, iommu@...ts.linux.dev,
linux-pci@...r.kernel.org, linux-arch@...r.kernel.org,
kys@...rosoft.com, haiyangz@...rosoft.com, wei.liu@...nel.org,
decui@...rosoft.com, longli@...rosoft.com, catalin.marinas@....com,
will@...nel.org, tglx@...utronix.de, mingo@...hat.com, bp@...en8.de,
dave.hansen@...ux.intel.com, hpa@...or.com, joro@...tes.org,
lpieralisi@...nel.org, kwilczynski@...nel.org, mani@...nel.org,
robh@...nel.org, bhelgaas@...gle.com, arnd@...db.de,
nunodasneves@...ux.microsoft.com, mhklinux@...look.com,
romank@...ux.microsoft.com
Subject: Re: [PATCH v0 13/15] x86/hyperv: Basic interrupt support for direct
attached devices
On Mon, Jan 19, 2026 at 10:42:28PM -0800, Mukesh R wrote:
> From: Mukesh Rathor <mrathor@...ux.microsoft.com>
>
> As mentioned previously, a direct attached device must be referenced
> via logical device id which is formed in the initial attach hypercall.
> Interrupt mapping paths for direct attached devices are almost same,
> except we must use logical device ids instead of the PCI device ids.
>
> L1VH only supports direct attaches for passing thru devices to its guests,
> and devices on L1VH are VMBus based. However, the interrupts are mapped
> via the map interrupt hypercall and not the traditional method of VMBus
> messages.
>
> Partition id for the relevant hypercalls is tricky. This because a device
> could be moving from root to guest and then back to the root. In case
> of L1VH, it could be moving from system host to L1VH root to a guest,
> then back to the L1VH root. So, it is carefully crafted by keeping
> track of whether the call is on behalf of a VMM process, whether the
> device is attached device (as opposed to mapped), and whether we are in
> an L1VH root/parent. If VMM process, we assume it is on behalf of a
> guest. Otherwise, the device is being attached or detached during boot
> or shutdown of the privileged partition.
>
> Lastly, a dummy cpu and vector is used to map interrupt for a direct
> attached device. This because, once a device is marked for direct attach,
> hypervisor will not let any interrupts be mapped to host. So it is mapped
> to guest dummy cpu and dummy vector. This is then correctly mapped during
> guest boot via the retarget paths.
>
> Signed-off-by: Mukesh Rathor <mrathor@...ux.microsoft.com>
> ---
> arch/arm64/include/asm/mshyperv.h | 15 +++++
> arch/x86/hyperv/irqdomain.c | 57 +++++++++++++-----
> arch/x86/include/asm/mshyperv.h | 4 ++
> drivers/pci/controller/pci-hyperv.c | 91 +++++++++++++++++++++++++----
> 4 files changed, 142 insertions(+), 25 deletions(-)
>
> diff --git a/arch/arm64/include/asm/mshyperv.h b/arch/arm64/include/asm/mshyperv.h
> index b721d3134ab6..27da480f94f6 100644
> --- a/arch/arm64/include/asm/mshyperv.h
> +++ b/arch/arm64/include/asm/mshyperv.h
> @@ -53,6 +53,21 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg)
> return hv_get_msr(reg);
> }
>
> +struct irq_data;
> +struct msi_msg;
> +struct pci_dev;
> +static inline void hv_irq_compose_msi_msg(struct irq_data *data,
> + struct msi_msg *msg) {};
> +static inline int hv_unmap_msi_interrupt(struct pci_dev *pdev,
> + struct hv_interrupt_entry *hvirqe)
> +{
> + return -EOPNOTSUPP;
> +}
> +static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
> +{
> + return false;
> +}
> +
> /* SMCCC hypercall parameters */
> #define HV_SMCCC_FUNC_NUMBER 1
> #define HV_FUNC_ID ARM_SMCCC_CALL_VAL( \
> diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
> index 33017aa0caa4..e6eb457f791e 100644
> --- a/arch/x86/hyperv/irqdomain.c
> +++ b/arch/x86/hyperv/irqdomain.c
> @@ -13,6 +13,16 @@
> #include <linux/irqchip/irq-msi-lib.h>
> #include <asm/mshyperv.h>
>
> +/*
> + * For direct attached devices (which use logical device ids), hypervisor will
> + * not allow mappings to host. But VFIO needs to bind the interrupt at the very
> + * start before the guest cpu/vector is known. So we use dummy cpu and vector
> + * to bind in such case, and later when the guest starts, retarget will move it
> + * to correct guest cpu and vector.
> + */
> +#define HV_DDA_DUMMY_CPU 0
> +#define HV_DDA_DUMMY_VECTOR 32
> +
> static u64 hv_map_interrupt_hcall(u64 ptid, union hv_device_id hv_devid,
> bool level, int cpu, int vector,
> struct hv_interrupt_entry *ret_entry)
> @@ -24,6 +34,11 @@ static u64 hv_map_interrupt_hcall(u64 ptid, union hv_device_id hv_devid,
> u64 status;
> int nr_bank, var_size;
>
> + if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL) {
> + cpu = HV_DDA_DUMMY_CPU;
> + vector = HV_DDA_DUMMY_VECTOR;
> + }
> +
> local_irq_save(flags);
>
> input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> @@ -95,7 +110,8 @@ static int hv_map_interrupt(u64 ptid, union hv_device_id device_id, bool level,
> return hv_result_to_errno(status);
> }
>
> -static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *irq_entry)
> +static int hv_unmap_interrupt(union hv_device_id hv_devid,
> + struct hv_interrupt_entry *irq_entry)
> {
> unsigned long flags;
> struct hv_input_unmap_device_interrupt *input;
> @@ -103,10 +119,14 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *irq_entry)
>
> local_irq_save(flags);
> input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> -
> memset(input, 0, sizeof(*input));
> - input->partition_id = hv_current_partition_id;
> - input->device_id = id;
> +
> + if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL)
> + input->partition_id = hv_iommu_get_curr_partid();
> + else
> + input->partition_id = hv_current_partition_id;
> +
> + input->device_id = hv_devid.as_uint64;
> input->interrupt_entry = *irq_entry;
>
> status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
> @@ -263,6 +283,7 @@ static u64 hv_build_irq_devid(struct pci_dev *pdev)
> int hv_map_msi_interrupt(struct irq_data *data,
> struct hv_interrupt_entry *out_entry)
> {
> + u64 ptid;
> struct irq_cfg *cfg = irqd_cfg(data);
> struct hv_interrupt_entry dummy;
> union hv_device_id hv_devid;
> @@ -275,8 +296,17 @@ int hv_map_msi_interrupt(struct irq_data *data,
> hv_devid.as_uint64 = hv_build_irq_devid(pdev);
> cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
>
> - return hv_map_interrupt(hv_current_partition_id, hv_devid, false, cpu,
> - cfg->vector, out_entry ? out_entry : &dummy);
> + if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL)
> + if (hv_pcidev_is_attached_dev(pdev))
> + ptid = hv_iommu_get_curr_partid();
> + else
> + /* Device actually on l1vh root, not passthru'd to vm */
l1vh and root are mutually exclusive partitions.
If you wanted to highlight that it's l1vh itself and not its child guest, then
"l1vh parent" term would do.
> + ptid = hv_current_partition_id;
> + else
> + ptid = hv_current_partition_id;
Looks like the only special case is for attached logical devices,
otherwise hv_current_partition_id is used.
Can the logic simplified here?
Thanks,
Stanislav
> +
> + return hv_map_interrupt(ptid, hv_devid, false, cpu, cfg->vector,
> + out_entry ? out_entry : &dummy);
> }
> EXPORT_SYMBOL_GPL(hv_map_msi_interrupt);
>
> @@ -289,10 +319,7 @@ static void entry_to_msi_msg(struct hv_interrupt_entry *entry,
> msg->data = entry->msi_entry.data.as_uint32;
> }
>
> -static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
> - struct hv_interrupt_entry *irq_entry);
> -
> -static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> +void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> {
> struct hv_interrupt_entry *stored_entry;
> struct irq_cfg *cfg = irqd_cfg(data);
> @@ -341,16 +368,18 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> data->chip_data = stored_entry;
> entry_to_msi_msg(data->chip_data, msg);
> }
> +EXPORT_SYMBOL_GPL(hv_irq_compose_msi_msg);
>
> -static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
> - struct hv_interrupt_entry *irq_entry)
> +int hv_unmap_msi_interrupt(struct pci_dev *pdev,
> + struct hv_interrupt_entry *irq_entry)
> {
> union hv_device_id hv_devid;
>
> hv_devid.as_uint64 = hv_build_irq_devid(pdev);
>
> - return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
> + return hv_unmap_interrupt(hv_devid, irq_entry);
> }
> +EXPORT_SYMBOL_GPL(hv_unmap_msi_interrupt);
>
> /* NB: during map, hv_interrupt_entry is saved via data->chip_data */
> static void hv_teardown_msi_irq(struct pci_dev *pdev, struct irq_data *irqd)
> @@ -486,7 +515,7 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
> hv_devid.device_type = HV_DEVICE_TYPE_IOAPIC;
> hv_devid.ioapic.ioapic_id = (u8)ioapic_id;
>
> - return hv_unmap_interrupt(hv_devid.as_uint64, entry);
> + return hv_unmap_interrupt(hv_devid, entry);
> }
> EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);
>
> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> index e4ccdbbf1d12..b6facd3a0f5e 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -204,11 +204,15 @@ static inline u64 hv_iommu_get_curr_partid(void)
> #endif /* CONFIG_HYPERV_IOMMU */
>
> u64 hv_pci_vmbus_device_id(struct pci_dev *pdev);
> +void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg);
> +extern bool hv_no_attdev;
>
> struct irq_domain *hv_create_pci_msi_domain(void);
>
> int hv_map_msi_interrupt(struct irq_data *data,
> struct hv_interrupt_entry *out_entry);
> +int hv_unmap_msi_interrupt(struct pci_dev *dev,
> + struct hv_interrupt_entry *hvirqe);
> int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
> struct hv_interrupt_entry *entry);
> int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index 40f0b06bb966..71d1599dc4a8 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -660,15 +660,17 @@ static void hv_irq_retarget_interrupt(struct irq_data *data)
>
> params = *this_cpu_ptr(hyperv_pcpu_input_arg);
> memset(params, 0, sizeof(*params));
> - params->partition_id = HV_PARTITION_ID_SELF;
> +
> + if (hv_pcidev_is_attached_dev(pdev))
> + params->partition_id = hv_iommu_get_curr_partid();
> + else
> + params->partition_id = HV_PARTITION_ID_SELF;
> +
> params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
> - params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
> + params->int_entry.msi_entry.address.as_uint32 =
> + int_desc->address & 0xffffffff;
> params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
> - params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
> - (hbus->hdev->dev_instance.b[4] << 16) |
> - (hbus->hdev->dev_instance.b[7] << 8) |
> - (hbus->hdev->dev_instance.b[6] & 0xf8) |
> - PCI_FUNC(pdev->devfn);
> + params->device_id = hv_pci_vmbus_device_id(pdev);
> params->int_target.vector = hv_msi_get_int_vector(data);
>
> if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
> @@ -1263,6 +1265,15 @@ static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
> mb();
> }
> spin_unlock_irqrestore(&hbus->config_lock, flags);
> + /*
> + * Make sure PCI_INTERRUPT_PIN is hard-wired to 0 since it may
> + * be read using a 32bit read which is skipped by the above
> + * emulation.
> + */
> + if (PCI_INTERRUPT_PIN >= where &&
> + PCI_INTERRUPT_PIN <= (where + size)) {
> + *((char *)val + PCI_INTERRUPT_PIN - where) = 0;
> + }
> } else {
> dev_err(dev, "Attempt to read beyond a function's config space.\n");
> }
> @@ -1731,14 +1742,22 @@ static void hv_msi_free(struct irq_domain *domain, unsigned int irq)
> if (!int_desc)
> return;
>
> - irq_data->chip_data = NULL;
> hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
> if (!hpdev) {
> + irq_data->chip_data = NULL;
> kfree(int_desc);
> return;
> }
>
> - hv_int_desc_free(hpdev, int_desc);
> + if (hv_pcidev_is_attached_dev(pdev)) {
> + hv_unmap_msi_interrupt(pdev, irq_data->chip_data);
> + kfree(irq_data->chip_data);
> + irq_data->chip_data = NULL;
> + } else {
> + irq_data->chip_data = NULL;
> + hv_int_desc_free(hpdev, int_desc);
> + }
> +
> put_pcichild(hpdev);
> }
>
> @@ -2139,6 +2158,56 @@ static void hv_vmbus_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> msg->data = 0;
> }
>
> +/* Compose an msi message for a directly attached device */
> +static void hv_dda_compose_msi_msg(struct irq_data *irq_data,
> + struct msi_desc *msi_desc,
> + struct msi_msg *msg)
> +{
> + bool multi_msi;
> + struct hv_pcibus_device *hbus;
> + struct hv_pci_dev *hpdev;
> + struct pci_dev *pdev = msi_desc_to_pci_dev(msi_desc);
> +
> + multi_msi = !msi_desc->pci.msi_attrib.is_msix &&
> + msi_desc->nvec_used > 1;
> +
> + if (multi_msi) {
> + dev_err(&hbus->hdev->device,
> + "Passthru direct attach does not support multi msi\n");
> + goto outerr;
> + }
> +
> + hbus = container_of(pdev->bus->sysdata, struct hv_pcibus_device,
> + sysdata);
> +
> + hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
> + if (!hpdev)
> + goto outerr;
> +
> + /* will unmap if needed and also update irq_data->chip_data */
> + hv_irq_compose_msi_msg(irq_data, msg);
> +
> + put_pcichild(hpdev);
> + return;
> +
> +outerr:
> + memset(msg, 0, sizeof(*msg));
> +}
> +
> +static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> +{
> + struct pci_dev *pdev;
> + struct msi_desc *msi_desc;
> +
> + msi_desc = irq_data_get_msi_desc(data);
> + pdev = msi_desc_to_pci_dev(msi_desc);
> +
> + if (hv_pcidev_is_attached_dev(pdev))
> + hv_dda_compose_msi_msg(data, msi_desc, msg);
> + else
> + hv_vmbus_compose_msi_msg(data, msg);
> +}
> +
> static bool hv_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
> struct irq_domain *real_parent, struct msi_domain_info *info)
> {
> @@ -2177,7 +2246,7 @@ static const struct msi_parent_ops hv_pcie_msi_parent_ops = {
> /* HW Interrupt Chip Descriptor */
> static struct irq_chip hv_msi_irq_chip = {
> .name = "Hyper-V PCIe MSI",
> - .irq_compose_msi_msg = hv_vmbus_compose_msi_msg,
> + .irq_compose_msi_msg = hv_compose_msi_msg,
> .irq_set_affinity = irq_chip_set_affinity_parent,
> .irq_ack = irq_chip_ack_parent,
> .irq_eoi = irq_chip_eoi_parent,
> @@ -4096,7 +4165,7 @@ static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
> irq_data = irq_get_irq_data(entry->irq);
> if (WARN_ON_ONCE(!irq_data))
> return -EINVAL;
> - hv_vmbus_compose_msi_msg(irq_data, &entry->msg);
> + hv_compose_msi_msg(irq_data, &entry->msg);
> }
> return 0;
> }
> --
> 2.51.2.vfs.0.1
>
Powered by blists - more mailing lists