[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID:
<SN6PR02MB4157ADD06608EFE00B86A3F7D451A@SN6PR02MB4157.namprd02.prod.outlook.com>
Date: Thu, 17 Jul 2025 17:25:54 +0000
From: Michael Kelley <mhklinux@...look.com>
To: Dan Williams <dan.j.williams@...el.com>, "bhelgaas@...gle.com"
<bhelgaas@...gle.com>
CC: "linux-pci@...r.kernel.org" <linux-pci@...r.kernel.org>, "lukas@...ner.de"
<lukas@...ner.de>, "linux-kernel@...r.kernel.org"
<linux-kernel@...r.kernel.org>, "Jonathan.Cameron@...wei.com"
<Jonathan.Cameron@...wei.com>, Suzuki K Poulose <suzuki.poulose@....com>,
Lorenzo Pieralisi <lpieralisi@...nel.org>, Manivannan Sadhasivam
<manivannan.sadhasivam@...aro.org>, Rob Herring <robh@...nel.org>, "K. Y.
Srinivasan" <kys@...rosoft.com>, Haiyang Zhang <haiyangz@...rosoft.com>, Wei
Liu <wei.liu@...nel.org>, Dexuan Cui <decui@...rosoft.com>, "open
list:Hyper-V/Azure CORE AND DRIVERS" <linux-hyperv@...r.kernel.org>
Subject: RE: [PATCH 2/3] PCI: Enable host bridge emulation for
PCI_DOMAINS_GENERIC platforms
From: Dan Williams <dan.j.williams@...el.com> Sent: Wednesday, July 16, 2025 9:09 AM
>
> The ability to emulate a host bridge is useful not only for hardware PCI
> controllers like CONFIG_VMD, or virtual PCI controllers like
> CONFIG_PCI_HYPERV, but also for test and development scenarios like
> CONFIG_SAMPLES_DEVSEC [1].
>
> One stumbling block for defining CONFIG_SAMPLES_DEVSEC, a sample
> implementation of a platform TSM for PCI Device Security, is the need to
> accommodate PCI_DOMAINS_GENERIC architectures alongside x86 [2].
>
> In support of supplementing the existing CONFIG_PCI_BRIDGE_EMUL
> infrastructure for host bridges:
>
> * Introduce pci_bus_find_emul_domain_nr() as a common way to find a free
> PCI domain number whether that is to reuse the existing dynamic
> allocation code in the !ACPI case, or to assign an unused domain above
> the last ACPI segment.
>
> * Convert pci-hyperv to the new allocator so that the PCI core can
> unconditionally assume that bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET
> is the dynamically allocated case.
>
> A follow on patch can also convert vmd to the new scheme. Currently vmd
> is limited to CONFIG_PCI_DOMAINS_GENERIC=n (x86) so, unlike pci-hyperv,
> it does not immediately conflict with this new
> pci_bus_find_emul_domain_nr() mechanism.
>
> Link: https://lore.kernel.org/all/174107249038.1288555.12362100502109498455.stgit@dwillia2-xfh.jf.intel.com/ [1]
> Reported-by: Suzuki K Poulose <suzuki.poulose@....com>
> Closes: https://lore.kernel.org/all/20250311144601.145736-3-suzuki.poulose@arm.com/
> Cc: Lorenzo Pieralisi <lpieralisi@...nel.org>
> Cc: Manivannan Sadhasivam <manivannan.sadhasivam@...aro.org>
> Cc: Rob Herring <robh@...nel.org>
> Cc: Bjorn Helgaas <bhelgaas@...gle.com>
> Cc: "K. Y. Srinivasan" <kys@...rosoft.com>
> Cc: Haiyang Zhang <haiyangz@...rosoft.com>
> Cc: Wei Liu <wei.liu@...nel.org>
> Cc: Dexuan Cui <decui@...rosoft.com>
> Tested-by: Suzuki K Poulose <suzuki.poulose@....com>
> Signed-off-by: Dan Williams <dan.j.williams@...el.com>
> ---
> drivers/pci/controller/pci-hyperv.c | 53 ++---------------------------
> drivers/pci/pci.c | 43 ++++++++++++++++++++++-
> drivers/pci/probe.c | 8 ++++-
> include/linux/pci.h | 4 +++
> 4 files changed, 56 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index ef5d655a0052..cfe9806bdbe4 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -3630,48 +3630,6 @@ static int hv_send_resources_released(struct hv_device *hdev)
> return 0;
> }
>
> -#define HVPCI_DOM_MAP_SIZE (64 * 1024)
> -static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
> -
> -/*
> - * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
> - * as invalid for passthrough PCI devices of this driver.
> - */
> -#define HVPCI_DOM_INVALID 0
> -
> -/**
> - * hv_get_dom_num() - Get a valid PCI domain number
> - * Check if the PCI domain number is in use, and return another number if
> - * it is in use.
> - *
> - * @dom: Requested domain number
> - *
> - * return: domain number on success, HVPCI_DOM_INVALID on failure
> - */
> -static u16 hv_get_dom_num(u16 dom)
> -{
> - unsigned int i;
> -
> - if (test_and_set_bit(dom, hvpci_dom_map) == 0)
> - return dom;
> -
> - for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
> - if (test_and_set_bit(i, hvpci_dom_map) == 0)
> - return i;
> - }
> -
> - return HVPCI_DOM_INVALID;
> -}
> -
> -/**
> - * hv_put_dom_num() - Mark the PCI domain number as free
> - * @dom: Domain number to be freed
> - */
> -static void hv_put_dom_num(u16 dom)
> -{
> - clear_bit(dom, hvpci_dom_map);
> -}
> -
> /**
> * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
> * @hdev: VMBus's tracking struct for this root PCI bus
> @@ -3715,9 +3673,9 @@ static int hv_pci_probe(struct hv_device *hdev,
> * collisions) in the same VM.
> */
> dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
> - dom = hv_get_dom_num(dom_req);
> + dom = pci_bus_find_emul_domain_nr(dom_req);
>
> - if (dom == HVPCI_DOM_INVALID) {
> + if (dom < 0) {
> dev_err(&hdev->device,
> "Unable to use dom# 0x%x or other numbers", dom_req);
> ret = -EINVAL;
> @@ -3851,7 +3809,7 @@ static int hv_pci_probe(struct hv_device *hdev,
> destroy_wq:
> destroy_workqueue(hbus->wq);
> free_dom:
> - hv_put_dom_num(hbus->bridge->domain_nr);
> + pci_bus_release_emul_domain_nr(hbus->bridge->domain_nr);
> free_bus:
> kfree(hbus);
> return ret;
> @@ -3976,8 +3934,6 @@ static void hv_pci_remove(struct hv_device *hdev)
> irq_domain_remove(hbus->irq_domain);
> irq_domain_free_fwnode(hbus->fwnode);
>
> - hv_put_dom_num(hbus->bridge->domain_nr);
> -
> kfree(hbus);
> }
>
> @@ -4148,9 +4104,6 @@ static int __init init_hv_pci_drv(void)
> if (ret)
> return ret;
>
> - /* Set the invalid domain number's bit, so it will not be used */
> - set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
> -
> /* Initialize PCI block r/w interface */
> hvpci_block_ops.read_block = hv_read_config_block;
> hvpci_block_ops.write_block = hv_write_config_block;
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index e9448d55113b..833ebf2d5213 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -6692,9 +6692,50 @@ static void pci_no_domains(void)
> #endif
> }
>
> +#ifdef CONFIG_PCI_DOMAINS
> +static DEFINE_IDA(pci_domain_nr_dynamic_ida);
> +
> +/*
> + * Find a free domain_nr either allocated by pci_domain_nr_dynamic_ida or
> + * fallback to the first free domain number above the last ACPI segment number.
> + * Caller may have a specific domain number in mind, in which case try to
> + * reserve it.
> + *
> + * Note that this allocation is freed by pci_release_host_bridge_dev().
> + */
> +int pci_bus_find_emul_domain_nr(int hint)
> +{
> + if (hint >= 0) {
> + hint = ida_alloc_range(&pci_domain_nr_dynamic_ida, hint, hint,
> + GFP_KERNEL);
This almost preserves the existing functionality in pci-hyperv.c. But if the
"hint" passed in is zero, current code in pci-hyperv.c treats that as a
collision and allocates some other value. The special treatment of zero is
necessary per the comment with the definition of HVPCI_DOM_INVALID.
I don't have an opinion on whether the code here should treat a "hint"
of zero as invalid, or whether that should be handled in pci-hyperv.c.
> +
> + if (hint >= 0)
> + return hint;
> + }
> +
> + if (acpi_disabled)
> + return ida_alloc(&pci_domain_nr_dynamic_ida, GFP_KERNEL);
> +
> + /*
> + * Emulated domains start at 0x10000 to not clash with ACPI _SEG
> + * domains. Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of
> + * which the lower 16 bits are the PCI Segment Group (domain) number.
> + * Other bits are currently reserved.
> + */
Back in 2018 and 2019, the Microsoft Linux team encountered problems with
PCI domain IDs that exceeded 0xFFFF. User space code, such as the Xorg X server,
assumed PCI domain IDs were at most 16 bits, and retained only the low 16 bits
if the value was larger. My memory of the details is vague, but I believe some
or all of this behavior was tied to libpciaccess. As a result of these user space
limitations, the pci-hyperv.c code made sure to not create any domain IDs
larger than 0xFFFF. The problem was not just theoretical -- Microsoft had
customers reporting issues due to the "32-bit domain ID problem" and the
pci-hyperv.c code was updated to avoid it.
I don't have information on whether user space code has been fixed, or
the extent to which such a fix has propagated into distro versions. At the
least, a VM with old user space code might break if the kernel is upgraded
to one with this patch. How do people see the risks now that it is 6 years
later? I don't have enough data to make an assessment.
Michael
> + return ida_alloc_range(&pci_domain_nr_dynamic_ida, 0x10000, INT_MAX,
> + GFP_KERNEL);
> +}
> +EXPORT_SYMBOL_GPL(pci_bus_find_emul_domain_nr);
> +
> +void pci_bus_release_emul_domain_nr(int domain_nr)
> +{
> + ida_free(&pci_domain_nr_dynamic_ida, domain_nr);
> +}
> +EXPORT_SYMBOL_GPL(pci_bus_release_emul_domain_nr);
> +#endif
> +
> #ifdef CONFIG_PCI_DOMAINS_GENERIC
> static DEFINE_IDA(pci_domain_nr_static_ida);
> -static DEFINE_IDA(pci_domain_nr_dynamic_ida);
>
> static void of_pci_reserve_static_domain_nr(void)
> {
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 4b8693ec9e4c..e94978c3be3d 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -632,6 +632,11 @@ static void pci_release_host_bridge_dev(struct device *dev)
>
> pci_free_resource_list(&bridge->windows);
> pci_free_resource_list(&bridge->dma_ranges);
> +
> + /* Host bridges only have domain_nr set in the emulation case */
> + if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET)
> + pci_bus_release_emul_domain_nr(bridge->domain_nr);
> +
> kfree(bridge);
> }
>
> @@ -1112,7 +1117,8 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge)
> device_del(&bridge->dev);
> free:
> #ifdef CONFIG_PCI_DOMAINS_GENERIC
> - pci_bus_release_domain_nr(parent, bus->domain_nr);
> + if (bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET)
> + pci_bus_release_domain_nr(parent, bus->domain_nr);
> #endif
> if (bus_registered)
> put_device(&bus->dev);
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 05e68f35f392..f6a713da5c49 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1934,10 +1934,14 @@ DEFINE_GUARD(pci_dev, struct pci_dev *,
> pci_dev_lock(_T), pci_dev_unlock(_T))
> */
> #ifdef CONFIG_PCI_DOMAINS
> extern int pci_domains_supported;
> +int pci_bus_find_emul_domain_nr(int hint);
> +void pci_bus_release_emul_domain_nr(int domain_nr);
> #else
> enum { pci_domains_supported = 0 };
> static inline int pci_domain_nr(struct pci_bus *bus) { return 0; }
> static inline int pci_proc_domain(struct pci_bus *bus) { return 0; }
> +static inline int pci_bus_find_emul_domain_nr(int hint) { return 0; }
> +static inline void pci_bus_release_emul_domain_nr(int domain_nr) { }
> #endif /* CONFIG_PCI_DOMAINS */
>
> /*
> --
> 2.50.1
>
Powered by blists - more mailing lists