Message-ID: <176953894915.725.1102545144304639827.stgit@linux.ibm.com>
Date: Tue, 27 Jan 2026 18:35:56 +0000
From: Shivaprasad G Bhat <sbhat@...ux.ibm.com>
To: linux-kernel@...r.kernel.org, linuxppc-dev@...ts.ozlabs.org,
        kvm@...r.kernel.org, iommu@...ts.linux.dev
Cc: chleroy@...nel.org, mpe@...erman.id.au, maddy@...ux.ibm.com,
        npiggin@...il.com, alex@...zbot.org, sbhat@...ux.ibm.com,
        joerg.roedel@....com, kevin.tian@...el.com, gbatra@...ux.ibm.com,
        jgg@...dia.com, clg@...d.org, vaibhav@...ux.ibm.com,
        brking@...ux.vnet.ibm.com, nnmlinux@...ux.ibm.com,
        amachhiw@...ux.ibm.com, tpearson@...torengineering.com
Subject: [RFC PATCH] powerpc: iommu: Initial IOMMUFD support for PPC64

This RFC implements initial IOMMUFD support on PPC64 by adding new
iommu_ops for a paging domain. The existing platform domain continues
to be the default domain for in-kernel use. The domain ownership
transfer resets the IOMMU state when switching between the new paging
domain and in-kernel usage.

On PPC64, the IOVA ranges are determined by the type of DMA window
and its properties. There is currently no way to expose the attributes
of the non-default 64-bit DMA window that the platform supports. The
platform allows the operating system to select the starting offset
(at 4GiB or the default 512PiB offset), the page size and the window
size for the non-default 64-bit DMA window. With VFIO, for example,
this is handled via the VFIO_IOMMU_SPAPR_TCE_GET_INFO and
VFIO_IOMMU_SPAPR_TCE_CREATE|REMOVE ioctls. I am still exploring ways
to expose and configure these DMA window attributes based on user
input; any suggestions in this regard would be very helpful.
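
For reference, this is roughly what the existing VFIO path looks like
from userspace (a simplified sketch; it assumes a container using the
SPAPR TCE v2 IOMMU backend with DDW support, the helper name and the
chosen window parameters are just illustrative, and error handling is
omitted):

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* container_fd: a VFIO container fd set to the VFIO_SPAPR_TCE_v2_IOMMU type */
static int spapr_query_and_create_window(int container_fd)
{
	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
	struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };

	/* Reports the default 32-bit window plus the DDW capabilities
	 * (supported page sizes, max dynamic windows, TCE table levels). */
	if (ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info))
		return -1;

	/* Request an additional 64-bit window with user-chosen attributes;
	 * the kernel picks the starting offset and returns it. */
	create.page_shift = 16;			/* 64K IOMMU pages */
	create.window_size = 1ULL << 30;	/* 1GB */
	create.levels = 1;
	if (ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create))
		return -1;

	/* create.start_addr now holds the new window's IOVA offset */
	return 0;
}

An iommufd equivalent would presumably need both a query side (along
the lines of IOMMU_GET_HW_INFO) and a way for the user to request a
window with a chosen page shift and size, which is exactly the open
question above.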

Even with this patch, the existing iommufd vfio-compat driver, which
implements the VFIO type1 ioctls, will not work for PPC64. I believe a
separate "vfio-spapr-compat" driver is needed to make it work.

A brief list of the current open problems and ongoing reworks:
 - Second DMA window support, as mentioned above.
 - KVM support.
 - EEH support.
 - The vfio compat driver for the spapr TCE IOMMU.
 - Support for multiple devices (multifunction, same/different IOMMU
   group checks, SR-IOV VF assignment).
 - Race conditions, device plug/unplug.
 - Selftests.

The patch currently works for a single device and exposes only the
default 1GB DMA window to the user. It has been tested with both the
PowerNV and pSeries machine TCE IOMMU backends. Testing was done with
QEMU[1] and a TCG guest that has an NVMe device passed through. A
command like the one below can be used to try it:

qemu-system-ppc64 -machine pseries -accel tcg \
-device spapr-pci-host-bridge,index=1,id=pci.1,ddw=off \
-device vfio-pci,host=<hostdev>,id=hostdev0,\
bus=pci.1.0,addr=0x1,iommufd=iommufd0 \
-object iommufd,id=iommufd0 <...>
...
root:localhost# mount /dev/nvme0n1 /mnt
root:localhost# ls /mnt
...
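
For completeness, the iommufd flow that QEMU drives underneath looks
roughly like the sketch below (simplified, error handling omitted; the
fds and the helper name are placeholders, and IOAS reserved-range
handling is skipped). The IOMMU_IOAS_MAP call is what ends up in the
map_pages() callback added by this patch:

#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>
#include <linux/vfio.h>

/* iommufd: open("/dev/iommu"); devfd: open("/dev/vfio/devices/vfioX") */
static int iommufd_attach_and_map(int iommufd, int devfd,
				  void *buf, size_t len, __u64 iova)
{
	struct vfio_device_bind_iommufd bind = {
		.argsz = sizeof(bind), .iommufd = iommufd,
	};
	struct iommu_ioas_alloc alloc = { .size = sizeof(alloc) };
	struct vfio_device_attach_iommufd_pt attach = { .argsz = sizeof(attach) };
	struct iommu_ioas_map map = {
		.size = sizeof(map),
		.flags = IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_READABLE |
			 IOMMU_IOAS_MAP_WRITEABLE,
		.user_va = (uintptr_t)buf,
		.length = len,
		.iova = iova,
	};

	/* Bind the VFIO cdev to the iommufd context */
	if (ioctl(devfd, VFIO_DEVICE_BIND_IOMMUFD, &bind))
		return -1;

	/* Allocate an IOAS and attach the device; this is where the new
	 * domain_alloc_paging()/attach_dev() ops get exercised. */
	if (ioctl(iommufd, IOMMU_IOAS_ALLOC, &alloc))
		return -1;
	attach.pt_id = alloc.out_ioas_id;
	if (ioctl(devfd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach))
		return -1;

	/* Map user memory at a fixed IOVA inside the default window */
	map.ioas_id = alloc.out_ioas_id;
	return ioctl(iommufd, IOMMU_IOAS_MAP, &map);
}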

The current patch is based on the Linux kernel 6.19-rc6 tree.

Signed-off-by: Shivaprasad G Bhat <sbhat@...ux.ibm.com>

References:
[1] https://github.com/shivaprasadbhat/qemu/tree/iommufd-wip
---
 arch/powerpc/include/asm/iommu.h              |    2 
 arch/powerpc/kernel/iommu.c                   |  181 +++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci-ioda-tce.c |    4 -
 arch/powerpc/platforms/powernv/pci-ioda.c     |    4 -
 arch/powerpc/platforms/powernv/pci.h          |    2 
 arch/powerpc/platforms/pseries/iommu.c        |    6 -
 drivers/vfio/Kconfig                          |    4 -
 7 files changed, 190 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index eafdd63cd6c4..1dc72fbb89e7 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -46,7 +46,7 @@ struct iommu_table_ops {
 			long index, long npages,
 			unsigned long uaddr,
 			enum dma_data_direction direction,
-			unsigned long attrs);
+			unsigned long attrs, bool is_phys);
 #ifdef CONFIG_IOMMU_API
 	/*
 	 * Exchanges existing TCE with new TCE plus direction bits;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0ce71310b7d9..e6543480c461 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -365,7 +365,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	/* Put the TCEs in the HW table */
 	build_fail = tbl->it_ops->set(tbl, entry, npages,
 				      (unsigned long)page &
-				      IOMMU_PAGE_MASK(tbl), direction, attrs);
+				      IOMMU_PAGE_MASK(tbl), direction, attrs, false);
 
 	/* tbl->it_ops->set() only returns non-zero for transient errors.
 	 * Clean up the table bitmap in this case and return
@@ -539,7 +539,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		/* Insert into HW table */
 		build_fail = tbl->it_ops->set(tbl, entry, npages,
 					      vaddr & IOMMU_PAGE_MASK(tbl),
-					      direction, attrs);
+					      direction, attrs, false);
 		if(unlikely(build_fail))
 			goto failure;
 
@@ -1201,7 +1201,15 @@ spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain,
 	 * also sets the dma_api ops
 	 */
 	table_group = iommu_group_get_iommudata(grp);
+
+	if (old && old->type == IOMMU_DOMAIN_DMA) {
+		ret = table_group->ops->unset_window(table_group, 0);
+		if (ret)
+			goto exit;
+	}
+
 	ret = table_group->ops->take_ownership(table_group, dev);
+exit:
 	iommu_group_put(grp);
 
 	return ret;
@@ -1260,6 +1268,167 @@ static struct iommu_group *spapr_tce_iommu_device_group(struct device *dev)
 	return hose->controller_ops.device_group(hose, pdev);
 }
 
+struct ppc64_domain {
+	struct iommu_domain  domain;
+	struct device        *device; /* Make it a list */
+	struct iommu_table   *table;
+	spinlock_t           list_lock;
+	struct rcu_head      rcu;
+};
+
+static struct ppc64_domain *to_ppc64_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct ppc64_domain, domain);
+}
+
+static void spapr_tce_domain_free(struct iommu_domain *domain)
+{
+	struct ppc64_domain *ppc64_domain = to_ppc64_domain(domain);
+
+	kfree(ppc64_domain);
+}
+
+static const struct iommu_ops spapr_tce_iommu_ops;
+static struct iommu_domain *spapr_tce_domain_alloc_paging(struct device *dev)
+{
+	struct iommu_group *grp = iommu_group_get(dev);
+	struct iommu_table_group *table_group;
+	struct ppc64_domain *ppc64_domain;
+	struct iommu_table *ptbl;
+	int ret = -1;
+
+	table_group = iommu_group_get_iommudata(grp);
+	ppc64_domain = kzalloc(sizeof(*ppc64_domain), GFP_KERNEL);
+	if (!ppc64_domain) {
+		iommu_group_put(grp);
+		return NULL;
+	}
+	/* Hardcode the default window for now: 4K pages, 1GB window, 1 level */
+	ret = table_group->ops->create_table(table_group, 0, 0xc, 0x40000000, 1, &ptbl);
+	if (ret) {
+		kfree(ppc64_domain);
+		iommu_group_put(grp);
+		return NULL;
+	}
+	iommu_tce_table_get(ptbl);
+	ppc64_domain->table = ptbl; /* REVISIT: Single device for now */
+
+	table_group->ops->set_window(table_group, 0, ptbl);
+	iommu_group_put(grp);
+
+	ppc64_domain->domain.pgsize_bitmap = SZ_4K;
+	ppc64_domain->domain.geometry.force_aperture = true;
+	ppc64_domain->domain.geometry.aperture_start = 0;
+	ppc64_domain->domain.geometry.aperture_end = 0x40000000 - 1; /* default window */
+	ppc64_domain->domain.ops = spapr_tce_iommu_ops.default_domain_ops;
+
+	spin_lock_init(&ppc64_domain->list_lock);
+
+	return &ppc64_domain->domain;
+}
+
+static size_t spapr_tce_iommu_unmap_pages(struct iommu_domain *domain,
+				unsigned long iova,
+				size_t pgsize, size_t pgcount,
+				struct iommu_iotlb_gather *gather)
+{
+	struct ppc64_domain *ppc64_domain = to_ppc64_domain(domain);
+	struct iommu_table *tbl = ppc64_domain->table;
+	unsigned long pgshift = __ffs(pgsize);
+	size_t size = pgcount << pgshift;
+	size_t mapped = 0;
+	unsigned int tcenum;
+	int  mask;
+
+	if (pgsize != SZ_4K)
+		return 0;	/* unmap_pages returns bytes unmapped, not an errno */
+
+	size = PAGE_ALIGN(size);
+
+	mask = IOMMU_PAGE_MASK(tbl);
+	tcenum = iova >> tbl->it_page_shift;
+
+	tbl->it_ops->clear(tbl, tcenum, pgcount);
+
+	mapped = pgsize * pgcount;
+
+	return mapped;
+}
+
+static phys_addr_t spapr_tce_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
+{
+	struct ppc64_domain *ppc64_domain = to_ppc64_domain(domain);
+	struct iommu_table *tbl = ppc64_domain->table;
+	phys_addr_t paddr, rpn, tceval;
+	unsigned int tcenum;
+
+	tcenum = iova >> tbl->it_page_shift;
+	tceval = tbl->it_ops->get(tbl, tcenum);
+
+	/* Ignore the direction bits */
+	rpn = tceval >> tbl->it_page_shift;
+	paddr = rpn << tbl->it_page_shift;
+
+	return paddr;
+}
+
+static int spapr_tce_iommu_map_pages(struct iommu_domain *domain,
+				unsigned long iova, phys_addr_t paddr,
+				size_t pgsize, size_t pgcount,
+				int prot, gfp_t gfp, size_t *mapped)
+{
+	struct ppc64_domain *ppc64_domain = to_ppc64_domain(domain);
+	enum dma_data_direction direction = DMA_BIDIRECTIONAL;
+	struct iommu_table *tbl = ppc64_domain->table;
+	unsigned long pgshift = __ffs(pgsize);
+	size_t size = pgcount << pgshift;
+	unsigned int tcenum;
+	int ret;
+
+	if (pgsize != SZ_4K)
+		return -EINVAL;
+
+	if (iova < ppc64_domain->domain.geometry.aperture_start ||
+	    (iova + size - 1) > ppc64_domain->domain.geometry.aperture_end)
+		return -EINVAL;
+
+	if (!IS_ALIGNED(iova | paddr, pgsize))
+		return -EINVAL;
+
+	if (!(prot & IOMMU_WRITE))
+		direction = DMA_FROM_DEVICE;
+
+	if (!(prot & IOMMU_READ))
+		direction = DMA_TO_DEVICE;
+
+	size = PAGE_ALIGN(size);
+	tcenum = iova >> tbl->it_page_shift;
+
+	/* Put the TCEs in the HW table */
+	ret = tbl->it_ops->set(tbl, tcenum, pgcount,
+				paddr, direction, 0, true);
+	if (ret)
+		return -EIO;
+	*mapped = size;
+	return 0;
+}
+
+static int spapr_tce_iommu_attach_device(struct iommu_domain *domain,
+				    struct device *dev, struct iommu_domain *old)
+{
+	struct ppc64_domain *ppc64_domain = to_ppc64_domain(domain);
+
+	/* REVISIT */
+	if (!domain)
+		return 0;
+
+	/* REVISIT: Check table group, list handling */
+	ppc64_domain->device = dev;
+
+	return 0;
+}
+
+
 static const struct iommu_ops spapr_tce_iommu_ops = {
 	.default_domain = &spapr_tce_platform_domain,
 	.blocked_domain = &spapr_tce_blocked_domain,
@@ -1267,6 +1436,14 @@ static const struct iommu_ops spapr_tce_iommu_ops = {
 	.probe_device = spapr_tce_iommu_probe_device,
 	.release_device = spapr_tce_iommu_release_device,
 	.device_group = spapr_tce_iommu_device_group,
+	.domain_alloc_paging = spapr_tce_domain_alloc_paging,
+	.default_domain_ops = &(const struct iommu_domain_ops) {
+		.attach_dev     = spapr_tce_iommu_attach_device,
+		.map_pages      = spapr_tce_iommu_map_pages,
+		.unmap_pages    = spapr_tce_iommu_unmap_pages,
+		.iova_to_phys   = spapr_tce_iommu_iova_to_phys,
+		.free           = spapr_tce_domain_free,
+	}
 };
 
 static struct attribute *spapr_tce_iommu_attrs[] = {
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e96324502db0..8800bf86d17a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -123,10 +123,10 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
 
 int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction,
-		unsigned long attrs)
+		unsigned long attrs, bool is_phys)
 {
 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
-	u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
+	u64 rpn = !is_phys ? __pa(uaddr) >> tbl->it_page_shift : uaddr >> tbl->it_page_shift;
 	long i;
 
 	if (proto_tce & TCE_PCI_WRITE)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b0c1d9d16fb5..610146a63e3b 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1241,10 +1241,10 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
 		long npages, unsigned long uaddr,
 		enum dma_data_direction direction,
-		unsigned long attrs)
+		unsigned long attrs, bool is_phys)
 {
 	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
-			attrs);
+			attrs, is_phys);
 
 	if (!ret)
 		pnv_pci_ioda2_tce_invalidate(tbl, index, npages);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 42075501663b..3579ecd55d00 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -300,7 +300,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 
 extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction,
-		unsigned long attrs);
+		unsigned long attrs, bool is_phys);
 extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
 extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
 		unsigned long *hpa, enum dma_data_direction *direction);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index eec333dd2e59..8c6f9f18e462 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -122,7 +122,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
 static int tce_build_pSeries(struct iommu_table *tbl, long index,
 			      long npages, unsigned long uaddr,
 			      enum dma_data_direction direction,
-			      unsigned long attrs)
+			      unsigned long attrs, bool is_phys)
 {
 	u64 proto_tce;
 	__be64 *tcep;
@@ -250,7 +250,7 @@ static DEFINE_PER_CPU(__be64 *, tce_page);
 static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 				     long npages, unsigned long uaddr,
 				     enum dma_data_direction direction,
-				     unsigned long attrs)
+				     unsigned long attrs, bool is_phys)
 {
 	u64 rc = 0;
 	u64 proto_tce;
@@ -287,7 +287,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		__this_cpu_write(tce_page, tcep);
 	}
 
-	rpn = __pa(uaddr) >> tceshift;
+	rpn = !is_phys ? __pa(uaddr) >> tceshift : uaddr >> tceshift;
 	proto_tce = TCE_PCI_READ;
 	if (direction != DMA_TO_DEVICE)
 		proto_tce |= TCE_PCI_WRITE;
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index ceae52fd7586..9929aa78a5da 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -4,7 +4,7 @@ menuconfig VFIO
 	select IOMMU_API
 	depends on IOMMUFD || !IOMMUFD
 	select INTERVAL_TREE
-	select VFIO_GROUP if SPAPR_TCE_IOMMU || IOMMUFD=n
+	select VFIO_GROUP if IOMMUFD=n
 	select VFIO_DEVICE_CDEV if !VFIO_GROUP
 	select VFIO_CONTAINER if IOMMUFD=n
 	help
@@ -16,7 +16,7 @@ menuconfig VFIO
 if VFIO
 config VFIO_DEVICE_CDEV
 	bool "Support for the VFIO cdev /dev/vfio/devices/vfioX"
-	depends on IOMMUFD && !SPAPR_TCE_IOMMU
+	depends on IOMMUFD
 	default !VFIO_GROUP
 	help
 	  The VFIO device cdev is another way for userspace to get device


