[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250920203851.2205115-32-ajones@ventanamicro.com>
Date: Sat, 20 Sep 2025 15:39:02 -0500
From: Andrew Jones <ajones@...tanamicro.com>
To: iommu@...ts.linux.dev,
kvm-riscv@...ts.infradead.org,
kvm@...r.kernel.org,
linux-riscv@...ts.infradead.org,
linux-kernel@...r.kernel.org
Cc: jgg@...dia.com,
zong.li@...ive.com,
tjeznach@...osinc.com,
joro@...tes.org,
will@...nel.org,
robin.murphy@....com,
anup@...infault.org,
atish.patra@...ux.dev,
tglx@...utronix.de,
alex.williamson@...hat.com,
paul.walmsley@...ive.com,
palmer@...belt.com,
alex@...ti.fr
Subject: [RFC PATCH v2 12/18] iommu/riscv: Add guest file irqbypass support
Implement irq_set_vcpu_affinity() in the RISCV IOMMU driver.
irq_set_vcpu_affinity() is the channel through which a hypervisor
informs the IOMMU that an assigned device directing MSIs at guest
IMSIC addresses must have those MSI writes redirected to the
corresponding guest interrupt files.
Signed-off-by: Andrew Jones <ajones@...tanamicro.com>
---
drivers/iommu/riscv/iommu-ir.c | 165 ++++++++++++++++++++++++++++++++-
drivers/iommu/riscv/iommu.c | 5 +-
drivers/iommu/riscv/iommu.h | 4 +
3 files changed, 171 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/riscv/iommu-ir.c b/drivers/iommu/riscv/iommu-ir.c
index 059671f18267..48f424ce1a8d 100644
--- a/drivers/iommu/riscv/iommu-ir.c
+++ b/drivers/iommu/riscv/iommu-ir.c
@@ -10,6 +10,8 @@
#include <linux/msi.h>
#include <linux/sizes.h>
+#include <asm/irq.h>
+
#include "../iommu-pages.h"
#include "iommu.h"
@@ -164,6 +166,48 @@ static void riscv_iommu_ir_msitbl_inval(struct riscv_iommu_domain *domain,
rcu_read_unlock();
}
+static void riscv_iommu_ir_msitbl_clear(struct riscv_iommu_domain *domain)
+{
+ for (size_t i = 0; i < riscv_iommu_ir_nr_msiptes(domain); i++) {
+ riscv_iommu_ir_clear_pte(&domain->msi_root[i]);
+ refcount_set(&domain->msi_pte_counts[i], 0);
+ }
+}
+
+static void riscv_iommu_ir_msiptp_update(struct riscv_iommu_domain *domain)
+{
+ struct riscv_iommu_bond *bond;
+ struct riscv_iommu_device *iommu, *prev;
+ struct riscv_iommu_dc new_dc = {
+ .ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
+ RISCV_IOMMU_PC_TA_V,
+ .fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
+ FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)),
+ .msiptp = virt_to_pfn(domain->msi_root) |
+ FIELD_PREP(RISCV_IOMMU_DC_MSIPTP_MODE,
+ RISCV_IOMMU_DC_MSIPTP_MODE_FLAT),
+ .msi_addr_mask = domain->msi_addr_mask,
+ .msi_addr_pattern = domain->msi_addr_pattern,
+ };
+
+ /* Like riscv_iommu_ir_msitbl_inval(), synchronize with riscv_iommu_bond_link() */
+ smp_mb();
+
+ rcu_read_lock();
+
+ prev = NULL;
+ list_for_each_entry_rcu(bond, &domain->bonds, list) {
+ iommu = dev_to_iommu(bond->dev);
+ if (iommu == prev)
+ continue;
+
+ riscv_iommu_iodir_update(iommu, bond->dev, &new_dc);
+ prev = iommu;
+ }
+
+ rcu_read_unlock();
+}
+
struct riscv_iommu_ir_chip_data {
size_t idx;
u32 config;
@@ -279,12 +323,127 @@ static int riscv_iommu_ir_irq_set_affinity(struct irq_data *data,
return ret;
}
+static bool riscv_iommu_ir_vcpu_check_config(struct riscv_iommu_domain *domain,
+ struct riscv_iommu_ir_vcpu_info *vcpu_info)
+{
+ return domain->msi_addr_mask == vcpu_info->msi_addr_mask &&
+ domain->msi_addr_pattern == vcpu_info->msi_addr_pattern &&
+ domain->group_index_bits == vcpu_info->group_index_bits &&
+ domain->group_index_shift == vcpu_info->group_index_shift;
+}
+
+static int riscv_iommu_ir_vcpu_new_config(struct riscv_iommu_domain *domain,
+ struct irq_data *data,
+ struct riscv_iommu_ir_vcpu_info *vcpu_info)
+{
+ struct riscv_iommu_msipte *pte;
+ size_t idx;
+ int ret;
+
+ if (domain->pgd_mode)
+ riscv_iommu_ir_unmap_imsics(domain);
+
+ riscv_iommu_ir_msitbl_clear(domain);
+
+ domain->msi_addr_mask = vcpu_info->msi_addr_mask;
+ domain->msi_addr_pattern = vcpu_info->msi_addr_pattern;
+ domain->group_index_bits = vcpu_info->group_index_bits;
+ domain->group_index_shift = vcpu_info->group_index_shift;
+ domain->imsic_stride = SZ_4K;
+ domain->msitbl_config += 1;
+
+ if (domain->pgd_mode) {
+ /*
+ * As in riscv_iommu_ir_irq_domain_create(), we do all stage1
+ * mappings up front since the MSI table will manage the
+ * translations.
+ *
+ * XXX: Since irq-set-vcpu-affinity is called in atomic context
+ * we need GFP_ATOMIC. If the number of 4K dma pte allocations
+ * is considered too many for GFP_ATOMIC, then we can wrap
+ * riscv_iommu_pte_alloc()'s iommu_alloc_pages_node_sz() call
+ * in a mempool and try to ensure the pool has enough elements
+ * in riscv_iommu_ir_irq_domain_enable_msis().
+ */
+ ret = riscv_iommu_ir_map_imsics(domain, GFP_ATOMIC);
+ if (ret)
+ return ret;
+ }
+
+ idx = riscv_iommu_ir_compute_msipte_idx(domain, vcpu_info->gpa);
+ pte = &domain->msi_root[idx];
+ riscv_iommu_ir_irq_set_msitbl_info(data, idx, domain->msitbl_config);
+ riscv_iommu_ir_set_pte(pte, vcpu_info->hpa);
+ riscv_iommu_ir_msitbl_inval(domain, NULL);
+ refcount_set(&domain->msi_pte_counts[idx], 1);
+
+ riscv_iommu_ir_msiptp_update(domain);
+
+ return 0;
+}
+
+static int riscv_iommu_ir_irq_set_vcpu_affinity(struct irq_data *data, void *arg)
+{
+ struct riscv_iommu_info *info = data->domain->host_data;
+ struct riscv_iommu_domain *domain = info->domain;
+ struct riscv_iommu_ir_vcpu_info *vcpu_info = arg;
+ struct riscv_iommu_msipte pteval;
+ struct riscv_iommu_msipte *pte;
+ bool inc = false, dec = false;
+ size_t old_idx, new_idx;
+ u32 old_config;
+
+ if (!domain->msi_root)
+ return -EOPNOTSUPP;
+
+ old_idx = riscv_iommu_ir_irq_msitbl_idx(data);
+ old_config = riscv_iommu_ir_irq_msitbl_config(data);
+
+ if (!vcpu_info) {
+ riscv_iommu_ir_msitbl_unmap(domain, data, old_idx);
+ return 0;
+ }
+
+ guard(raw_spinlock)(&domain->msi_lock);
+
+ if (!riscv_iommu_ir_vcpu_check_config(domain, vcpu_info))
+ return riscv_iommu_ir_vcpu_new_config(domain, data, vcpu_info);
+
+ new_idx = riscv_iommu_ir_compute_msipte_idx(domain, vcpu_info->gpa);
+ riscv_iommu_ir_irq_set_msitbl_info(data, new_idx, domain->msitbl_config);
+
+ pte = &domain->msi_root[new_idx];
+ riscv_iommu_ir_set_pte(&pteval, vcpu_info->hpa);
+
+ if (pteval.pte != pte->pte) {
+ *pte = pteval;
+ riscv_iommu_ir_msitbl_inval(domain, pte);
+ }
+
+ if (old_config != domain->msitbl_config)
+ inc = true;
+ else if (new_idx != old_idx)
+ inc = dec = true;
+
+ if (dec && refcount_dec_and_test(&domain->msi_pte_counts[old_idx])) {
+ pte = &domain->msi_root[old_idx];
+ riscv_iommu_ir_clear_pte(pte);
+ riscv_iommu_ir_msitbl_inval(domain, pte);
+ }
+
+ if (inc && !refcount_inc_not_zero(&domain->msi_pte_counts[new_idx]))
+ refcount_set(&domain->msi_pte_counts[new_idx], 1);
+
+ return 0;
+}
+
static struct irq_chip riscv_iommu_ir_irq_chip = {
.name = "IOMMU-IR",
.irq_ack = irq_chip_ack_parent,
.irq_mask = irq_chip_mask_parent,
.irq_unmask = irq_chip_unmask_parent,
.irq_set_affinity = riscv_iommu_ir_irq_set_affinity,
+ .irq_set_vcpu_affinity = riscv_iommu_ir_irq_set_vcpu_affinity,
};
static int riscv_iommu_ir_irq_domain_alloc_irqs(struct irq_domain *irqdomain,
@@ -334,7 +493,11 @@ static void riscv_iommu_ir_irq_domain_free_irqs(struct irq_domain *irqdomain,
config = riscv_iommu_ir_irq_msitbl_config(data);
/*
* Only irqs with matching config versions need to be unmapped here
- * since config changes will unmap everything.
+ * since config changes will unmap everything and irq-set-vcpu-affinity
+ * irq deletions unmap at deletion time. An example of stale indices that
+ * don't need to be unmapped are those of irqs allocated by VFIO that a
+ * guest driver never used. The config change made for the guest will have
+ * already unmapped those, though, so there's no need to unmap them here.
*/
if (config == domain->msitbl_config) {
idx = riscv_iommu_ir_irq_msitbl_idx(data);
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 440c3eb6f15a..02f38aa0b231 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -957,8 +957,9 @@ static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
* device is not quiesced might be disruptive, potentially causing
* interim translation faults.
*/
-static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
- struct device *dev, struct riscv_iommu_dc *new_dc)
+void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
+ struct device *dev,
+ struct riscv_iommu_dc *new_dc)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct riscv_iommu_dc *dc;
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index 130f82e8392a..5ab2b4d6ee88 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -124,6 +124,10 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu);
void riscv_iommu_remove(struct riscv_iommu_device *iommu);
void riscv_iommu_disable(struct riscv_iommu_device *iommu);
+void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
+ struct device *dev,
+ struct riscv_iommu_dc *new_dc);
+
void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
struct riscv_iommu_command *cmd);
void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, unsigned int timeout_us);
--
2.49.0
Powered by blists - more mailing lists