[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <8f257d2651eb8a4358fcbd47b0145002e5f1d638.1764237717.git.calvin@wbinvd.org>
Date: Thu, 27 Nov 2025 09:06:35 -0800
From: Calvin Owens <calvin@...nvd.org>
To: linux-kernel@...r.kernel.org
Cc: iommu@...ts.linux.dev,
Jason Gunthorpe <jgg@...pe.ca>,
Lu Baolu <baolu.lu@...ux.intel.com>,
Nicolin Chen <nicolinc@...dia.com>,
Joerg Roedel <joro@...tes.org>,
Will Deacon <will@...nel.org>,
David Woodhouse <dwmw2@...radead.org>,
Robin Murphy <robin.murphy@....com>
Subject: [PATCH next] iommu/vt-d: Use shallowest supported table depth in sagaw
A Skylake machine has problems with strict translation on next-20251124:
pci 0000:06:00.0: Adding to iommu group 18
------------[ cut here ]------------
WARNING: drivers/iommu/iommu.c:3055 at iommu_setup_default_domain+0x268/0x2f0, CPU#2: swapper/0/1
CPU: 2 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.18.0-rc6-next-20251124 #1 PREEMPTLAZY
Hardware name: ASUSTeK COMPUTER INC. WS C246M PRO Series/WS C246M PRO Series, BIOS 6101 06/26/2024
RIP: 0010:iommu_setup_default_domain+0x268/0x2f0
<snip>
Call Trace:
<TASK>
iommu_device_register+0x126/0x200
intel_iommu_init+0x2bf/0x580
pci_iommu_init+0xb/0x30
do_one_initcall+0xad/0x1c0
kernel_init_freeable+0x238/0x290
kernel_init+0x16/0x120
ret_from_fork+0x1ba/0x1f0
ret_from_fork_asm+0x11/0x20
</TASK>
Kernel panic - not syncing: kernel: panic_on_warn set ...
<snip>
Dumping ftrace buffer:
---------------------------------
2) | __iommu_group_set_domain_internal() { /* <-iommu_setup_default_domain+0x25e/0x2f0 */
2) | __iommu_device_set_domain() { /* <-__iommu_group_set_domain_internal+0x6d/0x140 */
2) | __iommu_attach_device() { /* <-__iommu_device_set_domain+0x6d/0xb0 */
2) | intel_iommu_attach_device() { /* <-__iommu_attach_device+0x1f/0xe0 */
2) 0.140 us | device_block_translation(); /* <-intel_iommu_attach_device+0x19/0x80 ret=0xffffffff81b5e980 */
2) | paging_domain_compatible() { /* <-intel_iommu_attach_device+0x24/0x80 */
2) | paging_domain_compatible_second_stage() { /* <-paging_domain_compatible+0x47/0x170 */
2) 0.137 us | pt_iommu_vtdss_hw_info(); /* <-paging_domain_compatible_second_stage+0x29/0x1a0 ret=0x1 */
2) 0.530 us | } /* paging_domain_compatible_second_stage ret=-22 */
2) 0.907 us | } /* paging_domain_compatible ret=-22 */
2) 1.653 us | } /* intel_iommu_attach_device ret=-22 */
2) 2.157 us | } /* __iommu_attach_device ret=-22 */
2) 2.528 us | } /* __iommu_device_set_domain ret=-22 */
2) 2.954 us | } /* __iommu_group_set_domain_internal ret=-22 */
---------------------------------
Rebooting in 10 seconds..
The failing condition in paging_domain_compatible_second_stage() is:
/* Page table level is supported. */
if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw)))
return -EINVAL;
This happens because, for many domains on this machine, MGAW=39 but
SAGAW=0x04: that claims a 39-bit maximum address width, but also claims
to only support 48-bit/4-level paging, which seems odd.
Before the GENERIC_PT rewrite, the kernel only looked at SAGAW, so this
machine has been happily running for years using 4-level paging.
Now, the kernel refuses to use 4-level paging because MGAW=39. But SAGAW
claims not to support anything else, so we hit the -EINVAL case above
and fail to initialize.
If I force 4-level paging, everything works. If I force 39-bit/3-level
paging, nothing works (lots of bad context faults). So it seems like the
machine really only supports 4-level paging despite the 3-level MGAW.
I initially thought this was a latent firmware bug. But I can't actually
find anything in the VT-d spec which says the page table can't be deeper
than the physical address width. If it really is allowed, it's certainly
wasteful, but it seems to be the only way this machine will work.
Fix this by using the smallest page table depth supported by SAGAW which
is large enough to contain MGAW, allowing a deeper table than MGAW if
the hardware only supports that configuration.
Signed-off-by: Calvin Owens <calvin@...nvd.org>
---
drivers/iommu/generic_pt/fmt/vtdss.h | 7 ++++++
drivers/iommu/intel/iommu.c | 32 ++++++++++++++++++++++++++++
include/linux/generic_pt/iommu.h | 4 ++++
3 files changed, 43 insertions(+)
diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h
index d9774848eb6f..bbf6861d9be5 100644
--- a/drivers/iommu/generic_pt/fmt/vtdss.h
+++ b/drivers/iommu/generic_pt/fmt/vtdss.h
@@ -249,6 +249,13 @@ static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
{
struct pt_vtdss *table = &iommu_table->vtdss_pt;
unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+ unsigned int ptsz_lg2 = cfg->common.hw_min_ptsz_lg2;
+
+ if (vasz_lg2 < ptsz_lg2) {
+ pr_warn_once(FW_BUG "HW requires wasteful %ubit PT with %ubit MGAW\n",
+ ptsz_lg2, vasz_lg2);
+ vasz_lg2 = ptsz_lg2;
+ }
if (vasz_lg2 > PT_MAX_VA_ADDRESS_LG2)
return -EOPNOTSUPP;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index d745f833d8b5..d44766bba3d7 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -2798,6 +2798,36 @@ static struct dmar_domain *paging_domain_alloc(void)
return domain;
}
+static unsigned int compute_min_ptsz_lg2(struct intel_iommu *iommu)
+{
+ unsigned int sagaw = cap_sagaw(iommu->cap);
+ unsigned int mgaw = cap_mgaw(iommu->cap);
+
+ /*
+ * Return the shallowest pagetable depth sufficient to represent the
+ * maximum guest address width which is supported by the hardware. On
+ * some hardware, that shallowest depth is deeper than the MGAW.
+ */
+
+ if (mgaw > 48)
+ goto five;
+
+ if (mgaw > 39)
+ goto four;
+
+ if (sagaw & BIT(1))
+ return 39;
+four:
+ if (sagaw & BIT(2))
+ return 48;
+five:
+ if (sagaw & BIT(3))
+ return 57;
+
+ pr_warn(FW_BUG "Can't satisfy mgaw=%u and sagaw=%02x", mgaw, sagaw);
+ return mgaw;
+}
+
static struct iommu_domain *
intel_iommu_domain_alloc_first_stage(struct device *dev,
struct intel_iommu *iommu, u32 flags)
@@ -2832,6 +2862,7 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
cfg.common.hw_max_vasz_lg2 = min(cap_mgaw(iommu->cap),
cfg.common.hw_max_vasz_lg2);
cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.hw_min_ptsz_lg2 = compute_min_ptsz_lg2(iommu);
cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
BIT(PT_FEAT_FLUSH_RANGE);
/* First stage always uses scalable mode */
@@ -2916,6 +2947,7 @@ intel_iommu_domain_alloc_second_stage(struct device *dev,
cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu);
cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.hw_min_ptsz_lg2 = compute_min_ptsz_lg2(iommu);
cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
/*
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index cfe05a77f86b..8c32e492d6d1 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -188,6 +188,10 @@ struct pt_iommu_cfg {
* might select a lower maximum OA.
*/
u8 hw_max_oasz_lg2;
+ /**
+ * @hw_min_ptsz_lg2: Minimum page table depth the IOMMU HW can support.
+ */
+ u8 hw_min_ptsz_lg2;
};
/* Generate the exported function signatures from iommu_pt.h */
--
2.47.3
Powered by blists - more mailing lists