lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <8f257d2651eb8a4358fcbd47b0145002e5f1d638.1764237717.git.calvin@wbinvd.org>
Date: Thu, 27 Nov 2025 09:06:35 -0800
From: Calvin Owens <calvin@...nvd.org>
To: linux-kernel@...r.kernel.org
Cc: iommu@...ts.linux.dev,
	Jason Gunthorpe <jgg@...pe.ca>,
	Lu Baolu <baolu.lu@...ux.intel.com>,
	Nicolin Chen <nicolinc@...dia.com>,
	Joerg Roedel <joro@...tes.org>,
	Will Deacon <will@...nel.org>,
	David Woodhouse <dwmw2@...radead.org>,
	Robin Murphy <robin.murphy@....com>
Subject: [PATCH next] iommu/vt-d: Use shallowest supported table depth in sagaw

A Skylake machine has problems with strict translation on next-20251124:

    pci 0000:06:00.0: Adding to iommu group 18
    ------------[ cut here ]------------
    WARNING: drivers/iommu/iommu.c:3055 at iommu_setup_default_domain+0x268/0x2f0, CPU#2: swapper/0/1
    CPU: 2 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.18.0-rc6-next-20251124 #1 PREEMPTLAZY
    Hardware name: ASUSTeK COMPUTER INC. WS C246M PRO Series/WS C246M PRO Series, BIOS 6101 06/26/2024
    RIP: 0010:iommu_setup_default_domain+0x268/0x2f0
    <snip>
    Call Trace:
     <TASK>
     iommu_device_register+0x126/0x200
     intel_iommu_init+0x2bf/0x580
     pci_iommu_init+0xb/0x30
     do_one_initcall+0xad/0x1c0
     kernel_init_freeable+0x238/0x290
     kernel_init+0x16/0x120
     ret_from_fork+0x1ba/0x1f0
     ret_from_fork_asm+0x11/0x20
     </TASK>
    Kernel panic - not syncing: kernel: panic_on_warn set ...
    <snip>
    Dumping ftrace buffer:
    ---------------------------------
     2)               |    __iommu_group_set_domain_internal() { /* <-iommu_setup_default_domain+0x25e/0x2f0 */
     2)               |      __iommu_device_set_domain() { /* <-__iommu_group_set_domain_internal+0x6d/0x140 */
     2)               |        __iommu_attach_device() { /* <-__iommu_device_set_domain+0x6d/0xb0 */
     2)               |          intel_iommu_attach_device() { /* <-__iommu_attach_device+0x1f/0xe0 */
     2)   0.140 us    |            device_block_translation(); /* <-intel_iommu_attach_device+0x19/0x80 ret=0xffffffff81b5e980 */
     2)               |            paging_domain_compatible() { /* <-intel_iommu_attach_device+0x24/0x80 */
     2)               |              paging_domain_compatible_second_stage() { /* <-paging_domain_compatible+0x47/0x170 */
     2)   0.137 us    |                pt_iommu_vtdss_hw_info(); /* <-paging_domain_compatible_second_stage+0x29/0x1a0 ret=0x1 */
     2)   0.530 us    |              } /* paging_domain_compatible_second_stage ret=-22 */
     2)   0.907 us    |            } /* paging_domain_compatible ret=-22 */
     2)   1.653 us    |          } /* intel_iommu_attach_device ret=-22 */
     2)   2.157 us    |        } /* __iommu_attach_device ret=-22 */
     2)   2.528 us    |      } /* __iommu_device_set_domain ret=-22 */
     2)   2.954 us    |    } /* __iommu_group_set_domain_internal ret=-22 */
    ---------------------------------
    Rebooting in 10 seconds..

The failing condition in paging_domain_compatible_second_stage() is:

    /* Page table level is supported. */
    if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw)))
        return -EINVAL;

This happens because, for many domains on this machine, MGAW=39 but
SAGAW=0x04: that claims a 39-bit maximum address width, but also claims
to only support 48-bit/4-level paging, which seems odd.

Before the GENERIC_PT rewrite, the kernel only looked at SAGAW, so this
machine has been happily running for years using 4-level paging.

Now, the kernel refuses to use 4-level paging because MGAW=39. But SAGAW
claims not to support anything else, so we hit the -EINVAL case above
and fail to initialize.

If I force 4-level paging, everything works. If I force 39-bit/3-level
paging, nothing works (lots of bad context faults). So it seems like the
machine really only supports 4-level paging despite the 39-bit MGAW.

I initially thought this was a latent firmware bug. But I can't actually
find anything in the VT-d spec which says the page table can't be deeper
than the physical address width. If it really is allowed, it's certainly
wasteful, but it seems to be the only way this machine will work.

Fix this by using the smallest page table depth supported by SAGAW which
is large enough to contain MGAW, allowing a deeper table than MGAW if
the hardware only supports that configuration.

Signed-off-by: Calvin Owens <calvin@...nvd.org>
---
 drivers/iommu/generic_pt/fmt/vtdss.h |  7 ++++++
 drivers/iommu/intel/iommu.c          | 32 ++++++++++++++++++++++++++++
 include/linux/generic_pt/iommu.h     |  4 ++++
 3 files changed, 43 insertions(+)

diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h
index d9774848eb6f..bbf6861d9be5 100644
--- a/drivers/iommu/generic_pt/fmt/vtdss.h
+++ b/drivers/iommu/generic_pt/fmt/vtdss.h
@@ -249,6 +249,13 @@ static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
 {
 	struct pt_vtdss *table = &iommu_table->vtdss_pt;
 	unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+	unsigned int ptsz_lg2 = cfg->common.hw_min_ptsz_lg2;
+
+	if (vasz_lg2 < ptsz_lg2) {
+		pr_warn_once(FW_BUG "HW requires wasteful %ubit PT with %ubit MGAW\n",
+			ptsz_lg2, vasz_lg2);
+		vasz_lg2 = ptsz_lg2;
+	}
 
 	if (vasz_lg2 > PT_MAX_VA_ADDRESS_LG2)
 		return -EOPNOTSUPP;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index d745f833d8b5..d44766bba3d7 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -2798,6 +2798,36 @@ static struct dmar_domain *paging_domain_alloc(void)
 	return domain;
 }
 
+/*
+ * Return the address width (in bits) of the shallowest second-stage page
+ * table that SAGAW advertises and that can still cover MGAW. On some
+ * hardware that table is deeper than MGAW strictly requires. If SAGAW
+ * offers no such table, warn about the firmware bug and return MGAW.
+ */
+static unsigned int compute_min_ptsz_lg2(struct intel_iommu *iommu)
+{
+	unsigned int sagaw = cap_sagaw(iommu->cap);
+	unsigned int mgaw = cap_mgaw(iommu->cap);
+
+	if (mgaw > 48)
+		goto five;
+
+	if (mgaw > 39)
+		goto four;
+
+	if (sagaw & BIT(1))
+		return 39;
+four:
+	if (sagaw & BIT(2))
+		return 48;
+five:
+	if (sagaw & BIT(3))
+		return 57;
+
+	pr_warn(FW_BUG "Can't satisfy mgaw=%u and sagaw=%02x\n", mgaw, sagaw);
+	return mgaw;
+}
+
 static struct iommu_domain *
 intel_iommu_domain_alloc_first_stage(struct device *dev,
 				     struct intel_iommu *iommu, u32 flags)
@@ -2832,6 +2862,7 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
 	cfg.common.hw_max_vasz_lg2 = min(cap_mgaw(iommu->cap),
 					 cfg.common.hw_max_vasz_lg2);
 	cfg.common.hw_max_oasz_lg2 = 52;
+	cfg.common.hw_min_ptsz_lg2 = compute_min_ptsz_lg2(iommu);
 	cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
 			      BIT(PT_FEAT_FLUSH_RANGE);
 	/* First stage always uses scalable mode */
@@ -2916,6 +2947,7 @@ intel_iommu_domain_alloc_second_stage(struct device *dev,
 
 	cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu);
 	cfg.common.hw_max_oasz_lg2 = 52;
+	cfg.common.hw_min_ptsz_lg2 = compute_min_ptsz_lg2(iommu);
 	cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
 
 	/*
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index cfe05a77f86b..8c32e492d6d1 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -188,6 +188,10 @@ struct pt_iommu_cfg {
 	 * might select a lower maximum OA.
 	 */
 	u8 hw_max_oasz_lg2;
+	/**
+	 * @hw_min_ptsz_lg2: VA width in bits of the shallowest page table the HW can use.
+	 */
+	u8 hw_min_ptsz_lg2;
 };
 
 /* Generate the exported function signatures from iommu_pt.h */
-- 
2.47.3


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ