Message-ID: <20251221160920.297689-2-aneesh.kumar@kernel.org>
Date: Sun, 21 Dec 2025 21:39:17 +0530
From: "Aneesh Kumar K.V (Arm)" <aneesh.kumar@...nel.org>
To: linux-kernel@...r.kernel.org,
	iommu@...ts.linux.dev,
	linux-coco@...ts.linux.dev
Cc: Catalin Marinas <catalin.marinas@....com>,
	will@...nel.org,
	maz@...nel.org,
	tglx@...utronix.de,
	robin.murphy@....com,
	suzuki.poulose@....com,
	akpm@...ux-foundation.org,
	jgg@...pe.ca,
	steven.price@....com,
	"Aneesh Kumar K.V (Arm)" <aneesh.kumar@...nel.org>
Subject: [PATCH v2 1/4] swiotlb: dma: its: Enforce host page-size alignment for shared buffers

When running as a private-memory guest, the kernel must apply additional
constraints when allocating buffers that are shared with the hypervisor.

These shared buffers are also accessed by the host kernel and therefore
must be aligned to the host's page size.

On non-secure hosts, set_guest_memory_attributes() tracks memory at the
host PAGE_SIZE granularity. This creates a mismatch when the guest
applies attributes at 4K boundaries while the host uses 64K pages. In
such cases, the call returns -EINVAL, preventing the conversion of
memory regions from private to shared.

Architectures such as Arm can tolerate realm physical address space PFNs
being mapped as shared memory, as incorrect accesses are detected and
reported as GPC faults. However, relying on this mechanism is unsafe and
can still lead to kernel crashes.

This is particularly likely when guest_memfd allocations are mmapped and
accessed from userspace. Once exposed to userspace, we cannot guarantee
that applications will only access the intended 4K shared region rather
than the full 64K page mapped into their address space. Such userspace
addresses may also be passed back into the kernel and accessed via the
linear map, resulting in a GPC fault and a kernel crash.

With CCA, although Stage-2 mappings managed by the RMM still operate at
a 4K granularity, shared pages must nonetheless be aligned to the
host-managed page size to avoid the issues described above.

Introduce a new helper, mem_encrypt_align(), so that callers can enforce
the required alignment for shared buffers.

The architecture-specific implementation of mem_encrypt_align() will be
provided in a follow-up patch.
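
For reference, a minimal caller-side sketch (illustrative only;
alloc_shared_buffer() is a hypothetical function modeled on the
its_alloc_pages_node() change below, not something this patch adds):

  #include <linux/gfp.h>
  #include <linux/mem_encrypt.h>
  #include <linux/mm.h>
  #include <linux/set_memory.h>

  /* Size a shared allocation with mem_encrypt_align() before converting it. */
  static void *alloc_shared_buffer(size_t size, gfp_t gfp)
  {
  	unsigned int order = get_order(mem_encrypt_align(size));
  	struct page *page = alloc_pages(gfp, order);

  	if (!page)
  		return NULL;

  	if (set_memory_decrypted((unsigned long)page_address(page),
  				 1 << order)) {
  		/* Conversion state is unknown, so leak the page rather than free it. */
  		return NULL;
  	}

  	return page_address(page);
  }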

Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@...nel.org>
---
 arch/arm64/include/asm/mem_encrypt.h |  6 ++++++
 arch/arm64/mm/mem_encrypt.c          |  6 ++++++
 drivers/irqchip/irq-gic-v3-its.c     |  7 ++++---
 include/linux/mem_encrypt.h          |  7 +++++++
 kernel/dma/contiguous.c              | 10 ++++++++++
 kernel/dma/direct.c                  |  6 ++++++
 kernel/dma/pool.c                    |  6 ++++--
 kernel/dma/swiotlb.c                 | 18 ++++++++++++------
 8 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h
index d77c10cd5b79..b7ac143b81ce 100644
--- a/arch/arm64/include/asm/mem_encrypt.h
+++ b/arch/arm64/include/asm/mem_encrypt.h
@@ -17,6 +17,12 @@ int set_memory_encrypted(unsigned long addr, int numpages);
 int set_memory_decrypted(unsigned long addr, int numpages);
 bool force_dma_unencrypted(struct device *dev);
 
+#define mem_encrypt_align mem_encrypt_align
+static inline size_t mem_encrypt_align(size_t size)
+{
+	return size;
+}
+
 int realm_register_memory_enc_ops(void);
 
 /*
diff --git a/arch/arm64/mm/mem_encrypt.c b/arch/arm64/mm/mem_encrypt.c
index 645c099fd551..deb364eadd47 100644
--- a/arch/arm64/mm/mem_encrypt.c
+++ b/arch/arm64/mm/mem_encrypt.c
@@ -46,6 +46,12 @@ int set_memory_decrypted(unsigned long addr, int numpages)
 	if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr)))
 		return 0;
 
+	if (WARN_ON(!IS_ALIGNED(addr, mem_encrypt_align(PAGE_SIZE))))
+		return 0;
+
+	if (WARN_ON(!IS_ALIGNED(numpages << PAGE_SHIFT, mem_encrypt_align(PAGE_SIZE))))
+		return 0;
+
 	return crypt_ops->decrypt(addr, numpages);
 }
 EXPORT_SYMBOL_GPL(set_memory_decrypted);
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 467cb78435a9..ffb8ef3a1eb3 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -213,16 +213,17 @@ static gfp_t gfp_flags_quirk;
 static struct page *its_alloc_pages_node(int node, gfp_t gfp,
 					 unsigned int order)
 {
+	unsigned int new_order;
 	struct page *page;
 	int ret = 0;
 
-	page = alloc_pages_node(node, gfp | gfp_flags_quirk, order);
-
+	new_order = get_order(mem_encrypt_align((PAGE_SIZE << order)));
+	page = alloc_pages_node(node, gfp | gfp_flags_quirk, new_order);
 	if (!page)
 		return NULL;
 
 	ret = set_memory_decrypted((unsigned long)page_address(page),
-				   1 << order);
+				   1 << new_order);
 	/*
 	 * If set_memory_decrypted() fails then we don't know what state the
 	 * page is in, so we can't free it. Instead we leak it.
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
index 07584c5e36fb..a0b9f6fe5d1a 100644
--- a/include/linux/mem_encrypt.h
+++ b/include/linux/mem_encrypt.h
@@ -54,6 +54,13 @@
 #define dma_addr_canonical(x)		(x)
 #endif
 
+#ifndef mem_encrypt_align
+static inline size_t mem_encrypt_align(size_t size)
+{
+	return size;
+}
+#endif
+
 #endif	/* __ASSEMBLY__ */
 
 #endif	/* __MEM_ENCRYPT_H__ */
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index d9b9dcba6ff7..35f738c9eee2 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -45,6 +45,7 @@
 #include <linux/dma-map-ops.h>
 #include <linux/cma.h>
 #include <linux/nospec.h>
+#include <linux/dma-direct.h>
 
 #ifdef CONFIG_CMA_SIZE_MBYTES
 #define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
@@ -356,6 +357,15 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
 	int nid = dev_to_node(dev);
 #endif
 
+	/*
+	 * Devices that require unencrypted (shared) DMA buffers need the
+	 * buffer aligned to the allocation size. If CMA cannot provide
+	 * that alignment, fail the CMA allocation early.
+	 */
+	if (force_dma_unencrypted(dev))
+		if (get_order(size) > CONFIG_CMA_ALIGNMENT)
+			return NULL;
+
 	/* CMA can be used only in the context which permits sleeping */
 	if (!gfpflags_allow_blocking(gfp))
 		return NULL;
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 1f9ee9759426..3448d877c7c6 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -250,6 +250,9 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 	    dma_direct_use_pool(dev, gfp))
 		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
+	if (force_dma_unencrypted(dev))
+		size = mem_encrypt_align(size);
+
 	/* we always manually zero the memory once we are done */
 	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
 	if (!page)
@@ -359,6 +362,9 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 	if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
 		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
+	if (force_dma_unencrypted(dev))
+		size = mem_encrypt_align(size);
+
 	page = __dma_direct_alloc_pages(dev, size, gfp, false);
 	if (!page)
 		return NULL;
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index ee45dee33d49..86615e088240 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -80,12 +80,13 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
 			      gfp_t gfp)
 {
 	unsigned int order;
+	unsigned int min_encrypt_order = get_order(mem_encrypt_align(PAGE_SIZE));
 	struct page *page = NULL;
 	void *addr;
 	int ret = -ENOMEM;
 
 	/* Cannot allocate larger than MAX_PAGE_ORDER */
-	order = min(get_order(pool_size), MAX_PAGE_ORDER);
+	order = min(get_order(mem_encrypt_align(pool_size)), MAX_PAGE_ORDER);
 
 	do {
 		pool_size = 1 << (PAGE_SHIFT + order);
@@ -94,7 +95,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
 							 order, false);
 		if (!page)
 			page = alloc_pages(gfp, order);
-	} while (!page && order-- > 0);
+	} while (!page && order-- > min_encrypt_order);
 	if (!page)
 		goto out;
 
@@ -196,6 +197,7 @@ static int __init dma_atomic_pool_init(void)
 		unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
 		pages = min_t(unsigned long, pages, MAX_ORDER_NR_PAGES);
 		atomic_pool_size = max_t(size_t, pages << PAGE_SHIFT, SZ_128K);
+		WARN_ON(!IS_ALIGNED(atomic_pool_size, mem_encrypt_align(PAGE_SIZE)));
 	}
 	INIT_WORK(&atomic_pool_work, atomic_pool_work_fn);
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 0d37da3d95b6..db53dc7bff6a 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -319,8 +319,8 @@ static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
 		unsigned int flags,
 		int (*remap)(void *tlb, unsigned long nslabs))
 {
-	size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
 	void *tlb;
+	size_t bytes = mem_encrypt_align(nslabs << IO_TLB_SHIFT);
 
 	/*
 	 * By default allocate the bounce buffer memory from low memory, but
@@ -328,9 +328,9 @@ static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
 	 * memory encryption.
 	 */
 	if (flags & SWIOTLB_ANY)
-		tlb = memblock_alloc(bytes, PAGE_SIZE);
+		tlb = memblock_alloc(bytes, mem_encrypt_align(PAGE_SIZE));
 	else
-		tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+		tlb = memblock_alloc_low(bytes, mem_encrypt_align(PAGE_SIZE));
 
 	if (!tlb) {
 		pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
@@ -339,7 +339,7 @@ static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
 	}
 
 	if (remap && remap(tlb, nslabs) < 0) {
-		memblock_free(tlb, PAGE_ALIGN(bytes));
+		memblock_free(tlb, bytes);
 		pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
 		return NULL;
 	}
@@ -461,15 +461,21 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 		swiotlb_adjust_nareas(num_possible_cpus());
 
 retry:
-	order = get_order(nslabs << IO_TLB_SHIFT);
+	order = get_order(mem_encrypt_align(nslabs << IO_TLB_SHIFT));
 	nslabs = SLABS_PER_PAGE << order;
 
+	WARN_ON(!IS_ALIGNED(PAGE_SIZE << order, mem_encrypt_align(PAGE_SIZE)));
+	WARN_ON(!IS_ALIGNED(default_nslabs << IO_TLB_SHIFT, mem_encrypt_align(PAGE_SIZE)));
+	WARN_ON(!IS_ALIGNED(IO_TLB_MIN_SLABS << IO_TLB_SHIFT, mem_encrypt_align(PAGE_SIZE)));
+
 	while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
 		vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
 						  order);
 		if (vstart)
 			break;
 		order--;
+		if (order < get_order(mem_encrypt_align(PAGE_SIZE)))
+			break;
 		nslabs = SLABS_PER_PAGE << order;
 		retried = true;
 	}
@@ -573,7 +579,7 @@ void __init swiotlb_exit(void)
  */
 static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
 {
-	unsigned int order = get_order(bytes);
+	unsigned int order = get_order(mem_encrypt_align(bytes));
 	struct page *page;
 	phys_addr_t paddr;
 	void *vaddr;
-- 
2.43.0

