Message-ID: <20250709085158.0f050630@DESKTOP-0403QTC.>
Date: Wed, 9 Jul 2025 08:51:58 -0700
From: Jacob Pan <jacob.pan@...ux.microsoft.com>
To: Lu Baolu <baolu.lu@...ux.intel.com>
Cc: Joerg Roedel <joro@...tes.org>, Will Deacon <will@...nel.org>, Robin
 Murphy <robin.murphy@....com>, Kevin Tian <kevin.tian@...el.com>, Jason
 Gunthorpe <jgg@...dia.com>, Jann Horn <jannh@...gle.com>, Vasant Hegde
 <vasant.hegde@....com>, Dave Hansen <dave.hansen@...el.com>, Alistair
 Popple <apopple@...dia.com>, Peter Zijlstra <peterz@...radead.org>,
 Uladzislau Rezki <urezki@...il.com>, Jean-Philippe Brucker
 <jean-philippe@...aro.org>, Andy Lutomirski <luto@...nel.org>,
 iommu@...ts.linux.dev, security@...nel.org, linux-kernel@...r.kernel.org,
 stable@...r.kernel.org, jacob.pan@...ux.microsoft.com
Subject: Re: [PATCH 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush

Hi Baolu,

On Fri,  4 Jul 2025 21:30:56 +0800
Lu Baolu <baolu.lu@...ux.intel.com> wrote:

> The vmalloc() and vfree() functions manage virtually contiguous, but
> not necessarily physically contiguous, kernel memory regions. When
> vfree() unmaps such a region, it tears down the associated kernel
> page table entries and frees the physical pages.
>
> In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU
> hardware shares and walks the CPU's page tables. Architectures like
> x86 share static kernel address mappings across all user page tables,
> allowing the IOMMU to access the kernel portion of these tables.
Is there a use case where an SVA user can access kernel memory in the
first place? The VT-d code does not seem to set supervisor request
enable (SRE) for user PASIDs, and I don't see an SRE equivalent in the
AMD IOMMU GCR3 table. So the PTE U/S bit will prevent kernel memory
access, no?
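For illustration, the check I have in mind is just the U/S bit on the
shared kernel PTEs (a minimal sketch; pte_flags() and _PAGE_USER are
the real x86 helpers, the wrapper function itself is hypothetical):

	#include <linux/pgtable.h>

	/*
	 * Kernel mappings are created without _PAGE_USER, so a
	 * user-privilege (non-SRE) IOMMU walk of the kernel half of a
	 * shared page table should fault rather than translate.
	 */
	static bool kva_pte_user_accessible(pte_t pte)
	{
		return pte_flags(pte) & _PAGE_USER;
	}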

> Modern IOMMUs often cache page table entries to optimize walk
> performance, even for intermediate page table levels. If kernel page
Just wondering whether this patch is specific to "intermediate page
tables" in any way, since the invalidation hint is always 0 and the
intermediate (paging-structure) caches are therefore always flushed.
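To make that concrete (a sketch against the existing VT-d helper
qi_flush_piotlb(); the wrapper function and its parameters are
placeholders, not part of this patch):

	/* Assumes the VT-d driver's private header for qi_flush_piotlb(). */
	static void example_flush_user_pasid(struct intel_iommu *iommu,
					     u16 did, u32 pasid, u64 addr,
					     unsigned long npages)
	{
		/*
		 * ih == 0 means no invalidation hint, so the hardware
		 * must also drop non-leaf (intermediate)
		 * paging-structure cache entries for the range.
		 */
		qi_flush_piotlb(iommu, did, pasid, addr, npages, 0);
	}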

> table mappings are changed (e.g., by vfree()), but the IOMMU's
> internal caches retain stale entries, a use-after-free (UAF)
> condition arises. If these freed page table pages are
> reallocated for a different purpose, potentially by an attacker, the
> IOMMU could misinterpret the new data as valid page table entries.
> This allows the IOMMU to walk into attacker-controlled memory,
> leading to arbitrary physical memory DMA access or privilege
> escalation.
> 
> To mitigate this, introduce a new iommu interface to flush IOMMU
> caches and fence pending page table walks when kernel page mappings
> are updated. This interface should be invoked from
> architecture-specific code that manages combined user and kernel page
> tables.
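As a usage sketch, I read this as: any architecture that shares its
kernel mappings this way hooks the new interface from its kernel TLB
flush path, as the x86 hunk below does
(my_arch_flush_tlb_kernel_range() is hypothetical):

	#include <linux/iommu.h>

	/* Hypothetical non-x86 hook mirroring the x86 change below. */
	void my_arch_flush_tlb_kernel_range(unsigned long start,
					    unsigned long end)
	{
		/* ... arch-specific CPU kernel TLB invalidation ... */
		iommu_sva_invalidate_kva_range(start, end);
	}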
> 
> Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
> Cc: stable@...r.kernel.org
> Co-developed-by: Jason Gunthorpe <jgg@...dia.com>
> Signed-off-by: Jason Gunthorpe <jgg@...dia.com>
> Signed-off-by: Lu Baolu <baolu.lu@...ux.intel.com>
> ---
>  arch/x86/mm/tlb.c         |  2 ++
>  drivers/iommu/iommu-sva.c | 32 +++++++++++++++++++++++++++++++-
>  include/linux/iommu.h     |  4 ++++
>  3 files changed, 37 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 39f80111e6f1..a41499dfdc3f 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -12,6 +12,7 @@
>  #include <linux/task_work.h>
>  #include <linux/mmu_notifier.h>
>  #include <linux/mmu_context.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/tlbflush.h>
>  #include <asm/mmu_context.h>
> @@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>  	kernel_tlb_flush_range(info);
>  
>  	put_flush_tlb_info();
> +	iommu_sva_invalidate_kva_range(start, end);
>  }
>  
>  /*
> diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
> index 1a51cfd82808..154384eab8a3 100644
> --- a/drivers/iommu/iommu-sva.c
> +++ b/drivers/iommu/iommu-sva.c
> @@ -10,6 +10,8 @@
>  #include "iommu-priv.h"
>  
>  static DEFINE_MUTEX(iommu_sva_lock);
> +static DEFINE_STATIC_KEY_FALSE(iommu_sva_present);
> +static LIST_HEAD(iommu_sva_mms);
>  static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
>  						    struct mm_struct *mm);
>  
> @@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
>  		return ERR_PTR(-ENOSPC);
>  	}
>  	iommu_mm->pasid = pasid;
> +	iommu_mm->mm = mm;
>  	INIT_LIST_HEAD(&iommu_mm->sva_domains);
>  	/*
>  	 * Make sure the write to mm->iommu_mm is not reordered in front of
> @@ -132,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
>  	if (ret)
>  		goto out_free_domain;
>  	domain->users = 1;
> -	list_add(&domain->next, &mm->iommu_mm->sva_domains);
>  
> +	if (list_empty(&iommu_mm->sva_domains)) {
> +		if (list_empty(&iommu_sva_mms))
> +			static_branch_enable(&iommu_sva_present);
> +		list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
> +	}
> +	list_add(&domain->next, &iommu_mm->sva_domains);
>  out:
>  	refcount_set(&handle->users, 1);
>  	mutex_unlock(&iommu_sva_lock);
> @@ -175,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
>  		list_del(&domain->next);
>  		iommu_domain_free(domain);
>  	}
> +
> +	if (list_empty(&iommu_mm->sva_domains)) {
> +		list_del(&iommu_mm->mm_list_elm);
> +		if (list_empty(&iommu_sva_mms))
> +			static_branch_disable(&iommu_sva_present);
> +	}
> +
>  	mutex_unlock(&iommu_sva_lock);
>  	kfree(handle);
>  }
> @@ -312,3 +327,18 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
>  
>  	return domain;
>  }
> +
> +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
> +{
> +	struct iommu_mm_data *iommu_mm;
> +
> +	might_sleep();
> +
> +	if (!static_branch_unlikely(&iommu_sva_present))
> +		return;
> +
> +	guard(mutex)(&iommu_sva_lock);
> +	list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
> +		mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
> +}
> +EXPORT_SYMBOL_GPL(iommu_sva_invalidate_kva_range);
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 156732807994..31330c12b8ee 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -1090,7 +1090,9 @@ struct iommu_sva {
>  
>  struct iommu_mm_data {
>  	u32			pasid;
> +	struct mm_struct	*mm;
>  	struct list_head	sva_domains;
> +	struct list_head	mm_list_elm;
>  };
>  
>  int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
> @@ -1571,6 +1573,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
>  					struct mm_struct *mm);
>  void iommu_sva_unbind_device(struct iommu_sva *handle);
>  u32 iommu_sva_get_pasid(struct iommu_sva *handle);
> +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
>  #else
>  static inline struct iommu_sva *
>  iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
> @@ -1595,6 +1598,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
>  }
>  
>  static inline void mm_pasid_drop(struct mm_struct *mm) {}
> +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
>  #endif /* CONFIG_IOMMU_SVA */
>  
>  #ifdef CONFIG_IOMMU_IOPF
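The static-key gating itself looks straightforward; in miniature, the
pattern being relied on is (sketch only, using the same jump-label,
mutex-guard, and list helpers as the patch):

	#include <linux/cleanup.h>
	#include <linux/jump_label.h>
	#include <linux/list.h>
	#include <linux/mutex.h>

	static DEFINE_STATIC_KEY_FALSE(example_present);
	static DEFINE_MUTEX(example_lock);
	static LIST_HEAD(example_items);

	struct example_item {
		struct list_head node;
	};

	/* Hot path: a single patched branch when nothing is registered. */
	static void example_walk_if_any(void)
	{
		struct example_item *item;

		if (!static_branch_unlikely(&example_present))
			return;

		guard(mutex)(&example_lock);
		list_for_each_entry(item, &example_items, node)
			/* act on each registered item */;
	}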

