lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aJm0znaAqBRWqOCT@pc636>
Date: Mon, 11 Aug 2025 11:15:58 +0200
From: Uladzislau Rezki <urezki@...il.com>
To: Ethan Zhao <etzhao1900@...il.com>, Baolu Lu <baolu.lu@...ux.intel.com>
Cc: Baolu Lu <baolu.lu@...ux.intel.com>,
	Dave Hansen <dave.hansen@...el.com>,
	Jason Gunthorpe <jgg@...dia.com>, Joerg Roedel <joro@...tes.org>,
	Will Deacon <will@...nel.org>, Robin Murphy <robin.murphy@....com>,
	Kevin Tian <kevin.tian@...el.com>, Jann Horn <jannh@...gle.com>,
	Vasant Hegde <vasant.hegde@....com>,
	Alistair Popple <apopple@...dia.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Uladzislau Rezki <urezki@...il.com>,
	Jean-Philippe Brucker <jean-philippe@...aro.org>,
	Andy Lutomirski <luto@...nel.org>, Yi Lai <yi1.lai@...el.com>,
	iommu@...ts.linux.dev, security@...nel.org,
	linux-kernel@...r.kernel.org, stable@...r.kernel.org
Subject: Re: [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB
 flush

On Sun, Aug 10, 2025 at 03:19:58PM +0800, Ethan Zhao wrote:
> 
> 
> On 8/8/2025 1:15 PM, Baolu Lu wrote:
> > On 8/7/25 23:31, Dave Hansen wrote:
> > > > +void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> > > > +{
> > > > +    struct page *page = virt_to_page(pte);
> > > > +
> > > > +    guard(spinlock)(&kernel_pte_work.lock);
> > > > +    list_add(&page->lru, &kernel_pte_work.list);
> > > > +    schedule_work(&kernel_pte_work.work);
> > > > +}
> > > > diff --git a/include/asm-generic/pgalloc.h
> > > > b/include/asm-generic/ pgalloc.h
> > > > index 3c8ec3bfea44..716ebab67636 100644
> > > > --- a/include/asm-generic/pgalloc.h
> > > > +++ b/include/asm-generic/pgalloc.h
> > > > @@ -46,6 +46,7 @@ static inline pte_t
> > > > *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
> > > >   #define pte_alloc_one_kernel(...)
> > > > alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
> > > >   #endif
> > > > 
> > > > +#ifndef __HAVE_ARCH_PTE_FREE_KERNEL
> > > >   /**
> > > >    * pte_free_kernel - free PTE-level kernel page table memory
> > > >    * @mm: the mm_struct of the current context
> > > > @@ -55,6 +56,7 @@ static inline void pte_free_kernel(struct mm_struct
> > > > *mm, pte_t *pte)
> > > >   {
> > > >       pagetable_dtor_free(virt_to_ptdesc(pte));
> > > >   }
> > > > +#endif
> > > > 
> > > >   /**
> > > >    * __pte_alloc_one - allocate memory for a PTE-level user page table
> > > I'd much rather the arch-generic code looked like this:
> > > 
> > > #ifdef CONFIG_ASYNC_PGTABLE_FREE
> > > // code and struct here, or dump them over in some
> > > // other file and do this in a header
> > > #else
> > > static void pte_free_kernel_async(struct page *page) {}
> > > #endif
> > > 
> > > void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> > > {
> > >      struct page *page = virt_to_page(pte);
> > > 
> > >      if (IS_DEFINED(CONFIG_ASYNC_PGTABLE_FREE)) {
> > >     pte_free_kernel_async(page);
> > >      else
> > >     pagetable_dtor_free(page_ptdesc(page));
> > > }
> > > 
> > > Then in Kconfig, you end up with something like:
> > > 
> > > config ASYNC_PGTABLE_FREE
> > >     def_bool y
> > >     depends on INTEL_IOMMU_WHATEVER
> > > 
> > > That very much tells much more of the whole story in code. It also gives
> > > the x86 folks that compile out the IOMMU the exact same code as the
> > > arch-generic folks. It _also_ makes it dirt simple and obvious for the
> > > x86 folks to optimize out the async behavior if they don't like it in
> > > the future by replacing the compile-time IOMMU check with a runtime one.
> > > 
> > > Also, if another crazy IOMMU implementation comes along that happens to
> > > do what the x86 IOMMUs do, then they have a single Kconfig switch to
> > > flip. If they follow what this patch tries to do, they'll start by
> > > copying and pasting the x86 implementation.
> > 
> > I'll do it like this.  Does that look good to you?
> > 
> > diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> > index 70d29b14d851..6f1113e024fa 100644
> > --- a/drivers/iommu/Kconfig
> > +++ b/drivers/iommu/Kconfig
> > @@ -160,6 +160,7 @@ config IOMMU_DMA
> >   # Shared Virtual Addressing
> >   config IOMMU_SVA
> >       select IOMMU_MM_DATA
> > +    select ASYNC_PGTABLE_FREE if X86
> >       bool
> > 
> >   config IOMMU_IOPF
> > diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
> > index 3c8ec3bfea44..dbddacdca2ce 100644
> > --- a/include/asm-generic/pgalloc.h
> > +++ b/include/asm-generic/pgalloc.h
> > @@ -46,6 +46,19 @@ static inline pte_t
> > *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
> >   #define pte_alloc_one_kernel(...)
> > alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
> >   #endif
> > 
> > +#ifdef CONFIG_ASYNC_PGTABLE_FREE
> > +struct pgtable_free_work {
> > +    struct list_head list;
> > +    spinlock_t lock;
> > +    struct work_struct work;
> > +};
> > +extern struct pgtable_free_work kernel_pte_work;
> > +
> > +void pte_free_kernel_async(struct ptdesc *ptdesc);
> > +#else
> > +static inline void pte_free_kernel_async(struct ptdesc *ptdesc) {}
> > +#endif
> > +
> >   /**
> >    * pte_free_kernel - free PTE-level kernel page table memory
> >    * @mm: the mm_struct of the current context
> > @@ -53,7 +66,12 @@ static inline pte_t
> > *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
> >    */
> >   static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> >   {
> > -    pagetable_dtor_free(virt_to_ptdesc(pte));
> > +    struct ptdesc *ptdesc = virt_to_ptdesc(pte);
> > +
> > +    if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE))
> > +        pte_free_kernel_async(ptdesc);
> > +    else
> > +        pagetable_dtor_free(ptdesc);
> >   }
> > 
> >   /**
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index e443fe8cd6cf..528550cfa7fe 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -1346,6 +1346,13 @@ config LOCK_MM_AND_FIND_VMA
> >   config IOMMU_MM_DATA
> >       bool
> > 
> > +config ASYNC_PGTABLE_FREE
> > +    bool "Asynchronous kernel page table freeing"
> > +    help
> > +      Perform kernel page table freeing asynchronously. This is required
> > +      for systems with IOMMU Shared Virtual Address (SVA) to flush IOTLB
> > +      paging structure caches.
> > +
> >   config EXECMEM
> >       bool
> > 
> > diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> > index 567e2d084071..6639ee6641d4 100644
> > --- a/mm/pgtable-generic.c
> > +++ b/mm/pgtable-generic.c
> > @@ -13,6 +13,7 @@
> >   #include <linux/swap.h>
> >   #include <linux/swapops.h>
> >   #include <linux/mm_inline.h>
> > +#include <linux/iommu.h>
> >   #include <asm/pgalloc.h>
> >   #include <asm/tlb.h>
> > 
> > @@ -406,3 +407,32 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm,
> > pmd_t *pmd,
> >       pte_unmap_unlock(pte, ptl);
> >       goto again;
> >   }
> > +
> > +#ifdef CONFIG_ASYNC_PGTABLE_FREE
> > +static void kernel_pte_work_func(struct work_struct *work);
> > +struct pgtable_free_work kernel_pte_work = {
> > +    .list = LIST_HEAD_INIT(kernel_pte_work.list),
> > +    .lock = __SPIN_LOCK_UNLOCKED(kernel_pte_work.lock),
> > +    .work = __WORK_INITIALIZER(kernel_pte_work.work,
> > kernel_pte_work_func),
> > +};
> > +
> > +static void kernel_pte_work_func(struct work_struct *work)
> > +{
> > +    struct ptdesc *ptdesc, *next;
> > +
> > +    iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
> > +
> > +    guard(spinlock)(&kernel_pte_work.lock);
> > +    list_for_each_entry_safe(ptdesc, next, &kernel_pte_work.list,
> > pt_list) {
> > +        list_del_init(&ptdesc->pt_list);
> > +        pagetable_dtor_free(ptdesc);
> > +    }
> > +}
> > +
> > +void pte_free_kernel_async(struct ptdesc *ptdesc)
> > +{
> > +    guard(spinlock)(&kernel_pte_work.lock);
> > +    list_add(&ptdesc->pt_list, &kernel_pte_work.list);
> > +    schedule_work(&kernel_pte_work.work);
> > +}
> kernel_pte_work.list is global shared var, it would make the producer
> pte_free_kernel() and the consumer kernel_pte_work_func() to operate in
> serialized timing. In a large system, I don't think you design this
> deliberately :)
>
Sorry for jumping in.

Agree, unless it is never considered a hot path or something that can
be really contended. It looks like you can use just a per-cpu llist to drain
things.

As for reference you can have a look at how vfree_atomic() handles deferred
freeing.

Thanks!

--
Uladzislau Rezki

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ