Message-ID: <aJm0znaAqBRWqOCT@pc636>
Date: Mon, 11 Aug 2025 11:15:58 +0200
From: Uladzislau Rezki <urezki@...il.com>
To: Ethan Zhao <etzhao1900@...il.com>, Baolu Lu <baolu.lu@...ux.intel.com>
Cc: Baolu Lu <baolu.lu@...ux.intel.com>,
Dave Hansen <dave.hansen@...el.com>,
Jason Gunthorpe <jgg@...dia.com>, Joerg Roedel <joro@...tes.org>,
Will Deacon <will@...nel.org>, Robin Murphy <robin.murphy@....com>,
Kevin Tian <kevin.tian@...el.com>, Jann Horn <jannh@...gle.com>,
Vasant Hegde <vasant.hegde@....com>,
Alistair Popple <apopple@...dia.com>,
Peter Zijlstra <peterz@...radead.org>,
Uladzislau Rezki <urezki@...il.com>,
Jean-Philippe Brucker <jean-philippe@...aro.org>,
Andy Lutomirski <luto@...nel.org>, Yi Lai <yi1.lai@...el.com>,
iommu@...ts.linux.dev, security@...nel.org,
linux-kernel@...r.kernel.org, stable@...r.kernel.org
Subject: Re: [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB
flush
On Sun, Aug 10, 2025 at 03:19:58PM +0800, Ethan Zhao wrote:
>
>
> On 8/8/2025 1:15 PM, Baolu Lu wrote:
> > On 8/7/25 23:31, Dave Hansen wrote:
> > > > +void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> > > > +{
> > > > +	struct page *page = virt_to_page(pte);
> > > > +
> > > > +	guard(spinlock)(&kernel_pte_work.lock);
> > > > +	list_add(&page->lru, &kernel_pte_work.list);
> > > > +	schedule_work(&kernel_pte_work.work);
> > > > +}
> > > > diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
> > > > index 3c8ec3bfea44..716ebab67636 100644
> > > > --- a/include/asm-generic/pgalloc.h
> > > > +++ b/include/asm-generic/pgalloc.h
> > > > @@ -46,6 +46,7 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
> > > > #define pte_alloc_one_kernel(...) alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
> > > > #endif
> > > >
> > > > +#ifndef __HAVE_ARCH_PTE_FREE_KERNEL
> > > > /**
> > > > * pte_free_kernel - free PTE-level kernel page table memory
> > > > * @mm: the mm_struct of the current context
> > > > @@ -55,6 +56,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> > > > {
> > > > pagetable_dtor_free(virt_to_ptdesc(pte));
> > > > }
> > > > +#endif
> > > >
> > > > /**
> > > > * __pte_alloc_one - allocate memory for a PTE-level user page table
> > > I'd much rather the arch-generic code looked like this:
> > >
> > > #ifdef CONFIG_ASYNC_PGTABLE_FREE
> > > // code and struct here, or dump them over in some
> > > // other file and do this in a header
> > > #else
> > > static void pte_free_kernel_async(struct page *page) {}
> > > #endif
> > >
> > > void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> > > {
> > > 	struct page *page = virt_to_page(pte);
> > >
> > > 	if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE))
> > > 		pte_free_kernel_async(page);
> > > 	else
> > > 		pagetable_dtor_free(page_ptdesc(page));
> > > }
> > >
> > > Then in Kconfig, you end up with something like:
> > >
> > > config ASYNC_PGTABLE_FREE
> > > def_bool y
> > > depends on INTEL_IOMMU_WHATEVER
> > >
> > > That very much tells much more of the whole story in code. It also gives
> > > the x86 folks that compile out the IOMMU the exact same code as the
> > > arch-generic folks. It _also_ makes it dirt simple and obvious for the
> > > x86 folks to optimize out the async behavior if they don't like it in
> > > the future by replacing the compile-time IOMMU check with a runtime one.
> > >
> > > Also, if another crazy IOMMU implementation comes along that happens to
> > > do what the x86 IOMMUs do, then they have a single Kconfig switch to
> > > flip. If they follow what this patch tries to do, they'll start by
> > > copying and pasting the x86 implementation.
> >
> > I'll do it like this. Does that look good to you?
> >
> > diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> > index 70d29b14d851..6f1113e024fa 100644
> > --- a/drivers/iommu/Kconfig
> > +++ b/drivers/iommu/Kconfig
> > @@ -160,6 +160,7 @@ config IOMMU_DMA
> > # Shared Virtual Addressing
> > config IOMMU_SVA
> > select IOMMU_MM_DATA
> > + select ASYNC_PGTABLE_FREE if X86
> > bool
> >
> > config IOMMU_IOPF
> > diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
> > index 3c8ec3bfea44..dbddacdca2ce 100644
> > --- a/include/asm-generic/pgalloc.h
> > +++ b/include/asm-generic/pgalloc.h
> > @@ -46,6 +46,19 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
> > #define pte_alloc_one_kernel(...) alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
> > #endif
> >
> > +#ifdef CONFIG_ASYNC_PGTABLE_FREE
> > +struct pgtable_free_work {
> > +	struct list_head list;
> > +	spinlock_t lock;
> > +	struct work_struct work;
> > +};
> > +extern struct pgtable_free_work kernel_pte_work;
> > +
> > +void pte_free_kernel_async(struct ptdesc *ptdesc);
> > +#else
> > +static inline void pte_free_kernel_async(struct ptdesc *ptdesc) {}
> > +#endif
> > +
> > /**
> > * pte_free_kernel - free PTE-level kernel page table memory
> > * @mm: the mm_struct of the current context
> > @@ -53,7 +66,12 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
> > */
> > static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
> > {
> > -	pagetable_dtor_free(virt_to_ptdesc(pte));
> > +	struct ptdesc *ptdesc = virt_to_ptdesc(pte);
> > +
> > +	if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE))
> > +		pte_free_kernel_async(ptdesc);
> > +	else
> > +		pagetable_dtor_free(ptdesc);
> > }
> >
> > /**
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index e443fe8cd6cf..528550cfa7fe 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -1346,6 +1346,13 @@ config LOCK_MM_AND_FIND_VMA
> > config IOMMU_MM_DATA
> > bool
> >
> > +config ASYNC_PGTABLE_FREE
> > +	bool "Asynchronous kernel page table freeing"
> > +	help
> > +	  Perform kernel page table freeing asynchronously. This is required
> > +	  for systems with IOMMU Shared Virtual Address (SVA) to flush IOTLB
> > +	  paging structure caches.
> > +
> > config EXECMEM
> > bool
> >
> > diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> > index 567e2d084071..6639ee6641d4 100644
> > --- a/mm/pgtable-generic.c
> > +++ b/mm/pgtable-generic.c
> > @@ -13,6 +13,7 @@
> > #include <linux/swap.h>
> > #include <linux/swapops.h>
> > #include <linux/mm_inline.h>
> > +#include <linux/iommu.h>
> > #include <asm/pgalloc.h>
> > #include <asm/tlb.h>
> >
> > @@ -406,3 +407,32 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
> > pte_unmap_unlock(pte, ptl);
> > goto again;
> > }
> > +
> > +#ifdef CONFIG_ASYNC_PGTABLE_FREE
> > +static void kernel_pte_work_func(struct work_struct *work);
> > +struct pgtable_free_work kernel_pte_work = {
> > +	.list = LIST_HEAD_INIT(kernel_pte_work.list),
> > +	.lock = __SPIN_LOCK_UNLOCKED(kernel_pte_work.lock),
> > +	.work = __WORK_INITIALIZER(kernel_pte_work.work, kernel_pte_work_func),
> > +};
> > +
> > +static void kernel_pte_work_func(struct work_struct *work)
> > +{
> > +	struct ptdesc *ptdesc, *next;
> > +
> > +	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
> > +
> > +	guard(spinlock)(&kernel_pte_work.lock);
> > +	list_for_each_entry_safe(ptdesc, next, &kernel_pte_work.list, pt_list) {
> > +		list_del_init(&ptdesc->pt_list);
> > +		pagetable_dtor_free(ptdesc);
> > +	}
> > +}
> > +
> > +void pte_free_kernel_async(struct ptdesc *ptdesc)
> > +{
> > +	guard(spinlock)(&kernel_pte_work.lock);
> > +	list_add(&ptdesc->pt_list, &kernel_pte_work.list);
> > +	schedule_work(&kernel_pte_work.work);
> > +}
> kernel_pte_work.list is a global shared variable, so it makes the producer
> pte_free_kernel() and the consumer kernel_pte_work_func() serialize against
> each other on the same lock. In a large system, I don't think you designed
> this deliberately :)
>
Sorry for jumping in.
Agree, unless this is never considered a hot path or something that can
really be contended. It looks like you could use just a per-CPU llist to
drain things.
For reference, have a look at how vfree_atomic() handles deferred freeing.
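
Roughly, and completely untested, something in the spirit of the
vfree_deferred machinery in mm/vmalloc.c. Treat it as a sketch only: the
pt_llist member of struct ptdesc and the kernel_pte_deferred name are made
up here for illustration, the flush/free semantics are kept exactly as in
your patch, and only the single global list+lock pair becomes a per-CPU
llist so producers never contend on one spinlock:

#include <linux/init.h>
#include <linux/iommu.h>
#include <linux/llist.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/workqueue.h>

/* One deferred-free context per CPU, as vfree_deferred does in mm/vmalloc.c. */
struct pte_free_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct pte_free_deferred, kernel_pte_deferred);

static void kernel_pte_free_work(struct work_struct *work)
{
	struct pte_free_deferred *p = container_of(work, struct pte_free_deferred, wq);
	struct llist_node *node, *next;

	/* Flush the IOMMU paging-structure caches once per drained batch. */
	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);

	llist_for_each_safe(node, next, llist_del_all(&p->list)) {
		/* pt_llist is hypothetical; the PTE page itself stays untouched. */
		struct ptdesc *ptdesc = container_of(node, struct ptdesc, pt_llist);

		pagetable_dtor_free(ptdesc);
	}
}

void pte_free_kernel_async(struct ptdesc *ptdesc)
{
	struct pte_free_deferred *p = raw_cpu_ptr(&kernel_pte_deferred);

	/* Lock-free on the producer side; true means the list was empty. */
	if (llist_add(&ptdesc->pt_llist, &p->list))
		schedule_work(&p->wq);
}

static int __init kernel_pte_deferred_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct pte_free_deferred *p = per_cpu_ptr(&kernel_pte_deferred, cpu);

		init_llist_head(&p->list);
		INIT_WORK(&p->wq, kernel_pte_free_work);
	}
	return 0;
}
core_initcall(kernel_pte_deferred_init);

That way pte_free_kernel() only does an llist_add() plus, at most, a
schedule_work(), and each worker drains its own CPU's list, so the producer
and the consumer no longer serialize on one global spinlock.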
Thanks!
--
Uladzislau Rezki