Message-ID: <4a8df0e8-bd5a-44e4-acce-46ba75594846@linux.intel.com>
Date: Thu, 7 Aug 2025 22:40:39 +0800
From: Baolu Lu <baolu.lu@...ux.intel.com>
To: Dave Hansen <dave.hansen@...el.com>, Jason Gunthorpe <jgg@...dia.com>
Cc: baolu.lu@...ux.intel.com, Joerg Roedel <joro@...tes.org>,
Will Deacon <will@...nel.org>, Robin Murphy <robin.murphy@....com>,
Kevin Tian <kevin.tian@...el.com>, Jann Horn <jannh@...gle.com>,
Vasant Hegde <vasant.hegde@....com>, Alistair Popple <apopple@...dia.com>,
Peter Zijlstra <peterz@...radead.org>, Uladzislau Rezki <urezki@...il.com>,
Jean-Philippe Brucker <jean-philippe@...aro.org>,
Andy Lutomirski <luto@...nel.org>, Yi Lai <yi1.lai@...el.com>,
iommu@...ts.linux.dev, security@...nel.org, linux-kernel@...r.kernel.org,
stable@...r.kernel.org
Subject: Re: [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush

On 8/7/2025 12:34 AM, Dave Hansen wrote:
> On 8/6/25 09:09, Jason Gunthorpe wrote:
>>>>
>>>> You can't do this approach without also pushing the pages to be freed
>>>> onto a list and deferring the free till the work runs. This is broadly
>>>> what the normal mm user flow is doing.
>>> FWIW, I think the simplest way to do this is to plop an unconditional
>>> schedule_work() in pte_free_kernel(). The work function will invalidate
>>> the IOTLBs and then free the page.
>>>
>>> Keep the schedule_work() unconditional to keep it simple. The
>>> schedule_work() is way cheaper than all the system-wide TLB invalidation
>>> IPIs that have to get sent as well. No need to add complexity to
>>> optimize out something that's in the noise already.
>> That also works, but now you have to allocate memory or you are dead.
>> Is that OK these days, and safe in this code, which seems a little bit
>> linked to memory management?
>>
>> The MM side avoided this by putting the list and the rcu_head in the
>> struct page.
>
> I don't think you need to allocate memory. A little static structure
> that uses the page->list and has a lock should do. Logically something
> like this:
>
> struct kernel_pgtable_work
> {
> 	struct list_head list;
> 	spinlock_t lock;
> 	struct work_struct work;
> } kernel_pte_work;
>
> pte_free_kernel()
> {
> 	struct page *page = ptdesc_magic();
>
> 	guard(spinlock)(&kernel_pte_work.lock);
>
> 	list_add(&page->list, &kernel_pte_work.list);
> 	schedule_work(&kernel_pte_work.work);
> }
>
> work_func()
> {
> 	iommu_sva_invalidate_kva();
>
> 	guard(spinlock)(&kernel_pte_work.lock);
>
> 	list_for_each_safe() {
> 		page = container_of(...);
> 		free_whatever(page);
> 	}
> }
>
> The only wrinkle is that pte_free_kernel() itself still has a pte and
> 'ptdesc', not a 'struct page'. But there is ptdesc->pt_list, which
> should be unused at this point, especially for non-pgd pages on x86.
>
> So, either go over to the 'struct page' earlier (maybe by open-coding
> pagetable_dtor_free()?), or just use the ptdesc.
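
For reference, the mm side keeps that deferred-free state in the page
table descriptor itself. If I'm reading include/linux/mm_types.h right,
struct ptdesc already carries both the rcu head and the list head in
one union, roughly (trimmed excerpt):

struct ptdesc {
	unsigned long __page_flags;
	union {
		struct rcu_head pt_rcu_head;
		struct list_head pt_list;
		...
	};
	...
};

so chaining the pages on a static list should not need any extra
allocation on the free path.
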
I refactored the code above as follows. It compiles but hasn't been
tested yet. Does it look good to you?
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c88691b15f3c..d9307dd09f67 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -10,9 +10,11 @@
 #define __HAVE_ARCH_PTE_ALLOC_ONE
 #define __HAVE_ARCH_PGD_FREE
+#define __HAVE_ARCH_PTE_FREE_KERNEL
 #include <asm-generic/pgalloc.h>
 
 static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte);
 
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/paravirt.h>
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ddf248c3ee7d..f9f6738dd3cc 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
 #include <linux/mm.h>
 #include <linux/gfp.h>
 #include <linux/hugetlb.h>
+#include <linux/iommu.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
@@ -844,3 +845,42 @@ void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
 	/* See note in arch_check_zapped_pte() */
 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
 }
+
+static void kernel_pte_work_func(struct work_struct *work);
+
+static struct {
+	struct list_head list;
+	spinlock_t lock;
+	struct work_struct work;
+} kernel_pte_work = {
+	.list = LIST_HEAD_INIT(kernel_pte_work.list),
+	.lock = __SPIN_LOCK_UNLOCKED(kernel_pte_work.lock),
+	.work = __WORK_INITIALIZER(kernel_pte_work.work, kernel_pte_work_func),
+};
+
+static void kernel_pte_work_func(struct work_struct *work)
+{
+	struct page *page, *next;
+
+	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_for_each_entry_safe(page, next, &kernel_pte_work.list, lru) {
+		list_del_init(&page->lru);
+		pagetable_dtor_free(page_ptdesc(page));
+	}
+}
+
+/**
+ * pte_free_kernel - free PTE-level kernel page table memory
+ * @mm: the mm_struct of the current context
+ * @pte: pointer to the memory containing the page table
+ */
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+	struct page *page = virt_to_page(pte);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_add(&page->lru, &kernel_pte_work.list);
+	schedule_work(&kernel_pte_work.work);
+}
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 3c8ec3bfea44..716ebab67636 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -46,6 +46,7 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 #define pte_alloc_one_kernel(...)	alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
 #endif
 
+#ifndef __HAVE_ARCH_PTE_FREE_KERNEL
 /**
  * pte_free_kernel - free PTE-level kernel page table memory
  * @mm: the mm_struct of the current context
@@ -55,6 +56,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	pagetable_dtor_free(virt_to_ptdesc(pte));
 }
+#endif
 
 /**
  * __pte_alloc_one - allocate memory for a PTE-level user page table
--
2.43.0
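
As an alternative, if it's better to stay with the ptdesc instead of
going back to 'struct page', I think the list handling could use
ptdesc->pt_list directly, assuming pt_list is really unused for kernel
PTE pages at this point, as you noted. An untested sketch of that
variant:

static void kernel_pte_work_func(struct work_struct *work)
{
	struct ptdesc *ptdesc, *next;

	/* Flush the paging-structure caches before freeing the pages. */
	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);

	guard(spinlock)(&kernel_pte_work.lock);
	list_for_each_entry_safe(ptdesc, next, &kernel_pte_work.list, pt_list) {
		list_del(&ptdesc->pt_list);
		pagetable_dtor_free(ptdesc);
	}
}

void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pte);

	/* Defer both the IOTLB invalidation and the actual free to the work. */
	guard(spinlock)(&kernel_pte_work.lock);
	list_add(&ptdesc->pt_list, &kernel_pte_work.list);
	schedule_work(&kernel_pte_work.work);
}

That would avoid touching page->lru entirely and keep everything in
ptdesc terms.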
Thanks,
baolu