Message-ID: <4a8df0e8-bd5a-44e4-acce-46ba75594846@linux.intel.com>
Date: Thu, 7 Aug 2025 22:40:39 +0800
From: Baolu Lu <baolu.lu@...ux.intel.com>
To: Dave Hansen <dave.hansen@...el.com>, Jason Gunthorpe <jgg@...dia.com>
Cc: baolu.lu@...ux.intel.com, Joerg Roedel <joro@...tes.org>,
Will Deacon <will@...nel.org>, Robin Murphy <robin.murphy@....com>,
Kevin Tian <kevin.tian@...el.com>, Jann Horn <jannh@...gle.com>,
Vasant Hegde <vasant.hegde@....com>, Alistair Popple <apopple@...dia.com>,
Peter Zijlstra <peterz@...radead.org>, Uladzislau Rezki <urezki@...il.com>,
Jean-Philippe Brucker <jean-philippe@...aro.org>,
Andy Lutomirski <luto@...nel.org>, Yi Lai <yi1.lai@...el.com>,
iommu@...ts.linux.dev, security@...nel.org, linux-kernel@...r.kernel.org,
stable@...r.kernel.org
Subject: Re: [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush

On 8/7/2025 12:34 AM, Dave Hansen wrote:
> On 8/6/25 09:09, Jason Gunthorpe wrote:
>>>>
>>>> You can't do this approach without also pushing the pages to be freed
>>>> onto a list and deferring the free till the work runs. This is broadly
>>>> what the normal mm user flow is doing.
>>> FWIW, I think the simplest way to do this is to plop an unconditional
>>> schedule_work() in pte_free_kernel(). The work function will invalidate
>>> the IOTLBs and then free the page.
>>>
>>> Keep the schedule_work() unconditional to keep it simple. The
>>> schedule_work() is way cheaper than all the system-wide TLB invalidation
>>> IPIs that have to get sent as well. No need to add complexity to
>>> optimize out something that's in the noise already.
>> That also works, but now you have to allocate memory or you are dead.
>> Is that OK these days, and safe in this code, which seems a little bit
>> linked to memory management?
>>
>> The MM side avoided this by putting the list and the rcu_head in the
>> struct page.
>
> I don't think you need to allocate memory. A little static structure
> that uses the page->list and has a lock should do. Logically something
> like this:
>
> struct kernel_pgtable_work
> {
> 	struct list_head list;
> 	spinlock_t lock;
> 	struct work_struct work;
> } kernel_pte_work;
>
> pte_free_kernel()
> {
> 	struct page *page = ptdesc_magic();
>
> 	guard(spinlock)(&kernel_pte_work.lock);
>
> 	list_add(&page->list, &kernel_pte_work.list);
> 	schedule_work(&kernel_pte_work.work);
> }
>
> work_func()
> {
> 	iommu_sva_invalidate_kva();
>
> 	guard(spinlock)(&kernel_pte_work.lock);
>
> 	list_for_each_safe() {
> 		page = container_of(...);
> 		free_whatever(page);
> 	}
> }
>
> The only wrinkle is that pte_free_kernel() itself still has a pte and
> 'ptdesc', not a 'struct page'. But there is ptdesc->pt_list, which
> should be unused at this point, especially for non-pgd pages on x86.
>
> So, either go over to the 'struct page' earlier (maybe by open-coding
> pagetable_dtor_free()?), or just use the ptdesc.
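
For reference, the mm side keeps that deferred-free state in the page
table descriptor itself. If I'm reading include/linux/mm_types.h right,
struct ptdesc already carries both the rcu head and the list head in
one union, roughly (trimmed excerpt):

struct ptdesc {
	unsigned long __page_flags;
	union {
		struct rcu_head pt_rcu_head;
		struct list_head pt_list;
		...
	};
	...
};

so chaining the pages on a static list should not need any extra
allocation on the free path.
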
I refactored the code above as follows. It compiles but hasn't been
tested yet. Does it look good to you?
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c88691b15f3c..d9307dd09f67 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -10,9 +10,11 @@
 #define __HAVE_ARCH_PTE_ALLOC_ONE
 #define __HAVE_ARCH_PGD_FREE
+#define __HAVE_ARCH_PTE_FREE_KERNEL
 #include <asm-generic/pgalloc.h>
 
 static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte);
 
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/paravirt.h>
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ddf248c3ee7d..f9f6738dd3cc 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
 #include <linux/mm.h>
 #include <linux/gfp.h>
 #include <linux/hugetlb.h>
+#include <linux/iommu.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
@@ -844,3 +845,42 @@ void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
 	/* See note in arch_check_zapped_pte() */
 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
 }
+
+static void kernel_pte_work_func(struct work_struct *work);
+
+static struct {
+	struct list_head list;
+	spinlock_t lock;
+	struct work_struct work;
+} kernel_pte_work = {
+	.list = LIST_HEAD_INIT(kernel_pte_work.list),
+	.lock = __SPIN_LOCK_UNLOCKED(kernel_pte_work.lock),
+	.work = __WORK_INITIALIZER(kernel_pte_work.work, kernel_pte_work_func),
+};
+
+static void kernel_pte_work_func(struct work_struct *work)
+{
+	struct page *page, *next;
+
+	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_for_each_entry_safe(page, next, &kernel_pte_work.list, lru) {
+		list_del_init(&page->lru);
+		pagetable_dtor_free(page_ptdesc(page));
+	}
+}
+
+/**
+ * pte_free_kernel - free PTE-level kernel page table memory
+ * @mm: the mm_struct of the current context
+ * @pte: pointer to the memory containing the page table
+ */
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+	struct page *page = virt_to_page(pte);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_add(&page->lru, &kernel_pte_work.list);
+	schedule_work(&kernel_pte_work.work);
+}
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 3c8ec3bfea44..716ebab67636 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -46,6 +46,7 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 #define pte_alloc_one_kernel(...)	alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
 #endif
 
+#ifndef __HAVE_ARCH_PTE_FREE_KERNEL
 /**
  * pte_free_kernel - free PTE-level kernel page table memory
  * @mm: the mm_struct of the current context
@@ -55,6 +56,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	pagetable_dtor_free(virt_to_ptdesc(pte));
 }
+#endif
 
 /**
  * __pte_alloc_one - allocate memory for a PTE-level user page table
--
2.43.0
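
As an alternative, if it's better to stay with the ptdesc instead of
going back to 'struct page', I think the list handling could use
ptdesc->pt_list directly, assuming pt_list is really unused for kernel
PTE pages at this point, as you noted. An untested sketch of that
variant:

static void kernel_pte_work_func(struct work_struct *work)
{
	struct ptdesc *ptdesc, *next;

	/* Flush the paging-structure caches before freeing the pages. */
	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);

	guard(spinlock)(&kernel_pte_work.lock);
	list_for_each_entry_safe(ptdesc, next, &kernel_pte_work.list, pt_list) {
		list_del(&ptdesc->pt_list);
		pagetable_dtor_free(ptdesc);
	}
}

void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pte);

	/* Defer both the IOTLB invalidation and the actual free to the work. */
	guard(spinlock)(&kernel_pte_work.lock);
	list_add(&ptdesc->pt_list, &kernel_pte_work.list);
	schedule_work(&kernel_pte_work.work);
}

That would avoid touching page->lru entirely and keep everything in
ptdesc terms.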
Thanks,
baolu