linux-kernel - Re: [PATCH v1 1/2] mm: let pte_lockptr() consume a pte

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <97d87df8-6076-4df0-90a9-a8f5ddea42e9@samsung.com>
Date: Tue, 30 Jul 2024 17:30:56 +0200
From: Marek Szyprowski <m.szyprowski@...sung.com>
To: David Hildenbrand <david@...hat.com>, linux-kernel@...r.kernel.org
Cc: linux-mm@...ck.org, Andrew Morton <akpm@...ux-foundation.org>, Muchun
	Song <muchun.song@...ux.dev>, Peter Xu <peterx@...hat.com>, Oscar Salvador
	<osalvador@...e.de>
Subject: Re: [PATCH v1 1/2] mm: let pte_lockptr() consume a pte_t pointer

On 25.07.2024 20:39, David Hildenbrand wrote:
> pte_lockptr() is the only *_lockptr() function that doesn't consume
> what would be expected: it consumes a pmd_t pointer instead of a pte_t
> pointer.
>
> Let's change that. The two callers in pgtable-generic.c are easily
> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
> pte_offset_map_nolock() to obtain the lock, even though we won't actually
> be traversing the page table.
>
> This makes the code more similar to the other variants and avoids other
> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
> reside now only in  pgtable-generic.c.
>
> Maybe, using pte_offset_map_nolock() is the right thing to do because
> the PTE table could have been removed in the meantime? At least it sounds
> more future proof if we ever have other means of page table reclaim.
>
> It's not quite clear if holding the PTE table lock is really required:
> what if someone else obtains the lock just after we unlock it? But we'll
> leave that as is for now, maybe there are good reasons.
>
> This is a preparation for adapting hugetlb page table locking logic to
> take the same locks as core-mm page table walkers would.
>
> Signed-off-by: David Hildenbrand <david@...hat.com>

This patch landed in today's linux-next as commit e98970a1d2d4 ("mm: let 
pte_lockptr() consume a pte_t pointer"). Unfortunately it causes the 
following issue on most of my ARM 32bit based test boards:

Run /sbin/init as init process
8<--- cut here ---
Unable to handle kernel NULL pointer dereference at virtual address 
00000010 when read
[00000010] *pgd=00000000
Internal error: Oops: 5 [#1] PREEMPT SMP ARM
Modules linked in:
CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 
6.10.0-12930-ge98970a1d2d4 #15137
Hardware name: Samsung Exynos (Flattened Device Tree)
PC is at __lock_acquire+0x20/0x1604
LR is at lock_acquire+0x21c/0x394
pc : [<c01990e4>]    lr : [<c019b2e8>]    psr: 20000093
sp : f082dc80  ip : 00000001  fp : c1d20000
r10: 00000000  r9 : c13096c8  r8 : 00000001
r7 : 00000000  r6 : c1d20000  r5 : c1278594  r4 : c1278594
r3 : 00000000  r2 : 00000000  r1 : 00000000  r0 : 00000010
Flags: nzCv  IRQs off  FIQs on  Mode SVC_32  ISA ARM  Segment none
Control: 10c5387d  Table: 4000404a  DAC: 00000051
...
Process swapper/0 (pid: 1, stack limit = 0x(ptrval))
Stack: (0xf082dc80 to 0xf082e000)
...
Call trace:
  __lock_acquire from lock_acquire+0x21c/0x394
  lock_acquire from _raw_spin_lock+0x3c/0x4c
  _raw_spin_lock from __pte_offset_map_lock+0x64/0x118
  __pte_offset_map_lock from handle_mm_fault+0xee0/0x13c8
  handle_mm_fault from __get_user_pages+0x18c/0x848
  __get_user_pages from get_user_pages_remote+0xdc/0x474
  get_user_pages_remote from get_arg_page+0x90/0x2a4
  get_arg_page from copy_string_kernel+0x14c/0x184
  copy_string_kernel from kernel_execve+0x94/0x174
  kernel_execve from try_to_run_init_process+0xc/0x3c
  try_to_run_init_process from kernel_init+0xcc/0x12c
  kernel_init from ret_from_fork+0x14/0x28
Exception stack(0xf082dfb0 to 0xf082dff8)
...
---[ end trace 0000000000000000 ]---
note: swapper/0[1] exited with irqs disabled
note: swapper/0[1] exited with preempt_count 1
Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
---[ end Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x0000000b ]---

Reverting $subject on top of next-20240730 fixes/hides this issue.


> ---
>   include/linux/mm.h   |  7 ++++---
>   mm/khugepaged.c      | 21 +++++++++++++++------
>   mm/pgtable-generic.c |  4 ++--
>   3 files changed, 21 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2c6ccf088c7be..0472a5090b180 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2873,9 +2873,10 @@ static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
>   }
>   #endif /* ALLOC_SPLIT_PTLOCKS */
>   
> -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
>   {
> -	return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
> +	/* PTE page tables don't currently exceed a single page. */
> +	return ptlock_ptr(virt_to_ptdesc(pte));
>   }
>   
>   static inline bool ptlock_init(struct ptdesc *ptdesc)
> @@ -2898,7 +2899,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc)
>   /*
>    * We use mm->page_table_lock to guard all pagetable pages of the mm.
>    */
> -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
>   {
>   	return &mm->page_table_lock;
>   }
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index cdd1d8655a76b..f3b3db1046155 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1697,12 +1697,13 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   	i_mmap_lock_read(mapping);
>   	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>   		struct mmu_notifier_range range;
> +		bool retracted = false;
>   		struct mm_struct *mm;
>   		unsigned long addr;
>   		pmd_t *pmd, pgt_pmd;
>   		spinlock_t *pml;
>   		spinlock_t *ptl;
> -		bool skipped_uffd = false;
> +		pte_t *pte;
>   
>   		/*
>   		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
> @@ -1739,9 +1740,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   		mmu_notifier_invalidate_range_start(&range);
>   
>   		pml = pmd_lock(mm, pmd);
> -		ptl = pte_lockptr(mm, pmd);
> +
> +		/*
> +		 * No need to check the PTE table content, but we'll grab the
> +		 * PTE table lock while we zap it.
> +		 */
> +		pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
> +		if (!pte)
> +			goto unlock_pmd;
>   		if (ptl != pml)
>   			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
> +		pte_unmap(pte);
>   
>   		/*
>   		 * Huge page lock is still held, so normally the page table
> @@ -1752,20 +1761,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   		 * repeating the anon_vma check protects from one category,
>   		 * and repeating the userfaultfd_wp() check from another.
>   		 */
> -		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
> -			skipped_uffd = true;
> -		} else {
> +		if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
>   			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
>   			pmdp_get_lockless_sync();
> +			retracted = true;
>   		}
>   
>   		if (ptl != pml)
>   			spin_unlock(ptl);
> +unlock_pmd:
>   		spin_unlock(pml);
>   
>   		mmu_notifier_invalidate_range_end(&range);
>   
> -		if (!skipped_uffd) {
> +		if (retracted) {
>   			mm_dec_nr_ptes(mm);
>   			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
>   			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index a78a4adf711ac..13a7705df3f87 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -313,7 +313,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
>   
>   	pte = __pte_offset_map(pmd, addr, &pmdval);
>   	if (likely(pte))
> -		*ptlp = pte_lockptr(mm, &pmdval);
> +		*ptlp = pte_lockptr(mm, pte);
>   	return pte;
>   }
>   
> @@ -371,7 +371,7 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
>   	pte = __pte_offset_map(pmd, addr, &pmdval);
>   	if (unlikely(!pte))
>   		return pte;
> -	ptl = pte_lockptr(mm, &pmdval);
> +	ptl = pte_lockptr(mm, pte);
>   	spin_lock(ptl);
>   	if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
>   		*ptlp = ptl;

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland