[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aCzdnAmuOylilU1p@localhost.localdomain>
Date: Tue, 20 May 2025 21:53:00 +0200
From: Oscar Salvador <osalvador@...e.de>
To: Gavin Guo <gavinguo@...lia.com>
Cc: linux-mm@...ck.org, linux-kernel@...r.kernel.org, muchun.song@...ux.dev,
akpm@...ux-foundation.org, mike.kravetz@...cle.com,
kernel-dev@...lia.com, stable@...r.kernel.org,
Hugh Dickins <hughd@...gle.com>, Florent Revest <revest@...gle.com>,
Gavin Shan <gshan@...hat.com>
Subject: Re: [PATCH] mm/hugetlb: fix a deadlock with pagecache_folio and
hugetlb_fault_mutex_table
On Tue, May 13, 2025 at 05:34:48PM +0800, Gavin Guo wrote:
> The patch fixes a deadlock which can be triggered by an internal
> syzkaller [1] reproducer and captured by bpftrace script [2] and its log
> [3] in this scenario:
>
> Process 1 Process 2
> --- ---
> hugetlb_fault
> mutex_lock(B) // take B
> filemap_lock_hugetlb_folio
> filemap_lock_folio
> __filemap_get_folio
> folio_lock(A) // take A
> hugetlb_wp
> mutex_unlock(B) // release B
> ... hugetlb_fault
> ... mutex_lock(B) // take B
> filemap_lock_hugetlb_folio
> filemap_lock_folio
> __filemap_get_folio
> folio_lock(A) // blocked
> unmap_ref_private
> ...
> mutex_lock(B) // retake and blocked
>
...
> Signed-off-by: Gavin Guo <gavinguo@...lia.com>
I think this is more convoluted that it needs to be?
hugetlb_wp() is called from hugetlb_no_page() and hugetlb_fault().
hugetlb_no_page() locks and unlocks the lock itself, which leaves us
with hugetlb_fault().
hugetlb_fault() always passed the folio locked to hugetlb_wp(), and the
latter only unlocks it when we have a cow from owner happening and we
cannot satisfy the allocation.
So, should not checking whether the folio is still locked after
returning enough?
What speaks against:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bd8971388236..23b57c5689a4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6228,6 +6228,12 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
u32 hash;
folio_put(old_folio);
+ /*
+ * The pagecache_folio needs to be unlocked to avoid
+ * deadlock when the child unmaps the folio.
+ */
+ if (pagecache_folio)
+ folio_unlock(pagecache_folio);
/*
* Drop hugetlb_fault_mutex and vma_lock before
* unmapping. unmapping needs to hold vma_lock
@@ -6825,7 +6831,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(vmf.ptl);
if (pagecache_folio) {
- folio_unlock(pagecache_folio);
+ /*
+ * hugetlb_wp() might have already unlocked pagecache_folio, so
+ * skip it if that is the case.
+ */
+ if (folio_test_locked(pagecache_folio))
+ folio_unlock(pagecache_folio);
folio_put(pagecache_folio);
}
out_mutex:
> ---
> mm/hugetlb.c | 33 ++++++++++++++++++++++++++++-----
> 1 file changed, 28 insertions(+), 5 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e3e6ac991b9c..ad54a74aa563 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6115,7 +6115,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
> * Keep the pte_same checks anyway to make transition from the mutex easier.
> */
> static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
> - struct vm_fault *vmf)
> + struct vm_fault *vmf,
> + bool *pagecache_folio_unlocked)
> {
> struct vm_area_struct *vma = vmf->vma;
> struct mm_struct *mm = vma->vm_mm;
> @@ -6212,6 +6213,22 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
> u32 hash;
>
> folio_put(old_folio);
> + /*
> + * The pagecache_folio needs to be unlocked to avoid
> + * deadlock and we won't re-lock it in hugetlb_wp(). The
> + * pagecache_folio could be truncated after being
> + * unlocked. So its state should not be relied
> + * subsequently.
> + *
> + * Setting *pagecache_folio_unlocked to true allows the
> + * caller to handle any necessary logic related to the
> + * folio's unlocked state.
> + */
> + if (pagecache_folio) {
> + folio_unlock(pagecache_folio);
> + if (pagecache_folio_unlocked)
> + *pagecache_folio_unlocked = true;
> + }
> /*
> * Drop hugetlb_fault_mutex and vma_lock before
> * unmapping. unmapping needs to hold vma_lock
> @@ -6566,7 +6583,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
> hugetlb_count_add(pages_per_huge_page(h), mm);
> if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
> /* Optimization, do the COW without a second fault */
> - ret = hugetlb_wp(folio, vmf);
> + ret = hugetlb_wp(folio, vmf, NULL);
> }
>
> spin_unlock(vmf->ptl);
> @@ -6638,6 +6655,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> struct hstate *h = hstate_vma(vma);
> struct address_space *mapping;
> int need_wait_lock = 0;
> + bool pagecache_folio_unlocked = false;
> struct vm_fault vmf = {
> .vma = vma,
> .address = address & huge_page_mask(h),
> @@ -6792,7 +6810,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>
> if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
> if (!huge_pte_write(vmf.orig_pte)) {
> - ret = hugetlb_wp(pagecache_folio, &vmf);
> + ret = hugetlb_wp(pagecache_folio, &vmf,
> + &pagecache_folio_unlocked);
> goto out_put_page;
> } else if (likely(flags & FAULT_FLAG_WRITE)) {
> vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
> @@ -6809,10 +6828,14 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> out_ptl:
> spin_unlock(vmf.ptl);
>
> - if (pagecache_folio) {
> + /*
> + * If the pagecache_folio is unlocked in hugetlb_wp(), we skip
> + * folio_unlock() here.
> + */
> + if (pagecache_folio && !pagecache_folio_unlocked)
> folio_unlock(pagecache_folio);
> + if (pagecache_folio)
> folio_put(pagecache_folio);
> - }
> out_mutex:
> hugetlb_vma_unlock_read(vma);
>
>
> base-commit: d76bb1ebb5587f66b0f8b8099bfbb44722bc08b3
> --
> 2.43.0
>
>
--
Oscar Salvador
SUSE Labs
Powered by blists - more mailing lists