linux-kernel - [RFC] mm/migrate: make sure folio_unlock() before folio_wait

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20251002081612.53281-1-byungchul@sk.com>
Date: Thu,  2 Oct 2025 17:16:12 +0900
From: Byungchul Park <byungchul@...com>
To: akpm@...ux-foundation.org
Cc: david@...hat.com,
	ziy@...dia.com,
	matthew.brost@...el.com,
	joshua.hahnjy@...il.com,
	rakie.kim@...com,
	gourry@...rry.net,
	ying.huang@...ux.alibaba.com,
	apopple@...dia.com,
	clameter@....com,
	kravetz@...ibm.com,
	linux-mm@...ck.org,
	linux-kernel@...r.kernel.org,
	max.byungchul.park@...il.com,
	kernel_team@...ynix.com,
	harry.yoo@...cle.com,
	gwan-gyeong.mun@...el.com,
	yeoreum.yun@....com,
	syzkaller@...glegroups.com,
	ysk@...lloc.com
Subject: [RFC] mm/migrate: make sure folio_unlock() before folio_wait_writeback()

DEPT(Dependency Tracker) reported a deadlock:

   ===================================================
   DEPT: Circular dependency has been detected.
   6.15.11-00046-g2c223fa7bd9a-dirty #13 Not tainted
   ---------------------------------------------------
   summary
   ---------------------------------------------------
   *** DEADLOCK ***

   context A
      [S] (unknown)(pg_locked_map:0)
      [W] dept_page_wait_on_bit(pg_writeback_map:0)
      [E] dept_page_clear_bit(pg_locked_map:0)

   context B
      [S] (unknown)(pg_writeback_map:0)
      [W] dept_page_wait_on_bit(pg_locked_map:0)
      [E] dept_page_clear_bit(pg_writeback_map:0)

   [S]: start of the event context
   [W]: the wait blocked
   [E]: the event not reachable
   ---------------------------------------------------
   context A's detail
   ---------------------------------------------------
   context A
      [S] (unknown)(pg_locked_map:0)
      [W] dept_page_wait_on_bit(pg_writeback_map:0)
      [E] dept_page_clear_bit(pg_locked_map:0)

   [S] (unknown)(pg_locked_map:0):
   (N/A)

   [W] dept_page_wait_on_bit(pg_writeback_map:0):
   [<ffff800080589c94>] folio_wait_bit+0x2c/0x38
   stacktrace:
         folio_wait_bit_common+0x824/0x8b8
         folio_wait_bit+0x2c/0x38
         folio_wait_writeback+0x5c/0xa4
         migrate_pages_batch+0x5e4/0x1788
         migrate_pages+0x15c4/0x1840
         compact_zone+0x9c8/0x1d20
         compact_node+0xd4/0x27c
         sysctl_compaction_handler+0x104/0x194
         proc_sys_call_handler+0x25c/0x3f8
         proc_sys_write+0x20/0x2c
         do_iter_readv_writev+0x350/0x448
         vfs_writev+0x1ac/0x44c
         do_pwritev+0x100/0x15c
         __arm64_sys_pwritev2+0x6c/0xcc
         invoke_syscall.constprop.0+0x64/0x18c
         el0_svc_common.constprop.0+0x80/0x198

   [E] dept_page_clear_bit(pg_locked_map:0):
   [<ffff800080700914>] migrate_folio_undo_src+0x1b4/0x200
   stacktrace:
         migrate_folio_undo_src+0x1b4/0x200
         migrate_pages_batch+0x1578/0x1788
         migrate_pages+0x15c4/0x1840
         compact_zone+0x9c8/0x1d20
         compact_node+0xd4/0x27c
         sysctl_compaction_handler+0x104/0x194
         proc_sys_call_handler+0x25c/0x3f8
         proc_sys_write+0x20/0x2c
         do_iter_readv_writev+0x350/0x448
         vfs_writev+0x1ac/0x44c
         do_pwritev+0x100/0x15c
         __arm64_sys_pwritev2+0x6c/0xcc
         invoke_syscall.constprop.0+0x64/0x18c
         el0_svc_common.constprop.0+0x80/0x198
         do_el0_svc+0x28/0x3c
         el0_svc+0x50/0x220
   ---------------------------------------------------
   context B's detail
   ---------------------------------------------------
   context B
      [S] (unknown)(pg_writeback_map:0)
      [W] dept_page_wait_on_bit(pg_locked_map:0)
      [E] dept_page_clear_bit(pg_writeback_map:0)

   [S] (unknown)(pg_writeback_map:0):
   (N/A)

   [W] dept_page_wait_on_bit(pg_locked_map:0):
   [<ffff80008081e478>] bdev_getblk+0x58/0x120
   stacktrace:
         find_get_block_common+0x224/0xbc4
         bdev_getblk+0x58/0x120
         __ext4_get_inode_loc+0x194/0x98c
         ext4_get_inode_loc+0x4c/0xcc
         ext4_reserve_inode_write+0x74/0x158
         __ext4_mark_inode_dirty+0xd4/0x4e0
         __ext4_ext_dirty+0x118/0x164
         ext4_ext_map_blocks+0x1578/0x2ca8
         ext4_map_blocks+0x2a4/0xa60
         ext4_convert_unwritten_extents+0x1b0/0x3c0
         ext4_convert_unwritten_io_end_vec+0x90/0x1a0
         ext4_end_io_end+0x58/0x194
         ext4_end_io_rsv_work+0xc4/0x150
         process_one_work+0x3b4/0xac0
         worker_thread+0x2b0/0x53c
         kthread+0x1a0/0x33c

   [E] dept_page_clear_bit(pg_writeback_map:0):
   [<ffff8000809dfc5c>] ext4_finish_bio+0x638/0x820
   stacktrace:
         folio_end_writeback+0x140/0x488
         ext4_finish_bio+0x638/0x820
         ext4_release_io_end+0x74/0x188
         ext4_end_io_end+0xa0/0x194
         ext4_end_io_rsv_work+0xc4/0x150
         process_one_work+0x3b4/0xac0
         worker_thread+0x2b0/0x53c
         kthread+0x1a0/0x33c
         ret_from_fork+0x10/0x20

To simplify the scenario:

   context X (wq worker)	context Y (process context)

				migrate_pages_batch()
   ext4_end_io_end()		  ...
     ...			  migrate_folio_unmap()
     ext4_get_inode_loc()	    ...
       ...			    folio_lock() // hold the folio lock
       bdev_getblk()		    ...
         ...			    folio_wait_writeback() // wait forever
         __find_get_block_slow()
           ...			    ...
           folio_lock() // wait forever
           folio_unlock()	  migrate_folio_undo_src()
				    ...
     ...			    folio_unlock() // never reachable
     ext4_finish_bio()
	...
	folio_end_writeback() // never reachable

context X is waiting for the folio lock to be released by context Y,
while context Y is waiting for the writeback to end in context X.
Ultimately, two contexts are waiting for the event that will never
happen, say, deadlock.

*Only one* of the following two conditions should be allowed, or we
cannot avoid this kind of deadlock:

   1. while holding a folio lock (and heading for folio_unlock()),
      waiting for a writeback to end,
   2. while heading for the writeback end, waiting for the folio lock to
      be released,

Since allowing 2 and avoiding 1 sound more sensible than the other,
remove the first condition by making sure folio_unlock() before
folio_wait_writeback() in migrate_folio_unmap().

Fixes: 49d2e9cc45443 ("[PATCH] Swap Migration V5: migrate_pages() function")
Reported-by: Yunseong Kim <ysk@...lloc.com>
Signed-off-by: Byungchul Park <byungchul@...com>
Tested-by: Yunseong Kim <ysk@...lloc.com>
---

Hi,

Thanks to Yunseong for reporting the issue, testing, and confirming if
this patch can resolve the issue.  We used the latest version of DEPT
to detect the issue:

   https://lore.kernel.org/all/20251002081247.51255-1-byungchul@sk.com/

I mentioned in the commit message above like:

   *Only one* of the following two conditions should be allowed, or we
   cannot avoid this kind of deadlock:
   
      1. while holding a folio lock (and heading for folio_unlock()),
         waiting for a writeback to end,
      2. while heading for the writeback end, waiting for the folio lock
         to be released,

Honestly, I'm not convinced which one we should choose between two, I
chose 'allowing 2 and avoiding 1' to resolve this issue though.

However, please let me know if I was wrong and we should go for
'allowing 1 and avoiding 2'.  If so, I should try a different approach,
for example, to fix by preventing folio_lock() or using folio_try_lock()
while heading for writeback end in ext4_end_io_end() or something.

To Yunseong,

The link you shared for a system hang is:

   https://gist.github.com/kzall0c/a6091bb2fd536865ca9aabfd017a1fc5

I think an important stacktrace for this issue, this is, waiting for
PG_writeback, was missed in the log.

	Byungchul

---
 mm/migrate.c | 57 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 9e5ef39ce73a..60b0b054f27a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1215,6 +1215,17 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
 
 	dst->private = NULL;
 
+retry_wait_writeback:
+	/*
+	 * Only in the case of a full synchronous migration is it
+	 * necessary to wait for PageWriteback.  In the async case, the
+	 * retry loop is too short and in the sync-light case, the
+	 * overhead of stalling is too much.  Plus, do not write-back if
+	 * it's in the middle of direct compaction
+	 */
+	if (folio_test_writeback(src) && mode == MIGRATE_SYNC)
+		folio_wait_writeback(src);
+
 	if (!folio_trylock(src)) {
 		if (mode == MIGRATE_ASYNC)
 			goto out;
@@ -1245,27 +1256,41 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
 
 		folio_lock(src);
 	}
-	locked = true;
-	if (folio_test_mlocked(src))
-		old_page_state |= PAGE_WAS_MLOCKED;
 
 	if (folio_test_writeback(src)) {
-		/*
-		 * Only in the case of a full synchronous migration is it
-		 * necessary to wait for PageWriteback. In the async case,
-		 * the retry loop is too short and in the sync-light case,
-		 * the overhead of stalling is too much
-		 */
-		switch (mode) {
-		case MIGRATE_SYNC:
-			break;
-		default:
-			rc = -EBUSY;
-			goto out;
+		if (mode == MIGRATE_SYNC) {
+			/*
+			 * folio_unlock() is required before trying
+			 * folio_wait_writeback().  Or it leads a
+			 * deadlock like:
+			 *
+			 *   context x		context y
+			 *   in XXX_io_end()	in migrate_folio_unmap()
+			 *
+			 *   ...		...
+			 *   bdev_getblk();	folio_lock();
+			 *
+			 *     // wait forever	// wait forever
+			 *     folio_lock();	folio_wait_writeback();
+			 *
+			 *     ...		...
+			 *     folio_unlock();
+			 *   ...		// never reachable
+			 *			folio_unlock();
+			 *   // never reachable
+			 *   folio_end_writeback();
+			 */
+			folio_unlock(src);
+			goto retry_wait_writeback;
 		}
-		folio_wait_writeback(src);
+		rc = -EBUSY;
+		goto out;
 	}
 
+	locked = true;
+	if (folio_test_mlocked(src))
+		old_page_state |= PAGE_WAS_MLOCKED;
+
 	/*
 	 * By try_to_migrate(), src->mapcount goes down to 0 here. In this case,
 	 * we cannot notice that anon_vma is freed while we migrate a page.

base-commit: e5f0a698b34ed76002dc5cff3804a61c80233a7a
-- 
2.17.1