lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1570228005-24979-1-git-send-email-cai@lca.pw>
Date:   Fri,  4 Oct 2019 18:26:45 -0400
From:   Qian Cai <cai@....pw>
To:     akpm@...ux-foundation.org
Cc:     mhocko@...nel.org, sergey.senozhatsky.work@...il.com,
        pmladek@...e.com, rostedt@...dmis.org, peterz@...radead.org,
        david@...hat.com, john.ogness@...utronix.de, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org, Qian Cai <cai@....pw>
Subject: [PATCH v2] mm/page_isolation: fix a deadlock with printk()

It is unsafe to call printk() while zone->lock was held, i.e.,

zone->lock --> console_lock

because the console could always allocate some memory in different code
paths and form locking chains in an opposite order,

console_lock --> * --> zone->lock

As the result, it triggers lockdep splats like below and in different
code paths in this thread [1]. Since has_unmovable_pages() was only used
in set_migratetype_isolate() and is_pageblock_removable_nolock(). Only
the former will set the REPORT_FAILURE flag which will call printk().
Hence, unlock the zone->lock just before the dump_page() there where
when has_unmovable_pages() returns true, there is no need to hold the
lock anyway in the rest of set_migratetype_isolate().

While at it, remove a problematic printk() in __offline_isolated_pages()
only for debugging as well which will always disable lockdep on debug
kernels.

The problem is probably there forever, but neither many developers will
run memory offline with the lockdep enabled nor admins in the field are
lucky enough yet to hit a perfect timing which required to trigger a
real deadlock. In addition, there aren't many places that call printk()
while zone->lock was held.

WARNING: possible circular locking dependency detected
------------------------------------------------------
test.sh/1724 is trying to acquire lock:
0000000052059ec0 (console_owner){-...}, at: console_unlock+0x
01: 328/0xa30

but task is already holding lock:
000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at: start_iso
01: late_page_range+0x216/0x538

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (&(&zone->lock)->rlock){-.-.}:
       lock_acquire+0x21a/0x468
       _raw_spin_lock+0x54/0x68
       get_page_from_freelist+0x8b6/0x2d28
       __alloc_pages_nodemask+0x246/0x658
       __get_free_pages+0x34/0x78
       sclp_init+0x106/0x690
       sclp_register+0x2e/0x248
       sclp_rw_init+0x4a/0x70
       sclp_console_init+0x4a/0x1b8
       console_init+0x2c8/0x410
       start_kernel+0x530/0x6a0
       startup_continue+0x70/0xd0

-> #1 (sclp_lock){-.-.}:
       lock_acquire+0x21a/0x468
       _raw_spin_lock_irqsave+0xcc/0xe8
       sclp_add_request+0x34/0x308
       sclp_conbuf_emit+0x100/0x138
       sclp_console_write+0x96/0x3b8
       console_unlock+0x6dc/0xa30
       vprintk_emit+0x184/0x3c8
       vprintk_default+0x44/0x50
       printk+0xa8/0xc0
       iommu_debugfs_setup+0xf2/0x108
       iommu_init+0x6c/0x78
       do_one_initcall+0x162/0x680
       kernel_init_freeable+0x4e8/0x5a8
       kernel_init+0x2a/0x188
       ret_from_fork+0x30/0x34
       kernel_thread_starter+0x0/0xc

-> #0 (console_owner){-...}:
       check_noncircular+0x338/0x3e0
       __lock_acquire+0x1e66/0x2d88
       lock_acquire+0x21a/0x468
       console_unlock+0x3a6/0xa30
       vprintk_emit+0x184/0x3c8
       vprintk_default+0x44/0x50
       printk+0xa8/0xc0
       __dump_page+0x1dc/0x710
       dump_page+0x2e/0x58
       has_unmovable_pages+0x2e8/0x470
       start_isolate_page_range+0x404/0x538
       __offline_pages+0x22c/0x1338
       memory_subsys_offline+0xa6/0xe8
       device_offline+0xe6/0x118
       state_store+0xf0/0x110
       kernfs_fop_write+0x1bc/0x270
       vfs_write+0xce/0x220
       ksys_write+0xea/0x190
       system_call+0xd8/0x2b4

other info that might help us debug this:

Chain exists of:
  console_owner --> sclp_lock --> &(&zone->lock)->rlock

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&(&zone->lock)->rlock);
                               lock(sclp_lock);
                               lock(&(&zone->lock)->rlock);
  lock(console_owner);

 *** DEADLOCK ***

9 locks held by test.sh/1724:
 #0: 000000000e925408 (sb_writers#4){.+.+}, at: vfs_write+0x201:
 #1: 0000000050aa4280 (&of->mutex){+.+.}, at: kernfs_fop_write:
 #2: 0000000062e5c628 (kn->count#198){.+.+}, at: kernfs_fop_write
 #3: 00000000523236a0 (device_hotplug_lock){+.+.}, at:
lock_device_hotplug_sysfs+0x30/0x80
 #4: 0000000062e70990 (&dev->mutex){....}, at: device_offline
 #5: 0000000051fd36b0 (cpu_hotplug_lock.rw_sem){++++}, at:
__offline_pages+0xec/0x1338
 #6: 00000000521ca470 (mem_hotplug_lock.rw_sem){++++}, at:
percpu_down_write+0x38/0x210
 #7: 000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at:
start_isolate_page_range+0x216/0x538
 #8: 000000005205a100 (console_lock){+.+.}, at: vprintk_emit

stack backtrace:
Hardware name: IBM 2964 N96 400 (z/VM 6.4.0)
Call Trace:
([<00000000512ae218>] show_stack+0x110/0x1b0)
 [<0000000051b6d506>] dump_stack+0x126/0x178
 [<00000000513a4b08>] check_noncircular+0x338/0x3e0
 [<00000000513aaaf6>] __lock_acquire+0x1e66/0x2d88
 [<00000000513a7e12>] lock_acquire+0x21a/0x468
 [<00000000513bb2fe>] console_unlock+0x3a6/0xa30
 [<00000000513bde2c>] vprintk_emit+0x184/0x3c8
 [<00000000513be0b4>] vprintk_default+0x44/0x50
 [<00000000513beb60>] printk+0xa8/0xc0
 [<000000005158c364>] __dump_page+0x1dc/0x710
 [<000000005158c8c6>] dump_page+0x2e/0x58
 [<00000000515d87c8>] has_unmovable_pages+0x2e8/0x470
 [<000000005167072c>] start_isolate_page_range+0x404/0x538
 [<0000000051b96de4>] __offline_pages+0x22c/0x1338
 [<0000000051908586>] memory_subsys_offline+0xa6/0xe8
 [<00000000518e561e>] device_offline+0xe6/0x118
 [<0000000051908170>] state_store+0xf0/0x110
 [<0000000051796384>] kernfs_fop_write+0x1bc/0x270
 [<000000005168972e>] vfs_write+0xce/0x220
 [<0000000051689b9a>] ksys_write+0xea/0x190
 [<0000000051ba9990>] system_call+0xd8/0x2b4
INFO: lockdep is turned off.

[1] https://lore.kernel.org/lkml/1568817579.5576.172.camel@lca.pw/

Signed-off-by: Qian Cai <cai@....pw>
---

v2: unlock zone->lock in has_unmovable_pages() where necessary.

 include/linux/page-isolation.h |  2 +-
 mm/memory_hotplug.c            |  3 ++-
 mm/page_alloc.c                | 12 +++++-------
 mm/page_isolation.c            |  7 +++++--
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 1099c2fee20f..256d491151f5 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -34,7 +34,7 @@ static inline bool is_migrate_isolate(int migratetype)
 #define REPORT_FAILURE	0x2
 
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
-			 int migratetype, int flags);
+			 int migratetype, int flags, unsigned long irqflags);
 void set_pageblock_migratetype(struct page *page, int migratetype);
 int move_freepages_block(struct zone *zone, struct page *page,
 				int migratetype, int *num_movable);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b1be791f772d..4635ae0da48b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1204,7 +1204,8 @@ static bool is_pageblock_removable_nolock(unsigned long pfn)
 	if (!zone_spans_pfn(zone, pfn))
 		return false;
 
-	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
+	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE,
+				    SKIP_HWPOISON, 0);
 }
 
 /* Checks if this range of memory is likely to be hot-removable. */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 15c2050c629b..5352aa25b9f7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8170,7 +8170,7 @@ void *__init alloc_large_system_hash(const char *tablename,
  * race condition. So you can't expect this function should be exact.
  */
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
-			 int migratetype, int flags)
+			 int migratetype, int flags, unsigned long irqflags)
 {
 	unsigned long found;
 	unsigned long iter = 0;
@@ -8276,9 +8276,11 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 	}
 	return false;
 unmovable:
-	WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
-	if (flags & REPORT_FAILURE)
+	if (flags & REPORT_FAILURE) {
+		spin_unlock_irqrestore(&zone->lock, irqflags);
 		dump_page(pfn_to_page(pfn + iter), reason);
+	}
+	WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
 	return true;
 }
 
@@ -8588,10 +8590,6 @@ void zone_pcp_reset(struct zone *zone)
 		BUG_ON(!PageBuddy(page));
 		order = page_order(page);
 		offlined_pages += 1 << order;
-#ifdef CONFIG_DEBUG_VM
-		pr_info("remove from free list %lx %d %lx\n",
-			pfn, 1 << order, end_pfn);
-#endif
 		del_page_from_free_area(page, &zone->free_area[order]);
 		for (i = 0; i < (1 << order); i++)
 			SetPageReserved((page+i));
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 89c19c0feadb..a656c6abb5ac 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -60,7 +60,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	 * We just check MOVABLE pages.
 	 */
 	if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
-				 isol_flags))
+				 isol_flags, flags))
 		ret = 0;
 
 	/*
@@ -81,7 +81,10 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 		__mod_zone_freepage_state(zone, -nr_pages, mt);
 	}
 
-	spin_unlock_irqrestore(&zone->lock, flags);
+	/* It may has already been unlocked in has_unmovable_pages() above. */
+	if (!(ret && isol_flags & REPORT_FAILURE))
+		spin_unlock_irqrestore(&zone->lock, flags);
+
 	if (!ret)
 		drain_all_pages(zone);
 	return ret;
-- 
1.8.3.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ