lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Sat, 29 Jul 2023 14:31:01 +0900
From:   Tetsuo Handa <penguin-kernel@...ove.SAKURA.ne.jp>
To:     Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
        Michal Hocko <mhocko@...e.com>
Cc:     Petr Mladek <pmladek@...e.com>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org,
        "Luis Claudio R. Goncalves" <lgoncalv@...hat.com>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Boqun Feng <boqun.feng@...il.com>,
        Ingo Molnar <mingo@...hat.com>,
        John Ogness <john.ogness@...utronix.de>,
        Mel Gorman <mgorman@...hsingularity.net>,
        Peter Zijlstra <peterz@...radead.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Waiman Long <longman@...hat.com>, Will Deacon <will@...nel.org>
Subject: Re: [PATCH v2 1/2] seqlock: Do the lockdep annotation before locking
 in do_write_seqcount_begin_nested()

On 2023/07/28 0:10, Sebastian Andrzej Siewior wrote:
> On 2023-06-28 21:14:16 [+0900], Tetsuo Handa wrote:
>>> Anyway, please do not do this change only because of printk().
>>> IMHO, the current ordering is more logical and the printk() problem
>>> should be solved another way.
>>
>> Then, since [PATCH 1/2] cannot be applied, [PATCH 2/2] is automatically
>> rejected.
> 
> My understanding is that this patch gets applied and your objection will
> be noted.

My preference is that zonelist_update_seq is not checked by !__GFP_DIRECT_RECLAIM
allocations, which is a low-hanging fruit towards GFP_LOCKLESS mentioned at
https://lkml.kernel.org/r/ZG3+l4qcCWTPtSMD@dhcp22.suse.cz and
https://lkml.kernel.org/r/ZJWWpGZMJIADQvRS@dhcp22.suse.cz .

Maybe we can defer checking zonelist_update_seq till retry check like below,
for this is really an infrequent event.

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7d3460c7a480..2f7b82af2590 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3642,22 +3642,27 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release);
  * retries the allocation if zonelist changes. Writer side is protected by the
  * embedded spin_lock.
  */
-static DEFINE_SEQLOCK(zonelist_update_seq);
+static unsigned int zonelist_update_seq;
 
 static unsigned int zonelist_iter_begin(void)
 {
 	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
-		return read_seqbegin(&zonelist_update_seq);
+		return data_race(READ_ONCE(zonelist_update_seq));
 
 	return 0;
 }
 
-static unsigned int check_retry_zonelist(unsigned int seq)
+static unsigned int check_retry_zonelist(gfp_t gfp, unsigned int seq)
 {
-	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
-		return read_seqretry(&zonelist_update_seq, seq);
+	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE) && (gfp & __GFP_DIRECT_RECLAIM)) {
+		unsigned int seq2;
+
+		smp_rmb();
+		seq2 = data_race(READ_ONCE(zonelist_update_seq));
+		return unlikely(seq != seq2 || (seq2 & 1));
+	}
 
-	return seq;
+	return 0;
 }
 
 /* Perform direct synchronous page reclaim */
@@ -4146,7 +4151,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * a unnecessary OOM kill.
 	 */
 	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
-	    check_retry_zonelist(zonelist_iter_cookie))
+	    check_retry_zonelist(gfp_mask, zonelist_iter_cookie))
 		goto restart;
 
 	/* Reclaim has failed us, start killing things */
@@ -4172,7 +4177,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * a unnecessary OOM kill.
 	 */
 	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
-	    check_retry_zonelist(zonelist_iter_cookie))
+	    check_retry_zonelist(gfp_mask, zonelist_iter_cookie))
 		goto restart;
 
 	/*
@@ -5136,22 +5141,12 @@ static void __build_all_zonelists(void *data)
 	int nid;
 	int __maybe_unused cpu;
 	pg_data_t *self = data;
+	static DEFINE_SPINLOCK(lock);
 	unsigned long flags;
 
-	/*
-	 * Explicitly disable this CPU's interrupts before taking seqlock
-	 * to prevent any IRQ handler from calling into the page allocator
-	 * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
-	 */
-	local_irq_save(flags);
-	/*
-	 * Explicitly disable this CPU's synchronous printk() before taking
-	 * seqlock to prevent any printk() from trying to hold port->lock, for
-	 * tty_insert_flip_string_and_push_buffer() on other CPU might be
-	 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
-	 */
-	printk_deferred_enter();
-	write_seqlock(&zonelist_update_seq);
+	spin_lock_irqsave(&lock, flags);
+	data_race(zonelist_update_seq++);
+	smp_wmb();
 
 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
@@ -5188,9 +5183,9 @@ static void __build_all_zonelists(void *data)
 #endif
 	}
 
-	write_sequnlock(&zonelist_update_seq);
-	printk_deferred_exit();
-	local_irq_restore(flags);
+	smp_wmb();
+	data_race(zonelist_update_seq++);
+	spin_unlock_irqrestore(&lock, flags);
 }
 
 static noinline void __init


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ