[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250708130930.k37I5UrI@linutronix.de>
Date: Tue, 8 Jul 2025 15:09:30 +0200
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: linux-kernel@...r.kernel.org
Cc: André Almeida <andrealmeid@...lia.com>,
Darren Hart <dvhart@...radead.org>,
Davidlohr Bueso <dave@...olabs.net>, Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Valentin Schneider <vschneid@...hat.com>,
Waiman Long <longman@...hat.com>
Subject: Re: [PATCH 2/3] futex: Use RCU-based per-CPU reference counting
instead of rcuref_t
On 2025-07-07 16:36:22 [+0200], To linux-kernel@...r.kernel.org wrote:
So a box was doing innocent things and then this happened:
| slab mm_struct start ffff888549a50580 pointer offset 280 size 1352
| BUG: kernel NULL pointer dereference, address: 0000000000000000
| #PF: supervisor instruction fetch in kernel mode
| #PF: error_code(0x0010) - not-present page
| PGD 0 P4D 0
| Oops: Oops: 0010 [#1] SMP
| CPU: 11 UID: 1001 PID: 125007 Comm: clang Not tainted 6.16.0-rc5+ #262 PREEMPT(lazy) 3bf8bc6327fe388c2a27e778516b456f280aa854
| Hardware name: Intel Corporation S2600CP/S2600CP, BIOS SE5C600.86B.02.03.0003.041920141333 04/19/2014
| RIP: 0010:0x0
| Code: Unable to access opcode bytes at 0xffffffffffffffd6.
| RSP: 0000:ffffc90020317e60 EFLAGS: 00010282
| RAX: 0000000000000001 RBX: 0000000000000006 RCX: 0000000000000000
| RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff888549a50698
| RBP: ffff888a3faeab00 R08: 0000000000000000 R09: ffffc90020317bc8
| R10: ffffffff8296bdc8 R11: 0000000000000003 R12: ffff8881b6f80000
| R13: ffffc90020317e98 R14: 0000000000000005 R15: 0000000000000000
| FS: 00007fd37b766c40(0000) GS:ffff888abc9ef000(0000) knlGS:0000000000000000
| CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
| CR2: ffffffffffffffd6 CR3: 00000005c3d5f001 CR4: 00000000000626f0
| Call Trace:
| <TASK>
| rcu_core+0x27c/0x720
| ? rcu_core+0x21c/0x720
| handle_softirqs+0xc5/0x260
| irq_exit_rcu+0x85/0xa0
| sysvec_apic_timer_interrupt+0x3d/0x90
| asm_sysvec_apic_timer_interrupt+0x1a/0x20
| RIP: 0033:0x7fd3862e1190
| Code: 41 56 53 48 89 d3 49 89 f6 49 89 ff 48 c7 47 08 00 00 00 00 8b 47 10 48 85 c0 74 52 49 8b 0f 48 c1 e0 04 31 d2 0f 1f 44 00 00 <48> c7 04 11 00 f0 ff ff 48 83 c2 10 48 39 d0 75 ef eb 31 4d 85 c9
| RSP: 002b:00007fff1e519aa0 EFLAGS: 00010202
| RAX: 0000000000040000 RBX: 00007fd37b71f010 RCX: 00007fd37b5c3010
| RDX: 000000000000fff0 RSI: 00007fd37b6ff010 RDI: 000055836a11e290
| RBP: 0000000000150050 R08: 00000000ffffffff R09: 0000000000000000
| R10: 0000000000000022 R11: 0000000000000246 R12: 000055836a597420
| R13: 000055836a593030 R14: 00007fd37b6ff010 R15: 000055836a11e290
| </TASK>
| Modules linked in:
| Dumping ftrace buffer:
…
| CR2: 0000000000000000
| ---[ end trace 0000000000000000 ]---
| RIP: 0010:0x0
| Code: Unable to access opcode bytes at 0xffffffffffffffd6.
| RSP: 0000:ffffc90020317e60 EFLAGS: 00010282
| RAX: 0000000000000001 RBX: 0000000000000006 RCX: 0000000000000000
| RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff888549a50698
On the plus side, there is no evidence that this could be futex-related
:)
However, I was wondering whether this could be because nothing ensures
that the mm stays around after the RCU callback has been enqueued.
What about this:
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index b13474825130f..2201da0afecc5 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -140,7 +140,7 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
/* same as above but performs the slow path from the async context. Can
* be called from the atomic context as well
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index 66c4d4cc2340b..0b885dcbde9af 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1149,7 +1149,7 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
static void mmput_async_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct,
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index d1877abbb7147..cd8463f3d1026 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1602,6 +1602,7 @@ static void __futex_ref_atomic_end(struct futex_private_hash *fph)
wake_up_var(mm);
WARN_ON_ONCE(ret < 0);
+ mmput_async(mm);
}
static void futex_ref_rcu(struct rcu_head *head)
@@ -1637,6 +1638,11 @@ static void futex_ref_drop(struct futex_private_hash *fph)
* Can only transition the current fph;
*/
WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+ /*
+ * We enqueue at least one RCU callback. Ensure mm stays if the task
+ * exits before the transition is completed.
+ */
+ mmget(mm);
/*
* In order to avoid the following scenario:
--
2.50.0
Sebastian
Powered by blists - more mailing lists