Message-ID: <aDwDw9Aygqo6oAx+@ly-workstation>
Date: Sun, 1 Jun 2025 15:39:47 +0800
From: "Lai, Yi" <yi1.lai@...ux.intel.com>
To: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Cc: linux-kernel@...r.kernel.org,
André Almeida <andrealmeid@...lia.com>,
Darren Hart <dvhart@...radead.org>,
Davidlohr Bueso <dave@...olabs.net>, Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Valentin Schneider <vschneid@...hat.com>,
Waiman Long <longman@...hat.com>, yi1.lai@...el.com
Subject: Re: [PATCH v12 14/21] futex: Allow to resize the private local hash
On Wed, Apr 16, 2025 at 06:29:14PM +0200, Sebastian Andrzej Siewior wrote:
> The mm_struct::futex_hash_lock guards the futex_hash_bucket assignment/
> replacement. The futex_hash_allocate()/ PR_FUTEX_HASH_SET_SLOTS
> operation can now be invoked at runtime and resize an already existing
> internal private futex_hash_bucket to another size.
>
> The reallocation is based on an idea by Thomas Gleixner: The initial
> allocation of struct futex_private_hash sets the reference count
> to one. Every user acquires a reference on the local hash before using
> it and drops it after it has enqueued itself on the hash bucket. No
> reference is held while the task is scheduled out waiting for the
> wake up.
> The resize process allocates a new struct futex_private_hash and drops
> the initial reference. Synchronized with mm_struct::futex_hash_lock it
> is checked if the reference counter for the currently used
> mm_struct::futex_phash is marked as DEAD. If so, then all users enqueued
> on the current private hash are requeued on the new private hash and the
> new private hash is set to mm_struct::futex_phash. Otherwise the newly
> allocated private hash is saved as mm_struct::futex_phash_new and the
> rehashing and reassigning is delayed to the futex_hash() caller once the
> reference counter is marked DEAD.
> The replacement is not performed at rcuref_put() time because certain
> callers, such as futex_wait_queue(), drop their reference after changing
> the task state. That task-state change would be destroyed once the
> futex_hash_lock is acquired.
>
> The user can change the number of slots with PR_FUTEX_HASH_SET_SLOTS
> multiple times. Both an increase and a decrease are allowed and the
> request blocks until the assignment is done.
>
> The private hash allocated at thread creation is changed from 16 to
> 16 <= 4 * number_of_threads <= global_hash_size
> where number_of_threads can not exceed the number of online CPUs. Should
> the user issue PR_FUTEX_HASH_SET_SLOTS then the auto scaling is disabled.
>
> [peterz: reorganize the code to avoid state tracking and simplify new
> object handling, block the user until changes are in effect, allow
> increase and decrease of the hash].
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
> ---
> include/linux/futex.h | 3 +-
> include/linux/mm_types.h | 4 +-
> kernel/futex/core.c | 290 ++++++++++++++++++++++++++++++++++++---
> kernel/futex/requeue.c | 5 +
> 4 files changed, 281 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/futex.h b/include/linux/futex.h
> index 1d3f7555825ec..40bc778b2bb45 100644
> --- a/include/linux/futex.h
> +++ b/include/linux/futex.h
> @@ -85,7 +85,8 @@ void futex_hash_free(struct mm_struct *mm);
>
> static inline void futex_mm_init(struct mm_struct *mm)
> {
> - mm->futex_phash = NULL;
> + rcu_assign_pointer(mm->futex_phash, NULL);
> + mutex_init(&mm->futex_hash_lock);
> }
>
> #else /* !CONFIG_FUTEX_PRIVATE_HASH */
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index a4b5661e41770..32ba5126e2214 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1033,7 +1033,9 @@ struct mm_struct {
> seqcount_t mm_lock_seq;
> #endif
> #ifdef CONFIG_FUTEX_PRIVATE_HASH
> - struct futex_private_hash *futex_phash;
> + struct mutex futex_hash_lock;
> + struct futex_private_hash __rcu *futex_phash;
> + struct futex_private_hash *futex_phash_new;
> #endif
>
> unsigned long hiwater_rss; /* High-watermark of RSS usage */
> diff --git a/kernel/futex/core.c b/kernel/futex/core.c
> index 53b3a00a92539..9e7dad52abea8 100644
> --- a/kernel/futex/core.c
> +++ b/kernel/futex/core.c
> @@ -40,6 +40,7 @@
> #include <linux/fault-inject.h>
> #include <linux/slab.h>
> #include <linux/prctl.h>
> +#include <linux/rcuref.h>
>
> #include "futex.h"
> #include "../locking/rtmutex_common.h"
> @@ -57,7 +58,9 @@ static struct {
> #define futex_hashmask (__futex_data.hashmask)
>
> struct futex_private_hash {
> + rcuref_t users;
> unsigned int hash_mask;
> + struct rcu_head rcu;
> void *mm;
> bool custom;
> struct futex_hash_bucket queues[];
> @@ -129,11 +132,14 @@ static inline bool futex_key_is_private(union futex_key *key)
>
> bool futex_private_hash_get(struct futex_private_hash *fph)
> {
> - return false;
> + return rcuref_get(&fph->users);
> }
>
> void futex_private_hash_put(struct futex_private_hash *fph)
> {
> + /* Ignore return value, last put is verified via rcuref_is_dead() */
> + if (rcuref_put(&fph->users))
> + wake_up_var(fph->mm);
> }
>
> /**
> @@ -143,8 +149,23 @@ void futex_private_hash_put(struct futex_private_hash *fph)
> * Obtain an additional reference for the already obtained hash bucket. The
> * caller must already own an reference.
> */
> -void futex_hash_get(struct futex_hash_bucket *hb) { }
> -void futex_hash_put(struct futex_hash_bucket *hb) { }
> +void futex_hash_get(struct futex_hash_bucket *hb)
> +{
> + struct futex_private_hash *fph = hb->priv;
> +
> + if (!fph)
> + return;
> + WARN_ON_ONCE(!futex_private_hash_get(fph));
> +}
> +
> +void futex_hash_put(struct futex_hash_bucket *hb)
> +{
> + struct futex_private_hash *fph = hb->priv;
> +
> + if (!fph)
> + return;
> + futex_private_hash_put(fph);
> +}
>
> static struct futex_hash_bucket *
> __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
> @@ -155,7 +176,7 @@ __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
> return NULL;
>
> if (!fph)
> - fph = key->private.mm->futex_phash;
> + fph = rcu_dereference(key->private.mm->futex_phash);
> if (!fph || !fph->hash_mask)
> return NULL;
>
> @@ -165,21 +186,119 @@ __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
> return &fph->queues[hash & fph->hash_mask];
> }
>
> +static void futex_rehash_private(struct futex_private_hash *old,
> + struct futex_private_hash *new)
> +{
> + struct futex_hash_bucket *hb_old, *hb_new;
> + unsigned int slots = old->hash_mask + 1;
> + unsigned int i;
> +
> + for (i = 0; i < slots; i++) {
> + struct futex_q *this, *tmp;
> +
> + hb_old = &old->queues[i];
> +
> + spin_lock(&hb_old->lock);
> + plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
> +
> + plist_del(&this->list, &hb_old->chain);
> + futex_hb_waiters_dec(hb_old);
> +
> + WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
> +
> + hb_new = __futex_hash(&this->key, new);
> + futex_hb_waiters_inc(hb_new);
> + /*
> + * The new pointer isn't published yet but an already
> + * moved user can be unqueued due to timeout or signal.
> + */
> + spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
> + plist_add(&this->list, &hb_new->chain);
> + this->lock_ptr = &hb_new->lock;
> + spin_unlock(&hb_new->lock);
> + }
> + spin_unlock(&hb_old->lock);
> + }
> +}
> +
> +static bool __futex_pivot_hash(struct mm_struct *mm,
> + struct futex_private_hash *new)
> +{
> + struct futex_private_hash *fph;
> +
> + WARN_ON_ONCE(mm->futex_phash_new);
> +
> + fph = rcu_dereference_protected(mm->futex_phash,
> + lockdep_is_held(&mm->futex_hash_lock));
> + if (fph) {
> + if (!rcuref_is_dead(&fph->users)) {
> + mm->futex_phash_new = new;
> + return false;
> + }
> +
> + futex_rehash_private(fph, new);
> + }
> + rcu_assign_pointer(mm->futex_phash, new);
> + kvfree_rcu(fph, rcu);
> + return true;
> +}
> +
Hi Sebastian Andrzej Siewior,
Greetings!
I used Syzkaller and found a KASAN: null-ptr-deref Read in __futex_pivot_hash in linux-next next-20250527.
After bisection, the first bad commit is:
"
bd54df5ea7ca futex: Allow to resize the private local hash
"
All detailed info can be found at:
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash
Syzkaller repro code:
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash/repro.c
Syzkaller repro syscall steps:
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash/repro.prog
Syzkaller report:
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash/repro.report
Kconfig(make olddefconfig):
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash/kconfig_origin
Bisect info:
https://github.com/laifryiee/syzkaller_logs/tree/main/250531_004606___futex_pivot_hash/bisect_info.log
bzImage:
https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/250531_004606___futex_pivot_hash/bzImage_fefff2755f2aa4125dce2a1edfe7e545c7c621f2
Issue dmesg:
https://github.com/laifryiee/syzkaller_logs/blob/main/250531_004606___futex_pivot_hash/bzImage_fefff2755f2aa4125dce2a1edfe7e545c7c621f2
"
[ 266.064649] Adding 124996k swap on ./swap-file. Priority:0 extents:1 across:124996k
[ 266.075472] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#11] SMP I
[ 266.075983] KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
[ 266.076337] CPU: 0 UID: 0 PID: 1168 Comm: repro Tainted: G B D 6.15.0-next-20250527-fefff2755f2a #1
[ 266.076882] Tainted: [B]=BAD_PAGE, [D]=DIE
[ 266.077073] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.o4
[ 266.077594] RIP: 0010:plist_del+0xf3/0x2d0
[ 266.077803] Code: 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 a6 01 00 00 49 8d 7f 08 4c 8b 73 10 48 b8 00 00 00 00 00 0
[ 266.078640] RSP: 0018:ffff8880159dfc40 EFLAGS: 00010202
[ 266.078886] RAX: dffffc0000000000 RBX: ffff88800f2397e8 RCX: ffffffff85ca6b25
[ 266.079327] RDX: 0000000000000001 RSI: 0000000000000008 RDI: 0000000000000008
[ 266.079658] RBP: ffff8880159dfc70 R08: 0000000000000001 R09: ffffed1002b3bf7d
[ 266.079989] R10: 0000000000000003 R11: 000000000000000c R12: ffff88800f239800
[ 266.080311] R13: ffff88800f2397f0 R14: 0000000000000000 R15: 0000000000000000
[ 266.080635] FS: 00007f8c127ff640(0000) GS:ffff8880e355f000(0000) knlGS:0000000000000000
[ 266.080998] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 266.081260] CR2: 00007f8c127fee38 CR3: 00000000149da003 CR4: 0000000000770ef0
[ 266.081594] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 266.081919] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[ 266.082248] PKRU: 55555554
[ 266.082377] Call Trace:
[ 266.082496] <TASK>
[ 266.082605] __futex_pivot_hash+0x2b0/0x520
[ 266.082815] futex_hash_allocate+0xb26/0x10b0
[ 266.083028] ? __pfx_futex_hash_allocate+0x10/0x10
[ 266.083261] ? __sanitizer_cov_trace_switch+0x58/0xa0
[ 266.083508] ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
[ 266.083756] ? static_key_count+0x69/0x80
[ 266.083948] futex_hash_prctl+0x20c/0x650
[ 266.084146] __do_sys_prctl+0x1a0d/0x2170
[ 266.084347] ? __pfx___do_sys_prctl+0x10/0x10
[ 266.084563] __x64_sys_prctl+0xc6/0x150
[ 266.084742] ? syscall_trace_enter+0x14d/0x280
[ 266.084956] x64_sys_call+0x1a25/0x2150
[ 266.085144] do_syscall_64+0x6d/0x2e0
[ 266.085324] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 266.085558] RIP: 0033:0x7f8c1283ee5d
[ 266.085731] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 8
[ 266.086550] RSP: 002b:00007f8c127fed48 EFLAGS: 00000246 ORIG_RAX: 000000000000009d
[ 266.086895] RAX: ffffffffffffffda RBX: 00007f8c127ff640 RCX: 00007f8c1283ee5d
[ 266.087219] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 000000000000004e
[ 266.087546] RBP: 00007f8c127fed60 R08: 0000000000000000 R09: 0000000000000000
[ 266.087869] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f8c127ff640
[ 266.088191] R13: 0000000000000013 R14: 00007f8c1289f560 R15: 0000000000000000
[ 266.088521] </TASK>
[ 266.088631] Modules linked in:
[ 266.088810] ---[ end trace 0000000000000000 ]---
[ 266.089030] RIP: 0010:__futex_pivot_hash+0x271/0x520
[ 266.089265] Code: e8 84 a5 58 04 48 8b 45 d0 48 c1 e8 03 42 80 3c 28 00 0f 85 5e 02 00 00 48 8b 45 d0 4c 8b 30 4c 0
[ 266.090087] RSP: 0018:ffff88801b43fc80 EFLAGS: 00010206
[ 266.090332] RAX: 0007c018e000003c RBX: 003e00c7000001c9 RCX: ffffffff81799536
[ 266.090660] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff8880227e8888
[ 266.090983] RBP: ffff88801b43fcf8 R08: 0000000000000001 R09: ffffed1003687f7d
[ 266.091309] R10: 0000000000000003 R11: 6e696c6261736944 R12: ffff888014430d68
[ 266.091634] R13: dffffc0000000000 R14: 003e00c7000001e1 R15: ffff888014430a80
[ 266.091950] FS: 00007f8c127ff640(0000) GS:ffff8880e355f000(0000) knlGS:0000000000000000
[ 266.092319] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 266.092582] CR2: 00007f8c127fee38 CR3: 00000000149da003 CR4: 0000000000770ef0
[ 266.092915] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 266.093243] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[ 266.093608] PKRU: 55555554
[ 266.093738] note: repro[1168] exited with preempt_count 1
"
I also tried the latest linux-next tag next-20250530. This issue can still be reproduced. Here is the log:
"
[ 50.554828] Adding 124996k swap on ./swap-file. Priority:0 extents:1 across:124996k
[ 50.563846] Oops: general protection fault, probably for non-canonical address 0xe028fc18c0000065: 0000 [#4] SMP KI
[ 50.564384] KASAN: maybe wild-memory-access in range [0x014800c600000328-0x014800c60000032f]
[ 50.564774] CPU: 1 UID: 0 PID: 813 Comm: repro Tainted: G B D 6.15.0-next-20250530-kvm #3 PREEMPT(v
[ 50.565314] Tainted: [B]=BAD_PAGE, [D]=DIE
[ 50.565514] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.o4
[ 50.566028] RIP: 0010:__futex_pivot_hash+0x204/0x530
[ 50.566278] Code: e8 f1 e6 5b 04 48 8b 45 d0 48 c1 e8 03 42 80 3c 28 00 0f 85 d1 02 00 00 48 8b 45 d0 4c 8b 30 4c 0
[ 50.567119] RSP: 0018:ffff88801241fc80 EFLAGS: 00010206
[ 50.567372] RAX: 00290018c0000065 RBX: 014800c600000310 RCX: ffffffff8179ecdc
[ 50.567706] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff88801d5d1708
[ 50.568036] RBP: ffff88801241fcf8 R08: 0000000000000001 R09: ffffed1002483f7d
[ 50.568364] R10: 0000000000000003 R11: 00000000bd9dfb48 R12: ffff88801429bf00
[ 50.568699] R13: dffffc0000000000 R14: 014800c600000328 R15: 0000000000000001
[ 50.569035] FS: 00007f183fe43640(0000) GS:ffff8880e3652000(0000) knlGS:0000000000000000
[ 50.569415] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 50.569691] CR2: 00007f183fe42e38 CR3: 000000001115c005 CR4: 0000000000770ef0
[ 50.570026] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 50.570349] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[ 50.570684] PKRU: 55555554
[ 50.570820] Call Trace:
[ 50.570946] <TASK>
[ 50.571060] futex_hash_allocate+0xb3a/0x1060
[ 50.571279] ? sigprocmask+0x24e/0x370
[ 50.571470] ? __pfx_futex_hash_allocate+0x10/0x10
[ 50.571703] ? rcu_is_watching+0x19/0xc0
[ 50.571899] ? __sanitizer_cov_trace_switch+0x58/0xa0
[ 50.572152] ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
[ 50.572416] ? static_key_count+0x63/0x80
[ 50.572608] ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30
[ 50.572870] futex_hash_prctl+0x1fe/0x650
[ 50.573069] __do_sys_prctl+0x4a3/0x2110
[ 50.573270] ? __pfx___do_sys_prctl+0x10/0x10
[ 50.573486] ? __audit_syscall_entry+0x39f/0x500
[ 50.573714] __x64_sys_prctl+0xc6/0x150
[ 50.573905] ? syscall_trace_enter+0x14d/0x280
[ 50.574120] x64_sys_call+0x1a2f/0x1fa0
[ 50.574314] do_syscall_64+0x6d/0x2e0
[ 50.574497] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 50.574736] RIP: 0033:0x7f183fc3ee5d
[ 50.574911] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 8
[ 50.575748] RSP: 002b:00007f183fe42d48 EFLAGS: 00000246 ORIG_RAX: 000000000000009d
[ 50.576105] RAX: ffffffffffffffda RBX: 00007f183fe43640 RCX: 00007f183fc3ee5d
[ 50.576434] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 000000000000004e
[ 50.576768] RBP: 00007f183fe42d60 R08: 0000000000000000 R09: 0000000000000000
[ 50.577105] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f183fe43640
[ 50.577444] R13: 000000000000000c R14: 00007f183fc9f560 R15: 0000000000000000
[ 50.577781] </TASK>
[ 50.577887] Modules linked in:
[ 50.578095] ---[ end trace 0000000000000000 ]---
[ 50.578316] RIP: 0010:__futex_pivot_hash+0x204/0x530
[ 50.578559] Code: e8 f1 e6 5b 04 48 8b 45 d0 48 c1 e8 03 42 80 3c 28 00 0f 85 d1 02 00 00 48 8b 45 d0 4c 8b 30 4c 0
[ 50.579394] RSP: 0018:ffff888012557c80 EFLAGS: 00010206
[ 50.579643] RAX: 00798018e0000056 RBX: 03cc00c700000299 RCX: ffffffff8179ecdc
[ 50.579975] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff8880117f8488
[ 50.580303] RBP: ffff888012557cf8 R08: 0000000000000001 R09: ffffed10024aaf7d
[ 50.580669] R10: 0000000000000003 R11: 6e696c6261736944 R12: ffff888012cf0000
[ 50.581597] R13: dffffc0000000000 R14: 03cc00c7000002b1 R15: 0000000000000001
[ 50.581937] FS: 00007f183fe43640(0000) GS:ffff8880e3652000(0000) knlGS:0000000000000000
[ 50.582309] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 50.582583] CR2: 00007f183fe42e38 CR3: 000000001115c005 CR4: 0000000000770ef0
[ 50.582977] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 50.583294] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[ 50.583622] PKRU: 55555554
[ 50.583758] note: repro[813] exited with preempt_count 1
"
Hope this could be insightful to you.
Regards,
Yi Lai
---
If you don't need the following environment to reproduce the problem, or if you
already have a working reproduction environment, please ignore the following information.
How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0
// start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
// You can change bzImage_xxx as you want
// You may need to remove the line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for a different qemu version
You can use the command below to log in; there is no password for root.
ssh -p 10023 root@...alhost
After logging in to the vm (virtual machine) successfully, you can transfer the reproducer
binary to the vm as shown below and reproduce the problem in the vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@...alhost:/root/
Get the bzImage for the target kernel:
Please use the target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage // x should be equal to or less than the number of CPUs your PC has
Fill the bzImage file into the above start3.sh to load the target kernel in the vm.
Tips:
If you already have qemu-system-x86_64, please ignore the info below.
If you want to install qemu v7.1.0:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install
> +static void futex_pivot_hash(struct mm_struct *mm)
> +{
> + scoped_guard(mutex, &mm->futex_hash_lock) {
> + struct futex_private_hash *fph;
> +
> + fph = mm->futex_phash_new;
> + if (fph) {
> + mm->futex_phash_new = NULL;
> + __futex_pivot_hash(mm, fph);
> + }
> + }
> +}
> +
> struct futex_private_hash *futex_private_hash(void)
> {
> struct mm_struct *mm = current->mm;
> - struct futex_private_hash *fph;
> + /*
> + * Ideally we don't loop. If there is a replacement in progress
> + * then a new private hash is already prepared and a reference can't be
> + * obtained once the last user dropped it's.
> + * In that case we block on mm_struct::futex_hash_lock and either have
> + * to perform the replacement or wait while someone else is doing the
> + * job. Eitherway, on the second iteration we acquire a reference on the
> + * new private hash or loop again because a new replacement has been
> + * requested.
> + */
> +again:
> + scoped_guard(rcu) {
> + struct futex_private_hash *fph;
>
> - fph = mm->futex_phash;
> - return fph;
> + fph = rcu_dereference(mm->futex_phash);
> + if (!fph)
> + return NULL;
> +
> + if (rcuref_get(&fph->users))
> + return fph;
> + }
> + futex_pivot_hash(mm);
> + goto again;
> }
>
> struct futex_hash_bucket *futex_hash(union futex_key *key)
> {
> + struct futex_private_hash *fph;
> struct futex_hash_bucket *hb;
>
> - hb = __futex_hash(key, NULL);
> - return hb;
> +again:
> + scoped_guard(rcu) {
> + hb = __futex_hash(key, NULL);
> + fph = hb->priv;
> +
> + if (!fph || futex_private_hash_get(fph))
> + return hb;
> + }
> + futex_pivot_hash(key->private.mm);
> + goto again;
> }
>
> #else /* !CONFIG_FUTEX_PRIVATE_HASH */
> @@ -664,6 +783,8 @@ int futex_unqueue(struct futex_q *q)
> spinlock_t *lock_ptr;
> int ret = 0;
>
> + /* RCU so lock_ptr is not going away during locking. */
> + guard(rcu)();
> /* In the common case we don't take the spinlock, which is nice. */
> retry:
> /*
> @@ -1065,6 +1186,10 @@ static void exit_pi_state_list(struct task_struct *curr)
> struct futex_pi_state *pi_state;
> union futex_key key = FUTEX_KEY_INIT;
>
> + /*
> + * The mutex mm_struct::futex_hash_lock might be acquired.
> + */
> + might_sleep();
> /*
> * Ensure the hash remains stable (no resize) during the while loop
> * below. The hb pointer is acquired under the pi_lock so we can't block
> @@ -1261,7 +1386,51 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
> #ifdef CONFIG_FUTEX_PRIVATE_HASH
> void futex_hash_free(struct mm_struct *mm)
> {
> - kvfree(mm->futex_phash);
> + struct futex_private_hash *fph;
> +
> + kvfree(mm->futex_phash_new);
> + fph = rcu_dereference_raw(mm->futex_phash);
> + if (fph) {
> + WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
> + kvfree(fph);
> + }
> +}
> +
> +static bool futex_pivot_pending(struct mm_struct *mm)
> +{
> + struct futex_private_hash *fph;
> +
> + guard(rcu)();
> +
> + if (!mm->futex_phash_new)
> + return true;
> +
> + fph = rcu_dereference(mm->futex_phash);
> + return rcuref_is_dead(&fph->users);
> +}
> +
> +static bool futex_hash_less(struct futex_private_hash *a,
> + struct futex_private_hash *b)
> +{
> + /* user provided always wins */
> + if (!a->custom && b->custom)
> + return true;
> + if (a->custom && !b->custom)
> + return false;
> +
> + /* zero-sized hash wins */
> + if (!b->hash_mask)
> + return true;
> + if (!a->hash_mask)
> + return false;
> +
> + /* keep the biggest */
> + if (a->hash_mask < b->hash_mask)
> + return true;
> + if (a->hash_mask > b->hash_mask)
> + return false;
> +
> + return false; /* equal */
> }
>
> static int futex_hash_allocate(unsigned int hash_slots, bool custom)
> @@ -1273,16 +1442,23 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
> if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
> return -EINVAL;
>
> - if (mm->futex_phash)
> - return -EALREADY;
> -
> - if (!thread_group_empty(current))
> - return -EINVAL;
> + /*
> + * Once we've disabled the global hash there is no way back.
> + */
> + scoped_guard(rcu) {
> + fph = rcu_dereference(mm->futex_phash);
> + if (fph && !fph->hash_mask) {
> + if (custom)
> + return -EBUSY;
> + return 0;
> + }
> + }
>
> fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
> if (!fph)
> return -ENOMEM;
>
> + rcuref_init(&fph->users, 1);
> fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
> fph->custom = custom;
> fph->mm = mm;
> @@ -1290,26 +1466,102 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
> for (i = 0; i < hash_slots; i++)
> futex_hash_bucket_init(&fph->queues[i], fph);
>
> - mm->futex_phash = fph;
> + if (custom) {
> + /*
> + * Only let prctl() wait / retry; don't unduly delay clone().
> + */
> +again:
> + wait_var_event(mm, futex_pivot_pending(mm));
> + }
> +
> + scoped_guard(mutex, &mm->futex_hash_lock) {
> + struct futex_private_hash *free __free(kvfree) = NULL;
> + struct futex_private_hash *cur, *new;
> +
> + cur = rcu_dereference_protected(mm->futex_phash,
> + lockdep_is_held(&mm->futex_hash_lock));
> + new = mm->futex_phash_new;
> + mm->futex_phash_new = NULL;
> +
> + if (fph) {
> + if (cur && !new) {
> + /*
> + * If we have an existing hash, but do not yet have
> + * allocated a replacement hash, drop the initial
> + * reference on the existing hash.
> + */
> + futex_private_hash_put(cur);
> + }
> +
> + if (new) {
> + /*
> + * Two updates raced; throw out the lesser one.
> + */
> + if (futex_hash_less(new, fph)) {
> + free = new;
> + new = fph;
> + } else {
> + free = fph;
> + }
> + } else {
> + new = fph;
> + }
> + fph = NULL;
> + }
> +
> + if (new) {
> + /*
> + * Will set mm->futex_phash_new on failure;
> + * futex_private_hash_get() will try again.
> + */
> + if (!__futex_pivot_hash(mm, new) && custom)
> + goto again;
> + }
> + }
> return 0;
> }
>
> int futex_hash_allocate_default(void)
> {
> + unsigned int threads, buckets, current_buckets = 0;
> + struct futex_private_hash *fph;
> +
> if (!current->mm)
> return 0;
>
> - if (current->mm->futex_phash)
> + scoped_guard(rcu) {
> + threads = min_t(unsigned int,
> + get_nr_threads(current),
> + num_online_cpus());
> +
> + fph = rcu_dereference(current->mm->futex_phash);
> + if (fph) {
> + if (fph->custom)
> + return 0;
> +
> + current_buckets = fph->hash_mask + 1;
> + }
> + }
> +
> + /*
> + * The default allocation will remain within
> + * 16 <= threads * 4 <= global hash size
> + */
> + buckets = roundup_pow_of_two(4 * threads);
> + buckets = clamp(buckets, 16, futex_hashmask + 1);
> +
> + if (current_buckets >= buckets)
> return 0;
>
> - return futex_hash_allocate(16, false);
> + return futex_hash_allocate(buckets, false);
> }
>
> static int futex_hash_get_slots(void)
> {
> struct futex_private_hash *fph;
>
> - fph = current->mm->futex_phash;
> + guard(rcu)();
> + fph = rcu_dereference(current->mm->futex_phash);
> if (fph && fph->hash_mask)
> return fph->hash_mask + 1;
> return 0;
> diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
> index b0e64fd454d96..c716a66f86929 100644
> --- a/kernel/futex/requeue.c
> +++ b/kernel/futex/requeue.c
> @@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
> futex_hb_waiters_inc(hb2);
> plist_add(&q->list, &hb2->chain);
> q->lock_ptr = &hb2->lock;
> + /*
> + * hb1 and hb2 belong to the same futex_hash_bucket_private
> + * because if we managed get a reference on hb1 then it can't be
> + * replaced. Therefore we avoid put(hb1)+get(hb2) here.
> + */
> }
> q->key = *key2;
> }
> --
> 2.49.0
>