[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <764628ca-e047-4309-8b51-0a740af5cc68@huawei.com>
Date: Fri, 19 Sep 2025 09:11:46 +0800
From: Li Lingfeng <lilingfeng3@...wei.com>
To: NeilBrown <neil@...wn.name>,
<34bd5595-8f3f-4c52-a1d5-d782fc99efb9@...wei.com>
CC: Dai Ngo <Dai.Ngo@...cle.com>, Chuck Lever <chuck.lever@...cle.com>, Jeff
Layton <jlayton@...nel.org>, Olga Kornievskaia <okorniev@...hat.com>, Tom
Talpey <tom@...pey.com>, "linux-kernel@...r.kernel.org"
<linux-kernel@...r.kernel.org>, "linux-nfs@...r.kernel.org"
<linux-nfs@...r.kernel.org>, yangerkun <yangerkun@...wei.com>, "zhangyi (F)"
<yi.zhang@...wei.com>, Hou Tao <houtao1@...wei.com>,
"chengzhihao1@...wei.com" <chengzhihao1@...wei.com>, "yukuai (C)"
<yukuai3@...wei.com>, <leo.lilong@...wei.com>
Subject: Re: [Question] nfsd: possible reordering between nf->nf_file
assignment and NFSD_FILE_PENDING clearing?
在 2025/9/19 6:59, NeilBrown 写道:
> On Fri, 19 Sep 2025, 34bd5595-8f3f-4c52-a1d5-d782fc99efb9@...wei.com wrote:
>> Hi,
> You probably need to backport
>
> Commit: 8238b4579866 ("wait_on_bit: add an acquire memory barrier")
>
> NeilBrown
>
Thank you for the reply, we will try it.
Thanks,
Lingfeng
>> On 2025/9/18 21:57, Li Lingfeng wrote:
>>> Recently, we encountered a null pointer dereference on a relatively old
>>> 5.10 kernel that does not include commit c4c649ab413ba ("NFSD: Convert
>>> filecache to rhltable"), which exhibited the same behavior as described
>>> in [1]. I was wondering if it might be caused by the reordering between
>>> the assignment of nf->nf_file and the clearing of NFSD_FILE_PENDING.
>>>
>>> Just to mention, I don't believe the analysis in [1] is entirely accurate,
>>> since hlist_add_head_rcu includes a write barrier.
>>>
>>> We haven't encountered this issue on newer kernel versions, but the
>>> assignment of nf->nf_file and the clearing of NFSD_FILE_PENDING appear
>>> consistent across different versions.
>>>
>>> Our expected outcome should be like this:
>>> T1 T2
>>> nfsd_read
>>> nfsd_file_acquire_gc
>>> nfsd_file_do_acquire
>>> nfsd_file_lookup_locked
>>> // get nfsd_file from nfsd_file_rhltable
>>> nfsd_read
>>> nfsd_file_acquire_gc
>>> nfsd_file_do_acquire
>>> nfsd_file_alloc
>>> nf->nf_flags // set NFSD_FILE_PENDING
>>> rhltable_insert // insert to nfsd_file_rhltable
>>> nf->nf_file = file // set nf_file
>>> wait_on_bit
>>> // wait NFSD_FILE_PENDING to be cleared
>>> clear_and_wake_up_bit // clear NFSD_FILE_PENDING
>>> // get file after being awakened
>>> file = nf->nf_file
>>>
>>> Or like this:
>>> T1 T2
>>> nfsd_read
>>> nfsd_file_acquire_gc
>>> nfsd_file_do_acquire
>>> nfsd_file_lookup_locked
>>> // get nfsd_file from nfsd_file_rhltable
>>> nfsd_read
>>> nfsd_file_acquire_gc
>>> nfsd_file_do_acquire
>>> nfsd_file_alloc
>>> nf->nf_flags // set NFSD_FILE_PENDING
>>> rhltable_insert // insert to nfsd_file_rhltable
>>> nf->nf_file = file // set nf_file
>>> clear_and_wake_up_bit // clear NFSD_FILE_PENDING
>>> // get file directly
>>> file = nf->nf_file
>>>
>>> But is it possible that due to reordering, it ends up like this:
>>> T1 T2
>>> nfsd_read
>>> nfsd_file_acquire_gc
>>> nfsd_file_do_acquire
>>> nfsd_file_lookup_locked
>>> // get nfsd_file from nfsd_file_rhltable
>>> nfsd_read
>>> nfsd_file_acquire_gc
>>> nfsd_file_do_acquire
>>> nfsd_file_alloc
>>> nf->nf_flags // set NFSD_FILE_PENDING
>>> rhltable_insert // insert to nfsd_file_rhltable
>>> clear_and_wake_up_bit // clear NFSD_FILE_PENDING
>>> // get file directly
>>> file = nf->nf_file
>>> nf->nf_file = file // set nf_file
>>> // Null dereference due to uninitialized file pointer.
>>>
>>> [1]: https://lore.kernel.org/all/20230818065507.1280625-1-haydenw.kernel@gmail.com/
>>>
>>> Any suggestion will be appreciated.
>>>
>>> Thanks,
>>> Lingfeng.
>>>
>> I would like to provide a reproducible test case, though it might not be
>> entirely precise.
>>
>> The test case mimics the nfsd_file_acquire workflow and consists of three
>> threads: main thread, thread1, and thread2, where:
>>
>> * Thread1 acts as the writer, simulating the open_file workflow.
>> * Thread2 acts as the reader, simulating the wait_for_construction workflow.
>> * The main thread runs multiple iterations to ensure that thread1 and thread2
>> can execute concurrently in each round.
>>
>> The test case is as follows:
>>
>>
>> // writer
>> static int thread_func1(void *data)
>> {
>> struct foo *nf;
>> void *file;
>>
>> nf = &global_nf;
>>
>> while (!kthread_should_stop()) {
>> wait_for_completion(&comp_start1);
>> if (kthread_should_stop()) break;
>>
>> /* Simulate the open_file process in nfsd_file_acquire() */
>> __set_bit(FOO_PENDING, &nf->nf_flags);
>> hlist_add_head_rcu(&nf->nf_node, &foo_hashtbl[ghashval].nfb_head);
>> file = foo_filp_open();
>>
>> /* Test whether the following two lines of code will cause memory reordering */
>> nf->nf_file = file;
>> clear_bit_unlock(FOO_PENDING, &nf->nf_flags);
>>
>> smp_mb__after_atomic();
>> wake_up_bit(&nf->nf_flags, FOO_PENDING);
>>
>> complete(&comp_end1);
>> }
>> if (file)
>> kfree(file);
>> pr_info("thread_func1: exit\n");
>> return 0;
>> }
>>
>> // reader
>> static int thread_func2(void *data)
>> {
>> void *file;
>> struct foo *nf;
>>
>> nf = &global_nf;
>>
>> while (!kthread_should_stop()) {
>> wait_for_completion(&comp_start2);
>> if (kthread_should_stop()) break;
>>
>> /* Simulate the wait_for_construction process in nfsd_file_acquire() */
>> retry:
>> rcu_read_lock();
>> nf = foo_find_locked(ghashval);
>> rcu_read_unlock();
>> if (!nf)
>> goto retry;
>>
>> wait_on_bit(&nf->nf_flags, FOO_PENDING, TASK_UNINTERRUPTIBLE);
>> file = nf->nf_file;
>> if (!file)
>> WARN_ON(1);
>> else
>> kfree(file);
>>
>> complete(&comp_end2);
>> }
>> pr_info("thread_func2: exit\n");
>> return 0;
>> }
>>
>> static int main_thread_func(void *data)
>> {
>> u64 iters = 0;
>>
>> while (!kthread_should_stop()) {
>> iters++;
>> if (iters % 1000000 == 0)
>> pr_info("main_thread_func: started %llu iterations\n", iters);
>>
>> /* Start both threads */
>> complete(&comp_start1);
>> complete(&comp_start2);
>> /* wait for both to finish */
>> wait_for_completion(&comp_end1);
>> wait_for_completion(&comp_end2);
>>
>> /* Reset completions */
>> reinit_completion(&comp_end1);
>> reinit_completion(&comp_end2);
>> reinit_completion(&comp_start1);
>> reinit_completion(&comp_start2);
>>
>> hlist_del_rcu(&global_nf.nf_node);
>> global_nf.nf_file = 0;
>> global_nf.nf_flags = 0;
>> }
>>
>> pr_info("main_thread_func: exit\n");
>>
>> return 0;
>> }
>>
>>
>> I compiled and executed this test case on ARM64. The experimental results show that
>> after approximately 6,000,000 rounds, the "file is null" warning in thread2 was
>> triggered, indicating that reordering occurred between the file assignment and flag
>> clearance operations in thread1.
>>
>>
>> [107632.795543] My module is being loaded
>> [107632.800255] Threads started successfully
>> [107637.656469] main_thread_func: started 1000000 iterations
>> [107642.520876] main_thread_func: started 2000000 iterations
>> [107646.919550] main_thread_func: started 3000000 iterations
>> [107651.545742] main_thread_func: started 4000000 iterations
>> [107655.577054] main_thread_func: started 5000000 iterations
>> [107660.507772] main_thread_func: started 6000000 iterations
>> [107663.212711] ------------[ cut here ]------------
>> [107663.218265] WARNING: CPU: 26 PID: 10603 at /path/to/nfsd/mod3/order_test.c:142 thread_func2+0xa0/0xe0 [order_test]
>>
>>
>> When I placed an smp_mb() between 'wait_on_bit' and 'file = nf->nf_file' in thread2,
>> the warning *no longer* occurred.
>>
>> wait_on_bit(&nf->nf_flags, FOO_PENDING, TASK_UNINTERRUPTIBLE);
>> + smp_mb();
>> file = nf->nf_file;
>>
>> I hope this test case proves helpful for investigating the aforementioned memory
>> reordering issue.
>>
>> Best regards,
>> Tengda
>>
>>
Powered by blists - more mailing lists