linux-kernel - Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <76e15f10-d3f1-2cab-63cb-25aa3b4f2cd4@bytedance.com>
Date:   Thu, 13 Apr 2023 00:44:42 +0800
From:   Qi Zheng <zhengqi.arch@...edance.com>
To:     Peter Zijlstra <peterz@...radead.org>
Cc:     Vlastimil Babka <vbabka@...e.cz>,
        "Zhang, Qiang1" <qiang1.zhang@...el.com>,
        Boqun Feng <boqun.feng@...il.com>,
        Qi Zheng <zhengqi.arch@...edance.com>,
        "42.hyeyoo@...il.com" <42.hyeyoo@...il.com>,
        "akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
        "roman.gushchin@...ux.dev" <roman.gushchin@...ux.dev>,
        "iamjoonsoo.kim@....com" <iamjoonsoo.kim@....com>,
        "rientjes@...gle.com" <rientjes@...gle.com>,
        "penberg@...nel.org" <penberg@...nel.org>,
        "cl@...ux.com" <cl@...ux.com>,
        "linux-mm@...ck.org" <linux-mm@...ck.org>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Zhao Gongyi <zhaogongyi@...edance.com>,
        Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
        Thomas Gleixner <tglx@...utronix.de>,
        RCU <rcu@...r.kernel.org>,
        "Paul E . McKenney" <paulmck@...nel.org>
Subject: Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as
 raw_spinlock



On 2023/4/12 20:47, Peter Zijlstra wrote:
> On Wed, Apr 12, 2023 at 08:50:29AM +0200, Vlastimil Babka wrote:
> 
>>> --- a/lib/debugobjects.c
>>> +++ b/lib/debugobjects.c
>>> @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
>>>          unsigned long flags;
>>>
>>>          /*
>>> -        * On RT enabled kernels the pool refill must happen in preemptible
>>> +        * The pool refill must happen in preemptible
>>>           * context:
>>>           */
>>> -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
>>> +       if (preemptible())
>>>                  fill_pool();
>>
>> +CC Peterz
>>
>> Aha so this is in fact another case where the code is written with
>> actual differences between PREEMPT_RT and !PREEMPT_RT in mind, but
>> CONFIG_PROVE_RAW_LOCK_NESTING always assumes PREEMPT_RT?
> 
> Ooh, tricky, yes. PROVE_RAW_LOCK_NESTING always follows the PREEMPT_RT
> rules and does not expect trickery like the above.
> 
> Something like the completely untested below might be of help..
> 
> ---
> diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
> index d22430840b53..f3120d6a7d9e 100644
> --- a/include/linux/lockdep_types.h
> +++ b/include/linux/lockdep_types.h
> @@ -33,6 +33,7 @@ enum lockdep_wait_type {
>   enum lockdep_lock_type {
>   	LD_LOCK_NORMAL = 0,	/* normal, catch all */
>   	LD_LOCK_PERCPU,		/* percpu */
> +	LD_LOCK_WAIT,		/* annotation */
>   	LD_LOCK_MAX,
>   };
>   
> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> index 50d4863974e7..a4077f5bb75b 100644
> --- a/kernel/locking/lockdep.c
> +++ b/kernel/locking/lockdep.c
> @@ -2279,8 +2279,9 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
>   	 * As a result, we will skip local_lock(), when we search for irq
>   	 * inversion bugs.
>   	 */
> -	if (entry->class->lock_type == LD_LOCK_PERCPU) {
> -		if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
> +	if (entry->class->lock_type != LD_LOCK_NORMAL) {
> +		if (entry->class->lock_type == LD_LOCK_PERCPU &&
> +		    DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
>   			return false;
>   
>   		return true;
> @@ -4752,7 +4753,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
>   
>   	for (; depth < curr->lockdep_depth; depth++) {
>   		struct held_lock *prev = curr->held_locks + depth;
> -		u8 prev_inner = hlock_class(prev)->wait_type_inner;
> +		struct lock_class *class = hlock_class(prev);
> +		u8 prev_inner = class->wait_type_inner;
>   
>   		if (prev_inner) {
>   			/*
> @@ -4762,6 +4764,12 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
>   			 * Also due to trylocks.
>   			 */
>   			curr_inner = min(curr_inner, prev_inner);
> +
> +			/*
> +			 * Allow override for annotations.
> +			 */
> +			if (unlikely(class->lock_type == LD_LOCK_WAIT))
> +				curr_inner = prev_inner;
>   		}
>   	}
>   
> diff --git a/lib/debugobjects.c b/lib/debugobjects.c
> index df86e649d8be..fae71ef72a16 100644
> --- a/lib/debugobjects.c
> +++ b/lib/debugobjects.c
> @@ -565,8 +565,16 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
>   	 * On RT enabled kernels the pool refill must happen in preemptible
>   	 * context:
>   	 */
> -	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
> +		static lockdep_map dep_map = {

                 static struct lockdep_map dep_map = {

> +			.name = "wait-type-override",
> +			.wait_type_inner = LD_WAIT_SLEEP,
> +			.lock_type = LD_LOCK_WAIT,
> +		};
> +		lock_map_acquire(&dep_map);
>   		fill_pool();
> +		lock_map_release(&dep_map);
> +	}
>   
>   	db = get_bucket((unsigned long) addr);
>   

I just tested the above code, and then got the following
warning:

[    0.001000][    T0] =============================
[    0.001000][    T0] [ BUG: Invalid wait context ]
[    0.001000][    T0] 6.3.0-rc6-next-20230412+ #21 Not tainted
[    0.001000][    T0] -----------------------------
[    0.001000][    T0] swapper/0/0 is trying to lock:
[    0.001000][    T0] ffffffff825bcb80 (wait-type-override){....}-{4:4}, at: __debug_object_init+0x0/0x590
[    0.001000][    T0] other info that might help us debug this:
[    0.001000][    T0] context-{5:5}
[    0.001000][    T0] 2 locks held by swapper/0/0:
[    0.001000][    T0]  #0: ffffffff824f5178 (timekeeper_lock){....}-{2:2}, at: timekeeping_init+0xf1/0x270
[    0.001000][    T0]  #1: ffffffff824f5008 (tk_core.seq.seqcount){....}-{0:0}, at: start_kernel+0x31a/0x800
[    0.001000][    T0] stack backtrace:
[    0.001000][    T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230412+ #21
[    0.001000][    T0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
[    0.001000][    T0] Call Trace:
[    0.001000][    T0]  <TASK>
[    0.001000][    T0]  dump_stack_lvl+0x77/0xc0
[    0.001000][    T0]  __lock_acquire+0xa74/0x2960
[    0.001000][    T0]  ? save_trace+0x3f/0x320
[    0.001000][    T0]  ? add_lock_to_list+0x97/0x130
[    0.001000][    T0]  lock_acquire+0xe0/0x300
[    0.001000][    T0]  ? debug_object_active_state+0x180/0x180
[    0.001000][    T0]  __debug_object_init+0x47/0x590
[    0.001000][    T0]  ? debug_object_active_state+0x180/0x180
[    0.001000][    T0]  ? lock_acquire+0x100/0x300
[    0.001000][    T0]  hrtimer_init+0x23/0xc0
[    0.001000][    T0]  ntp_init+0x70/0x80
[    0.001000][    T0]  timekeeping_init+0x12c/0x270
[    0.001000][    T0]  ? start_kernel+0x31a/0x800
[    0.001000][    T0]  ? _printk+0x5c/0x80
[    0.001000][    T0]  start_kernel+0x31a/0x800
[    0.001000][    T0]  secondary_startup_64_no_verify+0xf4/0xfb
[    0.001000][    T0]  </TASK>

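It seems that the LD_WAIT_SLEEP we set for the annotation (shown as {4:4}
above) is already greater than the LD_WAIT_SPIN of the current context
(the held timekeeper_lock, shown as {2:2}), so acquiring the annotation
itself triggers the warning.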

-- 
Thanks,
Qi