Message-ID: <162e6281-8828-e0bc-2b91-183b7f3a1f65@bytedance.com>
Date: Thu, 13 Apr 2023 22:49:41 +0800
From: Qi Zheng <zhengqi.arch@...edance.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Vlastimil Babka <vbabka@...e.cz>,
"Zhang, Qiang1" <qiang1.zhang@...el.com>,
Boqun Feng <boqun.feng@...il.com>,
"42.hyeyoo@...il.com" <42.hyeyoo@...il.com>,
"akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
"roman.gushchin@...ux.dev" <roman.gushchin@...ux.dev>,
"iamjoonsoo.kim@....com" <iamjoonsoo.kim@....com>,
"rientjes@...gle.com" <rientjes@...gle.com>,
"penberg@...nel.org" <penberg@...nel.org>,
"cl@...ux.com" <cl@...ux.com>,
"linux-mm@...ck.org" <linux-mm@...ck.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
Zhao Gongyi <zhaogongyi@...edance.com>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Thomas Gleixner <tglx@...utronix.de>,
RCU <rcu@...r.kernel.org>,
"Paul E . McKenney" <paulmck@...nel.org>
Subject: Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as
raw_spinlock
On 2023/4/13 00:44, Qi Zheng wrote:
>
>
> On 2023/4/12 20:47, Peter Zijlstra wrote:
>> On Wed, Apr 12, 2023 at 08:50:29AM +0200, Vlastimil Babka wrote:
>>
>>>> --- a/lib/debugobjects.c
>>>> +++ b/lib/debugobjects.c
>>>> @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
>>>> unsigned long flags;
>>>>
>>>> /*
>>>> - * On RT enabled kernels the pool refill must happen in preemptible
>>>> + * The pool refill must happen in preemptible
>>>> * context:
>>>> */
>>>> - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
>>>> + if (preemptible())
>>>> fill_pool();
>>>
>>> +CC Peterz
>>>
>>> Aha so this is in fact another case where the code is written with
>>> actual differences between PREEMPT_RT and !PREEMPT_RT in mind, but
>>> CONFIG_PROVE_RAW_LOCK_NESTING always assumes PREEMPT_RT?
>>
>> Ooh, tricky, yes. PROVE_RAW_LOCK_NESTING always follows the PREEMPT_RT
>> rules and does not expect trickery like the above.
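
For reference, the wait types being compared here are ordered roughly as
below (paraphrased from include/linux/lockdep_types.h, just as a sketch):

enum lockdep_wait_type {
	LD_WAIT_INV = 0,	/* not checked, catch all */
	LD_WAIT_FREE,		/* wait free, rcu etc.. */
	LD_WAIT_SPIN,		/* spin loops, raw_spinlock_t etc.. */
#ifdef CONFIG_PROVE_RAW_LOCK_NESTING
	LD_WAIT_CONFIG,		/* preemptible in PREEMPT_RT, spinlock_t etc.. */
#else
	LD_WAIT_CONFIG = LD_WAIT_SPIN,
#endif
	LD_WAIT_SLEEP,		/* sleeping locks, mutex_t etc.. */
	LD_WAIT_MAX,		/* must be last */
};

With PROVE_RAW_LOCK_NESTING=y, LD_WAIT_CONFIG stays distinct from
LD_WAIT_SPIN, i.e. spinlock_t is checked as the sleeping lock it becomes
on PREEMPT_RT, so the runtime !IS_ENABLED(CONFIG_PREEMPT_RT) branch above
is invisible to the checker.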
>>
>> Something like the completely untested below might be of help..
>>
>> ---
>> diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
>> index d22430840b53..f3120d6a7d9e 100644
>> --- a/include/linux/lockdep_types.h
>> +++ b/include/linux/lockdep_types.h
>> @@ -33,6 +33,7 @@ enum lockdep_wait_type {
>> enum lockdep_lock_type {
>> LD_LOCK_NORMAL = 0, /* normal, catch all */
>> LD_LOCK_PERCPU, /* percpu */
>> + LD_LOCK_WAIT, /* annotation */
>> LD_LOCK_MAX,
>> };
>> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
>> index 50d4863974e7..a4077f5bb75b 100644
>> --- a/kernel/locking/lockdep.c
>> +++ b/kernel/locking/lockdep.c
>> @@ -2279,8 +2279,9 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
>> * As a result, we will skip local_lock(), when we search for irq
>> * inversion bugs.
>> */
>> - if (entry->class->lock_type == LD_LOCK_PERCPU) {
>> - if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
>> + if (entry->class->lock_type != LD_LOCK_NORMAL) {
>> + if (entry->class->lock_type == LD_LOCK_PERCPU &&
>> + DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
>> return false;
>> return true;
>> @@ -4752,7 +4753,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
>> for (; depth < curr->lockdep_depth; depth++) {
>> struct held_lock *prev = curr->held_locks + depth;
>> - u8 prev_inner = hlock_class(prev)->wait_type_inner;
>> + struct lock_class *class = hlock_class(prev);
>> + u8 prev_inner = class->wait_type_inner;
>> if (prev_inner) {
>> /*
>> @@ -4762,6 +4764,12 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
>> * Also due to trylocks.
>> */
>> curr_inner = min(curr_inner, prev_inner);
>> +
>> + /*
>> + * Allow override for annotations.
>> + */
>> + if (unlikely(class->lock_type == LD_LOCK_WAIT))
>> + curr_inner = prev_inner;
>> }
>> }
>> diff --git a/lib/debugobjects.c b/lib/debugobjects.c
>> index df86e649d8be..fae71ef72a16 100644
>> --- a/lib/debugobjects.c
>> +++ b/lib/debugobjects.c
>> @@ -565,8 +565,16 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
>> * On RT enabled kernels the pool refill must happen in preemptible
>> * context:
>> */
>> - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
>> + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
>> + static lockdep_map dep_map = {
>
> static struct lockdep_map dep_map = {
>
>> + .name = "wait-type-override",
>> + .wait_type_inner = LD_WAIT_SLEEP,
>> + .lock_type = LD_LOCK_WAIT,
>> + };
>> + lock_map_acquire(&dep_map);
>> fill_pool();
>> + lock_map_release(&dep_map);
>> + }
>> db = get_bucket((unsigned long) addr);
>
> I just tested the above code, and then got the following
> warning:
>
>
> It seems that the LD_WAIT_SLEEP we set is already greater than the
> LD_WAIT_SPIN of the current context.
>
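
For context, the comparison that fires appears to be the final check in
check_wait_context() (a simplified sketch of the existing code in
kernel/locking/lockdep.c, not something added by the patch below):

	/*
	 * A held raw_spinlock_t has clamped curr_inner to LD_WAIT_SPIN,
	 * while the dep_map annotation declares LD_WAIT_SLEEP as its
	 * inner wait type:
	 */
	if (next_outer > curr_inner)	/* LD_WAIT_SLEEP > LD_WAIT_SPIN */
		return print_lock_invalid_wait_context(curr, next);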
Can we do something like the following? It resolves the warning I encountered.
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index d22430840b53..f3120d6a7d9e 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -33,6 +33,7 @@ enum lockdep_wait_type {
enum lockdep_lock_type {
LD_LOCK_NORMAL = 0, /* normal, catch all */
LD_LOCK_PERCPU, /* percpu */
+ LD_LOCK_WAIT, /* annotation */
LD_LOCK_MAX,
};
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index dcd1d5bfc1e0..6859dba8a3aa 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2289,8 +2289,9 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
* As a result, we will skip local_lock(), when we search for irq
* inversion bugs.
*/
- if (entry->class->lock_type == LD_LOCK_PERCPU) {
- if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ if (entry->class->lock_type != LD_LOCK_NORMAL) {
+ if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
return false;
return true;
@@ -3981,6 +3982,9 @@ static inline int
valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
+ if (unlikely(hlock_class(this)->lock_type == LD_LOCK_WAIT))
+ return 1;
+
if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
graph_unlock();
print_usage_bug(curr, this, bad_bit, new_bit);
@@ -4768,7 +4772,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- u8 prev_inner = hlock_class(prev)->wait_type_inner;
+ struct lock_class *class = hlock_class(prev);
+ u8 prev_inner = class->wait_type_inner;
if (prev_inner) {
/*
@@ -4778,9 +4783,19 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
* Also due to trylocks.
*/
curr_inner = min(curr_inner, prev_inner);
+
+ /*
+ * Allow override for annotations.
+ */
+ if (unlikely(class->lock_type == LD_LOCK_WAIT))
+ curr_inner = prev_inner;
}
}
+ if (unlikely(hlock_class(next)->lock_type == LD_LOCK_WAIT &&
+ curr_inner == LD_WAIT_SPIN))
+ curr_inner = LD_WAIT_CONFIG;
+
if (next_outer > curr_inner)
return print_lock_invalid_wait_context(curr, next);
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index df86e649d8be..a8a69991b0d0 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -565,8 +565,16 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
* On RT enabled kernels the pool refill must happen in preemptible
* context:
*/
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+ static struct lockdep_map dep_map = {
+ .name = "wait-type-override",
+ .wait_type_inner = LD_WAIT_CONFIG,
+ .lock_type = LD_LOCK_WAIT,
+ };
+ lock_map_acquire(&dep_map);
fill_pool();
+ lock_map_release(&dep_map);
+ }
db = get_bucket((unsigned long) addr);
--
Thanks,
Qi