Message-ID: <b4cc39c7-7e52-f9eb-8103-4b7e55f474a6@bytedance.com>
Date:   Mon, 3 Apr 2023 19:10:40 +0800
From:   Peng Zhang <zhangpeng.00@...edance.com>
To:     Marco Elver <elver@...gle.com>
Cc:     glider@...gle.com, dvyukov@...gle.com, akpm@...ux-foundation.org,
        kasan-dev@...glegroups.com, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH] mm: kfence: Improve the performance of __kfence_alloc()
 and __kfence_free()


On 2023/4/3 17:21, Marco Elver wrote:
> On Mon, 3 Apr 2023 at 08:28, Peng Zhang <zhangpeng.00@...edance.com> wrote:
>> In __kfence_alloc() and __kfence_free(), we set and check the canary.
>> If the object size is close to 0, this requires nearly 4k memory
>> accesses, because the canary is set and checked byte by byte.
>>
>> The canary is currently defined like this:
>> KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
>>
>> Observe that the canary depends only on the lower three bits of the
>> address, so every aligned 8 bytes of canary are identical. We can
>> therefore access the canary 8 bytes at a time instead of byte by byte,
>> reducing the nearly 4k memory accesses to 4k/8.
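
As a sanity check, the 8-byte constant follows directly from the
per-byte pattern. A minimal userspace sketch (not part of the patch; it
assumes the little-endian in-memory layout seen by the u64 loads below):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t pattern = 0;
		int i;

		/* Byte i of an 8-byte-aligned block has (addr & 0x7) == i. */
		for (i = 0; i < 8; i++)
			pattern |= (uint64_t)(0xaa ^ i) << (8 * i);

		/* Same value as KFENCE_CANARY_PATTERN_U64 in this patch. */
		assert(pattern == (0xaaaaaaaaaaaaaaaaULL ^ 0x0706050403020100ULL));
		return 0;
	}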
>>
>> The bcc tool funclatency was used to measure the latency of
>> __kfence_alloc() and __kfence_free(); the numbers (with the latency
>> distribution omitted) are posted below. Though different object sizes
>> affect the measurement, we ignore that for now and assume the average
>> object sizes are roughly equal.
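
(For reference: bcc's funclatency attaches a kprobe/kretprobe to the
given kernel function and prints a latency histogram plus the
avg/total/count summary quoted below. An invocation along the lines of

	sudo /usr/share/bcc/tools/funclatency -d 60 __kfence_alloc

would produce it; the install path and duration here are illustrative,
not necessarily what was used.)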
>>
>> Before applying the patch:
>> __kfence_alloc:
>> avg = 5055 nsecs, total: 5515252 nsecs, count: 1091
>> __kfence_free:
>> avg = 5319 nsecs, total: 9735130 nsecs, count: 1830
>>
>> After applying the patch:
>> __kfence_alloc:
>> avg = 3597 nsecs, total: 6428491 nsecs, count: 1787
>> __kfence_free:
>> avg = 3046 nsecs, total: 3415390 nsecs, count: 1121
> Seems like a nice improvement!
>
>> The numbers indicate a performance improvement of roughly 30% to 40%.
>>
>> Signed-off-by: Peng Zhang <zhangpeng.00@...edance.com>
>> ---
>>   mm/kfence/core.c   | 71 +++++++++++++++++++++++++++++++++-------------
>>   mm/kfence/kfence.h | 10 ++++++-
>>   mm/kfence/report.c |  2 +-
>>   3 files changed, 62 insertions(+), 21 deletions(-)
>>
>> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
>> index 79c94ee55f97..0b1b1298c738 100644
>> --- a/mm/kfence/core.c
>> +++ b/mm/kfence/core.c
>> @@ -297,20 +297,13 @@ metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state nex
>>          WRITE_ONCE(meta->state, next);
>>   }
>>
>> -/* Write canary byte to @addr. */
>> -static inline bool set_canary_byte(u8 *addr)
>> -{
>> -       *addr = KFENCE_CANARY_PATTERN(addr);
>> -       return true;
>> -}
>> -
>>   /* Check canary byte at @addr. */
>>   static inline bool check_canary_byte(u8 *addr)
>>   {
>>          struct kfence_metadata *meta;
>>          unsigned long flags;
>>
>> -       if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
>> +       if (likely(*addr == KFENCE_CANARY_PATTERN_U8(addr)))
>>                  return true;
>>
>>          atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
>> @@ -323,11 +316,27 @@ static inline bool check_canary_byte(u8 *addr)
>>          return false;
>>   }
>>
>> -/* __always_inline this to ensure we won't do an indirect call to fn. */
>> -static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *))
>> +static inline void set_canary(const struct kfence_metadata *meta)
>>   {
>>          const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
>> -       unsigned long addr;
>> +       unsigned long addr = pageaddr;
>> +
>> +       /*
>> +        * The canary may be written to part of the object memory, but it does
>> +        * not affect it. The user should initialize the object before using it.
>> +        */
>> +       for (; addr < meta->addr; addr += sizeof(u64))
>> +               *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64;
>> +
>> +       addr = ALIGN_DOWN(meta->addr + meta->size, sizeof(u64));
>> +       for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64))
>> +               *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64;
>> +}
>> +
>> +static inline void check_canary(const struct kfence_metadata *meta)
>> +{
>> +       const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
>> +       unsigned long addr = pageaddr;
>>
>>          /*
>>           * We'll iterate over each canary byte per-side until fn() returns
> This comment is now out-of-date ("fn" no longer exists).
>
>> @@ -339,14 +348,38 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
>>           */
>>
>>          /* Apply to left of object. */
>> -       for (addr = pageaddr; addr < meta->addr; addr++) {
>> -               if (!fn((u8 *)addr))
>> +       for (; meta->addr - addr >= sizeof(u64); addr += sizeof(u64)) {
>> +               if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64))
>>                          break;
>>          }
>>
>> -       /* Apply to right of object. */
>> -       for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) {
>> -               if (!fn((u8 *)addr))
>> +       /*
>> +        * If the canary is damaged in a certain 64 bytes, or the canay memory
> "damaged" -> "corrupted"
> "canay" -> "canary"
>
>> +        * cannot be completely covered by multiple consecutive 64 bytes, it
>> +        * needs to be checked one by one.
>> +        */
>> +       for (; addr < meta->addr; addr++) {
>> +               if (unlikely(!check_canary_byte((u8 *)addr)))
>> +                       break;
>> +       }
>> +
>> +       /*
>> +        * Apply to right of object.
>> +        * For easier implementation, check from high address to low address.
>> +        */
>> +       addr = pageaddr + PAGE_SIZE - sizeof(u64);
>> +       for (; addr >= meta->addr + meta->size ; addr -= sizeof(u64)) {
>> +               if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64))
>> +                       break;
>> +       }
>> +
>> +       /*
>> +        * Same as above, checking byte by byte, but here is the reverse of
>> +        * the above.
>> +        */
>> +       addr = addr + sizeof(u64) - 1;
>> +       for (; addr >= meta->addr + meta->size; addr--) {
> The re-checking should forward-check i.e. not in reverse, otherwise
> the report might not include some corrupted bytes that had in the
> previous version been included. I think you need to check from low to
> high address to start with above.

Yes, it's better to forward-check, to avoid losing the corrupted bytes
that are used in the report.
I will include all your suggestions in the next version of the patch.
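
For the right side, something along these lines could work (untested
sketch, replacing the two reverse loops in check_canary() above):

	/* Apply to right of object: unaligned start, byte by byte. */
	for (addr = meta->addr + meta->size; addr % sizeof(u64) != 0; addr++) {
		if (unlikely(!check_canary_byte((u8 *)addr)))
			return;
	}

	/* Then word by word, low to high. */
	for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) {
		if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) {
			/*
			 * Corrupted word: re-check it byte by byte from the
			 * low end, so the report keeps the first corrupted
			 * bytes.
			 */
			for (; addr - pageaddr < PAGE_SIZE; addr++) {
				if (unlikely(!check_canary_byte((u8 *)addr)))
					return;
			}
		}
	}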
Thanks.

>
>> +               if (unlikely(!check_canary_byte((u8 *)addr)))
>>                          break;
>>          }
>>   }
>> @@ -434,7 +467,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>>   #endif
>>
>>          /* Memory initialization. */
>> -       for_each_canary(meta, set_canary_byte);
>> +       set_canary(meta);
>>
>>          /*
>>           * We check slab_want_init_on_alloc() ourselves, rather than letting
>> @@ -495,7 +528,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
>>          alloc_covered_add(meta->alloc_stack_hash, -1);
>>
>>          /* Check canary bytes for memory corruption. */
>> -       for_each_canary(meta, check_canary_byte);
>> +       check_canary(meta);
>>
>>          /*
>>           * Clear memory if init-on-free is set. While we protect the page, the
>> @@ -751,7 +784,7 @@ static void kfence_check_all_canary(void)
>>                  struct kfence_metadata *meta = &kfence_metadata[i];
>>
>>                  if (meta->state == KFENCE_OBJECT_ALLOCATED)
>> -                       for_each_canary(meta, check_canary_byte);
>> +                       check_canary(meta);
>>          }
>>   }
>>
>> diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
>> index 600f2e2431d6..2aafc46a4aaf 100644
>> --- a/mm/kfence/kfence.h
>> +++ b/mm/kfence/kfence.h
>> @@ -21,7 +21,15 @@
>>    * lower 3 bits of the address, to detect memory corruptions with higher
>>    * probability, where similar constants are used.
>>    */
>> -#define KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
>> +#define KFENCE_CANARY_PATTERN_U8(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
>> +
>> +/*
>> + * Define a continuous 8-byte canary starting from a multiple of 8. The canary
>> + * of each byte is only related to the lowest three bits of its address, so the
>> + * canary of every 8 bytes is the same. 64-bit memory can be filled and checked
>> + * at a time instead of byte by byte to improve performance.
>> + */
>> +#define KFENCE_CANARY_PATTERN_U64 ((u64)0xaaaaaaaaaaaaaaaa ^ (u64)(0x0706050403020100))
>>
>>   /* Maximum stack depth for reports. */
>>   #define KFENCE_STACK_DEPTH 64
>> diff --git a/mm/kfence/report.c b/mm/kfence/report.c
>> index 60205f1257ef..197430a5be4a 100644
>> --- a/mm/kfence/report.c
>> +++ b/mm/kfence/report.c
>> @@ -168,7 +168,7 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show,
>>
>>          pr_cont("[");
>>          for (cur = (const u8 *)address; cur < end; cur++) {
>> -               if (*cur == KFENCE_CANARY_PATTERN(cur))
>> +               if (*cur == KFENCE_CANARY_PATTERN_U8(cur))
>>                          pr_cont(" .");
>>                  else if (no_hash_pointers)
>>                          pr_cont(" 0x%02x", *cur);
>> --
>> 2.20.1
>>
