lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAP-5=fUeBwzSXqBkSeC3Y9WeH3KsEYM+2o7U7eAVwSodrmt__g@mail.gmail.com>
Date:   Tue, 24 Oct 2023 21:15:25 -0700
From:   Ian Rogers <irogers@...gle.com>
To:     Namhyung Kim <namhyung@...nel.org>
Cc:     Arnaldo Carvalho de Melo <acme@...nel.org>,
        Jiri Olsa <jolsa@...nel.org>,
        Adrian Hunter <adrian.hunter@...el.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...nel.org>,
        LKML <linux-kernel@...r.kernel.org>,
        linux-perf-users@...r.kernel.org, Song Liu <song@...nel.org>,
        Hao Luo <haoluo@...gle.com>, bpf@...r.kernel.org
Subject: Re: [PATCH v3 3/3] perf lock contention: Use per-cpu array map for spinlocks

On Fri, Oct 20, 2023 at 1:47 PM Namhyung Kim <namhyung@...nel.org> wrote:
>
> Currently lock contention timestamp is maintained in a hash map keyed by
> pid.  That means it needs to get and release a map element (which is
> protected by spinlock!) on each contention begin and end pair.  This
> can impact performance if there is a lot of contention (usually from
> spinlocks).
>
> It used to go with task local storage but it had an issue on memory
> allocation in some critical paths.  Although it's addressed in recent
> kernels IIUC, the tool should support old kernels too.  So it cannot
> simply switch to the task local storage at least for now.
>
> As spinlocks create lots of contention and they disable preemption
> while spinning, it can use a per-cpu array to keep the timestamp to
> avoid overhead in hashmap update and delete.
>
> In contention_begin, it's easy to check the lock types since it can see
> the flags.  But contention_end cannot see it.  So let's try the per-cpu
> array first (unconditionally) if it has an active element (lock != 0).
> Then it should be used and the per-task tstamp map should not be used
> until the per-cpu array element is cleared, which means nested spinlock
> contention (if any) has finished and it now sees the (outer) lock.
>
> Signed-off-by: Namhyung Kim <namhyung@...nel.org>

Acked-by: Ian Rogers <irogers@...gle.com>

Thanks,
Ian

> ---
>  .../perf/util/bpf_skel/lock_contention.bpf.c  | 89 +++++++++++++++----
>  1 file changed, 72 insertions(+), 17 deletions(-)
>
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 69d31fd77cd0..95cd8414f6ef 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -42,6 +42,14 @@ struct {
>         __uint(max_entries, MAX_ENTRIES);
>  } tstamp SEC(".maps");
>
> +/* maintain per-CPU timestamp at the beginning of contention */
> +struct {
> +       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +       __uint(key_size, sizeof(__u32));
> +       __uint(value_size, sizeof(struct tstamp_data));
> +       __uint(max_entries, 1);
> +} tstamp_cpu SEC(".maps");
> +
>  /* actual lock contention statistics */
>  struct {
>         __uint(type, BPF_MAP_TYPE_HASH);
> @@ -311,34 +319,57 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
>         return 0;
>  }
>
> -SEC("tp_btf/contention_begin")
> -int contention_begin(u64 *ctx)
> +static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
>  {
>         __u32 pid;
>         struct tstamp_data *pelem;
>
> -       if (!enabled || !can_record(ctx))
> -               return 0;
> +       /* Use per-cpu array map for spinlock and rwlock */
> +       if (flags == (LCB_F_SPIN | LCB_F_READ) || flags == LCB_F_SPIN ||
> +           flags == (LCB_F_SPIN | LCB_F_WRITE)) {
> +               __u32 idx = 0;
> +
> +               pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
> +               /* Do not update the element for nested locks */
> +               if (pelem && pelem->lock)
> +                       pelem = NULL;
> +               return pelem;
> +       }
>
>         pid = bpf_get_current_pid_tgid();
>         pelem = bpf_map_lookup_elem(&tstamp, &pid);
> +       /* Do not update the element for nested locks */
>         if (pelem && pelem->lock)
> -               return 0;
> +               return NULL;
>
>         if (pelem == NULL) {
>                 struct tstamp_data zero = {};
>
>                 if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
>                         __sync_fetch_and_add(&task_fail, 1);
> -                       return 0;
> +                       return NULL;
>                 }
>
>                 pelem = bpf_map_lookup_elem(&tstamp, &pid);
>                 if (pelem == NULL) {
>                         __sync_fetch_and_add(&task_fail, 1);
> -                       return 0;
> +                       return NULL;
>                 }
>         }
> +       return pelem;
> +}
> +
> +SEC("tp_btf/contention_begin")
> +int contention_begin(u64 *ctx)
> +{
> +       struct tstamp_data *pelem;
> +
> +       if (!enabled || !can_record(ctx))
> +               return 0;
> +
> +       pelem = get_tstamp_elem(ctx[1]);
> +       if (pelem == NULL)
> +               return 0;
>
>         pelem->timestamp = bpf_ktime_get_ns();
>         pelem->lock = (__u64)ctx[0];
> @@ -377,24 +408,42 @@ int contention_begin(u64 *ctx)
>  SEC("tp_btf/contention_end")
>  int contention_end(u64 *ctx)
>  {
> -       __u32 pid;
> +       __u32 pid = 0, idx = 0;
>         struct tstamp_data *pelem;
>         struct contention_key key = {};
>         struct contention_data *data;
>         __u64 duration;
> +       bool need_delete = false;
>
>         if (!enabled)
>                 return 0;
>
> -       pid = bpf_get_current_pid_tgid();
> -       pelem = bpf_map_lookup_elem(&tstamp, &pid);
> -       if (!pelem || pelem->lock != ctx[0])
> -               return 0;
> +       /*
> +        * For spinlock and rwlock, it needs to get the timestamp for the
> +        * per-cpu map.  However, contention_end does not have the flags
> +        * so it cannot know whether it reads percpu or hash map.
> +        *
> +        * Try per-cpu map first and check if there's active contention.
> +        * If it is, do not read hash map because it cannot go to sleeping
> +        * locks before releasing the spinning locks.
> +        */
> +       pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
> +       if (pelem && pelem->lock) {
> +               if (pelem->lock != ctx[0])
> +                       return 0;
> +       } else {
> +               pid = bpf_get_current_pid_tgid();
> +               pelem = bpf_map_lookup_elem(&tstamp, &pid);
> +               if (!pelem || pelem->lock != ctx[0])
> +                       return 0;
> +               need_delete = true;
> +       }
>
>         duration = bpf_ktime_get_ns() - pelem->timestamp;
>         if ((__s64)duration < 0) {
>                 pelem->lock = 0;
> -               bpf_map_delete_elem(&tstamp, &pid);
> +               if (need_delete)
> +                       bpf_map_delete_elem(&tstamp, &pid);
>                 __sync_fetch_and_add(&time_fail, 1);
>                 return 0;
>         }
> @@ -406,8 +455,11 @@ int contention_end(u64 *ctx)
>         case LOCK_AGGR_TASK:
>                 if (lock_owner)
>                         key.pid = pelem->flags;
> -               else
> +               else {
> +                       if (!need_delete)
> +                               pid = bpf_get_current_pid_tgid();
>                         key.pid = pid;
> +               }
>                 if (needs_callstack)
>                         key.stack_id = pelem->stack_id;
>                 break;
> @@ -428,7 +480,8 @@ int contention_end(u64 *ctx)
>         if (!data) {
>                 if (data_map_full) {
>                         pelem->lock = 0;
> -                       bpf_map_delete_elem(&tstamp, &pid);
> +                       if (need_delete)
> +                               bpf_map_delete_elem(&tstamp, &pid);
>                         __sync_fetch_and_add(&data_fail, 1);
>                         return 0;
>                 }
> @@ -452,7 +505,8 @@ int contention_end(u64 *ctx)
>                         __sync_fetch_and_add(&data_fail, 1);
>                 }
>                 pelem->lock = 0;
> -               bpf_map_delete_elem(&tstamp, &pid);
> +               if (need_delete)
> +                       bpf_map_delete_elem(&tstamp, &pid);
>                 return 0;
>         }
>
> @@ -466,7 +520,8 @@ int contention_end(u64 *ctx)
>                 data->min_time = duration;
>
>         pelem->lock = 0;
> -       bpf_map_delete_elem(&tstamp, &pid);
> +       if (need_delete)
> +               bpf_map_delete_elem(&tstamp, &pid);
>         return 0;
>  }
>
> --
> 2.42.0.655.g421f12c284-goog
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ