Message-ID: <CAP-5=fUeBwzSXqBkSeC3Y9WeH3KsEYM+2o7U7eAVwSodrmt__g@mail.gmail.com>
Date: Tue, 24 Oct 2023 21:15:25 -0700
From: Ian Rogers <irogers@...gle.com>
To: Namhyung Kim <namhyung@...nel.org>
Cc: Arnaldo Carvalho de Melo <acme@...nel.org>,
Jiri Olsa <jolsa@...nel.org>,
Adrian Hunter <adrian.hunter@...el.com>,
Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...nel.org>,
LKML <linux-kernel@...r.kernel.org>,
linux-perf-users@...r.kernel.org, Song Liu <song@...nel.org>,
Hao Luo <haoluo@...gle.com>, bpf@...r.kernel.org
Subject: Re: [PATCH v3 3/3] perf lock contention: Use per-cpu array map for spinlocks
On Fri, Oct 20, 2023 at 1:47 PM Namhyung Kim <namhyung@...nel.org> wrote:
>
> Currently the lock contention timestamp is maintained in a hash map keyed
> by pid. That means it needs to get and release a map element (which is
> protected by a spinlock!) on each contention begin and end pair. This
> can impact performance if there is a lot of contention (usually from
> spinlocks).
>
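(For anyone reading along: the pre-patch flow is roughly the sketch
below. It is simplified pseudo-BPF, not the exact tool code; the point
is that every begin/end pair does a hash map update plus delete, and
each of those takes the bucket's internal lock.)

  /* Old scheme (simplified): one hash element per task, created at
   * contention_begin and removed at contention_end. Both operations
   * take the hash bucket's lock internally, so the tracing itself
   * becomes a source of contention under heavy load.
   */
  __u32 pid = bpf_get_current_pid_tgid();
  struct tstamp_data zero = {};

  bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST); /* begin */
  /* ... the lock is contended ... */
  bpf_map_delete_elem(&tstamp, &pid);                     /* end */
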
> It used to use task local storage, but that had an issue with memory
> allocation in some critical paths. Although that is addressed in recent
> kernels IIUC, the tool should support old kernels too, so it cannot
> simply switch to task local storage, at least for now.
>
> As spinlocks create lots of contention and they disable preemption while
> spinning, it can use a per-cpu array to keep the timestamp and avoid the
> overhead of hash map updates and deletes.
>
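(Right; a BPF_MAP_TYPE_PERCPU_ARRAY with max_entries = 1 behaves like a
single private per-CPU variable: bpf_map_lookup_elem() returns the
current CPU's copy, with no bucket lock or atomics involved. A minimal
sketch of the access pattern, matching the tstamp_cpu map added below:)

  __u32 idx = 0;
  struct tstamp_data *pelem;

  /* Returns this CPU's slot. It is safe to use without locking as
   * long as preemption is off, which holds while a task spins on a
   * spinlock.
   */
  pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
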
> In contention_begin, it's easy to check the lock type since it can see
> the flags. But contention_end cannot see them. So let's try the per-cpu
> array first (unconditionally): if it has an active element (lock != 0),
> that element should be used, and the per-task tstamp map should not be
> used until the per-cpu array element is cleared, which means any nested
> spinlock contention has finished and it now sees the (outer) lock.
>
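(To check my understanding, contention_end then boils down to the lookup
order below; this restates the hunk further down in slightly simplified
form:)

  pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
  if (pelem && pelem->lock) {
          /* Active spinlock/rwlock contention on this CPU: a task
           * cannot be finishing a sleeping lock while it still spins,
           * so the event must match the per-cpu entry (or be a nested
           * lock, which is ignored).
           */
          if (pelem->lock != ctx[0])
                  return 0;
  } else {
          /* No active per-cpu entry: fall back to the per-task map. */
          pid = bpf_get_current_pid_tgid();
          pelem = bpf_map_lookup_elem(&tstamp, &pid);
          if (!pelem || pelem->lock != ctx[0])
                  return 0;
          need_delete = true;
  }
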
> Signed-off-by: Namhyung Kim <namhyung@...nel.org>
Acked-by: Ian Rogers <irogers@...gle.com>
Thanks,
Ian
> ---
> .../perf/util/bpf_skel/lock_contention.bpf.c | 89 +++++++++++++++----
> 1 file changed, 72 insertions(+), 17 deletions(-)
>
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 69d31fd77cd0..95cd8414f6ef 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -42,6 +42,14 @@ struct {
> __uint(max_entries, MAX_ENTRIES);
> } tstamp SEC(".maps");
>
> +/* maintain per-CPU timestamp at the beginning of contention */
> +struct {
> + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> + __uint(key_size, sizeof(__u32));
> + __uint(value_size, sizeof(struct tstamp_data));
> + __uint(max_entries, 1);
> +} tstamp_cpu SEC(".maps");
> +
> /* actual lock contention statistics */
> struct {
> __uint(type, BPF_MAP_TYPE_HASH);
> @@ -311,34 +319,57 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
> return 0;
> }
>
> -SEC("tp_btf/contention_begin")
> -int contention_begin(u64 *ctx)
> +static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
> {
> __u32 pid;
> struct tstamp_data *pelem;
>
> - if (!enabled || !can_record(ctx))
> - return 0;
> + /* Use per-cpu array map for spinlock and rwlock */
> + if (flags == (LCB_F_SPIN | LCB_F_READ) || flags == LCB_F_SPIN ||
> + flags == (LCB_F_SPIN | LCB_F_WRITE)) {
> + __u32 idx = 0;
> +
> + pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
> + /* Do not update the element for nested locks */
> + if (pelem && pelem->lock)
> + pelem = NULL;
> + return pelem;
> + }
>
> pid = bpf_get_current_pid_tgid();
> pelem = bpf_map_lookup_elem(&tstamp, &pid);
> + /* Do not update the element for nested locks */
> if (pelem && pelem->lock)
> - return 0;
> + return NULL;
>
> if (pelem == NULL) {
> struct tstamp_data zero = {};
>
> if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
> __sync_fetch_and_add(&task_fail, 1);
> - return 0;
> + return NULL;
> }
>
> pelem = bpf_map_lookup_elem(&tstamp, &pid);
> if (pelem == NULL) {
> __sync_fetch_and_add(&task_fail, 1);
> - return 0;
> + return NULL;
> }
> }
> + return pelem;
> +}
> +
> +SEC("tp_btf/contention_begin")
> +int contention_begin(u64 *ctx)
> +{
> + struct tstamp_data *pelem;
> +
> + if (!enabled || !can_record(ctx))
> + return 0;
> +
> + pelem = get_tstamp_elem(ctx[1]);
> + if (pelem == NULL)
> + return 0;
>
> pelem->timestamp = bpf_ktime_get_ns();
> pelem->lock = (__u64)ctx[0];
> @@ -377,24 +408,42 @@ int contention_begin(u64 *ctx)
> SEC("tp_btf/contention_end")
> int contention_end(u64 *ctx)
> {
> - __u32 pid;
> + __u32 pid = 0, idx = 0;
> struct tstamp_data *pelem;
> struct contention_key key = {};
> struct contention_data *data;
> __u64 duration;
> + bool need_delete = false;
>
> if (!enabled)
> return 0;
>
> - pid = bpf_get_current_pid_tgid();
> - pelem = bpf_map_lookup_elem(&tstamp, &pid);
> - if (!pelem || pelem->lock != ctx[0])
> - return 0;
> + /*
> + * For spinlock and rwlock, it needs to get the timestamp from the
> + * per-cpu map. However, contention_end does not have the flags
> + * so it cannot know whether to read the per-cpu or the hash map.
> + *
> + * Try the per-cpu map first and check if there's active contention.
> + * If there is, do not read the hash map because a task cannot be
> + * blocked on sleeping locks before it releases its spinning locks.
> + */
> + pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
> + if (pelem && pelem->lock) {
> + if (pelem->lock != ctx[0])
> + return 0;
> + } else {
> + pid = bpf_get_current_pid_tgid();
> + pelem = bpf_map_lookup_elem(&tstamp, &pid);
> + if (!pelem || pelem->lock != ctx[0])
> + return 0;
> + need_delete = true;
> + }
>
> duration = bpf_ktime_get_ns() - pelem->timestamp;
> if ((__s64)duration < 0) {
> pelem->lock = 0;
> - bpf_map_delete_elem(&tstamp, &pid);
> + if (need_delete)
> + bpf_map_delete_elem(&tstamp, &pid);
> __sync_fetch_and_add(&time_fail, 1);
> return 0;
> }
> @@ -406,8 +455,11 @@ int contention_end(u64 *ctx)
> case LOCK_AGGR_TASK:
> if (lock_owner)
> key.pid = pelem->flags;
> - else
> + else {
> + if (!need_delete)
> + pid = bpf_get_current_pid_tgid();
> key.pid = pid;
> + }
> if (needs_callstack)
> key.stack_id = pelem->stack_id;
> break;
> @@ -428,7 +480,8 @@ int contention_end(u64 *ctx)
> if (!data) {
> if (data_map_full) {
> pelem->lock = 0;
> - bpf_map_delete_elem(&tstamp, &pid);
> + if (need_delete)
> + bpf_map_delete_elem(&tstamp, &pid);
> __sync_fetch_and_add(&data_fail, 1);
> return 0;
> }
> @@ -452,7 +505,8 @@ int contention_end(u64 *ctx)
> __sync_fetch_and_add(&data_fail, 1);
> }
> pelem->lock = 0;
> - bpf_map_delete_elem(&tstamp, &pid);
> + if (need_delete)
> + bpf_map_delete_elem(&tstamp, &pid);
> return 0;
> }
>
> @@ -466,7 +520,8 @@ int contention_end(u64 *ctx)
> data->min_time = duration;
>
> pelem->lock = 0;
> - bpf_map_delete_elem(&tstamp, &pid);
> + if (need_delete)
> + bpf_map_delete_elem(&tstamp, &pid);
> return 0;
> }
>
> --
> 2.42.0.655.g421f12c284-goog
>