Message-ID: <aBzqGn0Ktbl38sOF@google.com>
Date: Thu, 8 May 2025 10:30:02 -0700
From: Namhyung Kim <namhyung@...nel.org>
To: Arnaldo Carvalho de Melo <acme@...nel.org>
Cc: Ian Rogers <irogers@...gle.com>, Kan Liang <kan.liang@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>,
Adrian Hunter <adrian.hunter@...el.com>,
Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...nel.org>, LKML <linux-kernel@...r.kernel.org>,
linux-perf-users@...r.kernel.org, Song Liu <song@...nel.org>,
bpf@...r.kernel.org, Stephane Eranian <eranian@...gle.com>
Subject: Re: [RFC/PATCH] perf lock contention: Add -J/--inject-delay option

Hi Arnaldo,

On Thu, May 08, 2025 at 11:45:58AM -0300, Arnaldo Carvalho de Melo wrote:
> On Mon, Feb 24, 2025 at 11:59:29PM -0800, Namhyung Kim wrote:
> > This is to slow down lock acquisition (on contended locks) deliberately.
> > A possible use case is to estimate the impact of a kernel locking
> > optimization on application performance. By delaying the lock, it can
> > simulate a worse condition as a control group, which can then be
> > compared with the current behavior as the optimized condition.
>
> So this looks useful, I guess we can proceed and merge it?

That'd be great. :)

Thanks,
Namhyung

>
> > The syntax is 'time@...ction' and the time can have a unit suffix like
> > "us" and "ms". For example, I ran a simple test like below.
> >
> >   $ sudo perf lock con -abl -L tasklist_lock -- \
> >       sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
> >    contended   total wait     max wait     avg wait          address   symbol
> >
> >           92      1.18 ms    199.54 us     12.79 us  ffffffff8a806080   tasklist_lock (rwlock)
> >
> > The contention count was 92 and the average wait time was around 10 us.
> > But if I add 100 usec of delay to the tasklist_lock,
> >
> >   $ sudo perf lock con -abl -L tasklist_lock -J 100us@...klist_lock -- \
> >       sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
> >    contended   total wait     max wait     avg wait          address   symbol
> >
> >          190     15.67 ms    230.10 us     82.46 us  ffffffff8a806080   tasklist_lock (rwlock)
> >
> > The contention count increased and the average wait time went up close
> > to 100 usec. If I increase the delay even more,
> >
> >   $ sudo perf lock con -abl -L tasklist_lock -J 1ms@...klist_lock -- \
> >       sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
> >    contended   total wait     max wait     avg wait          address   symbol
> >
> >         1002       2.80 s      3.01 ms      2.80 ms  ffffffff8a806080   tasklist_lock (rwlock)
> >
> > Now every sleep process had contention and the wait time was more than 1
> > msec. This is on my 4-CPU laptop so I guess one CPU has the lock while
> > the other 3 are mostly waiting for it.
> >
> > For simplicity, it only supports global locks for now.
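
("Global" here means locks at a fixed kernel address whose name can be
resolved as a kernel symbol, as lock_contention_prepare() below does;
delays for dynamically allocated locks are not supported yet.)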
> >
> > Suggested-by: Stephane Eranian <eranian@...gle.com>
> > Signed-off-by: Namhyung Kim <namhyung@...nel.org>
> > ---
> > tools/perf/Documentation/perf-lock.txt | 11 +++
> > tools/perf/builtin-lock.c | 74 +++++++++++++++++++
> > tools/perf/util/bpf_lock_contention.c | 28 +++++++
> > .../perf/util/bpf_skel/lock_contention.bpf.c | 43 +++++++++++
> > tools/perf/util/lock-contention.h | 8 ++
> > 5 files changed, 164 insertions(+)
> >
> > diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
> > index d3793054f7d35626..151fc837587b216e 100644
> > --- a/tools/perf/Documentation/perf-lock.txt
> > +++ b/tools/perf/Documentation/perf-lock.txt
> > @@ -215,6 +215,17 @@ CONTENTION OPTIONS
> > --cgroup-filter=<value>::
> > Show lock contention only in the given cgroups (comma separated list).
> >
> > +-J::
> > +--inject-delay=<time@...ction>::
> > + Add delays to the given lock. It's added to the contention-end part
> > + so that the (new) owner of the lock will be delayed. But by slowing
> > + down the owner, the waiters will be delayed as well. This works only
> > + with -b/--use-bpf.
> > +
> > + The 'time' is specified in nsec but it can have a unit suffix. Available
> > + units are "ms" and "us". Note that it will busy-wait after it gets the
> > + lock. Please use it at your own risk.
> > +
> >
> > SEE ALSO
> > --------
> > diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
> > index 5d405cd8e696d21b..3ef452d5d9f5679d 100644
> > --- a/tools/perf/builtin-lock.c
> > +++ b/tools/perf/builtin-lock.c
> > @@ -62,6 +62,8 @@ static const char *output_name = NULL;
> > static FILE *lock_output;
> >
> > static struct lock_filter filters;
> > +static struct lock_delay *delays;
> > +static int nr_delays;
> >
> > static enum lock_aggr_mode aggr_mode = LOCK_AGGR_ADDR;
> >
> > @@ -1971,6 +1973,8 @@ static int __cmd_contention(int argc, const char **argv)
> > .max_stack = max_stack_depth,
> > .stack_skip = stack_skip,
> > .filters = &filters,
> > + .delays = delays,
> > + .nr_delays = nr_delays,
> > .save_callstack = needs_callstack(),
> > .owner = show_lock_owner,
> > .cgroups = RB_ROOT,
> > @@ -2474,6 +2478,74 @@ static int parse_cgroup_filter(const struct option *opt __maybe_unused, const ch
> > return ret;
> > }
> >
> > +static bool add_lock_delay(char *spec)
> > +{
> > + char *at, *pos;
> > + struct lock_delay *tmp;
> > + unsigned long duration;
> > +
> > + at = strchr(spec, '@');
> > + if (at == NULL) {
> > + pr_err("lock delay should have '@' sign: %s\n", spec);
> > + return false;
> > + }
> > + if (at == spec) {
> > + pr_err("lock delay should have time before '@': %s\n", spec);
> > + return false;
> > + }
> > +
> > + *at = '\0';
> > + duration = strtoul(spec, &pos, 0);
> > + if (!strcmp(pos, "ns"))
> > + duration *= 1;
> > + else if (!strcmp(pos, "us"))
> > + duration *= 1000;
> > + else if (!strcmp(pos, "ms"))
> > + duration *= 1000 * 1000;
> > + else if (*pos) {
> > + pr_err("invalid delay time: %s@%s\n", spec, at + 1);
> > + return false;
> > + }
> > +
> > + tmp = realloc(delays, (nr_delays + 1) * sizeof(*delays));
> > + if (tmp == NULL) {
> > + pr_err("Memory allocation failure\n");
> > + return false;
> > + }
> > + delays = tmp;
> > +
> > + delays[nr_delays].sym = strdup(at + 1);
> > + if (delays[nr_delays].sym == NULL) {
> > + pr_err("Memory allocation failure\n");
> > + return false;
> > + }
> > + delays[nr_delays].time = duration;
> > +
> > + nr_delays++;
> > + return true;
> > +}
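
Side note: since the time is parsed with strtoul(spec, &pos, 0), a bare
number is taken as nsec, and base prefixes like 0x also work. So e.g.
'-J 500000@lock_a' and '-J 500us@lock_a' should be equivalent (lock_a is
a made-up symbol name, just for illustration).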
> > +
> > +static int parse_lock_delay(const struct option *opt __maybe_unused, const char *str,
> > + int unset __maybe_unused)
> > +{
> > + char *s, *tmp, *tok;
> > + int ret = 0;
> > +
> > + s = strdup(str);
> > + if (s == NULL)
> > + return -1;
> > +
> > + for (tok = strtok_r(s, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) {
> > + if (!add_lock_delay(tok)) {
> > + ret = -1;
> > + break;
> > + }
> > + }
> > +
> > + free(s);
> > + return ret;
> > +}
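
Also note that multiple delays can be given at once: parse_lock_delay()
splits the string on ',' (or ' '), so an invocation like the following
should work (lock_a/lock_b again being made-up names):

  $ sudo perf lock con -ab -J 100us@lock_a,1ms@lock_b -- <workload>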
> > +
> > int cmd_lock(int argc, const char **argv)
> > {
> > const struct option lock_options[] = {
> > @@ -2550,6 +2622,8 @@ int cmd_lock(int argc, const char **argv)
> > OPT_BOOLEAN(0, "lock-cgroup", &show_lock_cgroups, "show lock stats by cgroup"),
> > OPT_CALLBACK('G', "cgroup-filter", NULL, "CGROUPS",
> > "Filter specific cgroups", parse_cgroup_filter),
> > + OPT_CALLBACK('J', "inject-delay", NULL, "TIME@...C",
> > + "Inject delays to specific locks", parse_lock_delay),
> > OPT_PARENT(lock_options)
> > };
> >
> > diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> > index fc8666222399c995..99b64f303e761cbc 100644
> > --- a/tools/perf/util/bpf_lock_contention.c
> > +++ b/tools/perf/util/bpf_lock_contention.c
> > @@ -183,6 +183,27 @@ int lock_contention_prepare(struct lock_contention *con)
> > skel->rodata->has_addr = 1;
> > }
> >
> > + /* resolve lock name in delays */
> > + if (con->nr_delays) {
> > + struct symbol *sym;
> > + struct map *kmap;
> > +
> > + for (i = 0; i < con->nr_delays; i++) {
> > + sym = machine__find_kernel_symbol_by_name(con->machine,
> > + con->delays[i].sym,
> > + &kmap);
> > + if (sym == NULL) {
> > + pr_warning("ignore unknown symbol: %s\n",
> > + con->delays[i].sym);
> > + continue;
> > + }
> > +
> > + con->delays[i].addr = map__unmap_ip(kmap, sym->start);
> > + }
> > + skel->rodata->lock_delay = 1;
> > + bpf_map__set_max_entries(skel->maps.lock_delays, con->nr_delays);
> > + }
> > +
> > bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
> > bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
> > bpf_map__set_max_entries(skel->maps.type_filter, ntypes);
> > @@ -272,6 +293,13 @@ int lock_contention_prepare(struct lock_contention *con)
> > bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY);
> > }
> >
> > + if (con->nr_delays) {
> > + fd = bpf_map__fd(skel->maps.lock_delays);
> > +
> > + for (i = 0; i < con->nr_delays; i++)
> > + bpf_map_update_elem(fd, &con->delays[i].addr, &con->delays[i].time, BPF_ANY);
> > + }
> > +
> > if (con->aggr_mode == LOCK_AGGR_CGROUP)
> > read_all_cgroups(&con->cgroups);
> >
> > diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > index 6533ea9b044c71d1..0ac9ae2f1711a129 100644
> > --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > @@ -11,6 +11,9 @@
> > /* for collect_lock_syms(). 4096 was rejected by the verifier */
> > #define MAX_CPUS 1024
> >
> > +/* for do_lock_delay(). Arbitrarily set to 1 million. */
> > +#define MAX_LOOP (1U << 20)
> > +
> > /* lock contention flags from include/trace/events/lock.h */
> > #define LCB_F_SPIN (1U << 0)
> > #define LCB_F_READ (1U << 1)
> > @@ -114,6 +117,13 @@ struct {
> > __uint(max_entries, 1);
> > } slab_caches SEC(".maps");
> >
> > +struct {
> > + __uint(type, BPF_MAP_TYPE_HASH);
> > + __uint(key_size, sizeof(__u64));
> > + __uint(value_size, sizeof(__u64));
> > + __uint(max_entries, 1);
> > +} lock_delays SEC(".maps");
> > +
> > struct rw_semaphore___old {
> > struct task_struct *owner;
> > } __attribute__((preserve_access_index));
> > @@ -143,6 +153,7 @@ const volatile int needs_callstack;
> > const volatile int stack_skip;
> > const volatile int lock_owner;
> > const volatile int use_cgroup_v2;
> > +const volatile int lock_delay;
> >
> > /* determine the key of lock stat */
> > const volatile int aggr_mode;
> > @@ -348,6 +359,35 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
> > return 0;
> > }
> >
> > +static inline long delay_callback(__u64 idx, void *arg)
> > +{
> > + __u64 target = *(__u64 *)arg;
> > +
> > + if (target <= bpf_ktime_get_ns())
> > + return 1;
> > +
> > + /* just to kill time */
> > + (void)bpf_get_prandom_u32();
> > +
> > + return 0;
> > +}
> > +
> > +static inline void do_lock_delay(__u64 duration)
> > +{
> > + __u64 target = bpf_ktime_get_ns() + duration;
> > +
> > + bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
> > +}
> > +
> > +static inline void check_lock_delay(__u64 lock)
> > +{
> > + __u64 *delay;
> > +
> > + delay = bpf_map_lookup_elem(&lock_delays, &lock);
> > + if (delay)
> > + do_lock_delay(*delay);
> > +}
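
Side note: the delay is a plain busy-wait built from bpf_loop() and
bpf_ktime_get_ns(); the bpf_get_prandom_u32() call is only there to burn
cycles per iteration, and MAX_LOOP caps the iteration count. A minimal
user-space sketch of the same idea (assuming CLOCK_MONOTONIC, just to
illustrate the technique, not part of the patch) would be:

	#include <time.h>

	/* spin until 'duration' nsec have elapsed, without sleeping */
	static void do_delay(unsigned long long duration)
	{
		struct timespec ts;
		unsigned long long now, target;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		target = ts.tv_sec * 1000000000ULL + ts.tv_nsec + duration;

		do {
			clock_gettime(CLOCK_MONOTONIC, &ts);
			now = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
		} while (now < target);
	}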
> > +
> > static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
> > {
> > __u32 pid;
> > @@ -566,6 +606,9 @@ int contention_end(u64 *ctx)
> > data->min_time = duration;
> >
> > out:
> > + if (lock_delay)
> > + check_lock_delay(pelem->lock);
> > +
> > pelem->lock = 0;
> > if (need_delete)
> > bpf_map_delete_elem(&tstamp, &pid);
> > diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
> > index a09f7fe877df8184..12f6cb789ada1bc7 100644
> > --- a/tools/perf/util/lock-contention.h
> > +++ b/tools/perf/util/lock-contention.h
> > @@ -18,6 +18,12 @@ struct lock_filter {
> > char **slabs;
> > };
> >
> > +struct lock_delay {
> > + char *sym;
> > + unsigned long addr;
> > + unsigned long time;
> > +};
> > +
> > struct lock_stat {
> > struct hlist_node hash_entry;
> > struct rb_node rb; /* used for sorting */
> > @@ -140,6 +146,7 @@ struct lock_contention {
> > struct machine *machine;
> > struct hlist_head *result;
> > struct lock_filter *filters;
> > + struct lock_delay *delays;
> > struct lock_contention_fails fails;
> > struct rb_root cgroups;
> > unsigned long map_nr_entries;
> > @@ -148,6 +155,7 @@ struct lock_contention {
> > int aggr_mode;
> > int owner;
> > int nr_filtered;
> > + int nr_delays;
> > bool save_callstack;
> > };
> >
> > --
> > 2.48.1.658.g4767266eb4-goog