Message-ID: <CAEf4BzaQUzQdba2=F2NoV7=Th98fxz2EN62QX2Ej92bazt1GAg@mail.gmail.com>
Date: Tue, 2 Jul 2024 10:54:51 -0700
From: Andrii Nakryiko <andrii.nakryiko@...il.com>
To: Peter Zijlstra <peterz@...radead.org>, "Paul E. McKenney" <paulmck@...nel.org>
Cc: Andrii Nakryiko <andrii@...nel.org>, linux-trace-kernel@...r.kernel.org,
rostedt@...dmis.org, mhiramat@...nel.org, oleg@...hat.com, mingo@...hat.com,
bpf@...r.kernel.org, jolsa@...nel.org, clm@...a.com,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 00/12] uprobes: add batched register/unregister APIs
and per-CPU RW semaphore
On Tue, Jul 2, 2024 at 4:54 AM Peter Zijlstra <peterz@...radead.org> wrote:
>
>
> +LKML
>
> On Tue, Jul 02, 2024 at 12:23:53PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 01, 2024 at 03:39:23PM -0700, Andrii Nakryiko wrote:
> > > This patch set, ultimately, switches global uprobes_treelock from RW spinlock
> > > to per-CPU RW semaphore, which has better performance and scales better under
> > > contention and multiple parallel threads triggering lots of uprobes.
> >
> > Why not RCU + normal lock thing?
>
> Something like the *completely* untested below.
>
> ---
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2c83ba776fc7..03b38f3f7be3 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -40,6 +40,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
> #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
>
> static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
> +static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
>
> #define UPROBES_HASH_SZ 13
> /* serialize uprobe->pending_list */
> @@ -54,6 +55,7 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
> struct uprobe {
> struct rb_node rb_node; /* node in the rb tree */
> refcount_t ref;
> + struct rcu_head rcu;
> struct rw_semaphore register_rwsem;
> struct rw_semaphore consumer_rwsem;
> struct list_head pending_list;
> @@ -67,7 +69,7 @@ struct uprobe {
> * The generic code assumes that it has two members of unknown type
> * owned by the arch-specific code:
> *
> - * insn - copy_insn() saves the original instruction here for
> + * insn - copy_insn() saves the original instruction here for
> * arch_uprobe_analyze_insn().
> *
> * ixol - potentially modified instruction to execute out of
> @@ -593,6 +595,12 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe)
> return uprobe;
> }
>
> +static void uprobe_free_rcu(struct rcu_head *rcu)
> +{
> + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> + kfree(uprobe);
> +}
> +
> static void put_uprobe(struct uprobe *uprobe)
> {
> if (refcount_dec_and_test(&uprobe->ref)) {
> @@ -604,7 +612,8 @@ static void put_uprobe(struct uprobe *uprobe)
Right above this we have roughly this:
percpu_down_write(&uprobes_treelock);
/* refcount check */
rb_erase(&uprobe->rb_node, &uprobes_tree);
percpu_up_write(&uprobes_treelock);
This writer lock is necessary for modifying the RB tree. And I was
under the impression that I shouldn't be doing
percpu_(down|up)_write() inside a normal
rcu_read_lock()/rcu_read_unlock() region, as percpu_down_write() has
might_sleep() in it. But maybe I'm wrong, hopefully Paul can help
clarify.
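I.e., the pattern I was worried about is roughly this (hypothetical
sketch just to illustrate the concern, not code from my series):

rcu_read_lock();
uprobe = __find_uprobe(inode, offset);
/* ... decide uprobe has to go away ... */
percpu_down_write(&uprobes_treelock); /* might_sleep() under rcu_read_lock() */
rb_erase(&uprobe->rb_node, &uprobes_tree);
percpu_up_write(&uprobes_treelock);
rcu_read_unlock();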
But actually, what's wrong with the RCU Tasks Trace flavor? I will
ultimately use it anyway to avoid taking an unnecessary uprobe
refcount and to protect uprobe->consumers iteration and uc->handler()
calls, which can be sleepable, so they would need
rcu_read_lock_trace().
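Roughly something like this (completely untested sketch, reusing the
uprobe_free_rcu() callback from your diff):

/* reader side, may eventually span sleepable uc->handler() calls */
rcu_read_lock_trace();
uprobe = __find_uprobe(inode, offset);
/* ... */
rcu_read_unlock_trace();

/* freeing side, instead of plain call_rcu() */
call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);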
> mutex_lock(&delayed_uprobe_lock);
> delayed_uprobe_remove(uprobe, NULL);
> mutex_unlock(&delayed_uprobe_lock);
> - kfree(uprobe);
> +
> + call_rcu(&uprobe->rcu, uprobe_free_rcu);
> }
> }
>
> @@ -668,12 +677,25 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
> static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
> {
> struct uprobe *uprobe;
> + unsigned seq;
>
> - read_lock(&uprobes_treelock);
> - uprobe = __find_uprobe(inode, offset);
> - read_unlock(&uprobes_treelock);
> + guard(rcu)();
>
> - return uprobe;
> + do {
> + seq = read_seqcount_begin(&uprobes_seqcount);
> +             uprobe = __find_uprobe(inode, offset);
> +             if (uprobe) {
> + /*
> + * Lockless RB-tree lookups are prone to false-negatives.
> + * If they find something, it's good. If they do not find,
> + * it needs to be validated.
> + */
> +                     return uprobe;
> + }
> + } while (read_seqcount_retry(&uprobes_seqcount, seq));
> +
> + /* Really didn't find anything. */
> + return NULL;
> }
Honest question here, as I don't understand the tradeoffs well enough:
is there a lot of benefit to switching to a seqcount lock vs using a
percpu RW semaphore (previously recommended by Ingo)? The latter is a
nice drop-in replacement and seems to be very fast and scale well.
Right now we are bottlenecked on uprobe->register_rwsem (not
uprobes_treelock anymore), which is what currently limits the
scalability of uprobes, and I'm going to work on that next, once I'm
done with this series.
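For comparison, the percpu RW semaphore variant from my series is
roughly a literal drop-in on the lookup side (modulo details):

DEFINE_STATIC_PERCPU_RWSEM(uprobes_treelock);

static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
        struct uprobe *uprobe;

        percpu_down_read(&uprobes_treelock);
        uprobe = __find_uprobe(inode, offset);
        percpu_up_read(&uprobes_treelock);

        return uprobe;
}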
>
> static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
> @@ -702,7 +724,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
> struct uprobe *u;
>
> write_lock(&uprobes_treelock);
> + write_seqcount_begin(&uprobes_seqcount);
> u = __insert_uprobe(uprobe);
> + write_seqcount_end(&uprobes_seqcount);
> write_unlock(&uprobes_treelock);
>
> return u;
> @@ -936,7 +960,9 @@ static void delete_uprobe(struct uprobe *uprobe)
> return;
>
> write_lock(&uprobes_treelock);
> + write_seqcount_begin(&uprobes_seqcount);
> rb_erase(&uprobe->rb_node, &uprobes_tree);
> + write_seqcount_end(&uprobes_seqcount);
> write_unlock(&uprobes_treelock);
> RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
> put_uprobe(uprobe);