Message-ID: <20240727094405.1362496-1-liaochang1@huawei.com>
Date: Sat, 27 Jul 2024 09:44:05 +0000
From: Liao Chang <liaochang1@...wei.com>
To: <peterz@...radead.org>, <mingo@...hat.com>, <acme@...nel.org>,
<namhyung@...nel.org>, <mark.rutland@....com>,
<alexander.shishkin@...ux.intel.com>, <jolsa@...nel.org>,
<irogers@...gle.com>, <adrian.hunter@...el.com>, <kan.liang@...ux.intel.com>,
<liaochang1@...wei.com>
CC: <linux-kernel@...r.kernel.org>, <linux-perf-users@...r.kernel.org>
Subject: [PATCH] uprobes: Optimize the allocation of insn_slot for performance
Profiling the single-threaded selftests bench reveals performance
bottlenecks in find_uprobe() and caches_clean_inval_pou() on ARM64. On
my local testing machine, find_uprobe() consumes 5% of CPU time for
trig-uprobe-ret, while caches_clean_inval_pou() takes about 34% of CPU
time for trig-uprobe-nop and trig-uprobe-push.
This patch introduces struct uprobe_breakpoint to track the previously
allocated insn_slot for frequently hit uprobes. It effectively reduces
the need for redundant insn_slot writes and the subsequent expensive
cache flushes, especially on architectures like ARM64. This patch has
been tested on a Kunpeng916 (Hi1616), 4 NUMA nodes, 64 cores @ 2.4GHz.
The selftests bench and Redis GET/SET benchmark results below show an
obvious performance gain.
before-opt
----------
trig-uprobe-nop: 0.371 ± 0.001M/s (0.371M/prod)
trig-uprobe-push: 0.370 ± 0.001M/s (0.370M/prod)
trig-uprobe-ret: 1.637 ± 0.001M/s (1.647M/prod)
trig-uretprobe-nop: 0.331 ± 0.004M/s (0.331M/prod)
trig-uretprobe-push: 0.333 ± 0.000M/s (0.333M/prod)
trig-uretprobe-ret: 0.854 ± 0.002M/s (0.854M/prod)
Redis SET (RPS) uprobe: 42728.52
Redis GET (RPS) uprobe: 43640.18
Redis SET (RPS) uretprobe: 40624.54
Redis GET (RPS) uretprobe: 41180.56
after-opt
---------
trig-uprobe-nop: 0.916 ± 0.001M/s (0.916M/prod)
trig-uprobe-push: 0.908 ± 0.001M/s (0.908M/prod)
trig-uprobe-ret: 1.855 ± 0.000M/s (1.855M/prod)
trig-uretprobe-nop: 0.640 ± 0.000M/s (0.640M/prod)
trig-uretprobe-push: 0.633 ± 0.001M/s (0.633M/prod)
trig-uretprobe-ret: 0.978 ± 0.003M/s (0.978M/prod)
Redis SET (RPS) uprobe: 43939.69
Redis GET (RPS) uprobe: 45200.80
Redis SET (RPS) uretprobe: 41658.58
Redis GET (RPS) uretprobe: 42805.80
Since some uprobes might still need to share the same insn_slot, this
patch first compares the instructions in the reused insn_slot with the
instructions to be executed out-of-line, to decide whether a fresh
copy is needed.
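Conceptually, the check in xol_get_insn_slot() (see the hunk below)
becomes:

    if (memcmp((void *)xol_vaddr, &uprobe->arch.ixol,
               sizeof(uprobe->arch.ixol)))
            /* only rewrite the slot, and pay for the cache flush,
             * when the resident instructions differ */
            arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
                                  &uprobe->arch.ixol,
                                  sizeof(uprobe->arch.ixol));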
Additionally, this patch uses an rbtree, associated with each thread
that hits uprobes, to manage the allocated uprobe_breakpoint data.
Because the rbtree of uprobe_breakpoints has smaller nodes, better
locality, and less contention, it yields faster lookups than
find_uprobe().
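With this in place, the hot-path lookup in find_breakpoint() is just a
read-locked rbtree search:

    read_lock(&utask->breakpoints_treelock);
    node = rb_find(&key, &utask->breakpoints_tree, __ubp_cmp_key);
    read_unlock(&utask->breakpoints_treelock);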
The rest of this patch is the necessary memory management for the
uprobe_breakpoint data: a uprobe_breakpoint is allocated for each newly
hit uprobe that doesn't already have a corresponding node in the
rbtree, and all uprobe_breakpoints are freed when the thread exits.
Signed-off-by: Liao Chang <liaochang1@...wei.com>
---
include/linux/uprobes.h | 3 +
kernel/events/uprobes.c | 246 +++++++++++++++++++++++++++++++++-------
2 files changed, 211 insertions(+), 38 deletions(-)
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f46e0ca0169c..04ee465980af 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -78,6 +78,9 @@ struct uprobe_task {
struct return_instance *return_instances;
unsigned int depth;
+
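+ /* per-thread cache of hit uprobe breakpoints, keyed by vaddr */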
+ struct rb_root breakpoints_tree;
+ rwlock_t breakpoints_treelock;
};
struct return_instance {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2c83ba776fc7..3f1a6dc2a327 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -33,6 +33,7 @@
#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
static struct rb_root uprobes_tree = RB_ROOT;
+
/*
* allows us to skip the uprobe_mmap if there are no uprobe events active
* at this time. Probably a fine grained per inode count is better?
@@ -886,6 +887,174 @@ static bool filter_chain(struct uprobe *uprobe,
return ret;
}
+static struct uprobe_task *get_utask(void);
+static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr);
+static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp);
+
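+/*
+ * Tracks a previously hit uprobe for this thread. @slot caches the
+ * XOL slot index (UINSNS_PER_PAGE means none cached yet) so later
+ * hits on the same @vaddr can reuse the slot without another copy
+ * and cache flush.
+ */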
+struct uprobe_breakpoint {
+ struct rb_node rb_node;
+ refcount_t ref;
+ unsigned long slot;
+ unsigned long vaddr;
+ struct uprobe *uprobe;
+};
+
+static void put_ubp(struct uprobe_breakpoint *ubp)
+{
+ if (refcount_dec_and_test(&ubp->ref)) {
+ put_uprobe(ubp->uprobe);
+ kfree(ubp);
+ }
+}
+
+static struct uprobe_breakpoint *get_ubp(struct uprobe_breakpoint *ubp)
+{
+ refcount_inc(&ubp->ref);
+ return ubp;
+}
+
+#define __node_2_ubp(node) rb_entry((node), struct uprobe_breakpoint, rb_node)
+
+struct __ubp_key {
+ unsigned long bp_vaddr;
+};
+
+static int ubp_cmp(const unsigned long bp_vaddr,
+ const struct uprobe_breakpoint *ubp)
+{
+ if (bp_vaddr < ubp->vaddr)
+ return -1;
+
+ if (bp_vaddr > ubp->vaddr)
+ return 1;
+
+ return 0;
+}
+
+static int __ubp_cmp_key(const void *k, const struct rb_node *b)
+{
+ const struct __ubp_key *key = k;
+
+ return ubp_cmp(key->bp_vaddr, __node_2_ubp(b));
+}
+
+static int __ubp_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ const struct uprobe_breakpoint *ubp = __node_2_ubp(a);
+
+ return ubp_cmp(ubp->vaddr, __node_2_ubp(b));
+}
+
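+/*
+ * Free every cached breakpoint on thread exit. The tree lock is
+ * dropped around put_ubp() since it may free the node.
+ */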
+static void delete_breakpoint(struct uprobe_task *utask)
+{
+ struct rb_node *node;
+
+ write_lock(&utask->breakpoints_treelock);
+ node = rb_first(&utask->breakpoints_tree);
+ while (node) {
+ rb_erase(node, &utask->breakpoints_tree);
+ write_unlock(&utask->breakpoints_treelock);
+
+ put_ubp(__node_2_ubp(node));
+
+ write_lock(&utask->breakpoints_treelock);
+ node = rb_first(&utask->breakpoints_tree);
+ }
+ write_unlock(&utask->breakpoints_treelock);
+}
+
+static struct uprobe_breakpoint *alloc_breakpoint(struct uprobe_task *utask,
+ struct uprobe *uprobe,
+ unsigned long bp_vaddr)
+{
+ struct uprobe_breakpoint *ubp;
+ struct rb_node *node;
+
+ ubp = kzalloc(sizeof(struct uprobe_breakpoint), GFP_KERNEL);
+ if (!ubp)
+ return NULL;
+
+ ubp->vaddr = bp_vaddr;
+ ubp->uprobe = uprobe;
+ /* get access + creation ref */
+ refcount_set(&ubp->ref, 2);
+ ubp->slot = UINSNS_PER_PAGE;
+
+ write_lock(&utask->breakpoints_treelock);
+ node = rb_find_add(&ubp->rb_node, &utask->breakpoints_tree, __ubp_cmp);
+ write_unlock(&utask->breakpoints_treelock);
+
+ /* Two or more threads hit the same breakpoint */
+ if (node) {
+ WARN_ON(uprobe != __node_2_ubp(node)->uprobe);
+ kfree(ubp);
+ return get_ubp(__node_2_ubp(node));
+ }
+
+ return ubp;
+}
+
+static struct uprobe_breakpoint *find_breakpoint(struct uprobe_task *utask,
+ unsigned long bp_vaddr)
+{
+ struct rb_node *node;
+ struct __ubp_key key = {
+ .bp_vaddr = bp_vaddr,
+ };
+
+ read_lock(&utask->breakpoints_treelock);
+ node = rb_find(&key, &utask->breakpoints_tree, __ubp_cmp_key);
+ read_unlock(&utask->breakpoints_treelock);
+
+ if (node)
+ return get_ubp(__node_2_ubp(node));
+
+ return NULL;
+}
+
+static struct uprobe_breakpoint *find_active_breakpoint(struct pt_regs *regs,
+ unsigned long bp_vaddr)
+{
+ struct uprobe_task *utask = get_utask();
+ struct uprobe_breakpoint *ubp;
+ struct uprobe *uprobe;
+ int is_swbp;
+
+ if (unlikely(!utask))
+ return NULL;
+
+ ubp = find_breakpoint(utask, bp_vaddr);
+ if (ubp)
+ return ubp;
+
+ uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ if (!uprobe) {
+ if (is_swbp > 0) {
+ /* No matching uprobe; signal SIGTRAP. */
+ force_sig(SIGTRAP);
+ } else {
+ /*
+ * Either we raced with uprobe_unregister() or we can't
+ * access this memory. The latter is only possible if
+ * another thread plays with our ->mm. In both cases
+ * we can simply restart. If this vma was unmapped we
+ * can pretend this insn was not executed yet and get
+ * the (correct) SIGSEGV after restart.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
+ }
+ return NULL;
+ }
+
+ ubp = alloc_breakpoint(utask, uprobe, bp_vaddr);
+ if (!ubp) {
+ put_uprobe(uprobe);
+ return NULL;
+ }
+
+ return ubp;
+}
+
static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long vaddr)
@@ -1576,9 +1745,8 @@ void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
/*
* - search for a free slot.
*/
-static unsigned long xol_take_insn_slot(struct xol_area *area)
+static __always_inline int xol_take_insn_slot(struct xol_area *area)
{
- unsigned long slot_addr;
int slot_nr;
do {
@@ -1590,34 +1758,48 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
slot_nr = UINSNS_PER_PAGE;
continue;
}
- wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
+ wait_event(area->wq,
+ (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
} while (slot_nr >= UINSNS_PER_PAGE);
- slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
- atomic_inc(&area->slot_count);
+ return slot_nr;
+}
+
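+/*
+ * Fast path: re-claim the slot cached in @ubp if it is still free in
+ * the area bitmap, otherwise fall back to taking a fresh slot.
+ */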
+static __always_inline unsigned long
+choose_insn_slot(struct xol_area *area, struct uprobe_breakpoint *ubp)
+{
+ if ((ubp->slot == UINSNS_PER_PAGE) ||
+ test_and_set_bit(ubp->slot, area->bitmap)) {
+ ubp->slot = xol_take_insn_slot(area);
+ }
- return slot_addr;
+ atomic_inc(&area->slot_count);
+ return area->vaddr + ubp->slot * UPROBE_XOL_SLOT_BYTES;
}
/*
* xol_get_insn_slot - allocate a slot for xol.
* Returns the allocated slot address or 0.
*/
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
+static unsigned long xol_get_insn_slot(struct uprobe_breakpoint *ubp)
{
struct xol_area *area;
unsigned long xol_vaddr;
+ struct uprobe *uprobe = ubp->uprobe;
area = get_xol_area();
if (!area)
return 0;
- xol_vaddr = xol_take_insn_slot(area);
+ xol_vaddr = choose_insn_slot(area, ubp);
if (unlikely(!xol_vaddr))
return 0;
- arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
- &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
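+ /*
+ * Skip the copy, and the costly cache maintenance behind
+ * arch_uprobe_copy_ixol(), when the slot already holds the
+ * instructions to be executed out-of-line.
+ */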
+ if (memcmp((void *)xol_vaddr, &uprobe->arch.ixol,
+ sizeof(uprobe->arch.ixol)))
+ arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
+ &uprobe->arch.ixol,
+ sizeof(uprobe->arch.ixol));
return xol_vaddr;
}
@@ -1717,8 +1899,7 @@ void uprobe_free_utask(struct task_struct *t)
if (!utask)
return;
- if (utask->active_uprobe)
- put_uprobe(utask->active_uprobe);
+ delete_breakpoint(utask);
ri = utask->return_instances;
while (ri)
@@ -1739,8 +1920,11 @@ void uprobe_free_utask(struct task_struct *t)
*/
static struct uprobe_task *get_utask(void)
{
- if (!current->utask)
+ if (!current->utask) {
current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ if (!current->utask)
+ return NULL;
+ current->utask->breakpoints_tree = RB_ROOT;
+ rwlock_init(&current->utask->breakpoints_treelock);
+ }
return current->utask;
}
@@ -1921,7 +2105,8 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
/* Prepare to single-step probed instruction out of line. */
static int
-pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs,
+ struct uprobe_breakpoint *ubp)
{
struct uprobe_task *utask;
unsigned long xol_vaddr;
@@ -1931,12 +2116,12 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
if (!utask)
return -ENOMEM;
- xol_vaddr = xol_get_insn_slot(uprobe);
+ xol_vaddr = xol_get_insn_slot(ubp);
if (!xol_vaddr)
return -ENOMEM;
utask->xol_vaddr = xol_vaddr;
- utask->vaddr = bp_vaddr;
+ utask->vaddr = ubp->vaddr;
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
@@ -2182,32 +2367,19 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
*/
static void handle_swbp(struct pt_regs *regs)
{
+ struct uprobe_breakpoint *ubp;
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == get_trampoline_vaddr())
return handle_trampoline(regs);
- uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
- if (!uprobe) {
- if (is_swbp > 0) {
- /* No matching uprobe; signal SIGTRAP. */
- force_sig(SIGTRAP);
- } else {
- /*
- * Either we raced with uprobe_unregister() or we can't
- * access this memory. The latter is only possible if
- * another thread plays with our ->mm. In both cases
- * we can simply restart. If this vma was unmapped we
- * can pretend this insn was not executed yet and get
- * the (correct) SIGSEGV after restart.
- */
- instruction_pointer_set(regs, bp_vaddr);
- }
+ ubp = find_active_breakpoint(regs, bp_vaddr);
+ if (!ubp)
return;
- }
+
+ uprobe = ubp->uprobe;
/* change it in advance for ->handler() and restart */
instruction_pointer_set(regs, bp_vaddr);
@@ -2241,12 +2413,11 @@ static void handle_swbp(struct pt_regs *regs)
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr))
- return;
+ pre_ssout(uprobe, regs, ubp);
/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
- put_uprobe(uprobe);
+ put_ubp(ubp);
}
/*
@@ -2266,7 +2437,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
else
WARN_ON_ONCE(1);
- put_uprobe(uprobe);
utask->active_uprobe = NULL;
utask->state = UTASK_RUNNING;
xol_free_insn_slot(current);
--
2.34.1