Message-ID: <aHkW8gS1eOTfGFA1@krava>
Date: Thu, 17 Jul 2025 17:29:54 +0200
From: Jiri Olsa <olsajiri@...il.com>
To: Jiri Olsa <olsajiri@...il.com>, Peter Zijlstra <peterz@...radead.org>
Cc: Oleg Nesterov <oleg@...hat.com>, Andrii Nakryiko <andrii@...nel.org>,
bpf@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-trace-kernel@...r.kernel.org, x86@...nel.org,
Song Liu <songliubraving@...com>, Yonghong Song <yhs@...com>,
John Fastabend <john.fastabend@...il.com>,
Hao Luo <haoluo@...gle.com>, Steven Rostedt <rostedt@...dmis.org>,
Masami Hiramatsu <mhiramat@...nel.org>,
Alan Maguire <alan.maguire@...cle.com>,
David Laight <David.Laight@...lab.com>,
Thomas Weißschuh <thomas@...ch.de>,
Ingo Molnar <mingo@...nel.org>
Subject: Re: [PATCHv5 perf/core 10/22] uprobes/x86: Add support to optimize
uprobes
On Mon, Jul 14, 2025 at 11:29:07PM +0200, Jiri Olsa wrote:
SNIP
> > > +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> > > + unsigned long vaddr)
> > > +{
> > > + uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
> > > + struct write_opcode_ctx ctx = {
> > > + .base = vaddr,
> > > + };
> > > + int err;
> > > +
> > > + /*
> > > + * We need to overwrite call instruction into nop5 instruction with
> > > + * breakpoint (int3) installed on top of its first byte. We will:
> > > + *
> > > + * - overwrite call opcode with breakpoint (int3)
> > > + * - sync cores
> > > + * - write last 4 bytes of the nop5 instruction
> > > + * - sync cores
> > > + */
> > > +
> > > + ctx.update = UNOPT_INT3;
> > > + err = write_insn(auprobe, vma, vaddr, &int3, 1, &ctx);
> > > + if (err)
> > > + return err;
> > > +
> > > + smp_text_poke_sync_each_cpu();
> > > +
> > > + ctx.update = UNOPT_PART;
> > > + err = write_insn(auprobe, vma, vaddr + 1, (uprobe_opcode_t *) auprobe->insn + 1, 4, &ctx);
> > > +
> > > + smp_text_poke_sync_each_cpu();
> > > + return err;
> > > +}
> >
> > Please unify these two functions; it makes absolutely no sense to have
> > two copies of this logic around.
>
> will try to come up with something
>
would the change below be ok? it unifies both paths into a single
int3_update function
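the flow in a nutshell (standalone C sketch only, not the real code --
text[] and sync_cores() just stand in for the patched instruction bytes
and smp_text_poke_sync_each_cpu(), the actual writes below go through
uprobe_write()):

  #include <stdbool.h>
  #include <string.h>

  #define INT3 0xcc

  static unsigned char text[5];		/* bytes being patched */

  static void sync_cores(void)
  {
  	/* models smp_text_poke_sync_each_cpu() */
  }

  static void int3_update_model(const unsigned char *insn, bool optimize)
  {
  	/*
  	 * optimize == true:  swbp -> call, int3 is already installed
  	 * optimize == false: call -> original insn, install int3 first
  	 */
  	if (!optimize)
  		text[0] = INT3;
  	sync_cores();

  	/* write all but the first byte of the new instruction */
  	memcpy(text + 1, insn + 1, 4);
  	sync_cores();

  	/* replace int3 with the first byte of the new instruction */
  	text[0] = insn[0];
  	sync_cores();
  }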
thanks,
jirka
---
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 678fb546f0a7..1ee2e5115955 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
#define UPROBE_SWBP_INSN 0xcc
#define UPROBE_SWBP_INSN_SIZE 1
+enum {
+ ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0,
+ ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1,
+};
+
struct uprobe_xol_ops;
struct arch_uprobe {
@@ -45,6 +50,8 @@ struct arch_uprobe {
u8 ilen;
} push;
};
+
+ unsigned long flags;
};
struct arch_uprobe_task {
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index d18e1ae59901..9baf42723ec6 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -18,6 +18,7 @@
#include <asm/processor.h>
#include <asm/insn.h>
#include <asm/mmu_context.h>
+#include <asm/nops.h>
/* Post-execution fixups. */
@@ -702,7 +703,6 @@ static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
return tramp;
}
-__maybe_unused
static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
{
struct uprobes_state *state = &current->mm->uprobes_state;
@@ -891,6 +891,277 @@ static int __init arch_uprobes_init(void)
late_initcall(arch_uprobes_init);
+enum {
+ EXPECT_SWBP,
+ EXPECT_CALL,
+};
+
+struct write_opcode_ctx {
+ unsigned long base;
+ int expect;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+ return *insn == CALL_INSN_OPCODE;
+}
+
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+ int nbytes, void *data)
+{
+ struct write_opcode_ctx *ctx = data;
+ uprobe_opcode_t old_opcode[5];
+
+ uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+ switch (ctx->expect) {
+ case EXPECT_SWBP:
+ if (is_swbp_insn(&old_opcode[0]))
+ return 1;
+ break;
+ case EXPECT_CALL:
+ if (is_call_insn(&old_opcode[0]))
+ return 1;
+ break;
+ }
+
+ return -1;
+}
+
+/*
+ * Stolen comment from smp_text_poke_batch_finish.
+ *
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ *
+ * The way it is done:
+ * - Add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
+ * - Update all but the first byte of the patched range
+ * - SMP sync all CPUs
+ * - Replace the first byte (INT3) by the first byte of the replacing opcode
+ * - SMP sync all CPUs
+ */
+static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, char *insn, bool optimize)
+{
+ uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+ struct write_opcode_ctx ctx = {
+ .base = vaddr,
+ };
+ int err;
+
+ /*
+ * Write int3 trap.
+ *
+ * The swbp_optimize path comes with the breakpoint already installed,
+ * so we can skip this step for optimize == true.
+ */
+ if (!optimize) {
+ ctx.expect = EXPECT_CALL;
+ err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+ }
+
+ smp_text_poke_sync_each_cpu();
+
+ /* Write all but the first byte of the patched range. */
+ ctx.expect = EXPECT_SWBP;
+ err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+
+ /*
+ * Write first byte.
+ *
+ * The swbp_unoptimize needs to finish uprobe removal together
+ * with ref_ctr update, using uprobe_write with proper flags.
+ */
+ err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
+ optimize /* is_register */, !optimize /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+ return 0;
+}
+
+static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, unsigned long tramp)
+{
+ u8 call[5];
+
+ __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+ (const void *) tramp, CALL_INSN_SIZE);
+ return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+}
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+}
+
+static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
+{
+ unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ uprobe_copy_from_page(page, vaddr, dst, len);
+ put_page(page);
+ return 0;
+}
+
+static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+{
+ struct __packed __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } *call = (struct __arch_relative_insn *) insn;
+
+ if (!is_call_insn(insn))
+ return false;
+ return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+}
+
+static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized)
+{
+ uprobe_opcode_t insn[5];
+ int err;
+
+ err = copy_from_vaddr(mm, vaddr, &insn, 5);
+ if (err)
+ return err;
+ *optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+ return 0;
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+ return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
+ test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (should_optimize(auprobe)) {
+ bool optimized = false;
+ int err;
+
+ /*
+ * We could race with another thread that already optimized the probe,
+ * so let's not overwrite it with int3 again in this case.
+ */
+ err = is_optimized(vma->vm_mm, vaddr, &optimized);
+ if (err)
+ return err;
+ if (optimized)
+ return 0;
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
+ true /* is_register */);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
+ struct mm_struct *mm = vma->vm_mm;
+ bool optimized = false;
+ int err;
+
+ err = is_optimized(mm, vaddr, &optimized);
+ if (err)
+ return err;
+ if (optimized) {
+ err = swbp_unoptimize(auprobe, vma, vaddr);
+ WARN_ON_ONCE(err);
+ return err;
+ }
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
+ false /* is_register */);
+}
+
+static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+ unsigned long vaddr)
+{
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+ bool new = false;
+ int err = 0;
+
+ vma = find_vma(mm, vaddr);
+ if (!vma)
+ return -EINVAL;
+ tramp = get_uprobe_trampoline(vaddr, &new);
+ if (!tramp)
+ return -EINVAL;
+ err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
+ if (WARN_ON_ONCE(err) && new)
+ destroy_uprobe_trampoline(tramp);
+ return err;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn[5];
+
+ /*
+ * Do not optimize if shadow stack is enabled: the return address hijack
+ * code in arch_uretprobe_hijack_return_addr updates the wrong frame when
+ * the entry uprobe is optimized, and the shadow stack then crashes the app.
+ */
+ if (shstk_is_enabled())
+ return;
+
+ if (!should_optimize(auprobe))
+ return;
+
+ mmap_write_lock(mm);
+
+ /*
+ * Check if some other thread already optimized the uprobe for us;
+ * if that's the case, just return silently.
+ */
+ if (copy_from_vaddr(mm, vaddr, &insn, 5))
+ goto unlock;
+ if (!is_swbp_insn((uprobe_opcode_t*) &insn))
+ goto unlock;
+
+ /*
+ * If we fail to optimize the uprobe we set the fail bit so the
+ * above should_optimize will fail from now on.
+ */
+ if (__arch_uprobe_optimize(auprobe, mm, vaddr))
+ set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
+
+unlock:
+ mmap_write_unlock(mm);
+}
+
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ if (memcmp(&auprobe->insn, x86_nops[5], 5))
+ return false;
+ /* We can't do cross page atomic writes yet. */
+ return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
#else /* 32-bit: */
/*
* No RIP-relative addressing on 32-bit
@@ -904,6 +1175,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ return false;
+}
#endif /* CONFIG_X86_64 */
struct uprobe_xol_ops {
@@ -1270,6 +1545,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
if (ret)
return ret;
+ if (can_optimize(auprobe, addr))
+ set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
ret = branch_setup_xol_ops(auprobe, &insn);
if (ret != -ENOSYS)
return ret;
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b6b077cc7d0f..08ef78439d0d 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -192,7 +192,7 @@ struct uprobes_state {
};
typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
- uprobe_opcode_t *insn, int nbytes);
+ uprobe_opcode_t *insn, int nbytes, void *data);
extern void __init uprobes_init(void);
extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
@@ -204,7 +204,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t,
bool is_register);
extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
- uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr);
+ uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+ void *data);
extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
@@ -240,6 +241,7 @@ extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *
extern void arch_uprobe_clear_state(struct mm_struct *mm);
extern void arch_uprobe_init_state(struct mm_struct *mm);
extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
+extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
#else /* !CONFIG_UPROBES */
struct uprobes_state {
};
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cbba31c0495f..e54081beeab9 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -192,7 +192,7 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
}
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
- int nbytes)
+ int nbytes, void *data)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -492,12 +492,13 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
bool is_register)
{
return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
- verify_opcode, is_register, true /* do_update_ref_ctr */);
+ verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
}
int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
- uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr)
+ uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+ void *data)
{
const unsigned long vaddr = insn_vaddr & PAGE_MASK;
struct mm_struct *mm = vma->vm_mm;
@@ -531,7 +532,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
goto out;
folio = page_folio(page);
- ret = verify(page, insn_vaddr, insn, nbytes);
+ ret = verify(page, insn_vaddr, insn, nbytes, data);
if (ret <= 0) {
folio_put(folio);
goto out;
@@ -2697,6 +2698,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
return true;
}
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2761,6 +2766,9 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
+ /* Try to optimize after first hit. */
+ arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;