Message-ID: <20201022165229.34cd5141@gandalf.local.home>
Date: Thu, 22 Oct 2020 16:52:29 -0400
From: Steven Rostedt <rostedt@...dmis.org>
To: Jiri Olsa <jolsa@...hat.com>
Cc: Jiri Olsa <jolsa@...nel.org>, Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
Andrii Nakryiko <andriin@...com>, netdev@...r.kernel.org,
bpf@...r.kernel.org, Martin KaFai Lau <kafai@...com>,
Song Liu <songliubraving@...com>, Yonghong Song <yhs@...com>,
John Fastabend <john.fastabend@...il.com>,
KP Singh <kpsingh@...omium.org>, Daniel Xu <dxu@...uu.xyz>,
Jesper Brouer <jbrouer@...hat.com>,
Toke Høiland-Jørgensen <toke@...hat.com>,
Viktor Malik <vmalik@...hat.com>
Subject: Re: [RFC bpf-next 00/16] bpf: Speed up trampoline attach
On Thu, 22 Oct 2020 12:21:50 -0400
Steven Rostedt <rostedt@...dmis.org> wrote:
> On Thu, 22 Oct 2020 10:42:05 -0400
> Steven Rostedt <rostedt@...dmis.org> wrote:
>
> > I'd like to see how batch functions will work. I guess I need to start
> > looking at the bpf trampoline, to see if we can modify the ftrace
> > trampoline to have quick access to parameters. It would be much more
> > beneficial to update the existing generic function tracer to have access to
> > function parameters that all users could benefit from, than to tweak a
> > single use case into giving this feature to a single user.
>
> Looking at the creation of the bpf trampoline, I think I can modify ftrace
> to have a more flexible callback. Something that passes the callback the
> following:
>
>  - the function being traced
>  - a pointer to the parent caller (which could be modified)
>  - a pointer to the original stack frame (what the stack was when the
>    function was entered)
>  - an array of the arguments of the function (which could also be modified)
>
> This is a change I've been wanting to make for some time, because it would
> allow function graph to be a user of function tracer, and would give
> everything access to the arguments.
>
> We would still need a helper function to store all regs to keep kprobes
> working unmodified, but this would still only be done if asked.
>
> The above change shouldn't hurt performance for either ftrace or bpf
> because it appears they both do the same thing. If BPF wants to have batch
> processing of functions, then I think we should modify ftrace to do this
> new approach instead of creating another set of function trampolines.
Below is a quick proof of concept patch I whipped up. It will always
save 6 arguments, so if BPF is really interested in just saving the bare
minimum of arguments before calling, it can still use direct. But if you
are going to have a generic callback, you'll need to save all parameters,
otherwise you can corrupt the function's parameters if the traced function
uses more than you save.
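For reference, the callback signature the patch below adds is:

	typedef void (*ftrace_args_func_t)(unsigned long ip, unsigned long *parent_ip,
					   struct ftrace_ops *op, unsigned long *stack,
					   unsigned long *args);

That is, the callback gets the traced function, a writable pointer to the
parent return address, the ftrace_ops, the location of the stack at function
entry, and the saved argument registers as an array.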
While looking at the bpf trampoline code, I noticed that if the verifier
can't find the BTF func, it falls back to saving 5 parameters, which can be
a bug on x86 if the function itself uses 6 or more. If you only save 5
parameters and then call a bpf program that calls a helper function using
more than 5 parameters, it will likely corrupt the 6th parameter of the
function being traced.
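To make that concrete, take a (made up) function with six arguments:

	/* On x86_64, a..f are passed in %rdi, %rsi, %rdx, %rcx, %r8, %r9 */
	noinline long traced_func(long a, long b, long c, long d, long e, long f)
	{
		return a + b + c + d + e + f;
	}

If the trampoline only saves and restores the first five argument registers
around the callback, then whatever the bpf program (or a helper it calls)
happens to leave in %r9 is what traced_func will see as 'f' when it finally
runs.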
The verifier code in question is this:
int btf_distill_func_proto(struct bpf_verifier_log *log,
			   struct btf *btf,
			   const struct btf_type *func,
			   const char *tname,
			   struct btf_func_model *m)
{
	const struct btf_param *args;
	const struct btf_type *t;
	u32 i, nargs;
	int ret;

	if (!func) {
		/* BTF function prototype doesn't match the verifier types.
		 * Fall back to 5 u64 args.
		 */
		for (i = 0; i < 5; i++)
			m->arg_size[i] = 8;
		m->ret_size = 8;
		m->nr_args = 5;
		return 0;
	}
Shouldn't it be falling back to 6, not 5?
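That is, something like this (untested, just to show what I mean):

	if (!func) {
		/* BTF function prototype doesn't match the verifier types.
		 * Fall back to 6 u64 args to cover all argument registers
		 * used on x86_64.
		 */
		for (i = 0; i < 6; i++)
			m->arg_size[i] = 8;
		m->ret_size = 8;
		m->nr_args = 6;
		return 0;
	}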
-- Steve
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 7edbd5ee5ed4..b65d73f430ed 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -287,6 +287,10 @@ extern void ftrace_caller_end(void);
extern void ftrace_caller_op_ptr(void);
extern void ftrace_regs_caller_op_ptr(void);
extern void ftrace_regs_caller_jmp(void);
+extern void ftrace_args_caller(void);
+extern void ftrace_args_call(void);
+extern void ftrace_args_caller_end(void);
+extern void ftrace_args_caller_op_ptr(void);
/* movq function_trace_op(%rip), %rdx */
/* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */
@@ -317,7 +321,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long end_offset;
unsigned long op_offset;
unsigned long call_offset;
- unsigned long jmp_offset;
+ unsigned long jmp_offset = 0;
unsigned long offset;
unsigned long npages;
unsigned long size;
@@ -336,12 +340,16 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
call_offset = (unsigned long)ftrace_regs_call;
jmp_offset = (unsigned long)ftrace_regs_caller_jmp;
+ } else if (ops->flags & FTRACE_OPS_FL_ARGS) {
+ start_offset = (unsigned long)ftrace_args_caller;
+ end_offset = (unsigned long)ftrace_args_caller_end;
+ op_offset = (unsigned long)ftrace_args_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_args_call;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_caller_end;
op_offset = (unsigned long)ftrace_caller_op_ptr;
call_offset = (unsigned long)ftrace_call;
- jmp_offset = 0;
}
size = end_offset - start_offset;
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index ac3d5f22fe64..65ca634d0b37 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -176,6 +176,58 @@ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
retq
SYM_FUNC_END(ftrace_epilogue)
+SYM_FUNC_START(ftrace_args_caller)
+#ifdef CONFIG_FRAME_POINTER
+ pushq %rbp
+ movq %rsp, %rbp
+# define CALLED_OFFSET (7 * 8)
+# define PARENT_OFFSET (8 * 8)
+#else
+# define CALLED_OFFSET (6 * 8)
+# define PARENT_OFFSET (7 * 8)
+#endif
+ /* save args */
+ pushq %r9
+ pushq %r8
+ pushq %rcx
+ pushq %rdx
+ pushq %rsi
+ pushq %rdi
+
+ /*
+ * Parameters:
+ * Called site (function called)
+ * Address of parent location
+ * pointer to ftrace_ops
+ * Location of stack when function was called
+ * Array of arguments.
+ */
+ movq CALLED_OFFSET(%rsp), %rdi
+ leaq PARENT_OFFSET(%rsp), %rsi
+SYM_INNER_LABEL(ftrace_args_caller_op_ptr, SYM_L_GLOBAL)
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+ movq %rsi, %rcx
+ leaq 0(%rsp), %r8
+
+SYM_INNER_LABEL(ftrace_args_call, SYM_L_GLOBAL)
+ callq ftrace_stub
+
+ popq %rdi
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %r8
+ popq %r9
+
+#ifdef CONFIG_FRAME_POINTER
+ popq %rbp
+#endif
+
+SYM_INNER_LABEL(ftrace_args_caller_end, SYM_L_GLOBAL)
+ jmp ftrace_epilogue
+SYM_FUNC_END(ftrace_args_caller)
+
SYM_FUNC_START(ftrace_regs_caller)
/* Save the current flags before any operations that can change them */
pushfq
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1bd3a0356ae4..0d077e8d7bb4 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -92,6 +92,17 @@ struct ftrace_ops;
typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs);
+typedef void (*ftrace_args_func_t)(unsigned long ip, unsigned long *parent_ip,
+ struct ftrace_ops *op, unsigned long *stack,
+ unsigned long *args);
+
+union ftrace_callback {
+ ftrace_func_t func;
+ ftrace_args_func_t args_func;
+};
+
+typedef union ftrace_callback ftrace_callback_t;
+
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
/*
@@ -169,6 +180,7 @@ enum {
FTRACE_OPS_FL_TRACE_ARRAY = BIT(15),
FTRACE_OPS_FL_PERMANENT = BIT(16),
FTRACE_OPS_FL_DIRECT = BIT(17),
+ FTRACE_OPS_FL_ARGS = BIT(18),
};
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -447,9 +459,11 @@ enum {
FTRACE_FL_DISABLED = (1UL << 25),
FTRACE_FL_DIRECT = (1UL << 24),
FTRACE_FL_DIRECT_EN = (1UL << 23),
+ FTRACE_FL_ARGS = (1UL << 22),
+ FTRACE_FL_ARGS_EN = (1UL << 21),
};
-#define FTRACE_REF_MAX_SHIFT 23
+#define FTRACE_REF_MAX_SHIFT 21
#define FTRACE_REF_MAX ((1UL << FTRACE_REF_MAX_SHIFT) - 1)
#define ftrace_rec_count(rec) ((rec)->flags & FTRACE_REF_MAX)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4833b6a82ce7..5632b0809dc0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1721,6 +1721,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (ops->flags & FTRACE_OPS_FL_DIRECT)
rec->flags |= FTRACE_FL_DIRECT;
+ else if (ops->flags & FTRACE_OPS_FL_ARGS)
+ rec->flags |= FTRACE_FL_ARGS;
+
/*
* If there's only a single callback registered to a
* function, and the ops has a trampoline registered
@@ -1757,6 +1760,10 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (ops->flags & FTRACE_OPS_FL_DIRECT)
rec->flags &= ~FTRACE_FL_DIRECT;
+ /* POC: but we will have more than one */
+ if (ops->flags & FTRACE_OPS_FL_ARGS)
+ rec->flags &= ~FTRACE_FL_ARGS;
+
/*
* If the rec had REGS enabled and the ops that is
* being removed had REGS set, then see if there is
@@ -2103,6 +2110,13 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
!(rec->flags & FTRACE_FL_TRAMP_EN))
flag |= FTRACE_FL_TRAMP;
+ /* Proof of concept */
+ if (ftrace_rec_count(rec) == 1) {
+ if (!(rec->flags & FTRACE_FL_ARGS) !=
+ !(rec->flags & FTRACE_FL_ARGS_EN))
+ flag |= FTRACE_FL_ARGS;
+ }
+
/*
* Direct calls are special, as count matters.
* We must test the record for direct, if the
@@ -2144,6 +2158,17 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
else
rec->flags &= ~FTRACE_FL_TRAMP_EN;
}
+ if (flag & FTRACE_FL_ARGS) {
+ if (ftrace_rec_count(rec) == 1) {
+ if (rec->flags & FTRACE_FL_ARGS)
+ rec->flags |= FTRACE_FL_ARGS_EN;
+ else
+ rec->flags &= ~FTRACE_FL_ARGS_EN;
+ } else {
+ rec->flags &= ~FTRACE_FL_ARGS_EN;
+ }
+ }
+
if (flag & FTRACE_FL_DIRECT) {
/*
* If there's only one user (direct_ops helper)
@@ -2192,7 +2217,8 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
* and REGS states. The _EN flags must be disabled though.
*/
rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
- FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN);
+ FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN |
+ FTRACE_FL_ARGS_EN);
}
ftrace_bug_type = FTRACE_BUG_NOP;
@@ -3630,7 +3656,8 @@ static int t_show(struct seq_file *m, void *v)
ftrace_rec_count(rec),
rec->flags & FTRACE_FL_REGS ? " R" : " ",
rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ",
- rec->flags & FTRACE_FL_DIRECT ? " D" : " ");
+ rec->flags & FTRACE_FL_DIRECT ? " D" :
+ rec->flags & FTRACE_FL_ARGS ? " A" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
if (ops) {
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 2c2126e1871d..a3da84b0e599 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -86,9 +86,48 @@ void ftrace_destroy_function_files(struct trace_array *tr)
ftrace_free_ftrace_ops(tr);
}
+static void function_args_trace_call(unsigned long ip,
+ unsigned long *parent_ip,
+ struct ftrace_ops *op,
+ unsigned long *stack,
+ unsigned long *args)
+{
+ struct trace_array *tr = op->private;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int bit;
+ int cpu;
+ int pc;
+
+ if (unlikely(!tr->function_enabled))
+ return;
+
+ pc = preempt_count();
+ preempt_disable_notrace();
+
+ bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
+ if (bit < 0)
+ goto out;
+
+ cpu = smp_processor_id();
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
+ if (!atomic_read(&data->disabled)) {
+ local_save_flags(flags);
+ trace_function(tr, ip, *parent_ip, flags, pc);
+ trace_printk("%pS %lx %lx %lx %lx %lx %lx\n",
+ (void *)ip, args[0], args[1], args[2], args[3],
+ args[4], args[5]);
+ }
+ trace_clear_recursion(bit);
+
+ out:
+ preempt_enable_notrace();
+
+}
+
static int function_trace_init(struct trace_array *tr)
{
- ftrace_func_t func;
+ ftrace_callback_t callback;
/*
* Instance trace_arrays get their ops allocated
@@ -101,11 +140,14 @@ static int function_trace_init(struct trace_array *tr)
/* Currently only the global instance can do stack tracing */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
func_flags.val & TRACE_FUNC_OPT_STACK)
- func = function_stack_trace_call;
- else
- func = function_trace_call;
+ callback.func = function_stack_trace_call;
+ else {
+ tr->ops->flags |= FTRACE_OPS_FL_ARGS;
+ callback.args_func = function_args_trace_call;
+ }
+// func = function_trace_call;
- ftrace_init_array_ops(tr, func);
+ ftrace_init_array_ops(tr, callback.func);
tr->array_buffer.cpu = get_cpu();
put_cpu();