[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ZYGrB7NsDEWk2liL@krava>
Date: Tue, 19 Dec 2023 15:39:03 +0100
From: Jiri Olsa <olsajiri@...il.com>
To: "Masami Hiramatsu (Google)" <mhiramat@...nel.org>
Cc: Alexei Starovoitov <alexei.starovoitov@...il.com>,
Steven Rostedt <rostedt@...dmis.org>,
Florent Revest <revest@...omium.org>,
linux-trace-kernel@...r.kernel.org,
LKML <linux-kernel@...r.kernel.org>,
Martin KaFai Lau <martin.lau@...ux.dev>, bpf <bpf@...r.kernel.org>,
Sven Schnelle <svens@...ux.ibm.com>,
Alexei Starovoitov <ast@...nel.org>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
Alan Maguire <alan.maguire@...cle.com>,
Mark Rutland <mark.rutland@....com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>, Guo Ren <guoren@...nel.org>
Subject: Re: [PATCH v5 28/34] fprobe: Rewrite fprobe on function-graph tracer
On Mon, Dec 18, 2023 at 10:17:10PM +0900, Masami Hiramatsu (Google) wrote:
SNIP
> -static void fprobe_exit_handler(struct rethook_node *rh, void *data,
> - unsigned long ret_ip, struct pt_regs *regs)
> +static int fprobe_entry(unsigned long func, unsigned long ret_ip,
> + struct ftrace_regs *fregs, struct fgraph_ops *gops)
> {
> - struct fprobe *fp = (struct fprobe *)data;
> - struct fprobe_rethook_node *fpr;
> - struct ftrace_regs *fregs = (struct ftrace_regs *)regs;
> - int bit;
> + struct fprobe_hlist_node *node, *first;
> + unsigned long *fgraph_data = NULL;
> + unsigned long header;
> + int reserved_words;
> + struct fprobe *fp;
> + int used, ret;
>
> - if (!fp || fprobe_disabled(fp))
> - return;
> + if (WARN_ON_ONCE(!fregs))
> + return 0;
>
> - fpr = container_of(rh, struct fprobe_rethook_node, node);
> + first = node = find_first_fprobe_node(func);
> + if (unlikely(!first))
> + return 0;
> +
> + reserved_words = 0;
> + hlist_for_each_entry_from_rcu(node, hlist) {
> + if (node->addr != func)
> + break;
> + fp = READ_ONCE(node->fp);
> + if (!fp || !fp->exit_handler)
> + continue;
> + /*
> + * Since fprobe can be enabled until the next loop, we ignore the
> + * fprobe's disabled flag in this loop.
> + */
> + reserved_words +=
> + DIV_ROUND_UP(fp->entry_data_size, sizeof(long)) + 1;
> + }
> + node = first;
> + if (reserved_words) {
> + fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
> + if (unlikely(!fgraph_data)) {
> + hlist_for_each_entry_from_rcu(node, hlist) {
> + if (node->addr != func)
> + break;
> + fp = READ_ONCE(node->fp);
> + if (fp && !fprobe_disabled(fp))
> + fp->nmissed++;
> + }
> + return 0;
> + }
> + }
this looks expensive compared to what we do now.. IIUC due to the graph
ops limitations (16 active ops), you have just a single graph ops for fprobe,
and each fprobe registration stores ips into a hash which you need to search
in here to get the registered callbacks..?
I wonder, would it make sense to allow an arbitrary number of active graph_ops
— at the price that some callback might fail because there's no stack space —
so each fprobe instance would have its own graph_ops.. and we would get rid
of the code above (and below)?
jirka
>
> /*
> - * we need to assure no calls to traceable functions in-between the
> - * end of fprobe_handler and the beginning of fprobe_exit_handler.
> + * TODO: recursion detection has been done in the fgraph. Thus we need
> + * to add a callback to increment missed counter.
> */
> - bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
> - if (bit < 0) {
> - fp->nmissed++;
> + used = 0;
> + hlist_for_each_entry_from_rcu(node, hlist) {
> + void *data;
> +
> + if (node->addr != func)
> + break;
> + fp = READ_ONCE(node->fp);
> + if (!fp || fprobe_disabled(fp))
> + continue;
> +
> + if (fp->entry_data_size && fp->exit_handler)
> + data = fgraph_data + used + 1;
> + else
> + data = NULL;
> +
> + if (fprobe_shared_with_kprobes(fp))
> + ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data);
> + else
> + ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
> + /* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
> + if (!ret && fp->exit_handler) {
> + int size_words = DIV_ROUND_UP(fp->entry_data_size, sizeof(long));
> +
> + header = encode_fprobe_header(fp, size_words);
> + if (likely(header)) {
> + fgraph_data[used] = header;
> + used += size_words + 1;
> + }
> + }
> + }
> + if (used < reserved_words)
> + memset(fgraph_data + used, 0, reserved_words - used);
> +
> + /* If any exit_handler is set, data must be used. */
> + return used != 0;
> +}
> +NOKPROBE_SYMBOL(fprobe_entry);
> +
> +static void fprobe_return(unsigned long func, unsigned long ret_ip,
> + struct ftrace_regs *fregs, struct fgraph_ops *gops)
> +{
> + unsigned long *fgraph_data = NULL;
> + unsigned long val;
> + struct fprobe *fp;
> + int size, curr;
> + int size_words;
> +
> + fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size);
> + if (!fgraph_data)
> + return;
> + size_words = DIV_ROUND_UP(size, sizeof(long));
> +
> + preempt_disable();
> +
> + curr = 0;
> + while (size_words > curr) {
> + val = fgraph_data[curr++];
> + if (!val)
> + break;
> +
> + size = decode_fprobe_header(val, &fp);
> + if (fp && is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
> + if (WARN_ON_ONCE(curr + size > size_words))
> + break;
> + fp->exit_handler(fp, func, ret_ip, fregs,
> + size ? fgraph_data + curr : NULL);
> + }
> + curr += size + 1;
> + }
> + preempt_enable();
> +}
> +NOKPROBE_SYMBOL(fprobe_return);
> +
> +static struct fgraph_ops fprobe_graph_ops = {
> + .entryregfunc = fprobe_entry,
> + .retregfunc = fprobe_return,
> +};
> +static int fprobe_graph_active;
> +
> +/* Add @addrs to the ftrace filter and register fgraph if needed. */
> +static int fprobe_graph_add_ips(unsigned long *addrs, int num)
> +{
> + int ret;
> +
> + lockdep_assert_held(&fprobe_mutex);
> +
> + ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
> + if (ret)
> + return ret;
> +
> + if (!fprobe_graph_active) {
> + ret = register_ftrace_graph(&fprobe_graph_ops);
> + if (WARN_ON_ONCE(ret)) {
> + ftrace_free_filter(&fprobe_graph_ops.ops);
> + return ret;
> + }
> + }
> + fprobe_graph_active++;
> + return 0;
> +}
> +
> +/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
> +static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
> +{
> + lockdep_assert_held(&fprobe_mutex);
> +
> + fprobe_graph_active--;
> + if (!fprobe_graph_active) {
> + /* Q: should we unregister it ? */
> + unregister_ftrace_graph(&fprobe_graph_ops);
> return;
> }
>
> - fp->exit_handler(fp, fpr->entry_ip, ret_ip, fregs,
> - fp->entry_data_size ? (void *)fpr->data : NULL);
> - ftrace_test_recursion_unlock(bit);
> + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
> }
> -NOKPROBE_SYMBOL(fprobe_exit_handler);
>
> static int symbols_cmp(const void *a, const void *b)
> {
> @@ -176,55 +406,97 @@ static unsigned long *get_ftrace_locations(const char **syms, int num)
> return ERR_PTR(-ENOENT);
> }
>
> -static void fprobe_init(struct fprobe *fp)
> -{
> - fp->nmissed = 0;
> - if (fprobe_shared_with_kprobes(fp))
> - fp->ops.func = fprobe_kprobe_handler;
> - else
> - fp->ops.func = fprobe_handler;
> - fp->ops.flags |= FTRACE_OPS_FL_SAVE_ARGS;
> -}
> +struct filter_match_data {
> + const char *filter;
> + const char *notfilter;
> + size_t index;
> + size_t size;
> + unsigned long *addrs;
> +};
>
> -static int fprobe_init_rethook(struct fprobe *fp, int num)
> +static int filter_match_callback(void *data, const char *name, unsigned long addr)
> {
> - int size;
> + struct filter_match_data *match = data;
>
> - if (num <= 0)
> - return -EINVAL;
> + if (!glob_match(match->filter, name) ||
> + (match->notfilter && glob_match(match->notfilter, name)))
> + return 0;
>
> - if (!fp->exit_handler) {
> - fp->rethook = NULL;
> + if (!ftrace_location(addr))
> return 0;
> - }
>
> - /* Initialize rethook if needed */
> - if (fp->nr_maxactive)
> - size = fp->nr_maxactive;
> - else
> - size = num * num_possible_cpus() * 2;
> - if (size <= 0)
> - return -EINVAL;
> + if (match->addrs)
> + match->addrs[match->index] = addr;
>
> - /* Initialize rethook */
> - fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
> - sizeof(struct fprobe_rethook_node), size);
> - if (IS_ERR(fp->rethook))
> - return PTR_ERR(fp->rethook);
> + match->index++;
> + return match->index == match->size;
> +}
>
> - return 0;
> +/*
> + * Make IP list from the filter/no-filter glob patterns.
> + * Return the number of matched symbols, or -ENOENT.
> + */
> +static int ip_list_from_filter(const char *filter, const char *notfilter,
> + unsigned long *addrs, size_t size)
> +{
> + struct filter_match_data match = { .filter = filter, .notfilter = notfilter,
> + .index = 0, .size = size, .addrs = addrs};
> + int ret;
> +
> + ret = kallsyms_on_each_symbol(filter_match_callback, &match);
> + if (ret < 0)
> + return ret;
> + ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match);
> + if (ret < 0)
> + return ret;
> +
> + return match.index ?: -ENOENT;
> }
>
> static void fprobe_fail_cleanup(struct fprobe *fp)
> {
> - if (!IS_ERR_OR_NULL(fp->rethook)) {
> - /* Don't need to cleanup rethook->handler because this is not used. */
> - rethook_free(fp->rethook);
> - fp->rethook = NULL;
> + kfree(fp->hlist_array);
> + fp->hlist_array = NULL;
> +}
> +
> +/* Initialize the fprobe data structure. */
> +static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
> +{
> + struct fprobe_hlist *hlist_array;
> + unsigned long addr;
> + int size, i;
> +
> + if (!fp || !addrs || num <= 0)
> + return -EINVAL;
> +
> + size = ALIGN(fp->entry_data_size, sizeof(long));
> + if (size > MAX_FPROBE_DATA_SIZE)
> + return -E2BIG;
> + fp->entry_data_size = size;
> +
> + hlist_array = kzalloc(struct_size(hlist_array, array, num), GFP_KERNEL);
> + if (!hlist_array)
> + return -ENOMEM;
> +
> + fp->nmissed = 0;
> +
> + hlist_array->size = num;
> + fp->hlist_array = hlist_array;
> + hlist_array->fp = fp;
> + for (i = 0; i < num; i++) {
> + hlist_array->array[i].fp = fp;
> + addr = ftrace_location(addrs[i]);
> + if (!addr) {
> + fprobe_fail_cleanup(fp);
> + return -ENOENT;
> + }
> + hlist_array->array[i].addr = addr;
> }
> - ftrace_free_filter(&fp->ops);
> + return 0;
> }
>
> +#define FPROBE_IPS_MAX INT_MAX
> +
> /**
> * register_fprobe() - Register fprobe to ftrace by pattern.
> * @fp: A fprobe data structure to be registered.
> @@ -238,46 +510,24 @@ static void fprobe_fail_cleanup(struct fprobe *fp)
> */
> int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
> {
> - struct ftrace_hash *hash;
> - unsigned char *str;
> - int ret, len;
> + unsigned long *addrs;
> + int ret;
>
> if (!fp || !filter)
> return -EINVAL;
>
> - fprobe_init(fp);
> -
> - len = strlen(filter);
> - str = kstrdup(filter, GFP_KERNEL);
> - ret = ftrace_set_filter(&fp->ops, str, len, 0);
> - kfree(str);
> - if (ret)
> + ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX);
> + if (ret < 0)
> return ret;
>
> - if (notfilter) {
> - len = strlen(notfilter);
> - str = kstrdup(notfilter, GFP_KERNEL);
> - ret = ftrace_set_notrace(&fp->ops, str, len, 0);
> - kfree(str);
> - if (ret)
> - goto out;
> - }
> -
> - /* TODO:
> - * correctly calculate the total number of filtered symbols
> - * from both filter and notfilter.
> - */
> - hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
> - if (WARN_ON_ONCE(!hash))
> - goto out;
> -
> - ret = fprobe_init_rethook(fp, (int)hash->count);
> - if (!ret)
> - ret = register_ftrace_function(&fp->ops);
> + addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL);
> + if (!addrs)
> + return -ENOMEM;
> + ret = ip_list_from_filter(filter, notfilter, addrs, ret);
> + if (ret > 0)
> + ret = register_fprobe_ips(fp, addrs, ret);
>
> -out:
> - if (ret)
> - fprobe_fail_cleanup(fp);
> + kfree(addrs);
> return ret;
> }
> EXPORT_SYMBOL_GPL(register_fprobe);
> @@ -285,7 +535,7 @@ EXPORT_SYMBOL_GPL(register_fprobe);
> /**
> * register_fprobe_ips() - Register fprobe to ftrace by address.
> * @fp: A fprobe data structure to be registered.
> - * @addrs: An array of target ftrace location addresses.
> + * @addrs: An array of target function address.
> * @num: The number of entries of @addrs.
> *
> * Register @fp to ftrace for enabling the probe on the address given by @addrs.
> @@ -297,23 +547,27 @@ EXPORT_SYMBOL_GPL(register_fprobe);
> */
> int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
> {
> - int ret;
> -
> - if (!fp || !addrs || num <= 0)
> - return -EINVAL;
> -
> - fprobe_init(fp);
> + struct fprobe_hlist *hlist_array;
> + int ret, i;
>
> - ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
> + ret = fprobe_init(fp, addrs, num);
> if (ret)
> return ret;
>
> - ret = fprobe_init_rethook(fp, num);
> - if (!ret)
> - ret = register_ftrace_function(&fp->ops);
> + mutex_lock(&fprobe_mutex);
> +
> + hlist_array = fp->hlist_array;
> + ret = fprobe_graph_add_ips(addrs, num);
> + if (!ret) {
> + add_fprobe_hash(fp);
> + for (i = 0; i < hlist_array->size; i++)
> + insert_fprobe_node(&hlist_array->array[i]);
> + }
> + mutex_unlock(&fprobe_mutex);
>
> if (ret)
> fprobe_fail_cleanup(fp);
> +
> return ret;
> }
> EXPORT_SYMBOL_GPL(register_fprobe_ips);
> @@ -351,14 +605,13 @@ EXPORT_SYMBOL_GPL(register_fprobe_syms);
>
> bool fprobe_is_registered(struct fprobe *fp)
> {
> - if (!fp || (fp->ops.saved_func != fprobe_handler &&
> - fp->ops.saved_func != fprobe_kprobe_handler))
> + if (!fp || !fp->hlist_array)
> return false;
> return true;
> }
>
> /**
> - * unregister_fprobe() - Unregister fprobe from ftrace
> + * unregister_fprobe() - Unregister fprobe.
> * @fp: A fprobe data structure to be unregistered.
> *
> * Unregister fprobe (and remove ftrace hooks from the function entries).
> @@ -367,23 +620,40 @@ bool fprobe_is_registered(struct fprobe *fp)
> */
> int unregister_fprobe(struct fprobe *fp)
> {
> - int ret;
> + struct fprobe_hlist *hlist_array;
> + unsigned long *addrs = NULL;
> + int ret = 0, i, count;
>
> - if (!fprobe_is_registered(fp))
> - return -EINVAL;
> + mutex_lock(&fprobe_mutex);
> + if (!fp || !is_fprobe_still_exist(fp)) {
> + ret = -EINVAL;
> + goto out;
> + }
>
> - if (!IS_ERR_OR_NULL(fp->rethook))
> - rethook_stop(fp->rethook);
> + hlist_array = fp->hlist_array;
> + addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
> + if (!addrs) {
> + ret = -ENOMEM; /* TODO: Fallback to one-by-one loop */
> + goto out;
> + }
>
> - ret = unregister_ftrace_function(&fp->ops);
> - if (ret < 0)
> - return ret;
> + /* Remove non-synonim ips from table and hash */
> + count = 0;
> + for (i = 0; i < hlist_array->size; i++) {
> + if (!delete_fprobe_node(&hlist_array->array[i]))
> + addrs[count++] = hlist_array->array[i].addr;
> + }
> + del_fprobe_hash(fp);
>
> - if (!IS_ERR_OR_NULL(fp->rethook))
> - rethook_free(fp->rethook);
> + fprobe_graph_remove_ips(addrs, count);
>
> - ftrace_free_filter(&fp->ops);
> + kfree_rcu(hlist_array, rcu);
> + fp->hlist_array = NULL;
>
> +out:
> + mutex_unlock(&fprobe_mutex);
> +
> + kfree(addrs);
> return ret;
> }
> EXPORT_SYMBOL_GPL(unregister_fprobe);
> diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c
> index 271ce0caeec0..cf92111b5c79 100644
> --- a/lib/test_fprobe.c
> +++ b/lib/test_fprobe.c
> @@ -17,10 +17,8 @@ static u32 rand1, entry_val, exit_val;
> /* Use indirect calls to avoid inlining the target functions */
> static u32 (*target)(u32 value);
> static u32 (*target2)(u32 value);
> -static u32 (*target_nest)(u32 value, u32 (*nest)(u32));
> static unsigned long target_ip;
> static unsigned long target2_ip;
> -static unsigned long target_nest_ip;
> static int entry_return_value;
>
> static noinline u32 fprobe_selftest_target(u32 value)
> @@ -33,11 +31,6 @@ static noinline u32 fprobe_selftest_target2(u32 value)
> return (value / div_factor) + 1;
> }
>
> -static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32))
> -{
> - return nest(value + 2);
> -}
> -
> static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip,
> unsigned long ret_ip,
> struct ftrace_regs *fregs, void *data)
> @@ -79,22 +72,6 @@ static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip,
> KUNIT_EXPECT_NULL(current_test, data);
> }
>
> -static notrace int nest_entry_handler(struct fprobe *fp, unsigned long ip,
> - unsigned long ret_ip,
> - struct ftrace_regs *fregs, void *data)
> -{
> - KUNIT_EXPECT_FALSE(current_test, preemptible());
> - return 0;
> -}
> -
> -static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip,
> - unsigned long ret_ip,
> - struct ftrace_regs *fregs, void *data)
> -{
> - KUNIT_EXPECT_FALSE(current_test, preemptible());
> - KUNIT_EXPECT_EQ(current_test, ip, target_nest_ip);
> -}
> -
> /* Test entry only (no rethook) */
> static void test_fprobe_entry(struct kunit *test)
> {
> @@ -191,25 +168,6 @@ static void test_fprobe_data(struct kunit *test)
> KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
> }
>
> -/* Test nr_maxactive */
> -static void test_fprobe_nest(struct kunit *test)
> -{
> - static const char *syms[] = {"fprobe_selftest_target", "fprobe_selftest_nest_target"};
> - struct fprobe fp = {
> - .entry_handler = nest_entry_handler,
> - .exit_handler = nest_exit_handler,
> - .nr_maxactive = 1,
> - };
> -
> - current_test = test;
> - KUNIT_EXPECT_EQ(test, 0, register_fprobe_syms(&fp, syms, 2));
> -
> - target_nest(rand1, target);
> - KUNIT_EXPECT_EQ(test, 1, fp.nmissed);
> -
> - KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
> -}
> -
> static void test_fprobe_skip(struct kunit *test)
> {
> struct fprobe fp = {
> @@ -247,10 +205,8 @@ static int fprobe_test_init(struct kunit *test)
> rand1 = get_random_u32_above(div_factor);
> target = fprobe_selftest_target;
> target2 = fprobe_selftest_target2;
> - target_nest = fprobe_selftest_nest_target;
> target_ip = get_ftrace_location(target);
> target2_ip = get_ftrace_location(target2);
> - target_nest_ip = get_ftrace_location(target_nest);
>
> return 0;
> }
> @@ -260,7 +216,6 @@ static struct kunit_case fprobe_testcases[] = {
> KUNIT_CASE(test_fprobe),
> KUNIT_CASE(test_fprobe_syms),
> KUNIT_CASE(test_fprobe_data),
> - KUNIT_CASE(test_fprobe_nest),
> KUNIT_CASE(test_fprobe_skip),
> {}
> };
>
Powered by blists - more mailing lists