Message-Id: <20250815004023.144cfbd9ae39fac9ce80ee98@kernel.org>
Date: Fri, 15 Aug 2025 00:40:23 +0900
From: Masami Hiramatsu (Google) <mhiramat@...nel.org>
To: Menglong Dong <menglong8.dong@...il.com>
Cc: olsajiri@...il.com, rostedt@...dmis.org, mathieu.desnoyers@...icios.com,
hca@...ux.ibm.com, revest@...omium.org, linux-kernel@...r.kernel.org,
linux-trace-kernel@...r.kernel.org, bpf@...r.kernel.org
Subject: Re: [PATCH bpf-next v3 1/4] fprobe: use rhltable for
fprobe_ip_table
On Thu, 31 Jul 2025 17:24:24 +0800
Menglong Dong <menglong8.dong@...il.com> wrote:
> Currently, all the kernel functions that are hooked by an fprobe are
> added to the hash table "fprobe_ip_table". Its key is the function
> address, and its value is a "struct fprobe_hlist_node".
>
> The hash table has a fixed bucket count of FPROBE_IP_TABLE_SIZE, which
> is 256. This means the lookup overhead grows linearly once the number
> of functions hooked by fprobes exceeds 256. When we try to hook all the
> kernel functions, the overhead will be huge.
>
> Therefore, replace the hash table with an rhltable to reduce the
> overhead.
>
Hi Menglong,
Thanks for the update, I just have a few nitpicks.
> Signed-off-by: Menglong Dong <dongml2@...natelecom.cn>
> ---
> v3:
> - some formatting optimizations
> - handle the error returned from rhltable_insert in
> insert_fprobe_node
> ---
> include/linux/fprobe.h | 3 +-
> kernel/trace/fprobe.c | 154 +++++++++++++++++++++++------------------
> 2 files changed, 90 insertions(+), 67 deletions(-)
>
> diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
> index 702099f08929..f5d8982392b9 100644
> --- a/include/linux/fprobe.h
> +++ b/include/linux/fprobe.h
> @@ -7,6 +7,7 @@
> #include <linux/ftrace.h>
> #include <linux/rcupdate.h>
> #include <linux/refcount.h>
> +#include <linux/rhashtable.h>
nit: can you also include this header file in fprobe.c?
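Something like this in kernel/trace/fprobe.c (untested; the exact place
in fprobe.c's include list is up to you), so that fprobe.c does not rely
on picking up rhashtable.h indirectly via fprobe.h:

	#include <linux/rhashtable.h>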
> #include <linux/slab.h>
>
> struct fprobe;
> @@ -26,7 +27,7 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip,
> * @fp: The fprobe which owns this.
> */
> struct fprobe_hlist_node {
> - struct hlist_node hlist;
> + struct rhlist_head hlist;
> unsigned long addr;
> struct fprobe *fp;
> };
> diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
> index ba7ff14f5339..2f1683a26c10 100644
> --- a/kernel/trace/fprobe.c
> +++ b/kernel/trace/fprobe.c
> @@ -41,47 +41,46 @@
> * - RCU hlist traversal under disabling preempt
> */
> static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
> -static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
> +static struct rhltable fprobe_ip_table;
> static DEFINE_MUTEX(fprobe_mutex);
>
> -/*
> - * Find first fprobe in the hlist. It will be iterated twice in the entry
> - * probe, once for correcting the total required size, the second time is
> - * calling back the user handlers.
> - * Thus the hlist in the fprobe_table must be sorted and new probe needs to
> - * be added *before* the first fprobe.
> - */
> -static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
> +static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed)
> {
> - struct fprobe_hlist_node *node;
> - struct hlist_head *head;
> + return hash_ptr(*(unsigned long **)data, 32);
> +}
>
> - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
> - hlist_for_each_entry_rcu(node, head, hlist,
> - lockdep_is_held(&fprobe_mutex)) {
> - if (node->addr == ip)
> - return node;
> - }
> - return NULL;
> +static int fprobe_node_cmp(struct rhashtable_compare_arg *arg,
> + const void *ptr)
> +{
> + unsigned long key = *(unsigned long *)arg->key;
> + const struct fprobe_hlist_node *n = ptr;
> +
> + return n->addr != key;
> }
> -NOKPROBE_SYMBOL(find_first_fprobe_node);
>
> -/* Node insertion and deletion requires the fprobe_mutex */
> -static void insert_fprobe_node(struct fprobe_hlist_node *node)
> +static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed)
> {
> - unsigned long ip = node->addr;
> - struct fprobe_hlist_node *next;
> - struct hlist_head *head;
> + const struct fprobe_hlist_node *n = data;
> +
> + return hash_ptr((void *)n->addr, 32);
> +}
> +
> +static const struct rhashtable_params fprobe_rht_params = {
> + .head_offset = offsetof(struct fprobe_hlist_node, hlist),
> + .key_offset = offsetof(struct fprobe_hlist_node, addr),
> + .key_len = sizeof_field(struct fprobe_hlist_node, addr),
> + .hashfn = fprobe_node_hashfn,
> + .obj_hashfn = fprobe_node_obj_hashfn,
> + .obj_cmpfn = fprobe_node_cmp,
> + .automatic_shrinking = true,
> +};
>
> +/* Node insertion and deletion requires the fprobe_mutex */
> +static int insert_fprobe_node(struct fprobe_hlist_node *node)
> +{
> lockdep_assert_held(&fprobe_mutex);
>
> - next = find_first_fprobe_node(ip);
> - if (next) {
> - hlist_add_before_rcu(&node->hlist, &next->hlist);
> - return;
> - }
> - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
> - hlist_add_head_rcu(&node->hlist, head);
> + return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
> }
>
> /* Return true if there are synonims */
> @@ -92,9 +91,11 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node)
> /* Avoid double deleting */
> if (READ_ONCE(node->fp) != NULL) {
> WRITE_ONCE(node->fp, NULL);
> - hlist_del_rcu(&node->hlist);
> + rhltable_remove(&fprobe_ip_table, &node->hlist,
> + fprobe_rht_params);
> }
> - return !!find_first_fprobe_node(node->addr);
> + return !!rhltable_lookup(&fprobe_ip_table, &node->addr,
> + fprobe_rht_params);
> }
>
> /* Check existence of the fprobe */
> @@ -249,9 +250,10 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent
> static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
> struct ftrace_regs *fregs)
> {
> - struct fprobe_hlist_node *node, *first;
> unsigned long *fgraph_data = NULL;
> unsigned long func = trace->func;
> + struct fprobe_hlist_node *node;
> + struct rhlist_head *head, *pos;
> unsigned long ret_ip;
> int reserved_words;
> struct fprobe *fp;
> @@ -260,14 +262,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
> if (WARN_ON_ONCE(!fregs))
> return 0;
>
> - first = node = find_first_fprobe_node(func);
> - if (unlikely(!first))
> - return 0;
> -
> + rcu_read_lock();
> + head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params);
> reserved_words = 0;
> - hlist_for_each_entry_from_rcu(node, hlist) {
> + rhl_for_each_entry_rcu(node, pos, head, hlist) {
> if (node->addr != func)
> - break;
> + continue;
> fp = READ_ONCE(node->fp);
> if (!fp || !fp->exit_handler)
> continue;
> @@ -278,17 +278,19 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
> reserved_words +=
> FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size);
> }
> - node = first;
> + rcu_read_unlock();
> if (reserved_words) {
> fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
> if (unlikely(!fgraph_data)) {
> - hlist_for_each_entry_from_rcu(node, hlist) {
> + rcu_read_lock();
> + rhl_for_each_entry_rcu(node, pos, head, hlist) {
> if (node->addr != func)
> - break;
> + continue;
> fp = READ_ONCE(node->fp);
> if (fp && !fprobe_disabled(fp))
> fp->nmissed++;
> }
> + rcu_read_unlock();
> return 0;
> }
> }
> @@ -299,12 +301,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
> */
> ret_ip = ftrace_regs_get_return_address(fregs);
> used = 0;
> - hlist_for_each_entry_from_rcu(node, hlist) {
> + rhl_for_each_entry_rcu(node, pos, head, hlist) {
> int data_size;
> void *data;
>
> if (node->addr != func)
> - break;
> + continue;
> fp = READ_ONCE(node->fp);
> if (!fp || fprobe_disabled(fp))
> continue;
> @@ -448,25 +450,21 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad
> return 0;
> }
>
> -static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head,
> - struct fprobe_addr_list *alist)
> +static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
> + struct fprobe_addr_list *alist)
> {
> - struct fprobe_hlist_node *node;
> int ret = 0;
>
> - hlist_for_each_entry_rcu(node, head, hlist,
> - lockdep_is_held(&fprobe_mutex)) {
> - if (!within_module(node->addr, mod))
> - continue;
> - if (delete_fprobe_node(node))
> - continue;
> - /*
> - * If failed to update alist, just continue to update hlist.
> - * Therefore, at list user handler will not hit anymore.
> - */
> - if (!ret)
> - ret = fprobe_addr_list_add(alist, node->addr);
> - }
> + if (!within_module(node->addr, mod))
> + return;
> + if (delete_fprobe_node(node))
> + return;
> + /*
> + * If failed to update alist, just continue to update hlist.
> + * Therefore, at list user handler will not hit anymore.
> + */
> + if (!ret)
> + ret = fprobe_addr_list_add(alist, node->addr);
> }
>
> /* Handle module unloading to manage fprobe_ip_table. */
> @@ -474,8 +472,9 @@ static int fprobe_module_callback(struct notifier_block *nb,
> unsigned long val, void *data)
> {
> struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT};
> + struct fprobe_hlist_node *node;
> + struct rhashtable_iter iter;
> struct module *mod = data;
> - int i;
>
> if (val != MODULE_STATE_GOING)
> return NOTIFY_DONE;
> @@ -486,8 +485,16 @@ static int fprobe_module_callback(struct notifier_block *nb,
> return NOTIFY_DONE;
>
> mutex_lock(&fprobe_mutex);
> - for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++)
> - fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist);
> + rhashtable_walk_enter(&fprobe_ip_table.ht, &iter);
nit: Use rhltable_walk_enter() instead.
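Something like this (untested; as far as I can see, rhltable_walk_enter()
just passes &hlt->ht to rhashtable_walk_enter(), so the rest of the walk
loop can stay as it is):

	/* Walk all nodes; restart the walk if it hits a resize (-EAGAIN). */
	rhltable_walk_enter(&fprobe_ip_table, &iter);
	do {
		rhashtable_walk_start(&iter);

		while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node))
			fprobe_remove_node_in_module(mod, node, &alist);

		rhashtable_walk_stop(&iter);
	} while (node == ERR_PTR(-EAGAIN));
	rhashtable_walk_exit(&iter);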
The rest looks good to me.
Thank you,
> + do {
> + rhashtable_walk_start(&iter);
> +
> + while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node))
> + fprobe_remove_node_in_module(mod, node, &alist);
> +
> + rhashtable_walk_stop(&iter);
> + } while (node == ERR_PTR(-EAGAIN));
> + rhashtable_walk_exit(&iter);
>
> if (alist.index < alist.size && alist.index > 0)
> ftrace_set_filter_ips(&fprobe_graph_ops.ops,
> @@ -722,8 +729,16 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
> ret = fprobe_graph_add_ips(addrs, num);
> if (!ret) {
> add_fprobe_hash(fp);
> - for (i = 0; i < hlist_array->size; i++)
> - insert_fprobe_node(&hlist_array->array[i]);
> + for (i = 0; i < hlist_array->size; i++) {
> + ret = insert_fprobe_node(&hlist_array->array[i]);
> + if (ret)
> + break;
> + }
> + /* fallback on insert error */
> + if (ret) {
> + for (i--; i >= 0; i--)
> + delete_fprobe_node(&hlist_array->array[i]);
> + }
> }
> mutex_unlock(&fprobe_mutex);
>
> @@ -819,3 +834,10 @@ int unregister_fprobe(struct fprobe *fp)
> return ret;
> }
> EXPORT_SYMBOL_GPL(unregister_fprobe);
> +
> +static int __init fprobe_initcall(void)
> +{
> + rhltable_init(&fprobe_ip_table, &fprobe_rht_params);
> + return 0;
> +}
> +late_initcall(fprobe_initcall);
> --
> 2.50.1
>
--
Masami Hiramatsu (Google) <mhiramat@...nel.org>