Message-ID: <20260122162935.8581-3-fw@strlen.de>
Date: Thu, 22 Jan 2026 17:29:33 +0100
From: Florian Westphal <fw@...len.de>
To: <netdev@...r.kernel.org>
Cc: Paolo Abeni <pabeni@...hat.com>,
"David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>,
<netfilter-devel@...r.kernel.org>,
pablo@...filter.org
Subject: [PATCH net-next 2/4] netfilter: nft_set_rbtree: translate rbtree to array for binary search
From: Pablo Neira Ayuso <pablo@...filter.org>
The rbtree can temporarily store overlapping inactive elements during
transaction processing, leading to false negative lookups.

To address this issue, this patch adds a .commit function that walks the
rbtree to build an array of ordered intervals. This conversion compacts
the two singleton elements that represent the start and the end of an
interval into a single interval object, for space efficiency.

Binary search is O(log n), similar to rbtree lookup, so lookup
performance should be comparable. The existing implementation under
lib/bsearch.c and include/linux/bsearch.h is used for this purpose.

This slightly increases memory consumption, due to the new array that
stores pointers to the start and end elements of each interval.
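
To illustrate the idea outside the kernel, here is a minimal userspace
sketch (the struct and function names, the endpoint encoding and the use
of the libc bsearch() are illustrative only, not the patch code): sorted
start/end endpoints are paired into compacted [from, to) intervals, and a
key is then matched with a three-way comparator that returns 0 when it
falls inside the candidate interval.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* One compacted interval: start inclusive, end exclusive. */
struct interval {
	uint32_t from;
	uint32_t to;
};

/* Hypothetical tree element: a key plus a start/end flag, already sorted. */
struct endpoint {
	uint32_t key;
	int is_start;
};

/* Pair sorted start/end endpoints into compacted intervals, roughly what
 * the .commit walk does; returns the number of intervals written.
 */
static size_t build_intervals(const struct endpoint *ep, size_t n,
			      struct interval *out)
{
	size_t i, k = 0;

	for (i = 0; i < n; i++) {
		if (ep[i].is_start)
			out[k].from = ep[i].key;
		else
			out[k++].to = ep[i].key;
	}
	return k;
}

/* bsearch() comparator: first argument is the key, second an array entry. */
static int interval_cmp(const void *pkey, const void *entry)
{
	const uint32_t *key = pkey;
	const struct interval *iv = entry;

	if (*key >= iv->from && *key < iv->to)
		return 0;	/* key falls inside this interval: match */
	if (*key < iv->from)
		return -1;	/* continue in the lower half */
	return 1;		/* continue in the upper half */
}

int main(void)
{
	/* Sorted, non-overlapping endpoints: [10, 20), [30, 40), [50, 60). */
	struct endpoint ep[] = {
		{ 10, 1 }, { 20, 0 },
		{ 30, 1 }, { 40, 0 },
		{ 50, 1 }, { 60, 0 },
	};
	struct interval set[3];
	size_t n = build_intervals(ep, sizeof(ep) / sizeof(ep[0]), set);
	uint32_t key = 35;
	struct interval *hit;

	hit = bsearch(&key, set, n, sizeof(set[0]), interval_cmp);
	if (hit)
		printf("%u matches [%u, %u)\n", key, hit->from, hit->to);
	else
		printf("%u not in set\n", key);
	return 0;
}

The kernel code below follows the same pattern, except that each array
entry holds pointers to the start and end set extensions and the
comparator uses memcmp() over the set key length.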
With this patch:
# time nft -f 100k-intervals-set.nft
real 0m4.218s
user 0m3.544s
sys 0m0.400s
Without this patch:
# time nft -f 100k-intervals-set.nft
real 0m3.920s
user 0m3.547s
sys 0m0.276s
With this patch, with IPv4 intervals:
baseline rbtree (match on first field only): 15254954pps
Without this patch:
baseline rbtree (match on first field only): 10256119pps
This provides a ~50% improvement when matching intervals from the packet path.
Signed-off-by: Pablo Neira Ayuso <pablo@...filter.org>
Signed-off-by: Florian Westphal <fw@...len.de>
---
net/netfilter/nft_set_rbtree.c | 341 +++++++++++++++++++++++++--------
1 file changed, 257 insertions(+), 84 deletions(-)
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index ca594161b840..821808e8da06 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -10,14 +10,29 @@
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rbtree.h>
+#include <linux/bsearch.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
+struct nft_array_interval {
+ struct nft_set_ext *from;
+ struct nft_set_ext *to;
+};
+
+struct nft_array {
+ u32 max_intervals;
+ u32 num_intervals;
+ struct nft_array_interval *intervals;
+ struct rcu_head rcu_head;
+};
+
struct nft_rbtree {
struct rb_root root;
rwlock_t lock;
+ struct nft_array __rcu *array;
+ struct nft_array *array_next;
seqcount_rwlock_t count;
unsigned long last_gc;
};
@@ -47,90 +62,6 @@ static int nft_rbtree_cmp(const struct nft_set *set,
set->klen);
}
-static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
-{
- return nft_set_elem_expired(&rbe->ext);
-}
-
-static const struct nft_set_ext *
-__nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, unsigned int seq)
-{
- struct nft_rbtree *priv = nft_set_priv(set);
- const struct nft_rbtree_elem *rbe, *interval = NULL;
- u8 genmask = nft_genmask_cur(net);
- const struct rb_node *parent;
- int d;
-
- parent = rcu_dereference_raw(priv->root.rb_node);
- while (parent != NULL) {
- if (read_seqcount_retry(&priv->count, seq))
- return NULL;
-
- rbe = rb_entry(parent, struct nft_rbtree_elem, node);
-
- d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
- if (d < 0) {
- parent = rcu_dereference_raw(parent->rb_left);
- if (interval &&
- !nft_rbtree_cmp(set, rbe, interval) &&
- nft_rbtree_interval_end(rbe) &&
- nft_rbtree_interval_start(interval))
- continue;
- if (nft_set_elem_active(&rbe->ext, genmask) &&
- !nft_rbtree_elem_expired(rbe))
- interval = rbe;
- } else if (d > 0)
- parent = rcu_dereference_raw(parent->rb_right);
- else {
- if (!nft_set_elem_active(&rbe->ext, genmask)) {
- parent = rcu_dereference_raw(parent->rb_left);
- continue;
- }
-
- if (nft_rbtree_elem_expired(rbe))
- return NULL;
-
- if (nft_rbtree_interval_end(rbe)) {
- if (nft_set_is_anonymous(set))
- return NULL;
- parent = rcu_dereference_raw(parent->rb_left);
- interval = NULL;
- continue;
- }
-
- return &rbe->ext;
- }
- }
-
- if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
- nft_rbtree_interval_start(interval))
- return &interval->ext;
-
- return NULL;
-}
-
-INDIRECT_CALLABLE_SCOPE
-const struct nft_set_ext *
-nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key)
-{
- struct nft_rbtree *priv = nft_set_priv(set);
- unsigned int seq = read_seqcount_begin(&priv->count);
- const struct nft_set_ext *ext;
-
- ext = __nft_rbtree_lookup(net, set, key, seq);
- if (ext || !read_seqcount_retry(&priv->count, seq))
- return ext;
-
- read_lock_bh(&priv->lock);
- seq = read_seqcount_begin(&priv->count);
- ext = __nft_rbtree_lookup(net, set, key, seq);
- read_unlock_bh(&priv->lock);
-
- return ext;
-}
-
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
const u32 *key, struct nft_rbtree_elem **elem,
unsigned int seq, unsigned int flags, u8 genmask)
@@ -221,6 +152,60 @@ nft_rbtree_get(const struct net *net, const struct nft_set *set,
return &rbe->priv;
}
+struct nft_array_lookup_ctx {
+ const u32 *key;
+ u32 klen;
+};
+
+static int nft_array_lookup_cmp(const void *pkey, const void *entry)
+{
+ const struct nft_array_interval *interval = entry;
+ const struct nft_array_lookup_ctx *ctx = pkey;
+ int a, b;
+
+ if (!interval->from)
+ return 1;
+
+ a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen);
+ if (!interval->to)
+ b = -1;
+ else
+ b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen);
+
+ if (a >= 0 && b < 0)
+ return 0;
+
+ if (a < 0)
+ return -1;
+
+ return 1;
+}
+
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_array *array = rcu_dereference(priv->array);
+ const struct nft_array_interval *interval;
+ struct nft_array_lookup_ctx ctx = {
+ .key = key,
+ .klen = set->klen,
+ };
+
+ if (!array)
+ return NULL;
+
+ interval = bsearch(&ctx, array->intervals, array->num_intervals,
+ sizeof(struct nft_array_interval),
+ nft_array_lookup_cmp);
+ if (!interval || nft_set_elem_expired(interval->from))
+ return NULL;
+
+ return interval->from;
+}
+
static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
struct nft_rbtree *priv,
struct nft_rbtree_elem *rbe)
@@ -481,6 +466,87 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
return 0;
}
+static int nft_array_intervals_alloc(struct nft_array *array, u32 max_intervals)
+{
+ struct nft_array_interval *intervals;
+
+ intervals = kvcalloc(max_intervals, sizeof(struct nft_array_interval),
+ GFP_KERNEL_ACCOUNT);
+ if (!intervals)
+ return -ENOMEM;
+
+ if (array->intervals)
+ kvfree(array->intervals);
+
+ array->intervals = intervals;
+ array->max_intervals = max_intervals;
+
+ return 0;
+}
+
+static struct nft_array *nft_array_alloc(u32 max_intervals)
+{
+ struct nft_array *array;
+
+ array = kzalloc(sizeof(*array), GFP_KERNEL_ACCOUNT);
+ if (!array)
+ return NULL;
+
+ if (nft_array_intervals_alloc(array, max_intervals) < 0) {
+ kfree(array);
+ return NULL;
+ }
+
+ return array;
+}
+
+#define NFT_ARRAY_EXTRA_SIZE 10240
+
+/* Similar to nft_rbtree_{u,k}size to hide details from userspace, but also
+ * consider the packed representation coming from userspace for anonymous sets.
+ */
+static u32 nft_array_elems(const struct nft_set *set)
+{
+ u32 nelems = atomic_read(&set->nelems);
+
+ /* Adjacent intervals are represented with a single start element in
+ * anonymous sets, use the current element counter as is.
+ */
+ if (nft_set_is_anonymous(set))
+ return nelems;
+
+ /* Add extra room for the never-matching interval at the beginning and the
+ * open interval at the end, which are each represented by a single element.
+ * The conversion to an array compacts intervals, which reduces memory
+ * consumption.
+ */
+ return (nelems / 2) + 2;
+}
+
+static int nft_array_may_resize(const struct nft_set *set)
+{
+ u32 nelems = nft_array_elems(set), new_max_intervals;
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_array *array;
+
+ if (!priv->array_next) {
+ array = nft_array_alloc(nelems + NFT_ARRAY_EXTRA_SIZE);
+ if (!array)
+ return -ENOMEM;
+
+ priv->array_next = array;
+ }
+
+ if (nelems < priv->array_next->max_intervals)
+ return 0;
+
+ new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE;
+ if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0)
+ return -ENOMEM;
+
+ return 0;
+}
+
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
struct nft_elem_priv **elem_priv)
@@ -489,6 +555,9 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
struct nft_rbtree *priv = nft_set_priv(set);
int err;
+ if (nft_array_may_resize(set) < 0)
+ return -ENOMEM;
+
do {
if (fatal_signal_pending(current))
return -EINTR;
@@ -553,6 +622,9 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
u64 tstamp = nft_net_tstamp(net);
int d;
+ if (nft_array_may_resize(set) < 0)
+ return NULL;
+
while (parent != NULL) {
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
@@ -615,6 +687,11 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
switch (iter->type) {
case NFT_ITER_UPDATE:
lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+
+ if (nft_array_may_resize(set) < 0) {
+ iter->err = -ENOMEM;
+ break;
+ }
nft_rbtree_do_walk(ctx, set, iter);
break;
case NFT_ITER_READ:
@@ -717,14 +794,24 @@ static int nft_rbtree_init(const struct nft_set *set,
seqcount_rwlock_init(&priv->count, &priv->lock);
priv->root = RB_ROOT;
+ priv->array = NULL;
+ priv->array_next = NULL;
+
return 0;
}
+static void __nft_array_free(struct nft_array *array)
+{
+ kvfree(array->intervals);
+ kfree(array);
+}
+
static void nft_rbtree_destroy(const struct nft_ctx *ctx,
const struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
+ struct nft_array *array;
struct rb_node *node;
while ((node = priv->root.rb_node) != NULL) {
@@ -732,6 +819,12 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx,
rbe = rb_entry(node, struct nft_rbtree_elem, node);
nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
}
+
+ array = rcu_dereference_protected(priv->array, true);
+ if (array)
+ __nft_array_free(array);
+ if (priv->array_next)
+ __nft_array_free(priv->array_next);
}
static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
@@ -752,12 +845,91 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
+static void nft_array_free_rcu(struct rcu_head *rcu_head)
+{
+ struct nft_array *array = container_of(rcu_head, struct nft_array, rcu_head);
+
+ __nft_array_free(array);
+}
+
static void nft_rbtree_commit(struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe, *prev_rbe;
+ struct nft_array *old;
+ u32 num_intervals = 0;
+ struct rb_node *node;
if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
nft_rbtree_gc(set);
+
+ /* No changes, skip; e.g. element updates only. */
+ if (!priv->array_next)
+ return;
+
+ /* Reverse walk to create an array from smallest to largest interval. */
+ node = rb_last(&priv->root);
+ if (node)
+ prev_rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ else
+ prev_rbe = NULL;
+
+ while (prev_rbe) {
+ rbe = prev_rbe;
+
+ if (nft_rbtree_interval_start(rbe))
+ priv->array_next->intervals[num_intervals].from = &rbe->ext;
+ else if (nft_rbtree_interval_end(rbe))
+ priv->array_next->intervals[num_intervals++].to = &rbe->ext;
+
+ if (num_intervals >= priv->array_next->max_intervals) {
+ pr_warn_once("malformed interval set from userspace?");
+ goto err_out;
+ }
+
+ node = rb_prev(node);
+ if (!node)
+ break;
+
+ prev_rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+ /* For anonymous sets, when adjacent ranges are found,
+ * the end element is not added to the set, to pack the
+ * set representation. Use the next start element to
+ * complete this interval.
+ */
+ if (nft_rbtree_interval_start(rbe) &&
+ nft_rbtree_interval_start(prev_rbe) &&
+ priv->array_next->intervals[num_intervals].from)
+ priv->array_next->intervals[num_intervals++].to = &prev_rbe->ext;
+
+ if (num_intervals >= priv->array_next->max_intervals) {
+ pr_warn_once("malformed interval set from userspace?");
+ goto err_out;
+ }
+ }
+
+ if (priv->array_next->intervals[num_intervals].from)
+ num_intervals++;
+err_out:
+ priv->array_next->num_intervals = num_intervals;
+ old = rcu_replace_pointer(priv->array, priv->array_next, true);
+ priv->array_next = NULL;
+ if (old)
+ call_rcu(&old->rcu_head, nft_array_free_rcu);
+}
+
+static void nft_rbtree_abort(const struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_array *array_next;
+
+ if (!priv->array_next)
+ return;
+
+ array_next = priv->array_next;
+ priv->array_next = NULL;
+ __nft_array_free(array_next);
}
static void nft_rbtree_gc_init(const struct nft_set *set)
@@ -821,6 +993,7 @@ const struct nft_set_type nft_set_rbtree_type = {
.flush = nft_rbtree_flush,
.activate = nft_rbtree_activate,
.commit = nft_rbtree_commit,
+ .abort = nft_rbtree_abort,
.gc_init = nft_rbtree_gc_init,
.lookup = nft_rbtree_lookup,
.walk = nft_rbtree_walk,
--
2.52.0