Message-ID: <CAFn2buAXHDcKiHyPs_7rT617j7=BopZRrMKVv5pYWNi2OxRAfQ@mail.gmail.com>
Date: Thu, 13 Nov 2025 07:30:48 -0800
From: Scott Mitchell <scott.k.mitch1@...il.com>
To: Eric Dumazet <edumazet@...gle.com>
Cc: pablo@...filter.org, kadlec@...filter.org, fw@...len.de, phil@....cc,
davem@...emloft.net, kuba@...nel.org, pabeni@...hat.com, horms@...nel.org,
netfilter-devel@...r.kernel.org, coreteam@...filter.org,
netdev@...r.kernel.org, linux-kernel@...r.kernel.org,
Scott Mitchell <scott_mitchell@...le.com>
Subject: Re: [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup
with hash table

On Thu, Nov 13, 2025 at 2:25 AM Eric Dumazet <edumazet@...gle.com> wrote:
>
> On Thu, Nov 13, 2025 at 1:26 AM Scott Mitchell <scott.k.mitch1@...il.com> wrote:
> >
> > The current implementation uses a linear list to find queued packets by
> > ID when processing verdicts from userspace. With large queue depths and
> > out-of-order verdicting, this O(n) lookup becomes a significant
> > bottleneck, causing userspace verdict processing to dominate CPU time.
> >
> > Replace the linear search with a hash table for O(1) average-case
> > packet lookup by ID. The hash table size is configurable via the new
> > NFQA_CFG_HASH_SIZE netlink attribute (default 1024 buckets, matching
> > NFQNL_QMAX_DEFAULT; max 131072). The size is normalized to a power of
> > two to enable efficient bitwise masking instead of modulo operations.
> > Unpatched kernels silently ignore the new attribute, maintaining
> > backward compatibility.
> >
> > The existing list data structure is retained for operations requiring
> > linear iteration (e.g. flush, device down events). Hot fields
> > (queue_hash_mask, queue_hash pointer) are placed in the same cache line
> > as the spinlock and packet counters for optimal memory access patterns.
> >
> > Signed-off-by: Scott Mitchell <scott_mitchell@...le.com>
> > ---
> > Changes in v2:
> > - Use kvcalloc/kvfree with GFP_KERNEL_ACCOUNT to support larger hash
> > tables with vmalloc fallback (Florian Westphal)
> > - Remove incorrect comment about concurrent resizes - nfnetlink subsystem
> > mutex already serializes config operations (Florian Westphal)
> > - Fix style: remove unnecessary braces around single-line if (Florian Westphal)
> >
> > include/net/netfilter/nf_queue.h | 1 +
> > .../uapi/linux/netfilter/nfnetlink_queue.h | 1 +
> > net/netfilter/nfnetlink_queue.c | 129 ++++++++++++++++--
> > 3 files changed, 123 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
> > index 4aeffddb7586..3d0def310523 100644
> > --- a/include/net/netfilter/nf_queue.h
> > +++ b/include/net/netfilter/nf_queue.h
> > @@ -11,6 +11,7 @@
> > /* Each queued (to userspace) skbuff has one of these. */
> > struct nf_queue_entry {
> > struct list_head list;
> > + struct hlist_node hash_node;
> > struct sk_buff *skb;
> > unsigned int id;
> > unsigned int hook_index; /* index in hook_entries->hook[] */
> > diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
> > index efcb7c044a74..bc296a17e5aa 100644
> > --- a/include/uapi/linux/netfilter/nfnetlink_queue.h
> > +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
> > @@ -107,6 +107,7 @@ enum nfqnl_attr_config {
> > NFQA_CFG_QUEUE_MAXLEN, /* __u32 */
> > NFQA_CFG_MASK, /* identify which flags to change */
> > NFQA_CFG_FLAGS, /* value of these flags (__u32) */
> > + NFQA_CFG_HASH_SIZE, /* __u32 hash table size (rounded to power of 2) */
> > __NFQA_CFG_MAX
> > };
> > #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
> > diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
> > index 8b7b39d8a109..f076609cac32 100644
> > --- a/net/netfilter/nfnetlink_queue.c
> > +++ b/net/netfilter/nfnetlink_queue.c
> > @@ -46,7 +46,10 @@
> > #include <net/netfilter/nf_conntrack.h>
> > #endif
> >
> > -#define NFQNL_QMAX_DEFAULT 1024
> > +#define NFQNL_QMAX_DEFAULT 1024
> > +#define NFQNL_MIN_HASH_SIZE 16
> > +#define NFQNL_DEFAULT_HASH_SIZE 1024
> > +#define NFQNL_MAX_HASH_SIZE 131072
> >
> > /* We're using struct nlattr which has 16bit nla_len. Note that nla_len
> > * includes the header length. Thus, the maximum packet length that we
> > @@ -65,6 +68,7 @@ struct nfqnl_instance {
> > unsigned int copy_range;
> > unsigned int queue_dropped;
> > unsigned int queue_user_dropped;
> > + unsigned int queue_hash_size;
> >
> >
> > u_int16_t queue_num; /* number of this queue */
> > @@ -77,6 +81,8 @@ struct nfqnl_instance {
> > spinlock_t lock ____cacheline_aligned_in_smp;
> > unsigned int queue_total;
> > unsigned int id_sequence; /* 'sequence' of pkt ids */
> > + unsigned int queue_hash_mask;
> > + struct hlist_head *queue_hash;
> > struct list_head queue_list; /* packets in queue */
> > };
> >
> > @@ -95,6 +101,39 @@ static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
> > return net_generic(net, nfnl_queue_net_id);
> > }
> >
> > +static inline unsigned int
> > +nfqnl_packet_hash(u32 id, unsigned int mask)
> > +{
> > + return hash_32(id, 32) & mask;
> > +}
> > +
>
> (Resent in plaintext for the lists, sorry for duplicates)
>
> I do not think this is an efficient hash function.
>
> queue->id_sequence is monotonically increasing (controlled by the
> kernel : __nfqnl_enqueue_packet(), not user space).
>
> I would use return (id & mask) so that we have better use of cpu
> caches and hardware prefetchers,
> in case a cpu receives a batch of ~64 packets from a busy network device.
>
> Your hash function would require 8x more cache line misses.
>
Nice improvement, done!
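
For the archive, this is roughly what the helper becomes in my v3 tree
(a sketch, pending the respin). The 8x figure follows from hlist_head
being a single pointer: on a 64-byte cache line, eight consecutive
buckets share one line, so a batch of consecutive IDs touches one line
where hash_32() would scatter them across eight.

static inline unsigned int
nfqnl_packet_hash(u32 id, unsigned int mask)
{
	/* id comes from queue->id_sequence, which the kernel increments
	 * monotonically, so the identity mapping already spreads entries
	 * evenly and keeps consecutive IDs in adjacent buckets.
	 */
	return id & mask;
}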
>
> > +static inline u32
> > +nfqnl_normalize_hash_size(u32 hash_size)
> > +{
> > + /* Must be power of two for queue_hash_mask to work correctly.
> > + * Avoid overflow of is_power_of_2 by bounding NFQNL_MAX_HASH_SIZE.
> > + */
> > + BUILD_BUG_ON(!is_power_of_2(NFQNL_MIN_HASH_SIZE) ||
> > + !is_power_of_2(NFQNL_DEFAULT_HASH_SIZE) ||
> > + !is_power_of_2(NFQNL_MAX_HASH_SIZE) ||
> > + NFQNL_MAX_HASH_SIZE > 1U << 31);
> > +
> > + if (!hash_size)
> > + return NFQNL_DEFAULT_HASH_SIZE;
> > +
> > + /* Clamp to valid range before power of two to avoid overflow */
> > + if (hash_size <= NFQNL_MIN_HASH_SIZE)
> > + return NFQNL_MIN_HASH_SIZE;
> > +
> > + if (hash_size >= NFQNL_MAX_HASH_SIZE)
> > + return NFQNL_MAX_HASH_SIZE;
> > +
> > + if (!is_power_of_2(hash_size))
> > + hash_size = roundup_pow_of_two(hash_size);
> > +
> > + return hash_size;
> > +}
> > +
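A quick note for reviewers on how the clamping above plays out: a size
of 0 selects the 1024-bucket default, anything at or below 16 becomes
16, a non-power-of-two such as 1000 rounds up to 1024, and anything at
or above 131072 is capped at 131072.
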
> > static inline u_int8_t instance_hashfn(u_int16_t queue_num)
> > {
> > return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
> > @@ -114,13 +153,56 @@ instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
> > return NULL;
> > }
> >
> > +static int
> > +nfqnl_hash_resize(struct nfqnl_instance *inst, u32 hash_size)
> > +{
> > + struct hlist_head *new_hash, *old_hash;
> > + struct nf_queue_entry *entry;
> > + unsigned int h, hash_mask;
> > +
> > + hash_size = nfqnl_normalize_hash_size(hash_size);
> > + if (hash_size == inst->queue_hash_size)
> > + return 0;
> > +
> > + new_hash = kvcalloc(hash_size, sizeof(*new_hash), GFP_KERNEL_ACCOUNT);
> > + if (!new_hash)
> > + return -ENOMEM;
> > +
> > + hash_mask = hash_size - 1;
> > +
> > + for (h = 0; h < hash_size; h++)
> > + INIT_HLIST_HEAD(&new_hash[h]);
> > +
> > + spin_lock_bh(&inst->lock);
> > +
> > + list_for_each_entry(entry, &inst->queue_list, list) {
> > + /* No hlist_del() since old_hash will be freed and we hold lock */
> > + h = nfqnl_packet_hash(entry->id, hash_mask);
> > + hlist_add_head(&entry->hash_node, &new_hash[h]);
> > + }
> > +
> > + old_hash = inst->queue_hash;
> > + inst->queue_hash_size = hash_size;
> > + inst->queue_hash_mask = hash_mask;
> > + inst->queue_hash = new_hash;
> > +
> > + spin_unlock_bh(&inst->lock);
> > +
> > + kvfree(old_hash);
> > +
> > + return 0;
> > +}
> > +
> > static struct nfqnl_instance *
> > -instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> > +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid,
> > + u32 hash_size)
> > {
> > struct nfqnl_instance *inst;
> > unsigned int h;
> > int err;
> >
> > + hash_size = nfqnl_normalize_hash_size(hash_size);
> > +
> > spin_lock(&q->instances_lock);
> > if (instance_lookup(q, queue_num)) {
> > err = -EEXIST;
> > @@ -133,11 +215,24 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> > goto out_unlock;
> > }
> >
> > + inst->queue_hash = kvcalloc(hash_size, sizeof(*inst->queue_hash),
> > + GFP_KERNEL_ACCOUNT);
> > + if (!inst->queue_hash) {
> > + kfree(inst);
> > + err = -ENOMEM;
> > + goto out_unlock;
> > + }
> > +
> > + for (h = 0; h < hash_size; h++)
> > + INIT_HLIST_HEAD(&inst->queue_hash[h]);
> > +
> > inst->queue_num = queue_num;
> > inst->peer_portid = portid;
> > inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
> > inst->copy_range = NFQNL_MAX_COPY_RANGE;
> > inst->copy_mode = NFQNL_COPY_NONE;
> > + inst->queue_hash_size = hash_size;
> > + inst->queue_hash_mask = hash_size - 1;
> > spin_lock_init(&inst->lock);
> > INIT_LIST_HEAD(&inst->queue_list);
> >
> > @@ -154,6 +249,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> > return inst;
> >
> > out_free:
> > + kvfree(inst->queue_hash);
> > kfree(inst);
> > out_unlock:
> > spin_unlock(&q->instances_lock);
> > @@ -172,6 +268,7 @@ instance_destroy_rcu(struct rcu_head *head)
> > rcu_read_lock();
> > nfqnl_flush(inst, NULL, 0);
> > rcu_read_unlock();
> > + kvfree(inst->queue_hash);
> > kfree(inst);
> > module_put(THIS_MODULE);
> > }
> > @@ -194,13 +291,17 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
> > static inline void
> > __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> > {
> > - list_add_tail(&entry->list, &queue->queue_list);
> > - queue->queue_total++;
> > + unsigned int hash = nfqnl_packet_hash(entry->id, queue->queue_hash_mask);
> > +
> > + hlist_add_head(&entry->hash_node, &queue->queue_hash[hash]);
> > + list_add_tail(&entry->list, &queue->queue_list);
> > + queue->queue_total++;
> > }
> >
> > static void
> > __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> > {
> > + hlist_del(&entry->hash_node);
> > list_del(&entry->list);
> > queue->queue_total--;
> > }
> > @@ -209,10 +310,11 @@ static struct nf_queue_entry *
> > find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
> > {
> > struct nf_queue_entry *entry = NULL, *i;
> > + unsigned int hash = nfqnl_packet_hash(id, queue->queue_hash_mask);
> >
> > spin_lock_bh(&queue->lock);
> >
> > - list_for_each_entry(i, &queue->queue_list, list) {
> > + hlist_for_each_entry(i, &queue->queue_hash[hash], hash_node) {
> > if (i->id == id) {
> > entry = i;
> > break;
> > @@ -407,8 +509,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
> > spin_lock_bh(&queue->lock);
> > list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
> > if (!cmpfn || cmpfn(entry, data)) {
> > - list_del(&entry->list);
> > - queue->queue_total--;
> > + __dequeue_entry(queue, entry);
> > nfqnl_reinject(entry, NF_DROP);
> > }
> > }
> > @@ -1483,6 +1584,7 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
> > [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 },
> > [NFQA_CFG_MASK] = { .type = NLA_U32 },
> > [NFQA_CFG_FLAGS] = { .type = NLA_U32 },
> > + [NFQA_CFG_HASH_SIZE] = { .type = NLA_U32 },
> > };
> >
> > static const struct nf_queue_handler nfqh = {
> > @@ -1495,11 +1597,15 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> > {
> > struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
> > u_int16_t queue_num = ntohs(info->nfmsg->res_id);
> > + u32 hash_size = 0;
> > struct nfqnl_msg_config_cmd *cmd = NULL;
> > struct nfqnl_instance *queue;
> > __u32 flags = 0, mask = 0;
> > int ret = 0;
> >
> > + if (nfqa[NFQA_CFG_HASH_SIZE])
> > + hash_size = ntohl(nla_get_be32(nfqa[NFQA_CFG_HASH_SIZE]));
> > +
> > if (nfqa[NFQA_CFG_CMD]) {
> > cmd = nla_data(nfqa[NFQA_CFG_CMD]);
> >
> > @@ -1559,11 +1665,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> > goto err_out_unlock;
> > }
> > queue = instance_create(q, queue_num,
> > - NETLINK_CB(skb).portid);
> > + NETLINK_CB(skb).portid, hash_size);
> > if (IS_ERR(queue)) {
> > ret = PTR_ERR(queue);
> > goto err_out_unlock;
> > }
> > + hash_size = 0; /* avoid resize later in this function */
> > break;
> > case NFQNL_CFG_CMD_UNBIND:
> > if (!queue) {
> > @@ -1586,6 +1693,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> > goto err_out_unlock;
> > }
> >
> > + if (hash_size > 0) {
> > + ret = nfqnl_hash_resize(queue, hash_size);
> > + if (ret)
> > + goto err_out_unlock;
> > + }
> > +
> > if (nfqa[NFQA_CFG_PARAMS]) {
> > struct nfqnl_msg_config_params *params =
> > nla_data(nfqa[NFQA_CFG_PARAMS]);
> > --
> > 2.39.5 (Apple Git-154)
> >
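
For anyone who wants to poke at the new knob from userspace before
libnetfilter_queue grows support, here is a minimal libmnl sketch
(illustrative only, not part of the patch; it assumes uapi headers that
already carry the new NFQA_CFG_HASH_SIZE value, targets an already-bound
queue so the resize path runs, and omits error handling). The kernel
reads the attribute with ntohl(nla_get_be32()), so the value is sent in
network byte order; unpatched kernels simply ignore the attribute.

#include <stdint.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_queue.h>

static void nfq_set_hash_size(struct mnl_socket *nl, uint16_t queue_num,
			      uint32_t hash_size)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct nlmsghdr *nlh;
	struct nfgenmsg *nfg;

	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG;
	nlh->nlmsg_flags = NLM_F_REQUEST;

	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
	nfg->nfgen_family = AF_UNSPEC;
	nfg->version = NFNETLINK_V0;
	nfg->res_id = htons(queue_num);

	/* u32, network byte order; the kernel rounds it up to a power
	 * of two and clamps it to the [16, 131072] range.
	 */
	mnl_attr_put_u32(nlh, NFQA_CFG_HASH_SIZE, htonl(hash_size));

	mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
}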