Message-ID: <CAFn2buAXHDcKiHyPs_7rT617j7=BopZRrMKVv5pYWNi2OxRAfQ@mail.gmail.com>
Date: Thu, 13 Nov 2025 07:30:48 -0800
From: Scott Mitchell <scott.k.mitch1@...il.com>
To: Eric Dumazet <edumazet@...gle.com>
Cc: pablo@...filter.org, kadlec@...filter.org, fw@...len.de, phil@....cc,
davem@...emloft.net, kuba@...nel.org, pabeni@...hat.com, horms@...nel.org,
netfilter-devel@...r.kernel.org, coreteam@...filter.org,
netdev@...r.kernel.org, linux-kernel@...r.kernel.org,
Scott Mitchell <scott_mitchell@...le.com>
Subject: Re: [PATCH v2] netfilter: nfnetlink_queue: optimize verdict lookup
with hash table

On Thu, Nov 13, 2025 at 2:25 AM Eric Dumazet <edumazet@...gle.com> wrote:
>
> On Thu, Nov 13, 2025 at 1:26 AM Scott Mitchell <scott.k.mitch1@...il.com> wrote:
> >
> > The current implementation uses a linear list to find queued packets by
> > ID when processing verdicts from userspace. With large queue depths and
> > out-of-order verdicting, this O(n) lookup becomes a significant
> > bottleneck, causing userspace verdict processing to dominate CPU time.
> >
> > Replace the linear search with a hash table for O(1) average-case
> > packet lookup by ID. The hash table size is configurable via the new
> > NFQA_CFG_HASH_SIZE netlink attribute (default 1024 buckets, matching
> > NFQNL_QMAX_DEFAULT; max 131072). The size is normalized to a power of
> > two to enable efficient bitwise masking instead of modulo operations.
> > Unpatched kernels silently ignore the new attribute, maintaining
> > backward compatibility.
> >
> > The existing list data structure is retained for operations requiring
> > linear iteration (e.g. flush, device down events). Hot fields
> > (queue_hash_mask, queue_hash pointer) are placed in the same cache line
> > as the spinlock and packet counters for optimal memory access patterns.
> >
> > Signed-off-by: Scott Mitchell <scott_mitchell@...le.com>
> > ---
> > Changes in v2:
> > - Use kvcalloc/kvfree with GFP_KERNEL_ACCOUNT to support larger hash
> > tables with vmalloc fallback (Florian Westphal)
> > - Remove incorrect comment about concurrent resizes - nfnetlink subsystem
> > mutex already serializes config operations (Florian Westphal)
> > - Fix style: remove unnecessary braces around single-line if (Florian Westphal)
> >
> > include/net/netfilter/nf_queue.h | 1 +
> > .../uapi/linux/netfilter/nfnetlink_queue.h | 1 +
> > net/netfilter/nfnetlink_queue.c | 129 ++++++++++++++++--
> > 3 files changed, 123 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
> > index 4aeffddb7586..3d0def310523 100644
> > --- a/include/net/netfilter/nf_queue.h
> > +++ b/include/net/netfilter/nf_queue.h
> > @@ -11,6 +11,7 @@
> > /* Each queued (to userspace) skbuff has one of these. */
> > struct nf_queue_entry {
> > struct list_head list;
> > + struct hlist_node hash_node;
> > struct sk_buff *skb;
> > unsigned int id;
> > unsigned int hook_index; /* index in hook_entries->hook[] */
> > diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
> > index efcb7c044a74..bc296a17e5aa 100644
> > --- a/include/uapi/linux/netfilter/nfnetlink_queue.h
> > +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
> > @@ -107,6 +107,7 @@ enum nfqnl_attr_config {
> > NFQA_CFG_QUEUE_MAXLEN, /* __u32 */
> > NFQA_CFG_MASK, /* identify which flags to change */
> > NFQA_CFG_FLAGS, /* value of these flags (__u32) */
> > + NFQA_CFG_HASH_SIZE, /* __u32 hash table size (rounded to power of 2) */
> > __NFQA_CFG_MAX
> > };
> > #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
> > diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
> > index 8b7b39d8a109..f076609cac32 100644
> > --- a/net/netfilter/nfnetlink_queue.c
> > +++ b/net/netfilter/nfnetlink_queue.c
> > @@ -46,7 +46,10 @@
> > #include <net/netfilter/nf_conntrack.h>
> > #endif
> >
> > -#define NFQNL_QMAX_DEFAULT 1024
> > +#define NFQNL_QMAX_DEFAULT 1024
> > +#define NFQNL_MIN_HASH_SIZE 16
> > +#define NFQNL_DEFAULT_HASH_SIZE 1024
> > +#define NFQNL_MAX_HASH_SIZE 131072
> >
> > /* We're using struct nlattr which has 16bit nla_len. Note that nla_len
> > * includes the header length. Thus, the maximum packet length that we
> > @@ -65,6 +68,7 @@ struct nfqnl_instance {
> > unsigned int copy_range;
> > unsigned int queue_dropped;
> > unsigned int queue_user_dropped;
> > + unsigned int queue_hash_size;
> >
> >
> > u_int16_t queue_num; /* number of this queue */
> > @@ -77,6 +81,8 @@ struct nfqnl_instance {
> > spinlock_t lock ____cacheline_aligned_in_smp;
> > unsigned int queue_total;
> > unsigned int id_sequence; /* 'sequence' of pkt ids */
> > + unsigned int queue_hash_mask;
> > + struct hlist_head *queue_hash;
> > struct list_head queue_list; /* packets in queue */
> > };
> >
> > @@ -95,6 +101,39 @@ static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
> > return net_generic(net, nfnl_queue_net_id);
> > }
> >
> > +static inline unsigned int
> > +nfqnl_packet_hash(u32 id, unsigned int mask)
> > +{
> > + return hash_32(id, 32) & mask;
> > +}
> > +
>
> (Resent in plaintext for the lists, sorry for duplicates)
>
> I do not think this is an efficient hash function.
>
> queue->id_sequence is monotonically increasing (controlled by the
> kernel : __nfqnl_enqueue_packet(), not user space).
>
> I would use return (id & mask) so that we have better use of cpu
> caches and hardware prefetchers,
> in case a cpu receives a batch of ~64 packets from a busy network device.
>
> Your hash function would require 8x more cache line misses.
>
Nice improvement, done!
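
For the archive, this is roughly what the helper becomes in my v3 tree
(a sketch, pending the respin). The 8x figure follows from hlist_head
being a single pointer: on a 64-byte cache line, eight consecutive
buckets share one line, so a batch of consecutive IDs touches one line
where hash_32() would scatter them across eight.

static inline unsigned int
nfqnl_packet_hash(u32 id, unsigned int mask)
{
	/* id comes from queue->id_sequence, which the kernel increments
	 * monotonically, so the identity mapping already spreads entries
	 * evenly and keeps consecutive IDs in adjacent buckets.
	 */
	return id & mask;
}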
>
> > +static inline u32
> > +nfqnl_normalize_hash_size(u32 hash_size)
> > +{
> > + /* Must be power of two for queue_hash_mask to work correctly.
> > + * Avoid overflow of is_power_of_2 by bounding NFQNL_MAX_HASH_SIZE.
> > + */
> > + BUILD_BUG_ON(!is_power_of_2(NFQNL_MIN_HASH_SIZE) ||
> > + !is_power_of_2(NFQNL_DEFAULT_HASH_SIZE) ||
> > + !is_power_of_2(NFQNL_MAX_HASH_SIZE) ||
> > + NFQNL_MAX_HASH_SIZE > 1U << 31);
> > +
> > + if (!hash_size)
> > + return NFQNL_DEFAULT_HASH_SIZE;
> > +
> > + /* Clamp to valid range before power of two to avoid overflow */
> > + if (hash_size <= NFQNL_MIN_HASH_SIZE)
> > + return NFQNL_MIN_HASH_SIZE;
> > +
> > + if (hash_size >= NFQNL_MAX_HASH_SIZE)
> > + return NFQNL_MAX_HASH_SIZE;
> > +
> > + if (!is_power_of_2(hash_size))
> > + hash_size = roundup_pow_of_two(hash_size);
> > +
> > + return hash_size;
> > +}
> > +
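A quick note for reviewers on how the clamping above plays out: a size
of 0 selects the 1024-bucket default, anything at or below 16 becomes
16, a non-power-of-two such as 1000 rounds up to 1024, and anything at
or above 131072 is capped at 131072.
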
> > static inline u_int8_t instance_hashfn(u_int16_t queue_num)
> > {
> > return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
> > @@ -114,13 +153,56 @@ instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
> > return NULL;
> > }
> >
> > +static int
> > +nfqnl_hash_resize(struct nfqnl_instance *inst, u32 hash_size)
> > +{
> > + struct hlist_head *new_hash, *old_hash;
> > + struct nf_queue_entry *entry;
> > + unsigned int h, hash_mask;
> > +
> > + hash_size = nfqnl_normalize_hash_size(hash_size);
> > + if (hash_size == inst->queue_hash_size)
> > + return 0;
> > +
> > + new_hash = kvcalloc(hash_size, sizeof(*new_hash), GFP_KERNEL_ACCOUNT);
> > + if (!new_hash)
> > + return -ENOMEM;
> > +
> > + hash_mask = hash_size - 1;
> > +
> > + for (h = 0; h < hash_size; h++)
> > + INIT_HLIST_HEAD(&new_hash[h]);
> > +
> > + spin_lock_bh(&inst->lock);
> > +
> > + list_for_each_entry(entry, &inst->queue_list, list) {
> > + /* No hlist_del() since old_hash will be freed and we hold lock */
> > + h = nfqnl_packet_hash(entry->id, hash_mask);
> > + hlist_add_head(&entry->hash_node, &new_hash[h]);
> > + }
> > +
> > + old_hash = inst->queue_hash;
> > + inst->queue_hash_size = hash_size;
> > + inst->queue_hash_mask = hash_mask;
> > + inst->queue_hash = new_hash;
> > +
> > + spin_unlock_bh(&inst->lock);
> > +
> > + kvfree(old_hash);
> > +
> > + return 0;
> > +}
> > +
> > static struct nfqnl_instance *
> > -instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> > +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid,
> > + u32 hash_size)
> > {
> > struct nfqnl_instance *inst;
> > unsigned int h;
> > int err;
> >
> > + hash_size = nfqnl_normalize_hash_size(hash_size);
> > +
> > spin_lock(&q->instances_lock);
> > if (instance_lookup(q, queue_num)) {
> > err = -EEXIST;
> > @@ -133,11 +215,24 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> > goto out_unlock;
> > }
> >
> > + inst->queue_hash = kvcalloc(hash_size, sizeof(*inst->queue_hash),
> > + GFP_KERNEL_ACCOUNT);
> > + if (!inst->queue_hash) {
> > + kfree(inst);
> > + err = -ENOMEM;
> > + goto out_unlock;
> > + }
> > +
> > + for (h = 0; h < hash_size; h++)
> > + INIT_HLIST_HEAD(&inst->queue_hash[h]);
> > +
> > inst->queue_num = queue_num;
> > inst->peer_portid = portid;
> > inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
> > inst->copy_range = NFQNL_MAX_COPY_RANGE;
> > inst->copy_mode = NFQNL_COPY_NONE;
> > + inst->queue_hash_size = hash_size;
> > + inst->queue_hash_mask = hash_size - 1;
> > spin_lock_init(&inst->lock);
> > INIT_LIST_HEAD(&inst->queue_list);
> >
> > @@ -154,6 +249,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> > return inst;
> >
> > out_free:
> > + kvfree(inst->queue_hash);
> > kfree(inst);
> > out_unlock:
> > spin_unlock(&q->instances_lock);
> > @@ -172,6 +268,7 @@ instance_destroy_rcu(struct rcu_head *head)
> > rcu_read_lock();
> > nfqnl_flush(inst, NULL, 0);
> > rcu_read_unlock();
> > + kvfree(inst->queue_hash);
> > kfree(inst);
> > module_put(THIS_MODULE);
> > }
> > @@ -194,13 +291,17 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
> > static inline void
> > __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> > {
> > - list_add_tail(&entry->list, &queue->queue_list);
> > - queue->queue_total++;
> > + unsigned int hash = nfqnl_packet_hash(entry->id, queue->queue_hash_mask);
> > +
> > + hlist_add_head(&entry->hash_node, &queue->queue_hash[hash]);
> > + list_add_tail(&entry->list, &queue->queue_list);
> > + queue->queue_total++;
> > }
> >
> > static void
> > __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> > {
> > + hlist_del(&entry->hash_node);
> > list_del(&entry->list);
> > queue->queue_total--;
> > }
> > @@ -209,10 +310,11 @@ static struct nf_queue_entry *
> > find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
> > {
> > struct nf_queue_entry *entry = NULL, *i;
> > + unsigned int hash = nfqnl_packet_hash(id, queue->queue_hash_mask);
> >
> > spin_lock_bh(&queue->lock);
> >
> > - list_for_each_entry(i, &queue->queue_list, list) {
> > + hlist_for_each_entry(i, &queue->queue_hash[hash], hash_node) {
> > if (i->id == id) {
> > entry = i;
> > break;
> > @@ -407,8 +509,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
> > spin_lock_bh(&queue->lock);
> > list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
> > if (!cmpfn || cmpfn(entry, data)) {
> > - list_del(&entry->list);
> > - queue->queue_total--;
> > + __dequeue_entry(queue, entry);
> > nfqnl_reinject(entry, NF_DROP);
> > }
> > }
> > @@ -1483,6 +1584,7 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
> > [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 },
> > [NFQA_CFG_MASK] = { .type = NLA_U32 },
> > [NFQA_CFG_FLAGS] = { .type = NLA_U32 },
> > + [NFQA_CFG_HASH_SIZE] = { .type = NLA_U32 },
> > };
> >
> > static const struct nf_queue_handler nfqh = {
> > @@ -1495,11 +1597,15 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> > {
> > struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
> > u_int16_t queue_num = ntohs(info->nfmsg->res_id);
> > + u32 hash_size = 0;
> > struct nfqnl_msg_config_cmd *cmd = NULL;
> > struct nfqnl_instance *queue;
> > __u32 flags = 0, mask = 0;
> > int ret = 0;
> >
> > + if (nfqa[NFQA_CFG_HASH_SIZE])
> > + hash_size = ntohl(nla_get_be32(nfqa[NFQA_CFG_HASH_SIZE]));
> > +
> > if (nfqa[NFQA_CFG_CMD]) {
> > cmd = nla_data(nfqa[NFQA_CFG_CMD]);
> >
> > @@ -1559,11 +1665,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> > goto err_out_unlock;
> > }
> > queue = instance_create(q, queue_num,
> > - NETLINK_CB(skb).portid);
> > + NETLINK_CB(skb).portid, hash_size);
> > if (IS_ERR(queue)) {
> > ret = PTR_ERR(queue);
> > goto err_out_unlock;
> > }
> > + hash_size = 0; /* avoid resize later in this function */
> > break;
> > case NFQNL_CFG_CMD_UNBIND:
> > if (!queue) {
> > @@ -1586,6 +1693,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
> > goto err_out_unlock;
> > }
> >
> > + if (hash_size > 0) {
> > + ret = nfqnl_hash_resize(queue, hash_size);
> > + if (ret)
> > + goto err_out_unlock;
> > + }
> > +
> > if (nfqa[NFQA_CFG_PARAMS]) {
> > struct nfqnl_msg_config_params *params =
> > nla_data(nfqa[NFQA_CFG_PARAMS]);
> > --
> > 2.39.5 (Apple Git-154)
> >
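
For anyone who wants to poke at the new knob from userspace before
libnetfilter_queue grows support, here is a minimal libmnl sketch
(illustrative only, not part of the patch; it assumes uapi headers that
already carry the new NFQA_CFG_HASH_SIZE value, targets an already-bound
queue so the resize path runs, and omits error handling). The kernel
reads the attribute with ntohl(nla_get_be32()), so the value is sent in
network byte order; unpatched kernels simply ignore the attribute.

#include <stdint.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_queue.h>

static void nfq_set_hash_size(struct mnl_socket *nl, uint16_t queue_num,
			      uint32_t hash_size)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct nlmsghdr *nlh;
	struct nfgenmsg *nfg;

	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG;
	nlh->nlmsg_flags = NLM_F_REQUEST;

	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
	nfg->nfgen_family = AF_UNSPEC;
	nfg->version = NFNETLINK_V0;
	nfg->res_id = htons(queue_num);

	/* u32, network byte order; the kernel rounds it up to a power
	 * of two and clamps it to the [16, 131072] range.
	 */
	mnl_attr_put_u32(nlh, NFQA_CFG_HASH_SIZE, htonl(hash_size));

	mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
}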