Message-ID: <CANn89iJUvUo7fj-ePfm78V18rT-YF-sV6f-BCY9Sr7qqxXCRqQ@mail.gmail.com>
Date: Fri, 19 Sep 2025 23:52:58 -0700
From: Eric Dumazet <edumazet@...gle.com>
To: Kuniyuki Iwashima <kuniyu@...gle.com>
Cc: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, 
	Paolo Abeni <pabeni@...hat.com>, Simon Horman <horms@...nel.org>, 
	Willem de Bruijn <willemb@...gle.com>, netdev@...r.kernel.org, eric.dumazet@...il.com
Subject: Re: [PATCH net-next] udp: remove busylock and add per NUMA queues

On Fri, Sep 19, 2025 at 6:18 PM Kuniyuki Iwashima <kuniyu@...gle.com> wrote:
>
> On Fri, Sep 19, 2025 at 9:43 AM Eric Dumazet <edumazet@...gle.com> wrote:
> >
> > busylock was protecting UDP sockets against packet floods,
> > but unfortunately was not protecting the host itself.
> >
> > Under stress, many cpus could spin while acquiring the busylock,
> > and the NIC had to drop packets. Or packets would be dropped
> > in the cpu backlog if RPS/RFS were in place.
> >
> > This patch replaces the busylock with intermediate
> > lockless queues (one queue per NUMA node).
> >
> > This means that fewer cpus have to acquire
> > the UDP receive queue lock.
> >
> > Most of the cpus can either:
> > - immediately drop the packet.
> > - or queue it in their NUMA-aware lockless queue.
> >
> > Then one of the cpus is chosen to process this lockless queue
> > in a batch.
> >
> > The batch only contains packets that were cooked on the same
> > NUMA node, so the latency impact is very limited.
> >
> > Tested:
> >
> > DDOS targeting a victim UDP socket, on a platform with 6 NUMA nodes
> > (Intel(R) Xeon(R) 6985P-C)
> >
> > Before:
> >
> > nstat -n ; sleep 1 ; nstat | grep Udp
> > Udp6InDatagrams                 1004179            0.0
> > Udp6InErrors                    3117               0.0
> > Udp6RcvbufErrors                3117               0.0
> >
> > After:
> > nstat -n ; sleep 1 ; nstat | grep Udp
> > Udp6InDatagrams                 1116633            0.0
> > Udp6InErrors                    14197275           0.0
> > Udp6RcvbufErrors                14197275           0.0
> >
> > We can see this host can now process 14.2 M more packets per second
> > while under attack, and the victim socket can receive 11% more
> > packets.
> >
> > Note that the remaining bottleneck for this platform is in
> > udp_drops_inc() because we limited struct numa_drop_counters
> > to only two nodes so far.
> >
> > Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> > ---
> >  include/linux/udp.h |  9 ++++-
> >  include/net/udp.h   | 11 ++++-
> >  net/ipv4/udp.c      | 99 ++++++++++++++++++++++++---------------------
> >  net/ipv6/udp.c      |  5 ++-
> >  4 files changed, 73 insertions(+), 51 deletions(-)
> >
> > diff --git a/include/linux/udp.h b/include/linux/udp.h
> > index e554890c4415b411f35007d3ece9e6042db7a544..58795688a18636ea79aa1f5d06eacc676a2e7849 100644
> > --- a/include/linux/udp.h
> > +++ b/include/linux/udp.h
> > @@ -44,6 +44,12 @@ enum {
> >         UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */
> >  };
> >
> > +/* per NUMA structure for lockless producer usage. */
> > +struct udp_prod_queue {
> > +       struct llist_head       ll_root ____cacheline_aligned_in_smp;
> > +       atomic_t                rmem_alloc;
> > +};
> > +
> >  struct udp_sock {
> >         /* inet_sock has to be the first member */
> >         struct inet_sock inet;
> > @@ -90,6 +96,8 @@ struct udp_sock {
> >                                                 struct sk_buff *skb,
> >                                                 int nhoff);
> >
> > +       struct udp_prod_queue *udp_prod_queue;
> > +
> >         /* udp_recvmsg try to use this before splicing sk_receive_queue */
> >         struct sk_buff_head     reader_queue ____cacheline_aligned_in_smp;
> >
> > @@ -109,7 +117,6 @@ struct udp_sock {
> >          */
> >         struct hlist_node       tunnel_list;
> >         struct numa_drop_counters drop_counters;
> > -       spinlock_t              busylock ____cacheline_aligned_in_smp;
> >  };
> >
> >  #define udp_test_bit(nr, sk)                   \
> > diff --git a/include/net/udp.h b/include/net/udp.h
> > index eecd64097f91196897f45530540b9c9b68c5ba4e..ae750324bc87a79d0e9182c5589371d82be3e3ee 100644
> > --- a/include/net/udp.h
> > +++ b/include/net/udp.h
> > @@ -284,16 +284,23 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *));
> >  struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
> >                                   netdev_features_t features, bool is_ipv6);
> >
> > -static inline void udp_lib_init_sock(struct sock *sk)
> > +static inline int udp_lib_init_sock(struct sock *sk)
> >  {
> >         struct udp_sock *up = udp_sk(sk);
> >
> >         sk->sk_drop_counters = &up->drop_counters;
> > -       spin_lock_init(&up->busylock);
> >         skb_queue_head_init(&up->reader_queue);
> >         INIT_HLIST_NODE(&up->tunnel_list);
> >         up->forward_threshold = sk->sk_rcvbuf >> 2;
> >         set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
> > +
> > +       up->udp_prod_queue = kcalloc(nr_node_ids, sizeof(*up->udp_prod_queue),
> > +                                    GFP_KERNEL);
> > +       if (!up->udp_prod_queue)
> > +               return -ENOMEM;
> > +       for (int i = 0; i < nr_node_ids; i++)
> > +               init_llist_head(&up->udp_prod_queue[i].ll_root);
> > +       return 0;
> >  }
> >
> >  static inline void udp_drops_inc(struct sock *sk)
> > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> > index 0c40426628eb2306b609881341a51307c4993871..f2d95fe18aec8f317ab33b4ed3306149fce6690b 100644
> > --- a/net/ipv4/udp.c
> > +++ b/net/ipv4/udp.c
> > @@ -1685,25 +1685,6 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
> >         udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
> >  }
> >
> > -/* Idea of busylocks is to let producers grab an extra spinlock
> > - * to relieve pressure on the receive_queue spinlock shared by consumer.
> > - * Under flood, this means that only one producer can be in line
> > - * trying to acquire the receive_queue spinlock.
> > - */
> > -static spinlock_t *busylock_acquire(struct sock *sk)
> > -{
> > -       spinlock_t *busy = &udp_sk(sk)->busylock;
> > -
> > -       spin_lock(busy);
> > -       return busy;
> > -}
> > -
> > -static void busylock_release(spinlock_t *busy)
> > -{
> > -       if (busy)
> > -               spin_unlock(busy);
> > -}
> > -
> >  static int udp_rmem_schedule(struct sock *sk, int size)
> >  {
> >         int delta;
> > @@ -1718,14 +1699,23 @@ static int udp_rmem_schedule(struct sock *sk, int size)
> >  int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
> >  {
> >         struct sk_buff_head *list = &sk->sk_receive_queue;
> > +       struct udp_prod_queue *udp_prod_queue;
> > +       struct llist_node *ll_list;
> >         unsigned int rmem, rcvbuf;
> > -       spinlock_t *busy = NULL;
> >         int size, err = -ENOMEM;
> > +       struct sk_buff *next;
> > +       int total_size = 0;
> > +       int q_size = 0;
> > +       int nb = 0;
> >
> >         rmem = atomic_read(&sk->sk_rmem_alloc);
> >         rcvbuf = READ_ONCE(sk->sk_rcvbuf);
> >         size = skb->truesize;
> >
> > +       udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()];
> > +
> > +       rmem += atomic_read(&udp_prod_queue->rmem_alloc);
> > +
> >         /* Immediately drop when the receive queue is full.
> >          * Cast to unsigned int performs the boundary check for INT_MAX.
> >          */
> > @@ -1747,45 +1737,60 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
> >         if (rmem > (rcvbuf >> 1)) {
> >                 skb_condense(skb);
> >                 size = skb->truesize;
> > -               rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
> > -               if (rmem > rcvbuf)
> > -                       goto uncharge_drop;
> > -               busy = busylock_acquire(sk);
> > -       } else {
> > -               atomic_add(size, &sk->sk_rmem_alloc);
> >         }
> >
> >         udp_set_dev_scratch(skb);
> >
> > +       atomic_add(size, &udp_prod_queue->rmem_alloc);
> > +
> > +       if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root))
> > +               return 0;
> > +
> >         spin_lock(&list->lock);
> > -       err = udp_rmem_schedule(sk, size);
> > -       if (err) {
> > -               spin_unlock(&list->lock);
> > -               goto uncharge_drop;
> > -       }
> >
> > -       sk_forward_alloc_add(sk, -size);
> > +       ll_list = llist_del_all(&udp_prod_queue->ll_root);
> >
> > -       /* no need to setup a destructor, we will explicitly release the
> > -        * forward allocated memory on dequeue
> > -        */
> > -       sock_skb_set_dropcount(sk, skb);
> > +       ll_list = llist_reverse_order(ll_list);
> > +
> > +       llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
> > +               size = udp_skb_truesize(skb);
> > +               total_size += size;
> > +               err = udp_rmem_schedule(sk, size);
> > +               if (err) {
> > +                       udp_drops_inc(sk);
> > +                       // TODO update SNMP values.
> > +                       sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM);
>
> Could using skb_attempt_defer_free() here


1) These skbs are on the same NUMA node.

2) This is an extremely unlikely path.

In fact I had another patch in udp_rmem_release() trying to keep
sk_forward_alloc as long as some packets are in flight.

@@ -1508,7 +1516,10 @@ static void udp_rmem_release(struct sock *sk, unsigned int size,
 	if (!rx_queue_lock_held)
 		spin_lock(&sk_queue->lock);
 
-	amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
+	if (size != sk_rmem_alloc_get(sk))
+		amt = 0;
+	else
+		amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
 	sk_forward_alloc_add(sk, size - amt);
 
 	if (amt)
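
The idea being to only trim forward allocated memory when this release
covers everything currently charged to sk_rmem_alloc; while other packets
are still in flight the reservation is kept.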

3) We would like to keep SKB_DROP_REASON_PROTO_MEM if possible.

> and cpu_to_node(skb->alloc_cpu) for prod_queue selection help further ?

Not sure what you mean.

Here, all skbs in ll_list have cpu_to_node(skb->alloc_cpu) == our_node.

We could add a DEBUG_NET_WARN_ON_ONCE() if you like ;)
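
Something like this at the top of the llist walk in
__udp_enqueue_schedule_skb() (untested sketch):

	llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
		/* every skb in this batch was queued from the local node */
		DEBUG_NET_WARN_ON_ONCE(cpu_to_node(skb->alloc_cpu) !=
				       numa_node_id());
		...
	}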

>
> Or cache miss of alloc_cpu and defer_lock & IPI could be
> rather expensive under DDoS ?
>
>
> > +                       continue;
> > +               }
> > +
> > +               q_size += size;
> > +               sk_forward_alloc_add(sk, -size);
> > +
> > +               /* no need to setup a destructor, we will explicitly release the
> > +                * forward allocated memory on dequeue
> > +                */
> > +               sock_skb_set_dropcount(sk, skb);
> > +               nb++;
> > +               __skb_queue_tail(list, skb);
> > +       }
> > +
> > +       atomic_add(q_size, &sk->sk_rmem_alloc);
> >
> > -       __skb_queue_tail(list, skb);
> >         spin_unlock(&list->lock);
> >
> > -       if (!sock_flag(sk, SOCK_DEAD))
> > -               INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk);
> > +       atomic_sub(total_size, &udp_prod_queue->rmem_alloc);
> >
> > -       busylock_release(busy);
> > -       return 0;
> > +       if (!sock_flag(sk, SOCK_DEAD)) {
> > +               while (nb) {
> > +                       INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk);
> > +                       nb--;
> > +               }
> > +       }
> >
> > -uncharge_drop:
> > -       atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
> > +       return 0;
> >
> >  drop:
> >         udp_drops_inc(sk);
> > -       busylock_release(busy);
> >         return err;
> >  }
> >  EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb);
> > @@ -1814,10 +1819,11 @@ static void udp_destruct_sock(struct sock *sk)
> >
> >  int udp_init_sock(struct sock *sk)
> >  {
> > -       udp_lib_init_sock(sk);
> > +       int res = udp_lib_init_sock(sk);
> > +
> >         sk->sk_destruct = udp_destruct_sock;
> >         set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
> > -       return 0;
> > +       return res;
> >  }
> >
> >  void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
> > @@ -2906,6 +2912,7 @@ void udp_destroy_sock(struct sock *sk)
> >                         udp_tunnel_cleanup_gro(sk);
> >                 }
> >         }
> > +       kfree(up->udp_prod_queue);
> >  }
> >
> >  typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk,
> > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> > index 9f4d340d1e3a63d38f80138ef9f6aac4a33afa05..813a2ba75824d14631642bf6973f65063b2825cb 100644
> > --- a/net/ipv6/udp.c
> > +++ b/net/ipv6/udp.c
> > @@ -67,10 +67,11 @@ static void udpv6_destruct_sock(struct sock *sk)
> >
> >  int udpv6_init_sock(struct sock *sk)
> >  {
> > -       udp_lib_init_sock(sk);
> > +       int res = udp_lib_init_sock(sk);
> > +
> >         sk->sk_destruct = udpv6_destruct_sock;
> >         set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
> > -       return 0;
> > +       return res;
> >  }
> >
> >  INDIRECT_CALLABLE_SCOPE
> > --
> > 2.51.0.470.ga7dc726c21-goog
> >
