Message-ID: <CAL+tcoCL8DmRv70NSbJPPkPd-ksD=FMo+HPMi8i-HvAS=x2rxA@mail.gmail.com>
Date: Sat, 27 Sep 2025 09:09:29 +0800
From: Jason Xing <kerneljasonxing@...il.com>
To: Eric Dumazet <edumazet@...gle.com>
Cc: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, 
	Paolo Abeni <pabeni@...hat.com>, Simon Horman <horms@...nel.org>, 
	Kuniyuki Iwashima <kuniyu@...gle.com>, Willem de Bruijn <willemb@...gle.com>, netdev@...r.kernel.org, 
	eric.dumazet@...il.com
Subject: Re: [PATCH net-next 2/3] net: use llist for sd->defer_list

On Fri, Sep 26, 2025 at 11:13 PM Eric Dumazet <edumazet@...gle.com> wrote:
>
> Get rid of sd->defer_lock and adopt llist operations.
>
> We optimize skb_attempt_defer_free() for the common case,
> where the packet is queued. Otherwise sd->defer_count
> is increasing, until skb_defer_free_flush() clears it.
>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>

Quite an interesting optimization! I like the lock-free version. Thanks!

Reviewed-by: Jason Xing <kerneljasonxing@...il.com>
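
Not strictly part of the review, but for readers less familiar with the
llist API: the pattern described in the changelog boils down to a
lock-free push on the producer side (llist_add()) plus a single atomic
exchange on the consumer side (llist_del_all()), so no spinlock is
needed. A rough userspace analogue using C11 atomics, only to
illustrate the idea (struct node, push() and pop_all() are my names
here, not the kernel API):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int val;
};

/* plays the role of sd->defer_list */
static _Atomic(struct node *) head;

/* producer side, ~ llist_add(): lock-free push via a CAS loop */
static void push(struct node *n)
{
	struct node *old = atomic_load(&head);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&head, &old, n));
}

/* consumer side, ~ llist_del_all(): detach the whole list at once */
static struct node *pop_all(void)
{
	return atomic_exchange(&head, NULL);
}

int main(void)
{
	for (int i = 0; i < 4; i++) {
		struct node *n = malloc(sizeof(*n));

		n->val = i;
		push(n);
	}

	/* ~ llist_for_each_entry_safe(): walk and free the detached list */
	for (struct node *n = pop_all(), *next; n; n = next) {
		next = n->next;
		printf("freeing node %d\n", n->val);
		free(n);
	}
	return 0;
}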

> ---
>  include/linux/netdevice.h |  8 ++++----
>  net/core/dev.c            | 18 ++++++------------
>  net/core/skbuff.c         | 15 +++++++--------
>  3 files changed, 17 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 27e3fa69253f694b98d32b6138cf491da5a8b824..5c9aa16933d197f70746d64e5f44cae052d9971c 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3537,10 +3537,10 @@ struct softnet_data {
>         struct numa_drop_counters drop_counters;
>
>         /* Another possibly contended cache line */
> -       spinlock_t              defer_lock ____cacheline_aligned_in_smp;
> -       atomic_t                defer_count;
> -       int                     defer_ipi_scheduled;
> -       struct sk_buff          *defer_list;
> +       struct llist_head       defer_list ____cacheline_aligned_in_smp;
> +       atomic_long_t           defer_count;
> +
> +       int                     defer_ipi_scheduled ____cacheline_aligned_in_smp;
>         call_single_data_t      defer_csd;
>  };
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 8566678d83444e8aacbfea4842878279cf28516f..fb67372774de10b0b112ca71c7c7a13819c2325b 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6717,22 +6717,16 @@ EXPORT_SYMBOL(napi_complete_done);
>
>  static void skb_defer_free_flush(struct softnet_data *sd)
>  {
> +       struct llist_node *free_list;
>         struct sk_buff *skb, *next;
>
> -       /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
> -       if (!READ_ONCE(sd->defer_list))
> +       if (llist_empty(&sd->defer_list))
>                 return;
> +       atomic_long_set(&sd->defer_count, 0);
> +       free_list = llist_del_all(&sd->defer_list);
>
> -       spin_lock(&sd->defer_lock);
> -       skb = sd->defer_list;
> -       sd->defer_list = NULL;
> -       atomic_set(&sd->defer_count, 0);
> -       spin_unlock(&sd->defer_lock);
> -
> -       while (skb != NULL) {
> -               next = skb->next;
> +       llist_for_each_entry_safe(skb, next, free_list, ll_node) {

nit: no need to keep the braces now that the loop body is a single statement.
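
i.e. something like this (untested):

	llist_for_each_entry_safe(skb, next, free_list, ll_node)
		napi_consume_skb(skb, 1);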

>                 napi_consume_skb(skb, 1);
> -               skb = next;
>         }
>  }
>
> @@ -12995,7 +12989,7 @@ static int __init net_dev_init(void)
>                 sd->cpu = i;
>  #endif
>                 INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
> -               spin_lock_init(&sd->defer_lock);
> +               init_llist_head(&sd->defer_list);
>
>                 gro_init(&sd->backlog.gro);
>                 sd->backlog.poll = process_backlog;
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f91571f51c69ecf8c2fffed5f3a3cd33fd95828b..22d9dba0e433cf67243a5b7dda77e61d146baf50 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -7184,6 +7184,7 @@ static void kfree_skb_napi_cache(struct sk_buff *skb)
>   */
>  void skb_attempt_defer_free(struct sk_buff *skb)
>  {
> +       unsigned long defer_count;
>         int cpu = skb->alloc_cpu;
>         struct softnet_data *sd;
>         unsigned int defer_max;
> @@ -7201,17 +7202,15 @@ nodefer:        kfree_skb_napi_cache(skb);
>
>         sd = &per_cpu(softnet_data, cpu);
>         defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
> -       if (atomic_read(&sd->defer_count) >= defer_max)
> +       defer_count = atomic_long_inc_return(&sd->defer_count);
> +
> +       if (defer_count >= defer_max)
>                 goto nodefer;
>
> -       spin_lock_bh(&sd->defer_lock);
> -       /* Send an IPI every time queue reaches half capacity. */
> -       kick = (atomic_inc_return(&sd->defer_count) - 1) == (defer_max >> 1);
> +       llist_add(&skb->ll_node, &sd->defer_list);
>
> -       skb->next = sd->defer_list;
> -       /* Paired with READ_ONCE() in skb_defer_free_flush() */
> -       WRITE_ONCE(sd->defer_list, skb);
> -       spin_unlock_bh(&sd->defer_lock);
> +       /* Send an IPI every time queue reaches half capacity. */
> +       kick = (defer_count - 1) == (defer_max >> 1);
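
The "half capacity" arithmetic is unchanged from the old
atomic_inc_return() version: defer_count is the post-increment value,
so assuming the default defer_max of 64 the kick fires once per flush
cycle, when the 33rd skb is queued. A trivial userspace check of just
this condition, not the kernel code:

	#include <stdio.h>

	int main(void)
	{
		/* assumed default of the net.core.skb_defer_max sysctl */
		const unsigned long defer_max = 64;

		/* defer_count: value returned by atomic_long_inc_return() */
		for (unsigned long defer_count = 1; defer_count < defer_max; defer_count++)
			if ((defer_count - 1) == (defer_max >> 1))
				printf("kick fires when defer_count reaches %lu\n", defer_count);
		return 0;
	}
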
>
>         /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
>          * if we are unlucky enough (this seems very unlikely).
> --
> 2.51.0.536.g15c5d4f767-goog
>
>
