Message-ID: <Z5KkEF-2NiX4SuB_@LQ3V64L9R2>
Date: Thu, 23 Jan 2025 12:18:24 -0800
From: Joe Damato <jdamato@...tly.com>
To: Ahmed Zaki <ahmed.zaki@...el.com>
Cc: netdev@...r.kernel.org, intel-wired-lan@...ts.osuosl.org,
andrew+netdev@...n.ch, edumazet@...gle.com, kuba@...nel.org,
horms@...nel.org, pabeni@...hat.com, davem@...emloft.net,
michael.chan@...adcom.com, tariqt@...dia.com,
anthony.l.nguyen@...el.com, przemyslaw.kitszel@...el.com,
shayd@...dia.com, akpm@...ux-foundation.org, shayagr@...zon.com,
kalesh-anakkur.purayil@...adcom.com
Subject: Re: [PATCH net-next v6 2/5] net: napi: add CPU affinity to
napi_config
On Fri, Jan 17, 2025 at 05:33:32PM -0700, Ahmed Zaki wrote:
> A common task for most drivers is to remember the user-set CPU affinity
> of their IRQs. On each netdev reset, the driver should re-assign the
> user's settings to the IRQs.
>
> Add CPU affinity mask to napi_config. To delegate the CPU affinity
> management to the core, drivers must:
> 1 - set the new netdev flag "irq_affinity_auto":
> netif_enable_irq_affinity(netdev)
> 2 - create the napi with persistent config:
> netif_napi_add_config()
> 3 - bind an IRQ to the napi instance: netif_napi_set_irq()
>
> The core will then make sure to re-assign the user-set affinity to the
> napi's IRQ.
>
> The default IRQ affinity mask is set to one CPU, starting from the
> closest NUMA node.
Maybe the above would be helpful to add to
Documentation/networking/napi.rst? Perhaps with a short driver-side
example like the sketch below.
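Something like this, purely as a sketch -- "my_drv" and the queue/IRQ
fields are made up for illustration; the three netif_* calls are the
ones from your commit message:

  static int my_drv_init_napis(struct my_drv *priv)
  {
          int i;

          /* 1 - delegate IRQ affinity management to the core */
          netif_enable_irq_affinity(priv->netdev);

          for (i = 0; i < priv->num_queues; i++) {
                  struct my_drv_queue *q = &priv->queues[i];

                  /* 2 - napi with persistent, index-based config */
                  netif_napi_add_config(priv->netdev, &q->napi,
                                        my_drv_poll, i);

                  /* 3 - bind the queue's IRQ to the napi instance */
                  netif_napi_set_irq(&q->napi, q->irq);
          }

          return 0;
  }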
> Signed-off-by: Ahmed Zaki <ahmed.zaki@...el.com>
> ---
> include/linux/netdevice.h | 14 ++++++++++-
> net/core/dev.c | 51 +++++++++++++++++++++++++++++----------
> 2 files changed, 51 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 98259f19c627..d576e5c91c43 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -351,6 +351,7 @@ struct napi_config {
> u64 gro_flush_timeout;
> u64 irq_suspend_timeout;
> u32 defer_hard_irqs;
> + cpumask_t affinity_mask;
> unsigned int napi_id;
> };
>
> @@ -393,8 +394,8 @@ struct napi_struct {
> struct list_head dev_list;
> struct hlist_node napi_hash_node;
> int irq;
> -#ifdef CONFIG_RFS_ACCEL
> struct irq_affinity_notify notify;
> +#ifdef CONFIG_RFS_ACCEL
> int napi_rmap_idx;
> #endif
> int index;
> @@ -1991,6 +1992,11 @@ enum netdev_reg_state {
> *
> * @threaded: napi threaded mode is enabled
> *
> + * @irq_affinity_auto: driver wants the core to manage the IRQ affinity.
> + * Set by netif_enable_irq_affinity(), then driver must
> + * create persistent napi by netif_napi_add_config()
> + * and finally bind napi to IRQ (netif_napi_set_irq).
> + *
> * @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap.
> * Set by calling netif_enable_cpu_rmap().
> *
> @@ -2401,6 +2407,7 @@ struct net_device {
> struct lock_class_key *qdisc_tx_busylock;
> bool proto_down;
> bool threaded;
> + bool irq_affinity_auto;
> bool rx_cpu_rmap_auto;
>
> /* priv_flags_slow, ungrouped to save space */
> @@ -2653,6 +2660,11 @@ static inline void netdev_set_ml_priv(struct net_device *dev,
> dev->ml_priv_type = type;
> }
>
> +static inline void netif_enable_irq_affinity(struct net_device *dev)
> +{
> + dev->irq_affinity_auto = true;
> +}
I'll have to look at the patches which use the above function, but the
first thing that came to mind when I saw this was: does the above need
a WRITE_ONCE()?
The reads below seem to be protected by a lock; I haven't yet looked
at the other patches, so maybe the write is also protected by
netdev->lock?
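If not, I'd expect something like this -- just a sketch of what I mean,
assuming there really are lockless readers that would then pair with
READ_ONCE():

  static inline void netif_enable_irq_affinity(struct net_device *dev)
  {
          /* sketch: annotate the write if it can race with
           * lockless readers of irq_affinity_auto
           */
          WRITE_ONCE(dev->irq_affinity_auto, true);
  }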
> /*
> * Net namespace inlines
> */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index dbb63005bc2b..bc82c7f621b3 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
[...]
>
> +#ifdef CONFIG_RFS_ACCEL
> static void netif_napi_affinity_release(struct kref *ref)
> {
> struct napi_struct *napi =
> @@ -6901,7 +6908,7 @@ static int napi_irq_cpu_rmap_add(struct napi_struct *napi, int irq)
> if (!rmap)
> return -EINVAL;
>
> - napi->notify.notify = netif_irq_cpu_rmap_notify;
> + napi->notify.notify = netif_napi_irq_notify;
Same question as on the previous patch: does it make sense to set the
callbacks below only when all other operations have succeeded? (See
the sketch after the quoted lines.)
> napi->notify.release = netif_napi_affinity_release;
> cpu_rmap_get(rmap);
> rc = cpu_rmap_add(rmap, napi);
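I.e., roughly this ordering -- a sketch only; the error label is
hypothetical since the rest of the function is elided above:

          cpu_rmap_get(rmap);
          rc = cpu_rmap_add(rmap, napi);
          if (rc < 0)
                  goto err_rmap_put;

          /* install the callbacks only once everything has succeeded */
          napi->notify.notify = netif_napi_irq_notify;
          napi->notify.release = netif_napi_affinity_release;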
[...]
> @@ -6976,23 +6987,28 @@ void netif_napi_set_irq_locked(struct napi_struct *napi, int irq)
> {
> int rc;
>
> - if (!napi->dev->rx_cpu_rmap_auto)
> - goto out;
> -
> - /* Remove existing rmap entries */
> - if (napi->irq != irq && napi->irq > 0)
> + /* Remove existing resources */
> + if ((napi->dev->rx_cpu_rmap_auto || napi->dev->irq_affinity_auto) &&
> + napi->irq > 0 && napi->irq != irq)
> irq_set_affinity_notifier(napi->irq, NULL);
>
> - if (irq > 0) {
> + if (irq > 0 && napi->dev->rx_cpu_rmap_auto) {
> rc = napi_irq_cpu_rmap_add(napi, irq);
> if (rc) {
> netdev_warn(napi->dev, "Unable to update ARFS map (%d)\n",
> rc);
> netif_disable_cpu_rmap(napi->dev);
> }
> + } else if (irq > 0 && napi->config && napi->dev->irq_affinity_auto) {
> + napi->notify.notify = netif_napi_irq_notify;
> + napi->notify.release = netif_napi_affinity_release;
> +
> + rc = irq_set_affinity_notifier(irq, &napi->notify);
> + if (rc)
> + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n",
> + rc);
I see now that my comments on the previous patch are stale after
this patch is applied. I wonder if the "irq > 0" part can be pulled
out to simplify the branches here?
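Something like this, maybe -- a sketch with the same logic as your
diff, just restructured:

          if ((napi->dev->rx_cpu_rmap_auto || napi->dev->irq_affinity_auto) &&
              napi->irq > 0 && napi->irq != irq)
                  irq_set_affinity_notifier(napi->irq, NULL);

          if (irq > 0) {
                  if (napi->dev->rx_cpu_rmap_auto) {
                          rc = napi_irq_cpu_rmap_add(napi, irq);
                          if (rc) {
                                  netdev_warn(napi->dev,
                                              "Unable to update ARFS map (%d)\n",
                                              rc);
                                  netif_disable_cpu_rmap(napi->dev);
                          }
                  } else if (napi->config && napi->dev->irq_affinity_auto) {
                          napi->notify.notify = netif_napi_irq_notify;
                          napi->notify.release = netif_napi_affinity_release;

                          rc = irq_set_affinity_notifier(irq, &napi->notify);
                          if (rc)
                                  netdev_warn(napi->dev,
                                              "Unable to set IRQ notifier (%d)\n",
                                              rc);
                  }
          }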