Message-ID: <ZuMC2fYPPtWggB2w@LQ3V64L9R2.homenet.telecomitalia.it>
Date: Thu, 12 Sep 2024 17:03:53 +0200
From: Joe Damato <jdamato@...tly.com>
To: netdev@...r.kernel.org
Cc: mkarsten@...terloo.ca, kuba@...nel.org, skhawaja@...gle.com,
sdf@...ichev.me, bjorn@...osinc.com, amritha.nambiar@...el.com,
sridhar.samudrala@...el.com,
"David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>, Paolo Abeni <pabeni@...hat.com>,
Jonathan Corbet <corbet@....net>, Jiri Pirko <jiri@...nulli.us>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Lorenzo Bianconi <lorenzo@...nel.org>,
David Ahern <dsahern@...nel.org>,
Johannes Berg <johannes.berg@...el.com>,
"open list:DOCUMENTATION" <linux-doc@...r.kernel.org>,
open list <linux-kernel@...r.kernel.org>
Subject: Re: [RFC net-next v3 5/9] net: napi: Add napi_config
Several comments below on different things in this patch that I just noticed.
On Thu, Sep 12, 2024 at 10:07:13AM +0000, Joe Damato wrote:
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the storage for a NAPI by passing an index
> when calling netif_napi_add_storage.
>
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted), and set to 0 when napi_enable is called.
I forgot to re-read all the commit messages; I will do that for rfcv4
and make sure they are all correct. This one is not.
> Drivers which call netif_napi_add_storage will have persistent
> NAPI IDs.
>
> Signed-off-by: Joe Damato <jdamato@...tly.com>
> ---
> .../networking/net_cachelines/net_device.rst | 1 +
> include/linux/netdevice.h | 34 +++++++++
> net/core/dev.c | 74 +++++++++++++++++--
> net/core/dev.h | 12 +++
> 4 files changed, 113 insertions(+), 8 deletions(-)
>
[...]
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3e07ab8e0295..08afc96179f9 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
[...]
> @@ -2685,6 +2717,8 @@ void __netif_napi_del(struct napi_struct *napi);
> */
> static inline void netif_napi_del(struct napi_struct *napi)
> {
> + napi->config = NULL;
> + napi->index = -1;
> __netif_napi_del(napi);
> synchronize_net();
> }
I don't quite see how, but occasionally when changing the queue
count with ethtool -L, I seem to trigger a page_pool issue.
I assumed it was related to either my change above in netif_napi_del or
the one below in __netif_napi_del, but I'm not so sure: the issue does not
happen reliably, and I can't see how my change would cause it.
When it does happen, this is the stack trace:
page_pool_empty_ring() page_pool refcnt -30528 violation
------------[ cut here ]------------
Negative(-1) inflight packet-pages
WARNING: CPU: 1 PID: 5117 at net/core/page_pool.c:617 page_pool_inflight+0x4c/0x90
[...]
CPU: 1 UID: 0 PID: 5117 Comm: ethtool Tainted: G W [...]
RIP: 0010:page_pool_inflight+0x4c/0x90
Code: e4 b8 00 00 00 00 44 0f 48 e0 44 89 e0 41 5c e9 8a c1 1b 00 66 90 45 85 e4 79 ef 44 89 e6 48 c7 c7 78 56 26 82 e8 14 63 78 ff <0f> 0b eb dc 65 8b 05 b5 af 71 7e 48 0f a3 05 21 d9 3b 01 73 d7 48
RSP: 0018:ffffc9001d01b640 EFLAGS: 00010282
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027
RDX: 0000000000000027 RSI: 00000000ffefffff RDI: ffff88bf4f45c988
RBP: ffff8900da55a800 R08: 0000000000000000 R09: ffffc9001d01b480
R10: 0000000000000001 R11: 0000000000000001 R12: 00000000ffffffff
R13: ffffffff82cd35b0 R14: ffff890062550f00 R15: ffff8881b0e85400
FS: 00007fa9cb382740(0000) GS:ffff88bf4f440000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000558baa9d3b38 CR3: 000000011222a000 CR4: 0000000000350ef0
Call Trace:
<TASK>
? __warn+0x80/0x110
? page_pool_inflight+0x4c/0x90
? report_bug+0x19c/0x1b0
? handle_bug+0x3c/0x70
? exc_invalid_op+0x18/0x70
? asm_exc_invalid_op+0x1a/0x20
? page_pool_inflight+0x4c/0x90
page_pool_release+0x10e/0x1d0
page_pool_destroy+0x66/0x160
mlx5e_free_rq+0x69/0xb0 [mlx5_core]
mlx5e_close_queues+0x39/0x150 [mlx5_core]
mlx5e_close_channel+0x1c/0x60 [mlx5_core]
mlx5e_close_channels+0x49/0xa0 [mlx5_core]
mlx5e_switch_priv_channels+0xa9/0x120 [mlx5_core]
? __pfx_mlx5e_num_channels_changed_ctx+0x10/0x10 [mlx5_core]
mlx5e_safe_switch_params+0xb8/0xf0 [mlx5_core]
mlx5e_ethtool_set_channels+0x17a/0x290 [mlx5_core]
ethnl_set_channels+0x243/0x310
ethnl_default_set_doit+0xc1/0x170
genl_family_rcv_msg_doit+0xd9/0x130
genl_rcv_msg+0x18f/0x2c0
? __pfx_ethnl_default_set_doit+0x10/0x10
? __pfx_genl_rcv_msg+0x10/0x10
netlink_rcv_skb+0x5a/0x110
genl_rcv+0x28/0x40
netlink_unicast+0x1aa/0x260
netlink_sendmsg+0x1e9/0x420
__sys_sendto+0x1d5/0x1f0
? handle_mm_fault+0x1cb/0x290
? do_user_addr_fault+0x558/0x7c0
__x64_sys_sendto+0x29/0x30
do_syscall_64+0x5d/0x170
entry_SYSCALL_64_after_hwframe+0x76/0x7e
> diff --git a/net/core/dev.c b/net/core/dev.c
> index f2fd503516de..ca2227d0b8ed 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
[...]
> @@ -6736,7 +6776,13 @@ void __netif_napi_del(struct napi_struct *napi)
> if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
> return;
>
> - napi_hash_del(napi);
> + if (!napi->config) {
> + napi_hash_del(napi);
> + } else {
> + napi->index = -1;
> + napi->config = NULL;
> + }
> +
See above; perhaps something related to this change is triggering the page pool
warning occasionally.
> list_del_rcu(&napi->dev_list);
> napi_free_frags(napi);
>
> @@ -11049,6 +11095,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
> unsigned int txqs, unsigned int rxqs)
> {
> struct net_device *dev;
> + size_t napi_config_sz;
> + unsigned int maxqs;
>
> BUG_ON(strlen(name) >= sizeof(dev->name));
>
> @@ -11062,6 +11110,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
> return NULL;
> }
>
> + WARN_ON_ONCE(txqs != rxqs);
This warning triggers for me on boot every time with mlx5 NICs.
The code in mlx5 seems to get the rxq and txq maximums in:
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
mlx5e_create_netdev
which does:
txqs = mlx5e_get_max_num_txqs(mdev, profile);
rxqs = mlx5e_get_max_num_rxqs(mdev, profile);
netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), txqs, rxqs);
For my device, txqs: 760 and rxqs: 63.
I would guess that this warning will trigger every time for mlx5 NICs
and would be quite annoying.
We may just want to change the allocation logic to allocate txqs + rxqs
entries, remove the WARN_ON_ONCE, and accept some wasted space?
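Roughly something like this (an untested sketch, just to illustrate sizing
the array for the sum instead of the max):

	/* no maxqs / WARN_ON_ONCE; size napi_config for one entry per queue */
	napi_config_sz = array_size((size_t)txqs + rxqs,
				    sizeof(*dev->napi_config));
	dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
	if (!dev->napi_config)
		goto free_all;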
> + maxqs = max(txqs, rxqs);
> +
> dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
> GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
> if (!dev)
> @@ -11136,6 +11187,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
> if (!dev->ethtool)
> goto free_all;
>
> + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
> + dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
> + if (!dev->napi_config)
> + goto free_all;
> +
[...]
> diff --git a/net/core/dev.h b/net/core/dev.h
> index a9d5f678564a..9eb3f559275c 100644
> --- a/net/core/dev.h
> +++ b/net/core/dev.h
> @@ -167,11 +167,17 @@ static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
> static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
> u32 defer)
> {
> + unsigned int count = max(netdev->num_rx_queues,
> + netdev->num_tx_queues);
> struct napi_struct *napi;
> + int i;
>
> WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
> list_for_each_entry(napi, &netdev->napi_list, dev_list)
> napi_set_defer_hard_irqs(napi, defer);
> +
> + for (i = 0; i < count; i++)
> + netdev->napi_config[i].defer_hard_irqs = defer;
The above is incorrect. Some devices may have netdev->napi_config =
NULL if they don't call the add_storage wrapper.
Unless there is major feedback or changes requested that affect this
code, I plan to fix this in the rfcv4 branch by adding a NULL check:

	if (netdev->napi_config) {
		for (i = 0; i < count; i++)
			netdev->napi_config[i].defer_hard_irqs = defer;
	}
>
> /**
> @@ -206,11 +212,17 @@ static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
> static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
> unsigned long timeout)
> {
> + unsigned int count = max(netdev->num_rx_queues,
> + netdev->num_tx_queues);
> struct napi_struct *napi;
> + int i;
>
> WRITE_ONCE(netdev->gro_flush_timeout, timeout);
> list_for_each_entry(napi, &netdev->napi_list, dev_list)
> napi_set_gro_flush_timeout(napi, timeout);
> +
> + for (i = 0; i < count; i++)
> + netdev->napi_config[i].gro_flush_timeout = timeout;
Same as above.
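i.e., the same NULL check would be needed here, roughly (untested):

	if (netdev->napi_config) {
		for (i = 0; i < count; i++)
			netdev->napi_config[i].gro_flush_timeout = timeout;
	}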