[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ZrdsnT5aIs85jyL/@boxer>
Date: Sat, 10 Aug 2024 15:35:25 +0200
From: Maciej Fijalkowski <maciej.fijalkowski@...el.com>
To: Tony Nguyen <anthony.l.nguyen@...el.com>
CC: <davem@...emloft.net>, <kuba@...nel.org>, <pabeni@...hat.com>,
<edumazet@...gle.com>, <netdev@...r.kernel.org>, Sriram Yagnaraman
<sriram.yagnaraman@....tech>, <magnus.karlsson@...el.com>, <ast@...nel.org>,
<daniel@...earbox.net>, <hawk@...nel.org>, <john.fastabend@...il.com>,
<bpf@...r.kernel.org>, <kurt@...utronix.de>,
<sriram.yagnaraman@...csson.com>, <richardcochran@...il.com>,
<benjamin.steinke@...s-audio.com>, <bigeasy@...utronix.de>, "Chandan Kumar
Rout" <chandanx.rout@...el.com>
Subject: Re: [PATCH net-next 2/4] igb: Introduce XSK data structures and
helpers
On Thu, Aug 08, 2024 at 11:35:52AM -0700, Tony Nguyen wrote:
> From: Sriram Yagnaraman <sriram.yagnaraman@....tech>
>
> Add the following ring flags
> - IGB_RING_FLAG_TX_DISABLED (when xsk pool is being setup)
> - IGB_RING_FLAG_AF_XDP_ZC (xsk pool is active)
>
> Add a xdp_buff array for use with XSK receive batch API, and a pointer
> to xsk_pool in igb_adapter.
>
> Add enable/disable functions for TX and RX rings
> Add enable/disable functions for XSK pool
> Add xsk wakeup function
>
> None of the above functionality will be active until
> NETDEV_XDP_ACT_XSK_ZEROCOPY is advertised in netdev->xdp_features.
>
> Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@....tech>
> Signed-off-by: Kurt Kanzenbach <kurt@...utronix.de>
> Tested-by: Chandan Kumar Rout <chandanx.rout@...el.com> (A Contingent Worker at Intel)
> Signed-off-by: Tony Nguyen <anthony.l.nguyen@...el.com>
> ---
> drivers/net/ethernet/intel/igb/Makefile | 2 +-
> drivers/net/ethernet/intel/igb/igb.h | 14 +-
> drivers/net/ethernet/intel/igb/igb_main.c | 9 +
> drivers/net/ethernet/intel/igb/igb_xsk.c | 210 ++++++++++++++++++++++
> 4 files changed, 233 insertions(+), 2 deletions(-)
> create mode 100644 drivers/net/ethernet/intel/igb/igb_xsk.c
>
> diff --git a/drivers/net/ethernet/intel/igb/Makefile b/drivers/net/ethernet/intel/igb/Makefile
> index 463c0d26b9d4..6c1b702fd992 100644
> --- a/drivers/net/ethernet/intel/igb/Makefile
> +++ b/drivers/net/ethernet/intel/igb/Makefile
> @@ -8,4 +8,4 @@ obj-$(CONFIG_IGB) += igb.o
>
> igb-y := igb_main.o igb_ethtool.o e1000_82575.o \
> e1000_mac.o e1000_nvm.o e1000_phy.o e1000_mbx.o \
> - e1000_i210.o igb_ptp.o igb_hwmon.o
> + e1000_i210.o igb_ptp.o igb_hwmon.o igb_xsk.o
> diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
> index 0de71ec324ed..053130c01480 100644
> --- a/drivers/net/ethernet/intel/igb/igb.h
> +++ b/drivers/net/ethernet/intel/igb/igb.h
> @@ -20,6 +20,7 @@
> #include <linux/mdio.h>
>
> #include <net/xdp.h>
> +#include <net/xdp_sock_drv.h>
>
> struct igb_adapter;
>
> @@ -320,6 +321,7 @@ struct igb_ring {
> union { /* array of buffer info structs */
> struct igb_tx_buffer *tx_buffer_info;
> struct igb_rx_buffer *rx_buffer_info;
> + struct xdp_buff **rx_buffer_info_zc;
> };
> void *desc; /* descriptor ring memory */
> unsigned long flags; /* ring specific flags */
> @@ -357,6 +359,7 @@ struct igb_ring {
> };
> };
> struct xdp_rxq_info xdp_rxq;
> + struct xsk_buff_pool *xsk_pool;
> } ____cacheline_internodealigned_in_smp;
>
> struct igb_q_vector {
> @@ -384,7 +387,9 @@ enum e1000_ring_flags_t {
> IGB_RING_FLAG_RX_SCTP_CSUM,
> IGB_RING_FLAG_RX_LB_VLAN_BSWAP,
> IGB_RING_FLAG_TX_CTX_IDX,
> - IGB_RING_FLAG_TX_DETECT_HANG
> + IGB_RING_FLAG_TX_DETECT_HANG,
> + IGB_RING_FLAG_TX_DISABLED,
> + IGB_RING_FLAG_AF_XDP_ZC
> };
>
> #define ring_uses_large_buffer(ring) \
> @@ -822,4 +827,11 @@ int igb_add_mac_steering_filter(struct igb_adapter *adapter,
> int igb_del_mac_steering_filter(struct igb_adapter *adapter,
> const u8 *addr, u8 queue, u8 flags);
>
> +struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter,
> + struct igb_ring *ring);
> +int igb_xsk_pool_setup(struct igb_adapter *adapter,
> + struct xsk_buff_pool *pool,
> + u16 qid);
> +int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
> +
> #endif /* _IGB_H_ */
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> index bdb7637559b8..b6f23bbeff71 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -2904,9 +2904,14 @@ static int igb_xdp_setup(struct net_device *dev, struct netdev_bpf *bpf)
>
> static int igb_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> {
> + struct igb_adapter *adapter = netdev_priv(dev);
> +
> switch (xdp->command) {
> case XDP_SETUP_PROG:
> return igb_xdp_setup(dev, xdp);
> + case XDP_SETUP_XSK_POOL:
> + return igb_xsk_pool_setup(adapter, xdp->xsk.pool,
> + xdp->xsk.queue_id);
> default:
> return -EINVAL;
> }
> @@ -3033,6 +3038,7 @@ static const struct net_device_ops igb_netdev_ops = {
> .ndo_setup_tc = igb_setup_tc,
> .ndo_bpf = igb_xdp,
> .ndo_xdp_xmit = igb_xdp_xmit,
> + .ndo_xsk_wakeup = igb_xsk_wakeup,
> };
>
> /**
> @@ -4355,6 +4361,8 @@ void igb_configure_tx_ring(struct igb_adapter *adapter,
> u64 tdba = ring->dma;
> int reg_idx = ring->reg_idx;
>
> + ring->xsk_pool = igb_xsk_pool(adapter, ring);
use WRITE_ONCE()
> +
> wr32(E1000_TDLEN(reg_idx),
> ring->count * sizeof(union e1000_adv_tx_desc));
> wr32(E1000_TDBAL(reg_idx),
> @@ -4750,6 +4758,7 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
> xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
> WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
> MEM_TYPE_PAGE_SHARED, NULL));
> + ring->xsk_pool = igb_xsk_pool(adapter, ring);
ditto, use WRITE_ONCE() here as well.
I recently addressed issues around xsk in ice, see:
[0]: https://lore.kernel.org/netdev/172239123450.15322.12860347838208396251.git-patchwork-notify@kernel.org/
>
> /* disable the queue */
> wr32(E1000_RXDCTL(reg_idx), 0);
> diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c
> new file mode 100644
> index 000000000000..925bf97f7caa
> --- /dev/null
> +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
> @@ -0,0 +1,210 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright(c) 2018 Intel Corporation. */
> +
> +#include <linux/bpf_trace.h>
> +#include <net/xdp_sock_drv.h>
> +#include <net/xdp.h>
> +
> +#include "e1000_hw.h"
> +#include "igb.h"
> +
> +static int igb_realloc_rx_buffer_info(struct igb_ring *ring, bool pool_present)
> +{
> + int size = pool_present ?
> + sizeof(*ring->rx_buffer_info_zc) * ring->count :
> + sizeof(*ring->rx_buffer_info) * ring->count;
> + void *buff_info = vmalloc(size);
You need to take rx_buffer_info_zc into account in the memset in
igb_configure_rx_ring(). Also, why vmalloc?
> +
> + if (!buff_info)
> + return -ENOMEM;
> +
> + if (pool_present) {
> + vfree(ring->rx_buffer_info);
> + ring->rx_buffer_info = NULL;
> + ring->rx_buffer_info_zc = buff_info;
> + } else {
> + vfree(ring->rx_buffer_info_zc);
> + ring->rx_buffer_info_zc = NULL;
> + ring->rx_buffer_info = buff_info;
> + }
> +
> + return 0;
> +}
> +
> +static void igb_txrx_ring_disable(struct igb_adapter *adapter, u16 qid)
> +{
> + struct igb_ring *tx_ring = adapter->tx_ring[qid];
> + struct igb_ring *rx_ring = adapter->rx_ring[qid];
> + struct e1000_hw *hw = &adapter->hw;
> +
> + set_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags);
> +
> + wr32(E1000_TXDCTL(tx_ring->reg_idx), 0);
> + wr32(E1000_RXDCTL(rx_ring->reg_idx), 0);
> +
Should there be a synchronize_net() here to let the NAPI finish its current job?
> + /* Rx/Tx share the same napi context. */
> + napi_disable(&rx_ring->q_vector->napi);
> +
> + igb_clean_tx_ring(tx_ring);
> + igb_clean_rx_ring(rx_ring);
> +
> + memset(&rx_ring->rx_stats, 0, sizeof(rx_ring->rx_stats));
> + memset(&tx_ring->tx_stats, 0, sizeof(tx_ring->tx_stats));
> +}
> +
> +static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid)
> +{
> + struct igb_ring *tx_ring = adapter->tx_ring[qid];
> + struct igb_ring *rx_ring = adapter->rx_ring[qid];
> +
> + igb_configure_tx_ring(adapter, tx_ring);
> + igb_configure_rx_ring(adapter, rx_ring);
> +
Call synchronize_net() here, after updating the xsk_pool pointers.
> + clear_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags);
> +
> + /* call igb_desc_unused which always leaves
> + * at least 1 descriptor unused to make sure
> + * next_to_use != next_to_clean
> + */
> + igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring));
> +
> + /* Rx/Tx share the same napi context. */
> + napi_enable(&rx_ring->q_vector->napi);
> +}
> +
> +struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter,
> + struct igb_ring *ring)
> +{
> + int qid = ring->queue_index;
> +
> + if (!igb_xdp_is_enabled(adapter) ||
> + !test_bit(IGB_RING_FLAG_AF_XDP_ZC, &ring->flags))
See [1] for how to avoid introducing IGB_RING_FLAG_AF_XDP_ZC altogether:
[1]: https://lore.kernel.org/netdev/20240603-net-2024-05-30-intel-net-fixes-v2-3-e3563aa89b0c@intel.com/
> + return NULL;
> +
> + return xsk_get_pool_from_qid(adapter->netdev, qid);
> +}
> +
> +static int igb_xsk_pool_enable(struct igb_adapter *adapter,
> + struct xsk_buff_pool *pool,
> + u16 qid)
> +{
> + struct net_device *netdev = adapter->netdev;
> + struct igb_ring *tx_ring, *rx_ring;
> + bool if_running;
> + int err;
> +
> + if (qid >= adapter->num_rx_queues)
> + return -EINVAL;
> +
> + if (qid >= netdev->real_num_rx_queues ||
> + qid >= netdev->real_num_tx_queues)
> + return -EINVAL;
> +
> + err = xsk_pool_dma_map(pool, &adapter->pdev->dev, IGB_RX_DMA_ATTR);
> + if (err)
> + return err;
> +
> + tx_ring = adapter->tx_ring[qid];
> + rx_ring = adapter->rx_ring[qid];
> + if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter);
> + if (if_running)
> + igb_txrx_ring_disable(adapter, qid);
> +
> + set_bit(IGB_RING_FLAG_AF_XDP_ZC, &tx_ring->flags);
> + set_bit(IGB_RING_FLAG_AF_XDP_ZC, &rx_ring->flags);
> +
> + if (if_running) {
> + err = igb_realloc_rx_buffer_info(rx_ring, true);
> + if (!err) {
> + igb_txrx_ring_enable(adapter, qid);
> + /* Kick start the NAPI context so that receiving will start */
> + err = igb_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX);
> + }
> +
> + if (err) {
> + clear_bit(IGB_RING_FLAG_AF_XDP_ZC, &tx_ring->flags);
> + clear_bit(IGB_RING_FLAG_AF_XDP_ZC, &rx_ring->flags);
> + xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR);
> + return err;
> + }
> + }
> +
> + return 0;
> +}
> +
> +static int igb_xsk_pool_disable(struct igb_adapter *adapter, u16 qid)
> +{
> + struct igb_ring *tx_ring, *rx_ring;
> + struct xsk_buff_pool *pool;
> + bool if_running;
> + int err;
> +
> + pool = xsk_get_pool_from_qid(adapter->netdev, qid);
> + if (!pool)
> + return -EINVAL;
> +
> + tx_ring = adapter->tx_ring[qid];
> + rx_ring = adapter->rx_ring[qid];
> + if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter);
> + if (if_running)
> + igb_txrx_ring_disable(adapter, qid);
> +
> + xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR);
> + clear_bit(IGB_RING_FLAG_AF_XDP_ZC, &tx_ring->flags);
> + clear_bit(IGB_RING_FLAG_AF_XDP_ZC, &rx_ring->flags);
> +
> + if (if_running) {
> + err = igb_realloc_rx_buffer_info(rx_ring, false);
> + if (err)
> + return err;
> +
> + igb_txrx_ring_enable(adapter, qid);
> + }
> +
> + return 0;
> +}
> +
> +int igb_xsk_pool_setup(struct igb_adapter *adapter,
> + struct xsk_buff_pool *pool,
> + u16 qid)
> +{
> + return pool ? igb_xsk_pool_enable(adapter, pool, qid) :
> + igb_xsk_pool_disable(adapter, qid);
> +}
> +
> +int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> +{
> + struct igb_adapter *adapter = netdev_priv(dev);
> + struct e1000_hw *hw = &adapter->hw;
> + struct igb_ring *ring;
> + u32 eics = 0;
> +
> + if (test_bit(__IGB_DOWN, &adapter->state))
> + return -ENETDOWN;
> +
> + if (!igb_xdp_is_enabled(adapter))
> + return -EINVAL;
> +
> + if (qid >= adapter->num_tx_queues)
> + return -EINVAL;
> +
> + ring = adapter->tx_ring[qid];
> +
> + if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags))
> + return -ENETDOWN;
> +
> + if (!ring->xsk_pool)
Use READ_ONCE() here.
Also, please test this patchset against a scenario where you do Tx ZC from
every available queue and toggle the interface down and up. We had a nasty
case, fixed by [0], where we were producing Tx descriptors to the wire when
the interface was either already going down or not yet brought up.
> + return -EINVAL;
> +
> + if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) {
> + /* Cause software interrupt to ensure Rx ring is cleaned */
> + if (adapter->flags & IGB_FLAG_HAS_MSIX) {
> + eics |= ring->q_vector->eims_value;
> + wr32(E1000_EICS, eics);
> + } else {
> + wr32(E1000_ICS, E1000_ICS_RXDMT0);
> + }
> + }
> +
> + return 0;
> +}
> --
> 2.42.0
>
Powered by blists - more mailing lists