lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Y+PPzz4AHRxZgs9r@boxer>
Date:   Wed, 8 Feb 2023 17:37:35 +0100
From:   Maciej Fijalkowski <maciej.fijalkowski@...el.com>
To:     Vladimir Oltean <vladimir.oltean@....com>
CC:     <netdev@...r.kernel.org>, "David S. Miller" <davem@...emloft.net>,
        "Eric Dumazet" <edumazet@...gle.com>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>,
        Claudiu Manoil <claudiu.manoil@....com>,
        Björn Töpel <bjorn@...nel.org>,
        Magnus Karlsson <magnus.karlsson@...el.com>,
        Alexei Starovoitov <ast@...nel.org>,
        "Daniel Borkmann" <daniel@...earbox.net>,
        Jesper Dangaard Brouer <hawk@...nel.org>,
        John Fastabend <john.fastabend@...il.com>,
        <bpf@...r.kernel.org>, <linux-kernel@...r.kernel.org>
Subject: Re: [RFC PATCH net-next 11/11] net: enetc: add TX support for
 zero-copy XDP sockets

On Mon, Feb 06, 2023 at 12:08:37PM +0200, Vladimir Oltean wrote:

Hey Vladimir,

> Schedule NAPI by hand from enetc_xsk_wakeup(), and send frames from the
> XSK TX queue from NAPI context. Add them to the completion queue from
> the enetc_clean_tx_ring() procedure which is common for all kinds of
> traffic.
> 
> We reuse one of the TX rings for XDP (XDP_TX/XDP_REDIRECT) for XSK as
> well. They are already cropped from the TX rings that the network stack
> can use when XDP is enabled (with or without AF_XDP).
> 
> As for XDP_REDIRECT calling enetc's ndo_xdp_xmit, I'm not sure if that
> can run simultaneously with enetc_poll() (on different CPUs, but towards
> the same TXQ). I guess it probably can, but idk what to do about it.
> The problem is that enetc_xdp_xmit() sends to
> priv->xdp_tx_ring[smp_processor_id()], while enetc_xsk_xmit() and XDP_TX
> send to priv->xdp_tx_ring[NAPI instance]. So when the NAPI instance runs

Why not use cpu id on the latter then?

> on a different CPU that the one it is numerically equal to, we should
> have a lock that serializes XDP_REDIRECT with the others.
> 
> Signed-off-by: Vladimir Oltean <vladimir.oltean@....com>
> ---
>  drivers/net/ethernet/freescale/enetc/enetc.c | 102 ++++++++++++++++++-
>  drivers/net/ethernet/freescale/enetc/enetc.h |   2 +
>  2 files changed, 99 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
> index 3990c006c011..bc0db788afc7 100644
> --- a/drivers/net/ethernet/freescale/enetc/enetc.c
> +++ b/drivers/net/ethernet/freescale/enetc/enetc.c
> @@ -84,7 +84,7 @@ static void enetc_free_tx_swbd(struct enetc_bdr *tx_ring,
>  	struct xdp_frame *xdp_frame = enetc_tx_swbd_get_xdp_frame(tx_swbd);
>  	struct sk_buff *skb = enetc_tx_swbd_get_skb(tx_swbd);
>  
> -	if (tx_swbd->dma)
> +	if (!tx_swbd->is_xsk && tx_swbd->dma)
>  		enetc_unmap_tx_buff(tx_ring, tx_swbd);
>  
>  	if (xdp_frame) {
> @@ -817,7 +817,8 @@ static void enetc_recycle_xdp_tx_buff(struct enetc_bdr *tx_ring,
>  	rx_ring->xdp.xdp_tx_in_flight--;
>  }
>  
> -static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget)
> +static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget,
> +				int *xsk_confirmed)
>  {
>  	int tx_frm_cnt = 0, tx_byte_cnt = 0, tx_win_drop = 0;
>  	struct net_device *ndev = tx_ring->ndev;
> @@ -854,7 +855,9 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget)
>  				tx_win_drop++;
>  		}
>  
> -		if (tx_swbd->is_xdp_tx)
> +		if (tx_swbd->is_xsk)
> +			(*xsk_confirmed)++;
> +		else if (tx_swbd->is_xdp_tx)
>  			enetc_recycle_xdp_tx_buff(tx_ring, tx_swbd);
>  		else if (likely(tx_swbd->dma))
>  			enetc_unmap_tx_buff(tx_ring, tx_swbd);
> @@ -1465,6 +1468,58 @@ int enetc_xdp_xmit(struct net_device *ndev, int num_frames,
>  }
>  EXPORT_SYMBOL_GPL(enetc_xdp_xmit);
>  
> +static void enetc_xsk_map_tx_desc(struct enetc_tx_swbd *tx_swbd,
> +				  const struct xdp_desc *xsk_desc,
> +				  struct xsk_buff_pool *pool)
> +{
> +	dma_addr_t dma;
> +
> +	dma = xsk_buff_raw_get_dma(pool, xsk_desc->addr);
> +	xsk_buff_raw_dma_sync_for_device(pool, dma, xsk_desc->len);
> +
> +	tx_swbd->dma = dma;
> +	tx_swbd->len = xsk_desc->len;
> +	tx_swbd->is_xsk = true;
> +	tx_swbd->is_eof = true;
> +}
> +
> +static bool enetc_xsk_xmit(struct net_device *ndev, struct xsk_buff_pool *pool,
> +			   u32 queue_id)
> +{
> +	struct enetc_ndev_priv *priv = netdev_priv(ndev);
> +	struct xdp_desc *xsk_descs = pool->tx_descs;
> +	struct enetc_tx_swbd tx_swbd = {0};
> +	struct enetc_bdr *tx_ring;
> +	u32 budget, batch;
> +	int i, k;
> +
> +	tx_ring = priv->xdp_tx_ring[queue_id];
> +
> +	/* Shouldn't race with anyone because we are running in the softirq
> +	 * of the only CPU that sends packets to this TX ring
> +	 */
> +	budget = min(enetc_bd_unused(tx_ring) - 1, ENETC_XSK_TX_BATCH);
> +
> +	batch = xsk_tx_peek_release_desc_batch(pool, budget);
> +	if (!batch)
> +		return true;
> +
> +	i = tx_ring->next_to_use;
> +
> +	for (k = 0; k < batch; k++) {
> +		enetc_xsk_map_tx_desc(&tx_swbd, &xsk_descs[k], pool);
> +		enetc_xdp_map_tx_buff(tx_ring, i, &tx_swbd, tx_swbd.len);
> +		enetc_bdr_idx_inc(tx_ring, &i);
> +	}
> +
> +	tx_ring->next_to_use = i;
> +
> +	xsk_tx_release(pool);

xsk_tx_release() is not needed if you're using
xsk_tx_peek_release_desc_batch() above, it will do this for you at the end
of its job.

> +	enetc_update_tx_ring_tail(tx_ring);
> +
> +	return budget != batch;
> +}
> +
>  static void enetc_map_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i,
>  				     struct xdp_buff *xdp_buff, u16 size)
>  {
> @@ -1881,6 +1936,7 @@ static int enetc_poll(struct napi_struct *napi, int budget)
>  	struct enetc_bdr *rx_ring = &v->rx_ring;
>  	struct xsk_buff_pool *pool;
>  	struct bpf_prog *prog;
> +	int xsk_confirmed = 0;
>  	bool complete = true;
>  	int work_done;
>  	int i;
> @@ -1888,7 +1944,8 @@ static int enetc_poll(struct napi_struct *napi, int budget)
>  	enetc_lock_mdio();
>  
>  	for (i = 0; i < v->count_tx_rings; i++)
> -		if (!enetc_clean_tx_ring(&v->tx_ring[i], budget))
> +		if (!enetc_clean_tx_ring(&v->tx_ring[i], budget,
> +					 &xsk_confirmed))
>  			complete = false;
>  
>  	prog = rx_ring->xdp.prog;
> @@ -1901,6 +1958,17 @@ static int enetc_poll(struct napi_struct *napi, int budget)
>  	else
>  		work_done = enetc_clean_rx_ring(rx_ring, napi, budget);
>  
> +	if (pool) {
> +		if (xsk_confirmed)
> +			xsk_tx_completed(pool, xsk_confirmed);
> +
> +		if (xsk_uses_need_wakeup(pool))
> +			xsk_set_tx_need_wakeup(pool);
> +
> +		if (!enetc_xsk_xmit(rx_ring->ndev, pool, rx_ring->index))
> +			complete = false;
> +	}
> +
>  	if (work_done == budget)
>  		complete = false;
>  	if (work_done)
> @@ -3122,7 +3190,31 @@ static int enetc_setup_xsk_pool(struct net_device *ndev,
>  
>  int enetc_xsk_wakeup(struct net_device *ndev, u32 queue_id, u32 flags)
>  {
> -	/* xp_assign_dev() wants this; nothing needed for RX */
> +	struct enetc_ndev_priv *priv = netdev_priv(ndev);
> +	struct enetc_int_vector *v;
> +	struct enetc_bdr *rx_ring;
> +
> +	if (!netif_running(ndev) || !netif_carrier_ok(ndev))
> +		return -ENETDOWN;
> +
> +	if (queue_id >= priv->bdr_int_num)
> +		return -ERANGE;
> +
> +	v = priv->int_vector[queue_id];
> +	rx_ring = &v->rx_ring;
> +
> +	if (!rx_ring->xdp.xsk_pool || !rx_ring->xdp.prog)
> +		return -EINVAL;
> +
> +	/* No way to kick TX by triggering a hardirq right away =>
> +	 * raise softirq. This might schedule NAPI on a CPU different than the
> +	 * smp_affinity of its IRQ would suggest, but that would happen anyway
> +	 * if, say, we change that affinity under heavy traffic.
> +	 * So enetc_poll() has to be prepared for it anyway.
> +	 */
> +	if (!napi_if_scheduled_mark_missed(&v->napi))
> +		napi_schedule(&v->napi);
> +
>  	return 0;
>  }
>  
> diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h
> index e1a746e37c9a..403f40473b52 100644
> --- a/drivers/net/ethernet/freescale/enetc/enetc.h
> +++ b/drivers/net/ethernet/freescale/enetc/enetc.h
> @@ -36,6 +36,7 @@ struct enetc_tx_swbd {
>  	u8 is_eof:1;
>  	u8 is_xdp_tx:1;
>  	u8 is_xdp_redirect:1;
> +	u8 is_xsk:1;
>  	u8 qbv_en:1;
>  };
>  
> @@ -86,6 +87,7 @@ struct enetc_xdp_data {
>  #define ENETC_RX_RING_DEFAULT_SIZE	2048
>  #define ENETC_TX_RING_DEFAULT_SIZE	2048
>  #define ENETC_DEFAULT_TX_WORK		(ENETC_TX_RING_DEFAULT_SIZE / 2)
> +#define ENETC_XSK_TX_BATCH		ENETC_DEFAULT_TX_WORK
>  
>  struct enetc_bdr_resource {
>  	/* Input arguments saved for teardown */
> -- 
> 2.34.1
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ