Message-ID: <6b8bc66f-8a02-b6b4-92cc-f8aaf067abd8@huawei.com>
Date: Thu, 6 Jul 2023 20:47:22 +0800
From: Yunsheng Lin <linyunsheng@...wei.com>
To: Alexander Lobakin <aleksander.lobakin@...el.com>,
"David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>
CC: Maciej Fijalkowski <maciej.fijalkowski@...el.com>,
Michal Kubiak <michal.kubiak@...el.com>,
Larysa Zaremba <larysa.zaremba@...el.com>,
Alexander Duyck <alexanderduyck@...com>,
David Christensen <drc@...ux.vnet.ibm.com>,
Jesper Dangaard Brouer <hawk@...nel.org>,
Ilias Apalodimas <ilias.apalodimas@...aro.org>,
Paul Menzel <pmenzel@...gen.mpg.de>, <netdev@...r.kernel.org>,
<intel-wired-lan@...ts.osuosl.org>, <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH RFC net-next v4 6/9] iavf: switch to Page Pool
On 2023/7/5 23:55, Alexander Lobakin wrote:
> Now that the IAVF driver simply uses dev_alloc_page() + free_page() with
> no custom recycling logic, it can easily be switched to using the Page
> Pool / libie API instead.
> This allows removing all the dancing around headroom, HW buffer
> size, and page order. All DMA-for-device is now done in the PP core,
> for-CPU -- in the libie helper.
> Use skb_mark_for_recycle() to bring back the recycling and restore the
> performance. Speaking of performance: on par with the baseline and
> faster with the PP optimization series applied. But the memory usage for
> 1500B MTU is now almost 2x lower (x86_64) thanks to allocating a page
> every second descriptor.
>
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@...el.com>
> ---
...
> @@ -2562,11 +2541,7 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter)
>
> netdev->netdev_ops = &iavf_netdev_ops;
> iavf_set_ethtool_ops(netdev);
> - netdev->watchdog_timeo = 5 * HZ;
This seems like an unrelated change here?
> -
> - /* MTU range: 68 - 9710 */
> - netdev->min_mtu = ETH_MIN_MTU;
> - netdev->max_mtu = IAVF_MAX_RXBUFFER - IAVF_PACKET_HDR_PAD;
> + netdev->max_mtu = LIBIE_MAX_MTU;
>
...
> /**
> @@ -766,13 +742,19 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring)
> **/
> int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring)
> {
> - struct device *dev = rx_ring->dev;
> - int bi_size;
> + struct page_pool *pool;
> +
> + pool = libie_rx_page_pool_create(&rx_ring->q_vector->napi,
> + rx_ring->count);
If a page can be split between more than one desc, perhaps the
ptr_ring size does not need to be as big as rx_ring->count.
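Something like the below (completely untested, and the "two buffers
per page" factor is only an assumption for 4K pages split into two
~2K buffers) is what I have in mind:

	/* With the page split in two, the pool and its internal
	 * ptr_ring only ever hold half as many pages as there are
	 * descriptors, so halve the requested size.
	 */
	pool = libie_rx_page_pool_create(&rx_ring->q_vector->napi,
					 rx_ring->count / 2);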
> + if (IS_ERR(pool))
> + return PTR_ERR(pool);
> +
> + rx_ring->pp = pool;
>
> /* warn if we are about to overwrite the pointer */
> WARN_ON(rx_ring->rx_bi);
> - bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count;
> - rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
> + rx_ring->rx_bi = kcalloc(rx_ring->count, sizeof(*rx_ring->rx_bi),
> + GFP_KERNEL);
> if (!rx_ring->rx_bi)
> goto err;
>
...
>
> /**
> * iavf_build_skb - Build skb around an existing buffer
> - * @rx_ring: Rx descriptor ring to transact packets on
> * @rx_buffer: Rx buffer to pull data from
> * @size: size of buffer to add to skb
> *
> * This function builds an skb around an existing Rx buffer, taking care
> * to set up the skb correctly and avoid any memcpy overhead.
> */
> -static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring,
> - struct iavf_rx_buffer *rx_buffer,
> +static struct sk_buff *iavf_build_skb(const struct libie_rx_buffer *rx_buffer,
> unsigned int size)
> {
> - void *va;
> -#if (PAGE_SIZE < 8192)
> - unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2;
> -#else
> - unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
> - SKB_DATA_ALIGN(IAVF_SKB_PAD + size);
> -#endif
> + struct page *page = rx_buffer->page;
> + u32 hr = page->pp->p.offset;
> struct sk_buff *skb;
> + void *va;
>
> - if (!rx_buffer || !size)
> - return NULL;
> /* prefetch first cache line of first page */
> - va = page_address(rx_buffer->page) + rx_buffer->page_offset;
> - net_prefetch(va);
> + va = page_address(page) + rx_buffer->offset;
> + net_prefetch(va + hr);
>
> /* build an skb around the page buffer */
> - skb = napi_build_skb(va - IAVF_SKB_PAD, truesize);
> - if (unlikely(!skb))
> + skb = napi_build_skb(va, rx_buffer->truesize);
> + if (unlikely(!skb)) {
> + page_pool_put_page(page->pp, page, size, true);
Isn't it more correct to call page_pool_put_full_page() here?
As we do not know which frag of the page was used for the rx_buffer,
we depend on the last released frag to do the DMA syncing; maybe I
should mention that in Documentation/networking/page_pool.rst.
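Something like the below (untested sketch, comment wording is mine):

	skb = napi_build_skb(va, rx_buffer->truesize);
	if (unlikely(!skb)) {
		/* We can't tell which frag of the page this buffer
		 * covers, so sync & recycle the whole page instead of
		 * only @size bytes at the default offset.
		 */
		page_pool_put_full_page(page->pp, page, true);
		return NULL;
	}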
> return NULL;
> + }
...
> struct iavf_queue_stats {
> u64 packets;
> u64 bytes;
> @@ -311,16 +243,19 @@ enum iavf_ring_state_t {
> struct iavf_ring {
> struct iavf_ring *next; /* pointer to next ring in q_vector */
> void *desc; /* Descriptor ring memory */
> - struct device *dev; /* Used for DMA mapping */
> + union {
> + struct page_pool *pp; /* Used on Rx for buffer management */
> + struct device *dev; /* Used on Tx for DMA mapping */
> + };
> struct net_device *netdev; /* netdev ring maps to */
> union {
> + struct libie_rx_buffer *rx_bi;
> struct iavf_tx_buffer *tx_bi;
> - struct iavf_rx_buffer *rx_bi;
> };
> DECLARE_BITMAP(state, __IAVF_RING_STATE_NBITS);
> + u8 __iomem *tail;
Is there a reason to move it here?
> u16 queue_index; /* Queue number of ring */
> u8 dcb_tc; /* Traffic class of ring */
> - u8 __iomem *tail;
>