Message-ID: <CAHS8izNeKdsys4VCEW5F1gDoK7dPJZ6fAew3700TwmH3=tT_ag@mail.gmail.com>
Date: Thu, 22 May 2025 16:18:42 -0700
From: Mina Almasry <almasrymina@...gle.com>
To: Tariq Toukan <tariqt@...dia.com>
Cc: "David S. Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, 
	Paolo Abeni <pabeni@...hat.com>, Eric Dumazet <edumazet@...gle.com>, 
	Andrew Lunn <andrew+netdev@...n.ch>, Saeed Mahameed <saeedm@...dia.com>, 
	Leon Romanovsky <leon@...nel.org>, Richard Cochran <richardcochran@...il.com>, 
	Alexei Starovoitov <ast@...nel.org>, Daniel Borkmann <daniel@...earbox.net>, 
	Jesper Dangaard Brouer <hawk@...nel.org>, John Fastabend <john.fastabend@...il.com>, netdev@...r.kernel.org, 
	linux-rdma@...r.kernel.org, linux-kernel@...r.kernel.org, bpf@...r.kernel.org, 
	Moshe Shemesh <moshe@...dia.com>, Mark Bloch <mbloch@...dia.com>, Gal Pressman <gal@...dia.com>, 
	Cosmin Ratiu <cratiu@...dia.com>, Dragos Tatulea <dtatulea@...dia.com>
Subject: Re: [PATCH net-next V2 08/11] net/mlx5e: Convert over to netmem

On Thu, May 22, 2025 at 2:46 PM Tariq Toukan <tariqt@...dia.com> wrote:
>
> From: Saeed Mahameed <saeedm@...dia.com>
>
> mlx5e_frag_page holds the physical page itself. To naturally support
> zc page pools, remove the physical page reference from mlx5 and replace
> it with netmem_ref, avoiding internal handling in mlx5 for net_iov
> backed pages.
>
> SHAMPO can issue packets that are not split into header and data. These
> packets will be dropped if the data part resides in a net_iov as the
> driver can't read into this area.
>
> No performance degradation observed.
>
> Signed-off-by: Saeed Mahameed <saeedm@...dia.com>
> Signed-off-by: Dragos Tatulea <dtatulea@...dia.com>
> Signed-off-by: Cosmin Ratiu <cratiu@...dia.com>
> Signed-off-by: Tariq Toukan <tariqt@...dia.com>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en.h  |   2 +-
>  .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 103 ++++++++++--------
>  2 files changed, 61 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> index c329de1d4f0a..65a73913b9a2 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> @@ -553,7 +553,7 @@ struct mlx5e_icosq {
>  } ____cacheline_aligned_in_smp;
>
>  struct mlx5e_frag_page {

omega nit: maybe this should be renamed now to mlx5e_frag_netmem. Up to you.
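
Purely as an illustration of the rename (same fields as in this patch, just
a different struct name, so nothing I feel strongly about):

struct mlx5e_frag_netmem {
        netmem_ref netmem;
        u16 frags;
};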

> -       struct page *page;
> +       netmem_ref netmem;
>         u16 frags;
>  };
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> index e34ef53ebd0e..75e753adedef 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> @@ -273,33 +273,33 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
>
>  #define MLX5E_PAGECNT_BIAS_MAX (PAGE_SIZE / 64)
>
> -static int mlx5e_page_alloc_fragmented(struct page_pool *pool,
> +static int mlx5e_page_alloc_fragmented(struct page_pool *pp,
>                                        struct mlx5e_frag_page *frag_page)
>  {
> -       struct page *page;
> +       netmem_ref netmem = page_pool_alloc_netmems(pp,
> +                                                   GFP_ATOMIC | __GFP_NOWARN);
>
> -       page = page_pool_dev_alloc_pages(pool);
> -       if (unlikely(!page))
> +       if (unlikely(!netmem))
>                 return -ENOMEM;
>
> -       page_pool_fragment_page(page, MLX5E_PAGECNT_BIAS_MAX);
> +       page_pool_fragment_netmem(netmem, MLX5E_PAGECNT_BIAS_MAX);
>
>         *frag_page = (struct mlx5e_frag_page) {
> -               .page   = page,
> +               .netmem = netmem,
>                 .frags  = 0,
>         };
>
>         return 0;
>  }
>
> -static void mlx5e_page_release_fragmented(struct page_pool *pool,
> +static void mlx5e_page_release_fragmented(struct page_pool *pp,
>                                           struct mlx5e_frag_page *frag_page)
>  {
>         u16 drain_count = MLX5E_PAGECNT_BIAS_MAX - frag_page->frags;
> -       struct page *page = frag_page->page;
> +       netmem_ref netmem = frag_page->netmem;
>
> -       if (page_pool_unref_page(page, drain_count) == 0)
> -               page_pool_put_unrefed_page(pool, page, -1, true);
> +       if (page_pool_unref_netmem(netmem, drain_count) == 0)
> +               page_pool_put_unrefed_netmem(pp, netmem, -1, true);
>  }
>
>  static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
> @@ -359,7 +359,7 @@ static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe,
>                 frag->flags &= ~BIT(MLX5E_WQE_FRAG_SKIP_RELEASE);
>
>                 headroom = i == 0 ? rq->buff.headroom : 0;
> -               addr = page_pool_get_dma_addr(frag->frag_page->page);
> +               addr = page_pool_get_dma_addr_netmem(frag->frag_page->netmem);
>                 wqe->data[i].addr = cpu_to_be64(addr + frag->offset + headroom);
>         }
>
> @@ -500,9 +500,10 @@ mlx5e_add_skb_shared_info_frag(struct mlx5e_rq *rq, struct skb_shared_info *sinf
>                                struct xdp_buff *xdp, struct mlx5e_frag_page *frag_page,
>                                u32 frag_offset, u32 len)
>  {
> +       netmem_ref netmem = frag_page->netmem;
>         skb_frag_t *frag;
>
> -       dma_addr_t addr = page_pool_get_dma_addr(frag_page->page);
> +       dma_addr_t addr = page_pool_get_dma_addr_netmem(netmem);
>
>         dma_sync_single_for_cpu(rq->pdev, addr + frag_offset, len, rq->buff.map_dir);
>         if (!xdp_buff_has_frags(xdp)) {
> @@ -515,9 +516,9 @@ mlx5e_add_skb_shared_info_frag(struct mlx5e_rq *rq, struct skb_shared_info *sinf
>         }
>
>         frag = &sinfo->frags[sinfo->nr_frags++];
> -       skb_frag_fill_page_desc(frag, frag_page->page, frag_offset, len);
> +       skb_frag_fill_netmem_desc(frag, netmem, frag_offset, len);
>
> -       if (page_is_pfmemalloc(frag_page->page))
> +       if (!netmem_is_net_iov(netmem) && netmem_is_pfmemalloc(netmem))

nit: unnecessary net_iov check AFAICT?
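
My understanding (an assumption on my side, worth double-checking against
net/netmem.h) is that netmem_is_pfmemalloc() already handles net_iov
internally, roughly:

static inline bool netmem_is_pfmemalloc(netmem_ref netmem)
{
        if (netmem_is_net_iov(netmem))
                return false;

        return page_is_pfmemalloc(netmem_to_page(netmem));
}

If so, the explicit netmem_is_net_iov() check in the driver is redundant.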

>                 xdp_buff_set_frag_pfmemalloc(xdp);
>         sinfo->xdp_frags_size += len;
>  }
> @@ -528,27 +529,29 @@ mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
>                    u32 frag_offset, u32 len,
>                    unsigned int truesize)
>  {
> -       dma_addr_t addr = page_pool_get_dma_addr(frag_page->page);
> +       dma_addr_t addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>         u8 next_frag = skb_shinfo(skb)->nr_frags;
> +       netmem_ref netmem = frag_page->netmem;
>
>         dma_sync_single_for_cpu(rq->pdev, addr + frag_offset, len,
>                                 rq->buff.map_dir);
>
> -       if (skb_can_coalesce(skb, next_frag, frag_page->page, frag_offset)) {
> +       if (skb_can_coalesce_netmem(skb, next_frag, netmem, frag_offset)) {
>                 skb_coalesce_rx_frag(skb, next_frag - 1, len, truesize);
> -       } else {
> -               frag_page->frags++;
> -               skb_add_rx_frag(skb, next_frag, frag_page->page,
> -                               frag_offset, len, truesize);
> +               return;
>         }
> +
> +       frag_page->frags++;
> +       skb_add_rx_frag_netmem(skb, next_frag, netmem,
> +                              frag_offset, len, truesize);
>  }
>
>  static inline void
>  mlx5e_copy_skb_header(struct mlx5e_rq *rq, struct sk_buff *skb,
> -                     struct page *page, dma_addr_t addr,
> +                     netmem_ref netmem, dma_addr_t addr,
>                       int offset_from, int dma_offset, u32 headlen)
>  {
> -       const void *from = page_address(page) + offset_from;
> +       const void *from = netmem_address(netmem) + offset_from;

I think this needs a check that netmem_address() is not NULL, plus safe
error handling for when it is. If the netmem is unreadable, netmem_address()
returns NULL, and because offset_from is added to it right away, the
resulting from pointer can't be NULL-checked afterwards either.
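
Very rough sketch of what I mean, assuming the error handling can be plumbed
up to the callers (mlx5e_copy_skb_header() currently returns void, so this is
not a drop-in change):

        void *base = netmem_address(netmem);
        const void *from;

        if (unlikely(!base))
                return; /* unreadable net_iov: no CPU mapping to copy from */

        from = base + offset_from;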

>         /* Aligning len to sizeof(long) optimizes memcpy performance */
>         unsigned int len = ALIGN(headlen, sizeof(long));
>
> @@ -685,7 +688,7 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq,
>                 if (unlikely(err))
>                         goto err_unmap;
>
> -               addr = page_pool_get_dma_addr(frag_page->page);
> +               addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>
>                 for (int j = 0; j < MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; j++) {
>                         header_offset = mlx5e_shampo_hd_offset(index++);
> @@ -796,7 +799,8 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
>                 err = mlx5e_page_alloc_fragmented(rq->page_pool, frag_page);
>                 if (unlikely(err))
>                         goto err_unmap;
> -               addr = page_pool_get_dma_addr(frag_page->page);
> +
> +               addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>                 umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
>                         .ptag = cpu_to_be64(addr | MLX5_EN_WR),
>                 };
> @@ -1216,7 +1220,7 @@ static void *mlx5e_shampo_get_packet_hd(struct mlx5e_rq *rq, u16 header_index)
>         struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index);
>         u16 head_offset = mlx5e_shampo_hd_offset(header_index) + rq->buff.headroom;
>
> -       return page_address(frag_page->page) + head_offset;
> +       return netmem_address(frag_page->netmem) + head_offset;

Similar concern here: netmem_address() may return NULL, especially once
unreadable netmem support is enabled in the later patches. There are a
couple of similar call sites below as well.
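
For this helper, a sketch of the kind of thing I have in mind (it assumes the
callers of mlx5e_shampo_get_packet_hd() are taught to handle a NULL return,
which they currently are not):

static void *mlx5e_shampo_get_packet_hd(struct mlx5e_rq *rq, u16 header_index)
{
        struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index);
        u16 head_offset = mlx5e_shampo_hd_offset(header_index) + rq->buff.headroom;
        void *addr = netmem_address(frag_page->netmem);

        return addr ? addr + head_offset : NULL;
}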

>  }
>
>  static void mlx5e_shampo_update_ipv4_udp_hdr(struct mlx5e_rq *rq, struct iphdr *ipv4)
> @@ -1677,11 +1681,11 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
>         dma_addr_t addr;
>         u32 frag_size;
>
> -       va             = page_address(frag_page->page) + wi->offset;
> +       va             = netmem_address(frag_page->netmem) + wi->offset;
>         data           = va + rx_headroom;
>         frag_size      = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
>
> -       addr = page_pool_get_dma_addr(frag_page->page);
> +       addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>         dma_sync_single_range_for_cpu(rq->pdev, addr, wi->offset,
>                                       frag_size, rq->buff.map_dir);
>         net_prefetch(data);
> @@ -1731,10 +1735,10 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
>
>         frag_page = wi->frag_page;
>
> -       va = page_address(frag_page->page) + wi->offset;
> +       va = netmem_address(frag_page->netmem) + wi->offset;
>         frag_consumed_bytes = min_t(u32, frag_info->frag_size, cqe_bcnt);
>
> -       addr = page_pool_get_dma_addr(frag_page->page);
> +       addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>         dma_sync_single_range_for_cpu(rq->pdev, addr, wi->offset,
>                                       rq->buff.frame0_sz, rq->buff.map_dir);
>         net_prefetchw(va); /* xdp_frame data area */
> @@ -2007,13 +2011,14 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
>
>         if (prog) {
>                 /* area for bpf_xdp_[store|load]_bytes */
> -               net_prefetchw(page_address(frag_page->page) + frag_offset);
> +               net_prefetchw(netmem_address(frag_page->netmem) + frag_offset);
>                 if (unlikely(mlx5e_page_alloc_fragmented(rq->page_pool,
>                                                          &wi->linear_page))) {
>                         rq->stats->buff_alloc_err++;
>                         return NULL;
>                 }
> -               va = page_address(wi->linear_page.page);
> +
> +               va = netmem_address(wi->linear_page.netmem);
>                 net_prefetchw(va); /* xdp_frame data area */
>                 linear_hr = XDP_PACKET_HEADROOM;
>                 linear_data_len = 0;
> @@ -2124,8 +2129,8 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
>                         while (++pagep < frag_page);
>                 }
>                 /* copy header */
> -               addr = page_pool_get_dma_addr(head_page->page);
> -               mlx5e_copy_skb_header(rq, skb, head_page->page, addr,
> +               addr = page_pool_get_dma_addr_netmem(head_page->netmem);
> +               mlx5e_copy_skb_header(rq, skb, head_page->netmem, addr,
>                                       head_offset, head_offset, headlen);
>                 /* skb linear part was allocated with headlen and aligned to long */
>                 skb->tail += headlen;
> @@ -2155,11 +2160,11 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
>                 return NULL;
>         }
>
> -       va             = page_address(frag_page->page) + head_offset;
> +       va             = netmem_address(frag_page->netmem) + head_offset;
>         data           = va + rx_headroom;
>         frag_size      = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
>
> -       addr = page_pool_get_dma_addr(frag_page->page);
> +       addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>         dma_sync_single_range_for_cpu(rq->pdev, addr, head_offset,
>                                       frag_size, rq->buff.map_dir);
>         net_prefetch(data);
> @@ -2198,16 +2203,19 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
>                           struct mlx5_cqe64 *cqe, u16 header_index)
>  {
>         struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index);
> -       dma_addr_t page_dma_addr = page_pool_get_dma_addr(frag_page->page);
>         u16 head_offset = mlx5e_shampo_hd_offset(header_index);
> -       dma_addr_t dma_addr = page_dma_addr + head_offset;
>         u16 head_size = cqe->shampo.header_size;
>         u16 rx_headroom = rq->buff.headroom;
>         struct sk_buff *skb = NULL;
> +       dma_addr_t page_dma_addr;
> +       dma_addr_t dma_addr;
>         void *hdr, *data;
>         u32 frag_size;
>
> -       hdr             = page_address(frag_page->page) + head_offset;
> +       page_dma_addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
> +       dma_addr = page_dma_addr + head_offset;
> +
> +       hdr             = netmem_address(frag_page->netmem) + head_offset;
>         data            = hdr + rx_headroom;
>         frag_size       = MLX5_SKB_FRAG_SZ(rx_headroom + head_size);
>
> @@ -2232,7 +2240,7 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
>                 }
>
>                 net_prefetchw(skb->data);
> -               mlx5e_copy_skb_header(rq, skb, frag_page->page, dma_addr,
> +               mlx5e_copy_skb_header(rq, skb, frag_page->netmem, dma_addr,
>                                       head_offset + rx_headroom,
>                                       rx_headroom, head_size);
>                 /* skb linear part was allocated with headlen and aligned to long */
> @@ -2326,11 +2334,20 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq
>         }
>
>         if (!*skb) {
> -               if (likely(head_size))
> +               if (likely(head_size)) {
>                         *skb = mlx5e_skb_from_cqe_shampo(rq, wi, cqe, header_index);
> -               else
> -                       *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe, cqe_bcnt,
> -                                                                 data_offset, page_idx);
> +               } else {
> +                       struct mlx5e_frag_page *frag_page;
> +
> +                       frag_page = &wi->alloc_units.frag_pages[page_idx];
> +                       if (unlikely(netmem_is_net_iov(frag_page->netmem)))
> +                               goto free_hd_entry;
> +                       *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe,
> +                                                                 cqe_bcnt,
> +                                                                 data_offset,
> +                                                                 page_idx);
> +               }
> +
>                 if (unlikely(!*skb))
>                         goto free_hd_entry;
>
> --
> 2.31.1
>
>


-- 
Thanks,
Mina
