Message-ID: <1d59e91c-1c92-ccfd-652e-860c8bcd794e@mellanox.com>
Date: Wed, 7 Mar 2018 14:11:36 +0200
From: Tariq Toukan <tariqt@...lanox.com>
To: Jesper Dangaard Brouer <brouer@...hat.com>, netdev@...r.kernel.org,
	Björn Töpel <bjorn.topel@...el.com>,
magnus.karlsson@...el.com
Cc: eugenia@...lanox.com, Jason Wang <jasowang@...hat.com>,
John Fastabend <john.fastabend@...il.com>,
Eran Ben Elisha <eranbe@...lanox.com>,
Saeed Mahameed <saeedm@...lanox.com>, galp@...lanox.com,
Daniel Borkmann <borkmann@...earbox.net>,
Alexei Starovoitov <alexei.starovoitov@...il.com>,
Tariq Toukan <tariqt@...lanox.com>
Subject: Re: [bpf-next V1 PATCH 13/15] mlx5: use page_pool for
xdp_return_frame call
On 06/03/2018 11:48 PM, Jesper Dangaard Brouer wrote:
> This patch shows how it is possible to have both the driver local page
> cache, which uses an elevated refcnt for "catching"/avoiding SKB
> put_page, and at the same time have pages returned to the page_pool
> via the ndo_xdp_xmit DMA completion.
>
> Performance is surprisingly good. Tested DMA-TX completion on ixgbe,
> which calls "xdp_return_frame", which in turn calls page_pool_put_page().
> Stats show DMA-TX-completion runs on CPU#9 and mlx5 RX runs on CPU#5.
> (Internally page_pool uses ptr_ring, which is what gives the good
> cross-CPU performance.)
>
> Show adapter(s) (ixgbe2 mlx5p2) statistics (ONLY that changed!)
> Ethtool(ixgbe2 ) stat: 732863573 ( 732,863,573) <= tx_bytes /sec
> Ethtool(ixgbe2 ) stat: 781724427 ( 781,724,427) <= tx_bytes_nic /sec
> Ethtool(ixgbe2 ) stat: 12214393 ( 12,214,393) <= tx_packets /sec
> Ethtool(ixgbe2 ) stat: 12214435 ( 12,214,435) <= tx_pkts_nic /sec
> Ethtool(mlx5p2 ) stat: 12211786 ( 12,211,786) <= rx3_cache_empty /sec
> Ethtool(mlx5p2 ) stat: 36506736 ( 36,506,736) <= rx_64_bytes_phy /sec
> Ethtool(mlx5p2 ) stat: 2336430575 ( 2,336,430,575) <= rx_bytes_phy /sec
> Ethtool(mlx5p2 ) stat: 12211786 ( 12,211,786) <= rx_cache_empty /sec
> Ethtool(mlx5p2 ) stat: 22823073 ( 22,823,073) <= rx_discards_phy /sec
> Ethtool(mlx5p2 ) stat: 1471860 ( 1,471,860) <= rx_out_of_buffer /sec
> Ethtool(mlx5p2 ) stat: 36506715 ( 36,506,715) <= rx_packets_phy /sec
> Ethtool(mlx5p2 ) stat: 2336542282 ( 2,336,542,282) <= rx_prio0_bytes /sec
> Ethtool(mlx5p2 ) stat: 13683921 ( 13,683,921) <= rx_prio0_packets /sec
> Ethtool(mlx5p2 ) stat: 821015537 ( 821,015,537) <= rx_vport_unicast_bytes /sec
> Ethtool(mlx5p2 ) stat: 13683608 ( 13,683,608) <= rx_vport_unicast_packets /sec
>
> Before this patch: single flow performance was 6Mpps, and if I started
> two flows the collective performance dropped to 4Mpps, because we hit the
> page allocator lock (further negative scaling occurs as more flows are added).
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@...hat.com>
> ---
> drivers/net/ethernet/mellanox/mlx5/core/en.h | 3 ++
> drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 39 ++++++++++++++++++---
> drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 10 ++++-
> 3 files changed, 45 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> index 28cc26debeda..ab91166f7c5a 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> @@ -53,6 +53,8 @@
> #include "mlx5_core.h"
> #include "en_stats.h"
>
> +struct page_pool;
> +
You can place the include here instead:
#include <net/page_pool.h>
and remove it from the two .c files.
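I.e., a sketch of what this en.h hunk would become:

    -struct page_pool;
    +#include <net/page_pool.h>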
> #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
>
> #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
> @@ -535,6 +537,7 @@ struct mlx5e_rq {
> /* XDP */
> struct bpf_prog *xdp_prog;
> struct mlx5e_xdpsq xdpsq;
> + struct page_pool *page_pool;
>
> /* control */
> struct mlx5_wq_ctrl wq_ctrl;
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 49732c8c27c1..fbe27110ff02 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -44,6 +44,8 @@
> #include "accel/ipsec.h"
> #include "vxlan.h"
>
> +#include <net/page_pool.h>
> +
> struct mlx5e_rq_param {
> u32 rqc[MLX5_ST_SZ_DW(rqc)];
> struct mlx5_wq_param wq;
> @@ -396,6 +398,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
> int err;
> int i;
>
> + struct page_pool_params pp_params = { 0 };
> +
> rqp->wq.db_numa_node = cpu_to_node(c->cpu);
>
> err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
> @@ -506,12 +510,33 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
> rq->mkey_be = c->mkey_be;
> }
>
> - /* This must only be activate for order-0 pages */
> - if (rq->xdp_prog)
> - err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
> - MEM_TYPE_PAGE_ORDER0, NULL);
> - if (err)
> + /* Create a page_pool and register it with rxq */
> + pp_params.size = PAGE_POOL_PARAMS_SIZE;
> + pp_params.order = rq->buff.page_order;
> + pp_params.dev = c->pdev;
> + pp_params.nid = cpu_to_node(c->cpu);
> + pp_params.dma_dir = rq->buff.map_dir;
> + pp_params.pool_size = 1 << params->log_rq_size;
If rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ, this needs to be
multiplied by MLX5_MPWRQ_PAGES_PER_WQE, since in striding-RQ mode each
WQE consumes multiple pages. See the sketch below.
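E.g., an untested sketch, reusing the fields already present in this patch:

    u32 pool_size = 1 << params->log_rq_size;

    if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
        pool_size *= MLX5_MPWRQ_PAGES_PER_WQE;
    ...
    pp_params.pool_size = pool_size;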
> + pp_params.flags = 0; /* No-internal DMA mapping in page_pool */
> +
> + /* page_pool can be used even when there is no rq->xdp_prog,
> + * given page_pool does not handle DMA mapping there is no
> + * required state to clear. And page_pool gracefully handle
> + * elevated refcnt.
> + */
> + rq->page_pool = page_pool_create(&pp_params);
> + if (IS_ERR_OR_NULL(rq->page_pool)) {
> + kfree(rq->wqe.frag_info);
> + rq->page_pool = NULL;
> + err = -ENOMEM;
Better: propagate PTR_ERR(rq->page_pool) instead of hard-coding -ENOMEM.
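E.g., a sketch, assuming page_pool_create() returns only ERR_PTR values
(never NULL) on failure:

    rq->page_pool = page_pool_create(&pp_params);
    if (IS_ERR(rq->page_pool)) {
        err = PTR_ERR(rq->page_pool);
        rq->page_pool = NULL;
        kfree(rq->wqe.frag_info);
        goto err_rq_wq_destroy;
    }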
> goto err_rq_wq_destroy;
> + }
> + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
> + MEM_TYPE_PAGE_POOL, rq->page_pool);
> + if (err) {
> + err = -ENOMEM;
Why change the err value? xdp_rxq_info_reg_mem_model() already returns a
meaningful error code.
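I.e., the error can simply be propagated (sketch):

    err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
                                     MEM_TYPE_PAGE_POOL, rq->page_pool);
    if (err)
        goto err_rq_wq_destroy;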
> + goto err_rq_wq_destroy;
> + }
>
> for (i = 0; i < wq_sz; i++) {
> struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
> @@ -549,6 +574,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
> if (rq->xdp_prog)
> bpf_prog_put(rq->xdp_prog);
> xdp_rxq_info_unreg(&rq->xdp_rxq);
> + if (rq->page_pool)
> + page_pool_destroy_rcu(rq->page_pool);
> mlx5_wq_destroy(&rq->wq_ctrl);
>
> return err;
> @@ -562,6 +589,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
> bpf_prog_put(rq->xdp_prog);
>
> xdp_rxq_info_unreg(&rq->xdp_rxq);
> + if (rq->page_pool)
> + page_pool_destroy_rcu(rq->page_pool);
>
> switch (rq->wq_type) {
> case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> index 6dcc3e8fbd3e..4898239467d9 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> @@ -45,6 +45,8 @@
> #include "en_accel/ipsec_rxtx.h"
> #include "lib/clock.h"
>
> +#include <net/page_pool.h>
> +
> static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config)
> {
> return config->rx_filter == HWTSTAMP_FILTER_ALL;
> @@ -221,7 +223,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
> if (mlx5e_rx_cache_get(rq, dma_info))
> return 0;
>
> - dma_info->page = dev_alloc_pages(rq->buff.page_order);
> + dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
> if (unlikely(!dma_info->page))
> return -ENOMEM;
>
> @@ -250,7 +252,11 @@ void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
> return;
>
> mlx5e_page_dma_unmap(rq, dma_info);
> - put_page(dma_info->page);
> +
> + if (likely(recycle))
> + page_pool_recycle_direct(rq->page_pool, dma_info->page);
> + else
> + put_page(dma_info->page);
> }
>
We can save a branch on the recycle fast path, this way:
- if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
+ if (likely(recycle)) {
+ if (mlx5e_rx_cache_put(rq, dma_info))
+ return;
+
+ mlx5e_page_dma_unmap(rq, dma_info);
+ page_pool_recycle_direct(rq->page_pool, dma_info->page);
return;
+ }
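
For clarity, an untested sketch of the full resulting function (the
put_page() path below the hunk shown above becomes unconditional):

void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
                        bool recycle)
{
    if (likely(recycle)) {
        /* Driver-local cache hit: page stays DMA-mapped for reuse */
        if (mlx5e_rx_cache_put(rq, dma_info))
            return;

        mlx5e_page_dma_unmap(rq, dma_info);
        page_pool_recycle_direct(rq->page_pool, dma_info->page);
        return;
    }

    mlx5e_page_dma_unmap(rq, dma_info);
    put_page(dma_info->page);
}

This way the recycle fast path tests 'recycle' only once.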
> static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
>