lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f25767d3-4191-1fe9-d9cd-97f4bbee458c@gmail.com>
Date:   Thu, 24 Nov 2016 14:36:47 +0200
From:   Tariq Toukan <ttoukan.linux@...il.com>
To:     Eric Dumazet <eric.dumazet@...il.com>,
        David Miller <davem@...emloft.net>
Cc:     netdev <netdev@...r.kernel.org>, Tariq Toukan <tariqt@...lanox.com>
Subject: Re: [PATCH net-next] mlx4: reorganize struct mlx4_en_tx_ring

Hi Eric,
Thanks for your patch.

On 23/11/2016 1:56 AM, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@...gle.com>
>
> Goal is to reorganize this critical structure to increase performance.
>
> ndo_start_xmit() should only dirty one cache line, and access as few
> cache lines as possible.
>
> Add sp_ (Slow Path) prefix to fields that are not used in fast path,
> to make clear what is going on.
>
> After this patch pahole reports something much better, as all
> ndo_start_xmit() needed fields are packed into two cache lines instead
> of seven or eight
>
> struct mlx4_en_tx_ring {
> 	u32                        last_nr_txbb;         /*     0   0x4 */
> 	u32                        cons;                 /*   0x4   0x4 */
> 	long unsigned int          wake_queue;           /*   0x8   0x8 */
> 	struct netdev_queue *      tx_queue;             /*  0x10   0x8 */
> 	u32                        (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /*  0x18   0x8 */
> 	struct mlx4_en_rx_ring *   recycle_ring;         /*  0x20   0x8 */
>
> 	/* XXX 24 bytes hole, try to pack */
>
> 	/* --- cacheline 1 boundary (64 bytes) --- */
> 	u32                        prod;                 /*  0x40   0x4 */
> 	unsigned int               tx_dropped;           /*  0x44   0x4 */
> 	long unsigned int          bytes;                /*  0x48   0x8 */
> 	long unsigned int          packets;              /*  0x50   0x8 */
> 	long unsigned int          tx_csum;              /*  0x58   0x8 */
> 	long unsigned int          tso_packets;          /*  0x60   0x8 */
> 	long unsigned int          xmit_more;            /*  0x68   0x8 */
> 	struct mlx4_bf             bf;                   /*  0x70  0x18 */
> 	/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
> 	__be32                     doorbell_qpn;         /*  0x88   0x4 */
> 	__be32                     mr_key;               /*  0x8c   0x4 */
> 	u32                        size;                 /*  0x90   0x4 */
> 	u32                        size_mask;            /*  0x94   0x4 */
> 	u32                        full_size;            /*  0x98   0x4 */
> 	u32                        buf_size;             /*  0x9c   0x4 */
> 	void *                     buf;                  /*  0xa0   0x8 */
> 	struct mlx4_en_tx_info *   tx_info;              /*  0xa8   0x8 */
> 	int                        qpn;                  /*  0xb0   0x4 */
> 	u8                         queue_index;          /*  0xb4   0x1 */
> 	bool                       bf_enabled;           /*  0xb5   0x1 */
> 	bool                       bf_alloced;           /*  0xb6   0x1 */
> 	u8                         hwtstamp_tx_type;     /*  0xb7   0x1 */
> 	u8 *                       bounce_buf;           /*  0xb8   0x8 */
> 	/* --- cacheline 3 boundary (192 bytes) --- */
> 	long unsigned int          queue_stopped;        /*  0xc0   0x8 */
> 	struct mlx4_hwq_resources  sp_wqres;             /*  0xc8  0x58 */
> 	/* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
> 	struct mlx4_qp             sp_qp;                /* 0x120  0x30 */
> 	/* --- cacheline 5 boundary (320 bytes) was 16 bytes ago --- */
> 	struct mlx4_qp_context     sp_context;           /* 0x150  0xf8 */
> 	/* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */
> 	cpumask_t                  sp_affinity_mask;     /* 0x248  0x20 */
> 	enum mlx4_qp_state         sp_qp_state;          /* 0x268   0x4 */
> 	u16                        sp_stride;            /* 0x26c   0x2 */
> 	u16                        sp_cqn;               /* 0x26e   0x2 */
>
> 	/* size: 640, cachelines: 10, members: 36 */
> 	/* sum members: 600, holes: 1, sum holes: 24 */
> 	/* padding: 16 */
> };
>
> Instead of this silly placement :
>
> struct mlx4_en_tx_ring {
> 	u32                        last_nr_txbb;         /*     0   0x4 */
> 	u32                        cons;                 /*   0x4   0x4 */
> 	long unsigned int          wake_queue;           /*   0x8   0x8 */
>
> 	/* XXX 48 bytes hole, try to pack */
>
> 	/* --- cacheline 1 boundary (64 bytes) --- */
> 	u32                        prod;                 /*  0x40   0x4 */
>
> 	/* XXX 4 bytes hole, try to pack */
>
> 	long unsigned int          bytes;                /*  0x48   0x8 */
> 	long unsigned int          packets;              /*  0x50   0x8 */
> 	long unsigned int          tx_csum;              /*  0x58   0x8 */
> 	long unsigned int          tso_packets;          /*  0x60   0x8 */
> 	long unsigned int          xmit_more;            /*  0x68   0x8 */
> 	unsigned int               tx_dropped;           /*  0x70   0x4 */
>
> 	/* XXX 4 bytes hole, try to pack */
>
> 	struct mlx4_bf             bf;                   /*  0x78  0x18 */
> 	/* --- cacheline 2 boundary (128 bytes) was 16 bytes ago --- */
> 	long unsigned int          queue_stopped;        /*  0x90   0x8 */
> 	cpumask_t                  affinity_mask;        /*  0x98  0x10 */
> 	struct mlx4_qp             qp;                   /*  0xa8  0x30 */
> 	/* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */
> 	struct mlx4_hwq_resources  wqres;                /*  0xd8  0x58 */
> 	/* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */
> 	u32                        size;                 /* 0x130   0x4 */
> 	u32                        size_mask;            /* 0x134   0x4 */
> 	u16                        stride;               /* 0x138   0x2 */
>
> 	/* XXX 2 bytes hole, try to pack */
>
> 	u32                        full_size;            /* 0x13c   0x4 */
> 	/* --- cacheline 5 boundary (320 bytes) --- */
> 	u16                        cqn;                  /* 0x140   0x2 */
>
> 	/* XXX 2 bytes hole, try to pack */
>
> 	u32                        buf_size;             /* 0x144   0x4 */
> 	__be32                     doorbell_qpn;         /* 0x148   0x4 */
> 	__be32                     mr_key;               /* 0x14c   0x4 */
> 	void *                     buf;                  /* 0x150   0x8 */
> 	struct mlx4_en_tx_info *   tx_info;              /* 0x158   0x8 */
> 	struct mlx4_en_rx_ring *   recycle_ring;         /* 0x160   0x8 */
> 	u32                        (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /* 0x168   0x8 */
> 	u8 *                       bounce_buf;           /* 0x170   0x8 */
> 	struct mlx4_qp_context     context;              /* 0x178  0xf8 */
> 	/* --- cacheline 9 boundary (576 bytes) was 48 bytes ago --- */
> 	int                        qpn;                  /* 0x270   0x4 */
> 	enum mlx4_qp_state         qp_state;             /* 0x274   0x4 */
> 	u8                         queue_index;          /* 0x278   0x1 */
> 	bool                       bf_enabled;           /* 0x279   0x1 */
> 	bool                       bf_alloced;           /* 0x27a   0x1 */
>
> 	/* XXX 5 bytes hole, try to pack */
>
> 	/* --- cacheline 10 boundary (640 bytes) --- */
> 	struct netdev_queue *      tx_queue;             /* 0x280   0x8 */
> 	int                        hwtstamp_tx_type;     /* 0x288   0x4 */
>
> 	/* size: 704, cachelines: 11, members: 36 */
> 	/* sum members: 587, holes: 6, sum holes: 65 */
> 	/* padding: 52 */
> };
>
>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> ---
>   drivers/net/ethernet/mellanox/mlx4/en_netdev.c |    2
>   drivers/net/ethernet/mellanox/mlx4/en_tx.c     |   48 +++++++--------
>   drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |   42 +++++++------
>   3 files changed, 48 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> index 9a807e93c9fdd81e61e561208aa1480a244d0bdb..9018bb1b2e12142e048281a9d28ddf95e0023a61 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> @@ -1305,7 +1305,7 @@ static void mlx4_en_tx_timeout(struct net_device *dev)
>   		if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, i)))
>   			continue;
>   		en_warn(priv, "TX timeout on queue: %d, QP: 0x%x, CQ: 0x%x, Cons: 0x%x, Prod: 0x%x\n",
> -			i, tx_ring->qpn, tx_ring->cqn,
> +			i, tx_ring->qpn, tx_ring->sp_cqn,
>   			tx_ring->cons, tx_ring->prod);
>   	}
>   
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> index 5de3cbe24f2bf467f9a8f7d499e131b6d2a1844c..4b597dca5c52d114344d638895275ed0d378bd96 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> @@ -66,7 +66,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
>   
>   	ring->size = size;
>   	ring->size_mask = size - 1;
> -	ring->stride = stride;
> +	ring->sp_stride = stride;
>   	ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;
>   
>   	tmp = size * sizeof(struct mlx4_en_tx_info);
> @@ -90,22 +90,22 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
>   			goto err_info;
>   		}
>   	}
> -	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);
> +	ring->buf_size = ALIGN(size * ring->sp_stride, MLX4_EN_PAGE_SIZE);
>   
>   	/* Allocate HW buffers on provided NUMA node */
>   	set_dev_node(&mdev->dev->persist->pdev->dev, node);
> -	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
> +	err = mlx4_alloc_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
>   	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
>   	if (err) {
>   		en_err(priv, "Failed allocating hwq resources\n");
>   		goto err_bounce;
>   	}
>   
> -	ring->buf = ring->wqres.buf.direct.buf;
> +	ring->buf = ring->sp_wqres.buf.direct.buf;
>   
>   	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
>   	       ring, ring->buf, ring->size, ring->buf_size,
> -	       (unsigned long long) ring->wqres.buf.direct.map);
> +	       (unsigned long long) ring->sp_wqres.buf.direct.map);
>   
>   	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
>   				    MLX4_RESERVE_ETH_BF_QP);
> @@ -114,12 +114,12 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
>   		goto err_hwq_res;
>   	}
>   
> -	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
> +	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->sp_qp, GFP_KERNEL);
>   	if (err) {
>   		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
>   		goto err_reserve;
>   	}
> -	ring->qp.event = mlx4_en_sqp_event;
> +	ring->sp_qp.event = mlx4_en_sqp_event;
>   
>   	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
>   	if (err) {
> @@ -141,7 +141,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
>   	if (queue_index < priv->num_tx_rings_p_up)
>   		cpumask_set_cpu(cpumask_local_spread(queue_index,
>   						     priv->mdev->dev->numa_node),
> -				&ring->affinity_mask);
> +				&ring->sp_affinity_mask);
>   
>   	*pring = ring;
>   	return 0;
> @@ -149,7 +149,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
>   err_reserve:
>   	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
>   err_hwq_res:
> -	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
> +	mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
>   err_bounce:
>   	kfree(ring->bounce_buf);
>   	ring->bounce_buf = NULL;
> @@ -171,10 +171,10 @@ void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
>   
>   	if (ring->bf_alloced)
>   		mlx4_bf_free(mdev->dev, &ring->bf);
> -	mlx4_qp_remove(mdev->dev, &ring->qp);
> -	mlx4_qp_free(mdev->dev, &ring->qp);
> +	mlx4_qp_remove(mdev->dev, &ring->sp_qp);
> +	mlx4_qp_free(mdev->dev, &ring->sp_qp);
>   	mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
> -	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
> +	mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
>   	kfree(ring->bounce_buf);
>   	ring->bounce_buf = NULL;
>   	kvfree(ring->tx_info);
> @@ -190,7 +190,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
>   	struct mlx4_en_dev *mdev = priv->mdev;
>   	int err;
>   
> -	ring->cqn = cq;
> +	ring->sp_cqn = cq;
>   	ring->prod = 0;
>   	ring->cons = 0xffffffff;
>   	ring->last_nr_txbb = 1;
> @@ -198,21 +198,21 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
>   	memset(ring->buf, 0, ring->buf_size);
>   	ring->free_tx_desc = mlx4_en_free_tx_desc;
>   
> -	ring->qp_state = MLX4_QP_STATE_RST;
> -	ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
> +	ring->sp_qp_state = MLX4_QP_STATE_RST;
> +	ring->doorbell_qpn = cpu_to_be32(ring->sp_qp.qpn << 8);
>   	ring->mr_key = cpu_to_be32(mdev->mr.key);
>   
> -	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
> -				ring->cqn, user_prio, &ring->context);
> +	mlx4_en_fill_qp_context(priv, ring->size, ring->sp_stride, 1, 0, ring->qpn,
> +				ring->sp_cqn, user_prio, &ring->sp_context);
>   	if (ring->bf_alloced)
> -		ring->context.usr_page =
> +		ring->sp_context.usr_page =
>   			cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev,
>   							 ring->bf.uar->index));
>   
> -	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
> -			       &ring->qp, &ring->qp_state);
> -	if (!cpumask_empty(&ring->affinity_mask))
> -		netif_set_xps_queue(priv->dev, &ring->affinity_mask,
> +	err = mlx4_qp_to_ready(mdev->dev, &ring->sp_wqres.mtt, &ring->sp_context,
> +			       &ring->sp_qp, &ring->sp_qp_state);
> +	if (!cpumask_empty(&ring->sp_affinity_mask))
> +		netif_set_xps_queue(priv->dev, &ring->sp_affinity_mask,
>   				    ring->queue_index);
>   
>   	return err;
> @@ -223,8 +223,8 @@ void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
>   {
>   	struct mlx4_en_dev *mdev = priv->mdev;
>   
> -	mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
> -		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
> +	mlx4_qp_modify(mdev->dev, NULL, ring->sp_qp_state,
> +		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->sp_qp);
>   }
>   
>   static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring)
> diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> index eff21651b67308a17fe3d60d236cd0b6800a3fd2..574bcbb1b38fc4758511d8f7bd17a87b0a507a73 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> @@ -281,46 +281,50 @@ struct mlx4_en_tx_ring {
>   	u32			last_nr_txbb;
>   	u32			cons;
>   	unsigned long		wake_queue;
> +	struct netdev_queue	*tx_queue;
> +	u32			(*free_tx_desc)(struct mlx4_en_priv *priv,
> +						struct mlx4_en_tx_ring *ring,
> +						int index, u8 owner,
> +						u64 timestamp, int napi_mode);
> +	struct mlx4_en_rx_ring	*recycle_ring;
>   
>   	/* cache line used and dirtied in mlx4_en_xmit() */
>   	u32			prod ____cacheline_aligned_in_smp;
> +	unsigned int		tx_dropped;
>   	unsigned long		bytes;
>   	unsigned long		packets;
>   	unsigned long		tx_csum;
>   	unsigned long		tso_packets;
>   	unsigned long		xmit_more;
> -	unsigned int		tx_dropped;
>   	struct mlx4_bf		bf;
> -	unsigned long		queue_stopped;
>   
>   	/* Following part should be mostly read */
> -	cpumask_t		affinity_mask;
> -	struct mlx4_qp		qp;
> -	struct mlx4_hwq_resources wqres;
> +	__be32			doorbell_qpn;
> +	__be32			mr_key;
>   	u32			size; /* number of TXBBs */
>   	u32			size_mask;
> -	u16			stride;
>   	u32			full_size;
> -	u16			cqn;	/* index of port CQ associated with this ring */
>   	u32			buf_size;
> -	__be32			doorbell_qpn;
> -	__be32			mr_key;
>   	void			*buf;
>   	struct mlx4_en_tx_info	*tx_info;
> -	struct mlx4_en_rx_ring	*recycle_ring;
> -	u32			(*free_tx_desc)(struct mlx4_en_priv *priv,
> -						struct mlx4_en_tx_ring *ring,
> -						int index, u8 owner,
> -						u64 timestamp, int napi_mode);
> -	u8			*bounce_buf;
> -	struct mlx4_qp_context	context;
>   	int			qpn;
> -	enum mlx4_qp_state	qp_state;
>   	u8			queue_index;
>   	bool			bf_enabled;
>   	bool			bf_alloced;
> -	struct netdev_queue	*tx_queue;
> -	int			hwtstamp_tx_type;
> +	u8			hwtstamp_tx_type;
> +	u8			*bounce_buf;
> +
> +	/* Not used in fast path
> +	 * Only queue_stopped might be used if BQL is not properly working.
> +	 */
> +	unsigned long		queue_stopped;
> +	struct mlx4_hwq_resources sp_wqres;
> +	struct mlx4_qp		sp_qp;
> +	struct mlx4_qp_context	sp_context;
> +	cpumask_t		sp_affinity_mask;
> +	enum mlx4_qp_state	sp_qp_state;
> +	u16			sp_stride;
> +	u16			sp_cqn;	/* index of port CQ associated with this ring */
>   } ____cacheline_aligned_in_smp;
>   
>   struct mlx4_en_rx_desc {
>
>
Reviewed-by: Tariq Toukan <tariqt@...lanox.com>

Regards,
Tariq

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ