[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1480402716.18162.124.camel@edumazet-glaptop3.roam.corp.google.com>
Date: Mon, 28 Nov 2016 22:58:36 -0800
From: Eric Dumazet <eric.dumazet@...il.com>
To: Jesper Dangaard Brouer <brouer@...hat.com>
Cc: Rick Jones <rick.jones2@....com>, netdev@...r.kernel.org,
Saeed Mahameed <saeedm@...lanox.com>,
Tariq Toukan <tariqt@...lanox.com>
Subject: [WIP] net+mlx4: auto doorbell
On Mon, 2016-11-21 at 10:10 -0800, Eric Dumazet wrote:
> Not sure it this has been tried before, but the doorbell avoidance could
> be done by the driver itself, because it knows a TX completion will come
> shortly (well... if softirqs are not delayed too much !)
>
> Doorbell would be forced only if :
>
> ( "skb->xmit_more is not set" AND "TX engine is not 'started yet'" )
> OR
> ( too many [1] packets were put in TX ring buffer, no point deferring
> more)
>
> Start the pump, but once it is started, let the doorbells being done by
> TX completion.
>
> ndo_start_xmit and TX completion handler would have to maintain a shared
> state describing if packets were ready but doorbell deferred.
>
>
> Note that TX completion means "if at least one packet was drained",
> otherwise busy polling, constantly calling napi->poll() would force a
> doorbell too soon for devices sharing a NAPI for both RX and TX.
>
> But then, maybe busy poll would like to force a doorbell...
>
> I could try these ideas on mlx4 shortly.
>
>
> [1] limit could be derived from active "ethtool -c" params, eg tx-frames
I have a WIP, that increases pktgen rate by 75 % on mlx4 when bulking is
not used.
lpaa23:~# echo 0 >/sys/class/net/eth0/doorbell_opt
lpaa23:~# sar -n DEV 1 10|grep eth0
22:43:26 eth0 0.00 5822800.00 0.00 597064.41 0.00 0.00 1.00
22:43:27 eth0 24.00 5788237.00 2.09 593520.26 0.00 0.00 0.00
22:43:28 eth0 12.00 5817777.00 1.43 596551.47 0.00 0.00 1.00
22:43:29 eth0 22.00 5841516.00 1.61 598982.87 0.00 0.00 0.00
22:43:30 eth0 4.00 4389137.00 0.71 450058.08 0.00 0.00 1.00
22:43:31 eth0 4.00 5871008.00 0.72 602007.79 0.00 0.00 0.00
22:43:32 eth0 12.00 5891809.00 1.43 604142.60 0.00 0.00 1.00
22:43:33 eth0 10.00 5901904.00 1.12 605175.70 0.00 0.00 0.00
22:43:34 eth0 5.00 5907982.00 0.69 605798.99 0.00 0.00 1.00
22:43:35 eth0 2.00 5847086.00 0.12 599554.71 0.00 0.00 0.00
Average: eth0 9.50 5707925.60 0.99 585285.69 0.00 0.00 0.50
lpaa23:~# echo 1 >/sys/class/net/eth0/doorbell_opt
lpaa23:~# sar -n DEV 1 10|grep eth0
22:43:47 eth0 9.00 10226424.00 1.02 1048608.05 0.00 0.00 1.00
22:43:48 eth0 1.00 10316955.00 0.06 1057890.89 0.00 0.00 0.00
22:43:49 eth0 1.00 10310104.00 0.10 1057188.32 0.00 0.00 1.00
22:43:50 eth0 0.00 10249423.00 0.00 1050966.23 0.00 0.00 0.00
22:43:51 eth0 0.00 10210441.00 0.00 1046969.05 0.00 0.00 1.00
22:43:52 eth0 2.00 10198389.00 0.16 1045733.17 0.00 0.00 1.00
22:43:53 eth0 8.00 10079257.00 1.43 1033517.83 0.00 0.00 0.00
22:43:54 eth0 0.00 7693509.00 0.00 788885.16 0.00 0.00 0.00
22:43:55 eth0 2.00 10343076.00 0.20 1060569.32 0.00 0.00 1.00
22:43:56 eth0 1.00 10224571.00 0.14 1048417.93 0.00 0.00 0.00
Average: eth0 2.40 9985214.90 0.31 1023874.60 0.00 0.00 0.50
And about 11 % improvement on an mono-flow UDP_STREAM test.
skb_set_owner_w() is now the most consuming function.
lpaa23:~# ./udpsnd -4 -H 10.246.7.152 -d 2 &
[1] 13696
lpaa23:~# echo 0 >/sys/class/net/eth0/doorbell_opt
lpaa23:~# sar -n DEV 1 10|grep eth0
22:50:47 eth0 3.00 1355422.00 0.45 319706.04 0.00 0.00 0.00
22:50:48 eth0 2.00 1344270.00 0.42 317035.21 0.00 0.00 1.00
22:50:49 eth0 3.00 1350503.00 0.51 318478.34 0.00 0.00 0.00
22:50:50 eth0 29.00 1348593.00 2.86 318113.02 0.00 0.00 1.00
22:50:51 eth0 14.00 1354855.00 1.83 319508.56 0.00 0.00 0.00
22:50:52 eth0 7.00 1357794.00 0.73 320226.89 0.00 0.00 1.00
22:50:53 eth0 5.00 1326130.00 0.63 312784.72 0.00 0.00 0.00
22:50:54 eth0 2.00 994584.00 0.12 234598.40 0.00 0.00 1.00
22:50:55 eth0 5.00 1318209.00 0.75 310932.46 0.00 0.00 0.00
22:50:56 eth0 20.00 1323445.00 1.73 312178.11 0.00 0.00 1.00
Average: eth0 9.00 1307380.50 1.00 308356.18 0.00 0.00 0.50
lpaa23:~# echo 3 >/sys/class/net/eth0/doorbell_opt
lpaa23:~# sar -n DEV 1 10|grep eth0
22:51:03 eth0 4.00 1512055.00 0.54 356599.40 0.00 0.00 0.00
22:51:04 eth0 4.00 1507631.00 0.55 355609.46 0.00 0.00 1.00
22:51:05 eth0 4.00 1487789.00 0.42 350917.47 0.00 0.00 0.00
22:51:06 eth0 7.00 1474460.00 1.22 347811.16 0.00 0.00 1.00
22:51:07 eth0 2.00 1496529.00 0.24 352995.18 0.00 0.00 0.00
22:51:08 eth0 3.00 1485856.00 0.49 350425.65 0.00 0.00 1.00
22:51:09 eth0 1.00 1114808.00 0.06 262905.38 0.00 0.00 0.00
22:51:10 eth0 2.00 1510924.00 0.30 356397.53 0.00 0.00 1.00
22:51:11 eth0 2.00 1506408.00 0.30 355345.76 0.00 0.00 0.00
22:51:12 eth0 2.00 1499122.00 0.32 353668.75 0.00 0.00 1.00
Average: eth0 3.10 1459558.20 0.44 344267.57 0.00 0.00 0.50
drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 90 +++++++++++------
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 4
include/linux/netdevice.h | 1
net/core/net-sysfs.c | 18 +++
5 files changed, 83 insertions(+), 32 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 6562f78b07f4..fbea83218fc0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1089,7 +1089,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
if (polled) {
if (doorbell_pending)
- mlx4_en_xmit_doorbell(priv->tx_ring[TX_XDP][cq->ring]);
+ mlx4_en_xmit_doorbell(dev, priv->tx_ring[TX_XDP][cq->ring]);
mlx4_cq_set_ci(&cq->mcq);
wmb(); /* ensure HW sees CQ consumer before we post new buffers */
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 4b597dca5c52..affebb435679 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -67,7 +67,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
ring->size = size;
ring->size_mask = size - 1;
ring->sp_stride = stride;
- ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;
+ ring->full_size = ring->size - HEADROOM - 2*MAX_DESC_TXBBS;
tmp = size * sizeof(struct mlx4_en_tx_info);
ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
@@ -193,6 +193,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
ring->sp_cqn = cq;
ring->prod = 0;
ring->cons = 0xffffffff;
+ ring->ncons = 0;
ring->last_nr_txbb = 1;
memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
memset(ring->buf, 0, ring->buf_size);
@@ -227,9 +228,9 @@ void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
MLX4_QP_STATE_RST, NULL, 0, 0, &ring->sp_qp);
}
-static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring)
+static inline bool mlx4_en_is_tx_ring_full(const struct mlx4_en_tx_ring *ring)
{
- return ring->prod - ring->cons > ring->full_size;
+ return READ_ONCE(ring->prod) - READ_ONCE(ring->cons) > ring->full_size;
}
static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
@@ -374,6 +375,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
/* Skip last polled descriptor */
ring->cons += ring->last_nr_txbb;
+ ring->ncons += ring->last_nr_txbb;
en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
ring->cons, ring->prod);
@@ -389,6 +391,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
!!(ring->cons & ring->size), 0,
0 /* Non-NAPI caller */);
ring->cons += ring->last_nr_txbb;
+ ring->ncons += ring->last_nr_txbb;
cnt++;
}
@@ -401,6 +404,38 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
return cnt;
}
+void mlx4_en_xmit_doorbell(const struct net_device *dev,
+ struct mlx4_en_tx_ring *ring)
+{
+
+ if (dev->doorbell_opt & 1) {
+ u32 oval = READ_ONCE(ring->prod_bell);
+ u32 nval = READ_ONCE(ring->prod);
+
+ if (oval == nval)
+ return;
+
+ /* I can not tell yet if a cmpxchg() is needed or not */
+ if (dev->doorbell_opt & 2)
+ WRITE_ONCE(ring->prod_bell, nval);
+ else
+ if (cmpxchg(&ring->prod_bell, oval, nval) != oval)
+ return;
+ }
+ /* Since there is no iowrite*_native() that writes the
+ * value as is, without byteswapping - using the one
+ * the doesn't do byteswapping in the relevant arch
+ * endianness.
+ */
+#if defined(__LITTLE_ENDIAN)
+ iowrite32(
+#else
+ iowrite32be(
+#endif
+ ring->doorbell_qpn,
+ ring->bf.uar->map + MLX4_SEND_DOORBELL);
+}
+
static bool mlx4_en_process_tx_cq(struct net_device *dev,
struct mlx4_en_cq *cq, int napi_budget)
{
@@ -496,8 +531,13 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
wmb();
/* we want to dirty this cache line once */
- ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
- ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
+ WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb);
+ ring_cons += txbbs_skipped;
+ WRITE_ONCE(ring->cons, ring_cons);
+ WRITE_ONCE(ring->ncons, ring_cons + last_nr_txbb);
+
+ if (dev->doorbell_opt)
+ mlx4_en_xmit_doorbell(dev, ring);
if (ring->free_tx_desc == mlx4_en_recycle_tx_desc)
return done < budget;
@@ -725,29 +765,14 @@ static void mlx4_bf_copy(void __iomem *dst, const void *src,
__iowrite64_copy(dst, src, bytecnt / 8);
}
-void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring)
-{
- wmb();
- /* Since there is no iowrite*_native() that writes the
- * value as is, without byteswapping - using the one
- * the doesn't do byteswapping in the relevant arch
- * endianness.
- */
-#if defined(__LITTLE_ENDIAN)
- iowrite32(
-#else
- iowrite32be(
-#endif
- ring->doorbell_qpn,
- ring->bf.uar->map + MLX4_SEND_DOORBELL);
-}
static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
struct mlx4_en_tx_desc *tx_desc,
union mlx4_wqe_qpn_vlan qpn_vlan,
int desc_size, int bf_index,
__be32 op_own, bool bf_ok,
- bool send_doorbell)
+ bool send_doorbell,
+ const struct net_device *dev, int nr_txbb)
{
tx_desc->ctrl.qpn_vlan = qpn_vlan;
@@ -761,6 +786,7 @@ static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
wmb();
+ ring->prod += nr_txbb;
mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
desc_size);
@@ -773,8 +799,9 @@ static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
*/
dma_wmb();
tx_desc->ctrl.owner_opcode = op_own;
+ ring->prod += nr_txbb;
if (send_doorbell)
- mlx4_en_xmit_doorbell(ring);
+ mlx4_en_xmit_doorbell(dev, ring);
else
ring->xmit_more++;
}
@@ -1017,8 +1044,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP);
}
- ring->prod += nr_txbb;
-
/* If we used a bounce buffer then copy descriptor back into place */
if (unlikely(bounce))
tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
@@ -1033,6 +1058,14 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
}
send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);
+ /* Doorbell avoidance : We can omit doorbell if we know a TX completion
+ * will happen shortly.
+ */
+ if (send_doorbell &&
+ dev->doorbell_opt &&
+ (s32)(READ_ONCE(ring->prod_bell) - READ_ONCE(ring->ncons)) > 0)
+ send_doorbell = false;
+
real_size = (real_size / 16) & 0x3f;
bf_ok &= desc_size <= MAX_BF && send_doorbell;
@@ -1043,7 +1076,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
qpn_vlan.fence_size = real_size;
mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, desc_size, bf_index,
- op_own, bf_ok, send_doorbell);
+ op_own, bf_ok, send_doorbell, dev, nr_txbb);
if (unlikely(stop_queue)) {
/* If queue was emptied after the if (stop_queue) , and before
@@ -1054,7 +1087,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
*/
smp_rmb();
- ring_cons = ACCESS_ONCE(ring->cons);
if (unlikely(!mlx4_en_is_tx_ring_full(ring))) {
netif_tx_wake_queue(ring->tx_queue);
ring->wake_queue++;
@@ -1158,8 +1190,6 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
rx_ring->xdp_tx++;
AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, length);
- ring->prod += nr_txbb;
-
stop_queue = mlx4_en_is_tx_ring_full(ring);
send_doorbell = stop_queue ||
*doorbell_pending > MLX4_EN_DOORBELL_BUDGET;
@@ -1173,7 +1203,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
qpn_vlan.fence_size = real_size;
mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, TXBB_SIZE, bf_index,
- op_own, bf_ok, send_doorbell);
+ op_own, bf_ok, send_doorbell, dev, nr_txbb);
*doorbell_pending = send_doorbell ? 0 : *doorbell_pending + 1;
return NETDEV_TX_OK;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 574bcbb1b38f..c3fd0deda198 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -280,6 +280,7 @@ struct mlx4_en_tx_ring {
*/
u32 last_nr_txbb;
u32 cons;
+ u32 ncons;
unsigned long wake_queue;
struct netdev_queue *tx_queue;
u32 (*free_tx_desc)(struct mlx4_en_priv *priv,
@@ -290,6 +291,7 @@ struct mlx4_en_tx_ring {
/* cache line used and dirtied in mlx4_en_xmit() */
u32 prod ____cacheline_aligned_in_smp;
+ u32 prod_bell;
unsigned int tx_dropped;
unsigned long bytes;
unsigned long packets;
@@ -699,7 +701,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
struct mlx4_en_rx_alloc *frame,
struct net_device *dev, unsigned int length,
int tx_ind, int *doorbell_pending);
-void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring);
+void mlx4_en_xmit_doorbell(const struct net_device *dev, struct mlx4_en_tx_ring *ring);
bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
struct mlx4_en_rx_alloc *frame);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4ffcd874cc20..39565b5425a6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1816,6 +1816,7 @@ struct net_device {
DECLARE_HASHTABLE (qdisc_hash, 4);
#endif
unsigned long tx_queue_len;
+ unsigned long doorbell_opt;
spinlock_t tx_global_lock;
int watchdog_timeo;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b0c04cf4851d..df05f81f5150 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -367,6 +367,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
}
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+static int change_doorbell_opt(struct net_device *dev, unsigned long val)
+{
+ dev->doorbell_opt = val;
+ return 0;
+}
+
+static ssize_t doorbell_opt_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, change_doorbell_opt);
+}
+NETDEVICE_SHOW_RW(doorbell_opt, fmt_ulong);
+
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -531,6 +548,7 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,
&dev_attr_proto_down.attr,
+ &dev_attr_doorbell_opt.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
Powered by blists - more mailing lists