[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190605232348.6452-10-saeedm@mellanox.com>
Date: Wed, 5 Jun 2019 23:24:53 +0000
From: Saeed Mahameed <saeedm@...lanox.com>
To: "David S. Miller" <davem@...emloft.net>,
Jason Gunthorpe <jgg@...lanox.com>,
Doug Ledford <dledford@...hat.com>
CC: Michael Chan <michael.chan@...adcom.com>,
Andy Gospodarek <andy@...yhouse.net>,
Tal Gilboa <talgi@...lanox.com>,
"linux-rdma@...r.kernel.org" <linux-rdma@...r.kernel.org>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>,
Yamin Friedman <yaminf@...lanox.com>,
Max Gurtovoy <maxg@...lanox.com>,
Saeed Mahameed <saeedm@...lanox.com>
Subject: [for-next 9/9] RDMA/core: Provide RDMA DIM support for ULPs
From: Yamin Friedman <yaminf@...lanox.com>
Add an interface in the InfiniBand core that applies rdma_dim adaptive
moderation to completion queues. A new allocation helper, ib_alloc_cq_dim(),
allocates an ib_cq that uses rdma_dim.
Performance improvement (ConnectX-5 100GbE, x86) running FIO benchmark over
NVMf between two equal end-hosts with 56 cores across a Mellanox switch
using null_blk device:
IO READS without DIM:
blk size | BW | IOPS | 99th percentile latency | 99.99th latency
512B | 3.8GiB/s | 7.7M | 1401 usec | 2442 usec
4k | 7.0GiB/s | 1.8M | 4817 usec | 6587 usec
64k | 10.7GiB/s| 175k | 9896 usec | 10028 usec
IO WRITES without DIM:
blk size | BW | IOPS | 99th percentile latency | 99.99th latency
512B | 3.6GiB/s | 7.5M | 1434 usec | 2474 usec
4k | 6.3GiB/s | 1.6M | 938 usec | 1221 usec
64k | 10.7GiB/s| 175k | 8979 usec | 12780 usec
IO READS with DIM:
blk size | BW | IOPS | 99th percentile latency | 99.99th latency
512B | 4GiB/s | 8.2M | 816 usec | 889 usec
4k | 10.1GiB/s| 2.65M| 3359 usec | 5080 usec
64k | 10.7GiB/s| 175k | 9896 usec | 10028 usec
IO WRITES with DIM:
blk size | BW | IOPS | 99th percentile latency | 99.99th latency
512B | 3.9GiB/s | 8.1M | 799 usec | 922 usec
4k | 9.6GiB/s | 2.5M | 717 usec | 1004 usec
64k | 10.7GiB/s| 176k | 8586 usec | 12256 usec
The rdma_dim algorithm was designed to measure the effectiveness of
moderation on the flow in a general way and thus should be appropriate
for all RDMA storage protocols.
Signed-off-by: Yamin Friedman <yaminf@...lanox.com>
Reviewed-by: Max Gurtovoy <maxg@...lanox.com>
Signed-off-by: Saeed Mahameed <saeedm@...lanox.com>
---
drivers/infiniband/core/cq.c | 78 ++++++++++++++++++-
drivers/net/ethernet/mellanox/mlx4/Kconfig | 1 +
.../net/ethernet/mellanox/mlx5/core/Kconfig | 1 +
include/rdma/ib_verbs.h | 27 ++++++-
4 files changed, 102 insertions(+), 5 deletions(-)
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index a4c81992267c..326d928d2763 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>
+#include <linux/rdma_dim.h>
/* # of WCs to poll for with a single call to ib_poll_cq */
#define IB_POLL_BATCH 16
@@ -26,6 +27,32 @@
#define IB_POLL_FLAGS \
(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+ struct dim *dim = container_of(w, struct dim, work);
+ struct ib_cq *cq = container_of(dim, struct ib_cq, dim);
+
+ u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+ u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+ dim->state = DIM_START_MEASURE;
+
+ cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static bool rdma_dim_init(struct dim *dim, struct ib_cq *cq)
+{
+ if (!cq->device->ops.modify_cq)
+ return false;
+
+ memset(dim, 0, sizeof(*dim));
+ dim->state = DIM_START_MEASURE;
+ dim->tune_state = DIM_GOING_RIGHT;
+ dim->profile_ix = RDMA_DIM_START_PROFILE;
+
+ return true;
+}
+
static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
int batch)
{
@@ -98,6 +125,24 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
return completed;
}
+static int ib_poll_dim_handler(struct irq_poll *iop, int budget)
+{
+ struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+ int completed;
+ struct dim *dim = &cq->dim;
+
+ completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
+ if (completed < budget) {
+ irq_poll_complete(&cq->iop);
+ if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+ irq_poll_sched(&cq->iop);
+ }
+
+ rdma_dim(dim, completed);
+
+ return completed;
+}
+
static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
{
irq_poll_sched(&cq->iop);
@@ -105,14 +150,18 @@ static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
static void ib_cq_poll_work(struct work_struct *work)
{
- struct ib_cq *cq = container_of(work, struct ib_cq, work);
+ struct ib_cq *cq = container_of(work, struct ib_cq,
+ work);
int completed;
completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
IB_POLL_BATCH);
+
if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
queue_work(cq->comp_wq, &cq->work);
+ else if (cq->dim_used)
+ rdma_dim(&cq->dim, completed);
}
static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
@@ -129,6 +178,7 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
* @poll_ctx: context to poll the CQ from.
* @caller: module owner name.
* @udata: Valid user data or NULL for kernel object
+ * @use_dim: use dynamic interrupt moderation
*
* This is the proper interface to allocate a CQ for in-kernel users. A
* CQ allocated with this interface will automatically be polled from the
@@ -138,7 +188,8 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
int nr_cqe, int comp_vector,
enum ib_poll_context poll_ctx,
- const char *caller, struct ib_udata *udata)
+ const char *caller, struct ib_udata *udata,
+ bool use_dim)
{
struct ib_cq_init_attr cq_attr = {
.cqe = nr_cqe,
@@ -173,13 +224,30 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
case IB_POLL_SOFTIRQ:
cq->comp_handler = ib_cq_completion_softirq;
- irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+ if (use_dim)
+ cq->dim_used = rdma_dim_init(&cq->dim, cq);
+
+ if (cq->dim_used) {
+ irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
+ ib_poll_dim_handler);
+ INIT_WORK(&cq->dim.work, ib_cq_rdma_dim_work);
+ } else {
+ irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
+ ib_poll_handler);
+ }
+
ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
break;
case IB_POLL_WORKQUEUE:
case IB_POLL_UNBOUND_WORKQUEUE:
cq->comp_handler = ib_cq_completion_workqueue;
INIT_WORK(&cq->work, ib_cq_poll_work);
+ if (use_dim)
+ cq->dim_used = rdma_dim_init(&cq->dim, cq);
+
+ if (cq->dim_used)
+ INIT_WORK(&cq->dim.work, ib_cq_rdma_dim_work);
+
ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
ib_comp_wq : ib_comp_unbound_wq;
@@ -217,10 +285,14 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
break;
case IB_POLL_SOFTIRQ:
irq_poll_disable(&cq->iop);
+ if (cq->dim_used)
+ cancel_work_sync(&cq->dim.work);
break;
case IB_POLL_WORKQUEUE:
case IB_POLL_UNBOUND_WORKQUEUE:
cancel_work_sync(&cq->work);
+ if (cq->dim_used)
+ cancel_work_sync(&cq->dim.work);
break;
default:
WARN_ON_ONCE(1);
diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig
index e69c3c31e701..93cd25997b24 100644
--- a/drivers/net/ethernet/mellanox/mlx4/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig
@@ -28,6 +28,7 @@ config MLX4_CORE
tristate
depends on PCI
select NET_DEVLINK
+ select DIMLIB
default n
config MLX4_DEBUG
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 7845aa5bf6be..ef292fbb53c9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -7,6 +7,7 @@ config MLX5_CORE
tristate "Mellanox 5th generation network adapters (ConnectX series) core driver"
depends on PCI
select NET_DEVLINK
+ select DIMLIB
imply PTP_1588_CLOCK
imply VXLAN
imply MLXFW
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0742095355f2..7b03fb3e4f0b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -65,6 +65,7 @@
#include <rdma/restrack.h>
#include <uapi/rdma/rdma_user_ioctl.h>
#include <uapi/rdma/ib_user_ioctl_verbs.h>
+#include <linux/dim.h>
#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN
@@ -1638,6 +1639,8 @@ struct ib_cq {
* Implementation details of the RDMA core, don't use in drivers:
*/
struct rdma_restrack_entry res;
+ struct dim dim;
+ bool dim_used;
};
struct ib_srq {
@@ -3746,7 +3749,8 @@ static inline int ib_post_recv(struct ib_qp *qp,
struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
int nr_cqe, int comp_vector,
enum ib_poll_context poll_ctx,
- const char *caller, struct ib_udata *udata);
+ const char *caller, struct ib_udata *udata,
+ bool use_dim);
/**
* ib_alloc_cq_user: Allocate kernel/user CQ
@@ -3764,7 +3768,7 @@ static inline struct ib_cq *ib_alloc_cq_user(struct ib_device *dev,
struct ib_udata *udata)
{
return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
- KBUILD_MODNAME, udata);
+ KBUILD_MODNAME, udata, false);
}
/**
@@ -3785,6 +3789,25 @@ static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
NULL);
}
+/**
+ * ib_alloc_cq_dim: Allocate kernel CQ with dynamic interrupt moderation
+ * @dev: The IB device
+ * @private: Private data attached to the CQE
+ * @nr_cqe: Number of CQEs in the CQ
+ * @comp_vector: Completion vector used for the IRQs
+ * @poll_ctx: Context used for polling the CQ
+ *
+ * NOTE: for user cq use ib_alloc_cq_user with valid udata!
+ */
+static inline struct ib_cq *ib_alloc_cq_dim(struct ib_device *dev,
+ void *private, int nr_cqe,
+ int comp_vector,
+ enum ib_poll_context poll_ctx)
+{
+ return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
+ KBUILD_MODNAME, NULL, true);
+}
+
/**
* ib_free_cq_user - Free kernel/user CQ
* @cq: The CQ to free
--
2.21.0
Powered by blists - more mailing lists