[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180221201354.8802-2-saeedm@mellanox.com>
Date: Wed, 21 Feb 2018 12:13:48 -0800
From: Saeed Mahameed <saeedm@...lanox.com>
To: "David S. Miller" <davem@...emloft.net>,
Doug Ledford <dledford@...hat.com>
Cc: Leon Romanovsky <leonro@...lanox.com>,
Jason Gunthorpe <jgg@...lanox.com>, netdev@...r.kernel.org,
linux-rdma@...r.kernel.org, Saeed Mahameed <saeedm@...lanox.com>
Subject: [for-next 1/7] net/mlx5: CQ Database per EQ
Before this patch the driver had one CQ database protected via one
spinlock, this spinlock is meant to synchronize between CQ
adding/removing and CQ IRQ interrupt handling.
On a system with large number of CPUs and on a work load that requires
lots of interrupts, this global spinlock becomes a very nasty hotspot
and introduces a contention between the active cores, which will
significantly hurt performance and becomes a bottleneck that prevents
seamless cpu scaling.
To solve this we simply move the CQ database and its spinlock to be per
EQ (IRQ), thus per core.
Tested with:
system: 2 sockets, 14 cores per socket, hyperthreading, 2x14x2=56 cores
netperf command: ./super_netperf 200 -P 0 -t TCP_RR -H <server> -l 30 -- -r 300,300 -o -s 1M,1M -S 1M,1M
WITHOUT THIS PATCH:
Average: CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
Average: all 4.32 0.00 36.15 0.09 0.00 34.02 0.00 0.00 0.00 25.41
Samples: 2M of event 'cycles:pp', Event count (approx.): 1554616897271
Overhead Command Shared Object Symbol
+ 14.28% swapper [kernel.vmlinux] [k] intel_idle
+ 12.25% swapper [kernel.vmlinux] [k] queued_spin_lock_slowpath
+ 10.29% netserver [kernel.vmlinux] [k] queued_spin_lock_slowpath
+ 1.32% netserver [kernel.vmlinux] [k] mlx5e_xmit
WITH THIS PATCH:
Average: CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
Average: all 4.27 0.00 34.31 0.01 0.00 18.71 0.00 0.00 0.00 42.69
Samples: 2M of event 'cycles:pp', Event count (approx.): 1498132937483
Overhead Command Shared Object Symbol
+ 23.33% swapper [kernel.vmlinux] [k] intel_idle
+ 1.69% netserver [kernel.vmlinux] [k] mlx5e_xmit
Tested-by: Song Liu <songliubraving@...com>
Signed-off-by: Saeed Mahameed <saeedm@...lanox.com>
Reviewed-by: Gal Pressman <galp@...lanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/cq.c | 69 +++++++++++++++-----------
drivers/net/ethernet/mellanox/mlx5/core/eq.c | 10 +++-
drivers/net/ethernet/mellanox/mlx5/core/main.c | 8 +--
include/linux/mlx5/cq.h | 3 +-
include/linux/mlx5/driver.h | 22 ++++----
5 files changed, 62 insertions(+), 50 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index 1016e05c7ec7..dfbeeaa43276 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -86,10 +86,10 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq)
spin_unlock_irqrestore(&tasklet_ctx->lock, flags);
}
-void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn)
+void mlx5_cq_completion(struct mlx5_eq *eq, u32 cqn)
{
+ struct mlx5_cq_table *table = &eq->cq_table;
struct mlx5_core_cq *cq;
- struct mlx5_cq_table *table = &dev->priv.cq_table;
spin_lock(&table->lock);
cq = radix_tree_lookup(&table->tree, cqn);
@@ -98,7 +98,7 @@ void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn)
spin_unlock(&table->lock);
if (!cq) {
- mlx5_core_warn(dev, "Completion event for bogus CQ 0x%x\n", cqn);
+ mlx5_core_warn(eq->dev, "Completion event for bogus CQ 0x%x\n", cqn);
return;
}
@@ -110,9 +110,9 @@ void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn)
complete(&cq->free);
}
-void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type)
+void mlx5_cq_event(struct mlx5_eq *eq, u32 cqn, int event_type)
{
- struct mlx5_cq_table *table = &dev->priv.cq_table;
+ struct mlx5_cq_table *table = &eq->cq_table;
struct mlx5_core_cq *cq;
spin_lock(&table->lock);
@@ -124,7 +124,7 @@ void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type)
spin_unlock(&table->lock);
if (!cq) {
- mlx5_core_warn(dev, "Async event for bogus CQ 0x%x\n", cqn);
+ mlx5_core_warn(eq->dev, "Async event for bogus CQ 0x%x\n", cqn);
return;
}
@@ -137,19 +137,22 @@ void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type)
int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
u32 *in, int inlen)
{
- struct mlx5_cq_table *table = &dev->priv.cq_table;
u32 out[MLX5_ST_SZ_DW(create_cq_out)];
u32 din[MLX5_ST_SZ_DW(destroy_cq_in)];
u32 dout[MLX5_ST_SZ_DW(destroy_cq_out)];
int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context),
c_eqn);
- struct mlx5_eq *eq;
+ struct mlx5_eq *eq, *async_eq;
+ struct mlx5_cq_table *table;
int err;
+ async_eq = &dev->priv.eq_table.async_eq;
eq = mlx5_eqn2eq(dev, eqn);
if (IS_ERR(eq))
return PTR_ERR(eq);
+ table = &eq->cq_table;
+
memset(out, 0, sizeof(out));
MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ);
err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
@@ -159,6 +162,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
cq->cqn = MLX5_GET(create_cq_out, out, cqn);
cq->cons_index = 0;
cq->arm_sn = 0;
+ cq->eq = eq;
refcount_set(&cq->refcount, 1);
init_completion(&cq->free);
if (!cq->comp)
@@ -167,12 +171,20 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
cq->tasklet_ctx.priv = &eq->tasklet_ctx;
INIT_LIST_HEAD(&cq->tasklet_ctx.list);
+ /* Add to comp EQ CQ tree to recv comp events */
spin_lock_irq(&table->lock);
err = radix_tree_insert(&table->tree, cq->cqn, cq);
spin_unlock_irq(&table->lock);
if (err)
goto err_cmd;
+ /* Add to async EQ CQ tree to recv Async events */
+ spin_lock_irq(&async_eq->cq_table.lock);
+ err = radix_tree_insert(&async_eq->cq_table.tree, cq->cqn, cq);
+ spin_unlock_irq(&async_eq->cq_table.lock);
+ if (err)
+ goto err_cq_table;
+
cq->pid = current->pid;
err = mlx5_debug_cq_add(dev, cq);
if (err)
@@ -183,6 +195,10 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
return 0;
+err_cq_table:
+ spin_lock_irq(&table->lock);
+ radix_tree_delete(&table->tree, cq->cqn);
+ spin_unlock_irq(&table->lock);
err_cmd:
memset(din, 0, sizeof(din));
memset(dout, 0, sizeof(dout));
@@ -195,21 +211,34 @@ EXPORT_SYMBOL(mlx5_core_create_cq);
int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
{
- struct mlx5_cq_table *table = &dev->priv.cq_table;
+ struct mlx5_cq_table *asyn_eq_cq_table = &dev->priv.eq_table.async_eq.cq_table;
+ struct mlx5_cq_table *table = &cq->eq->cq_table;
u32 out[MLX5_ST_SZ_DW(destroy_cq_out)] = {0};
u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {0};
struct mlx5_core_cq *tmp;
int err;
+ spin_lock_irq(&asyn_eq_cq_table->lock);
+ tmp = radix_tree_delete(&asyn_eq_cq_table->tree, cq->cqn);
+ spin_unlock_irq(&asyn_eq_cq_table->lock);
+ if (!tmp) {
+ mlx5_core_warn(dev, "cq 0x%x not found in async eq cq tree\n", cq->cqn);
+ return -EINVAL;
+ }
+ if (tmp != cq) {
+ mlx5_core_warn(dev, "corruption on cqn 0x%x in async eq cq tree\n", cq->cqn);
+ return -EINVAL;
+ }
+
spin_lock_irq(&table->lock);
tmp = radix_tree_delete(&table->tree, cq->cqn);
spin_unlock_irq(&table->lock);
if (!tmp) {
- mlx5_core_warn(dev, "cq 0x%x not found in tree\n", cq->cqn);
+ mlx5_core_warn(dev, "cq 0x%x not found in comp eq cq tree\n", cq->cqn);
return -EINVAL;
}
if (tmp != cq) {
- mlx5_core_warn(dev, "corruption on srqn 0x%x\n", cq->cqn);
+ mlx5_core_warn(dev, "corruption on cqn 0x%x in comp eq cq tree\n", cq->cqn);
return -EINVAL;
}
@@ -270,21 +299,3 @@ int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev,
return mlx5_core_modify_cq(dev, cq, in, sizeof(in));
}
EXPORT_SYMBOL(mlx5_core_modify_cq_moderation);
-
-int mlx5_init_cq_table(struct mlx5_core_dev *dev)
-{
- struct mlx5_cq_table *table = &dev->priv.cq_table;
- int err;
-
- memset(table, 0, sizeof(*table));
- spin_lock_init(&table->lock);
- INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
- err = mlx5_cq_debugfs_init(dev);
-
- return err;
-}
-
-void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev)
-{
- mlx5_cq_debugfs_cleanup(dev);
-}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 25106e996a96..328403ebf2f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -415,7 +415,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
switch (eqe->type) {
case MLX5_EVENT_TYPE_COMP:
cqn = be32_to_cpu(eqe->data.comp.cqn) & 0xffffff;
- mlx5_cq_completion(dev, cqn);
+ mlx5_cq_completion(eq, cqn);
break;
case MLX5_EVENT_TYPE_DCT_DRAINED:
rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
@@ -472,7 +472,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrome 0x%x\n",
cqn, eqe->data.cq_err.syndrome);
- mlx5_cq_event(dev, cqn, eqe->type);
+ mlx5_cq_event(eq, cqn, eqe->type);
break;
case MLX5_EVENT_TYPE_PAGE_REQUEST:
@@ -567,6 +567,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
int nent, u64 mask, const char *name,
enum mlx5_eq_type type)
{
+ struct mlx5_cq_table *cq_table = &eq->cq_table;
u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
struct mlx5_priv *priv = &dev->priv;
irq_handler_t handler;
@@ -576,6 +577,11 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
u32 *in;
int err;
+ /* Init CQ table */
+ memset(cq_table, 0, sizeof(*cq_table));
+ spin_lock_init(&cq_table->lock);
+ INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+
eq->type = type;
eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE);
eq->cons_index = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 2ef641c91c26..8cc22bf80c87 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -942,9 +942,9 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
goto out;
}
- err = mlx5_init_cq_table(dev);
+ err = mlx5_cq_debugfs_init(dev);
if (err) {
- dev_err(&pdev->dev, "failed to initialize cq table\n");
+ dev_err(&pdev->dev, "failed to initialize cq debugfs\n");
goto err_eq_cleanup;
}
@@ -1002,7 +1002,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
mlx5_cleanup_mkey_table(dev);
mlx5_cleanup_srq_table(dev);
mlx5_cleanup_qp_table(dev);
- mlx5_cleanup_cq_table(dev);
+ mlx5_cq_debugfs_cleanup(dev);
err_eq_cleanup:
mlx5_eq_cleanup(dev);
@@ -1023,7 +1023,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
mlx5_cleanup_mkey_table(dev);
mlx5_cleanup_srq_table(dev);
mlx5_cleanup_qp_table(dev);
- mlx5_cleanup_cq_table(dev);
+ mlx5_cq_debugfs_cleanup(dev);
mlx5_eq_cleanup(dev);
}
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 48c181a2acc9..06ba425a6ad7 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -60,6 +60,7 @@ struct mlx5_core_cq {
} tasklet_ctx;
int reset_notify_added;
struct list_head reset_notify;
+ struct mlx5_eq *eq;
};
@@ -171,8 +172,6 @@ static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd,
mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, NULL);
}
-int mlx5_init_cq_table(struct mlx5_core_dev *dev);
-void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev);
int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
u32 *in, int inlen);
int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 6ed79a8a8318..96e003db2bcd 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -375,8 +375,15 @@ struct mlx5_eq_pagefault {
mempool_t *pool;
};
+struct mlx5_cq_table {
+ /* protect radix tree */
+ spinlock_t lock;
+ struct radix_tree_root tree;
+};
+
struct mlx5_eq {
struct mlx5_core_dev *dev;
+ struct mlx5_cq_table cq_table;
__be32 __iomem *doorbell;
u32 cons_index;
struct mlx5_buf buf;
@@ -526,13 +533,6 @@ struct mlx5_core_health {
struct delayed_work recover_work;
};
-struct mlx5_cq_table {
- /* protect radix tree
- */
- spinlock_t lock;
- struct radix_tree_root tree;
-};
-
struct mlx5_qp_table {
/* protect radix tree
*/
@@ -654,10 +654,6 @@ struct mlx5_priv {
struct dentry *cmdif_debugfs;
/* end: qp staff */
- /* start: cq staff */
- struct mlx5_cq_table cq_table;
- /* end: cq staff */
-
/* start: mkey staff */
struct mlx5_mkey_table mkey_table;
/* end: mkey staff */
@@ -1053,12 +1049,12 @@ int mlx5_eq_init(struct mlx5_core_dev *dev);
void mlx5_eq_cleanup(struct mlx5_core_dev *dev);
void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas);
void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas);
-void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
+void mlx5_cq_completion(struct mlx5_eq *eq, u32 cqn);
void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type);
void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type);
struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn);
void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced);
-void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type);
+void mlx5_cq_event(struct mlx5_eq *eq, u32 cqn, int event_type);
int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
int nent, u64 mask, const char *name,
enum mlx5_eq_type type);
--
2.14.3
Powered by blists - more mailing lists