[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ZBmlBEldcG6rMcM1@nvidia.com>
Date: Tue, 21 Mar 2023 09:37:24 -0300
From: Jason Gunthorpe <jgg@...dia.com>
To: Leon Romanovsky <leon@...nel.org>
Cc: Patrisious Haddad <phaddad@...dia.com>,
"David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>, linux-rdma@...r.kernel.org,
netdev@...r.kernel.org, Paolo Abeni <pabeni@...hat.com>,
Saeed Mahameed <saeedm@...dia.com>
Subject: Re: [PATCH rdma-next v1 2/3] RDMA/mlx5: Handling dct common resource
destruction upon firmware failure
On Tue, Mar 21, 2023 at 02:02:59PM +0200, Leon Romanovsky wrote:
> On Tue, Mar 21, 2023 at 08:53:35AM -0300, Jason Gunthorpe wrote:
> > On Tue, Mar 21, 2023 at 09:54:58AM +0200, Leon Romanovsky wrote:
> > > On Mon, Mar 20, 2023 at 04:18:14PM -0300, Jason Gunthorpe wrote:
> > > > On Thu, Mar 16, 2023 at 03:39:27PM +0200, Leon Romanovsky wrote:
> > > > > From: Patrisious Haddad <phaddad@...dia.com>
> > > > >
> > > > > Previously when destroying a DCT, if the firmware function for the
> > > > > destruction failed, the common resource would have been destroyed
> > > > > either way, since it was destroyed before the firmware object.
> > > > > Which leads to kernel warning "refcount_t: underflow" which indicates
> > > > > possible use-after-free.
> > > > > Which is triggered when we try to destroy the common resource for the
> > > > > second time and execute refcount_dec_and_test(&common->refcount).
> > > > >
> > > > > So, currently before destroying the common resource we check its
> > > > > refcount and continue with the destruction only if it isn't zero.
> > > >
> > > > This seems super sketchy
> > > >
> > > > If the destruction fails why not set the refcount back to 1?
> > >
> > > Because destruction will fail in destroy_rq_tracked() which is after
> > > destroy_resource_common().
> > >
> > > In first destruction attempt, we delete qp from radix tree and wait for all
> > > reference to drop. In order do not undo all this logic (setting 1 alone is
> > > not enough), it is much safer simply skip destroy_resource_common() in reentry
> > > case.
> >
> > This is the bug I pointed a long time ago, it is ordered wrong to
> > remove restrack before destruction is assured
>
> It is not restrack, but internal to mlx5_core structure.
>
> 176 static void destroy_resource_common(struct mlx5_ib_dev *dev,
> 177 struct mlx5_core_qp *qp)
> 178 {
> 179 struct mlx5_qp_table *table = &dev->qp_table;
> 180 unsigned long flags;
> 181
>
> ....
>
> 185 spin_lock_irqsave(&table->lock, flags);
> 186 radix_tree_delete(&table->tree,
> 187 qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN));
> 188 spin_unlock_irqrestore(&table->lock, flags);
> 189 mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
> 190 wait_for_completion(&qp->common.free);
> 191 }
Same basic issue.
"RSC"'s refcount stuff is really only for ODP to use, and the silly
pseudo locking should really just be rwsem not a refcount.
Get DCT out of that particular mess and the scheme is quite simple and
doesn't nee hacky stuff.
Please make a patch to remove radix tree from this code too...
diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
index bae0334d6e7f18..68009bff4bd544 100644
--- a/drivers/infiniband/hw/mlx5/qpc.c
+++ b/drivers/infiniband/hw/mlx5/qpc.c
@@ -88,23 +88,34 @@ static bool is_event_type_allowed(int rsc_type, int event_type)
}
}
+static int dct_event_notifier(struct mlx5_ib_dev *dev, struct mlx5_eqe *eqe)
+{
+ struct mlx5_core_dct *dct;
+ u32 qpn;
+
+ qpn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
+ xa_lock(&dev->qp_table.dct_xa);
+ dct = xa_load(&dev->qp_table.dct_xa, qpn);
+ if (dct)
+ complete(&dct->drained);
+ xa_unlock(&dev->qp_table.dct_xa);
+ return NOTIFY_OK;
+}
+
static int rsc_event_notifier(struct notifier_block *nb,
unsigned long type, void *data)
{
+ struct mlx5_ib_dev *dev =
+ container_of(nb, struct mlx5_ib_dev, qp_table.nb);
struct mlx5_core_rsc_common *common;
- struct mlx5_qp_table *table;
- struct mlx5_core_dct *dct;
+ struct mlx5_eqe *eqe = data;
u8 event_type = (u8)type;
struct mlx5_core_qp *qp;
- struct mlx5_eqe *eqe;
u32 rsn;
switch (event_type) {
case MLX5_EVENT_TYPE_DCT_DRAINED:
- eqe = data;
- rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
- rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN);
- break;
+ return dct_event_notifier(dev, eqe);
case MLX5_EVENT_TYPE_PATH_MIG:
case MLX5_EVENT_TYPE_COMM_EST:
case MLX5_EVENT_TYPE_SQ_DRAINED:
@@ -113,7 +124,6 @@ static int rsc_event_notifier(struct notifier_block *nb,
case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
- eqe = data;
rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN);
break;
@@ -121,8 +131,7 @@ static int rsc_event_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
}
- table = container_of(nb, struct mlx5_qp_table, nb);
- common = mlx5_get_rsc(table, rsn);
+ common = mlx5_get_rsc(&dev->qp_table, rsn);
if (!common)
return NOTIFY_OK;
@@ -137,11 +146,6 @@ static int rsc_event_notifier(struct notifier_block *nb,
qp->event(qp, event_type);
/* Need to put resource in event handler */
return NOTIFY_OK;
- case MLX5_RES_DCT:
- dct = (struct mlx5_core_dct *)common;
- if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED)
- complete(&dct->drained);
- break;
default:
break;
}
@@ -188,7 +192,7 @@ static void destroy_resource_common(struct mlx5_ib_dev *dev,
}
static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
- struct mlx5_core_dct *dct, bool need_cleanup)
+ struct mlx5_core_dct *dct)
{
u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {};
struct mlx5_core_qp *qp = &dct->mqp;
@@ -203,13 +207,14 @@ static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
}
wait_for_completion(&dct->drained);
destroy:
- if (need_cleanup)
- destroy_resource_common(dev, &dct->mqp);
MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT);
MLX5_SET(destroy_dct_in, in, dctn, qp->qpn);
MLX5_SET(destroy_dct_in, in, uid, qp->uid);
err = mlx5_cmd_exec_in(dev->mdev, destroy_dct, in);
- return err;
+ if (err)
+ return err;
+ xa_cmpxchg(&dev->qp_table.dct_xa, dct->mqp.qpn, dct, NULL, GFP_KERNEL);
+ return 0;
}
int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
@@ -227,13 +232,13 @@ int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
qp->qpn = MLX5_GET(create_dct_out, out, dctn);
qp->uid = MLX5_GET(create_dct_in, in, uid);
- err = create_resource_common(dev, qp, MLX5_RES_DCT);
+ err = xa_err(xa_store(&dev->qp_table.dct_xa, qp->qpn, dct, GFP_KERNEL));
if (err)
goto err_cmd;
return 0;
err_cmd:
- _mlx5_core_destroy_dct(dev, dct, false);
+ _mlx5_core_destroy_dct(dev, dct);
return err;
}
@@ -284,7 +289,7 @@ static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev,
int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
struct mlx5_core_dct *dct)
{
- return _mlx5_core_destroy_dct(dev, dct, true);
+ return _mlx5_core_destroy_dct(dev, dct);
}
int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp)
@@ -488,6 +493,7 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev)
spin_lock_init(&table->lock);
INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+ xa_init(&table->dct_xa);
mlx5_qp_debugfs_init(dev->mdev);
table->nb.notifier_call = rsc_event_notifier;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f33389b42209e4..87e19e6d07a94a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -381,7 +381,6 @@ enum mlx5_res_type {
MLX5_RES_SRQ = 3,
MLX5_RES_XSRQ = 4,
MLX5_RES_XRQ = 5,
- MLX5_RES_DCT = MLX5_EVENT_QUEUE_TYPE_DCT,
};
struct mlx5_core_rsc_common {
@@ -443,6 +442,7 @@ struct mlx5_core_health {
struct mlx5_qp_table {
struct notifier_block nb;
+ struct xarray dct_xa;
/* protect radix tree
*/
Powered by blists - more mailing lists