netdev - Re: [PATCH rdma-next v1 2/3] RDMA/mlx5: Handling dct common resource destruction upon firmware failure

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ZBmlBEldcG6rMcM1@nvidia.com>
Date:   Tue, 21 Mar 2023 09:37:24 -0300
From:   Jason Gunthorpe <jgg@...dia.com>
To:     Leon Romanovsky <leon@...nel.org>
Cc:     Patrisious Haddad <phaddad@...dia.com>,
        "David S. Miller" <davem@...emloft.net>,
        Eric Dumazet <edumazet@...gle.com>,
        Jakub Kicinski <kuba@...nel.org>, linux-rdma@...r.kernel.org,
        netdev@...r.kernel.org, Paolo Abeni <pabeni@...hat.com>,
        Saeed Mahameed <saeedm@...dia.com>
Subject: Re: [PATCH rdma-next v1 2/3] RDMA/mlx5: Handling dct common resource
 destruction upon firmware failure

On Tue, Mar 21, 2023 at 02:02:59PM +0200, Leon Romanovsky wrote:
> On Tue, Mar 21, 2023 at 08:53:35AM -0300, Jason Gunthorpe wrote:
> > On Tue, Mar 21, 2023 at 09:54:58AM +0200, Leon Romanovsky wrote:
> > > On Mon, Mar 20, 2023 at 04:18:14PM -0300, Jason Gunthorpe wrote:
> > > > On Thu, Mar 16, 2023 at 03:39:27PM +0200, Leon Romanovsky wrote:
> > > > > From: Patrisious Haddad <phaddad@...dia.com>
> > > > > 
> > > > > Previously when destroying a DCT, if the firmware function for the
> > > > > destruction failed, the common resource would have been destroyed
> > > > > either way, since it was destroyed before the firmware object.
> > > > > Which leads to kernel warning "refcount_t: underflow" which indicates
> > > > > possible use-after-free.
> > > > > Which is triggered when we try to destroy the common resource for the
> > > > > second time and execute refcount_dec_and_test(&common->refcount).
> > > > > 
> > > > > So, currently before destroying the common resource we check its
> > > > > refcount and continue with the destruction only if it isn't zero.
> > > > 
> > > > This seems super sketchy
> > > > 
> > > > If the destruction fails why not set the refcount back to 1?
> > > 
> > > Because destruction will fail in destroy_rq_tracked() which is after
> > > destroy_resource_common().
> > > 
> > > In first destruction attempt, we delete qp from radix tree and wait for all
> > > reference to drop. In order do not undo all this logic (setting 1 alone is
> > > not enough), it is much safer simply skip destroy_resource_common() in reentry
> > > case.
> > 
> > This is the bug I pointed a long time ago, it is ordered wrong to
> > remove restrack before destruction is assured
> 
> It is not restrack, but internal to mlx5_core structure.
> 
>   176 static void destroy_resource_common(struct mlx5_ib_dev *dev,
>   177                                     struct mlx5_core_qp *qp)
>   178 {
>   179         struct mlx5_qp_table *table = &dev->qp_table;
>   180         unsigned long flags;
>   181
> 
> ....
> 
>   185         spin_lock_irqsave(&table->lock, flags);
>   186         radix_tree_delete(&table->tree,
>   187                           qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN));
>   188         spin_unlock_irqrestore(&table->lock, flags);
>   189         mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
>   190         wait_for_completion(&qp->common.free);
>   191 }

Same basic issue.

"RSC"'s refcount stuff is really only for ODP to use, and the silly
pseudo locking should really just be rwsem not a refcount.

Get DCT out of that particular mess and the scheme is quite simple and
doesn't nee hacky stuff.

Please make a patch to remove radix tree from this code too...

diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
index bae0334d6e7f18..68009bff4bd544 100644
--- a/drivers/infiniband/hw/mlx5/qpc.c
+++ b/drivers/infiniband/hw/mlx5/qpc.c
@@ -88,23 +88,34 @@ static bool is_event_type_allowed(int rsc_type, int event_type)
 	}
 }
 
+static int dct_event_notifier(struct mlx5_ib_dev *dev, struct mlx5_eqe *eqe)
+{
+	struct mlx5_core_dct *dct;
+	u32 qpn;
+
+	qpn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
+	xa_lock(&dev->qp_table.dct_xa);
+	dct = xa_load(&dev->qp_table.dct_xa, qpn);
+	if (dct)
+		complete(&dct->drained);
+	xa_unlock(&dev->qp_table.dct_xa);
+	return NOTIFY_OK;
+}
+
 static int rsc_event_notifier(struct notifier_block *nb,
 			      unsigned long type, void *data)
 {
+	struct mlx5_ib_dev *dev =
+		container_of(nb, struct mlx5_ib_dev, qp_table.nb);
 	struct mlx5_core_rsc_common *common;
-	struct mlx5_qp_table *table;
-	struct mlx5_core_dct *dct;
+	struct mlx5_eqe *eqe = data;
 	u8 event_type = (u8)type;
 	struct mlx5_core_qp *qp;
-	struct mlx5_eqe *eqe;
 	u32 rsn;
 
 	switch (event_type) {
 	case MLX5_EVENT_TYPE_DCT_DRAINED:
-		eqe = data;
-		rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
-		rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN);
-		break;
+		return dct_event_notifier(dev, eqe);
 	case MLX5_EVENT_TYPE_PATH_MIG:
 	case MLX5_EVENT_TYPE_COMM_EST:
 	case MLX5_EVENT_TYPE_SQ_DRAINED:
@@ -113,7 +124,6 @@ static int rsc_event_notifier(struct notifier_block *nb,
 	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
 	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
 	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
-		eqe = data;
 		rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
 		rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN);
 		break;
@@ -121,8 +131,7 @@ static int rsc_event_notifier(struct notifier_block *nb,
 		return NOTIFY_DONE;
 	}
 
-	table = container_of(nb, struct mlx5_qp_table, nb);
-	common = mlx5_get_rsc(table, rsn);
+	common = mlx5_get_rsc(&dev->qp_table, rsn);
 	if (!common)
 		return NOTIFY_OK;
 
@@ -137,11 +146,6 @@ static int rsc_event_notifier(struct notifier_block *nb,
 		qp->event(qp, event_type);
 		/* Need to put resource in event handler */
 		return NOTIFY_OK;
-	case MLX5_RES_DCT:
-		dct = (struct mlx5_core_dct *)common;
-		if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED)
-			complete(&dct->drained);
-		break;
 	default:
 		break;
 	}
@@ -188,7 +192,7 @@ static void destroy_resource_common(struct mlx5_ib_dev *dev,
 }
 
 static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
-				  struct mlx5_core_dct *dct, bool need_cleanup)
+				  struct mlx5_core_dct *dct)
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {};
 	struct mlx5_core_qp *qp = &dct->mqp;
@@ -203,13 +207,14 @@ static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
 	}
 	wait_for_completion(&dct->drained);
 destroy:
-	if (need_cleanup)
-		destroy_resource_common(dev, &dct->mqp);
 	MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT);
 	MLX5_SET(destroy_dct_in, in, dctn, qp->qpn);
 	MLX5_SET(destroy_dct_in, in, uid, qp->uid);
 	err = mlx5_cmd_exec_in(dev->mdev, destroy_dct, in);
-	return err;
+	if (err)
+		return err;
+	xa_cmpxchg(&dev->qp_table.dct_xa, dct->mqp.qpn, dct, NULL, GFP_KERNEL);
+	return 0;
 }
 
 int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
@@ -227,13 +232,13 @@ int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
 
 	qp->qpn = MLX5_GET(create_dct_out, out, dctn);
 	qp->uid = MLX5_GET(create_dct_in, in, uid);
-	err = create_resource_common(dev, qp, MLX5_RES_DCT);
+	err = xa_err(xa_store(&dev->qp_table.dct_xa, qp->qpn, dct, GFP_KERNEL));
 	if (err)
 		goto err_cmd;
 
 	return 0;
 err_cmd:
-	_mlx5_core_destroy_dct(dev, dct, false);
+	_mlx5_core_destroy_dct(dev, dct);
 	return err;
 }
 
@@ -284,7 +289,7 @@ static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev,
 int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
 			  struct mlx5_core_dct *dct)
 {
-	return _mlx5_core_destroy_dct(dev, dct, true);
+	return _mlx5_core_destroy_dct(dev, dct);
 }
 
 int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp)
@@ -488,6 +493,7 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev)
 
 	spin_lock_init(&table->lock);
 	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+	xa_init(&table->dct_xa);
 	mlx5_qp_debugfs_init(dev->mdev);
 
 	table->nb.notifier_call = rsc_event_notifier;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f33389b42209e4..87e19e6d07a94a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -381,7 +381,6 @@ enum mlx5_res_type {
 	MLX5_RES_SRQ	= 3,
 	MLX5_RES_XSRQ	= 4,
 	MLX5_RES_XRQ	= 5,
-	MLX5_RES_DCT	= MLX5_EVENT_QUEUE_TYPE_DCT,
 };
 
 struct mlx5_core_rsc_common {
@@ -443,6 +442,7 @@ struct mlx5_core_health {
 
 struct mlx5_qp_table {
 	struct notifier_block   nb;
+	struct xarray dct_xa;
 
 	/* protect radix tree
 	 */