lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Wed, 15 Feb 2017 22:41:40 +0000
From:   Ben Hutchings <ben@...adent.org.uk>
To:     linux-kernel@...r.kernel.org, stable@...r.kernel.org
CC:     akpm@...ux-foundation.org, "Chuck Lever" <chuck.lever@...cle.com>,
        "J. Bruce Fields" <bfields@...hat.com>
Subject: [PATCH 3.16 065/306] svcrdma: Tail iovec leaves an orphaned DMA
 mapping

3.16.40-rc1 review patch.  If anyone has any objections, please let me know.

------------------

From: Chuck Lever <chuck.lever@...cle.com>

commit cace564f8b6260e806f5e28d7f192fd0e0c603ed upstream.

The ctxt's count field is overloaded to mean the number of pages in
the ctxt->page array and the number of SGEs in the ctxt->sge array.
Typically these two numbers are the same.

However, when an inline RPC reply is constructed from an xdr_buf
with a tail iovec, the head and tail often occupy the same page,
but each are DMA mapped independently. In that case, ->count equals
the number of pages, but it does not equal the number of SGEs.
There's one more SGE, for the tail iovec. Hence there is one more
DMA mapping than there are pages in the ctxt->page array.

This isn't a real problem until the server's iommu is enabled. Then
each RPC reply that has content in that iovec orphans a DMA mapping
that consists of real resources.

krb5i and krb5p always populate that tail iovec. After a couple
million sent krb5i/p RPC replies, the NFS server starts behaving
erratically. Reboot is needed to clear the problem.

Fixes: 9d11b51ce7c1 ("svcrdma: Fix send_reply() scatter/gather set-up")
Signed-off-by: Chuck Lever <chuck.lever@...cle.com>
Signed-off-by: J. Bruce Fields <bfields@...hat.com>
[bwh: Backported to 3.16:
 - Adjust context
 - Drop changes to svc_rdma_bc_sendto()
 - s/xprt->sc_pd->local_dma_lkey/xprt->sc_dma_lkey/
Signed-off-by: Ben Hutchings <ben@...adent.org.uk>
---
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -83,6 +83,7 @@ struct svc_rdma_op_ctxt {
 	unsigned long flags;
 	enum dma_data_direction direction;
 	int count;
+	unsigned int mapped_sges;
 	struct ib_sge sge[RPCSVC_MAXPAGES];
 	struct page *pages[RPCSVC_MAXPAGES];
 };
@@ -178,6 +179,14 @@ struct svcxprt_rdma {
 #define RPCRDMA_MAX_REQUESTS    16
 #define RPCRDMA_MAX_REQ_SIZE    4096
 
+/* Track DMA maps for this transport and context */
+static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma,
+					   struct svc_rdma_op_ctxt *ctxt)
+{
+	ctxt->mapped_sges++;
+	atomic_inc(&rdma->sc_dma_used);
+}
+
 /* svc_rdma_marshal.c */
 extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *,
 				      int *, int *);
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -178,7 +178,7 @@ static int rdma_read_chunk_lcl(struct sv
 					   ctxt->sge[pno].addr);
 		if (ret)
 			goto err;
-		atomic_inc(&xprt->sc_dma_used);
+		svc_rdma_count_mappings(xprt, ctxt);
 
 		/* The lkey here is either a local dma lkey or a dma_mr lkey */
 		ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -184,7 +184,7 @@ static int send_write(struct svcxprt_rdm
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 					 sge[sge_no].addr))
 			goto err;
-		atomic_inc(&xprt->sc_dma_used);
+		svc_rdma_count_mappings(xprt, ctxt);
 		sge[sge_no].lkey = xprt->sc_dma_lkey;
 		ctxt->count++;
 		sge_off = 0;
@@ -411,7 +411,7 @@ static int send_reply(struct svcxprt_rdm
 			    ctxt->sge[0].length, DMA_TO_DEVICE);
 	if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
 		goto err;
-	atomic_inc(&rdma->sc_dma_used);
+	svc_rdma_count_mappings(rdma, ctxt);
 
 	ctxt->direction = DMA_TO_DEVICE;
 
@@ -427,7 +427,7 @@ static int send_reply(struct svcxprt_rdm
 		if (ib_dma_mapping_error(rdma->sc_cm_id->device,
 					 ctxt->sge[sge_no].addr))
 			goto err;
-		atomic_inc(&rdma->sc_dma_used);
+		svc_rdma_count_mappings(rdma, ctxt);
 		ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
 		ctxt->sge[sge_no].length = sge_bytes;
 	}
@@ -442,23 +442,9 @@ static int send_reply(struct svcxprt_rdm
 		ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
 		ctxt->count++;
 		rqstp->rq_respages[page_no] = NULL;
-		/*
-		 * If there are more pages than SGE, terminate SGE
-		 * list so that svc_rdma_unmap_dma doesn't attempt to
-		 * unmap garbage.
-		 */
-		if (page_no+1 >= sge_no)
-			ctxt->sge[page_no+1].length = 0;
 	}
 	rqstp->rq_next_page = rqstp->rq_respages + 1;
 
-	/* The loop above bumps sc_dma_used for each sge. The
-	 * xdr_buf.tail gets a separate sge, but resides in the
-	 * same page as xdr_buf.head. Don't count it twice.
-	 */
-	if (sge_no > ctxt->count)
-		atomic_dec(&rdma->sc_dma_used);
-
 	BUG_ON(sge_no > rdma->sc_max_sge);
 	memset(&send_wr, 0, sizeof send_wr);
 	ctxt->wr_op = IB_WR_SEND;
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -108,6 +108,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_co
 	ctxt->xprt = xprt;
 	INIT_LIST_HEAD(&ctxt->dto_q);
 	ctxt->count = 0;
+	ctxt->mapped_sges = 0;
 	ctxt->frmr = NULL;
 	atomic_inc(&xprt->sc_ctxt_used);
 	return ctxt;
@@ -116,22 +117,27 @@ struct svc_rdma_op_ctxt *svc_rdma_get_co
 void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
 {
 	struct svcxprt_rdma *xprt = ctxt->xprt;
-	int i;
-	for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+	struct ib_device *device = xprt->sc_cm_id->device;
+	u32 lkey = xprt->sc_dma_lkey;
+	unsigned int i, count;
+
+	for (count = 0, i = 0; i < ctxt->mapped_sges; i++) {
 		/*
 		 * Unmap the DMA addr in the SGE if the lkey matches
 		 * the sc_dma_lkey, otherwise, ignore it since it is
 		 * an FRMR lkey and will be unmapped later when the
 		 * last WR that uses it completes.
 		 */
-		if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
-			atomic_dec(&xprt->sc_dma_used);
-			ib_dma_unmap_page(xprt->sc_cm_id->device,
+		if (ctxt->sge[i].lkey == lkey) {
+			count++;
+			ib_dma_unmap_page(device,
 					    ctxt->sge[i].addr,
 					    ctxt->sge[i].length,
 					    ctxt->direction);
 		}
 	}
+	ctxt->mapped_sges = 0;
+	atomic_sub(count, &xprt->sc_dma_used);
 }
 
 void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -521,7 +527,7 @@ int svc_rdma_post_recv(struct svcxprt_rd
 				     DMA_FROM_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
 			goto err_put_ctxt;
-		atomic_inc(&xprt->sc_dma_used);
+		svc_rdma_count_mappings(xprt, ctxt);
 		ctxt->sge[sge_no].addr = pa;
 		ctxt->sge[sge_no].length = PAGE_SIZE;
 		ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
@@ -1346,7 +1352,7 @@ void svc_rdma_send_error(struct svcxprt_
 		svc_rdma_put_context(ctxt, 1);
 		return;
 	}
-	atomic_inc(&xprt->sc_dma_used);
+	svc_rdma_count_mappings(xprt, ctxt);
 	ctxt->sge[0].lkey = xprt->sc_dma_lkey;
 	ctxt->sge[0].length = length;
 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ