lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20241018064101.336232-11-kanchana.p.sridhar@intel.com>
Date: Thu, 17 Oct 2024 23:40:58 -0700
From: Kanchana P Sridhar <kanchana.p.sridhar@...el.com>
To: linux-kernel@...r.kernel.org,
	linux-mm@...ck.org,
	hannes@...xchg.org,
	yosryahmed@...gle.com,
	nphamcs@...il.com,
	chengming.zhou@...ux.dev,
	usamaarif642@...il.com,
	ryan.roberts@....com,
	ying.huang@...el.com,
	21cnbao@...il.com,
	akpm@...ux-foundation.org,
	linux-crypto@...r.kernel.org,
	herbert@...dor.apana.org.au,
	davem@...emloft.net,
	clabbe@...libre.com,
	ardb@...nel.org,
	ebiggers@...gle.com,
	surenb@...gle.com,
	kristen.c.accardi@...el.com,
	zanussi@...nel.org,
	viro@...iv.linux.org.uk,
	brauner@...nel.org,
	jack@...e.cz,
	mcgrof@...nel.org,
	kees@...nel.org,
	joel.granados@...nel.org,
	bfoster@...hat.com,
	willy@...radead.org,
	linux-fsdevel@...r.kernel.org
Cc: wajdi.k.feghali@...el.com,
	vinodh.gopal@...el.com,
	kanchana.p.sridhar@...el.com
Subject: [RFC PATCH v1 10/13] mm: zswap: Create multiple reqs/buffers in crypto_acomp_ctx if platform has IAA.

Intel IAA hardware acceleration can be used effectively to improve the
zswap_store() performance of large folios by batching multiple pages in a
folio to be compressed in parallel by IAA. Hence, to build compress batching
of zswap large folio stores using IAA, we need to be able to submit a batch
of compress jobs from zswap to the hardware to compress in parallel if the
iaa_crypto "async" mode is used.

The IAA compress batching paradigm works as follows:

 1) Submit N crypto_acomp_compress() jobs using N requests.
 2) Use the iaa_crypto driver async poll() method to check for the jobs
    to complete.
 3) There are no ordering constraints implied by submission, hence we
    could loop through the requests and process any job that has
    completed.
 4) This would repeat until all jobs have completed with success/error
    status.

To facilitate this, we need to provide for multiple acomp_reqs in
"struct crypto_acomp_ctx", each representing a distinct compress
job. Likewise, there needs to be a distinct destination buffer
corresponding to each acomp_req.

If CONFIG_ZSWAP_STORE_BATCHING_ENABLED is enabled, this patch will set the
SWAP_CRYPTO_SUB_BATCH_SIZE constant to 8UL. This implies each per-cpu
crypto_acomp_ctx associated with the zswap_pool can submit up to 8
acomp_reqs at a time to accomplish parallel compressions.

If IAA is not present and/or CONFIG_ZSWAP_STORE_BATCHING_ENABLED is not
set, SWAP_CRYPTO_SUB_BATCH_SIZE will be set to 1UL.

On an Intel Sapphire Rapids server, each socket has 4 IAA, each of which
has 2 compress engines and 8 decompress engines. Experiments modeling a
contended system with say 72 processes running under a cgroup with a fixed
memory-limit, have shown that there is a significant performance
improvement with dispatching compress jobs from all cores to all the
IAA devices on the socket. Hence, SWAP_CRYPTO_SUB_BATCH_SIZE is set to
8 to maximize compression throughput if IAA is available.

The definition of "struct crypto_acomp_ctx" is modified to make the
req/buffer be arrays of size SWAP_CRYPTO_SUB_BATCH_SIZE. Thus, the
added memory footprint cost of this per-cpu structure for batching is
incurred only for platforms that have Intel IAA.

Suggested-by: Ying Huang <ying.huang@...el.com>
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@...el.com>
---
 mm/swap.h  |  11 ++++++
 mm/zswap.c | 104 ++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 78 insertions(+), 37 deletions(-)

diff --git a/mm/swap.h b/mm/swap.h
index ad2f121de970..566616c971d4 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -8,6 +8,17 @@ struct mempolicy;
 #include <linux/swapops.h> /* for swp_offset */
 #include <linux/blk_types.h> /* for bio_end_io_t */
 
+/*
+ * For IAA compression batching:
+ * Maximum number of IAA acomp compress requests that will be processed
+ * in a sub-batch.
+ */
+#if defined(CONFIG_ZSWAP_STORE_BATCHING_ENABLED)
+#define SWAP_CRYPTO_SUB_BATCH_SIZE 8UL
+#else
+#define SWAP_CRYPTO_SUB_BATCH_SIZE 1UL
+#endif
+
 /* linux/mm/page_io.c */
 int sio_pool_init(void);
 struct swap_iocb;
diff --git a/mm/zswap.c b/mm/zswap.c
index 4893302d8c34..579869d1bdf6 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -152,9 +152,9 @@ bool zswap_never_enabled(void)
 
 struct crypto_acomp_ctx {
 	struct crypto_acomp *acomp;
-	struct acomp_req *req;
+	struct acomp_req *req[SWAP_CRYPTO_SUB_BATCH_SIZE];
+	u8 *buffer[SWAP_CRYPTO_SUB_BATCH_SIZE];
 	struct crypto_wait wait;
-	u8 *buffer;
 	struct mutex mutex;
 	bool is_sleepable;
 };
@@ -832,49 +832,64 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
 	struct crypto_acomp *acomp;
-	struct acomp_req *req;
 	int ret;
+	int i, j;
 
 	mutex_init(&acomp_ctx->mutex);
 
-	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
-	if (!acomp_ctx->buffer)
-		return -ENOMEM;
-
 	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
 	if (IS_ERR(acomp)) {
 		pr_err("could not alloc crypto acomp %s : %ld\n",
 				pool->tfm_name, PTR_ERR(acomp));
-		ret = PTR_ERR(acomp);
-		goto acomp_fail;
+		return PTR_ERR(acomp);
 	}
 	acomp_ctx->acomp = acomp;
 	acomp_ctx->is_sleepable = acomp_is_async(acomp);
 
-	req = acomp_request_alloc(acomp_ctx->acomp);
-	if (!req) {
-		pr_err("could not alloc crypto acomp_request %s\n",
-		       pool->tfm_name);
-		ret = -ENOMEM;
-		goto req_fail;
+	for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i) {
+		acomp_ctx->buffer[i] = kmalloc_node(PAGE_SIZE * 2,
+						GFP_KERNEL, cpu_to_node(cpu));
+		if (!acomp_ctx->buffer[i]) {
+			for (j = 0; j < i; ++j)
+				kfree(acomp_ctx->buffer[j]);
+			ret = -ENOMEM;
+			goto buf_fail;
+		}
+	}
+
+	for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i) {
+		acomp_ctx->req[i] = acomp_request_alloc(acomp_ctx->acomp);
+		if (!acomp_ctx->req[i]) {
+			pr_err("could not alloc crypto acomp_request req[%d] %s\n",
+			       i, pool->tfm_name);
+			for (j = 0; j < i; ++j)
+				acomp_request_free(acomp_ctx->req[j]);
+			ret = -ENOMEM;
+			goto req_fail;
+		}
 	}
-	acomp_ctx->req = req;
 
+	/*
+	 * The crypto_wait is used only in fully synchronous, i.e., with scomp
+	 * or non-poll mode of acomp, hence there is only one "wait" per
+	 * acomp_ctx, with callback set to req[0].
+	 */
 	crypto_init_wait(&acomp_ctx->wait);
 	/*
 	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
 	 * won't be called, crypto_wait_req() will return without blocking.
 	 */
-	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+	acomp_request_set_callback(acomp_ctx->req[0], CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   crypto_req_done, &acomp_ctx->wait);
 
 	return 0;
 
 req_fail:
+	for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
+		kfree(acomp_ctx->buffer[i]);
+buf_fail:
 	crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
-	kfree(acomp_ctx->buffer);
 	return ret;
 }
 
@@ -884,11 +899,17 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
 
 	if (!IS_ERR_OR_NULL(acomp_ctx)) {
-		if (!IS_ERR_OR_NULL(acomp_ctx->req))
-			acomp_request_free(acomp_ctx->req);
+		int i;
+
+		for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
+			if (!IS_ERR_OR_NULL(acomp_ctx->req[i]))
+				acomp_request_free(acomp_ctx->req[i]);
+
+		for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
+			kfree(acomp_ctx->buffer[i]);
+
 		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
 			crypto_free_acomp(acomp_ctx->acomp);
-		kfree(acomp_ctx->buffer);
 	}
 
 	return 0;
@@ -911,7 +932,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 
 	mutex_lock(&acomp_ctx->mutex);
 
-	dst = acomp_ctx->buffer;
+	dst = acomp_ctx->buffer[0];
 	sg_init_table(&input, 1);
 	sg_set_page(&input, page, PAGE_SIZE, 0);
 
@@ -921,7 +942,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	 * giving the dst buffer with enough length to avoid buffer overflow.
 	 */
 	sg_init_one(&output, dst, PAGE_SIZE * 2);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+	acomp_request_set_params(acomp_ctx->req[0], &input, &output, PAGE_SIZE, dlen);
 
 	/*
 	 * If the crypto_acomp provides an asynchronous poll() interface,
@@ -940,19 +961,20 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	 * parallel.
 	 */
 	if (acomp_ctx->acomp->poll) {
-		comp_ret = crypto_acomp_compress(acomp_ctx->req);
+		comp_ret = crypto_acomp_compress(acomp_ctx->req[0]);
 		if (comp_ret == -EINPROGRESS) {
 			do {
-				comp_ret = crypto_acomp_poll(acomp_ctx->req);
+				comp_ret = crypto_acomp_poll(acomp_ctx->req[0]);
 				if (comp_ret && comp_ret != -EAGAIN)
 					break;
 			} while (comp_ret);
 		}
 	} else {
-		comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+		comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req[0]),
+					   &acomp_ctx->wait);
 	}
 
-	dlen = acomp_ctx->req->dlen;
+	dlen = acomp_ctx->req[0]->dlen;
 	if (comp_ret)
 		goto unlock;
 
@@ -1006,31 +1028,39 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	 */
 	if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
 	    !virt_addr_valid(src)) {
-		memcpy(acomp_ctx->buffer, src, entry->length);
-		src = acomp_ctx->buffer;
+		memcpy(acomp_ctx->buffer[0], src, entry->length);
+		src = acomp_ctx->buffer[0];
 		zpool_unmap_handle(zpool, entry->handle);
 	}
 
 	sg_init_one(&input, src, entry->length);
 	sg_init_table(&output, 1);
 	sg_set_folio(&output, folio, PAGE_SIZE, 0);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
+	acomp_request_set_params(acomp_ctx->req[0], &input, &output,
+				 entry->length, PAGE_SIZE);
 	if (acomp_ctx->acomp->poll) {
-		ret = crypto_acomp_decompress(acomp_ctx->req);
+		ret = crypto_acomp_decompress(acomp_ctx->req[0]);
 		if (ret == -EINPROGRESS) {
 			do {
-				ret = crypto_acomp_poll(acomp_ctx->req);
+				ret = crypto_acomp_poll(acomp_ctx->req[0]);
 				BUG_ON(ret && ret != -EAGAIN);
 			} while (ret);
 		}
 	} else {
-		BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
+		BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req[0]),
+				       &acomp_ctx->wait));
 	}
-	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
-	mutex_unlock(&acomp_ctx->mutex);
+	BUG_ON(acomp_ctx->req[0]->dlen != PAGE_SIZE);
 
-	if (src != acomp_ctx->buffer)
+	if (src != acomp_ctx->buffer[0])
 		zpool_unmap_handle(zpool, entry->handle);
+
+	/*
+	 * It is safer to unlock the mutex after the check for
+	 * "src != acomp_ctx->buffer[0]" so that the value of "src"
+	 * does not change.
+	 */
+	mutex_unlock(&acomp_ctx->mutex);
 }
 
 /*********************************
-- 
2.27.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ