Message-Id: <20260125033537.334628-19-kanchana.p.sridhar@intel.com>
Date: Sat, 24 Jan 2026 19:35:29 -0800
From: Kanchana P Sridhar <kanchana.p.sridhar@...el.com>
To: linux-kernel@...r.kernel.org,
linux-mm@...ck.org,
hannes@...xchg.org,
yosry.ahmed@...ux.dev,
nphamcs@...il.com,
chengming.zhou@...ux.dev,
usamaarif642@...il.com,
ryan.roberts@....com,
21cnbao@...il.com,
ying.huang@...ux.alibaba.com,
akpm@...ux-foundation.org,
senozhatsky@...omium.org,
sj@...nel.org,
kasong@...cent.com,
linux-crypto@...r.kernel.org,
herbert@...dor.apana.org.au,
davem@...emloft.net,
clabbe@...libre.com,
ardb@...nel.org,
ebiggers@...gle.com,
surenb@...gle.com,
kristen.c.accardi@...el.com,
vinicius.gomes@...el.com,
giovanni.cabiddu@...el.com
Cc: wajdi.k.feghali@...el.com,
kanchana.p.sridhar@...el.com
Subject: [PATCH v14 18/26] crypto: acomp, iaa - crypto_acomp integration of IAA Batching.
This commit makes the changes necessary to correctly integrate IAA
compress/decompress batching with the crypto_acomp API, per the
discussions in [1]. Further, the iaa_crypto driver sets the
CRYPTO_ALG_REQ_SEG crypto_alg flag to indicate support for segmentation.
To provide context from the perspective of a kernel user such as zswap:
the caller interfaces with these batching APIs by setting up the
acomp_req through the following crypto API calls, designating multiple
src/dst SG lists that represent the batch being sent to iaa_crypto:

  acomp_request_set_src_folio()
  acomp_request_set_dst_sg()
  acomp_request_set_unit_size()

before invoking batch compression through the existing
crypto_acomp_compress() interface, as sketched below.
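
As an illustration only (not part of this patch), a zswap-like caller
could set up a batched compress roughly as follows. The dst SG layout,
the nr_pages/dst_sgls names and the wrapper itself are illustrative
assumptions; acomp_request_set_unit_size() is the API introduced earlier
in this series:

  /*
   * Illustrative sketch: batch-compress @nr_pages pages of @folio into
   * @dst_sgls, assumed here to be one single-entry SG list per page.
   */
  static int example_batch_compress(struct acomp_req *req,
                                    struct folio *folio,
                                    struct scatterlist *dst_sgls,
                                    unsigned int nr_pages)
  {
          /* The batch's source data is the whole folio. */
          acomp_request_set_src_folio(req, folio, 0, nr_pages * PAGE_SIZE);
          /* Destination SG lists; one PAGE_SIZE unit each (assumption). */
          acomp_request_set_dst_sg(req, dst_sgls, PAGE_SIZE);
          /* Each batch segment ("unit") is one page. */
          acomp_request_set_unit_size(req, PAGE_SIZE);

          /* slen > unit_size selects the parallel batching path. */
          return crypto_acomp_compress(req);
  }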
Within crypto_acomp_compress(), an acomp_req whose tfm supports
segmentation is further checked for an "slen" greater than the request's
unit_size. If so, acomp_do_req_batch_parallel() is invoked, analogous to
the acomp_do_req_chain() case.
acomp_do_req_batch_parallel() creates a wait_queue_head
"batch_parallel_wq", stores a pointer to it in the acomp_req's "__ctx",
then calls tfm->compress()/tfm->decompress().

Next, the iaa_crypto driver alg's compress() implementation submits the
batch's sub-requests and immediately returns to
acomp_do_req_batch_parallel(), which then waits on "batch_parallel_wq"
until it is notified via a tfm->batch_completed() event.
To support this, a "batch_completed()" API is added to
"struct crypto_acomp" and "struct acomp_alg".
The iaa_crypto driver alg's batch_completed() implementation waits for
each batch sub-request to complete and then notifies batch_parallel_wq.
If any sub-request has an error, -EINVAL is returned to the acomp_req's
callback; otherwise 0 is returned.
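
For illustration only (again, not part of this patch), a caller could
then consume the per-sub-request results according to the contract
described above; store_compressed_page() is a hypothetical helper and
example_batch_compress() is the illustrative wrapper from the earlier
sketch:

  int i, err = example_batch_compress(req, folio, dst_sgls, nr_pages);

  if (err)
          return err;     /* at least one sub-request failed */

  for (i = 0; i < nr_pages; i++) {
          /*
           * Per the batching contract, each dst SG list's length now
           * holds that sub-request's compressed length (or an error).
           */
          unsigned int dlen = dst_sgls[i].length;

          store_compressed_page(i, dlen); /* hypothetical helper */
  }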
[1]: https://lore.kernel.org/all/aRqSqQxR4eHzvb2g@gondor.apana.org.au/
Suggested-by: Herbert Xu <herbert@...dor.apana.org.au>
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@...el.com>
---
crypto/acompress.c | 63 ++++++++++
drivers/crypto/intel/iaa/iaa_crypto.h | 3 +
drivers/crypto/intel/iaa/iaa_crypto_main.c | 137 +++++++++++++++++++--
include/crypto/acompress.h | 7 ++
include/crypto/internal/acompress.h | 7 ++
5 files changed, 210 insertions(+), 7 deletions(-)
diff --git a/crypto/acompress.c b/crypto/acompress.c
index cfb8ede02cf4..c48a1a20e21f 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -105,6 +105,7 @@ static int crypto_acomp_init_tfm(struct crypto_tfm *tfm)
acomp->compress = alg->compress;
acomp->decompress = alg->decompress;
+ acomp->batch_completed = alg->batch_completed;
acomp->reqsize = alg->base.cra_reqsize;
acomp->base.exit = crypto_acomp_exit_tfm;
@@ -291,6 +292,65 @@ static __always_inline int acomp_do_req_chain(struct acomp_req *req, bool comp)
return acomp_reqchain_finish(req, err);
}
+static int acomp_do_req_batch_parallel(struct acomp_req *req, bool comp)
+{
+ struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
+ unsigned long *bpwq_addr = acomp_request_ctx(req);
+ wait_queue_head_t batch_parallel_wq;
+ int ret;
+
+ init_waitqueue_head(&batch_parallel_wq);
+ *bpwq_addr = (unsigned long)&batch_parallel_wq;
+
+ ret = comp ? tfm->compress(req) : tfm->decompress(req);
+
+ wait_event(batch_parallel_wq, tfm->batch_completed(req, comp));
+
+ if (req->slen < 0)
+ ret |= -EINVAL;
+
+ return ret;
+}
+
+/**
+ * Please note:
+ * ============
+ *
+ * 1) If @req->unit_size is 0, there is no impact to existing acomp users.
+ *
+ * 2) If @req->unit_size is non-0 (e.g., zswap compress batching) and
+ * @req->src and @req->dst are scatterlists:
+ *
+ * a) Algorithms that do not support segmentation:
+ *
+ * We call acomp_do_req_chain(), which handles the trivial case when
+ * the caller has passed exactly one segment. The dst SG list's length is
+ * set to the compression error/compressed length for that segment.
+ *
+ * b) Algorithms that support segmentation:
+ *
+ * If the source length is more than @req->unit_size,
+ * acomp_do_req_batch_parallel() is invoked: this calls the tfm's
+ * compress() API, which uses the @req->unit_size being greater than
+ * @req->slen to ascertain that it needs to do batching. The algorithm's
+ * compress() implementation submits the batch's sub-requests for
+ * compression and returns.
+ *
+ * Algorithms that support batching must provide a batch_completed() API.
+ * When the batch's compression sub-requests have completed, they must
+ * notify a wait_queue using the batch_completed() API. The batching tfm
+ * implementation must set the dst SG lists to contain the individual
+ * sub-requests' error/compressed lengths.
+ *
+ * If the source length == @req->unit_size, the tfm's compress() API is
+ * invoked. The assumption is that segmentation algorithms will internally
+ * set the dst SG list's length to indicate error/compressed length in
+ * this case, similar to the batching case.
+ *
+ * 3) To prevent functional/performance regressions, we preserve existing
+ * behavior in all other cases, such as when @req->unit_size is non-0 and
+ * @req->src and/or @req->dst is virtual, instead of returning an error.
+ */
int crypto_acomp_compress(struct acomp_req *req)
{
struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
@@ -302,6 +362,9 @@ int crypto_acomp_compress(struct acomp_req *req)
if (!crypto_acomp_req_seg(tfm))
return acomp_do_req_chain(req, true);
+ if (likely((req->slen > req->unit_size) && tfm->batch_completed))
+ return acomp_do_req_batch_parallel(req, true);
+
return tfm->compress(req);
}
diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/iaa/iaa_crypto.h
index db83c21e92f1..d85a8f1cbb93 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto.h
@@ -69,10 +69,13 @@
* IAA. In other words, don't make any assumptions, and protect
* compression/decompression data.
*
+ * @data: Driver internal data to interface with crypto_acomp.
+ *
*/
struct iaa_batch_ctx {
struct iaa_req **reqs;
struct mutex mutex;
+ void *data;
};
#define IAA_COMP_MODES_MAX IAA_MODE_NONE
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 8d83a1ea15d7..915bf9b17b39 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -2524,6 +2524,71 @@ static void compression_ctx_init(struct iaa_compression_ctx *ctx, enum iaa_mode
* Interfaces to crypto_alg and crypto_acomp.
*********************************************/
+static __always_inline int iaa_crypto_acomp_acompress_batch(
+ struct iaa_compression_ctx *ctx,
+ struct iaa_req *parent_req,
+ struct iaa_req **reqs,
+ unsigned int unit_size)
+{
+ int nr_reqs = parent_req->slen / unit_size;
+
+ return iaa_comp_submit_acompress_batch(ctx, parent_req, reqs, nr_reqs, unit_size);
+}
+
+static __always_inline int iaa_crypto_acomp_adecompress_batch(
+ struct iaa_compression_ctx *ctx,
+ struct iaa_req *parent_req,
+ struct iaa_req **reqs,
+ unsigned int unit_size)
+{
+ int nr_reqs = parent_req->dlen / unit_size;
+
+ return iaa_comp_submit_adecompress_batch(ctx, parent_req, reqs, nr_reqs);
+}
+
+static bool iaa_crypto_acomp_batch_completed(struct acomp_req *areq, bool comp)
+{
+ unsigned long *cpu_ctx_addr = acomp_request_ctx(areq);
+ struct iaa_batch_ctx *cpu_ctx = (struct iaa_batch_ctx *)*cpu_ctx_addr;
+ wait_queue_head_t *batch_parallel_wq = (wait_queue_head_t *)cpu_ctx->data;
+ struct iaa_req **reqs = cpu_ctx->reqs;
+ int nr_reqs = (comp ? areq->slen : areq->dlen) / areq->unit_size;
+
+ /*
+ * Since both compress and decompress require the eventual
+ * caller (zswap) to verify @areq->dlen, we use @areq->slen to
+ * flag the batch's success/error to crypto_acomp, which will
+ * return this as the @err status to the crypto_acomp callback
+ * function.
+ */
+ if (iaa_comp_batch_completed(NULL, reqs, nr_reqs))
+ areq->slen = -EINVAL;
+
+ /*
+ * Set the acomp_req's dlen to be the first SG list's
+ * compressed/decompressed length/error value to enable zswap code
+ * equivalence for non-batching and batching acomp_algs.
+ */
+ areq->dlen = areq->dst->length;
+
+ /* All sub-requests have finished. Notify the @batch_parallel_wq. */
+ if (waitqueue_active(batch_parallel_wq))
+ wake_up(batch_parallel_wq);
+
+ mutex_unlock(&cpu_ctx->mutex);
+
+ return true;
+}
+
+/*
+ * Main compression API for kernel users of crypto_acomp, such as zswap.
+ *
+ * crypto_acomp_compress() calls into this procedure for:
+ * - Sequential compression of a single page,
+ * - Parallel batch compression of multiple pages.
+ *
+ * @areq: asynchronous compress request
+ */
static int iaa_crypto_acomp_acompress_main(struct acomp_req *areq)
{
struct crypto_tfm *tfm = areq->base.tfm;
@@ -2534,14 +2599,47 @@ static int iaa_crypto_acomp_acompress_main(struct acomp_req *areq)
if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) {
ctx = iaa_ctx[idx];
- acomp_to_iaa(areq, &parent_req, ctx);
- ret = iaa_comp_acompress(ctx, &parent_req);
- iaa_to_acomp(unlikely(ret) ? ret : parent_req.dlen, areq);
+ if (likely(areq->slen == areq->unit_size) || !areq->unit_size) {
+ acomp_to_iaa(areq, &parent_req, ctx);
+ ret = iaa_comp_acompress(ctx, &parent_req);
+ iaa_to_acomp(unlikely(ret) ? ret : parent_req.dlen, areq);
+ } else {
+ struct iaa_batch_ctx *cpu_ctx = raw_cpu_ptr(iaa_batch_ctx);
+ struct iaa_req **reqs;
+ unsigned long *cpu_ctx_addr, *bpwq_addr;
+
+ acomp_to_iaa(areq, &parent_req, ctx);
+
+ mutex_lock(&cpu_ctx->mutex);
+
+ bpwq_addr = acomp_request_ctx(areq);
+ /* Save the wait_queue_head. */
+ cpu_ctx->data = (wait_queue_head_t *)*bpwq_addr;
+
+ reqs = cpu_ctx->reqs;
+
+ ret = iaa_crypto_acomp_acompress_batch(ctx,
+ &parent_req,
+ reqs,
+ areq->unit_size);
+
+ cpu_ctx_addr = acomp_request_ctx(areq);
+ *cpu_ctx_addr = (unsigned long)cpu_ctx;
+ }
}
return ret;
}
+/*
+ * Main decompression API for kernel users of crypto_acomp, such as zswap.
+ *
+ * crypto_acomp_decompress() calls into this procedure for:
+ * - Sequential decompression of a single buffer,
+ * - Parallel batch decompression of multiple buffers.
+ *
+ * @areq: asynchronous decompress request
+ */
static int iaa_crypto_acomp_adecompress_main(struct acomp_req *areq)
{
struct crypto_tfm *tfm = areq->base.tfm;
@@ -2552,9 +2650,33 @@ static int iaa_crypto_acomp_adecompress_main(struct acomp_req *areq)
if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) {
ctx = iaa_ctx[idx];
- acomp_to_iaa(areq, &parent_req, ctx);
- ret = iaa_comp_adecompress(ctx, &parent_req);
- iaa_to_acomp(parent_req.dlen, areq);
+ if (likely(areq->dlen == areq->unit_size) || !areq->unit_size) {
+ acomp_to_iaa(areq, &parent_req, ctx);
+ ret = iaa_comp_adecompress(ctx, &parent_req);
+ iaa_to_acomp(parent_req.dlen, areq);
+ } else {
+ struct iaa_batch_ctx *cpu_ctx = raw_cpu_ptr(iaa_batch_ctx);
+ struct iaa_req **reqs;
+ unsigned long *cpu_ctx_addr, *bpwq_addr;
+
+ acomp_to_iaa(areq, &parent_req, ctx);
+
+ mutex_lock(&cpu_ctx->mutex);
+
+ bpwq_addr = acomp_request_ctx(areq);
+ /* Save the wait_queue_head. */
+ cpu_ctx->data = (wait_queue_head_t *)*bpwq_addr;
+
+ reqs = cpu_ctx->reqs;
+
+ ret = iaa_crypto_acomp_adecompress_batch(ctx,
+ &parent_req,
+ reqs,
+ areq->unit_size);
+
+ cpu_ctx_addr = acomp_request_ctx(areq);
+ *cpu_ctx_addr = (unsigned long)cpu_ctx;
+ }
}
return ret;
@@ -2574,10 +2696,11 @@ static struct acomp_alg iaa_acomp_fixed_deflate = {
.init = iaa_crypto_acomp_init_fixed,
.compress = iaa_crypto_acomp_acompress_main,
.decompress = iaa_crypto_acomp_adecompress_main,
+ .batch_completed = iaa_crypto_acomp_batch_completed,
.base = {
.cra_name = "deflate",
.cra_driver_name = "deflate-iaa",
- .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_REQ_SEG,
.cra_ctxsize = sizeof(struct iaa_compression_ctx),
.cra_reqsize = sizeof(u32),
.cra_module = THIS_MODULE,
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 86e4932cd112..752110a7719c 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -109,6 +109,12 @@ struct acomp_req {
*
* @compress: Function performs a compress operation
* @decompress: Function performs a de-compress operation
+ * @batch_completed: Waits for batch completion of parallel
+ * compress/decompress requests submitted via
+ * @compress/@decompress. Returns true once
+ * all of the batch's sub-requests have
+ * completed. Sets an error code in @req->slen
+ * if any sub-request completed with an error.
* @reqsize: Context size for (de)compression requests
* @fb: Synchronous fallback tfm
* @base: Common crypto API algorithm data structure
@@ -116,6 +122,7 @@ struct acomp_req {
struct crypto_acomp {
int (*compress)(struct acomp_req *req);
int (*decompress)(struct acomp_req *req);
+ bool (*batch_completed)(struct acomp_req *req, bool comp);
unsigned int reqsize;
struct crypto_tfm base;
};
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index 366dbdb987e8..7c4e14491d59 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -28,6 +28,12 @@
*
* @compress: Function performs a compress operation
* @decompress: Function performs a de-compress operation
+ * @batch_completed: Waits for batch completion of parallel
+ * compress/decompress requests submitted via
+ * @compress/@decompress. Returns true once
+ * all of the batch's sub-requests have
+ * completed. Sets an error code in @req->slen
+ * if any sub-request completed with an error.
* @init: Initialize the cryptographic transformation object.
* This function is used to initialize the cryptographic
* transformation object. This function is called only once at
@@ -46,6 +52,7 @@
struct acomp_alg {
int (*compress)(struct acomp_req *req);
int (*decompress)(struct acomp_req *req);
+ bool (*batch_completed)(struct acomp_req *req, bool comp);
int (*init)(struct crypto_acomp *tfm);
void (*exit)(struct crypto_acomp *tfm);
--
2.27.0