[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20241220100936.2193541-4-matsuda-daisuke@fujitsu.com>
Date: Fri, 20 Dec 2024 19:09:34 +0900
From: Daisuke Matsuda <matsuda-daisuke@...itsu.com>
To: linux-rdma@...r.kernel.org,
leon@...nel.org,
jgg@...pe.ca,
zyjzyj2000@...il.com
Cc: linux-kernel@...r.kernel.org,
rpearsonhpe@...il.com,
lizhijian@...itsu.com,
Daisuke Matsuda <matsuda-daisuke@...itsu.com>
Subject: [PATCH for-next v9 3/5] RDMA/rxe: Allow registering MRs for On-Demand Paging
Allow userspace to register an ODP-enabled MR, in which case the flag
IB_ACCESS_ON_DEMAND is passed to rxe_reg_user_mr(). However, there is no
RDMA operation enabled right now. They will be supported later in the
subsequent two patches.
rxe_odp_do_pagefault() is called to initialize an ODP-enabled MR. It syncs
process address space from the CPU page table to the driver page table
(dma_list/pfn_list in umem_odp) when called with RXE_PAGEFAULT_SNAPSHOT
flag. Additionally, It can be used to trigger page fault when pages being
accessed are not present or do not have proper read/write permissions, and
possibly to prefetch pages in the future.
Signed-off-by: Daisuke Matsuda <matsuda-daisuke@...itsu.com>
---
drivers/infiniband/sw/rxe/rxe.c | 7 +++
drivers/infiniband/sw/rxe/rxe_loc.h | 12 ++++
drivers/infiniband/sw/rxe/rxe_mr.c | 9 ++-
drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++
drivers/infiniband/sw/rxe/rxe_resp.c | 15 +++--
drivers/infiniband/sw/rxe/rxe_verbs.c | 5 +-
6 files changed, 128 insertions(+), 6 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 255677bc12b2..3ca73f8d96cc 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -75,6 +75,13 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
rxe->ndev->dev_addr);
rxe->max_ucontext = RXE_MAX_UCONTEXT;
+
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+ rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING;
+
+ /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
+ rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
+ }
}
/* initialize port attributes */
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 0162ac9431c1..1043240daa97 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -185,4 +185,16 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
/* rxe_odp.c */
extern const struct mmu_interval_notifier_ops rxe_mn_ops;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
+ u64 iova, int access_flags, struct rxe_mr *mr);
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+static inline int
+rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
+ int access_flags, struct rxe_mr *mr)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
#endif /* RXE_LOC_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 12cacc4f60f2..190225a34139 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -323,7 +323,10 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
return err;
}
- return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
+ if (mr->umem->is_odp)
+ return -EOPNOTSUPP;
+ else
+ return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
}
/* copy data in or out of a wqe, i.e. sg list
@@ -532,6 +535,10 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
struct page *page;
u64 *va;
+ /* ODP is not supported right now. WIP. */
+ if (mr->umem->is_odp)
+ return RESPST_ERR_UNSUPPORTED_OPCODE;
+
/* See IBA oA19-28 */
if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
rxe_dbg_mr(mr, "mr not in valid state\n");
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index 2be8066db012..141290078754 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -36,3 +36,89 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
const struct mmu_interval_notifier_ops rxe_mn_ops = {
.invalidate = rxe_ib_invalidate_range,
};
+
+#define RXE_PAGEFAULT_RDONLY BIT(1)
+#define RXE_PAGEFAULT_SNAPSHOT BIT(2)
+static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
+ u64 access_mask;
+ int np;
+
+ access_mask = ODP_READ_ALLOWED_BIT;
+ if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
+ access_mask |= ODP_WRITE_ALLOWED_BIT;
+
+ /*
+ * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
+ * Callers must release the lock later to let invalidation handler
+ * do its work again.
+ */
+ np = ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt,
+ access_mask, fault);
+ return np;
+}
+
+static int rxe_odp_init_pages(struct rxe_mr *mr)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ int ret;
+
+ ret = rxe_odp_do_pagefault_and_lock(mr, mr->umem->address,
+ mr->umem->length,
+ RXE_PAGEFAULT_SNAPSHOT);
+
+ if (ret >= 0)
+ mutex_unlock(&umem_odp->umem_mutex);
+
+ return ret >= 0 ? 0 : ret;
+}
+
+int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
+ u64 iova, int access_flags, struct rxe_mr *mr)
+{
+ struct ib_umem_odp *umem_odp;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+ return -EOPNOTSUPP;
+
+ rxe_mr_init(access_flags, mr);
+
+ if (!start && length == U64_MAX) {
+ if (iova != 0)
+ return -EINVAL;
+ if (!(rxe->attr.odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+ return -EINVAL;
+
+ /* Never reach here, for implicit ODP is not implemented. */
+ }
+
+ umem_odp = ib_umem_odp_get(&rxe->ib_dev, start, length, access_flags,
+ &rxe_mn_ops);
+ if (IS_ERR(umem_odp)) {
+ rxe_dbg_mr(mr, "Unable to create umem_odp err = %d\n",
+ (int)PTR_ERR(umem_odp));
+ return PTR_ERR(umem_odp);
+ }
+
+ umem_odp->private = mr;
+
+ mr->umem = &umem_odp->umem;
+ mr->access = access_flags;
+ mr->ibmr.length = length;
+ mr->ibmr.iova = iova;
+ mr->page_offset = ib_umem_offset(&umem_odp->umem);
+
+ err = rxe_odp_init_pages(mr);
+ if (err) {
+ ib_umem_odp_release(umem_odp);
+ return err;
+ }
+
+ mr->state = RXE_MR_STATE_VALID;
+ mr->ibmr.type = IB_MR_TYPE_USER;
+
+ return err;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index c11ab280551a..e703a3ab82d4 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -649,6 +649,10 @@ static enum resp_states process_flush(struct rxe_qp *qp,
struct rxe_mr *mr = qp->resp.mr;
struct resp_res *res = qp->resp.res;
+ /* ODP is not supported right now. WIP. */
+ if (mr->umem->is_odp)
+ return RESPST_ERR_UNSUPPORTED_OPCODE;
+
/* oA19-14, oA19-15 */
if (res && res->replay)
return RESPST_ACKNOWLEDGE;
@@ -702,10 +706,13 @@ static enum resp_states atomic_reply(struct rxe_qp *qp,
if (!res->replay) {
u64 iova = qp->resp.va + qp->resp.offset;
- err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode,
- atmeth_comp(pkt),
- atmeth_swap_add(pkt),
- &res->atomic.orig_val);
+ if (mr->umem->is_odp)
+ err = RESPST_ERR_UNSUPPORTED_OPCODE;
+ else
+ err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode,
+ atmeth_comp(pkt),
+ atmeth_swap_add(pkt),
+ &res->atomic.orig_val);
if (err)
return err;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 5c18f7e342f2..13064302d766 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1278,7 +1278,10 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
mr->ibmr.pd = ibpd;
mr->ibmr.device = ibpd->device;
- err = rxe_mr_init_user(rxe, start, length, access, mr);
+ if (access & IB_ACCESS_ON_DEMAND)
+ err = rxe_odp_mr_init_user(rxe, start, length, iova, access, mr);
+ else
+ err = rxe_mr_init_user(rxe, start, length, access, mr);
if (err) {
rxe_dbg_mr(mr, "reg_user_mr failed, err = %d\n", err);
goto err_cleanup;
--
2.43.0
Powered by blists - more mailing lists