lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1437159665-6612-7-git-send-email-jglisse@redhat.com>
Date:	Fri, 17 Jul 2015 15:01:03 -0400
From:	Jérôme Glisse <jglisse@...hat.com>
To:	<linux-kernel@...r.kernel.org>, <linux-rdma@...r.kernel.org>
Cc:	Christophe Harle <charle@...dia.com>,
	Duncan Poole <dpoole@...dia.com>,
	Sherry Cheung <SCheung@...dia.com>,
	Subhash Gutti <sgutti@...dia.com>,
	John Hubbard <jhubbard@...dia.com>,
	Mark Hairgrove <mhairgrove@...dia.com>,
	Lucien Dunning <ldunning@...dia.com>,
	Cameron Buschardt <cabuschardt@...dia.com>,
	Arvind Gopalakrishnan <arvindg@...dia.com>,
	Haggai Eran <haggaie@...lanox.com>,
	Shachar Raindel <raindel@...lanox.com>,
	Liran Liss <liranl@...lanox.com>,
	Jérôme Glisse <jglisse@...hat.com>
Subject: [PATCH 6/8] IB/mlx5/hmm: add mlx5 HMM device initialization and callback v3.

This adds the core HMM callbacks for the mlx5 device driver and initializes
the HMM device for the mlx5 InfiniBand device driver.

Changed since v1:
  - Adapt to new hmm_mirror lifetime rules.
  - HMM_ISDIRTY no longer exists.

Changed since v2:
  - Adapt to HMM page table changes.

Signed-off-by: Jérôme Glisse <jglisse@...hat.com>
Signed-off-by: John Hubbard <jhubbard@...dia.com>
---
 drivers/infiniband/core/umem_odp.c   |  12 ++-
 drivers/infiniband/hw/mlx5/main.c    |   5 +
 drivers/infiniband/hw/mlx5/mem.c     |  38 +++++++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  18 +++-
 drivers/infiniband/hw/mlx5/mr.c      |   8 +-
 drivers/infiniband/hw/mlx5/odp.c     | 178 ++++++++++++++++++++++++++++++++++-
 include/rdma/ib_umem_odp.h           |  20 +++-
 7 files changed, 269 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index ac87ac6..c5e7461 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -134,7 +134,7 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
 			return -ENOMEM;
 		}
 		kref_init(&ib_mirror->kref);
-		init_rwsem(&ib_mirror->hmm_mr_rwsem);
+		init_rwsem(&ib_mirror->umem_rwsem);
 		ib_mirror->umem_tree = RB_ROOT;
 		ib_mirror->ib_device = ib_device;
 
@@ -151,10 +151,11 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
 		context->ib_mirror = ib_mirror_ref(ib_mirror);
 	}
 	mutex_unlock(&ib_device->hmm_mutex);
-	umem->odp_data.ib_mirror = ib_mirror;
+	umem->odp_data->ib_mirror = ib_mirror;
 
 	down_write(&ib_mirror->umem_rwsem);
-	rbt_ib_umem_insert(&umem->odp_data->interval_tree, &mirror->umem_tree);
+	rbt_ib_umem_insert(&umem->odp_data->interval_tree,
+			   &ib_mirror->umem_tree);
 	up_write(&ib_mirror->umem_rwsem);
 
 	mmput(mm);
@@ -163,7 +164,7 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
 
 void ib_umem_odp_release(struct ib_umem *umem)
 {
-	struct ib_mirror *ib_mirror = umem->odp_data;
+	struct ib_mirror *ib_mirror = umem->odp_data->ib_mirror;
 
 	/*
 	 * Ensure that no more pages are mapped in the umem.
@@ -180,7 +181,8 @@ void ib_umem_odp_release(struct ib_umem *umem)
 	 * range covered by one and only one umem while holding the umem rwsem.
 	 */
 	down_write(&ib_mirror->umem_rwsem);
-	rbt_ib_umem_remove(&umem->odp_data->interval_tree, &mirror->umem_tree);
+	rbt_ib_umem_remove(&umem->odp_data->interval_tree,
+			   &ib_mirror->umem_tree);
 	up_write(&ib_mirror->umem_rwsem);
 
 	ib_mirror_unref(ib_mirror);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5593f22..b731d06 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1531,6 +1531,9 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	if (err)
 		goto err_rsrc;
 
+	/* If HMM initialization fails we just do not enable odp. */
+	mlx5_dev_init_odp_hmm(&dev->ib_dev, &mdev->pdev->dev);
+
 	err = ib_register_device(&dev->ib_dev, NULL);
 	if (err)
 		goto err_odp;
@@ -1555,6 +1558,7 @@ err_umrc:
 
 err_dev:
 	ib_unregister_device(&dev->ib_dev);
 
 err_odp:
+	mlx5_dev_fini_odp_hmm(&dev->ib_dev); /* also undone if register fails */
 	mlx5_ib_odp_remove_one(dev);
@@ -1574,6 +1578,7 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
 
 	ib_unregister_device(&dev->ib_dev);
 	destroy_umrc_res(dev);
+	mlx5_dev_fini_odp_hmm(&dev->ib_dev);
 	mlx5_ib_odp_remove_one(dev);
 	destroy_dev_resources(&dev->devr);
 	ib_dealloc_device(&dev->ib_dev);
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 21084c7..8b11d30 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -154,6 +154,8 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 			    __be64 *pas, int access_flags, void *data)
 {
 	unsigned long umem_page_shift = ilog2(umem->page_size);
+	unsigned long start = ib_umem_start(umem) + (offset << PAGE_SHIFT);
+	unsigned long end = start + (num_pages << PAGE_SHIFT);
 	int shift = page_shift - umem_page_shift;
 	int mask = (1 << shift) - 1;
 	int i, k;
@@ -164,7 +166,41 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	int entry;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
-#error "CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM not supported at this stage !"
+	if (umem->odp_data) {
+		struct ib_mirror *ib_mirror = umem->odp_data->ib_mirror;
+		struct hmm_mirror *mirror = &ib_mirror->base;
+		struct hmm_pt_iter *iter = data, local_iter;
+		unsigned long addr;
+
+		if (iter == NULL) {
+			iter = &local_iter;
+			hmm_pt_iter_init(iter, &mirror->pt);
+		}
+
+		for (i=0, addr=start; i < num_pages; ++i, addr+=PAGE_SIZE) {
+			unsigned long next = end;
+			dma_addr_t *ptep, pte;
+
+			/* Get and lock pointer to mirror page table. */
+			ptep = hmm_pt_iter_lookup(iter, addr, &next);
+			pte = ptep ? *ptep : 0;
+			/* HMM will not have any page tables set up, if this
+			 * function is called before page faults have happened
+			 * on the MR. In that case, we don't have PA's yet, so
+			 * just set each one to zero and continue on. The hw
+			 * will trigger a page fault.
+			 */
+			if (hmm_pte_test_valid_dma(&pte))
+				pas[i] = cpu_to_be64(umem_dma_to_mtt(pte));
+			else
+				pas[i] = (__be64)0;
+		}
+
+		if (iter == &local_iter)
+			hmm_pt_iter_fini(iter);
+
+		return;
+	}
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 	const bool odp = umem->odp_data != NULL;
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 5f1d0dd..83b832e 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -635,6 +635,7 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 			    struct ib_mr_status *mr_status);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+
 extern struct workqueue_struct *mlx5_ib_page_fault_wq;
 
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
@@ -649,12 +650,16 @@ void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
 void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
-#error "CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM not supported at this stage !"
+void mlx5_dev_init_odp_hmm(struct ib_device *ib_dev, struct device *dev);
+void mlx5_dev_fini_odp_hmm(struct ib_device *ib_dev);
+int mlx5_ib_umem_invalidate(struct ib_umem *umem, u64 start,
+			    u64 end, void *cookie);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end);
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 
+
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -691,4 +696,15 @@ static inline u8 convert_access(int acc)
 #define MLX5_MAX_UMR_SHIFT 16
 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
 
+#ifndef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM /* empty stubs follow */
+static inline void mlx5_dev_init_odp_hmm(struct ib_device *ib_dev,
+					 struct device *dev)
+{
+}
+
+static inline void mlx5_dev_fini_odp_hmm(struct ib_device *ib_dev)
+{
+}
+#endif /* ! CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
+
 #endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 6e9e117..3f3a339 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1209,7 +1209,13 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
 		/* Wait for all running page-fault handlers to finish. */
 		synchronize_srcu(&dev->mr_srcu);
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
-#error "CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM not supported at this stage !"
+		if (mlx5_ib_umem_invalidate(umem, ib_umem_start(umem),
+					    ib_umem_end(umem), NULL))
+			/*
+			 * FIXME do something to kill all mr and umem
+			 * in use by this process.
+			 */
+			pr_err("killing all mr with odp due to mtt update failure\n");
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 		/* Destroy all page mappings */
 		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index b2bf671..d8156ec 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -52,8 +52,184 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 	return container_of(mmr, struct mlx5_ib_mr, mmr);
 }
 
+
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
-#error "CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM not supported at this stage !"
+
+
+int mlx5_ib_umem_invalidate(struct ib_umem *umem, u64 start,
+			    u64 end, void *cookie)
+{
+	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+	u64 idx = 0, blk_start_idx = 0;
+	struct hmm_pt_iter iter;
+	struct mlx5_ib_mr *mlx5_ib_mr;
+	struct hmm_mirror *mirror;
+	unsigned long addr;
+	int in_block = 0;
+	int ret = 0;
+
+	if (!umem || !umem->odp_data) {
+		pr_err("invalidation called on NULL umem or non-ODP umem\n");
+		return -EINVAL;
+	}
+
+	/* Nothing to zap when no mlx5_ib_mr is attached to this umem yet. */
+	if (umem->odp_data->private == NULL)
+		return 0;
+
+	mlx5_ib_mr = umem->odp_data->private;
+	if (!mlx5_ib_mr->ibmr.pd)
+		return 0;
+
+	mirror = &umem->odp_data->ib_mirror->base;
+	start = max_t(u64, ib_umem_start(umem), start);
+	end = min_t(u64, ib_umem_end(umem), end);
+	hmm_pt_iter_init(&iter, &mirror->pt);
+
+	/*
+	 * Iteration one - zap the HW's MTTs. HMM ensures no page fault will
+	 * overwrite the same MTTs meanwhile. Concurrent invalidations may race
+	 * us but also write 0s. NOTE(review): addr is advanced both here and
+	 * by the inner loop - confirm no page is skipped between directories.
+	 */
+	for (addr = start; addr < end; addr += (u64)umem->page_size) {
+		unsigned long next = end;
+		dma_addr_t *ptep;
+
+		/* Get and lock pointer to mirror page table. */
+		ptep = hmm_pt_iter_walk(&iter, &addr, &next);
+		for (; ptep && addr < next; addr += PAGE_SIZE, ptep++) {
+			idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+			/*
+			 * Strive to write the MTTs in chunks, but avoid
+			 * overwriting non-existing MTTs. The heuristic here can
+			 * be improved to estimate the cost of another UMR vs.
+			 * the cost of a bigger UMR.
+			 */
+			if ((*ptep) & (ODP_READ_ALLOWED_BIT |
+				       ODP_WRITE_ALLOWED_BIT)) {
+				if ((*ptep) & ODP_WRITE_ALLOWED_BIT)
+					hmm_pte_set_dirty(ptep);
+				/*
+				 * There cannot be a concurrent overlapping
+				 * munmap, page migration or page write
+				 * protection, so it is safe to clear those
+				 * bits here.
+				 */
+				hmm_pte_clear_bit(ptep, ODP_READ_ALLOWED_SHIFT);
+				hmm_pte_clear_bit(ptep, ODP_WRITE_ALLOWED_SHIFT);
+				if (!in_block) {
+					blk_start_idx = idx;
+					in_block = 1;
+				}
+			} else {
+				u64 umr_offset = idx & umr_block_mask;
+
+				if (in_block && umr_offset == 0) {
+					ret = mlx5_ib_update_mtt(mlx5_ib_mr,
+							 blk_start_idx,
+							 idx - blk_start_idx,
+							 1, &iter) || ret;
+					in_block = 0;
+				}
+			}
+		}
+	}
+	if (in_block)
+		ret = mlx5_ib_update_mtt(mlx5_ib_mr, blk_start_idx,
+					 idx - blk_start_idx + 1, 1,
+					 &iter) || ret;
+	hmm_pt_iter_fini(&iter);
+	return ret;
+}
+
+static int mlx5_hmm_invalidate_range(struct hmm_mirror *mirror,
+				     unsigned long start, unsigned long end)
+{
+	struct ib_mirror *owner = container_of(mirror, struct ib_mirror, base);
+	int status;
+
+	/*
+	 * Run mlx5_ib_umem_invalidate() over the umems of the tree for the
+	 * given range while holding the umem rwsem for reading.
+	 */
+	down_read(&owner->umem_rwsem);
+	status = rbt_ib_umem_for_each_in_range(&owner->umem_tree, start, end,
+					       mlx5_ib_umem_invalidate, NULL);
+	up_read(&owner->umem_rwsem);
+	return status;
+}
+
+static void mlx5_hmm_release(struct hmm_mirror *mirror)
+{
+	/*
+	 * Invalidate every umem of this mirror over the whole address range.
+	 * The range end is an unsigned long, hence ULONG_MAX rather than
+	 * ULLONG_MAX; the result is dropped as this callback returns void.
+	 */
+	mlx5_hmm_invalidate_range(mirror, 0, ULONG_MAX);
+}
+
+static void mlx5_hmm_free(struct hmm_mirror *mirror)
+{
+	/* mirror is the 'base' member of an ib_mirror: free the container. */
+	struct ib_mirror *owner = container_of(mirror, struct ib_mirror, base);
+
+	kfree(owner);
+}
+
+static int mlx5_hmm_update(struct hmm_mirror *mirror,
+			    struct hmm_event *event)
+{
+	struct device *device = mirror->device->dev;
+	int ret = 0;
+
+	switch (event->etype) {
+	case HMM_DEVICE_RFAULT:
+	case HMM_DEVICE_WFAULT:
+		/* FIXME: device faults are not handled yet; 0 is returned. */
+		break;
+	case HMM_NONE:
+	default:
+		dev_warn(device, "Warning: unhandled HMM event (%d) defaulting to invalidation\n",
+			 event->etype);
+		/* Fall through: unhandled events are treated as invalidations. */
+	/* For write protect and fork, invalidating only writable MRs would do. */
+	case HMM_WRITE_PROTECT:
+	case HMM_MIGRATE:
+	case HMM_MUNMAP:
+	case HMM_FORK:
+		ret = mlx5_hmm_invalidate_range(mirror,
+						event->start,
+						event->end);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct hmm_device_ops mlx5_hmm_ops = {
+	.release = mlx5_hmm_release,	/* invalidates all umems of a mirror */
+	.free    = mlx5_hmm_free,	/* kfree()s the enclosing ib_mirror */
+	.update  = mlx5_hmm_update,	/* dispatches on the HMM event type */
+};
+
+void mlx5_dev_init_odp_hmm(struct ib_device *ib_device, struct device *dev)
+{
+	INIT_LIST_HEAD(&ib_device->ib_mirrors);
+	mutex_init(&ib_device->hmm_mutex); /* lock is valid before hmm_ready */
+	ib_device->hmm_dev.dev = dev;
+	ib_device->hmm_dev.ops = &mlx5_hmm_ops;
+	ib_device->hmm_ready = !hmm_device_register(&ib_device->hmm_dev);
+}
+
+void mlx5_dev_fini_odp_hmm(struct ib_device *ib_device)
+{
+	/* Unregister only when hmm_ready is set on this device. */
+	if (ib_device->hmm_ready)
+		hmm_device_unregister(&ib_device->hmm_dev);
+}
+
+
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 
 
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index c7c2670..e982fd3 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -133,7 +133,25 @@ struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node,
 
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
-#error "CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM not supported at this stage !"
+
+
+/*
+ * HMM reserves a few bits for hardware-specific use inside the mirror page
+ * table. For IB we record the per-page mapping protection there.
+ */
+#define ODP_READ_ALLOWED_SHIFT	(HMM_PTE_HW_SHIFT + 0)
+#define ODP_WRITE_ALLOWED_SHIFT	(HMM_PTE_HW_SHIFT + 1)
+#define ODP_READ_ALLOWED_BIT	(1 << ODP_READ_ALLOWED_SHIFT)
+#define ODP_WRITE_ALLOWED_BIT	(1 << ODP_WRITE_ALLOWED_SHIFT)
+
+/* Make sure we are not overwriting valid address bits on the target arch. */
+#if (HMM_PTE_HW_SHIFT + 2) > PAGE_SHIFT
+#error (HMM_PTE_HW_SHIFT + 2) > PAGE_SHIFT
+#endif
+
+#define ODP_DMA_ADDR_MASK HMM_PTE_DMA_MASK
+
+
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 
 
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ