Message-Id: <1439493653-1191-6-git-send-email-jglisse@redhat.com>
Date:	Thu, 13 Aug 2015 15:20:50 -0400
From:	Jérôme Glisse <jglisse@...hat.com>
To:	<linux-kernel@...r.kernel.org>, <linux-rdma@...r.kernel.org>
Cc:	Christophe Harle <charle@...dia.com>,
	Duncan Poole <dpoole@...dia.com>,
	Sherry Cheung <SCheung@...dia.com>,
	Subhash Gutti <sgutti@...dia.com>,
	John Hubbard <jhubbard@...dia.com>,
	Mark Hairgrove <mhairgrove@...dia.com>,
	Lucien Dunning <ldunning@...dia.com>,
	Cameron Buschardt <cabuschardt@...dia.com>,
	Arvind Gopalakrishnan <arvindg@...dia.com>,
	Haggai Eran <haggaie@...lanox.com>,
	Shachar Raindel <raindel@...lanox.com>,
	Liran Liss <liranl@...lanox.com>,
	Jérôme Glisse <jglisse@...hat.com>
Subject: [RFC PATCH 5/8 v2] IB/odp/hmm: add core infiniband structure and helper for ODP with HMM v3.

This adds new core InfiniBand structures and helpers to implement ODP (on
demand paging) on top of HMM. We need to retain the tree of ib_umem because
some hardware associates a unique identifier with each umem (or MR) and only
allows the hardware page table to be updated using that unique id.
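
To make the sharing scheme concrete, here is a minimal userspace sketch of
the same lookup-or-create-with-refcount pattern that ib_umem_odp_get()
follows below. It is an illustration only, not kernel code: the names
(struct mirror, mirror_get_or_create, mirrors_lock) are hypothetical, C11
atomics and a pthread mutex stand in for the kernel's kref and
ib_device->hmm_mutex, and error handling is reduced to returning NULL.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <sys/types.h>

struct mirror {
	atomic_int refcount;		/* plays the role of struct kref */
	pid_t owner;			/* stands in for the mirrored mm */
	struct mirror *next;		/* device-wide list of mirrors */
};

static pthread_mutex_t mirrors_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mirror *mirrors;		/* like ib_device->ib_mirrors */

/* Like kref_get_unless_zero(): take a reference only if the object is
 * not already on its way to destruction. */
static struct mirror *mirror_ref(struct mirror *m)
{
	int old = atomic_load(&m->refcount);

	while (old > 0)
		if (atomic_compare_exchange_weak(&m->refcount, &old, old + 1))
			return m;
	return NULL;
}

/* Find a live mirror for @owner or create one, the way ib_umem_odp_get()
 * scans ib_mirrors under ib_device->hmm_mutex. */
static struct mirror *mirror_get_or_create(pid_t owner)
{
	struct mirror *m;

	pthread_mutex_lock(&mirrors_lock);
	for (m = mirrors; m; m = m->next)
		if (m->owner == owner && mirror_ref(m))
			goto out;	/* share the existing mirror */

	m = calloc(1, sizeof(*m));	/* none alive: create a new one */
	if (m) {
		atomic_store(&m->refcount, 1);
		m->owner = owner;
		m->next = mirrors;
		mirrors = m;
	}
out:
	pthread_mutex_unlock(&mirrors_lock);
	return m;
}

/* Drop a reference; the last put unlinks and frees, the way
 * ib_mirror_destroy() unlinks before hmm_mirror_unregister(). */
static void mirror_unref(struct mirror *m)
{
	struct mirror **p;

	if (!m || atomic_fetch_sub(&m->refcount, 1) != 1)
		return;
	pthread_mutex_lock(&mirrors_lock);
	for (p = &mirrors; *p; p = &(*p)->next)
		if (*p == m) {
			*p = m->next;
			break;
		}
	pthread_mutex_unlock(&mirrors_lock);
	free(m);
}

The point the sketch makes is the same one the patch relies on: a reference
is taken only under the lookup lock and only while the refcount is still
non-zero, so a mirror that has begun dying is never handed out again and a
fresh one is created instead.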

Changed since v1:
  - Adapt to new hmm_mirror lifetime rules.
  - Fix scan of existing mirror in ib_umem_odp_get().

Changed since v2:
  - Remove FIXME for empty umem as it is an invalid case.
  - Fix HMM version of ib_umem_odp_release().

Signed-off-by: Jérôme Glisse <jglisse@...hat.com>
Signed-off-by: John Hubbard <jhubbard@...dia.com>
Signed-off-by: Haggai Eran <haggaie@...lanox.com>
---
 drivers/infiniband/core/umem_odp.c    | 152 ++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/uverbs_cmd.c  |   1 +
 drivers/infiniband/core/uverbs_main.c |   6 ++
 include/rdma/ib_umem_odp.h            |  27 +++++++
 include/rdma/ib_verbs.h               |  12 +++
 5 files changed, 198 insertions(+)

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index d3b65d4..bcbc2c2 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -42,7 +42,159 @@
 #include <rdma/ib_umem_odp.h>
 
 #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM)
+
+static void ib_mirror_destroy(struct kref *kref)
+{
+	struct ib_mirror *ib_mirror;
+	struct ib_device *ib_device;
+
+	ib_mirror = container_of(kref, struct ib_mirror, kref);
+
+	ib_device = ib_mirror->ib_device;
+	mutex_lock(&ib_device->hmm_mutex);
+	list_del_init(&ib_mirror->list);
+	mutex_unlock(&ib_device->hmm_mutex);
+
+	/* hmm_mirror_unregister() will free the structure. */
+	hmm_mirror_unregister(&ib_mirror->base);
+}
+
+void ib_mirror_unref(struct ib_mirror *ib_mirror)
+{
+	if (ib_mirror == NULL)
+		return;
+
+	kref_put(&ib_mirror->kref, ib_mirror_destroy);
+}
+EXPORT_SYMBOL(ib_mirror_unref);
+
+/* Return a reference on the mirror, or NULL if the mirror is already
+ * being destroyed (ie its refcount has already dropped to zero).
+ */
+static inline struct ib_mirror *ib_mirror_ref(struct ib_mirror *ib_mirror)
+{
+	if (!ib_mirror || !kref_get_unless_zero(&ib_mirror->kref))
+		return NULL;
+	return ib_mirror;
+}
+
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
+{
+	struct mm_struct *mm = get_task_mm(current);
+	struct ib_device *ib_device = context->device;
+	struct ib_mirror *ib_mirror;
+	struct pid *our_pid;
+	int ret;
+
+	if (!mm)
+		return -EINVAL;
+
+	/* An empty umem is an invalid case; take care not to leak the
+	 * mm reference when bailing out on error. */
+	if (!ib_device->hmm_ready ||
+	    unlikely(ib_umem_start(umem) == ib_umem_end(umem))) {
+		mmput(mm);
+		return -EINVAL;
+	}
+
+	/* Prevent creating ODP MRs in child processes */
+	rcu_read_lock();
+	our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
+	rcu_read_unlock();
+	put_pid(our_pid);
+	if (context->tgid != our_pid) {
+		mmput(mm);
+		return -EINVAL;
+	}
+
+	umem->hugetlb = 0;
+	umem->odp_data = kmalloc(sizeof(*umem->odp_data), GFP_KERNEL);
+	if (umem->odp_data == NULL) {
+		mmput(mm);
+		return -ENOMEM;
+	}
+	umem->odp_data->private = NULL;
+	umem->odp_data->umem = umem;
+
+	mutex_lock(&ib_device->hmm_mutex);
+	/* Is there an existing mirror for this process mm? */
+	ib_mirror = ib_mirror_ref(context->ib_mirror);
+	if (!ib_mirror) {
+		struct ib_mirror *tmp;
+
+		list_for_each_entry(tmp, &ib_device->ib_mirrors, list) {
+			if (tmp->base.hmm->mm != mm)
+				continue;
+			ib_mirror = ib_mirror_ref(tmp);
+			break;
+		}
+	}
+
+	if (!ib_mirror) {
+		/* We need to create a new mirror. */
+		ib_mirror = kmalloc(sizeof(*ib_mirror), GFP_KERNEL);
+		if (!ib_mirror) {
+			mutex_unlock(&ib_device->hmm_mutex);
+			mmput(mm);
+			return -ENOMEM;
+		}
+		kref_init(&ib_mirror->kref);
+		init_rwsem(&ib_mirror->umem_rwsem);
+		ib_mirror->umem_tree = RB_ROOT;
+		ib_mirror->ib_device = ib_device;
+
+		ib_mirror->base.device = &ib_device->hmm_dev;
+		ret = hmm_mirror_register(&ib_mirror->base);
+		if (ret) {
+			mutex_unlock(&ib_device->hmm_mutex);
+			kfree(ib_mirror);
+			mmput(mm);
+			return ret;
+		}
+
+		list_add(&ib_mirror->list, &ib_device->ib_mirrors);
+		context->ib_mirror = ib_mirror_ref(ib_mirror);
+	}
+	mutex_unlock(&ib_device->hmm_mutex);
+	umem->odp_data->ib_mirror = ib_mirror;
+
+	down_write(&ib_mirror->umem_rwsem);
+	rbt_ib_umem_insert(&umem->odp_data->interval_tree, &ib_mirror->umem_tree);
+	up_write(&ib_mirror->umem_rwsem);
+
+	mmput(mm);
+	return 0;
+}
+
+void ib_umem_odp_release(struct ib_umem *umem)
+{
+	struct ib_mirror *ib_mirror = umem->odp_data->ib_mirror;
+
+	/*
+	 * Ensure that no more pages are mapped in the umem.
+	 *
+	 * It is the driver's responsibility to ensure, before calling us,
+	 * that the hardware will not attempt to access the MR any more.
+	 */
+
+	/* One optimization to release resources early here would be to call:
+	 *	hmm_mirror_range_discard(&ib_mirror->base,
+	 *			 ib_umem_start(umem),
+	 *			 ib_umem_end(umem));
+	 * But umems can overlap, so we would need to discard only ranges
+	 * covered by exactly one umem, while holding the umem rwsem.
+	 */
+	down_write(&ib_mirror->umem_rwsem);
+	rbt_ib_umem_remove(&umem->odp_data->interval_tree, &ib_mirror->umem_tree);
+	up_write(&ib_mirror->umem_rwsem);
+
+	ib_mirror_unref(ib_mirror);
+	kfree(umem->odp_data);
+	kfree(umem);
+}
+
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
+
 static void ib_umem_notifier_start_account(struct ib_umem *item)
 {
 	mutex_lock(&item->odp_data->umem_mutex);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 53163aa..1db6a17 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -339,6 +339,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 
 #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)
 #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM)
+	ucontext->ib_mirror = NULL;
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 	ucontext->umem_tree = RB_ROOT;
 	init_rwsem(&ucontext->umem_rwsem);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index f6eef2d..201bde3 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -45,6 +45,7 @@
 #include <linux/cdev.h>
 #include <linux/anon_inodes.h>
 #include <linux/slab.h>
+#include <rdma/ib_umem_odp.h>
 
 #include <asm/uaccess.h>
 
@@ -298,6 +299,11 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 		kfree(uobj);
 	}
 
+#if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM)
+	ib_mirror_unref(context->ib_mirror);
+	context->ib_mirror = NULL;
+#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
+
 	put_pid(context->tgid);
 
 	return context->device->dealloc_ucontext(context);
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 313d7f1..36b72f0 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -37,6 +37,32 @@
 #include <rdma/ib_verbs.h>
 #include <linux/interval_tree.h>
 
+#if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM)
+/* struct ib_mirror - per process mirror structure for infiniband driver.
+ *
+ * @ib_device: Infiniband device this mirror is associated with.
+ * @base: The hmm base mirror struct.
+ * @kref: Refcount for the structure.
+ * @list: For the list of ib_mirror of a given ib_device.
+ * @umem_tree: Red black tree of ib_umem ordered by virtual address.
+ * @umem_rwsem: Semaphore protecting the red black tree.
+ *
+ * Because the ib_ucontext struct is tied to a file descriptor there can be
+ * several of them for the same process, which violates an HMM requirement.
+ * Hence we create only one ib_mirror per process; each ib_umem references it.
+ */
+struct ib_mirror {
+	struct ib_device	*ib_device;
+	struct hmm_mirror	base;
+	struct kref		kref;
+	struct list_head	list;
+	struct rb_root		umem_tree;
+	struct rw_semaphore	umem_rwsem;
+};
+
+void ib_mirror_unref(struct ib_mirror *ib_mirror);
+#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
+
 struct umem_odp_node {
 	u64 __subtree_last;
 	struct rb_node rb;
@@ -44,6 +70,7 @@ struct umem_odp_node {
 
 struct ib_umem_odp {
 #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM)
+	struct ib_mirror	*ib_mirror;
 #else
 	/*
 	 * An array of the pages included in the on-demand paging umem.
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9d32df11..987050b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -49,6 +49,9 @@
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
 #include <uapi/linux/if_ether.h>
+#if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM)
+#include <linux/hmm.h>
+#endif
 
 #include <linux/atomic.h>
 #include <linux/mmu_notifier.h>
@@ -1217,6 +1220,7 @@ struct ib_ucontext {
 	struct pid             *tgid;
 #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)
 #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM)
+	struct ib_mirror	*ib_mirror;
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 	struct rb_root      umem_tree;
 	/*
@@ -1730,6 +1734,14 @@ struct ib_device {
 
 	struct ib_dma_mapping_ops   *dma_ops;
 
+#if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM)
+	/* For ODP using HMM. */
+	struct hmm_device	     hmm_dev;
+	struct list_head	     ib_mirrors;
+	struct mutex		     hmm_mutex;
+	bool			     hmm_ready;
+#endif
+
 	struct module               *owner;
 	struct device                dev;
 	struct kobject               *ports_parent;
-- 
1.9.3
