linux-kernel - [PATCH v12 08/29] HMM: add device page fault support v6.

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1457469802-11850-9-git-send-email-jglisse@redhat.com>
Date:	Tue,  8 Mar 2016 15:43:01 -0500
From:	Jérôme Glisse <jglisse@...hat.com>
To:	akpm@...ux-foundation.org, <linux-kernel@...r.kernel.org>,
	linux-mm@...ck.org
Cc:	Linus Torvalds <torvalds@...ux-foundation.org>, <joro@...tes.org>,
	Mel Gorman <mgorman@...e.de>, "H. Peter Anvin" <hpa@...or.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Andrea Arcangeli <aarcange@...hat.com>,
	Johannes Weiner <jweiner@...hat.com>,
	Larry Woodman <lwoodman@...hat.com>,
	Rik van Riel <riel@...hat.com>,
	Dave Airlie <airlied@...hat.com>,
	Brendan Conoboy <blc@...hat.com>,
	Joe Donohue <jdonohue@...hat.com>,
	Christophe Harle <charle@...dia.com>,
	Duncan Poole <dpoole@...dia.com>,
	Sherry Cheung <SCheung@...dia.com>,
	Subhash Gutti <sgutti@...dia.com>,
	John Hubbard <jhubbard@...dia.com>,
	Mark Hairgrove <mhairgrove@...dia.com>,
	Lucien Dunning <ldunning@...dia.com>,
	Cameron Buschardt <cabuschardt@...dia.com>,
	Arvind Gopalakrishnan <arvindg@...dia.com>,
	Haggai Eran <haggaie@...lanox.com>,
	Shachar Raindel <raindel@...lanox.com>,
	Liran Liss <liranl@...lanox.com>,
	Roland Dreier <roland@...estorage.com>,
	Ben Sander <ben.sander@....com>,
	Greg Stoner <Greg.Stoner@....com>,
	John Bridgman <John.Bridgman@....com>,
	Michael Mantor <Michael.Mantor@....com>,
	Paul Blinzer <Paul.Blinzer@....com>,
	Leonid Shamis <Leonid.Shamis@....com>,
	Laurent Morichetti <Laurent.Morichetti@....com>,
	Alexander Deucher <Alexander.Deucher@....com>,
	Jérôme Glisse <jglisse@...hat.com>,
	Jatin Kumar <jakumar@...dia.com>
Subject: [PATCH v12 08/29] HMM: add device page fault support v6.

This patch adds helpers for device page faults. These helpers fill
the mirror page table from the CPU page table while synchronizing
with any update to the CPU page table.

Changed since v1:
  - Add comment about directory lock.

Changed since v2:
  - Check for mirror->hmm in hmm_mirror_fault()

Changed since v3:
  - Adapt to HMM page table changes.

Changed since v4:
  - Fix PROT_NONE, i.e. do not populate from a protnone pte.
  - Fix huge pmd handling (start address may differ from pmd start address)
  - Fix missing entry case.

Signed-off-by: Jérôme Glisse <jglisse@...hat.com>
Signed-off-by: Sherry Cheung <SCheung@...dia.com>
Signed-off-by: Subhash Gutti <sgutti@...dia.com>
Signed-off-by: Mark Hairgrove <mhairgrove@...dia.com>
Signed-off-by: John Hubbard <jhubbard@...dia.com>
Signed-off-by: Jatin Kumar <jakumar@...dia.com>
---
 include/linux/hmm.h |  15 ++
 mm/hmm.c            | 386 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 400 insertions(+), 1 deletion(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 5488fa9..d819ec9 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -85,6 +85,12 @@ struct hmm_event {
 	bool			backoff;
 };
 
+static inline bool hmm_event_overlap(const struct hmm_event *a,
+				     const struct hmm_event *b)
+{
+	return a->start < b->end && b->start < a->end; /* end is exclusive */
+}
+
 
 /* hmm_device - Each device must register one and only one hmm_device.
  *
@@ -176,6 +182,10 @@ struct hmm_device_ops {
  * @rwsem: Serialize the mirror list modifications.
  * @mmu_notifier: The mmu_notifier of this mm.
  * @rcu: For delayed cleanup call from mmu_notifier.release() callback.
+ * @device_faults: List of all active device page faults.
+ * @ndevice_faults: Number of active device page faults.
+ * @wait_queue: Wait queue for event synchronization.
+ * @lock: Serialize device_faults list modification.
  *
  * For each process address space (mm_struct) there is one and only one hmm
  * struct. hmm functions will redispatch to each devices the change made to
@@ -192,6 +202,10 @@ struct hmm {
 	struct rw_semaphore	rwsem;
 	struct mmu_notifier	mmu_notifier;
 	struct rcu_head		rcu;
+	struct list_head	device_faults;
+	unsigned		ndevice_faults;
+	wait_queue_head_t	wait_queue;
+	spinlock_t		lock;
 };
 
 
@@ -250,6 +264,7 @@ int hmm_mirror_register(struct hmm_mirror *mirror);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
 struct hmm_mirror *hmm_mirror_ref(struct hmm_mirror *mirror);
 void hmm_mirror_unref(struct hmm_mirror **mirror);
+int hmm_mirror_fault(struct hmm_mirror *mirror, struct hmm_event *event);
 
 
 #endif /* CONFIG_HMM */
diff --git a/mm/hmm.c b/mm/hmm.c
index c172a49..a9bdab5 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -67,7 +67,7 @@ static inline int hmm_event_init(struct hmm_event *event,
 				 enum hmm_etype etype)
 {
 	event->start = start & PAGE_MASK;
-	event->end = min(end, hmm->vm_end);
+	event->end = PAGE_ALIGN(min(end, hmm->vm_end));
 	if (event->start >= event->end)
 		return -EINVAL;
 	event->etype = etype;
@@ -103,6 +103,10 @@ static int hmm_init(struct hmm *hmm)
 	kref_init(&hmm->kref);
 	INIT_HLIST_HEAD(&hmm->mirrors);
 	init_rwsem(&hmm->rwsem);
+	INIT_LIST_HEAD(&hmm->device_faults);
+	hmm->ndevice_faults = 0;
+	init_waitqueue_head(&hmm->wait_queue);
+	spin_lock_init(&hmm->lock);
 
 	/* register notifier */
 	hmm->mmu_notifier.ops = &hmm_notifier_ops;
@@ -167,6 +171,58 @@ static inline struct hmm *hmm_unref(struct hmm *hmm)
 	return NULL;
 }
 
+/* Queue @event on hmm->device_faults, or -EAGAIN if range is not inactive. */
+static int hmm_device_fault_start(struct hmm *hmm, struct hmm_event *event)
+{
+	int ret = -EAGAIN;
+
+	mmu_notifier_range_wait_active(hmm->mm, event->start, event->end);
+
+	spin_lock(&hmm->lock);
+	if (mmu_notifier_range_inactive(hmm->mm, event->start, event->end)) {
+		list_add_tail(&event->list, &hmm->device_faults);
+		hmm->ndevice_faults++;
+		event->backoff = false;
+		ret = 0;
+	}
+	spin_unlock(&hmm->lock);
+
+	wake_up(&hmm->wait_queue);
+	return ret;
+}
+
+/* Remove @event from the device fault list and wake up waiters. */
+static void hmm_device_fault_end(struct hmm *hmm, struct hmm_event *event)
+{
+	spin_lock(&hmm->lock);
+	list_del_init(&event->list);
+	hmm->ndevice_faults--;
+	spin_unlock(&hmm->lock);
+	wake_up(&hmm->wait_queue);
+}
+
+/* Make device faults overlapping @ievent back off, then wait for them. */
+static void hmm_wait_device_fault(struct hmm *hmm, struct hmm_event *ievent)
+{
+	for (;;) {
+		struct hmm_event *fault;
+		unsigned long pending = 0;
+
+		spin_lock(&hmm->lock);
+		list_for_each_entry(fault, &hmm->device_faults, list) {
+			if (hmm_event_overlap(fault, ievent)) {
+				fault->backoff = true;
+				pending = hmm->ndevice_faults;
+			}
+		}
+		spin_unlock(&hmm->lock);
+		if (!pending)
+			return;
+		/* Sleep until the number of active faults changes. */
+		wait_event(hmm->wait_queue, pending != hmm->ndevice_faults);
+	}
+}
+
 static void hmm_update(struct hmm *hmm, struct hmm_event *event)
 {
 	struct hmm_mirror *mirror;
@@ -175,6 +231,8 @@ static void hmm_update(struct hmm *hmm, struct hmm_event *event)
 	if (hmm->mm->hmm != hmm)
 		return;
 
+	hmm_wait_device_fault(hmm, event);
+
 again:
 	down_read(&hmm->rwsem);
 	hlist_for_each_entry(mirror, &hmm->mirrors, mlist)
@@ -186,6 +244,33 @@ again:
 			goto again;
 		}
 	up_read(&hmm->rwsem);
+
+	wake_up(&hmm->wait_queue);
+}
+
+/* Fault in [addr, event->end) one page at a time via handle_mm_fault(). */
+static int hmm_mm_fault(struct hmm *hmm,
+			struct hmm_event *event,
+			struct vm_area_struct *vma,
+			unsigned long addr)
+{
+	unsigned flags = FAULT_FLAG_ALLOW_RETRY;
+	struct mm_struct *mm = vma->vm_mm;
+	int r;
+
+	if (event->etype == HMM_DEVICE_WFAULT)
+		flags |= FAULT_FLAG_WRITE;
+
+	addr &= PAGE_MASK;
+	while (addr < event->end) {
+		r = handle_mm_fault(mm, vma, addr, flags);
+		if (r & VM_FAULT_RETRY)
+			return -EBUSY;
+		if (r & VM_FAULT_ERROR) /* all but OOM map to -EFAULT */
+			return (r & VM_FAULT_OOM) ? -ENOMEM : -EFAULT;
+		addr += PAGE_SIZE;
+	}
+	return 0;
 }
 
 
@@ -228,6 +313,7 @@ static void hmm_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm)
 	}
 	up_write(&hmm->rwsem);
 
+	wake_up(&hmm->wait_queue);
 	hmm_unref(hmm);
 }
 
@@ -419,6 +505,304 @@ static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
 	hmm_pt_iter_fini(&iter);
 }
 
+/* A mirror is dead once off its mlist or once its dlist node is empty. */
+static inline bool hmm_mirror_is_dead(struct hmm_mirror *mirror)
+{
+	return hlist_unhashed(&mirror->mlist) ||
+	       list_empty(&mirror->dlist);
+}
+
+struct hmm_mirror_fault {	/* mm_walk private data of a device fault */
+	struct hmm_mirror	*mirror;	/* Mirror being populated. */
+	struct hmm_event	*event;	/* Device fault event. */
+	struct vm_area_struct	*vma;	/* Vma the fault is handled in. */
+	unsigned long		addr;	/* Address the walk must resume at. */
+	struct hmm_pt_iter	*iter;	/* Mirror page table iterator. */
+};
+
+/* Fill the mirror page table for the whole huge pmd that contains @start. */
+static int hmm_mirror_fault_hpmd(struct hmm_mirror *mirror,
+				 struct hmm_event *event,
+				 struct vm_area_struct *vma,
+				 struct hmm_pt_iter *iter,
+				 pmd_t *pmdp,
+				 struct hmm_mirror_fault *mirror_fault,
+				 unsigned long start,
+				 unsigned long end)
+{
+	struct page *page;
+	unsigned long addr, pfn;
+	unsigned flags = FOLL_TOUCH;
+	spinlock_t *ptl;
+
+	ptl = pmd_lock(mirror->hmm->mm, pmdp);
+	if (unlikely(!pmd_trans_huge(*pmdp))) {
+		spin_unlock(ptl);
+		return -EAGAIN;
+	}
+	flags |= event->etype == HMM_DEVICE_WFAULT ? FOLL_WRITE : 0;
+	page = follow_trans_huge_pmd(vma, start, pmdp, flags);
+	/* @page backs @start, rewind pfn to the first page of the PMD. */
+	pfn = page_to_pfn(page) - ((start & ~PMD_MASK) >> PAGE_SHIFT);
+	spin_unlock(ptl);
+
+	/* Just fault in the whole PMD. */
+	start &= PMD_MASK;
+	end = start + PMD_SIZE - 1;
+
+	if (!pmd_write(*pmdp) && event->etype == HMM_DEVICE_WFAULT)
+		return -ENOENT;
+
+	for (addr = start; addr < end;) {
+		unsigned long i, next = end;
+		dma_addr_t *hmm_pte;
+
+		hmm_pte = hmm_pt_iter_populate(iter, addr, &next);
+		if (!hmm_pte)
+			return -ENOMEM;
+
+		i = hmm_pt_index(&mirror->pt, addr, mirror->pt.llevel);
+		/*
+		 * The directory lock protects against concurrent clearing of
+		 * page table bit flags. Exceptions being the dirty bit and
+		 * the device driver private flags.
+		 */
+		hmm_pt_iter_directory_lock(iter);
+		do {
+			if (!hmm_pte_test_valid_pfn(&hmm_pte[i])) {
+				hmm_pte[i] = hmm_pte_from_pfn(pfn);
+				hmm_pt_iter_directory_ref(iter);
+			}
+			BUG_ON(hmm_pte_pfn(hmm_pte[i]) != pfn);
+			if (pmd_write(*pmdp))
+				hmm_pte_set_write(&hmm_pte[i]);
+		} while (addr += PAGE_SIZE, pfn++, i++, addr != next);
+		hmm_pt_iter_directory_unlock(iter);
+		mirror_fault->addr = addr;
+	}
+
+	return 0;
+}
+
+/* Page walk hole callback: nothing is mapped here, report it as -ENOENT. */
+static int hmm_pte_hole(unsigned long addr, unsigned long next,
+			struct mm_walk *walk)
+{
+	return -ENOENT;
+}
+
+/* pmd_entry page walk callback: mirror the present ptes of [start, end). */
+static int hmm_mirror_fault_pmd(pmd_t *pmdp,
+				unsigned long start,
+				unsigned long end,
+				struct mm_walk *walk)
+{
+	struct hmm_mirror_fault *mirror_fault = walk->private;
+	struct hmm_mirror *mirror = mirror_fault->mirror;
+	struct hmm_event *event = mirror_fault->event;
+	struct hmm_pt_iter *iter = mirror_fault->iter;
+	bool write = (event->etype == HMM_DEVICE_WFAULT);
+	unsigned long addr;
+	int ret = 0;
+
+	/* Make sure there was no gap. */
+	if (start != mirror_fault->addr)
+		return -ENOENT;
+
+	if (event->backoff)
+		return -EAGAIN;
+
+	if (pmd_none(*pmdp))
+		return -ENOENT;
+
+	if (pmd_trans_huge(*pmdp))
+		return hmm_mirror_fault_hpmd(mirror, event, mirror_fault->vma,
+					     iter, pmdp, mirror_fault, start,
+					     end);
+
+	if (pmd_none_or_trans_huge_or_clear_bad(pmdp))
+		return -EFAULT;
+
+	for (ret = 0, addr = start; !ret && addr < end;) {
+		unsigned long i = 0, next = end;
+		dma_addr_t *hmm_pte;
+		pte_t *ptep;
+
+		hmm_pte = hmm_pt_iter_populate(iter, addr, &next);
+		if (!hmm_pte)
+			return -ENOMEM;
+		/* Resume at @addr, where the previous pass stopped. */
+		ptep = pte_offset_map(pmdp, addr);
+		hmm_pt_iter_directory_lock(iter);
+		do {
+			if (!pte_present(*ptep) || pte_protnone(*ptep) ||
+			    (write && !pte_write(*ptep))) {
+				ret = -ENOENT;
+				ptep++;
+				break;
+			}
+
+			if (!hmm_pte_test_valid_pfn(&hmm_pte[i])) {
+				hmm_pte[i] = hmm_pte_from_pfn(pte_pfn(*ptep));
+				hmm_pt_iter_directory_ref(iter);
+			}
+			BUG_ON(hmm_pte_pfn(hmm_pte[i]) != pte_pfn(*ptep));
+			if (pte_write(*ptep))
+				hmm_pte_set_write(&hmm_pte[i]);
+		} while (addr += PAGE_SIZE, ptep++, i++, addr != next);
+		hmm_pt_iter_directory_unlock(iter);
+		pte_unmap(ptep - 1);
+		mirror_fault->addr = addr;
+	}
+
+	return ret;
+}
+
+/* Mirror the CPU page table into @mirror over [event->start, event->end). */
+static int hmm_mirror_handle_fault(struct hmm_mirror *mirror,
+				   struct hmm_event *event,
+				   struct vm_area_struct *vma,
+				   struct hmm_pt_iter *iter)
+{
+	struct hmm_mirror_fault mirror_fault = {
+		.mirror = mirror,
+		.event = event,
+		.vma = vma,
+		.iter = iter,
+	};
+	struct mm_walk walk = {
+		.mm = mirror->hmm->mm,
+		.private = &mirror_fault,
+		.pmd_entry = hmm_mirror_fault_pmd,
+		.pte_hole = hmm_pte_hole,
+	};
+	unsigned long addr = event->start;
+	int ret;
+
+	if ((event->etype == HMM_DEVICE_WFAULT) && !(vma->vm_flags & VM_WRITE))
+		return -EACCES;
+
+	ret = hmm_device_fault_start(mirror->hmm, event);
+	if (ret)
+		return ret;
+
+	for (;;) {
+		if (event->backoff) {
+			ret = -EAGAIN;
+			break;
+		}
+		if (addr >= event->end)
+			break;
+		mirror_fault.addr = addr;
+		ret = walk_page_range(addr, event->end, &walk);
+		if (!ret)
+			ret = mirror->device->ops->update(mirror, event);
+		if (ret)
+			break;
+		addr = mirror_fault.addr;
+	}
+	hmm_device_fault_end(mirror->hmm, event);
+	if (ret == -ENOENT) {
+		/* Fault the missing entries in on the CPU, then retry. */
+		ret = hmm_mm_fault(mirror->hmm, event, vma, addr);
+		ret = ret ? ret : -EAGAIN;
+	}
+	return ret;
+}
+
+/* hmm_mirror_fault() - fill the mirror page table for a device page fault.
+ * Returns 0 on success or a negative error code otherwise.
+ */
+int hmm_mirror_fault(struct hmm_mirror *mirror, struct hmm_event *event)
+{
+	struct vm_area_struct *vma;
+	struct hmm_pt_iter iter;
+	int ret = 0;
+
+	mirror = hmm_mirror_ref(mirror);
+	if (!mirror)
+		return -ENODEV;
+	if (event->start >= mirror->hmm->vm_end ||
+	    hmm_event_init(event, mirror->hmm, event->start,
+			   event->end, event->etype)) {
+		hmm_mirror_unref(&mirror);
+		return -EINVAL;
+	}
+	hmm_pt_iter_init(&iter, &mirror->pt);
+
+retry:
+	if (hmm_mirror_is_dead(mirror)) {
+		hmm_pt_iter_fini(&iter); /* may hold state from a retry */
+		hmm_mirror_unref(&mirror);
+		return -ENODEV;
+	}
+
+	/*
+	 * Synchronization with the CPU page table is the most important and
+	 * tedious aspect of device page faults. There must be a strong
+	 * ordering between the device->update() calls made for device page
+	 * faults and those made for CPU page table invalidation/update.
+	 *
+	 * Pages exposed to the device driver must stay valid while the
+	 * callback is in progress, i.e. any CPU page table invalidation that
+	 * makes those pages obsolete must call device->update() after the
+	 * device->update() call that faulted those pages.
+	 *
+	 * To achieve this, the mmap_sem first ensures that munmap() serializes
+	 * with us. That leaves unmap_mapping_range() and page migration or
+	 * merging, for which hmm tracks the affected address ranges and
+	 * blocks device page faults that hit an overlapping range.
+	 */
+	down_read(&mirror->hmm->mm->mmap_sem);
+	vma = find_vma_intersection(mirror->hmm->mm, event->start, event->end);
+	if (!vma) {
+		ret = -EFAULT;
+		goto out;
+	}
+	if (vma->vm_start > event->start) {
+		event->end = vma->vm_start;
+		ret = -EFAULT;
+		goto out;
+	}
+	event->end = min(event->end, vma->vm_end) & PAGE_MASK;
+	if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP | VM_HUGETLB))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	switch (event->etype) {
+	case HMM_DEVICE_WFAULT:
+		if (!(vma->vm_flags & VM_WRITE)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		/* fallthrough */
+	case HMM_DEVICE_RFAULT:
+		/* Handle the PROT_NONE case early on. */
+		if (!(vma->vm_flags & (VM_WRITE | VM_READ))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		ret = hmm_mirror_handle_fault(mirror, event, vma, &iter);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+out:
+	/* Drop the mmap_sem so anyone waiting on it has a chance. */
+	if (ret != -EBUSY)
+		up_read(&mirror->hmm->mm->mmap_sem);
+	wake_up(&mirror->hmm->wait_queue);
+	if (ret == -EAGAIN)
+		goto retry;
+	hmm_pt_iter_fini(&iter);
+	hmm_mirror_unref(&mirror);
+	return ret;
+}
+EXPORT_SYMBOL(hmm_mirror_fault);
+
 /* hmm_mirror_register() - register mirror against current process for a device.
  *
  * @mirror: The mirror struct being registered.
-- 
2.4.3