Date: Thu, 2 May 2024 05:59:50 +0200
From: Guillaume Morin <guillaume@...infr.org>
To: David Hildenbrand <david@...hat.com>
Cc: Guillaume Morin <guillaume@...infr.org>, oleg@...hat.com,
	linux-kernel@...r.kernel.org, linux-trace-kernel@...r.kernel.org,
	muchun.song@...ux.dev
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 30 Apr 21:25, David Hildenbrand wrote:
> > I tried to get the hugepd stuff right but this was the first I heard
> > about it :-) Afaict follow_huge_pmd and friends were already DTRT
> 
> I'll have to have a closer look at some details (the hugepd writability
> check looks a bit odd), but it's mostly what I would have expected!

Ok, in the meantime, here is the uprobe change on top of your current
uprobes_cow, trying to address the comments you made in your previous
message. Some of them were not 100% clear to me, so it's a best-effort
patch :-) Again, lightly tested.

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,6 +83,10 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
 	{}
 };
 
+bool hugetlbfs_mapping(struct address_space *mapping) {
+	return mapping->a_ops == &hugetlbfs_aops;
+}
+
 /*
  * Mask used when checking the page offset value passed in via system
  * calls.  This value will be converted to a loff_t which is signed.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -511,6 +511,8 @@ struct hugetlbfs_sb_info {
 	umode_t mode;
 };
 
+bool hugetlbfs_mapping(struct address_space *mapping);
+
 static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -557,6 +559,8 @@ static inline struct hstate *hstate_inode(struct inode *i)
 {
 	return NULL;
 }
+
+static inline bool hugetlbfs_mapping(struct address_space *mapping) { return false; }
 #endif /* !CONFIG_HUGETLBFS */
 
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/highmem.h>
+#include <linux/hugetlb.h>
 #include <linux/pagemap.h>	/* read_mapping_page */
 #include <linux/slab.h>
 #include <linux/sched.h>
@@ -120,7 +121,7 @@ struct xol_area {
  */
 static bool valid_vma(struct vm_area_struct *vma, bool is_register)
 {
-	vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+	vm_flags_t flags = VM_MAYEXEC | VM_MAYSHARE;
 
 	if (is_register)
 		flags |= VM_WRITE;
@@ -177,6 +178,19 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
 	kunmap_atomic(kaddr);
 }
 
+static bool compare_pages(struct page *page1, struct page *page2, unsigned long page_size)
+{
+	char *addr1, *addr2;
+	int ret;
+
+	addr1 = kmap_local_page(page1);
+	addr2 = kmap_local_page(page2);
+	ret = memcmp(addr1, addr2, page_size);
+	kunmap_local(addr2);
+	kunmap_local(addr1);
+	return ret == 0;
+}
+
 static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
 {
 	uprobe_opcode_t old_opcode;
@@ -366,7 +380,9 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
 }
 
 static bool orig_page_is_identical(struct vm_area_struct *vma,
-		unsigned long vaddr, struct page *page, bool *large)
+		unsigned long vaddr, struct page *page,
+		unsigned long page_size,
+		bool *large)
 {
 	const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
 	struct page *orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
@@ -380,7 +396,7 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
 
 	*large = folio_test_large(orig_folio);
 	identical = folio_test_uptodate(orig_folio) &&
-		    pages_identical(page, orig_page);
+		    compare_pages(page, orig_page, page_size);
 	folio_put(orig_folio);
 	return identical;
 }
@@ -396,6 +412,81 @@ struct uwo_data {
 	uprobe_opcode_t opcode;
 };
 
+static int __write_opcode_hugetlb(pte_t *ptep, unsigned long page_mask,
+				  unsigned long vaddr,
+				  unsigned long next, struct mm_walk *walk)
+{
+	struct uwo_data *data = walk->private;
+	const bool is_register = !!is_swbp_insn(&data->opcode);
+	pte_t pte = huge_ptep_get(ptep);
+	struct folio *folio;
+	struct page *page;
+	bool large;
+	struct hstate *h = hstate_vma(walk->vma);
+	unsigned int subpage_index = (vaddr & (huge_page_size(h) - 1)) >>
+		PAGE_SHIFT;
+
+	if (!pte_present(pte))
+		return UWO_RETRY;
+	page = vm_normal_page(walk->vma, vaddr, pte);
+	if (!page)
+		return UWO_RETRY;
+	folio = page_folio(page);
+
+	/* When unregistering and there is no anon folio anymore, we're done. */
+	if (!folio_test_anon(folio))
+		return is_register ? UWO_RETRY_WRITE_FAULT : UWO_DONE;
+
+	/*
+	 * See can_follow_write_pte(): we'd actually prefer requiring a
+	 * writable PTE here, but when unregistering we might no longer have
+	 * VM_WRITE ...
+	 */
+	if (!huge_pte_write(pte)) {
+		if (!PageAnonExclusive(page))
+			return UWO_RETRY_WRITE_FAULT;
+		if (unlikely(userfaultfd_wp(walk->vma) && huge_pte_uffd_wp(pte)))
+			return UWO_RETRY_WRITE_FAULT;
+		/* SOFTDIRTY is handled via pte_mkdirty() below. */
+	}
+
+	/* Unmap + flush the TLB, such that we can write atomically. */
+	flush_cache_page(walk->vma, vaddr & page_mask, pte_pfn(pte));
+	pte = huge_ptep_clear_flush(walk->vma, vaddr & page_mask, ptep);
+	copy_to_page(nth_page(page, subpage_index), data->opcode_vaddr,
+		     &data->opcode, UPROBE_SWBP_INSN_SIZE);
+
+	/*
+	 * When unregistering, we may only zap a PTE if uffd is disabled and
+	 * the folio is not pinned ...
+	 */
+	if (is_register || userfaultfd_missing(walk->vma) ||
+	    folio_maybe_dma_pinned(folio))
+		goto remap;
+
+	/*
+	 * ... the mapped anon page is identical to the original page (that
+	 * will get faulted in on next access), and we don't have GUP pins.
+	 */
+	if (!orig_page_is_identical(walk->vma, vaddr & page_mask, page,
+				    huge_page_size(h), &large))
+		goto remap;
+
+	hugetlb_remove_rmap(folio);
+	folio_put(folio);
+	return UWO_DONE;
+remap:
+	/*
+	 * Make sure that our copy_to_page() changes become visible before the
+	 * set_huge_pte_at() write.
+	 */
+	smp_wmb();
+	/* We modified the page. Make sure to mark the PTE dirty. */
+	set_huge_pte_at(walk->mm, vaddr & page_mask, ptep,
+			huge_pte_mkdirty(pte), huge_page_size(h));
+	return UWO_DONE;
+}
+
 static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
 		unsigned long next, struct mm_walk *walk)
 {
@@ -447,7 +538,7 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
 	 * ... the mapped anon page is identical to the original page (that
 	 * will get faulted in on next access), and we don't have GUP pins.
 	 */
-	if (!orig_page_is_identical(walk->vma, vaddr, page, &large))
+	if (!orig_page_is_identical(walk->vma, vaddr, page, PAGE_SIZE, &large))
 		goto remap;
 
 	/* Zap it and try to reclaim swap space. */
@@ -473,6 +564,7 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
 }
 
 static const struct mm_walk_ops write_opcode_ops = {
+	.hugetlb_entry		= __write_opcode_hugetlb,
 	.pte_entry		= __write_opcode_pte,
 	.walk_lock		= PGWALK_WRLOCK,
 };
@@ -510,6 +602,8 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 	struct mmu_notifier_range range;
 	int ret, ref_ctr_updated = 0;
 	struct page *page;
+	unsigned long page_size = PAGE_SIZE;
+	unsigned long page_mask = PAGE_MASK;
 
 	if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
 		return -EINVAL;
@@ -528,6 +622,11 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 	if (ret != 1)
 		goto out;
 
+	if (is_vm_hugetlb_page(vma)) {
+		struct hstate *h = hstate_vma(vma);
+		page_size = huge_page_size(h);
+		page_mask = huge_page_mask(h);
+	}
 	ret = verify_opcode(page, opcode_vaddr, &opcode);
 	put_page(page);
 	if (ret <= 0)
@@ -547,8 +646,9 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		 * unregistering. So trigger MMU notifiers now, as we won't
 		 * be able to do it under PTL.
 		 */
+		const unsigned long start = vaddr & page_mask;
 		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
-					vaddr, vaddr + PAGE_SIZE);
+					start, start + page_size);
 		mmu_notifier_invalidate_range_start(&range);
 	}
 
@@ -830,8 +930,16 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
 	 */
 	if (mapping->a_ops->read_folio)
 		page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
-	else
+	else if (!is_file_hugepages(filp))
 		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+	else {
+		struct hstate *h = hstate_file(filp);
+		unsigned long mask = huge_page_mask(h);
+		page = find_get_page(mapping, (offset & mask) >> PAGE_SHIFT);
+		if (!page)
+			return -ENOENT;
+		page = nth_page(page, (offset & (huge_page_size(h) - 1)) >> PAGE_SHIFT);
+	}
 	if (IS_ERR(page))
 		return PTR_ERR(page);
 
@@ -1182,9 +1290,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
 	if (!uc->handler && !uc->ret_handler)
 		return -EINVAL;
 
-	/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+	/* copy_insn() uses read_mapping_page() or shmem/hugetlbfs-specific
+	 * logic
+	 */
 	if (!inode->i_mapping->a_ops->read_folio &&
-	    !shmem_mapping(inode->i_mapping))
+	    !shmem_mapping(inode->i_mapping) &&
+	    !hugetlbfs_mapping(inode->i_mapping))
 		return -EIO;
 	/* Racy, just to catch the obvious mistakes */
 	if (offset > i_size_read(inode))

-- 
Guillaume Morin <guillaume@...infr.org>
