Message-ID: <Z91j_UwNhi2DQB3N@krava>
Date: Fri, 21 Mar 2025 14:05:01 +0100
From: Jiri Olsa <olsajiri@...il.com>
To: David Hildenbrand <david@...hat.com>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org,
	linux-arm-kernel@...ts.infradead.org,
	linux-trace-kernel@...r.kernel.org,
	linux-perf-users@...r.kernel.org,
	Andrew Morton <akpm@...ux-foundation.org>,
	Andrii Nakryiko <andrii.nakryiko@...il.com>,
	Matthew Wilcox <willy@...radead.org>,
	Russell King <linux@...linux.org.uk>,
	Masami Hiramatsu <mhiramat@...nel.org>,
	Oleg Nesterov <oleg@...hat.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	Arnaldo Carvalho de Melo <acme@...nel.org>,
	Namhyung Kim <namhyung@...nel.org>,
	Mark Rutland <mark.rutland@....com>,
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
	Ian Rogers <irogers@...gle.com>,
	Adrian Hunter <adrian.hunter@...el.com>,
	"Liang, Kan" <kan.liang@...ux.intel.com>,
	Tong Tiangen <tongtiangen@...wei.com>
Subject: Re: [PATCH v3 3/3] kernel/events/uprobes: uprobe_write_opcode()
 rewrite

On Fri, Mar 21, 2025 at 12:37:13PM +0100, David Hildenbrand wrote:

SNIP

> +static int __uprobe_write_opcode(struct vm_area_struct *vma,
> +		struct folio_walk *fw, struct folio *folio,
> +		unsigned long opcode_vaddr, uprobe_opcode_t opcode)
> +{
> +	const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
> +	const bool is_register = !!is_swbp_insn(&opcode);
> +	bool pmd_mappable;
> +
> +	/* For now, we'll only handle PTE-mapped folios. */
> +	if (fw->level != FW_LEVEL_PTE)
> +		return -EFAULT;
> +
> +	/*
> +	 * See can_follow_write_pte(): we'd actually prefer a writable PTE here,
> +	 * but the VMA might not be writable.
> +	 */
> +	if (!pte_write(fw->pte)) {
> +		if (!PageAnonExclusive(fw->page))
> +			return -EFAULT;
> +		if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
> +			return -EFAULT;
> +		/* SOFTDIRTY is handled via pte_mkdirty() below. */
> +	}
> +
> +	/*
> +	 * We'll temporarily unmap the page and flush the TLB, such that we can
> +	 * modify the page atomically.
> +	 */
> +	flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
> +	fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
> +	copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
> +
> +	/*
> +	 * When unregistering, we may only zap a PTE if uffd is disabled and
> +	 * there are no unexpected folio references ...
> +	 */
> +	if (is_register || userfaultfd_missing(vma) ||
> +	    (folio_ref_count(folio) != folio_mapcount(folio) + 1 +
> +	     folio_test_swapcache(folio) * folio_nr_pages(folio)))
> +		goto remap;
> +
> +	/*
> +	 * ... and the mapped page is identical to the original page that
> +	 * would get faulted in on next access.
> +	 */
> +	if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable))
> +		goto remap;
> +
> +	dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
> +	folio_remove_rmap_pte(folio, fw->page, vma);
> +	if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
> +	     folio_trylock(folio)) {
> +		folio_free_swap(folio);
> +		folio_unlock(folio);
> +	}
> +	folio_put(folio);

hi,
it's probably ok and I'm missing something, but why do we call folio_put
here? I'd think it's already done by the folio_put call in uprobe_write_opcode.

thanks,
jirka


> +
> +	return pmd_mappable;
> +remap:
> +	/*
> +	 * Make sure that our copy_to_page() changes become visible before the
> +	 * set_pte_at() write.
> +	 */
> +	smp_wmb();
> +	/* We modified the page. Make sure to mark the PTE dirty. */
> +	set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
> +	return 0;
> +}
> +
>  /*
>   * NOTE:
>   * Expect the breakpoint instruction to be the smallest size instruction for
> @@ -475,116 +480,115 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
>   * uprobe_write_opcode - write the opcode at a given virtual address.
>   * @auprobe: arch specific probepoint information.
>   * @vma: the probed virtual memory area.
> - * @vaddr: the virtual address to store the opcode.
> - * @opcode: opcode to be written at @vaddr.
> + * @opcode_vaddr: the virtual address to store the opcode.
> + * @opcode: opcode to be written at @opcode_vaddr.
>   *
>   * Called with mm->mmap_lock held for read or write.
>   * Return 0 (success) or a negative errno.
>   */
>  int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> -		unsigned long vaddr, uprobe_opcode_t opcode)
> +		const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
>  {
> +	const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct uprobe *uprobe;
> -	struct page *old_page, *new_page;
>  	int ret, is_register, ref_ctr_updated = 0;
> -	bool orig_page_huge = false;
>  	unsigned int gup_flags = FOLL_FORCE;
> +	struct mmu_notifier_range range;
> +	struct folio_walk fw;
> +	struct folio *folio;
> +	struct page *page;
>  
>  	is_register = is_swbp_insn(&opcode);
>  	uprobe = container_of(auprobe, struct uprobe, arch);
>  
> -retry:
> +	if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
> +		return -EINVAL;
> +
> +	/*
> +	 * When registering, we have to break COW to get an exclusive anonymous
> +	 * page that we can safely modify. Use FOLL_WRITE to trigger a write
> +	 * fault if required. When unregistering, we might be lucky and the
> +	 * anon page is already gone. So defer write faults until really
> +	 * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
> +	 * cannot deal with PMDs yet.
> +	 */
>  	if (is_register)
> -		gup_flags |= FOLL_SPLIT_PMD;
> -	/* Read the page with vaddr into memory */
> -	ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, NULL);
> -	if (ret != 1)
> -		return ret;
> +		gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
>  
> -	ret = verify_opcode(old_page, vaddr, &opcode);
> +retry:
> +	ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
>  	if (ret <= 0)
> -		goto put_old;
> -
> -	if (is_zero_page(old_page)) {
> -		ret = -EINVAL;
> -		goto put_old;
> -	}
> +		goto out;
> +	folio = page_folio(page);
>  
> -	if (WARN(!is_register && PageCompound(old_page),
> -		 "uprobe unregister should never work on compound page\n")) {
> -		ret = -EINVAL;
> -		goto put_old;
> +	ret = verify_opcode(page, opcode_vaddr, &opcode);
> +	if (ret <= 0) {
> +		folio_put(folio);
> +		goto out;
>  	}
>  
>  	/* We are going to replace instruction, update ref_ctr. */
>  	if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
>  		ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
> -		if (ret)
> -			goto put_old;
> +		if (ret) {
> +			folio_put(folio);
> +			goto out;
> +		}
>  
>  		ref_ctr_updated = 1;
>  	}
>  
>  	ret = 0;
> -	if (!is_register && !PageAnon(old_page))
> -		goto put_old;
> -
> -	ret = anon_vma_prepare(vma);
> -	if (ret)
> -		goto put_old;
> -
> -	ret = -ENOMEM;
> -	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
> -	if (!new_page)
> -		goto put_old;
> -
> -	__SetPageUptodate(new_page);
> -	copy_highpage(new_page, old_page);
> -	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
> +	if (unlikely(!folio_test_anon(folio))) {
> +		VM_WARN_ON_ONCE(is_register);
> +		folio_put(folio);
> +		goto out;
> +	}
>  
>  	if (!is_register) {
> -		struct page *orig_page;
> -		pgoff_t index;
> -
> -		VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
> -
> -		index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
> -		orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
> -					  index);
> -
> -		if (orig_page) {
> -			if (PageUptodate(orig_page) &&
> -			    pages_identical(new_page, orig_page)) {
> -				/* let go new_page */
> -				put_page(new_page);
> -				new_page = NULL;
> -
> -				if (PageCompound(orig_page))
> -					orig_page_huge = true;
> -			}
> -			put_page(orig_page);
> -		}
> +		/*
> +		 * In the common case, we'll be able to zap the page when
> +		 * unregistering. So trigger MMU notifiers now, as we won't
> +		 * be able to do it under PTL.
> +		 */
> +		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> +					vaddr, vaddr + PAGE_SIZE);
> +		mmu_notifier_invalidate_range_start(&range);
> +	}
> +
> +	ret = -EAGAIN;
> +	/* Walk the page tables again, to perform the actual update. */
> +	if (folio_walk_start(&fw, vma, vaddr, 0)) {
> +		if (fw.page == page)
> +			ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
> +		folio_walk_end(&fw, vma);
>  	}
>  
> -	ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
> -	if (new_page)
> -		put_page(new_page);
> -put_old:
> -	put_page(old_page);
> +	if (!is_register)
> +		mmu_notifier_invalidate_range_end(&range);
>  
> -	if (unlikely(ret == -EAGAIN))
> +	folio_put(folio);
> +	switch (ret) {
> +	case -EFAULT:
> +		gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
> +		fallthrough;
> +	case -EAGAIN:
>  		goto retry;
> +	default:
> +		break;
> +	}
>  
> +out:
>  	/* Revert back reference counter if instruction update failed. */
> -	if (ret && is_register && ref_ctr_updated)
> +	if (ret < 0 && is_register && ref_ctr_updated)
>  		update_ref_ctr(uprobe, mm, -1);
>  
>  	/* try collapse pmd for compound page */
> -	if (!ret && orig_page_huge)
> +	if (ret > 0)
>  		collapse_pte_mapped_thp(mm, vaddr, false);
>  
> -	return ret;
> +	return ret < 0 ? ret : 0;
>  }
>  
>  /**
> -- 
> 2.48.1
> 
