Message-Id: <20250111004618.1566-1-sj@kernel.org>
Date: Fri, 10 Jan 2025 16:46:18 -0800
From: SeongJae Park <sj@...nel.org>
To:
Cc: SeongJae Park <sj@...nel.org>,
Andrew Morton <akpm@...ux-foundation.org>,
Jens Axboe <axboe@...nel.dk>,
Pavel Begunkov <asml.silence@...il.com>,
damon@...ts.linux.dev,
io-uring@...r.kernel.org,
linux-kernel@...r.kernel.org,
linux-mm@...ck.org
Subject: [RFC PATCH] mm/madvise: remove redundant mmap_lock operations from process_madvise()
process_madvise() calls do_madvise() for each address range. Each
do_madvise() invocation then holds and releases the same mmap_lock.
Optimize out the redundant lock operations by taking the lock once in
process_madvise() and informing do_madvise() that the lock is already
held, so it can skip the locking.
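The pattern is the usual one of hoisting a lock out of a loop. Below is
a toy, compilable model of the change, with a pthread mutex standing in
for mmap_lock; all names here are illustrative, not the kernel's:

/*
 * Toy model only: a pthread mutex stands in for mmap_lock, and the
 * function names do not exist in the kernel.
 */
#include <pthread.h>

static pthread_mutex_t mm_lock = PTHREAD_MUTEX_INITIALIZER;

/* Models do_madvise(): locks only if the caller does not hold it. */
static void apply_one_range(int range, int lock_held)
{
	if (!lock_held)
		pthread_mutex_lock(&mm_lock);
	/* ... apply the advice to this one range ... */
	if (!lock_held)
		pthread_mutex_unlock(&mm_lock);
}

/* Before: one lock/unlock round trip per range. */
static void apply_vector_before(int nranges)
{
	for (int i = 0; i < nranges; i++)
		apply_one_range(i, 0);
}

/* After: a single round trip for the whole vector. */
static void apply_vector_after(int nranges)
{
	pthread_mutex_lock(&mm_lock);
	for (int i = 0; i < nranges; i++)
		apply_one_range(i, 1);
	pthread_mutex_unlock(&mm_lock);
}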
Evaluation
==========
I measured the time to apply MADV_DONTNEED advice to 256 MiB of memory
using multiple madvise() calls, 4 KiB per call. I also did the same
with process_madvise(), with the iovec size varying from 1 to 1024.
The source code for the measurement is available at GitHub [1].
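For reference, below is a minimal userspace sketch of the
process_madvise() side of such a measurement. It only approximates what
[1] does (single timing pass, no warmup, error handling elided), and it
assumes a kernel that accepts MADV_DONTNEED via process_madvise() and a
batch size that evenly divides the number of 4 KiB chunks:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <time.h>
#include <unistd.h>

#define TOTAL	(256UL << 20)	/* 256 MiB buffer */
#define CHUNK	(4UL << 10)	/* 4 KiB per range */

static long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000L + ts.tv_nsec;
}

int main(int argc, char *argv[])
{
	size_t batch = argc > 1 ? strtoul(argv[1], NULL, 10) : 1;
	char *buf = mmap(NULL, TOTAL, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int pidfd = syscall(SYS_pidfd_open, getpid(), 0);
	struct iovec *iov = calloc(batch, sizeof(*iov));
	long t;

	memset(buf, 1, TOTAL);	/* fault all pages in first */

	t = now_ns();
	for (size_t off = 0; off < TOTAL; off += CHUNK * batch) {
		for (size_t i = 0; i < batch; i++) {
			iov[i].iov_base = buf + off + i * CHUNK;
			iov[i].iov_len = CHUNK;
		}
		/* error handling elided for brevity */
		syscall(SYS_process_madvise, pidfd, iov, batch,
			MADV_DONTNEED, 0);
	}
	printf("batch %zu: %ld ns\n", batch, now_ns() - t);
	return 0;
}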
The measurement results are below. The 'sz_batches' column shows the
iovec size of the process_madvise() calls; '0' is for the madvise()
calls case. The 'before' and 'after' columns are the measured times, in
nanoseconds, to apply MADV_DONTNEED to the 256 MiB memory buffer on
kernels built without and with this patch, respectively, so lower
values mean better efficiency. The 'after/before' column is the ratio
of 'after' to 'before'.
sz_batches      before          after           after/before
0               124062365       96670188        0.779206393494111
1               136341258       113915688      	0.835518827323714
2               105314942       78898211        0.749164453796119
4               82012858        59778998        0.728897875989153
8               82562651        51003069        0.617749895167489
16              71474930        47575960        0.665631431888076
32              71391211        42902076        0.600943385033768
64              68225932        41337835        0.605896230190011
128             71053578        42467240        0.597679120395598
256             85094126        41630463        0.489228398679364
512             68531628        44049763        0.6427654542221
1024            79338892        43370866        0.546653285755491
The measurement shows this patch reduces process_madvise() latency, and
the reduction grows with the batch size: from about 25% with a batch
size of 2 to about 45% with a batch size of 1,024. The trend is roughly
what we would expect.
Interestingly, this patch also speeds up the plain madvise() case and
the batch-size-1 process_madvise() case, even though those cases take
and release the lock the same number of times as before. I ran the test
multiple times, and the results are consistent. I'm still investigating
whether there is something I'm missing, but I believe that
investigation need not block this RFC, so I'm posting it now. I will
post updates on the madvise() and batch-size-1 process_madvise()
investigation later.
[1] https://github.com/sjp38/eval_proc_madvise
Signed-off-by: SeongJae Park <sj@...nel.org>
---
include/linux/mm.h | 3 ++-
io_uring/advise.c | 2 +-
mm/damon/vaddr.c | 2 +-
mm/madvise.c | 54 +++++++++++++++++++++++++++++++++++-----------
4 files changed, 45 insertions(+), 16 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 612b513ebfbd..e3ca5967ebd4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3459,7 +3459,8 @@ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long end, struct list_head *uf, bool unlock);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
-extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
+extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in,
+ int behavior, bool lock_held);
#ifdef CONFIG_MMU
extern int __mm_populate(unsigned long addr, unsigned long len,
diff --git a/io_uring/advise.c b/io_uring/advise.c
index cb7b881665e5..010b55d5a26e 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -56,7 +56,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
- ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
+ ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice, false);
io_req_set_res(req, ret, 0);
return IOU_OK;
#else
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index a6174f725bd7..30b5a251d73e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -646,7 +646,7 @@ static unsigned long damos_madvise(struct damon_target *target,
if (!mm)
return 0;
- applied = do_madvise(mm, start, len, behavior) ? 0 : len;
+ applied = do_madvise(mm, start, len, behavior, false) ? 0 : len;
mmput(mm);
return applied;
diff --git a/mm/madvise.c b/mm/madvise.c
index 49f3a75046f6..c107376db9d5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1637,7 +1637,8 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
* -EAGAIN - a kernel resource was temporarily unavailable.
* -EPERM - memory is sealed.
*/
-int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in,
+ int behavior, bool lock_held)
{
unsigned long end;
int error;
@@ -1668,12 +1669,14 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
return madvise_inject_error(behavior, start, start + len_in);
#endif
- write = madvise_need_mmap_write(behavior);
- if (write) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
+ if (!lock_held) {
+ write = madvise_need_mmap_write(behavior);
+ if (write) {
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ } else {
+ mmap_read_lock(mm);
+ }
}
start = untagged_addr_remote(mm, start);
@@ -1692,17 +1695,19 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
}
blk_finish_plug(&plug);
- if (write)
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
+ if (!lock_held) {
+ if (write)
+ mmap_write_unlock(mm);
+ else
+ mmap_read_unlock(mm);
+ }
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
- return do_madvise(current->mm, start, len_in, behavior);
+ return do_madvise(current->mm, start, len_in, behavior, false);
}
/* Perform an madvise operation over a vector of addresses and lengths. */
@@ -1711,12 +1716,28 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
{
ssize_t ret = 0;
size_t total_len;
+ bool hold_lock = true;
+ int write;
total_len = iov_iter_count(iter);
+#ifdef CONFIG_MEMORY_FAILURE
+ if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
+ hold_lock = false;
+#endif
+ if (hold_lock) {
+ write = madvise_need_mmap_write(behavior);
+ if (write) {
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ } else {
+ mmap_read_lock(mm);
+ }
+ }
+
while (iov_iter_count(iter)) {
ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
- iter_iov_len(iter), behavior);
+ iter_iov_len(iter), behavior, hold_lock);
/*
* An madvise operation is attempting to restart the syscall,
* but we cannot proceed as it would not be correct to repeat
@@ -1739,6 +1760,13 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
iov_iter_advance(iter, iter_iov_len(iter));
}
+ if (hold_lock) {
+ if (write)
+ mmap_write_unlock(mm);
+ else
+ mmap_read_unlock(mm);
+ }
+
ret = (total_len - iov_iter_count(iter)) ? : ret;
return ret;
--
2.39.5