lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20260130132809.59707-1-chao@kernel.org>
Date: Fri, 30 Jan 2026 21:28:08 +0800
From: Chao Yu <chao@...nel.org>
To: jaegeuk@...nel.org
Cc: linux-f2fs-devel@...ts.sourceforge.net,
	linux-kernel@...r.kernel.org,
	Chao Yu <chao@...nel.org>
Subject: [PATCH 1/2] f2fs: fix lock priority inversion issue

If a userspace thread holds an f2fs rw semaphore, its low priority may
leave it in the runnable or preempted state for a long time. During that
time it blocks any high priority thread which is trying to grab the same
rw semaphore, e.g. cp_rwsem, io_rwsem...

To fix this issue, check the thread's priority when it tries to grab an
f2fs_rwsem lock: if the priority is lower than a priority threshold,
uplift the priority before the thread enters the critical region of the
lock, and restore the priority after it leaves the critical region.

Meanwhile, introduce two new sysfs nodes:
- /sys/fs/f2fs/<disk>/adjust_lock_priority, which is used to control
whether the functionality is enabled or not.
==========     ==================
Flag_Value     Flag_Description
==========     ==================
0x00000000     Disabled (default)
0x00000001     cp_rwsem
0x00000002     node_change
0x00000004     node_write
0x00000008     gc_lock
0x00000010     cp_global
0x00000020     io_rwsem
==========     ==================
- /sys/fs/f2fs/<disk>/lock_duration_priority, which is used to control
the priority threshold.

Signed-off-by: Chao Yu <chao@...nel.org>
---
 Documentation/ABI/testing/sysfs-fs-f2fs | 24 +++++++++
 fs/f2fs/checkpoint.c                    | 66 ++++++++++++++++++++++++-
 fs/f2fs/f2fs.h                          | 12 +++++
 fs/f2fs/super.c                         |  2 +
 fs/f2fs/sysfs.c                         | 18 +++++++
 5 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 9a8ec2290f68..ea6474db8a31 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -963,3 +963,27 @@ Description:	This sysfs entry can be used to change type of injected timeout:
 		0x00000003     Simulate Non-IO type sleep time
 		0x00000004     Simulate runnable time
 		==========     ===============================
+
+What:		/sys/fs/f2fs/<disk>/adjust_lock_priority
+Date:		January 2026
+Contact:	"Chao Yu" <chao@...nel.org>
+Description:	This sysfs entry can be used to enable/disable priority adjustment for a
+		task which is in a critical region covered by a lock.
+		==========     ==================
+		Flag_Value     Flag_Description
+		==========     ==================
+		0x00000000     Disabled (default)
+		0x00000001     cp_rwsem
+		0x00000002     node_change
+		0x00000004     node_write
+		0x00000008     gc_lock
+		0x00000010     cp_global
+		0x00000020     io_rwsem
+		==========     ==================
+
+What:		/sys/fs/f2fs/<disk>/lock_duration_priority
+Date:		January 2026
+Contact:	"Chao Yu" <chao@...nel.org>
+Description:	f2fs can tune the priority of a thread which has entered a critical region covered
+		by an f2fs rwsemaphore lock. This sysfs entry can be used to control the priority
+		value; the range is [100,139], and the default value is 120.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 5172396c0b01..2f5a03e29d0b 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -90,16 +90,72 @@ static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
 			runnable_time, io_sleep_time, other_time);
 }
 
+static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write)
+{
+	if (!(sem->sbi->adjust_lock_priority & BIT(sem->name - 1)))
+		return false;	/* bit (name - 1) not set: uplift is off for this lock */
+
+	switch (sem->name) {
+	/*
+	 * The writer of these locks is checkpoint, which already has high
+	 * priority, so only readers get their priority uplifted.
+	 */
+	case LOCK_NAME_CP_RWSEM:
+	case LOCK_NAME_NODE_CHANGE:
+	case LOCK_NAME_NODE_WRITE:
+		return !is_write;
+	case LOCK_NAME_GC_LOCK:
+	case LOCK_NAME_CP_GLOBAL:
+	case LOCK_NAME_IO_RWSEM:
+		return true;	/* both readers and writers are uplifted */
+	default:
+		f2fs_bug_on(sem->sbi, 1);	/* lock name not handled above */
+	}
+	return false;
+}
+
+static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
+						bool is_write)
+{
+	lc->need_restore = false;	/* set to true only if nice is changed below */
+	if (!sem->sbi->adjust_lock_priority)
+		return;	/* no lock type is enabled in the bitmask */
+	if (rt_task(current))
+		return;	/* tasks for which rt_task() is true are never adjusted */
+	if (!need_uplift_priority(sem, is_write))
+		return;
+	lc->orig_nice = task_nice(current);
+	lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority);
+	if (lc->orig_nice <= lc->new_nice)
+		return;	/* current nice is already at or below the target nice */
+	set_user_nice(current, lc->new_nice);
+	lc->need_restore = true;
+}
+
+static void restore_priority(struct f2fs_lock_context *lc)
+{
+	if (!lc->need_restore)
+		return;	/* uplift_priority() did not change the nice value */
+	/* nice differs from what was set: someone has updated it, keep theirs */
+	if (task_nice(current) != lc->new_nice)
+		return;
+	set_user_nice(current, lc->orig_nice);
+}
+
 void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
+	uplift_priority(sem, lc, false);	/* as reader, before taking the lock */
 	f2fs_down_read(sem);
 	trace_lock_elapsed_time_start(sem, lc);
 }
 
 int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
-	if (!f2fs_down_read_trylock(sem))
+	uplift_priority(sem, lc, false);
+	if (!f2fs_down_read_trylock(sem)) {
+		restore_priority(lc);	/* lock not taken: undo the uplift */
 		return 0;
+	}
 	trace_lock_elapsed_time_start(sem, lc);
 	return 1;
 }
@@ -107,19 +163,24 @@ int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_contex
 void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
 	f2fs_up_read(sem);
+	restore_priority(lc);	/* lock released: undo any uplift recorded in lc */
 	trace_lock_elapsed_time_end(sem, lc, false);
 }
 
 void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
+	uplift_priority(sem, lc, true);	/* as writer, before taking the lock */
 	f2fs_down_write(sem);
 	trace_lock_elapsed_time_start(sem, lc);
 }
 
 int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
-	if (!f2fs_down_write_trylock(sem))
+	uplift_priority(sem, lc, true);
+	if (!f2fs_down_write_trylock(sem)) {
+		restore_priority(lc);	/* lock not taken: undo the uplift */
 		return 0;
+	}
 	trace_lock_elapsed_time_start(sem, lc);
 	return 1;
 }
@@ -127,6 +188,7 @@ int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_conte
 void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
 	f2fs_up_write(sem);
+	restore_priority(lc);	/* lock released: undo any uplift recorded in lc */
 	trace_lock_elapsed_time_end(sem, lc, true);
 }
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 29f81a496b72..a6e7368fc40a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -185,6 +185,7 @@ enum f2fs_lock_name {
 	LOCK_NAME_GC_LOCK,
 	LOCK_NAME_CP_GLOBAL,
 	LOCK_NAME_IO_RWSEM,
+	LOCK_NAME_MAX,
 };
 
 enum f2fs_timeout_type {
@@ -1447,7 +1448,10 @@ struct f2fs_time_stat {
 
 struct f2fs_lock_context {
 	struct f2fs_time_stat ts;
+	int orig_nice;	/* task nice value sampled in uplift_priority() */
+	int new_nice;	/* nice value derived from lock_duration_priority */
 	bool lock_trace;
+	bool need_restore;	/* true if uplift_priority() changed the nice value */
 };
 
 struct f2fs_gc_control {
@@ -1588,6 +1592,8 @@ enum node_type {
 /* a threshold of maximum elapsed time in critical region to print tracepoint */
 #define MAX_LOCK_ELAPSED_TIME		500
 
+#define F2FS_DEFAULT_TASK_PRIORITY		(DEFAULT_PRIO)
+
 static inline int f2fs_test_bit(unsigned int nr, char *addr);
 static inline void f2fs_set_bit(unsigned int nr, char *addr);
 static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1998,6 +2004,12 @@ struct f2fs_sb_info {
 	/* max elapsed time threshold in critical region that lock covered */
 	unsigned long long max_lock_elapsed_time;
 
+	/* enable/disable to adjust task priority in critical region covered by lock */
+	unsigned int adjust_lock_priority;
+
+	/* adjust priority for task which is in critical region covered by lock */
+	unsigned int lock_duration_priority;
+
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 	struct kmem_cache *page_array_slab;	/* page array entry */
 	unsigned int page_array_slab_size;	/* default page array slab size */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 9d421a07d2d5..d5cf7265e5d3 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -4338,6 +4338,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 	spin_lock_init(&sbi->gc_remaining_trials_lock);
 	atomic64_set(&sbi->current_atomic_write, 0);
 	sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME;
+	sbi->adjust_lock_priority = 0;
+	sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY;
 
 	sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ?
 		4096 : sbi->blocksize;
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index d01a2664a250..3a272e7edf23 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -955,6 +955,20 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
 		return count;
 	}
 
+	if (!strcmp(a->attr.name, "adjust_lock_priority")) {
+		if (t >= BIT(LOCK_NAME_MAX - 1))
+			return -EINVAL;
+		sbi->adjust_lock_priority = t;
+		return count;
+	}
+
+	if (!strcmp(a->attr.name, "lock_duration_priority")) {
+		if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE))
+			return -EINVAL;
+		sbi->lock_duration_priority = t;
+		return count;
+	}
+
 	__sbi_store_value(a, sbi, ptr + a->offset, t);
 
 	return count;
@@ -1272,6 +1286,8 @@ F2FS_SBI_GENERAL_RW_ATTR(carve_out);
 F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
 F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware);
 F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time);
+F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority);
+F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority);
 
 /* STAT_INFO ATTR */
 #ifdef CONFIG_F2FS_STAT_FS
@@ -1478,6 +1494,8 @@ static struct attribute *f2fs_attrs[] = {
 	ATTR_LIST(allocate_section_hint),
 	ATTR_LIST(allocate_section_policy),
 	ATTR_LIST(max_lock_elapsed_time),
+	ATTR_LIST(lock_duration_priority),
+	ATTR_LIST(adjust_lock_priority),
 	NULL,
 };
 ATTRIBUTE_GROUPS(f2fs);
-- 
2.40.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ