lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20251015114952.4014352-1-leonylgao@gmail.com>
Date: Wed, 15 Oct 2025 19:49:52 +0800
From: Yongliang Gao <leonylgao@...il.com>
To: rostedt@...dmis.org,
	mhiramat@...nel.org,
	mathieu.desnoyers@...icios.com
Cc: linux-kernel@...r.kernel.org,
	linux-trace-kernel@...r.kernel.org,
	Yongliang Gao <leonylgao@...cent.com>,
	Huang Cun <cunhuang@...cent.com>
Subject: [PATCH] trace/pid_list: optimize pid_list->lock contention

From: Yongliang Gao <leonylgao@...cent.com>

When the system has many cores and task switching is frequent,
setting set_ftrace_pid can cause heavy pid_list->lock contention
and high system (sys) CPU usage.

For example, in a vmcore from an environment with 288 cores, we found
267 CPUs contending on pid_list->lock.

 #4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e
 #5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36
 #6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554
 #7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288
 #8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe
 #9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161

Signed-off-by: Yongliang Gao <leonylgao@...cent.com>
Reviewed-by: Huang Cun <cunhuang@...cent.com>
---
 kernel/trace/pid_list.c | 26 +++++++++++++-------------
 kernel/trace/pid_list.h |  2 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 090bb5ea4a19..62082a4f60db 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -138,14 +138,14 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return false;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	read_lock_irqsave(&pid_list->lock, flags);
 	upper_chunk = pid_list->upper[upper1];
 	if (upper_chunk) {
 		lower_chunk = upper_chunk->data[upper2];
 		if (lower_chunk)
 			ret = test_bit(lower, lower_chunk->data);
 	}
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+	read_unlock_irqrestore(&pid_list->lock, flags);
 
 	return ret;
 }
@@ -177,7 +177,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	write_lock_irqsave(&pid_list->lock, flags);
 	upper_chunk = pid_list->upper[upper1];
 	if (!upper_chunk) {
 		upper_chunk = get_upper_chunk(pid_list);
@@ -199,7 +199,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
 	set_bit(lower, lower_chunk->data);
 	ret = 0;
  out:
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+	write_unlock_irqrestore(&pid_list->lock, flags);
 	return ret;
 }
 
@@ -229,7 +229,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	write_lock_irqsave(&pid_list->lock, flags);
 	upper_chunk = pid_list->upper[upper1];
 	if (!upper_chunk)
 		goto out;
@@ -250,7 +250,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
 		}
 	}
  out:
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+	write_unlock_irqrestore(&pid_list->lock, flags);
 	return 0;
 }
 
@@ -282,7 +282,7 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	read_lock_irqsave(&pid_list->lock, flags);
 	for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
 		upper_chunk = pid_list->upper[upper1];
 
@@ -302,7 +302,7 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
 	}
 
  found:
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+	read_unlock_irqrestore(&pid_list->lock, flags);
 	if (upper1 > UPPER_MASK)
 		return -1;
 
@@ -339,10 +339,10 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 	int lcnt = 0;
 
  again:
-	raw_spin_lock(&pid_list->lock);
+	write_lock(&pid_list->lock);
 	upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
 	lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
-	raw_spin_unlock(&pid_list->lock);
+	write_unlock(&pid_list->lock);
 
 	if (upper_count <= 0 && lower_count <= 0)
 		return;
@@ -369,7 +369,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 		lcnt++;
 	}
 
-	raw_spin_lock(&pid_list->lock);
+	write_lock(&pid_list->lock);
 	if (upper) {
 		*upper_next = pid_list->upper_list;
 		pid_list->upper_list = upper;
@@ -380,7 +380,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 		pid_list->lower_list = lower;
 		pid_list->free_lower_chunks += lcnt;
 	}
-	raw_spin_unlock(&pid_list->lock);
+	write_unlock(&pid_list->lock);
 
 	/*
 	 * On success of allocating all the chunks, both counters
@@ -418,7 +418,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
 
 	init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
 
-	raw_spin_lock_init(&pid_list->lock);
+	rwlock_init(&pid_list->lock);
 
 	for (i = 0; i < CHUNK_ALLOC; i++) {
 		union upper_chunk *chunk;
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
index 62e73f1ac85f..da200834f4ad 100644
--- a/kernel/trace/pid_list.h
+++ b/kernel/trace/pid_list.h
@@ -76,7 +76,7 @@ union upper_chunk {
 };
 
 struct trace_pid_list {
-	raw_spinlock_t			lock;
+	rwlock_t			lock;
 	struct irq_work			refill_irqwork;
 	union upper_chunk		*upper[UPPER1_SIZE]; // 1 or 2K in size
 	union upper_chunk		*upper_list;
-- 
2.43.5


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ