Message-ID: <20251015114952.4014352-1-leonylgao@gmail.com>
Date: Wed, 15 Oct 2025 19:49:52 +0800
From: Yongliang Gao <leonylgao@...il.com>
To: rostedt@...dmis.org,
mhiramat@...nel.org,
mathieu.desnoyers@...icios.com
Cc: linux-kernel@...r.kernel.org,
linux-trace-kernel@...r.kernel.org,
Yongliang Gao <leonylgao@...cent.com>,
Huang Cun <cunhuang@...cent.com>
Subject: [PATCH] trace/pid_list: reduce pid_list->lock contention
From: Yongliang Gao <leonylgao@...cent.com>
On systems with many cores and frequent task switching, setting
set_ftrace_pid causes heavy pid_list->lock contention and high sys
CPU usage, because trace_pid_list_is_set() takes the lock on every
context switch.

For example, in a vmcore from a 288-core machine, we found 267 CPUs
spinning on pid_list->lock:
#4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e
#5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36
#6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554
#7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288
#8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe
#9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161
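The read path (trace_pid_list_is_set, trace_pid_list_next) never
modifies the pid list, so converting the raw spinlock to a rwlock_t
lets all readers proceed in parallel and only serializes the rare
writers (set/clear and the irq_work refill). For background, here is
a minimal userspace sketch of that read-mostly pattern, using
pthread_rwlock_t as a stand-in for the kernel's rwlock_t; all names
below are illustrative, not kernel code:

/* Userspace analogue of the rwlock conversion (illustrative only). */
#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t pid_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool pid_bits[32768];

/* Hot path, hit on every context switch: readers run concurrently. */
static bool pid_is_set(unsigned int pid)
{
	bool ret;

	pthread_rwlock_rdlock(&pid_lock);
	ret = pid_bits[pid];
	pthread_rwlock_unlock(&pid_lock);
	return ret;
}

/* Cold path, hit when set_ftrace_pid is written: writer is exclusive. */
static void pid_set(unsigned int pid)
{
	pthread_rwlock_wrlock(&pid_lock);
	pid_bits[pid] = true;
	pthread_rwlock_unlock(&pid_lock);
}

int main(void)
{
	pid_set(1234);
	return !pid_is_set(1234);
}

With a plain spinlock, every one of those 267 context-switching CPUs
serializes on the same cache line even though none of them writes;
with the rwlock they only contend with the occasional writer.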
Signed-off-by: Yongliang Gao <leonylgao@...cent.com>
Reviewed-by: Huang Cun <cunhuang@...cent.com>
---
kernel/trace/pid_list.c | 26 +++++++++++++-------------
kernel/trace/pid_list.h | 2 +-
2 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 090bb5ea4a19..62082a4f60db 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -138,14 +138,14 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return false;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
+ read_lock_irqsave(&pid_list->lock, flags);
upper_chunk = pid_list->upper[upper1];
if (upper_chunk) {
lower_chunk = upper_chunk->data[upper2];
if (lower_chunk)
ret = test_bit(lower, lower_chunk->data);
}
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ read_unlock_irqrestore(&pid_list->lock, flags);
return ret;
}
@@ -177,7 +177,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return -EINVAL;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_lock_irqsave(&pid_list->lock, flags);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk) {
upper_chunk = get_upper_chunk(pid_list);
@@ -199,7 +199,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
set_bit(lower, lower_chunk->data);
ret = 0;
out:
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ write_unlock_irqrestore(&pid_list->lock, flags);
return ret;
}
@@ -229,7 +229,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return -EINVAL;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
+ write_lock_irqsave(&pid_list->lock, flags);
upper_chunk = pid_list->upper[upper1];
if (!upper_chunk)
goto out;
@@ -250,7 +250,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
}
}
out:
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ write_unlock_irqrestore(&pid_list->lock, flags);
return 0;
}
@@ -282,7 +282,7 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
if (pid_split(pid, &upper1, &upper2, &lower) < 0)
return -EINVAL;
- raw_spin_lock_irqsave(&pid_list->lock, flags);
+ read_lock_irqsave(&pid_list->lock, flags);
for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
upper_chunk = pid_list->upper[upper1];
@@ -302,7 +302,7 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
}
found:
- raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ read_unlock_irqrestore(&pid_list->lock, flags);
if (upper1 > UPPER_MASK)
return -1;
@@ -339,10 +339,10 @@ static void pid_list_refill_irq(struct irq_work *iwork)
int lcnt = 0;
again:
- raw_spin_lock(&pid_list->lock);
+ write_lock(&pid_list->lock);
upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
- raw_spin_unlock(&pid_list->lock);
+ write_unlock(&pid_list->lock);
if (upper_count <= 0 && lower_count <= 0)
return;
@@ -369,7 +369,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
lcnt++;
}
- raw_spin_lock(&pid_list->lock);
+ write_lock(&pid_list->lock);
if (upper) {
*upper_next = pid_list->upper_list;
pid_list->upper_list = upper;
@@ -380,7 +380,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
pid_list->lower_list = lower;
pid_list->free_lower_chunks += lcnt;
}
- raw_spin_unlock(&pid_list->lock);
+ write_unlock(&pid_list->lock);
/*
* On success of allocating all the chunks, both counters
@@ -418,7 +418,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
- raw_spin_lock_init(&pid_list->lock);
+ rwlock_init(&pid_list->lock);
for (i = 0; i < CHUNK_ALLOC; i++) {
union upper_chunk *chunk;
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
index 62e73f1ac85f..da200834f4ad 100644
--- a/kernel/trace/pid_list.h
+++ b/kernel/trace/pid_list.h
@@ -76,7 +76,7 @@ union upper_chunk {
};
struct trace_pid_list {
- raw_spinlock_t lock;
+ rwlock_t lock;
struct irq_work refill_irqwork;
union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size
union upper_chunk *upper_list;
--
2.43.5