[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220322120834.98637-3-zhouchengming@bytedance.com>
Date: Tue, 22 Mar 2022 20:08:30 +0800
From: Chengming Zhou <zhouchengming@...edance.com>
To: peterz@...radead.org, mingo@...hat.com, acme@...nel.org,
mark.rutland@....com, alexander.shishkin@...ux.intel.com,
jolsa@...nel.org, namhyung@...nel.org, eranian@...gle.com
Cc: linux-perf-users@...r.kernel.org, linux-kernel@...r.kernel.org,
duanxiongchun@...edance.com, songmuchun@...edance.com,
Chengming Zhou <zhouchengming@...edance.com>
Subject: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup
Although we no longer have the inconsistency problem, we can still
hit another problem like this:
CPU1                                    CPU2
(in context_switch)                     (attach running task)
                                        prev->cgroups = cgrp2
perf_cgroup_sched_switch(prev, next)
    cgrp2 == cgrp2 is True
If the perf_cgroup of the prev task changes from cgrp1 to cgrp2,
perf_cgroup_sched_switch() will skip perf_cgroup_switch(),
so the CPU keeps the cgrp1 events scheduled, although it should
schedule the cgrp2 events instead.
The reason for this problem is that we shouldn't use the changeable
prev->cgroups to decide whether to skip perf_cgroup_switch().
This patch introduces a percpu perf_cgroup to cache the perf_cgroup
that is scheduled in the cpuctxes, which is later compared with the
perf_cgroup of the next task to decide whether to skip perf_cgroup_switch().
Since perf_cgroup_switch() can be called after the context switch,
the cgroup events might be scheduled already. So we put the comparison
of perf_cgroups in perf_cgroup_switch(), and delete the now-unused function
perf_cgroup_sched_switch().
We must clear the percpu perf_cgroup cache when the last cgroup event
is disabled.
Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context switch code")
Signed-off-by: Chengming Zhou <zhouchengming@...edance.com>
---
kernel/events/core.c | 63 ++++++++++++++++----------------------------
1 file changed, 22 insertions(+), 41 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8b5cf2aedfe6..848a3bfa9513 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -826,6 +826,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
}
}
+static DEFINE_PER_CPU(struct perf_cgroup *, cpu_perf_cgroup);
static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
/*
@@ -833,6 +834,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
*/
static void perf_cgroup_switch(struct task_struct *task)
{
+ struct perf_cgroup *cgrp;
struct perf_cpu_context *cpuctx, *tmp;
struct list_head *list;
unsigned long flags;
@@ -843,11 +845,21 @@ static void perf_cgroup_switch(struct task_struct *task)
*/
local_irq_save(flags);
+ cgrp = perf_cgroup_from_task(task, NULL);
+ if (cgrp == __this_cpu_read(cpu_perf_cgroup))
+ goto out;
+
+ __this_cpu_write(cpu_perf_cgroup, cgrp);
+
list = this_cpu_ptr(&cgrp_cpuctx_list);
list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ if (cpuctx->cgrp == cgrp)
+ continue;
+
perf_pmu_disable(cpuctx->ctx.pmu);
cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -855,14 +867,11 @@ static void perf_cgroup_switch(struct task_struct *task)
* must not be done before ctxswout due
* to event_filter_match() in event_sched_out()
*/
- cpuctx->cgrp = perf_cgroup_from_task(task,
- &cpuctx->ctx);
+ cpuctx->cgrp = cgrp;
/*
* set cgrp before ctxsw in to allow
* event_filter_match() to not have to pass
* task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgroup events are only per-cpu
*/
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
@@ -870,35 +879,10 @@ static void perf_cgroup_switch(struct task_struct *task)
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
+out:
local_irq_restore(flags);
}
-static inline void perf_cgroup_sched_switch(struct task_struct *task,
- struct task_struct *next)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
-
- rcu_read_lock();
- /*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
- */
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(next, NULL);
-
- /*
- * only schedule out current cgroup events if we know
- * that we are switching to a different cgroup. Otherwise,
- * do no touch the cgroup events.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task);
-
- rcu_read_unlock();
-}
-
static int perf_cgroup_ensure_storage(struct perf_event *event,
struct cgroup_subsys_state *css)
{
@@ -1035,6 +1019,9 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
cpuctx->cgrp = NULL;
list_del(&cpuctx->cgrp_cpuctx_entry);
+
+ if (list_empty(per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)))
+ __this_cpu_write(cpu_perf_cgroup, NULL);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1062,11 +1049,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
{
}
-static inline void perf_cgroup_sched_switch(struct task_struct *task,
- struct task_struct *next)
-{
-}
-
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
@@ -1080,11 +1062,6 @@ perf_cgroup_set_timestamp(struct task_struct *task,
{
}
-static inline void
-perf_cgroup_sched_switch(struct task_struct *task, struct task_struct *next)
-{
-}
-
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
return 0;
@@ -1104,6 +1081,10 @@ static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
#endif
/*
@@ -3625,7 +3606,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
* cgroup event are system-wide mode only
*/
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_switch(task, next);
+ perf_cgroup_switch(next);
}
/*
--
2.20.1
Powered by blists - more mailing lists