Message-ID: <20250929114225.36172-3-gmonaco@redhat.com>
Date: Mon, 29 Sep 2025 13:42:23 +0200
From: Gabriele Monaco <gmonaco@...hat.com>
To: linux-kernel@...r.kernel.org,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>,
Andrew Morton <akpm@...ux-foundation.org>,
David Hildenbrand <david@...hat.com>,
linux-mm@...ck.org
Cc: Gabriele Monaco <gmonaco@...hat.com>
Subject: [PATCH v3 2/4] rseq: Schedule the mm_cid_compaction from rseq_sched_switch_event()
Currently the mm_cid_compaction is triggered by the scheduler tick and
runs in a task_work. This behaviour is unpredictable for periodic tasks
with a short runtime, which may rarely be running when the tick fires.
Schedule the mm_cid_compaction from the rseq_sched_switch_event() call,
but only when a scan is required, that is once the pseudo-period of
100ms has elapsed.
Keep a tick handler for long-running tasks that are never preempted
(i.e. that never call rseq_sched_switch_event()), which triggers a
compaction and an mm_cid update only in that case.
Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
---
include/linux/mm_types.h | 11 +++++++++
include/linux/rseq.h | 3 +++
include/linux/sched.h | 3 +++
kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++------
kernel/sched/sched.h | 2 ++
5 files changed, 60 insertions(+), 7 deletions(-)
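
For context, a condensed sketch of the two trigger paths after this
patch (summary only, not additional code; the names are those used in
the hunks below):

	/* Context-switch path, rseq_sched_switch_event() in include/linux/rseq.h */
	if (mm_cid_needs_scan(t->mm))		/* ~100ms since the last scan */
		task_add_mm_cid(t);		/* queue the compaction task_work */

	/*
	 * Tick path, task_tick_mm_cid(): only reached when the task ran
	 * unpreempted for at least RSEQ_UNPREEMPTED_THRESHOLD (100ms).
	 */
	if (mm_cid_needs_scan(curr->mm))
		task_add_mm_cid(curr);		/* A: trigger the recompaction */
	else if (time_after(jiffies, curr->last_cid_reset +
			    msecs_to_jiffies(MM_CID_SCAN_DELAY)))
		update_own_mm_cid(curr);	/* B: refresh curr->mm_cid, notify via rseq */

update_own_mm_cid() is not a real helper here, it stands for the inline
mm_cid_put_lazy()/mm_cid_get()/rseq_sched_switch_event() sequence in the
last kernel/sched/core.c hunk.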
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 08bc2442db93..5dab88707014 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1424,6 +1424,13 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
raw_spin_unlock(&mm->cpus_allowed_lock);
}
+
+static inline bool mm_cid_needs_scan(struct mm_struct *mm)
+{
+ if (!mm)
+ return false;
+ return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
+}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
@@ -1434,6 +1441,10 @@ static inline unsigned int mm_cid_size(void)
return 0;
}
static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
+static inline bool mm_cid_needs_scan(struct mm_struct *mm)
+{
+ return false;
+}
#endif /* CONFIG_SCHED_MM_CID */
struct mmu_gather;
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index b8ea95011ec3..12eecde46ff5 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -4,6 +4,7 @@
#ifdef CONFIG_RSEQ
#include <linux/sched.h>
+#include <linux/mm_types.h>
void __rseq_handle_slowpath(struct pt_regs *regs);
@@ -68,6 +69,8 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t)
rseq_raise_notify_resume(t);
}
}
+ if (mm_cid_needs_scan(t->mm))
+ task_add_mm_cid(t);
}
/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 857ed17d443b..80c1afb2087d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1407,6 +1407,7 @@ struct task_struct {
int last_mm_cid; /* Most recent cid in mm */
int migrate_from_cpu;
int mm_cid_active; /* Whether cid bitmap is active */
+ unsigned long last_cid_reset; /* Time of last reset in jiffies */
struct callback_head cid_work;
#endif
@@ -2300,6 +2301,7 @@ void sched_mm_cid_before_execve(struct task_struct *t);
void sched_mm_cid_after_execve(struct task_struct *t);
void sched_mm_cid_fork(struct task_struct *t);
void sched_mm_cid_exit_signals(struct task_struct *t);
+void task_add_mm_cid(struct task_struct *t);
static inline int task_mm_cid(struct task_struct *t)
{
return t->mm_cid;
@@ -2309,6 +2311,7 @@ static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
static inline void sched_mm_cid_fork(struct task_struct *t) { }
static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline void task_add_mm_cid(struct task_struct *t) { }
static inline int task_mm_cid(struct task_struct *t)
{
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e742a655c9a8..30652bb4a223 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10840,19 +10840,53 @@ void init_sched_mm_cid(struct task_struct *t)
init_task_work(&t->cid_work, task_mm_cid_work);
}
+void task_add_mm_cid(struct task_struct *t)
+{
+ struct callback_head *work = &t->cid_work;
+
+ if (work->next != work)
+ return;
+ /* No page allocation under rq lock */
+ task_work_add(t, work, TWA_RESUME);
+}
+
void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
{
- struct callback_head *work = &curr->cid_work;
- unsigned long now = jiffies;
+ u64 rtime = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
+ /*
+ * If a task is running unpreempted for a long time, it won't get its
+ * mm_cid compacted and won't update its mm_cid value after a
+ * compaction occurs.
+ * For such a task, this function does two things:
+ * A) trigger the mm_cid recompaction,
+ * B) trigger an update of the task's rseq->mm_cid field at some point
+ * after recompaction, so it can get a mm_cid value closer to 0.
+ * A change in the mm_cid triggers an rseq_preempt.
+ *
+ * B occurs once after the compaction work completes; neither A nor B
+ * runs as long as the compaction work is pending, the task is
+ * exiting, or is not a userspace task.
+ */
if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
- work->next != work)
+ test_tsk_thread_flag(curr, TIF_NOTIFY_RESUME))
return;
- if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
+ if (rtime < RSEQ_UNPREEMPTED_THRESHOLD)
return;
-
- /* No page allocation under rq lock */
- task_work_add(curr, work, TWA_RESUME);
+ if (mm_cid_needs_scan(curr->mm)) {
+ /* Trigger mm_cid recompaction */
+ task_add_mm_cid(curr);
+ } else if (time_after(jiffies, curr->last_cid_reset +
+ msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
+ /* Update mm_cid field */
+ if (!curr->mm_cid_active)
+ return;
+ mm_cid_snapshot_time(rq, curr->mm);
+ mm_cid_put_lazy(curr);
+ curr->last_mm_cid = curr->mm_cid = mm_cid_get(rq, curr, curr->mm);
+ rseq_sched_set_task_mm_cid(curr, curr->mm_cid);
+ rseq_sched_switch_event(curr);
+ }
}
void sched_mm_cid_exit_signals(struct task_struct *t)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8f14d231e7a7..8c0fb3b0fb35 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3512,6 +3512,7 @@ extern const char *preempt_modes[];
#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
#define MM_CID_SCAN_DELAY 100 /* 100ms */
+#define RSEQ_UNPREEMPTED_THRESHOLD SCHED_MM_CID_PERIOD_NS
extern raw_spinlock_t cid_lock;
extern int use_cid_lock;
@@ -3715,6 +3716,7 @@ static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
int cid;
lockdep_assert_rq_held(rq);
+ t->last_cid_reset = jiffies;
cpumask = mm_cidmask(mm);
cid = __this_cpu_read(pcpu_cid->cid);
if (mm_cid_is_valid(cid)) {
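
Not part of the patch, just for reference: a minimal userspace sketch of
how a thread consumes the concurrency ID that this compaction keeps
close to 0. It assumes glibc's automatic rseq registration (the
__rseq_offset symbol from <sys/rseq.h>, glibc >= 2.35), headers in which
struct rseq already exposes the mm_cid field (v6.3+ UAPI), and a
toolchain providing __builtin_thread_pointer(); real users would read
mm_cid inside an rseq critical section.

	#include <sys/rseq.h>	/* struct rseq, __rseq_offset */

	/* Hypothetical per-mm-CID data, sized by the CPUs the process may use */
	struct cid_pool { void *free_list; };
	static struct cid_pool pools[64];

	static inline struct rseq *rseq_self(void)
	{
		/* glibc places the registered rseq area at __rseq_offset from TP */
		return (struct rseq *)((char *)__builtin_thread_pointer() +
				       __rseq_offset);
	}

	static inline struct cid_pool *current_pool(void)
	{
		/*
		 * mm_cid is a compact per-mm index; the compaction above is
		 * what keeps it small, so pools[] stays densely used.
		 */
		return &pools[rseq_self()->mm_cid];
	}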
--
2.51.0