Message-Id: <20230905171105.1005672-3-mathieu.desnoyers@efficios.com>
Date:   Tue,  5 Sep 2023 13:11:05 -0400
From:   Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To:     Peter Zijlstra <peterz@...radead.org>
Cc:     linux-kernel@...r.kernel.org,
        Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
        Ingo Molnar <mingo@...hat.com>,
        Valentin Schneider <vschneid@...hat.com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Juri Lelli <juri.lelli@...hat.com>,
        Swapnil Sapkal <Swapnil.Sapkal@....com>,
        Aaron Lu <aaron.lu@...el.com>,
        Julien Desfossez <jdesfossez@...italocean.com>, x86@...nel.org
Subject: [RFC PATCH 2/2] sched: Implement adaptive rate limiting of task migrations

Define a task migration quota (initially 10) within a 2ms window for
each task. Whenever a task reaches its quota, subsequent affine
wakeups keep the task on its previous CPU for the rest of the window.
When the migration quota has been entirely used within a window, the
available quota is halved for the next window, down to a minimum of 1
per window. When the available quota is not entirely used within a
window, it is replenished by adding 1 per window, up to the initial
maximum of 10.
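
As an illustration of the windowed quota adjustment described above,
here is a minimal standalone C sketch. It is not the kernel code in
the patch below; the structure, field and constant names are
hypothetical stand-ins for SCHED_MIGRATION_QUOTA,
SCHED_MIGRATION_QUOTA_REPLENISH and the 2ms window:

#include <stdbool.h>

#define MIGRATION_QUOTA			10		/* initial quota per window */
#define MIGRATION_QUOTA_REPLENISH	1		/* added back per window with unused quota */
#define MIGRATION_WINDOW_NS		2000000ULL	/* 2 ms */

struct migration_state {
	unsigned long long window_end;	/* end of the current window, in ns */
	int nr_migrations;		/* migrations used in the current window */
	int quota;			/* allowed migrations per window */
};

/* Return true if a migration is allowed at @now_ns, and account for it. */
static bool migration_allowed(struct migration_state *s, unsigned long long now_ns)
{
	if (now_ns >= s->window_end) {
		/* New window: halve the quota if the previous one was exhausted, else replenish. */
		if (s->nr_migrations >= s->quota)
			s->quota = s->quota > 1 ? s->quota / 2 : 1;
		else if (s->quota < MIGRATION_QUOTA)
			s->quota += MIGRATION_QUOTA_REPLENISH;
		s->window_end = now_ns + MIGRATION_WINDOW_NS;
		s->nr_migrations = 1;
		return true;
	}
	if (s->nr_migrations < s->quota) {
		s->nr_migrations++;
		return true;
	}
	/* Quota exhausted: stay on the previous CPU for the rest of the window. */
	return false;
}

The patch below splits this logic into a check (should_migrate_task())
and an accounting step (migrate_task_ratelimit_fair()), so the quota
bookkeeping only runs when a migration is actually performed.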

One advantage of this approach is that it leaves extra room for
sporadic migration bursts, while limiting the migration rate more
strictly as tasks repeatedly exhaust their quota.

Another advantage of this approach is that it reduces the number of
sched_clock calls on the wakeup path while tasks are below their quota.

The resulting benchmark numbers for the hackbench and perf bench sched
messaging workloads are similar to those achieved with the
non-adaptive approach.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Valentin Schneider <vschneid@...hat.com>
Cc: Steven Rostedt <rostedt@...dmis.org>
Cc: Ben Segall <bsegall@...gle.com>
Cc: Mel Gorman <mgorman@...e.de>
Cc: Daniel Bristot de Oliveira <bristot@...hat.com>
Cc: Vincent Guittot <vincent.guittot@...aro.org>
Cc: Juri Lelli <juri.lelli@...hat.com>
Cc: Swapnil Sapkal <Swapnil.Sapkal@....com>
Cc: Aaron Lu <aaron.lu@...el.com>
Cc: Julien Desfossez <jdesfossez@...italocean.com>
Cc: x86@...nel.org
---
 include/linux/sched.h |  2 ++
 kernel/sched/core.c   |  2 ++
 kernel/sched/fair.c   | 36 +++++++++++++++++++++++++++++++-----
 kernel/sched/sched.h  |  2 ++
 4 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1111d04255cc..2190e59ebc99 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -565,6 +565,8 @@ struct sched_entity {
 	u64				nr_migrations;
 
 	u64				next_migration_time;
+	int				nr_migrations_per_window;
+	int				quota_migrations_per_window;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	int				depth;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d294fce261d..834e37231c36 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4511,6 +4511,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.vlag			= 0;
 	p->se.slice			= sysctl_sched_base_slice;
 	p->se.next_migration_time	= 0;
+	p->se.nr_migrations_per_window	= 0;
+	p->se.quota_migrations_per_window = SCHED_MIGRATION_QUOTA;
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 24ac69913005..e2f4c30483a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -963,9 +963,11 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
 static bool should_migrate_task(struct task_struct *p, int prev_cpu)
 {
 	/* Rate limit task migration. */
-	if (sched_clock_cpu(prev_cpu) < p->se.next_migration_time)
-	       return false;
-	return true;
+	if (p->se.nr_migrations_per_window < p->se.quota_migrations_per_window)
+		return true;
+	if (sched_clock_cpu(prev_cpu) > p->se.next_migration_time)
+		return true;
+	return false;
 }
 
 /*
@@ -7946,6 +7948,31 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	return new_cpu;
 }
 
+static void migrate_task_ratelimit_fair(struct task_struct *p, int new_cpu)
+{
+	struct sched_entity *se = &p->se;
+	u64 now;
+
+	/* Rate limit task migration. */
+	now = sched_clock_cpu(task_cpu(p));
+	if (now < se->next_migration_time) {
+		se->nr_migrations_per_window++;
+		if (!sched_clock_stable())
+			se->next_migration_time += sched_clock_cpu(new_cpu) - now;
+	} else {
+		if (!sched_clock_stable())
+			now = sched_clock_cpu(new_cpu);
+		se->next_migration_time = now + SCHED_MIGRATION_RATELIMIT_WINDOW;
+		if (se->nr_migrations_per_window >= se->quota_migrations_per_window)
+			se->quota_migrations_per_window = max(1, se->quota_migrations_per_window >> 1);
+		else
+			se->quota_migrations_per_window =
+				min(SCHED_MIGRATION_QUOTA,
+				    se->quota_migrations_per_window + SCHED_MIGRATION_QUOTA_REPLENISH);
+		se->nr_migrations_per_window = 1;
+	}
+}
+
 /*
  * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -7955,8 +7982,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 {
 	struct sched_entity *se = &p->se;
 
-	/* Rate limit task migration. */
-	se->next_migration_time = sched_clock_cpu(new_cpu) + SCHED_MIGRATION_RATELIMIT_WINDOW;
+	migrate_task_ratelimit_fair(p, new_cpu);
 
 	if (!task_on_rq_migrating(p)) {
 		remove_entity_load_avg(se);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c9b1a5976761..04529661976c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -105,6 +105,8 @@ struct cpuidle_state;
 #define TASK_ON_RQ_MIGRATING	2
 
 #define SCHED_MIGRATION_RATELIMIT_WINDOW	2000000		/* 2 ms */
+#define SCHED_MIGRATION_QUOTA			10
+#define SCHED_MIGRATION_QUOTA_REPLENISH		1
 
 extern __read_mostly int scheduler_running;
 
-- 
2.39.2
