Message-ID: <20250422-b4-eevdf-tests-v1-post-v1-3-d005d1c7cf61@gianis.ca>
Date: Wed, 23 Apr 2025 00:11:15 +0000
From: "Dhaval Giani (AMD)" <dhaval@...nis.ca>
Cc: linux-kernel@...r.kernel.org, Dhaval Giani <dhaval.giani@....com>, Gautham Shenoy <gautham.shenoy@....com>, K Prateek Nayak <kprateek.nayak@....com>, "Dhaval Giani (AMD)" <dhaval@...nis.ca>
Subject: [PATCH 3/3] sched/fair: Test that the average lag across the system is zero

Lemma 2 of the EEVDF paper states that the sum of the lags of all
the tasks in the system is zero.

The test here checks a slightly narrower property: the sum of lags
across each runqueue is zero.

The Linux EEVDF implementation doesn't track a global vruntime.
Instead it tracks the "zero-lag" vruntime, which can be obtained by
calling avg_vruntime(cfs_rq).

Walk through every CFS runqueue (per-CPU as well as per-cgroup) and
add up the vruntimes. The average should match avg_vruntime().
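
Concretely, ignoring entity weights for illustration (avg_vruntime()
itself computes a load-weighted average): with V = avg_vruntime(cfs_rq)
and v_i the per-entity vruntimes,

	sum_i (V - v_i) = 0  <=>  V = (sum_i v_i) / N

so comparing the computed mean of the v_i against avg_vruntime() is
the same as checking that the lags sum to zero.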

Signed-off-by: Dhaval Giani (AMD) <dhaval@...nis.ca>
---
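To run the test (the path assumes the eevdf_testing debugfs directory
created earlier in this series sits under the sched debugfs dir):

	# echo 1 > /sys/kernel/debug/sched/eevdf_testing/eevdf_zero_lag_test

The PASS/FAIL verdict is emitted with trace_printk(), so it shows up
in /sys/kernel/debug/tracing/trace.
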
 include/linux/sched.h      |   7 ++
 kernel/sched/eevdf-tests.c | 185 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c        |   3 +
 3 files changed, 195 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f96ac198289349199b9c671240a20fc7826228ad..72788d51912657919adfad7f451983e80be4fa39 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -610,6 +610,13 @@ struct sched_entity {
 	 */
 	struct sched_avg		avg;
 #endif
+#ifdef CONFIG_SCHED_EEVDF_TESTING
+	/*
+	 * List element so that the EEVDF unit test can walk
+	 * task groups iteratively instead of recursing.
+	 */
+	struct list_head tg_entry;
+#endif
 };
 
 struct sched_rt_entity {
diff --git a/kernel/sched/eevdf-tests.c b/kernel/sched/eevdf-tests.c
index 8532330769bcc93dbf9cd98ebba75c838f62c045..c343e94b9a44ad32d01a0eedcb61c1bbf5fdbaf6 100644
--- a/kernel/sched/eevdf-tests.c
+++ b/kernel/sched/eevdf-tests.c
@@ -24,6 +24,35 @@
 bool eevdf_positive_lag_test;
 u8 eevdf_positive_lag_count = 10;
 
+static int test_total_zero_lag(void *);
+static void launch_test_zero_lag(void);
+
+static int eevdf_zero_lag_show(struct seq_file *m, void *v)
+{
+	return 0;
+}
+
+static int eevdf_zero_lag_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, eevdf_zero_lag_show, NULL);
+}
+
+static ssize_t eevdf_zero_lag_write(struct file *filp, const char __user *ubuf,
+				   size_t cnt, loff_t *ppos)
+{
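+	/* Any write to this debugfs file kicks off the zero-lag test. */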
+	launch_test_zero_lag();
+	return cnt;
+}
+
+static const struct file_operations eevdf_zero_lag_fops = {
+	.open		= eevdf_zero_lag_open,
+	.write		= eevdf_zero_lag_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static struct dentry *debugfs_eevdf_testing;
 void debugfs_eevdf_testing_init(struct dentry *debugfs_sched)
 {
@@ -33,6 +62,8 @@ void debugfs_eevdf_testing_init(struct dentry *debugfs_sched)
 				debugfs_eevdf_testing, &eevdf_positive_lag_test);
 	debugfs_create_u8("eevdf_positive_lag_test_count", 0600,
 				debugfs_eevdf_testing, &eevdf_positive_lag_count);
+	debugfs_create_file("eevdf_zero_lag_test", 0600, debugfs_eevdf_testing,
+				NULL, &eevdf_zero_lag_fops);
 
 }
 
@@ -65,4 +96,158 @@ void test_eevdf_positive_lag(struct cfs_rq *cfs, struct sched_entity *se)
 	}
 }
 
+/*
+ * Duplicated from fair.c, where this helper is file-local.
+ */
+#define __node_2_se(node) \
+	rb_entry((node), struct sched_entity, run_node)
+
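+/*
+ * Check a single cfs_rq: the mean of the queued entities' vruntimes
+ * should match avg_vruntime(). Group entities encountered along the
+ * way are appended to @tg_se so the caller can drain them iteratively.
+ */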
+static bool test_eevdf_cfs_rq_zero_lag(struct cfs_rq *cfs, struct list_head *tg_se)
+{
+	u64 cfs_avg_vruntime;
+	u64 calculated_avg_vruntime;
+
+	u64 total_vruntime = 0;
+	u64 nr_tasks = 0;
+
+	struct sched_entity *se;
+	struct rb_node *node;
+	struct rb_root *root;
+
+	cfs_avg_vruntime = avg_vruntime(cfs);
+
+	/*
+	 * Walk the rbtree, adding up each entity's vruntime.
+	 */
+
+	total_vruntime = 0;
+	nr_tasks = 0;
+
+	root = &cfs->tasks_timeline.rb_root;
+
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		se = __node_2_se(node);
+		WARN_ON_ONCE(__builtin_add_overflow(total_vruntime,
+					se->vruntime, &total_vruntime));
+		/*
+		 * if it is a task group, add to a list to look at later
+		 */
+		if (!entity_is_task(se))
+			list_add_tail(&se->tg_entry, tg_se);
+		nr_tasks++;
+	}
+
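+	/*
+	 * The currently running entity is not kept in the rbtree,
+	 * so account for it separately.
+	 */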
+	if (cfs->curr) {
+		WARN_ON_ONCE(__builtin_add_overflow(total_vruntime,
+					cfs->curr->vruntime, &total_vruntime));
+		nr_tasks++;
+	}
+
+	/* If there are no tasks, there is no lag :-) */
+	if (!nr_tasks)
+		return true;
+
+	calculated_avg_vruntime = total_vruntime / nr_tasks;
+
+	return (calculated_avg_vruntime == cfs_avg_vruntime);
+}
+
+/*
+ * Call with rq lock held
+ *
+ * return false on failure
+ */
+static bool test_eevdf_zero_lag(struct cfs_rq *cfs)
+{
+	LIST_HEAD(tg_se);
+	struct list_head *se_entry;
+
+	/*
+	 * Test the base CFS runqueue first; this also starts
+	 * populating the tg_se list with any group entities found.
+	 *
+	 * If it fails, short-circuit and return failure.
+	 */
+
+	if (!test_eevdf_cfs_rq_zero_lag(cfs, &tg_se))
+		return false;
+
+	/*
+	 * We made it here, so walk the list. Since it is set up as
+	 * a queue, new task groups get appended to the tail as we
+	 * keep calling the rq test. Once the list is drained, if we
+	 * haven't failed, we return true.
+	 */
+
+	list_for_each(se_entry, &tg_se) {
+		struct sched_entity *se = list_entry(se_entry, struct sched_entity, tg_entry);
+
+		if (!test_eevdf_cfs_rq_zero_lag(group_cfs_rq(se), &tg_se))
+			return false;
+	}
+
+	/*
+	 * WOOT! We succeeded!
+	 */
+	return true;
+}
+
+/*
+ * The average vruntime of the entire cfs_rq should be equal
+ * to the avg_vruntime(cfs_rq)
+ */
+static int test_total_zero_lag(void *data)
+{
+	int cpu;
+	struct rq *rq;
+	struct cfs_rq *cfs;
+	bool success = false;
+
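+	/*
+	 * The invariant is per runqueue, so each CPU can be checked
+	 * independently under its own rq lock.
+	 */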
+	for_each_online_cpu(cpu) {
+
+		rq = cpu_rq(cpu);
+		guard(rq_lock_irq)(rq);
+
+		cfs = &rq->cfs;
+
+		success = test_eevdf_zero_lag(cfs);
+
+		if (!success)
+			break;
+	}
+	if (!success) {
+		trace_printk("FAILED: tracked average vruntime doesn't match calculated average vruntime\n");
+		return -1;
+	}
+	trace_printk("PASS: Tracked average runtime matches calculated average vruntime\n");
+	return 0;
+}
+
+static void launch_test_zero_lag(void)
+{
+	struct task_struct *kt;
+
+	kt = kthread_create(test_total_zero_lag, NULL, "eevdf-tester-%d",
+					raw_smp_processor_id());
+	if (IS_ERR(kt)) {
+		trace_printk("Failed to launch kthread\n");
+		return;
+	}
+
+	wake_up_process(kt);
+}
+
 #endif /* CONFIG_SCHED_EEVDF_TESTING */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 924d9d35c2aa937bc0f4ca9565ba774397b90f77..858c4e1b8fac661996d879a8dcab2776db09d1c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13473,6 +13473,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	/* guarantee group entities always have weight */
 	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
+#ifdef CONFIG_SCHED_EEVDF_TESTING
+	INIT_LIST_HEAD(&se->tg_entry);
+#endif
 }
 
 static DEFINE_MUTEX(shares_mutex);

-- 
2.49.0


