Message-ID: <20251110033232.12538-5-kernellwp@gmail.com>
Date: Mon, 10 Nov 2025 11:32:25 +0800
From: Wanpeng Li <kernellwp@...il.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Paolo Bonzini <pbonzini@...hat.com>,
Sean Christopherson <seanjc@...gle.com>
Cc: Steven Rostedt <rostedt@...dmis.org>,
Vincent Guittot <vincent.guittot@...aro.org>,
Juri Lelli <juri.lelli@...hat.com>,
linux-kernel@...r.kernel.org,
kvm@...r.kernel.org,
Wanpeng Li <wanpengli@...cent.com>
Subject: [PATCH 04/10] sched/fair: Add penalty calculation and application logic
From: Wanpeng Li <wanpengli@...cent.com>

Implement core penalty calculation and application mechanisms for
yield deboost operations.

Add yield_deboost_apply_debounce() for reverse-pair debouncing to
prevent ping-pong behavior: when A→B is followed by B→A within ~600us,
the penalty is downscaled.
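
As an illustration, with hypothetical timings: if task A yields to B at
t=0 and B yields back to A 400us later, the reverse pair falls inside
the debounce window and the second penalty is reduced to
max(need, gran); if the reverse yield happens 700us later instead, the
window has expired and the full penalty applies.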

Add yield_deboost_calculate_penalty() to compute the vruntime penalty
from the fairness gap (the vruntime delta between the yielding and
target tasks) plus the scheduling granularity, with a safety floor for
abnormally small granularity values and queue-size-based caps
(2 tasks: 6.0×gran, 3: 4.0×, 4-6: 2.5×, 7-8: 2.0×, 9-12: 1.5×,
>12: 1.0×). Positive gaps receive a 10% boost weighting; a zero gap
falls back to refined queue-size-based baseline multipliers.
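
A worked example with hypothetical numbers: assume the yielding
entity's gran works out to 3ms of vruntime and nr_queued = 4, so the
cap is 2.5×gran = 7.5ms. With a fairness gap need = 1ms, the weighted
gap is 1.1ms and the penalty is gran + 1.1ms = 4.1ms, below the cap;
with need = 10ms, gran + 11ms = 14ms exceeds the cap and is clamped to
7.5ms.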

Add yield_deboost_apply_penalty() to apply the penalty with overflow
protection and to update the EEVDF fields (deadline, vlag) and
cfs_rq min_vruntime.

The penalty is tuned to provide a meaningful preference while avoiding
starvation, scales with queue depth, and prevents oscillation through
debouncing. These static functions will be integrated in the next
patch.
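
Continuing the hypothetical numbers above, applying a 4.1ms penalty
moves the yielding entity's vruntime forward by 4.1ms, recomputes its
deadline as the new vruntime plus its slice scaled by calc_delta_fair(),
and recomputes vlag against avg_vruntime() of the common cfs_rq, so the
EEVDF pick order deprioritizes the yielding entity without starving it.
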
Signed-off-by: Wanpeng Li <wanpengli@...cent.com>
---
kernel/sched/fair.c | 153 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 153 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 740c002b8f1c..4bad324f3662 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9118,6 +9118,159 @@ static bool __maybe_unused yield_deboost_find_lca(struct sched_entity *se_y, str
return true;
}
+/*
+ * Apply debounce for reverse pair within ~600us to reduce ping-pong.
+ * Caps the penalty at max(need, gran) when the previous pair was target->source,
+ * and updates per-rq debounce tracking fields to avoid cross-CPU races.
+ */
+static u64 yield_deboost_apply_debounce(struct rq *rq, struct sched_entity *se_t,
+ u64 penalty, u64 need, u64 gran)
+{
+ u64 now_ns = rq->clock;
+ struct task_struct *p_yielding = rq->curr;
+ struct task_struct *p_target = task_of(se_t);
+
+ if (p_yielding && p_target) {
+ pid_t src_pid = p_yielding->pid;
+ pid_t dst_pid = p_target->pid;
+ pid_t last_src = rq->yield_deboost_last_src_pid;
+ pid_t last_dst = rq->yield_deboost_last_dst_pid;
+ u64 last_ns = rq->yield_deboost_last_pair_time_ns;
+
+ if (last_src == dst_pid && last_dst == src_pid &&
+ (now_ns - last_ns) <= (600ULL * NSEC_PER_USEC)) {
+ u64 alt = need;
+ if (alt < gran)
+ alt = gran;
+ if (penalty > alt)
+ penalty = alt;
+ }
+
+ /* Update per-rq tracking */
+ rq->yield_deboost_last_src_pid = src_pid;
+ rq->yield_deboost_last_dst_pid = dst_pid;
+ rq->yield_deboost_last_pair_time_ns = now_ns;
+ }
+
+ return penalty;
+}
+
+/*
+ * Calculate penalty with debounce logic for EEVDF yield deboost.
+ * Computes vruntime penalty based on fairness gap (need) plus granularity,
+ * applies queue-size-based caps to prevent excessive penalties in small queues,
+ * and implements reverse-pair debounce (~600us) to reduce ping-pong effects.
+ * Returns 0 if no penalty needed, otherwise returns clamped penalty value.
+ */
+static u64 __maybe_unused yield_deboost_calculate_penalty(struct rq *rq, struct sched_entity *se_y_lca,
+ struct sched_entity *se_t_lca, struct sched_entity *se_t,
+ int nr_queued)
+{
+ u64 gran, need, penalty, maxp;
+ u64 gran_floor;
+ u64 weighted_need, base;
+
+ gran = calc_delta_fair(sysctl_sched_base_slice, se_y_lca);
+ /* Low-bound safeguard for gran when slice is abnormally small */
+ gran_floor = calc_delta_fair(sysctl_sched_base_slice >> 1, se_y_lca);
+ if (gran < gran_floor)
+ gran = gran_floor;
+
+ need = 0;
+ if (se_t_lca->vruntime > se_y_lca->vruntime)
+ need = se_t_lca->vruntime - se_y_lca->vruntime;
+
+ /* Apply 10% boost to need when positive (weighted_need = need * 1.10) */
+ penalty = gran;
+ if (need) {
+ /* weighted_need = need + 10% */
+ weighted_need = need + need / 10;
+ /* clamp to avoid overflow when adding to gran (still capped later) */
+ if (weighted_need > U64_MAX - penalty)
+ weighted_need = U64_MAX - penalty;
+ penalty += weighted_need;
+ }
+
+ /* Apply debounce via helper to avoid ping-pong */
+ penalty = yield_deboost_apply_debounce(rq, se_t, penalty, need, gran);
+
+ /* Upper bound (cap): slightly more aggressive for mid-size queues */
+ if (nr_queued == 2)
+ maxp = gran * 6; /* Strongest push for 2-task ping-pong */
+ else if (nr_queued == 3)
+ maxp = gran * 4; /* 4.0 * gran */
+ else if (nr_queued <= 6)
+ maxp = (gran * 5) / 2; /* 2.5 * gran */
+ else if (nr_queued <= 8)
+ maxp = gran * 2; /* 2.0 * gran */
+ else if (nr_queued <= 12)
+ maxp = (gran * 3) / 2; /* 1.5 * gran */
+ else
+ maxp = gran; /* 1.0 * gran */
+
+ if (penalty < gran)
+ penalty = gran;
+ if (penalty > maxp)
+ penalty = maxp;
+
+ /* If no need, apply refined baseline push (low risk + mid risk combined). */
+ if (need == 0) {
+ /*
+ * Baseline multiplier for need==0:
+ * 2 -> 1.00 * gran
+ * 3 -> 0.9375 * gran
+ * 4–6 -> 0.625 * gran
+ * 7–8 -> 0.50 * gran
+ * 9–12 -> 0.375 * gran
+ * >12 -> 0.25 * gran
+ */
+ base = gran;
+ if (nr_queued == 3)
+ base = (gran * 15) / 16; /* 0.9375 */
+ else if (nr_queued >= 4 && nr_queued <= 6)
+ base = (gran * 5) / 8; /* 0.625 */
+ else if (nr_queued >= 7 && nr_queued <= 8)
+ base = gran / 2; /* 0.5 */
+ else if (nr_queued >= 9 && nr_queued <= 12)
+ base = (gran * 3) / 8; /* 0.375 */
+ else if (nr_queued > 12)
+ base = gran / 4; /* 0.25 */
+
+ if (penalty < base)
+ penalty = base;
+ }
+
+ return penalty;
+}
+
+/*
+ * Apply penalty and update EEVDF fields for scheduler consistency.
+ * Safely applies vruntime penalty with overflow protection, then updates
+ * EEVDF-specific fields (deadline, vlag) and cfs_rq min_vruntime to maintain
+ * scheduler state consistency. Returns early without modifying the
+ * entity if the penalty cannot be safely applied.
+ */
+static void __maybe_unused yield_deboost_apply_penalty(struct rq *rq, struct sched_entity *se_y_lca,
+ struct cfs_rq *cfs_rq_common, u64 penalty)
+{
+ u64 new_vruntime;
+
+ /* Overflow protection */
+ if (se_y_lca->vruntime > (U64_MAX - penalty))
+ return;
+
+ new_vruntime = se_y_lca->vruntime + penalty;
+
+ /* Validity check */
+ if (new_vruntime <= se_y_lca->vruntime)
+ return;
+
+ se_y_lca->vruntime = new_vruntime;
+ se_y_lca->deadline = se_y_lca->vruntime + calc_delta_fair(se_y_lca->slice, se_y_lca);
+ se_y_lca->vlag = avg_vruntime(cfs_rq_common) - se_y_lca->vruntime;
+ update_min_vruntime(cfs_rq_common);
+}
+
/*
* sched_yield() is very simple
*/
--
2.43.0