Message-ID: <5618d029-769a-4690-a581-2df8939f26a9@samsung.com>
Date: Thu, 29 Aug 2024 00:38:00 +0200
From: Marek Szyprowski <m.szyprowski@...sung.com>
To: Peter Zijlstra <peterz@...radead.org>, mingo@...hat.com,
juri.lelli@...hat.com, vincent.guittot@...aro.org, dietmar.eggemann@....com,
rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de,
vschneid@...hat.com, linux-kernel@...r.kernel.org
Cc: kprateek.nayak@....com, wuyun.abel@...edance.com,
youssefesmat@...omium.org, tglx@...utronix.de, efault@....de
Subject: Re: [PATCH 17/24] sched/fair: Implement delayed dequeue
On 27.07.2024 12:27, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
>
> OTOH it should also not be instantly discarded; doing so would allow a
> task to game the system by purposefully (micro-)sleeping at the end of
> its time quantum.
>
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient; notably, competition is required for
> any of this to make sense.
>
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
>
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition; instead, carry them around
> until they get picked again and dequeue them at that point.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
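
For anyone following along: as far as I can tell, "eligible" above is
the EEVDF notion, i.e. the entity's lag against the load-weighted
average vruntime of the queue is non-negative. A minimal standalone
sketch of that predicate as I read it (my simplification, not the
kernel code; entity_eligible() in kernel/sched/fair.c computes this in
a weighted, overflow-safe form):

#include <stdbool.h>
#include <stdint.h>

/*
 * Simplified model: V is the load-weighted average vruntime of the
 * runqueue and an entity with vruntime v has lag = V - v. With
 * DELAY_DEQUEUE, a sleeping entity whose lag is still negative stays
 * enqueued and keeps competing until that lag is burned off.
 */
bool entity_eligible_sketch(int64_t avg_vruntime, int64_t vruntime)
{
        /* lag >= 0  <=>  v <= V */
        return vruntime <= avg_vruntime;
}
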
This patch landed recently in linux-next as commit 152e11f6df29
("sched/fair: Implement delayed dequeue"). In my tests on some 32-bit
ARM boards it causes a regression in the behavior of the rtcwake tool:
from time to time this simple call never returns:

# time rtcwake -s 10 -m on

Reverting this commit (together with its compile dependencies) on top
of linux-next fixes the problem. Let me know how I can help debug this.
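
For reference, the reproducer boils down to roughly the following (a
simplified sketch of what util-linux rtcwake does in '-m on' mode, from
my reading of its source; error handling omitted): it programs the RTC
alarm and then blocks in read() on the RTC device until the alarm
interrupt fires, so the hang would be consistent with that blocking
read() never being woken up.

#include <fcntl.h>
#include <linux/rtc.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Roughly 'rtcwake -s 10 -m on': arm the RTC alarm, then wait for it. */
int main(void)
{
        struct rtc_time now;
        struct rtc_wkalrm alarm = { .enabled = 1 };
        unsigned long data;
        int fd = open("/dev/rtc0", O_RDONLY);

        ioctl(fd, RTC_RD_TIME, &now);
        alarm.time = now;
        alarm.time.tm_sec += 10;        /* the real tool normalizes this */
        ioctl(fd, RTC_WKALM_SET, &alarm);

        /* '-m on': stay running and block until the alarm fires */
        read(fd, &data, sizeof(data));

        close(fd);
        return 0;
}
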
> ---
>  kernel/sched/deadline.c |    1
>  kernel/sched/fair.c     |   82 ++++++++++++++++++++++++++++++++++++++++++------
>  kernel/sched/features.h |    9 +++++
>  3 files changed, 81 insertions(+), 11 deletions(-)
>
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2428,7 +2428,6 @@ static struct task_struct *__pick_next_t
>          else
>                  p = dl_se->server_pick_next(dl_se);
>          if (!p) {
> -                WARN_ON_ONCE(1);
>                  dl_se->dl_yielded = 1;
>                  update_curr_dl_se(rq, dl_se, 0);
>                  goto again;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5379,20 +5379,44 @@ static void clear_buddies(struct cfs_rq
>
>  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
>
> -static void
> +static bool
>  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  {
> -        int action = UPDATE_TG;
> +        if (flags & DEQUEUE_DELAYED) {
> +                /*
> +                 * DEQUEUE_DELAYED is typically called from pick_next_entity()
> +                 * at which point we've already done update_curr() and do not
> +                 * want to do so again.
> +                 */
> +                SCHED_WARN_ON(!se->sched_delayed);
> +                se->sched_delayed = 0;
> +        } else {
> +                bool sleep = flags & DEQUEUE_SLEEP;
> +
> +                /*
> +                 * DELAY_DEQUEUE relies on spurious wakeups; special task
> +                 * states must not suffer spurious wakeups, so exempt them.
> +                 */
> +                if (flags & DEQUEUE_SPECIAL)
> +                        sleep = false;
> +
> +                SCHED_WARN_ON(sleep && se->sched_delayed);
> +                update_curr(cfs_rq);
>
> +                if (sched_feat(DELAY_DEQUEUE) && sleep &&
> +                    !entity_eligible(cfs_rq, se)) {
> +                        if (cfs_rq->next == se)
> +                                cfs_rq->next = NULL;
> +                        se->sched_delayed = 1;
> +                        return false;
> +                }
> +        }
> +
> +        int action = UPDATE_TG;
>          if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
>                  action |= DO_DETACH;
>
>          /*
> -         * Update run-time statistics of the 'current'.
> -         */
> -        update_curr(cfs_rq);
> -
> -        /*
>           * When dequeuing a sched_entity, we must:
>           * - Update loads to have both entity and cfs_rq synced with now.
>           * - For group_entity, update its runnable_weight to reflect the new
> @@ -5430,6 +5454,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
>
>          if (cfs_rq->nr_running == 0)
>                  update_idle_cfs_rq_clock_pelt(cfs_rq);
> +
> +        return true;
>  }
>
>  static void
> @@ -5828,11 +5854,21 @@ static bool throttle_cfs_rq(struct cfs_r
>          idle_task_delta = cfs_rq->idle_h_nr_running;
>          for_each_sched_entity(se) {
>                  struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> +                int flags;
> +
>                  /* throttled entity or throttle-on-deactivate */
>                  if (!se->on_rq)
>                          goto done;
>
> -                dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
> +                /*
> +                 * Abuse SPECIAL to avoid delayed dequeue in this instance.
> +                 * This avoids teaching dequeue_entities() about throttled
> +                 * entities and keeps things relatively simple.
> +                 */
> +                flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
> +                if (se->sched_delayed)
> +                        flags |= DEQUEUE_DELAYED;
> +                dequeue_entity(qcfs_rq, se, flags);
>
>                  if (cfs_rq_is_idle(group_cfs_rq(se)))
>                          idle_task_delta = cfs_rq->h_nr_running;
> @@ -6918,6 +6954,7 @@ static int dequeue_entities(struct rq *r
>          bool was_sched_idle = sched_idle_rq(rq);
>          int rq_h_nr_running = rq->cfs.h_nr_running;
>          bool task_sleep = flags & DEQUEUE_SLEEP;
> +        bool task_delayed = flags & DEQUEUE_DELAYED;
>          struct task_struct *p = NULL;
>          int idle_h_nr_running = 0;
>          int h_nr_running = 0;
> @@ -6931,7 +6968,13 @@ static int dequeue_entities(struct rq *r
>
>          for_each_sched_entity(se) {
>                  cfs_rq = cfs_rq_of(se);
> -                dequeue_entity(cfs_rq, se, flags);
> +
> +                if (!dequeue_entity(cfs_rq, se, flags)) {
> +                        if (p && &p->se == se)
> +                                return -1;
> +
> +                        break;
> +                }
>
>                  cfs_rq->h_nr_running -= h_nr_running;
>                  cfs_rq->idle_h_nr_running -= idle_h_nr_running;
> @@ -6956,6 +6999,7 @@ static int dequeue_entities(struct rq *r
>                          break;
>                  }
>                  flags |= DEQUEUE_SLEEP;
> +                flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
>          }
>
>          for_each_sched_entity(se) {
> @@ -6985,6 +7029,17 @@ static int dequeue_entities(struct rq *r
>          if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
>                  rq->next_balance = jiffies;
>
> +        if (p && task_delayed) {
> +                SCHED_WARN_ON(!task_sleep);
> +                SCHED_WARN_ON(p->on_rq != 1);
> +
> +                /* Fix-up what dequeue_task_fair() skipped */
> +                hrtick_update(rq);
> +
> +                /* Fix-up what block_task() skipped. */
> +                __block_task(rq, p);
> +        }
> +
>          return 1;
>  }
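
To make sure I understand the new control flow here, this is my
condensed reading of it as a toy model (stand-in types and names, not
the kernel code; 'eligible' stands for entity_eligible(), and the
DEQUEUE_SPECIAL escape hatch is omitted):

#include <stdbool.h>

struct entity {
        bool on_rq;             /* still enqueued on the runqueue */
        bool sched_delayed;     /* dequeue was deferred */
        bool eligible;          /* lag >= 0, per entity_eligible() */
};

/* Toy model of dequeue_entity(): false means "dequeue deferred". */
static bool dequeue_entity_model(struct entity *se, bool sleep, bool delayed)
{
        if (delayed) {                  /* DEQUEUE_DELAYED: finish it now */
                se->sched_delayed = false;
        } else if (sleep && !se->eligible) {
                se->sched_delayed = true;       /* stay enqueued, burn off lag */
                return false;                   /* caller skips the accounting */
        }
        se->on_rq = false;              /* the real dequeue */
        return true;
}

If that reading is right, a task that goes to sleep while ineligible
stays on the runqueue with sched_delayed set until it is picked again,
so a wakeup arriving in that window has to be handled as a requeue of
an already-queued task. If it helps narrow things down, I can retest
with NO_DELAY_DEQUEUE written to /sys/kernel/debug/sched/features.
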
>  /*
> @@ -6996,8 +7051,10 @@ static bool dequeue_task_fair(struct rq
>  {
>          util_est_dequeue(&rq->cfs, p);
>
> -        if (dequeue_entities(rq, &p->se, flags) < 0)
> +        if (dequeue_entities(rq, &p->se, flags) < 0) {
> +                util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
>                  return false;
> +        }
>
>          util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
>          hrtick_update(rq);
> @@ -12973,6 +13030,11 @@ static void set_next_task_fair(struct rq
>                  /* ensure bandwidth has been allocated on our new cfs_rq */
>                  account_cfs_rq_runtime(cfs_rq, 0);
>          }
> +
> +        if (!first)
> +                return;
> +
> +        SCHED_WARN_ON(se->sched_delayed);
>  }
>
>  void init_cfs_rq(struct cfs_rq *cfs_rq)
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -29,6 +29,15 @@ SCHED_FEAT(NEXT_BUDDY, false)
>  SCHED_FEAT(CACHE_HOT_BUDDY, true)
>
>  /*
> + * Delay dequeueing tasks until they get selected or woken.
> + *
> + * By delaying the dequeue for non-eligible tasks, they remain in the
> + * competition and can burn off their negative lag. When they get selected
> + * they'll have positive lag by definition.
> + */
> +SCHED_FEAT(DELAY_DEQUEUE, true)
> +
> +/*
>   * Allow wakeup-time preemption of the current task:
>   */
>  SCHED_FEAT(WAKEUP_PREEMPTION, true)
>
Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland