Message-ID: <f87a8c0d-527d-a9bc-9653-ff955e0e95b4@bytedance.com>
Date: Fri, 11 Mar 2022 15:58:47 +0800
From: chenying <chenying.kernel@...edance.com>
To: mingo@...hat.com, peterz@...radead.org, juri.lelli@...hat.com,
vincent.guittot@...aro.org, dietmar.eggemann@....com,
rostedt@...dmis.org, mgorman@...e.de, bristot@...hat.com,
bsegall@...gle.com
Cc: linux-kernel@...r.kernel.org, duanxiongchun@...edance.com,
zhouchengming@...edance.com, songmuchun@...edance.com,
zhengqi.arch@...edance.com, zhoufeng.zf@...edance.com,
ligang.bdlg@...edance.com
Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset
We add a time offset to se->vruntime when an idle sched_entity is
enqueued, so that the idle entity always sorts to the right of the
non-idle entities in the runqueue. This allows non-idle tasks to be
selected and run before idle ones.
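
As an illustration only (not part of the patch), the following
user-space sketch mimics the modified entity_before() comparison: an
idle entity carries a large vruntime_offset, so the signed comparison
keeps it behind any non-idle entity even when its raw vruntime is
smaller. The names and values here are made up for the example.

#include <stdio.h>
#include <stdint.h>

struct demo_entity {
	uint64_t vruntime;
	uint64_t vruntime_offset;	/* 0 for non-idle, large for idle */
};

/* Mirrors entity_before() + vtime_diff() from the patch below. */
static int demo_entity_before(const struct demo_entity *a,
			      const struct demo_entity *b)
{
	return (int64_t)(a->vruntime - b->vruntime +
			 a->vruntime_offset - b->vruntime_offset) < 0;
}

int main(void)
{
	struct demo_entity non_idle = { .vruntime = 2000000, .vruntime_offset = 0 };
	struct demo_entity idle     = { .vruntime = 1000000,
					.vruntime_offset = 2592000000000000ULL };

	/* Without the offset, idle (smaller vruntime) would be picked first. */
	printf("non-idle before idle: %d\n", demo_entity_before(&non_idle, &idle)); /* 1 */
	printf("idle before non-idle: %d\n", demo_entity_before(&idle, &non_idle)); /* 0 */
	return 0;
}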
A typical use-case is sched_idle for background tasks and a normal
policy for foreground tasks. The foreground tasks are latency
sensitive and should not be disturbed by the background ones. Idle
tasks can already be preempted by non-idle tasks at wakeup, but the
scheduler does not distinguish between idle and non-idle entities when
picking the next entity to run. As a result, background tasks can
still disturb the foreground.
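
For the per-task case in the test below, tasks can be marked SCHED_IDLE
with chrt as shown, or programmatically. The following stand-alone
sketch (illustrative, not part of the patch) marks the calling process
idle via sched_setscheduler():

#define _GNU_SOURCE
#include <sched.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

static int make_self_idle(void)
{
	struct sched_param sp = { .sched_priority = 0 }; /* must be 0 for SCHED_IDLE */

	if (sched_setscheduler(0, SCHED_IDLE, &sp) == -1) {	/* 0 == calling process */
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return -1;
	}
	return 0;
}

int main(void)
{
	return make_self_idle() ? 1 : 0;
}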
Test results are shown below:
~$ ./loop.sh &
[1] 764
~$ chrt -i 0 ./loop.sh &
[2] 765
~$ taskset -p 04 764
~$ taskset -p 04 765
~$ top -p 764 -p 765
top - 13:10:01 up 1 min, 2 users, load average: 1.30, 0.38, 0.13
Tasks: 2 total, 2 running, 0 sleeping, 0 stopped, 0 zombie
%Cpu(s): 12.5 us, 0.0 sy, 0.0 ni, 87.4 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
KiB Mem : 16393492 total, 16142256 free, 111028 used, 140208 buff/cache
KiB Swap: 385836 total, 385836 free, 0 used. 16037992 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
764 chenyin+ 20 0 12888 1144 1004 R 100.0 0.0 1:05.12 loop.sh
765 chenyin+ 20 0 12888 1224 1080 R 0.0 0.0 0:16.21 loop.sh
The non-idle process (764) runs at 100% CPU without being disturbed by
the idle process (765).
~$ cat /sys/fs/cgroup/cpu/background/cgroup.procs
765
~$ cat /sys/fs/cgroup/cpu/foreground/cgroup.procs
764
~$ top -p 764 -p 765
top - 13:17:19 up 9 min, 2 users, load average: 2.00, 1.64, 0.86
Tasks: 2 total, 2 running, 0 sleeping, 0 stopped, 0 zombie
%Cpu(s): 12.5 us, 0.0 sy, 0.0 ni, 87.5 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
KiB Mem : 16393492 total, 16139576 free, 112732 used, 141184 buff/cache
KiB Swap: 385836 total, 385836 free, 0 used. 16036236 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
764 chenyin+ 20 0 12888 1144 1004 R 100.0 0.0 8:23.51 loop.sh
765 chenyin+ 20 0 12888 1224 1080 R 0.0 0.0 0:16.21 loop.sh
The non-idle group runs at 100% CPU without being disturbed by the
idle group.
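
For reference, the "background" group above was marked idle through the
cgroup interface that ends up in sched_group_set_idle(). A minimal
sketch, assuming the cpu.idle file of the cgroup SCHED_IDLE support and
the cgroup v1 layout used in this test (illustrative only):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int set_group_idle(const char *cgroup_path, int idle)
{
	char path[256];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "%s/cpu.idle", cgroup_path);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, idle ? "1" : "0", 1) != 1)
		ret = -1;
	close(fd);
	return ret;
}

int main(void)
{
	return set_group_idle("/sys/fs/cgroup/cpu/background", 1) ? 1 : 0;
}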
Co-developed-by: chengming zhou <zhouchengming@...edance.com>
Signed-off-by: chenying <chenying.kernel@...edance.com>
---
include/linux/sched.h | 1 +
kernel/sched/core.c | 6 +++++-
kernel/sched/debug.c | 2 ++
kernel/sched/fair.c | 26 ++++++++++++++++++++++----
kernel/sched/features.h | 2 ++
kernel/sched/sched.h | 1 +
6 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75ba8aa60248..20412f353cad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -545,6 +545,7 @@ struct sched_entity {
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
+ u64 vruntime_offset;
u64 prev_sum_exec_runtime;
u64 nr_migrations;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9745613d531c..beb9d6f54c52 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4239,6 +4239,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+ p->se.vruntime_offset = 0;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7211,8 +7212,11 @@ static void __setscheduler_params(struct task_struct *p,
if (dl_policy(policy))
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ p->se.vruntime_offset = 0;
+ } else if (idle_policy(policy))
+ p->se.vruntime_offset = sched_idle_vruntime_offset;
/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aa29211de1bf..701496626830 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -460,6 +460,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->exec_start);
PN(se->vruntime);
+ PN(se->vruntime_offset);
PN(se->sum_exec_runtime);
if (schedstat_enabled()) {
@@ -969,6 +970,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
PN(se.exec_start);
PN(se.vruntime);
+ PN(se.vruntime_offset);
PN(se.sum_exec_runtime);
nr_switches = p->nvcsw + p->nivcsw;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5146163bfabb..6a2cba63b4a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -92,6 +92,8 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+unsigned long long sched_idle_vruntime_offset = 2592000000000000; /* 30 days */
+
int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
@@ -535,10 +537,19 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
+static inline s64 vtime_diff(struct sched_entity *a,
+ struct sched_entity *b)
+{
+ if (sched_feat(VRUNTIME_OFFSET))
+ return (s64)(a->vruntime_offset - b->vruntime_offset);
+ else
+ return 0;
+}
+
static inline bool entity_before(struct sched_entity *a,
struct sched_entity *b)
{
- return (s64)(a->vruntime - b->vruntime) < 0;
+ return (s64)(a->vruntime - b->vruntime + vtime_diff(a, b)) < 0;
}
#define __node_2_se(node) \
@@ -4445,7 +4456,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
se = __pick_first_entity(cfs_rq);
- delta = curr->vruntime - se->vruntime;
+ delta = curr->vruntime - se->vruntime + vtime_diff(curr, se);
if (delta < 0)
return;
@@ -7036,7 +7047,7 @@ static unsigned long wakeup_gran(struct sched_entity *se)
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
- s64 gran, vdiff = curr->vruntime - se->vruntime;
+ s64 gran, vdiff = curr->vruntime - se->vruntime + vtime_diff(curr, se);
if (vdiff <= 0)
return -1;
@@ -11131,7 +11142,7 @@ bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
* min_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
- delta = (s64)(sea->vruntime - seb->vruntime) +
+ delta = (s64)(sea->vruntime - seb->vruntime + vtime_diff(sea, seb)) +
(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
return delta > 0;
@@ -11190,6 +11201,9 @@ static void task_fork_fair(struct task_struct *p)
}
place_entity(cfs_rq, se, 1);
+ if (task_has_idle_policy(p))
+ se->vruntime_offset = sched_idle_vruntime_offset;
+
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
/*
* Upon rescheduling, sched_class::put_prev_task() will place
@@ -11655,6 +11669,10 @@ int sched_group_set_idle(struct task_group *tg, long idle)
rq_lock_irqsave(rq, &rf);
grp_cfs_rq->idle = idle;
+ if (idle)
+ se->vruntime_offset = sched_idle_vruntime_offset;
+ else
+ se->vruntime_offset = 0;
if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
goto next_cpu;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1cf435bbcd9c..f59f507e6dba 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -100,3 +100,5 @@ SCHED_FEAT(LATENCY_WARN, false)
SCHED_FEAT(ALT_PERIOD, true)
SCHED_FEAT(BASE_SLICE, true)
+
+SCHED_FEAT(VRUNTIME_OFFSET, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de53be905739..1bc0c0756fd4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -95,6 +95,7 @@ extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
+extern unsigned long long sched_idle_vruntime_offset;
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
--
2.11.0