linux-kernel - [PATCH 6/7] sched/rt: make it configurable

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20170529210302.26868-7-nicolas.pitre@linaro.org>
Date:   Mon, 29 May 2017 17:03:01 -0400
From:   Nicolas Pitre <nicolas.pitre@...aro.org>
To:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>
Cc:     linux-kernel@...r.kernel.org
Subject: [PATCH 6/7] sched/rt: make it configurable

On most small systems where user space is tightly controlled, the realtime
scheduling class can often be dispensed with to reduce the kernel footprint.
Let's make it configurable.

Signed-off-by: Nicolas Pitre <nico@...aro.org>
---
 include/linux/init_task.h      | 15 +++++++++++----
 include/linux/sched.h          |  2 ++
 include/linux/sched/rt.h       |  4 ++--
 init/Kconfig                   | 14 ++++++++++++--
 kernel/sched/Makefile          |  4 ++--
 kernel/sched/core.c            | 42 +++++++++++++++++++++++++++++++++++++++---
 kernel/sched/debug.c           |  2 ++
 kernel/sched/sched.h           |  7 +++++--
 kernel/sched/stop_task.c       |  4 +++-
 kernel/sysctl.c                |  4 +++-
 kernel/time/posix-cpu-timers.c |  6 +++++-
 11 files changed, 86 insertions(+), 18 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e049526bc1..6befc0aa61 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -225,6 +225,16 @@ extern struct cred init_cred;
 #define INIT_TASK_SECURITY
 #endif
 
+#ifdef CONFIG_SCHED_RT
+#define INIT_TASK_RT(tsk)						\
+	.rt		= {						\
+		.run_list	= LIST_HEAD_INIT(tsk.rt.run_list),	\
+		.time_slice	= RR_TIMESLICE,				\
+	},
+#else
+#define INIT_TASK_RT(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -250,10 +260,7 @@ extern struct cred init_cred;
 	.se		= {						\
 		.group_node 	= LIST_HEAD_INIT(tsk.se.group_node),	\
 	},								\
-	.rt		= {						\
-		.run_list	= LIST_HEAD_INIT(tsk.rt.run_list),	\
-		.time_slice	= RR_TIMESLICE,				\
-	},								\
+	INIT_TASK_RT(tsk)						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	INIT_PUSHABLE_TASKS(tsk)					\
 	INIT_CGROUP_SCHED(tsk)						\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ba0c203669..71a43480ed 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -518,7 +518,9 @@ struct task_struct {
 
 	const struct sched_class	*sched_class;
 	struct sched_entity		se;
+#ifdef CONFIG_SCHED_RT
 	struct sched_rt_entity		rt;
+#endif
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group		*sched_task_group;
 #endif
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index f93329aba3..f2d636582d 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -7,7 +7,7 @@ struct task_struct;
 
 static inline int rt_prio(int prio)
 {
-	if (unlikely(prio < MAX_RT_PRIO))
+	if (IS_ENABLED(CONFIG_SCHED_RT) && unlikely(prio < MAX_RT_PRIO))
 		return 1;
 	return 0;
 }
@@ -17,7 +17,7 @@ static inline int rt_task(struct task_struct *p)
 	return rt_prio(p->prio);
 }
 
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
 /*
  * Must hold either p->pi_lock or task_rq(p)->lock.
  */
diff --git a/init/Kconfig b/init/Kconfig
index f73e3f0940..3bcd49f576 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -687,7 +687,7 @@ config TREE_RCU_TRACE
 
 config RCU_BOOST
 	bool "Enable RCU priority boosting"
-	depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
+	depends on SCHED_RT && RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
 	default n
 	help
 	  This option boosts the priority of preempted RCU readers that
@@ -1090,7 +1090,7 @@ config CFS_BANDWIDTH
 
 config RT_GROUP_SCHED
 	bool "Group scheduling for SCHED_RR/FIFO"
-	depends on CGROUP_SCHED
+	depends on CGROUP_SCHED && SCHED_RT
 	default n
 	help
 	  This feature lets you explicitly allocate real CPU bandwidth
@@ -1303,8 +1303,17 @@ config SCHED_AUTOGROUP
 	  desktop applications.  Task group autogeneration is currently based
 	  upon task session.
 
+config SCHED_RT
+	bool "Real Time Task Scheduling" if EXPERT
+	default y
+	help
+	  This adds the sched_rt scheduling class to the kernel providing
+	  support for the SCHED_FIFO and SCHED_RR policies. You might want
+	  to disable this to reduce the kernel size. If unsure say y.
+
 config SCHED_DL
 	bool "Deadline Task Scheduling" if EXPERT
+	depends on SCHED_RT
 	default y
 	help
 	  This adds the sched_dl scheduling class to the kernel providing
@@ -1632,6 +1641,7 @@ config BASE_FULL
 config FUTEX
 	bool "Enable futex support" if EXPERT
 	default y
+	depends on SCHED_RT
 	select RT_MUTEXES
 	help
 	  Disabling this option will cause the kernel to be built without
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 3bd6a7c1cc..bccbef85e5 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,8 +16,8 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += wait.o swait.o completion.o idle.o
-obj-y += idle_task.o fair.o rt.o
+obj-y += wait.o swait.o completion.o idle.o idle_task.o fair.o
+obj-$(CONFIG_SCHED_RT) += rt.o
 obj-$(CONFIG_SCHED_DL) += deadline.o $(if $(CONFIG_SMP),cpudeadline.o)
 obj-$(CONFIG_SMP) += cpupri.o topology.o stop_task.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a7b004e440..3dd6fce750 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -640,6 +640,7 @@ bool sched_can_stop_tick(struct rq *rq)
 		return false;
 #endif
 
+#ifdef CONFIG_SCHED_RT
 	/*
 	 * If there are more than one RR tasks, we need the tick to effect the
 	 * actual RR behaviour.
@@ -658,6 +659,7 @@ bool sched_can_stop_tick(struct rq *rq)
 	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
 	if (fifo_nr_running)
 		return true;
+#endif
 
 	/*
 	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
@@ -1586,7 +1588,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
 		 * Reset it back to a normal scheduling class so that
 		 * it can die in pieces.
 		 */
-		old_stop->sched_class = &rt_sched_class;
+		old_stop->sched_class = stop_sched_class.next;
 	}
 }
 
@@ -2182,11 +2184,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	__dl_clear_params(p);
 #endif
 
+#ifdef CONFIG_SCHED_RT
 	INIT_LIST_HEAD(&p->rt.run_list);
 	p->rt.timeout		= 0;
 	p->rt.time_slice	= sched_rr_timeslice;
 	p->rt.on_rq		= 0;
 	p->rt.on_list		= 0;
+#endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -3716,13 +3720,18 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		p->sched_class = &dl_sched_class;
 	} else
 #endif
+#ifdef CONFIG_SCHED_RT
 	if (rt_prio(prio)) {
 		if (oldprio < prio)
 			queue_flag |= ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
-	} else {
+	} else
+#endif
+	{
+#ifdef CONFIG_SCHED_RT
 		if (rt_prio(oldprio))
 			p->rt.timeout = 0;
+#endif
 		p->sched_class = &fair_sched_class;
 	}
 
@@ -3997,6 +4006,23 @@ static int __sched_setscheduler(struct task_struct *p,
 
 	/* May grab non-irq protected spin_locks: */
 	BUG_ON(in_interrupt());
+
+	/*
+	 * When the RT scheduling class is disabled, let's make sure kernel threads
+	 * wanting RT still get lowest nice value to give them highest available
+	 * priority rather than simply returning an error. Obviously we can't test
+	 * rt_policy() here as it is always false in that case.
+	 */
+	if (!IS_ENABLED(CONFIG_SCHED_RT) && !user &&
+	    (policy == SCHED_FIFO || policy == SCHED_RR)) {
+		static const struct sched_attr k_attr = {
+			.sched_policy = SCHED_NORMAL,
+			.sched_nice = MIN_NICE,
+		};
+		attr = &k_attr;
+		policy = SCHED_NORMAL;
+	}
+
 recheck:
 	/* Double check policy once rq lock held: */
 	if (policy < 0) {
@@ -5726,7 +5752,9 @@ void __init sched_init_smp(void)
 	sched_init_granularity();
 	free_cpumask_var(non_isolated_cpus);
 
+#ifdef CONFIG_SCHED_RT
 	init_sched_rt_class();
+#endif
 #ifdef CONFIG_SCHED_DL
 	init_sched_dl_class();
 #endif
@@ -5832,7 +5860,9 @@ void __init sched_init(void)
 	}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 
+#ifdef CONFIG_SCHED_RT
 	init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
+#endif
 #ifdef CONFIG_SCHED_DL
 	init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
 #endif
@@ -5864,7 +5894,10 @@ void __init sched_init(void)
 		rq->calc_load_active = 0;
 		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs);
+#ifdef CONFIG_SCHED_RT
 		init_rt_rq(&rq->rt);
+		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#endif
 #ifdef CONFIG_SCHED_DL
 		init_dl_rq(&rq->dl);
 #endif
@@ -5895,7 +5928,6 @@ void __init sched_init(void)
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
@@ -6132,7 +6164,9 @@ static DEFINE_SPINLOCK(task_group_lock);
 static void sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
+#ifdef CONFIG_SCHED_RT
 	free_rt_sched_group(tg);
+#endif
 	autogroup_free(tg);
 	kmem_cache_free(task_group_cache, tg);
 }
@@ -6149,8 +6183,10 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_fair_sched_group(tg, parent))
 		goto err;
 
+#ifdef CONFIG_SCHED_RT
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
+#endif
 
 	return tg;
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 84f80a81ab..c550723ce9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -645,7 +645,9 @@ do {									\
 
 	spin_lock_irqsave(&sched_debug_lock, flags);
 	print_cfs_stats(m, cpu);
+#ifdef CONFIG_SCHED_RT
 	print_rt_stats(m, cpu);
+#endif
 #ifdef CONFIG_SCHED_DL
 	print_dl_stats(m, cpu);
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 41dc10b707..38439eefd3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -132,7 +132,8 @@ static inline int fair_policy(int policy)
 
 static inline int rt_policy(int policy)
 {
-	return policy == SCHED_FIFO || policy == SCHED_RR;
+	return IS_ENABLED(CONFIG_SCHED_RT) &&
+	       (policy == SCHED_FIFO || policy == SCHED_RR);
 }
 
 static inline int dl_policy(int policy)
@@ -1447,8 +1448,10 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
 #define sched_class_highest (&stop_sched_class)
 #elif defined(CONFIG_SCHED_DL)
 #define sched_class_highest (&dl_sched_class)
-#else
+#elif defined(CONFIG_SCHED_RT)
 #define sched_class_highest (&rt_sched_class)
+#else
+#define sched_class_highest (&fair_sched_class)
 #endif
 
 #define for_each_class(class) \
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 5632dc3e63..7cad8c1540 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -112,8 +112,10 @@ static void update_curr_stop(struct rq *rq)
 const struct sched_class stop_sched_class = {
 #ifdef CONFIG_SCHED_DL
 	.next			= &dl_sched_class,
-#else
+#elif defined(CONFIG_SCHED_RT)
 	.next			= &rt_sched_class,
+#else
+	.next			= &fair_sched_class,
 #endif
 
 	.enqueue_task		= enqueue_task_stop,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a76c..1c670f4053 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -401,6 +401,7 @@ static struct ctl_table kern_table[] = {
 	},
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_SCHED_RT
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
@@ -422,6 +423,7 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rr_handler,
 	},
+#endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",
@@ -1071,7 +1073,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &neg_one,
 	},
 #endif
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
 	{
 		.procname	= "max_lock_depth",
 		.data		= &max_lock_depth,
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index d2a1e6dd02..010efb0e91 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -790,10 +790,12 @@ static void check_thread_timers(struct task_struct *tsk,
 				struct list_head *firing)
 {
 	struct list_head *timers = tsk->cpu_timers;
-	struct signal_struct *const sig = tsk->signal;
 	struct task_cputime *tsk_expires = &tsk->cputime_expires;
 	u64 expires;
+#ifdef CONFIG_SCHED_RT
+	struct signal_struct *const sig = tsk->signal;
 	unsigned long soft;
+#endif
 
 	/*
 	 * If cputime_expires is zero, then there are no active
@@ -811,6 +813,7 @@ static void check_thread_timers(struct task_struct *tsk,
 	tsk_expires->sched_exp = check_timers_list(++timers, firing,
 						   tsk->se.sum_exec_runtime);
 
+#ifdef CONFIG_SCHED_RT
 	/*
 	 * Check for the special case thread timers.
 	 */
@@ -847,6 +850,7 @@ static void check_thread_timers(struct task_struct *tsk,
 			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
 		}
 	}
+#endif
 	if (task_cputime_zero(tsk_expires))
 		tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
 }
-- 
2.9.4