Message-Id: <1505860632-11275-2-git-send-email-rohit.k.jain@oracle.com>
Date:   Tue, 19 Sep 2017 15:37:11 -0700
From:   Rohit Jain <rohit.k.jain@...cle.com>
To:     linux-kernel@...r.kernel.org, eas-dev@...ts.linaro.org
Cc:     peterz@...radead.org, mingo@...hat.com, joelaf@...gle.com
Subject: [PATCH 1/2] sched: Introduce new flags to sched_setaffinity to support soft affinity.

This patch adds the sched_setaffinity_flags() system call and sets the
task's cpus_preferred mask as requested by the application. With
SCHED_HARD_AFFINITY the call behaves exactly like sched_setaffinity();
with SCHED_SOFT_AFFINITY it only records the preferred mask. Nothing
acts on cpus_preferred yet. (An illustrative userspace invocation is
sketched after the diffstat.)

Signed-off-by: Rohit Jain <rohit.k.jain@...cle.com>
---
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 include/linux/init_task.h              |   1 +
 include/linux/sched.h                  |   4 +-
 include/linux/syscalls.h               |   3 +
 include/uapi/asm-generic/unistd.h      |   4 +-
 include/uapi/linux/sched.h             |   3 +
 kernel/compat.c                        |   2 +-
 kernel/sched/core.c                    | 167 ++++++++++++++++++++++++++++-----
 kernel/time/tick-sched.c               |   1 +
 9 files changed, 159 insertions(+), 27 deletions(-)
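
Illustration only, not part of the patch: there is no glibc wrapper for
the new call, so userspace would invoke it through syscall(2). The
syscall number 333 and the flag values below are taken from this patch
(x86-64); treat the snippet as a sketch under those assumptions.

#define _GNU_SOURCE
#include <sched.h>		/* cpu_set_t, CPU_ZERO, CPU_SET */
#include <unistd.h>		/* syscall() */
#include <sys/syscall.h>

#ifndef __NR_sched_setaffinity_flags
#define __NR_sched_setaffinity_flags 333	/* x86-64, from this patch */
#endif

#define SCHED_HARD_AFFINITY	0	/* include/uapi/linux/sched.h */
#define SCHED_SOFT_AFFINITY	1

int main(void)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);	/* prefer CPUs 0 and 1, */
	CPU_SET(1, &mask);	/* without hard-binding the task */

	/* pid 0 means the calling task; returns 0 on success */
	return syscall(__NR_sched_setaffinity_flags, 0, sizeof(mask),
		       &mask, SCHED_SOFT_AFFINITY);
}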

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183..bd5f346 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	common	sched_setaffinity_flags	sys_sched_setaffinity_flags
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 0e84971..bb8a8e1 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -235,6 +235,7 @@ extern struct cred init_cred;
 	.normal_prio	= MAX_PRIO-20,					\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
+	.cpus_preferred = CPU_MASK_ALL,					\
 	.nr_cpus_allowed= NR_CPUS,					\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 534542d..7e08ae8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -582,6 +582,7 @@ struct task_struct {
 	unsigned int			policy;
 	int				nr_cpus_allowed;
 	cpumask_t			cpus_allowed;
+	cpumask_t			cpus_preferred;
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;
@@ -1647,7 +1648,8 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 # define vcpu_is_preempted(cpu)	false
 #endif
 
-extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask,
+			      int flags);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
 #ifndef TASK_SIZE_OF
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d4dfac8..83d04da 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -326,6 +326,9 @@ asmlinkage long sys_sched_get_priority_max(int policy);
 asmlinkage long sys_sched_get_priority_min(int policy);
 asmlinkage long sys_sched_rr_get_interval(pid_t pid,
 					struct timespec __user *interval);
+asmlinkage long sys_sched_setaffinity_flags(pid_t pid, unsigned int len,
+					    unsigned long __user *user_mask_ptr,
+					    int flags);
 asmlinkage long sys_setpriority(int which, int who, int niceval);
 asmlinkage long sys_getpriority(int which, int who);
 
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a..5e88941 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -376,6 +376,8 @@ __SYSCALL(__NR_sched_getparam, sys_sched_getparam)
 #define __NR_sched_setaffinity 122
 __SC_COMP(__NR_sched_setaffinity, sys_sched_setaffinity, \
 	  compat_sys_sched_setaffinity)
+#define __NR_sched_setaffinity_flags 293
+__SYSCALL(__NR_sched_setaffinity_flags, sys_sched_setaffinity_flags)
 #define __NR_sched_getaffinity 123
 __SC_COMP(__NR_sched_getaffinity, sys_sched_getaffinity, \
 	  compat_sys_sched_getaffinity)
@@ -733,7 +735,7 @@ __SYSCALL(__NR_pkey_free,     sys_pkey_free)
 __SYSCALL(__NR_statx,     sys_statx)
 
 #undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293
 
 /*
  * All syscalls below here should go away really,
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index e2a6c7b..81c17f5 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -49,4 +49,7 @@
 #define SCHED_FLAG_RESET_ON_FORK	0x01
 #define SCHED_FLAG_RECLAIM		0x02
 
+#define SCHED_HARD_AFFINITY	0
+#define SCHED_SOFT_AFFINITY	1
+
 #endif /* _UAPI_LINUX_SCHED_H */
diff --git a/kernel/compat.c b/kernel/compat.c
index 6f0a0e7..0ec60ea 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -356,7 +356,7 @@ COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
 	if (retval)
 		goto out;
 
-	retval = sched_setaffinity(pid, new_mask);
+	retval = sched_setaffinity(pid, new_mask, SCHED_HARD_AFFINITY);
 out:
 	free_cpumask_var(new_mask);
 	return retval;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec80d2f..2e8d392 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1031,6 +1031,11 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
+void set_cpus_preferred_common(struct task_struct *p, const struct cpumask *new_mask)
+{
+	cpumask_copy(&p->cpus_preferred, new_mask);
+}
+
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
 	struct rq *rq = task_rq(p);
@@ -1053,6 +1058,36 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		put_prev_task(rq, p);
 
 	p->sched_class->set_cpus_allowed(p, new_mask);
+	set_cpus_preferred_common(p, new_mask);
+
+	if (queued)
+		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+	if (running)
+		set_curr_task(rq, p);
+}
+
+void do_set_cpus_preferred(struct task_struct *p, const struct cpumask *new_mask)
+{
+	struct rq *rq = task_rq(p);
+	bool queued, running;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
+
+	if (queued) {
+		/*
+		 * Because __kthread_bind() calls this on blocked tasks without
+		 * rq->lock must be held when a queued task is modified;
+		 * assert it here, as do_set_cpus_allowed() does.
+		lockdep_assert_held(&rq->lock);
+		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+	}
+	if (running)
+		put_prev_task(rq, p);
+
+	set_cpus_preferred_common(p, new_mask);
 
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1142,6 +1177,63 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	return ret;
 }
 
+static int
+__set_cpus_preferred_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	const struct cpumask *cpu_valid_mask = cpu_active_mask;
+	unsigned int dest_cpu;
+	struct rq_flags rf;
+	struct rq *rq;
+	int ret = 0;
+
+	rq = task_rq_lock(p, &rf);
+	update_rq_clock(rq);
+
+	if (p->flags & PF_KTHREAD) {
+		/*
+		 * Kernel threads are allowed on online && !active CPUs
+		 */
+		cpu_valid_mask = cpu_online_mask;
+	}
+
+	if (cpumask_equal(&p->cpus_preferred, new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	do_set_cpus_preferred(p, new_mask);
+
+	if (p->flags & PF_KTHREAD) {
+		/*
+		 * For kernel threads that do indeed end up on online &&
+		 * !active we want to ensure they are strict per-CPU threads.
+		 */
+		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+			!cpumask_intersects(new_mask, cpu_active_mask) &&
+			p->nr_cpus_allowed != 1);
+	}
+
+	/* Is the task already on a preferred CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+	if (task_on_rq_queued(p)) {
+		/*
+		 * OK, since we're going to drop the lock immediately
+		 * afterwards anyway.
+		 */
+		rq = move_queued_task(rq, &rf, p, dest_cpu);
+	}
+out:
+	task_rq_unlock(rq, p, &rf);
+
+	return ret;
+}
+
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
 	return __set_cpus_allowed_ptr(p, new_mask, false);
@@ -4620,7 +4712,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	return retval;
 }
 
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask, int flags)
 {
 	cpumask_var_t cpus_allowed, new_mask;
 	struct task_struct *p;
@@ -4686,19 +4778,23 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	}
 #endif
 again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, true);
-
-	if (!retval) {
-		cpuset_cpus_allowed(p, cpus_allowed);
-		if (!cpumask_subset(new_mask, cpus_allowed)) {
-			/*
-			 * We must have raced with a concurrent cpuset
-			 * update. Just reset the cpus_allowed to the
-			 * cpuset's cpus_allowed
-			 */
-			cpumask_copy(new_mask, cpus_allowed);
-			goto again;
+	if (flags == SCHED_HARD_AFFINITY) {
+		retval = __set_cpus_allowed_ptr(p, new_mask, true);
+
+		if (!retval) {
+			cpuset_cpus_allowed(p, cpus_allowed);
+			if (!cpumask_subset(new_mask, cpus_allowed)) {
+				/*
+				 * We must have raced with a concurrent cpuset
+				 * update. Just reset the cpus_allowed to the
+				 * cpuset's cpus_allowed
+				 */
+				cpumask_copy(new_mask, cpus_allowed);
+				goto again;
+			}
 		}
+	} else if (flags == SCHED_SOFT_AFFINITY) {
+		retval = __set_cpus_preferred_ptr(p, new_mask);
 	}
 out_free_new_mask:
 	free_cpumask_var(new_mask);
@@ -4720,30 +4816,53 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
 	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
 }
 
-/**
- * sys_sched_setaffinity - set the CPU affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new CPU mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
-		unsigned long __user *, user_mask_ptr)
+static bool
+valid_affinity_flags(int flags)
+{
+	return flags == SCHED_HARD_AFFINITY || flags == SCHED_SOFT_AFFINITY;
+}
+
+static int
+sched_setaffinity_common(pid_t pid, unsigned int len,
+			 unsigned long __user *user_mask_ptr, int flags)
 {
 	cpumask_var_t new_mask;
 	int retval;
 
+	if (!valid_affinity_flags(flags))
+		return -EINVAL;
+
 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
 		return -ENOMEM;
 
 	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
 	if (retval == 0)
-		retval = sched_setaffinity(pid, new_mask);
+		retval = sched_setaffinity(pid, new_mask, flags);
 	free_cpumask_var(new_mask);
 	return retval;
 }
 
+SYSCALL_DEFINE4(sched_setaffinity_flags, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr, int, flags)
+{
+	return sched_setaffinity_common(pid, len, user_mask_ptr, flags);
+}
+
+/**
+ * sys_sched_setaffinity - set the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new CPU mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	return sched_setaffinity_common(pid, len, user_mask_ptr,
+					SCHED_HARD_AFFINITY);
+}
+
 long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
 	struct task_struct *p;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index eb0e975..ede1add 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,6 +19,7 @@
 #include <linux/percpu.h>
 #include <linux/nmi.h>
 #include <linux/profile.h>
+#include <linux/vmstat.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/stat.h>
-- 
2.7.4
