lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Sat, 10 Sep 2022 16:23:23 +0530
From:   K Prateek Nayak <kprateek.nayak@....com>
To:     <linux-kernel@...r.kernel.org>
CC:     <aubrey.li@...ux.intel.com>, <efault@....de>,
        <gautham.shenoy@....com>, <libo.chen@...cle.com>,
        <mgorman@...hsingularity.net>, <mingo@...nel.org>,
        <peterz@...radead.org>, <song.bao.hua@...ilicon.com>,
        <srikar@...ux.vnet.ibm.com>, <tglx@...utronix.de>,
        <valentin.schneider@....com>, <vincent.guittot@...aro.org>,
        <wuyun.abel@...edance.com>, <wyes.karny@....com>,
        <yu.c.chen@...el.com>, <yangyicong@...wei.com>
Subject: [PATCH 2/5] prctl: Add interface and helper functions to set hints

Hints are low-level knobs that can influence task placement decisions
at various scheduler decision points.

- Design of wakeup hints

Hints are of two kinds:

    o fork time hints: These hints influence initial placement. It is
      observed that a correct initial placement can have a long lasting
      effect on workload performance. These hints override scheduler
      behavior when the system is not heavily loaded to overlook any
      pre-configured bias in scheduler (such as NUMA Imbalance) and
      place tasks in a way user finds beneficial.
      These hints are of the form PR_SCHED_HINT_FORK_* namely:
      	- PR_SCHED_HINT_FORK_AFFINE
      	- PR_SCHED_HINT_FORK_SPREAD

    o wakeup hints: These hints target a specific MC Domain during
      wakeup. The user can choose to bias the placement towards waker's
      LLC if we believe the waker-wakee follow a producer-consumer
      pattern. The user can also choose to bias the placement towards
      the MC domain where the tasks previously ran if we believe waker's
      signaling is just for synchronization and the wakee will continue
      to consume the data it produced during its last run.
      These hints are of the form PR_SCHED_HINT_WAKE_* namely:
        - PR_SCHED_HINT_WAKE_AFFINE
      	- PR_SCHED_HINT_WAKE_HOLD

Only one hint of each type can be set at a time currently. Setting
conflicting hints of the same type makes prctl() return -EINVAL.

- API Design

An example of setting hint PR_SCHED_HINT_FORK_AFFINE and
PR_SCHED_HINT_WAKE_AFFINE for the currently running process is as follows:

	#include <sys/prctl.h>

	prctl(PR_SCHED_HINT /* prctl() cmd */,
	      PR_SCHED_HINT_WAKE_AFFINE | PR_SCHED_HINT_FORK_AFFINE /* Hints */,
	      0 /* pid of task to set hint for. pid 0 sets hint for current task */,
	      0, 0);

The above command sets PR_SCHED_HINT_WAKE_AFFINE and
PR_SCHED_HINT_FORK_AFFINE for the task calling the prctl(). For a hint
to be set, the task should be ptrace-able.

As hints can be inherited by children, one can wrap the runner script
with a hint to avail the benefits and need not change the workload.

Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
 include/linux/sched.h      |  1 +
 include/uapi/linux/prctl.h |  8 ++++
 kernel/sched/core.c        | 81 ++++++++++++++++++++++++++++++++++++++
 kernel/sys.c               |  5 +++
 4 files changed, 95 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fc953c9e956a..84a630d7c529 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2405,5 +2405,6 @@ static inline void sched_core_fork(struct task_struct *p) { }
 #endif
 
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+extern int sched_set_hint(unsigned int hint, pid_t pid);
 
 #endif
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a5e06dcbba13..0a5dd42f1eab 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -284,4 +284,12 @@ struct prctl_mm_map {
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
+/* Hint the scheduler about the expected task behavior */
+#define PR_SCHED_HINT			65
+# define PR_SCHED_HINT_DEFAULT		0
+# define PR_SCHED_HINT_FORK_AFFINE	(1U << 0) /* Initial placement close to forking CPU */
+# define PR_SCHED_HINT_FORK_SPREAD	(1U << 1) /* Initial placement biased towards idlest group */
+# define PR_SCHED_HINT_WAKE_AFFINE	(1U << 2) /* Subsequent wakeups target waker's MC domain */
+# define PR_SCHED_HINT_WAKE_HOLD	(1U << 3) /* Subsequent wakeups target last run's MC domain */
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7d289d87acf7..60bee250d7b4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,6 +71,7 @@
 # endif
 #endif
 
+#include <uapi/linux/prctl.h>
 #include <uapi/linux/sched/types.h>
 
 #include <asm/switch_to.h>
@@ -7323,6 +7324,86 @@ static void __setscheduler_params(struct task_struct *p,
 	set_load_weight(p, true);
 }
 
+/*
+ * Check whether the hints are valid and unambiguous.
+ * Returns 0 if the hints are invalid or if
+ * the set hints are ambiguous and can lead to
+ * inconsistent behavior.
+ * Returns 1 for a valid hint (including no hint, i.e. 0).
+ */
+static int valid_hint(unsigned int hint) {
+	if (!hint)
+		return 1;
+
+	/*
+	 * Upper bound: OR of the numerically largest hint of each category.
+	 * Anything above it has an unknown bit or a same-category conflict.
+	 */
+	if (hint > (PR_SCHED_HINT_WAKE_HOLD | PR_SCHED_HINT_FORK_SPREAD))
+		return 0;
+
+	/*
+	 * At most one of the fork time hints may be
+	 * set. Consistent behavior cannot be
+	 * guaranteed with conflicting hints.
+	 */
+	if ((hint & PR_SCHED_HINT_FORK_AFFINE) &&
+	    (hint & PR_SCHED_HINT_FORK_SPREAD))
+		return 0;
+
+	/*
+	 * At most one of the wakeup hints may be
+	 * set for the same reason stated above.
+	 */
+	if ((hint & PR_SCHED_HINT_WAKE_AFFINE) &&
+	    (hint & PR_SCHED_HINT_WAKE_HOLD))
+		return 0;
+
+	return 1;
+}
+
+/* Called from prctl interface: PR_SCHED_HINT. Sets @hint on task @pid (0: current) */
+int sched_set_hint(unsigned int hint, pid_t pid)
+{
+	struct task_struct *task;
+	int err = 0;
+
+	/*
+	 * Make sure hint is valid and the user has not
+	 * requested conflicting behavior at any
+	 * given decision point; -EINVAL otherwise.
+	 */
+	if (!valid_hint(hint))
+		return -EINVAL;
+
+	rcu_read_lock();
+	if (pid == 0) {
+		task = current;
+	} else {
+		task = find_task_by_vpid(pid);
+		if (!task) {
+			rcu_read_unlock();
+			return -ESRCH;
+		}
+	}
+	get_task_struct(task);	/* paired with put_task_struct() at out: */
+	rcu_read_unlock();
+
+	/*
+	 * Check if the caller has the right to modify the specified
+	 * task via the regular ptrace_may_access() check; -EPERM if not.
+	 */
+	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	WRITE_ONCE(task->hint, hint);
+out:
+	put_task_struct(task);
+	return err;
+}
+
 /*
  * Check the target process has a UID that matches the current process's:
  */
diff --git a/kernel/sys.c b/kernel/sys.c
index b911fa6d81ab..505ceea548bd 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2623,6 +2623,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SET_VMA:
 		error = prctl_set_vma(arg2, arg3, arg4, arg5);
 		break;
+	case PR_SCHED_HINT:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = sched_set_hint(arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;
-- 
2.25.1

Powered by blists - more mailing lists