linux-kernel - [PATCH 1/2] sched: deferred set priority (dprio) -- rebased for the tip

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <CA+80gGaC5tutmFiTCyL-zoBY4BcbZnh63wXhO-g5UetP+w5g4w@mail.gmail.com>
Date:	Thu, 25 Sep 2014 12:26:04 -0700
From:	Sergey Oboguev <oboguev.public@...il.com>
To:	linux-kernel@...r.kernel.org
Cc:	mingo@...hat.com, Peter Zijlstra <peterz@...radead.org>
Subject: [PATCH 1/2] sched: deferred set priority (dprio) -- rebased for the tip

This is a replica of "[PATCH 1/2] dprio" (posted yesterday for 3.16.3)
rebased now for the current tip (3.17.0-rc6).

Signed-off-by: Sergey Oboguev <oboguev@...oo.com>

---
 Documentation/sysctl/kernel.txt |  14 +
 fs/exec.c                       |   8 +
 include/linux/dprio.h           | 129 +++++++++
 include/linux/init_task.h       |  17 ++
 include/linux/sched.h           |  19 ++
 include/uapi/linux/Kbuild       |   1 +
 include/uapi/linux/capability.h |   5 +-
 include/uapi/linux/dprio_api.h  | 137 +++++++++
 include/uapi/linux/prctl.h      |   2 +
 init/Kconfig                    |   2 +
 kernel/Kconfig.dprio            |  68 +++++
 kernel/exit.c                   |   6 +
 kernel/fork.c                   |  88 +++++-
 kernel/sched/Makefile           |   1 +
 kernel/sched/core.c             | 195 ++++++++++++-
 kernel/sched/dprio.c            | 617 ++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c                    |   6 +
 kernel/sysctl.c                 |  12 +
 18 files changed, 1315 insertions(+), 12 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index f79eb96..7b379cd 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -30,6 +30,7 @@ show up in /proc/sys/kernel:
 - core_uses_pid
 - ctrl-alt-del
 - dmesg_restrict
+- dprio_privileged
 - domainname
 - hostname
 - hotplug
@@ -267,6 +268,19 @@ default value of dmesg_restrict.

 ==============================================================

+dprio_privileged:
+
+This toggle indicates whether unprivileged users are prevented
+from using dprio(2) to execute deferred set priority requests.
+When dprio_privileged is set to (0) there are no restrictions.
+When dprio_privileged is set to (1), users must have CAP_DPRIO
+to use dprio(2), i.e. prctl(PR_SET_DEFERRED_SETPRIO).
+
+The kernel config option CONFIG_DEFERRED_SETPRIO_PRIVILEGED sets
+the default value of dprio_privileged.
+
+==============================================================
+
 domainname & hostname:

 These files can be used to set the NIS/YP domainname and the
diff --git a/fs/exec.c b/fs/exec.c
index a2b42a9..439bc42 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,6 +56,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
 #include <linux/compat.h>
+#include <linux/dprio.h>

 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1430,6 +1431,7 @@ static int do_execve_common(struct filename *filename,
     struct file *file;
     struct files_struct *displaced;
     int retval;
+    struct dprio_saved_context dprio_context;

     if (IS_ERR(filename))
         return PTR_ERR(filename);
@@ -1480,6 +1482,9 @@ static int do_execve_common(struct filename *filename,
     if (retval)
         goto out_unmark;

+    dprio_handle_request();
+    dprio_save_reset_context(&dprio_context);
+
     bprm->argc = count(argv, MAX_ARG_STRINGS);
     if ((retval = bprm->argc) < 0)
         goto out;
@@ -1518,6 +1523,7 @@ static int do_execve_common(struct filename *filename,
     putname(filename);
     if (displaced)
         put_files_struct(displaced);
+    dprio_free_context(&dprio_context);
     return retval;

 out:
@@ -1526,6 +1532,8 @@ out:
         mmput(bprm->mm);
     }

+    dprio_restore_context(&dprio_context);
+
 out_unmark:
     current->fs->in_exec = 0;
     current->in_execve = 0;
diff --git a/include/linux/dprio.h b/include/linux/dprio.h
new file mode 100644
index 0000000..1119c00
--- /dev/null
+++ b/include/linux/dprio.h
@@ -0,0 +1,129 @@
+/*
+ * include/linux/dprio.h
+ *
+ * Deferred set priority.
+ *
+ * Started by (C) 2014 Sergey Oboguev <oboguev@...oo.com>
+ *
+ * This code is licenced under the GPL version 2 or later.
+ * For details see linux-kernel-base/COPYING.
+ */
+
+#ifndef _LINUX_DPRIO_H
+#define _LINUX_DPRIO_H
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#ifdef CONFIG_DEFERRED_SETPRIO
+
+/*
+ * @mask contains bit-flags indicating which policies have been pre-approved.
+ * Other fields are valid only if the corresponding bit is set in the @mask.
+ */
+static __always_inline void __dprio_info_assumptions(void)
+{
+    /* SCHED_xxx is used as a bit index in @mask */
+    BUILD_BUG_ON(SCHED_NORMAL > 31);
+    BUILD_BUG_ON(SCHED_FIFO > 31);
+    BUILD_BUG_ON(SCHED_RR > 31);
+    BUILD_BUG_ON(SCHED_BATCH > 31);
+    BUILD_BUG_ON(SCHED_IDLE > 31);
+}
+struct dprio_info {
+    unsigned mask;
+    s32 normal_sched_nice;
+    s32 batch_sched_nice;
+    u32 fifo_sched_priority;
+    u32 rr_sched_priority;
+    bool capable_sys_nice;
+};
+
+/*
+ * Called by dup_task_struct to reset non-inherited fields
+ */
+static __always_inline void set_task_in_dprio(struct task_struct *tsk,
+                          bool in_dprio)
+{
+#ifdef CONFIG_DEBUG_DEFERRED_SETPRIO
+    tsk->in_dprio = in_dprio;
+#endif
+}
+
+static inline void dprio_dup_task_struct(struct task_struct *tsk)
+{
+    /* reset deferred setprio fields not inherited from the parent */
+    tsk->dprio_ku_area_pp = NULL;
+    tsk->dprio_info = NULL;
+    set_task_in_dprio(tsk, false);
+}
+
+void dprio_detach(struct task_struct *tsk);
+void dprio_handle_request(void);
+bool dprio_check_for_request(struct task_struct *prev);
+long dprio_prctl(int option, unsigned long a2, unsigned long a3,
+         unsigned long a4, unsigned long a5);
+
+struct dprio_saved_context {
+    struct dprio_ku_area __user * __user *dprio_ku_area_pp;
+    struct dprio_info *dprio_info;
+};
+
+static inline void dprio_save_reset_context(struct dprio_saved_context *saved)
+{
+    saved->dprio_ku_area_pp = current->dprio_ku_area_pp;
+    saved->dprio_info = current->dprio_info;
+
+    if (unlikely(saved->dprio_ku_area_pp)) {
+        preempt_disable();
+        current->dprio_ku_area_pp = NULL;
+        current->dprio_info = NULL;
+        preempt_enable();
+    }
+}
+
+static inline void dprio_restore_context(struct dprio_saved_context *saved)
+{
+    if (unlikely(saved->dprio_ku_area_pp)) {
+        preempt_disable();
+        current->dprio_ku_area_pp = saved->dprio_ku_area_pp;
+        current->dprio_info = saved->dprio_info;
+        preempt_enable();
+    }
+}
+
+static inline void dprio_free_context(struct dprio_saved_context *saved)
+{
+    if (unlikely(saved->dprio_info))
+        kfree(saved->dprio_info);
+}
+
+#ifdef CONFIG_DEFERRED_SETPRIO_PRIVILEGED
+  #define DPRIO_PRIVILEGED_INITIAL_VALUE  true
+#else
+  #define DPRIO_PRIVILEGED_INITIAL_VALUE  false
+#endif
+
+extern unsigned int dprio_privileged;
+
+int dprio_check_permission(void);
+
+#else /* ndef CONFIG_DEFERRED_SETPRIO */
+
+static inline void set_task_in_dprio(struct task_struct *tsk, bool in_dprio) {}
+static inline void dprio_dup_task_struct(struct task_struct *tsk) {}
+static inline void dprio_detach(struct task_struct *tsk) {}
+static inline void dprio_handle_request(void) {}
+
+struct dprio_saved_context {
+    char dummy[0];        /* suppress compiler warning */
+};
+
+static inline void dprio_save_reset_context(struct
dprio_saved_context *saved) {}
+static inline void dprio_restore_context(struct dprio_saved_context *saved) {}
+static inline void dprio_free_context(struct dprio_saved_context *saved) {}
+
+#endif /* CONFIG_DEFERRED_SETPRIO */
+
+#endif /* _LINUX_DPRIO_H */
+
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 77fc43f..5950f20 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -166,6 +166,22 @@ extern struct task_group root_task_group;
 # define INIT_RT_MUTEXES(tsk)
 #endif

+#ifdef CONFIG_DEBUG_DEFERRED_SETPRIO
+# define INIT_DEFERRED_SETPRIO_DEBUG                    \
+    .in_dprio = false,
+#else
+# define INIT_DEFERRED_SETPRIO_DEBUG
+#endif
+
+#ifdef CONFIG_DEFERRED_SETPRIO
+# define INIT_DEFERRED_SETPRIO                        \
+    .dprio_ku_area_pp = NULL,                    \
+    .dprio_info = NULL,                        \
+    INIT_DEFERRED_SETPRIO_DEBUG
+#else
+# define INIT_DEFERRED_SETPRIO
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -237,6 +253,7 @@ extern struct task_group root_task_group;
     INIT_CPUSET_SEQ(tsk)                        \
     INIT_RT_MUTEXES(tsk)                        \
     INIT_VTIME(tsk)                            \
+    INIT_DEFERRED_SETPRIO                        \
 }


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 48ae6c4..0d6a359 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1247,6 +1247,11 @@ struct task_struct {

     int wake_cpu;
 #endif
+#ifdef CONFIG_DEFERRED_SETPRIO
+    /* try to keep @dprio_ku_area_pp in the same cacheline as @state or
+       @on_rq or @sched_class */
+    struct dprio_ku_area __user * __user *dprio_ku_area_pp;
+#endif
     int on_rq;

     int prio, static_prio, normal_prio;
@@ -1660,6 +1665,15 @@ struct task_struct {
     unsigned int    sequential_io;
     unsigned int    sequential_io_avg;
 #endif
+#ifdef CONFIG_DEFERRED_SETPRIO
+    struct dprio_info *dprio_info;
+#endif
+#ifdef CONFIG_PUT_TASK_TIMEBOUND
+    struct work_struct put_task_work;
+#endif
+#ifdef CONFIG_DEBUG_DEFERRED_SETPRIO
+    bool in_dprio;
+#endif
 };

 /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -2202,6 +2216,11 @@ extern int sched_setscheduler_nocheck(struct
task_struct *, int,
                       const struct sched_param *);
 extern int sched_setattr(struct task_struct *,
              const struct sched_attr *);
+extern int sched_setattr_precheck(struct task_struct *p,
+                  const struct sched_attr *attr);
+extern int sched_setattr_prechecked(struct task_struct *p,
+                    const struct sched_attr *attr,
+                    bool merge_reset_on_fork);
 extern struct task_struct *idle_task(int cpu);
 /**
  * is_idle_task - is the specified task an idle task?
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index be88166..f4a9c31 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -99,6 +99,7 @@ header-y += dlmconstants.h
 header-y += dm-ioctl.h
 header-y += dm-log-userspace.h
 header-y += dn.h
+header-y += dprio_api.h
 header-y += dqblk_xfs.h
 header-y += edd.h
 header-y += efs_fs_sb.h
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 12c37a1..55c4bb0 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -351,8 +351,11 @@ struct vfs_cap_data {

 #define CAP_AUDIT_READ        37

+/* Allow the use of deferred set priority (PR_SET_DEFERRED_SETPRIO) */

-#define CAP_LAST_CAP         CAP_AUDIT_READ
+#define CAP_DPRIO         38
+
+#define CAP_LAST_CAP         CAP_DPRIO

 #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)

diff --git a/include/uapi/linux/dprio_api.h b/include/uapi/linux/dprio_api.h
new file mode 100644
index 0000000..1748f40
--- /dev/null
+++ b/include/uapi/linux/dprio_api.h
@@ -0,0 +1,137 @@
+/*
+ * Deferred set priority.
+ *
+ * This file contains the definitions for dprio(2) userspace-kernel interface.
+ */
+
+#ifndef _UAPI_LINUX_DPRIO_API_H
+#define _UAPI_LINUX_DPRIO_API_H
+
+#ifndef __KERNEL__
+  #include <linux/types.h>
+  #include <sched.h>
+#endif
+
+/*
+ * Userspace-kernel dprio protocol is as follows:
+ *
+ * Userspace:
+ *
+ *     Select and fill-in dprio_ku_area:
+ *         Set @resp = DPRIO_RESP_NONE.
+ *         Set @sched_attr.
+ *
+ *     Set @cmd to point to dprio_ku_area.
+ *
+ *     @cmd is u64 variable previously designated in the call
+ *     prctl(PR_SET_DEFERRED_SETPRIO, & @cmd, ...)
+ *
+ * Kernel:
+ *
+ *     1) On task preemption attempt or at other processing point,
+ *        such as fork or exec, read @cmd.
+ *        If cannot (e.g. @cmd inaccessible incl. page swapped out), quit.
+ *        Note: will reattempt again on next preemption cycle.
+ *
+ *     2) If read-in value of @cmd is 0, do nothing. Quit.
+ *
+ *     3) Set @resp = DPRIO_RESP_UNKNOWN.
+ *        If cannot (e.g. inaccessible), quit.
+ *
+ *     4) Set @cmd = NULL.
+ *        If cannot (e.g. inaccessible), quit.
+ *        Note that in this case request handling will be reattempted on next
+ *        thread preemption cycle. Thus @resp value of DPRIO_RESP_UNKNOWN may
+ *        be transient and overwritten with DPRIO_RESP_OK or DPRIO_RESP_ERROR
+ *        if @cmd is not reset to 0 by the kernel (or to 0 or to the address
+ *        of another dprio_ku_area by the userspace).
+ *
+ *     5) Read @sched_attr.
+ *        If cannot (e.g. inaccessible), quit.
+ *
+ *     6) Try to change task scheduling attributes in accordance with read-in
+ *        value of @sched_attr.
+ *
+ *     7) If successful, set @resp = DPRIO_RESP_OK and Quit.
+ *
+ *     8) If unsuccessful, set @error = appropriate errno-style value.
+ *        If cannot (e.g. @error inaccessible), quit.
+ *        Set @resp = DPRIO_RESP_ERROR.
+ *        If cannot (e.g. @resp inaccessible), quit.
+ *
+ * Explanation of possible @resp codes:
+ *
+ * DPRIO_RESP_NONE
+ *
+ *     Request has not been processed yet.
+ *
+ * DPRIO_RESP_OK
+ *
+ *     Request has been successfully processed.
+ *
+ * DPRIO_RESP_ERROR
+ *
+ *     Request has failed, @error has errno-style error code.
+ *
+ * DPRIO_RESP_UNKNOWN
+ *
+ *     Request processing has been attempted, but the outcome is unknown.
+ *     Request might have been successful or failed.
+ *     Current os-level thread priority becomes unknown.
+ *
+ *     @error field may be invalid.
+ *
+ *     This code is written to @resp at the start of request processing,
+ *     then @resp is changed to OK or ERR at the end of request processing
+ *     if dprio_ku_area and @cmd stay accessible for write.
+ *
+ *     This status code is never left visible to the userspace code in the
+ *     current thread if dprio_ku_area and @cmd are locked in memory and remain
+ *     properly accessible for read and write during request processing.
+ *
+ *     This status code might happen (i.e. stay visible to userspace code
+ *     in the current thread) if access to dprio_ku_area or @cmd is lost
+ *     during request processing, for example the page that contains the area
+ *     gets swapped out or the area is otherwise not fully accessible for
+ *     reading and writing.
+ *
+ *     If @resp has value of DPRIO_RESP_UNKNOWN and @cmd is still pointing
+ *     to dprio_ku_area containing @resp, it is possible for the request to
+ *     be reprocessed again at the next context switch and @resp change to
+ *     DPRIO_RESP_OK or DPRIO_RESP_ERROR. To ensure @resp does not change
+ *     under your feet, change @cmd to either NULL or address of another
+ *     dprio_ku_area distinct from one containing this @resp.
+ */
+enum {
+    DPRIO_RESP_NONE     = 0,
+    DPRIO_RESP_OK       = 1,
+    DPRIO_RESP_ERROR    = 2,
+    DPRIO_RESP_UNKNOWN  = 3
+};
+
+/*
+ * It is up to the client access methods whether it will want to define
+ * structure elements as volatile.
+ */
+#ifndef __dprio_volatile
+  #define __dprio_volatile
+#endif
+
+struct dprio_ku_area {
+    /*
+     * Size of struct sched_attr may change in future definitions
+     * of the structure, therefore @sched_attr should come after
+     * @resp and @error in order to maintain the compatibility
+     * between userland and kernel built with different versions
+     * of struct sched_attr definition.
+     *
+     * Userland code should use volatile and/or compiler barriers
+     * to ensure the protocol.
+     */
+    __dprio_volatile __u32 resp;        /* DPRIO_RESP_xxx */
+    __dprio_volatile __u32 error;        /* one of errno values */
+    __dprio_volatile struct sched_attr sched_attr;
+};
+
+#endif /* _UAPI_LINUX_DPRIO_API_H */
+
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 58afc04..3513db5 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -152,4 +152,6 @@
 #define PR_SET_THP_DISABLE    41
 #define PR_GET_THP_DISABLE    42

+#define PR_SET_DEFERRED_SETPRIO    43
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/init/Kconfig b/init/Kconfig
index 4fe5500..036023e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1958,3 +1958,5 @@ config ASN1
       functions to call on what tags.

 source "kernel/Kconfig.locks"
+source "kernel/Kconfig.dprio"
+
diff --git a/kernel/Kconfig.dprio b/kernel/Kconfig.dprio
new file mode 100644
index 0000000..c18f2d0
--- /dev/null
+++ b/kernel/Kconfig.dprio
@@ -0,0 +1,68 @@
+menuconfig DEFERRED_SETPRIO
+    bool "Enable deferred setting of task priority"
+    default n
+    help
+      Enabling this option allows authorized applications to use
+      PR_SET_DEFERRED_SETPRIO request in prctl(2) system call.
+
+      Applications that change task priority with very high frequency can
+      benefit from using this facility as long as they are specifically
+      implemented to use prctl(PR_SET_DEFERRED_SETPRIO). If the system does
+      not intend to run such applications there is no benefit to using
+      this option.
+
+      The downside of selecting this option is a slightly increased latency
+      in task switching only in the case when a deferred set priority request
+      by a previous task is pending at task switch time. Added delay in task
+      context switch in this case is in the order of 1 usec (typical time for
+      executing deferred sched_setattr system call), which normally is not
+      significant, but may be a consideration in a system intended for hard
+      real-time use.
+
+      If unsure, say N.
+
+if DEFERRED_SETPRIO
+
+config PUT_TASK_TIMEBOUND
+    bool "Deterministic task switch latency when deferred-set-task-priority is used"
+    depends on DEFERRED_SETPRIO && RT_MUTEXES
+    default n
+    help
+      Enabling this option ensures deterministic time-bound task switch
+      latency when a deferred set task priority request is pending on a
+      task rescheduling and task switch, and the processing of this request
+      causes an adjustment of priority inheritance chain under very low
+      memory conditions (depleted atomic pool).
+
+      Select Y when building the kernel for hard real-time system requiring
+      the determinism in task switch latency. Select N for general-purpose
+      desktop or server system.
+
+      This option has memory cost of about 20-40 bytes per each running task
+      in the system.
+
+config DEBUG_DEFERRED_SETPRIO
+    bool "Enable debugging code for deferred-set-task-priority"
+    depends on DEFERRED_SETPRIO
+    default n
+    help
+      Enable debugging code for DEFERRED_SETPRIO.
+
+      If unsure, say N.
+
+config DEFERRED_SETPRIO_PRIVILEGED
+    bool "Is deferred-set-task-priority a privileged operation"
+    depends on DEFERRED_SETPRIO
+    default y
+    help
+      Define whether the deferred set task priority facility is accessible
+      only for tasks having CAP_DPRIO capability or the facility is
+      unprivileged and available to all users on the system. This option
+      defines the initial value of the setting at system startup time but
+      the setting can be altered later dynamically via
+      /proc/sys/kernel/dprio_privileged.
+
+      If unsure, say Y.
+
+endif # DEFERRED_SETPRIO
+
diff --git a/kernel/exit.c b/kernel/exit.c
index 5d30019..2b0ca5b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -53,6 +53,7 @@
 #include <linux/oom.h>
 #include <linux/writeback.h>
 #include <linux/shm.h>
+#include <linux/dprio.h>

 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -690,6 +691,11 @@ void do_exit(long code)

     ptrace_event(PTRACE_EVENT_EXIT, code);

+    /*
+     * No more deferred priority changes applied in __schedule for this task
+     */
+    dprio_detach(tsk);
+
     validate_creds_for_do_exit(tsk);

     /*
diff --git a/kernel/fork.c b/kernel/fork.c
index ad64248..74f5933 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -74,6 +74,7 @@
 #include <linux/uprobes.h>
 #include <linux/aio.h>
 #include <linux/compiler.h>
+#include <linux/dprio.h>

 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -234,7 +235,7 @@ static inline void put_signal_struct(struct
signal_struct *sig)
         free_signal_struct(sig);
 }

-void __put_task_struct(struct task_struct *tsk)
+static inline void __do_put_task_struct(struct task_struct *tsk)
 {
     WARN_ON(!tsk->exit_state);
     WARN_ON(atomic_read(&tsk->usage));
@@ -249,6 +250,84 @@ void __put_task_struct(struct task_struct *tsk)
     if (!profile_handoff_task(tsk))
         free_task(tsk);
 }
+
+#ifdef CONFIG_PUT_TASK_TIMEBOUND
+/*
+ * If timebound, use preallocated struct work_struct always guaranteed
+ * to be available, even if atomic kmalloc pool is depleted.
+ */
+static inline struct work_struct *alloc_put_task_work(struct task_struct *tsk)
+{
+    return &tsk->put_task_work;
+}
+
+static inline void free_put_task_work(struct work_struct *work)
+{
+}
+
+static inline struct task_struct *put_task_work_tsk(struct work_struct *work)
+{
+    return container_of(work, struct task_struct, put_task_work);
+}
+#else
+struct put_task_work {
+    struct work_struct work;
+    struct task_struct *tsk;
+};
+
+static inline struct work_struct *alloc_put_task_work(struct task_struct *tsk)
+{
+    struct put_task_work *dwork =
+        kmalloc(sizeof(*dwork), GFP_NOWAIT | __GFP_NOWARN);
+    if (unlikely(!dwork))
+        return NULL;
+    dwork->tsk = tsk;
+    return &dwork->work;
+}
+
+static inline void free_put_task_work(struct work_struct *work)
+{
+    struct put_task_work *dwork =
+        container_of(work, struct put_task_work, work);
+    kfree(dwork);
+}
+
+static inline struct task_struct *put_task_work_tsk(struct work_struct *work)
+{
+    struct put_task_work *dwork =
+        container_of(work, struct put_task_work, work);
+    return dwork->tsk;
+}
+#endif
+
+#ifdef CONFIG_DEFERRED_SETPRIO
+static void __put_task_struct_work(struct work_struct *work)
+{
+    __do_put_task_struct(put_task_work_tsk(work));
+    free_put_task_work(work);
+}
+#endif
+
+void __put_task_struct(struct task_struct *tsk)
+{
+#ifdef CONFIG_DEFERRED_SETPRIO
+    /*
+     * When called from inside of __schedule(), try to defer processing
+     * to a worker thread, in order to minimize the scheduling latency
+     * and make it deterministic.
+     */
+    if (unlikely(preempt_count() & PREEMPT_ACTIVE)) {
+        struct work_struct *work = alloc_put_task_work(tsk);
+
+        if (likely(work)) {
+            INIT_WORK(work, __put_task_struct_work);
+            schedule_work(work);
+            return;
+        }
+    }
+#endif
+    __do_put_task_struct(tsk);
+}
 EXPORT_SYMBOL_GPL(__put_task_struct);

 void __init __weak arch_task_cache_init(void) { }
@@ -321,6 +400,8 @@ static struct task_struct *dup_task_struct(struct
task_struct *orig)
     if (err)
         goto free_ti;

+    dprio_dup_task_struct(tsk);
+
     tsk->stack = ti;
 #ifdef CONFIG_SECCOMP
     /*
@@ -1631,6 +1712,11 @@ long do_fork(unsigned long clone_flags,
     long nr;

     /*
+     * Process pending "deferred set priority" request.
+     */
+    dprio_handle_request();
+
+    /*
      * Determine whether and which event to report to ptracer.  When
      * called from kernel_thread or CLONE_UNTRACED is explicitly
      * requested, no event is reported; otherwise, report if the event
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b..a93d07c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_DEFERRED_SETPRIO) += dprio.o
\ No newline at end of file
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25e4513..db3d5e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
 #include <linux/binfmts.h>
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
+#include <linux/dprio.h>

 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -2742,6 +2743,111 @@ again:
     BUG(); /* the idle class will always have a runnable task */
 }

+#ifdef CONFIG_DEFERRED_SETPRIO
+
+/*
+ * __schedule should never be reentered recursively while it is handling
+ * deferred change priority request in dprio_set_schedattr, i.e. when
+ * @prev->in_dprio is true.
+ *
+ * To prevent reentrancy, dprio_handle_request(...) keeps preemption
+ * disable counter non-zero and also sets PREEMPT_ACTIVE flag.
+ */
+static __always_inline bool dprio_sched_recursion(struct task_struct *prev)
+{
+#ifdef CONFIG_DEBUG_DEFERRED_SETPRIO
+    if (unlikely(prev->in_dprio)) {
+        WARN_ONCE(1, KERN_ERR "BUG: dprio recursion in __schedule\n");
+
+        prev->state = TASK_RUNNING;
+        clear_tsk_need_resched(prev);
+        clear_preempt_need_resched();
+        sched_preempt_enable_no_resched();
+
+        return true;
+    }
+#endif /* CONFIG_DEBUG_DEFERRED_SETPRIO */
+
+    return false;
+}
+
+/*
+ * Check if deferred change priority request from the userland is pending
+ * and if so, handle it.
+ *
+ *     Academically speaking, it would be desirable (instead of calling
+ *     dprio_set_schedattr *before* pick_next_task) to call it *after*
+ *     pick_next_task and only if (next != prev). However in practice this
+ *     would save at most one sched_setattr call per task scheduling interval
+ *     (only for the tasks that use dprio), and then only sometimes, only when
+ *     both dprio request is pending at rescheduling time and the task gets
+ *     actually preempted by another task. At typical values of Linux scheduling
+ *     parameters and the cost of sched_setattr call this translates to an
+ *     additional possible saving for dprio tasks that is well under 0.1%,
+ *     and probably much lower.
+ *
+ *     Nevertheless if dprio_set_schedattr were ever to be moved after the call
+ *     to pick_next_task, existing class schedulers would need to be revised
+ *     to support, in addition to call sequence
+ *
+ *       [pick_next_task] [context_switch]
+ *
+ *     also the sequence
+ *
+ *       [pick_next_task] [unlock rq] [...] [lock rq] [pick_next_task] [context_switch]
+ *
+ *     where [...] may include a bunch of intervening class scheduler method
+ *     calls on the local CPU and other CPUs, since we'd be giving up the rq lock.
+ *     This would require splitting pick_next_task into "prepare" and
+ *     "commit/abort" phases.
+ */
+static __always_inline void dprio_sched_handle_request(struct
task_struct *prev)
+{
+    if (unlikely(prev->dprio_ku_area_pp != NULL) &&
+        unlikely(dprio_check_for_request(prev))) {
+        int sv_pc;
+
+        /*
+         * Do not attempt to process "deferred set priority" request for
+         * TASK_DEAD, STOPPED, TRACED and other states where it won't be
+         * appropriate.
+         */
+        switch (prev->state) {
+        case TASK_RUNNING:
+        case TASK_INTERRUPTIBLE:
+        case TASK_UNINTERRUPTIBLE:
+            break;
+        default:
+            return;
+        }
+
+        sv_pc = preempt_count();
+        if (!(sv_pc & PREEMPT_ACTIVE))
+            __preempt_count_add(PREEMPT_ACTIVE);
+        set_task_in_dprio(prev, true);
+        /*
+         * Keep preemption disabled to avoid __schedule() recursion.
+         * In addition PREEMPT_ACTIVE notifies dprio_handle_request()
+         * and routines that may be called from inside of it, such as
+         * __put_task_struct(), of the calling context.
+         */
+        dprio_handle_request();
+
+        set_task_in_dprio(prev, false);
+        if (!(sv_pc & PREEMPT_ACTIVE))
+            __preempt_count_sub(PREEMPT_ACTIVE);
+    }
+}
+#else  /* !defined CONFIG_DEFERRED_SETPRIO */
+
+static __always_inline bool dprio_sched_recursion(struct task_struct *prev)
+    { return false; }
+
+static __always_inline void dprio_sched_handle_request(struct
task_struct *prev)
+    {}
+
+#endif  /* CONFIG_DEFERRED_SETPRIO */
+
 /*
  * __schedule() is the main scheduler function.
  *
@@ -2795,6 +2901,10 @@ need_resched:

     schedule_debug(prev);

+    if (dprio_sched_recursion(prev))
+        return;
+    dprio_sched_handle_request(prev);
+
     if (sched_feat(HRTICK))
         hrtick_clear(rq);

@@ -3374,9 +3484,31 @@ static bool check_same_owner(struct task_struct *p)
     return match;
 }

+/*
+ * Flags for _sched_setscheduler and __sched_setscheduler:
+ *
+ *     SCHEDOP_KERNEL        on behalf of the kernel
+ *     SCHEDOP_USER        on behalf of the userspace
+ *
+ *     SCHEDOP_PRECHECK_ONLY    precheck security only, do not
+ *                actually change priority
+ *     SCHEDOP_PRECHECKED    security has been prechecked
+ *
+ *     SCHEDOP_MERGE_RESET_ON_FORK  use logical "or" of
+ *                attr->sched_flags & SCHED_FLAG_RESET_ON_FORK
+ *                and p->sched_reset_on_fork
+ *
+ * SCHEDOP_KERNEL and SCHEDOP_USER are mutually exclusive.
+ */
+#define SCHEDOP_KERNEL            (1 << 0)
+#define SCHEDOP_USER            (1 << 1)
+#define SCHEDOP_PRECHECK_ONLY        (1 << 2)
+#define SCHEDOP_PRECHECKED        (1 << 3)
+#define SCHEDOP_MERGE_RESET_ON_FORK    (1 << 4)
+
 static int __sched_setscheduler(struct task_struct *p,
                 const struct sched_attr *attr,
-                bool user)
+                int opflags)
 {
     int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
               MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3386,9 +3518,13 @@ static int __sched_setscheduler(struct task_struct *p,
     const struct sched_class *prev_class;
     struct rq *rq;
     int reset_on_fork;
+    bool check_security;

     /* may grab non-irq protected spin_locks */
     BUG_ON(in_interrupt());
+
+    check_security = (opflags & SCHEDOP_USER) && !(opflags &
SCHEDOP_PRECHECKED);
+
 recheck:
     /* double check policy once rq lock held */
     if (policy < 0) {
@@ -3396,6 +3532,8 @@ recheck:
         policy = oldpolicy = p->policy;
     } else {
         reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
+        if (opflags & SCHEDOP_MERGE_RESET_ON_FORK)
+            reset_on_fork |= p->sched_reset_on_fork;

         if (policy != SCHED_DEADLINE &&
                 policy != SCHED_FIFO && policy != SCHED_RR &&
@@ -3422,7 +3560,7 @@ recheck:
     /*
      * Allow unprivileged RT tasks to decrease priority:
      */
-    if (user && !capable(CAP_SYS_NICE)) {
+    if (check_security && !capable(CAP_SYS_NICE)) {
         if (fair_policy(policy)) {
             if (attr->sched_nice < task_nice(p) &&
                 !can_nice(p, attr->sched_nice))
@@ -3470,7 +3608,7 @@ recheck:
             return -EPERM;
     }

-    if (user) {
+    if (check_security) {
         retval = security_task_setscheduler(p);
         if (retval)
             return retval;
@@ -3505,13 +3643,17 @@ recheck:
         if (dl_policy(policy))
             goto change;

-        p->sched_reset_on_fork = reset_on_fork;
+        if (!(opflags & SCHEDOP_PRECHECK_ONLY)) {
+            if (opflags & SCHEDOP_MERGE_RESET_ON_FORK)
+                reset_on_fork |= p->sched_reset_on_fork;
+            p->sched_reset_on_fork = reset_on_fork;
+        }
         task_rq_unlock(rq, p, &flags);
         return 0;
     }
 change:

-    if (user) {
+    if (opflags & SCHEDOP_USER) {
 #ifdef CONFIG_RT_GROUP_SCHED
         /*
          * Do not allow realtime tasks into groups that have no runtime
@@ -3559,6 +3701,13 @@ change:
         return -EBUSY;
     }

+    if (opflags & SCHEDOP_PRECHECK_ONLY) {
+        task_rq_unlock(rq, p, &flags);
+        return 0;
+    }
+
+    if (opflags & SCHEDOP_MERGE_RESET_ON_FORK)
+        reset_on_fork |= p->sched_reset_on_fork;
     p->sched_reset_on_fork = reset_on_fork;
     oldprio = p->prio;

@@ -3606,7 +3755,7 @@ change:
 }

 static int _sched_setscheduler(struct task_struct *p, int policy,
-                   const struct sched_param *param, bool check)
+                   const struct sched_param *param, int opflags)
 {
     struct sched_attr attr = {
         .sched_policy   = policy,
@@ -3621,7 +3770,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
         attr.sched_policy = policy;
     }

-    return __sched_setscheduler(p, &attr, check);
+    return __sched_setscheduler(p, &attr, opflags);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3636,16 +3785,42 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
 int sched_setscheduler(struct task_struct *p, int policy,
                const struct sched_param *param)
 {
-    return _sched_setscheduler(p, policy, param, true);
+    return _sched_setscheduler(p, policy, param, SCHEDOP_USER);
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);

 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
-    return __sched_setscheduler(p, attr, true);
+    return __sched_setscheduler(p, attr, SCHEDOP_USER);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);

+/*
+ * Run only the validation and security-context checks that sched_setattr()
+ * would perform for @p and @attr; the task's scheduling properties are
+ * left untouched.
+ */
+int sched_setattr_precheck(struct task_struct *p,
+               const struct sched_attr *attr)
+{
+    int opflags = SCHEDOP_USER | SCHEDOP_PRECHECK_ONLY;
+
+    return __sched_setscheduler(p, attr, opflags);
+}
+EXPORT_SYMBOL_GPL(sched_setattr_precheck);
+
+/*
+ * Apply @attr to @p as sched_setattr() would, but skip the security checks
+ * (they are expected to have been passed earlier via a precheck).
+ * When @merge_reset_on_fork is true, the task's existing reset-on-fork
+ * setting is OR-ed with the one requested in @attr.
+ */
+int sched_setattr_prechecked(struct task_struct *p,
+                 const struct sched_attr *attr,
+                 bool merge_reset_on_fork)
+{
+    int opflags = SCHEDOP_USER | SCHEDOP_PRECHECKED;
+
+    if (merge_reset_on_fork)
+        opflags |= SCHEDOP_MERGE_RESET_ON_FORK;
+
+    return __sched_setscheduler(p, attr, opflags);
+}
+EXPORT_SYMBOL_GPL(sched_setattr_prechecked);
+
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  * @p: the task in question.
@@ -3662,7 +3837,7 @@ EXPORT_SYMBOL_GPL(sched_setattr);
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
                    const struct sched_param *param)
 {
-    return _sched_setscheduler(p, policy, param, false);
+    return _sched_setscheduler(p, policy, param, SCHEDOP_KERNEL);
 }

 static int
diff --git a/kernel/sched/dprio.c b/kernel/sched/dprio.c
new file mode 100644
index 0000000..94cec5f
--- /dev/null
+++ b/kernel/sched/dprio.c
@@ -0,0 +1,617 @@
+/*
+ * kernel/sched/dprio.c
+ *
+ * Deferred set priority.
+ *
+ * Started by (C) 2014 Sergey Oboguev <oboguev@...oo.com>
+ *
+ * This code is licenced under the GPL version 2 or later.
+ * For details see linux-kernel-base/COPYING.
+ */
+
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/dprio.h>
+#include <linux/dprio_api.h>
+#include <linux/slab.h>
+#include <linux/compiler.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+#include <linux/prctl.h>
+#include <linux/init.h>
+
+/*
+ * Access gate read by dprio_check_permission(): when non-zero, a task must
+ * hold CAP_DPRIO to use prctl(PR_SET_DEFERRED_SETPRIO).
+ */
+unsigned int dprio_privileged = DPRIO_PRIVILEGED_INITIAL_VALUE;
+
+/*
+ * Copy @size bytes from the userspace address @src into the kernel
+ * buffer @dst.
+ *
+ * When @atomic is true the copy is done with page faults disabled, so it
+ * fails instead of sleeping. The _inatomic primitive does not validate the
+ * user address range on its own, and callers pass pointers that were read
+ * from user memory, so the range is checked with access_ok() first.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static inline int __copyin(void *dst, const void __user *src,
+               unsigned size, bool atomic)
+{
+    int ret;
+
+    /* Use barrier() to sequence userspace-kernel dprio protocol */
+    barrier();
+    if (atomic) {
+        if (!access_ok(VERIFY_READ, src, size)) {
+            ret = -EFAULT;
+        } else {
+            pagefault_disable();
+            ret = __copy_from_user_inatomic(dst, src, size);
+            pagefault_enable();
+        }
+    } else {
+        ret = copy_from_user(dst, src, size);
+    }
+    barrier();
+
+    return ret;
+}
+
+/*
+ * Copy @size bytes from the kernel buffer @src to the userspace
+ * address @dst.
+ *
+ * When @atomic is true the copy is done with page faults disabled, so it
+ * fails instead of sleeping. The _inatomic primitive does not validate the
+ * user address range on its own, and callers pass destination pointers that
+ * were read from user memory; without the access_ok() check a task could
+ * make the kernel write to an arbitrary (including kernel) address.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static inline int __copyout(void __user *dst, const void *src,
+                unsigned size, bool atomic)
+{
+    int ret;
+
+    /* Use barrier() to sequence userspace-kernel dprio protocol */
+    barrier();
+    if (atomic) {
+        if (!access_ok(VERIFY_WRITE, dst, size)) {
+            ret = -EFAULT;
+        } else {
+            pagefault_disable();
+            ret = __copy_to_user_inatomic(dst, src, size);
+            pagefault_enable();
+        }
+    } else {
+        ret = copy_to_user(dst, src, size);
+    }
+    barrier();
+
+    return ret;
+}
+
+/*
+ * Whole-variable wrappers around __copyin()/__copyout(). The transfer size
+ * is sizeof() of the kernel-side variable @x; @uptr is the userspace
+ * address; @atomic is passed through unchanged. Both evaluate to 0 on
+ * success.
+ */
+
+/* userspace *@uptr -> kernel variable @x */
+#define __copyin_var(x, uptr, atomic)    \
+    __copyin(&(x), (uptr), sizeof(x), (atomic))
+
+/* kernel variable @x -> userspace *@uptr */
+#define __copyout_var(x, uptr, atomic)    \
+    __copyout((uptr), &(x), sizeof(x), (atomic))
+
+
+/*
+ * Mimics sched_copy_attr(), but can also run with page faults disabled.
+ *
+ * Reads a struct sched_attr from userspace address @uattr into @attr.
+ * @atomic selects the non-sleeping copy path (see __copyin()).
+ *
+ * Returns 0 on success, -EFAULT if the user memory cannot be read, or
+ * -E2BIG if the size field is out of range or the structure is larger
+ * than the kernel's and carries non-zero bytes in the unknown tail.
+ * On success attr->size holds the number of bytes actually copied and
+ * attr->sched_nice is clamped to [MIN_NICE, MAX_NICE].
+ */
+#define CHUNK_SIZE 32u
+static int dprio_copyin_sched_attr(struct sched_attr __user *uattr,
+                   struct sched_attr *attr,
+                   bool atomic)
+{
+    u32 size;
+
+    if (!access_ok(VERIFY_READ, uattr, SCHED_ATTR_SIZE_VER0))
+        return -EFAULT;
+
+    /*
+     * zero the full structure, so that a short copy will be nice.
+     */
+    memset(attr, 0, sizeof(*attr));
+
+    if (__copyin_var(size, &uattr->size, atomic))
+        return -EFAULT;
+
+    if (size > PAGE_SIZE)    /* silly large */
+        return -E2BIG;
+
+    if (!size)        /* abi compat */
+        size = SCHED_ATTR_SIZE_VER0;
+
+    if (size < SCHED_ATTR_SIZE_VER0)
+        return -E2BIG;
+
+    /*
+     * Only the first SCHED_ATTR_SIZE_VER0 bytes were range-checked above,
+     * but everything up to @size is read below. Validate the whole span
+     * the user declared before touching it.
+     */
+    if (!access_ok(VERIFY_READ, uattr, size))
+        return -EFAULT;
+
+    /*
+     * If we're handed a bigger struct than we know of,
+     * ensure all the unknown bits are 0 - i.e. new
+     * user-space does not rely on any kernel feature
+     * extensions we dont know about yet.
+     */
+    if (size > sizeof(*attr)) {
+        unsigned char __user *addr;
+        unsigned char __user *end;
+        unsigned char val[CHUNK_SIZE];
+        unsigned k, chunk_size;
+
+        addr = (unsigned char __user *)uattr + sizeof(*attr);
+        end  = (unsigned char __user *)uattr + size;
+
+        /* scan the unknown tail in CHUNK_SIZE pieces */
+        for (; addr < end; addr += chunk_size) {
+            chunk_size = min((unsigned) (end - addr), CHUNK_SIZE);
+            if (__copyin(val, addr, chunk_size, atomic))
+                return -EFAULT;
+            for (k = 0;  k < chunk_size; k++) {
+                if (val[k])
+                    return -E2BIG;
+            }
+        }
+        size = sizeof(*attr);
+    }
+
+    if (__copyin(attr, uattr, size, atomic))
+        return -EFAULT;
+
+    /* record what was really copied, not what userspace claimed */
+    attr->size = size;
+
+    /*
+     * XXX: do we want to be lenient like existing syscalls; or do we want
+     * to be strict and return an error on out-of-bounds values?
+     * See also other uses of clamp(..., MIN_NICE, MAX_NICE) below.
+     */
+    attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
+
+    return 0;
+}
+
+
+/*
+ * Disconnect @tsk from its userland deferred setprio request area and
+ * release the per-task dprio record. Callers:
+ *
+ *   - prctl(PR_SET_DEFERRED_SETPRIO) with a NULL area argument, to end
+ *     the current connection
+ *
+ *   - prctl(PR_SET_DEFERRED_SETPRIO) with a new non-NULL area argument;
+ *     the old connection is dropped before the new one is installed
+ *
+ *   - do_exit(), when the task terminates
+ */
+void dprio_detach(struct task_struct *tsk)
+{
+    struct dprio_info *old_info;
+
+    preempt_disable();
+
+    old_info = tsk->dprio_info;
+    tsk->dprio_ku_area_pp = NULL;
+    tsk->dprio_info = NULL;
+
+    /* kfree(NULL) is a no-op, so no need to test for "never attached" */
+    kfree(old_info);
+
+    preempt_enable();
+}
+
+/*
+ * Normalize a sched_attr freshly read from userspace so that the precheck
+ * path and the request-execution path interpret its format and values
+ * the same way.
+ */
+static void uniform_attr(struct sched_attr *attr)
+{
+    bool legacy_reset_on_fork;
+
+    /*
+     * Legacy hack: SCHED_RESET_ON_FORK may be OR-ed into the policy value.
+     * Move it to sched_flags, unless the policy is the special value -1.
+     */
+    legacy_reset_on_fork = attr->sched_policy != -1 &&
+                   (attr->sched_policy & SCHED_RESET_ON_FORK);
+    if (legacy_reset_on_fork) {
+        attr->sched_policy &= ~SCHED_RESET_ON_FORK;
+        attr->sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+    }
+
+    /* SCHED_IDLE requests always carry the maximum nice value */
+    if (attr->sched_policy == SCHED_IDLE)
+        attr->sched_nice = MAX_NICE;
+}
+
+/*
+ * Precheck whether current process is authorized to set its scheduling
+ * properties to @uattr. If yes, make record in @info and return 0.
+ * If not, return error.
+ *
+ * @info->mask has one bit per policy that has been approved; the
+ * per-policy fields hold the most privileged level approved so far
+ * (lowest nice for NORMAL/BATCH, highest priority for FIFO/RR). A request
+ * already covered by an earlier approval is accepted without rechecking.
+ *
+ * Returns 0 on success, the error from dprio_copyin_sched_attr() or
+ * sched_setattr_precheck(), or -EINVAL for an unsupported policy.
+ */
+static int precheck(struct dprio_info *info, struct sched_attr __user *uattr)
+{
+    struct sched_attr attr;
+    u32 policy;
+    unsigned mask;
+    int error;
+
+    error = dprio_copyin_sched_attr(uattr, &attr, false);
+    if (error)
+        return error;
+
+    uniform_attr(&attr);
+
+    policy = attr.sched_policy;
+
+    /*
+     * @policy comes straight from userspace. Shifting by a count that is
+     * not smaller than the width of the type is undefined, so reject such
+     * values before building the mask. They are not valid policies anyway
+     * and would end up in the default case below.
+     */
+    if (policy >= 8 * sizeof(mask))
+        return -EINVAL;
+    mask = 1U << policy;
+
+    switch (policy) {
+    case SCHED_NORMAL:
+        attr.sched_nice = clamp(attr.sched_nice, MIN_NICE, MAX_NICE);
+        if ((info->mask & mask) &&
+            attr.sched_nice >= info->normal_sched_nice)
+            break;
+        error = sched_setattr_precheck(current, &attr);
+        if (error == 0) {
+            info->normal_sched_nice = attr.sched_nice;
+            info->mask |= mask;
+        }
+        break;
+
+    case SCHED_BATCH:
+        attr.sched_nice = clamp(attr.sched_nice, MIN_NICE, MAX_NICE);
+        if ((info->mask & mask) &&
+            attr.sched_nice >= info->batch_sched_nice)
+            break;
+        error = sched_setattr_precheck(current, &attr);
+        if (error == 0) {
+            info->batch_sched_nice = attr.sched_nice;
+            info->mask |= mask;
+        }
+        break;
+
+    case SCHED_FIFO:
+        if ((info->mask & mask) &&
+            attr.sched_priority <= info->fifo_sched_priority)
+            break;
+        error = sched_setattr_precheck(current, &attr);
+        if (error == 0) {
+            info->fifo_sched_priority = attr.sched_priority;
+            info->mask |= mask;
+        }
+        break;
+
+    case SCHED_RR:
+        if ((info->mask & mask) &&
+            attr.sched_priority <= info->rr_sched_priority)
+            break;
+        error = sched_setattr_precheck(current, &attr);
+        if (error == 0) {
+            info->rr_sched_priority = attr.sched_priority;
+            info->mask |= mask;
+        }
+        break;
+
+    case SCHED_IDLE:
+        if (info->mask & mask)
+            break;
+        error = sched_setattr_precheck(current, &attr);
+        if (error == 0)
+            info->mask |= mask;
+        break;
+
+    case SCHED_DEADLINE:
+        /*
+         * DL is not a meaningful policy for deferred set
+         * priority
+         */
+    default:
+        error = -EINVAL;
+        break;
+    }
+
+    return error;
+}
+
+/*
+ * Implements prctl(PR_SET_DEFERRED_SETPRIO).
+ *
+ * To set PR_SET_DEFERRED_SETPRIO:
+ *
+ *     a2 = address of u64 variable in the userspace that holds the pointer
+ *          to dprio_ku_area or NULL
+ *
+ *     a3 = address of userspace array of pointers to sched_attr entries
+ *          to preapprove for subsequent pre-checked use by deferred set
+ *          priority requests
+ *
+ *     a4 = count of entries in a3 or 0
+ *
+ *     a5 = 0
+ *
+ * To reset PR_SET_DEFERRED_SETPRIO:
+ *
+ *     a2 = 0
+ *     a3 = 0
+ *     a4 = 0
+ *     a5 = 0
+ *
+ * Thus valid calls are:
+ *
+ *     struct sched_attr **sched_attrs_pp;
+ *     prctl(PR_SET_DEFERRED_SETPRIO, dprio_ku_area_pp,
+ *           sched_attrs_pp, nattrs, 0)
+ *
+ *     prctl(PR_SET_DEFERRED_SETPRIO, NULL, NULL, 0, 0)
+ *
+ * Returns 0 on success. Errors: -EINVAL (wrong option, non-zero unused
+ * arguments, more than 4096 entries, misaligned a2), -EFAULT (user memory
+ * not accessible), -ENOMEM, the result of dprio_check_permission(), or
+ * the error reported by precheck() for one of the sched_attr entries.
+ */
+long dprio_prctl(int option, unsigned long a2, unsigned long a3,
+         unsigned long a4, unsigned long a5)
+{
+    struct dprio_ku_area __user * __user *ku_area_pp;
+    struct dprio_ku_area __user *ku_area_p;
+    struct dprio_info *info = NULL;
+    unsigned long ne, nentries;
+    struct sched_attr __user * __user *uattr_pp;
+    struct sched_attr __user *uattr_p;
+    /* always called in process context here: sleeping copies are fine */
+    bool atomic = false;
+    long error = 0;
+
+    if (option != PR_SET_DEFERRED_SETPRIO)
+        return -EINVAL;
+
+    ku_area_pp = (struct dprio_ku_area __user * __user *) a2;
+
+    /*
+     * Handle reset operation for PR_SET_DEFERRED_SETPRIO
+     */
+    if (ku_area_pp == NULL) {
+        /* the reset form requires every remaining argument to be 0 */
+        if (a3 | a4 | a5)
+            return -EINVAL;
+        /* process a request that may still be pending, then detach */
+        dprio_handle_request();
+        dprio_detach(current);
+        return 0;
+    }
+
+    /*
+     * Handle set operation for PR_SET_DEFERRED_SETPRIO
+     */
+    uattr_pp = (struct sched_attr __user * __user *) a3;
+    nentries = a4;
+    if (a5)
+        return -EINVAL;
+
+    /* sanity check to avoid long spinning in the kernel */
+    if (nentries > 4096) {
+        error = -EINVAL;
+        goto out;
+    }
+
+    /* Check alignment */
+    if ((unsigned long) ku_area_pp % sizeof(u64))
+        return  -EINVAL;
+
+    /* check *ku_area_pp is readable and writeable */
+    if (__copyin_var(ku_area_p, ku_area_pp, atomic) ||
+        __copyout_var(ku_area_p, ku_area_pp, atomic))
+        return  -EFAULT;
+
+    error = dprio_check_permission();
+    if (error)
+        return error;
+
+    info = kmalloc(sizeof(*info), GFP_KERNEL);
+    if (info == NULL)
+        return -ENOMEM;
+    /* no policy approved yet; per-policy fields are valid only once
+       the matching mask bit is set by precheck() */
+    info->mask = 0;
+    /*
+     * XXX:
+     *
+     * We may trigger a false recording of PF_SUPERPRIV here by requesting
+     * CAP_SYS_NICE capability we may not actually use later, however
+     * since we cannot modify current->flags during dprio_handle_request()
+     * when called from __schedule(), the alternatives would be either
+     * possibly missing the recording of PF_SUPERPRIV, or (better) splitting
+     * PF_SUPERPRIV from current->flags and moving it to a variable with
+     * atomic access protocol.
+     */
+    info->capable_sys_nice = capable(CAP_SYS_NICE);
+
+    /*
+     * We prevalidate maximum requested priority levels at the time of
+     * prctl set-up instead of validating priority change requests during
+     * their actual processing in __schedule and do_fork in order to:
+     *
+     *    - reduce latency during request processing in __schedule()
+     *
+     *    - avoid blocking in the security code when setprio processing
+     *      is performed in __schedule()
+     *
+     *    - avoid EINTR or ERESTARTSYS etc. that may be returned by
+     *      the security code during setprio request processing
+     */
+    for (ne = 0;  ne < nentries;  ne++) {
+        cond_resched();
+        /* fetch the ne-th sched_attr pointer from the user array */
+        if (__copyin_var(uattr_p, uattr_pp + ne, atomic)) {
+            error = -EFAULT;
+            goto out;
+        }
+        error = precheck(info, uattr_p);
+        if (error)
+            goto out;
+    }
+
+    /*
+     * If there was a previous active dprio ku area, try to process
+     * any pending request in it and detach from it.
+     */
+    dprio_handle_request();
+    dprio_detach(current);
+
+    /* install the new connection; both fields change together */
+    preempt_disable();
+    current->dprio_ku_area_pp = ku_area_pp;
+    current->dprio_info = info;
+    preempt_enable();
+
+out:
+    /* on failure the record was never installed, so release it here */
+    if (error && info)
+        kfree(info);
+
+    return error;
+}
+
+/*
+ * Tell whether @prev has a "deferred set priority" request pending, i.e.
+ * whether the userspace word at prev->dprio_ku_area_pp currently holds a
+ * non-NULL pointer.
+ *
+ * The word is read without sleeping. If the page it lives on cannot be
+ * accessed right now (e.g. not valid or paged out) the answer is @false.
+ */
+bool dprio_check_for_request(struct task_struct *prev)
+{
+    struct dprio_ku_area __user *pending;
+
+#ifdef CONFIG_DEBUG_DEFERRED_SETPRIO
+    /*
+     * Callers only get here with prev->dprio_ku_area_pp != NULL,
+     * so prev cannot be a kernel thread
+     */
+    if (WARN_ONCE(prev->active_mm != prev->mm,
+              KERN_ERR "BUG: dprio: address space not mapped\n"))
+        return false;
+#endif /* CONFIG_DEBUG_DEFERRED_SETPRIO */
+
+    /* atomic (non-faulting) read of the request pointer */
+    if (__copyin_var(pending, prev->dprio_ku_area_pp, true) != 0)
+        return false;
+
+    return pending != NULL;
+}
+
+/*
+ * Handle pending "deferred set priority" request from the userland.
+ *
+ * Protocol, as implemented below:
+ *   1. read the request-area pointer from *current->dprio_ku_area_pp;
+ *      NULL means nothing is pending
+ *   2. store DPRIO_RESP_UNKNOWN into the area's resp field
+ *   3. clear the request pointer in userspace
+ *   4. read the requested sched_attr, verify it against the levels
+ *      pre-approved in current->dprio_info and against current rlimits
+ *   5. apply it and report DPRIO_RESP_OK, or DPRIO_RESP_ERROR plus the
+ *      positive errno value in the area's error field
+ *
+ * Any failure to access user memory silently abandons the request.
+ * Runs with non-faulting copies when preempt_count() is non-zero.
+ */
+void dprio_handle_request(void)
+{
+    struct dprio_ku_area __user *ku;
+    struct dprio_ku_area __user *ku_null;
+    struct sched_attr attr;
+    bool atomic;
+    u32 resp, error;
+    int ierror = 0;
+    unsigned long rlim_rtprio;
+    unsigned long rlim_nice;
+    struct dprio_info *info;
+
+    /* attached to ku area? */
+    if (current->dprio_ku_area_pp == NULL)
+        return;
+
+    /* called from __schedule? */
+    atomic = preempt_count() != 0;
+
+    /* fetch ku request area address from the userspace */
+    if (__copyin_var(ku, current->dprio_ku_area_pp, atomic))
+        return;
+
+    /* check if request is pending */
+    if (unlikely(ku == NULL))
+        return;
+
+    /*
+     * @ku is an arbitrary value written by userspace and is used below as
+     * a destination for kernel writes. The atomic copy primitives do not
+     * range-check addresses, so make sure the whole area lies in user
+     * address space before touching it.
+     */
+    if (!access_ok(VERIFY_WRITE, ku, sizeof(*ku)))
+        return;
+
+    /* remark to the userspace:
+       request processing has been started/attempted */
+    resp = DPRIO_RESP_UNKNOWN;
+    if (__copyout_var(resp, &ku->resp, atomic))
+        return;
+
+    /* reset pending request */
+    ku_null = NULL;
+    if (__copyout_var(ku_null, current->dprio_ku_area_pp, atomic))
+        return;
+
+    /* fetch request parameters from the userspace */
+    if (dprio_copyin_sched_attr(&ku->sched_attr, &attr, atomic))
+        return;
+
+    /* impose uniform interpretation of sched_attr */
+    uniform_attr(&attr);
+
+    /* reset-on-fork is the only flag a deferred request may carry */
+    if (attr.sched_flags & ~SCHED_FLAG_RESET_ON_FORK) {
+        ierror = -EINVAL;
+        goto out;
+    }
+
+    /*
+     * check if request has been pre-authorized
+     */
+    info = current->dprio_info;
+    switch (attr.sched_policy) {
+    case SCHED_NORMAL:
+        if (!(info->mask & (1 << SCHED_NORMAL)) ||
+            attr.sched_nice < info->normal_sched_nice)
+            ierror = -EPERM;
+        /*
+         * check whether RLIMIT_NICE has been reduced
+         * by setrlimit or prlimit
+         *
+         * The rlimit is unsigned and may be very large (RLIM_INFINITY),
+         * so compare on the rlimit scale (20 - nice, always in 1..40
+         * after clamping) instead of computing "20 - rlimit", which
+         * would wrap around.
+         */
+        if (ierror == 0 && !info->capable_sys_nice) {
+            rlim_nice = task_rlimit(current, RLIMIT_NICE);
+            if ((unsigned long) (20 - attr.sched_nice) > rlim_nice)
+                ierror = -EPERM;
+        }
+        break;
+
+    case SCHED_BATCH:
+        if (!(info->mask & (1 << SCHED_BATCH)) ||
+            attr.sched_nice < info->batch_sched_nice)
+            ierror = -EPERM;
+        /*
+         * check whether RLIMIT_NICE has been reduced
+         * by setrlimit or prlimit (same comparison as SCHED_NORMAL)
+         */
+        if (ierror == 0 && !info->capable_sys_nice) {
+            rlim_nice = task_rlimit(current, RLIMIT_NICE);
+            if ((unsigned long) (20 - attr.sched_nice) > rlim_nice)
+                ierror = -EPERM;
+        }
+        break;
+
+    case SCHED_FIFO:
+        if (!(info->mask & (1 << SCHED_FIFO)) ||
+            attr.sched_priority > info->fifo_sched_priority)
+            ierror = -EPERM;
+        /*
+         * check whether RLIMIT_RTPRIO has been reduced
+         * by setrlimit or prlimit
+         */
+        if (ierror == 0 && !info->capable_sys_nice) {
+            rlim_rtprio = task_rlimit(current, RLIMIT_RTPRIO);
+            if (rlim_rtprio == 0 || attr.sched_priority > rlim_rtprio)
+                ierror = -EPERM;
+        }
+        break;
+
+    case SCHED_RR:
+        if (!(info->mask & (1 << SCHED_RR)) ||
+            attr.sched_priority > info->rr_sched_priority)
+            ierror = -EPERM;
+        /*
+         * check whether RLIMIT_RTPRIO has been reduced
+         * by setrlimit or prlimit
+         */
+        if (ierror == 0 && !info->capable_sys_nice) {
+            rlim_rtprio = task_rlimit(current, RLIMIT_RTPRIO);
+            if (rlim_rtprio == 0 || attr.sched_priority > rlim_rtprio)
+                ierror = -EPERM;
+        }
+        break;
+
+    case SCHED_IDLE:
+        if (!(info->mask & (1 << SCHED_IDLE)))
+            ierror = -EPERM;
+        break;
+
+    default:
+        ierror = -EINVAL;
+        break;
+    }
+
+    /* execute the request */
+    if (ierror == 0)
+        ierror = sched_setattr_prechecked(current, &attr, true);
+
+out:
+    if (ierror) {
+        /* publish the error code first, then the response status */
+        error = (u32) -ierror;
+        resp = DPRIO_RESP_ERROR;
+        if (0 == __copyout_var(error, &ku->error, atomic))
+            __copyout_var(resp, &ku->resp, atomic);
+    } else {
+        resp = DPRIO_RESP_OK;
+        __copyout_var(resp, &ku->resp, atomic);
+    }
+}
+
+/*
+ * Verify if the current task is authorized to use
+ * prctl(PR_SET_DEFERRED_SETPRIO).
+ *
+ * Returns 0 if allowed, -EPERM if dprio_privileged is set and the task
+ * lacks CAP_DPRIO.
+ */
+int dprio_check_permission(void)
+{
+    /* unrestricted mode: no capability is consulted */
+    if (!dprio_privileged)
+        return 0;
+
+    return capable(CAP_DPRIO) ? 0 : -EPERM;
+}
+
diff --git a/kernel/sys.c b/kernel/sys.c
index b663664..7fe4486 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -52,6 +52,7 @@
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
+#include <linux/dprio.h>

 #include <linux/kmsg_dump.h>
 /* Move somewhere else to avoid recompiling? */
@@ -2009,6 +2010,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
             me->mm->def_flags &= ~VM_NOHUGEPAGE;
         up_write(&me->mm->mmap_sem);
         break;
+#ifdef CONFIG_DEFERRED_SETPRIO
+    case PR_SET_DEFERRED_SETPRIO:
+        error = dprio_prctl(option, arg2, arg3, arg4, arg5);
+        break;
+#endif
     default:
         error = -EINVAL;
         break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ab45666..3cce55a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -63,6 +63,7 @@
 #include <linux/binfmts.h>
 #include <linux/sched/sysctl.h>
 #include <linux/kexec.h>
+#include <linux/dprio.h>

 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -432,6 +433,17 @@ static struct ctl_table kern_table[] = {
         .extra2        = &one,
     },
 #endif
+#ifdef CONFIG_DEFERRED_SETPRIO
+    {
+        .procname    = "dprio_privileged",
+        .data        = &dprio_privileged,
+        .maxlen        = sizeof(unsigned int),
+        .mode        = 0644,
+        .proc_handler    = proc_dointvec_minmax,
+        .extra1        = &zero,
+        .extra2        = &one,
+    },
+#endif
 #ifdef CONFIG_CFS_BANDWIDTH
     {
         .procname    = "sched_cfs_bandwidth_slice_us",
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/