Message-Id: <20180907214047.26914-24-jschoenh@amazon.de>
Date:   Fri,  7 Sep 2018 23:40:10 +0200
From:   Jan H. Schönherr <jschoenh@...zon.de>
To:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>
Cc:     Jan H. Schönherr <jschoenh@...zon.de>,
        linux-kernel@...r.kernel.org
Subject: [RFC 23/60] cosched: Add core data structures for coscheduling

For coscheduling, we will set up hierarchical runqueues that correspond
to larger fractions of the system. They will be organized along the
scheduling domains.
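
As a rough illustration (not part of this patch): the per-domain
runqueues hang off the struct sched_domain_shared::rq member added at
the end of this patch, so they can be reached by walking a CPU's
scheduling domains. A minimal sketch, assuming sd->shared is populated
for every domain of interest (it may be NULL for some domains in the
current code, so the sketch skips those):

	/*
	 * Illustrative only: visit the hierarchical runqueue attached
	 * to each of this CPU's scheduling domains. for_each_domain()
	 * and sd->shared already exist; the rq member is new.
	 */
	static void visit_hierarchical_rqs(int cpu)
	{
		struct sched_domain *sd;

		rcu_read_lock();
		for_each_domain(cpu, sd) {
			struct rq *hrq;

			if (!sd->shared)
				continue;
			hrq = &sd->shared->rq;
			pr_info("level %d: rq %p spans %*pbl\n",
				hrq->sdrq_data.level, hrq,
				cpumask_pr_args(sched_domain_span(sd)));
		}
		rcu_read_unlock();
	}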

Although it is overkill at the moment, we keep a full struct rq per
scheduling domain. The existing code is so used to passing struct rq
around that it would be a large refactoring effort to concentrate the
fields of struct rq that are actually needed into a smaller structure.
Also, we will probably need more fields in the future.

Extend struct rq and struct cfs_rq with extra structs encapsulating
all purely coscheduling-related fields: struct sdrq_data and struct
sdrq, respectively.
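
Because struct sdrq is embedded in its struct cfs_rq and struct
sdrq_data in its struct rq, navigating between them needs no extra
pointers in principle. A minimal sketch (illustrative helpers, not
part of the patch; cf. the FIXME next to .cfs_rq in struct sdrq
below):

	/* Recover the enclosing structures from the embedded ones. */
	static inline struct cfs_rq *sdrq_cfs_rq(struct sdrq *sdrq)
	{
		return container_of(sdrq, struct cfs_rq, sdrq);
	}

	static inline struct sdrq_data *rq_sdrq_data(struct rq *rq)
	{
		return &rq->sdrq_data;
	}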

Extend struct task_group so that we can keep track of the hierarchy
and of how this task group should behave. We can now distinguish
between regular task groups and scheduled task groups: the former work
as usual, while the latter actively make use of the hierarchy and
represent SEs of a lower hierarchy level at a higher level within the
parent task group, causing the SEs at the lower level to be
coscheduled.
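
For illustration, two hypothetical helpers (not part of this patch)
that follow the documented semantics of the new tg->scheduled field
and its lock:

	/* A task group is coscheduled iff its level is above zero. */
	static inline bool tg_is_scheduled(struct task_group *tg)
	{
		return tg->scheduled > 0;
	}

	/* Modifications to .scheduled are serialized by tg->lock. */
	static void tg_set_scheduled(struct task_group *tg, int level)
	{
		raw_spin_lock(&tg->lock);
		tg->scheduled = level;
		raw_spin_unlock(&tg->lock);
	}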

Signed-off-by: Jan H. Schönherr <jschoenh@...zon.de>
---
 kernel/sched/sched.h | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b36e61914a42..1bce6061ac45 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -368,6 +368,27 @@ struct task_group {
 #endif
 #endif
 
+#ifdef CONFIG_COSCHEDULING
+	/*
+	 * References the top of this task-group's RQ hierarchy. This is
+	 * static and does not change. It is used as an entry point to
+	 * traverse the structure on creation/destruction.
+	 */
+	struct cfs_rq *top_cfsrq;
+
+	/* Protects .scheduled from concurrent modifications */
+	raw_spinlock_t lock;
+
+	/*
+	 * Indicates the level at which this task group is scheduled:
+	 * 0 == bottom level == regular task group
+	 * >0 == scheduled task group
+	 *
+	 * Modifications are (for now) requested by the user.
+	 */
+	int scheduled;
+#endif
+
 #ifdef CONFIG_RT_GROUP_SCHED
 	struct sched_rt_entity	**rt_se;
 	struct rt_rq		**rt_rq;
@@ -485,6 +506,120 @@ struct rq_flags {
 #endif
 };
 
+#ifdef CONFIG_COSCHEDULING
+struct sdrq_data {
+	/*
+	 * Leader for this part of the hierarchy.
+	 *
+	 * The leader CPU is responsible for scheduling decisions and any
+	 * required maintenance.
+	 *
+	 * Leadership is variable and may be taken over by another CPU
+	 * while this hierarchy level is locked.
+	 */
+	int leader;
+
+	/* Height within the hierarchy: leaf == 0, parent == child + 1 */
+	int level;
+
+	/* Parent runqueue */
+	struct sdrq_data *parent;
+
+	/*
+	 * SD-RQ from which SEs get selected.
+	 *
+	 * This is set by the parent's leader and defines the current
+	 * schedulable subset of tasks within this part of the hierarchy.
+	 */
+	struct sdrq *current_sdrq;
+
+	/* CPUs making up this part of the hierarchy */
+	const struct cpumask *span;
+
+	/* Number of CPUs within this part of the hierarchy */
+	unsigned int span_weight;
+
+	/*
+	 * Determines if the corresponding SD-RQs are to be allocated on
+	 * a specific NUMA node.
+	 */
+	int numa_node;
+
+	/* Storage for rq_flags, when we need to lock multiple runqueues. */
+	struct rq_flags rf;
+
+	/* Do we have the parent runqueue locked? */
+	bool parent_locked;
+
+	/*
+	 * In case the CPU has been forced into idle, the idle_se references the
+	 * scheduling entity responsible for this. Only used on bottom level at
+	 * the moment.
+	 */
+	struct sched_entity *idle_se;
+};
+
+struct sdrq {
+	/* Common information for all SD-RQs at the same position */
+	struct sdrq_data *data;
+
+	/* SD hierarchy */
+	struct sdrq *sd_parent;		/* parent of this node */
+	struct list_head children;	/* children of this node */
+	struct list_head siblings;	/* link to parent's children list */
+
+	/*
+	 * is_root == 1 => link via tg_se into tg_parent->cfs_rq
+	 * is_root == 0 => link via sd_parent->sd_se into sd_parent->cfs_rq
+	 */
+	int is_root;
+
+	/*
+	 * SD-SE: an SE to be enqueued in .cfs_rq to represent this
+	 * node's children in order to make their members schedulable.
+	 *
+	 * In the bottom layer .sd_se has to be NULL for various if-conditions
+	 * and loop terminations. On other layers .sd_se points to .__sd_se.
+	 *
+	 * .__sd_se is unused within the bottom layer.
+	 */
+	struct sched_entity *sd_se;
+	struct sched_entity __sd_se;
+
+	/* Accumulated load of all SD-children */
+	atomic64_t sdse_load;
+
+	/*
+	 * Reference to the SD-runqueue at the same hierarchical position
+	 * in the parent task group.
+	 */
+	struct sdrq *tg_parent;
+	struct list_head tg_children;	/* child TGs of this node */
+	struct list_head tg_siblings;	/* link to parent's children list */
+
+	/*
+	 * TG-SE: an SE to be enqueued in .tg_parent->cfs_rq.
+	 *
+	 * In the case of a regular TG it is enqueued if .cfs_rq is not empty.
+	 * In the case of a scheduled TG it is enqueued if .cfs_rq is not empty
+	 * and this SD-RQ acts as a root SD within its TG.
+	 *
+	 * .tg_se takes over the role of .cfs_rq->my_se and points to the same
+	 * SE over its life-time, while .cfs_rq->my_se now points to either the
+	 * TG-SE or the SD-SE (or NULL in the parts of the root task group).
+	 */
+	struct sched_entity *tg_se;
+
+	/*
+	 * CFS runqueue of this SD runqueue.
+	 *
+	 * FIXME: Now that struct sdrq is embedded in struct cfs_rq, we could
+	 *        drop this.
+	 */
+	struct cfs_rq *cfs_rq;
+};
+#endif /* CONFIG_COSCHEDULING */
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight	load;
@@ -544,6 +679,12 @@ struct cfs_rq {
 	u64			last_h_load_update;
 	struct sched_entity	*h_load_next;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_COSCHEDULING
+	/* Extra info needed for hierarchical scheduling */
+	struct sdrq sdrq;
+#endif
+
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -817,6 +958,11 @@ struct rq {
 	struct rt_rq		rt;
 	struct dl_rq		dl;
 
+#ifdef CONFIG_COSCHEDULING
+	/* Extra information for hierarchical scheduling */
+	struct sdrq_data sdrq_data;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this CPU: */
 	struct list_head	leaf_cfs_rq_list;
@@ -935,6 +1081,11 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+
+#ifdef CONFIG_COSCHEDULING
+	/* Top level runqueue for this sched_domain */
+	struct rq rq;
+#endif
 };
 
 static inline int cpu_of(struct rq *rq)
-- 
2.9.3.1.gcba166c.dirty
