Message-Id: <20180907214047.26914-50-jschoenh@amazon.de>
Date:   Fri,  7 Sep 2018 23:40:36 +0200
From:   Jan H. Schönherr <jschoenh@...zon.de>
To:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>
Cc:     Jan H. Schönherr <jschoenh@...zon.de>,
        linux-kernel@...r.kernel.org
Subject: [RFC 49/60] cosched: Adjust locking for enqueuing and dequeueing

Enqueuing and dequeuing of tasks (or entities) are general activities
that span leader boundaries. They start at the bottom of the runqueue
hierarchy and bubble upwards until they hit their terminating condition
(for example, enqueuing stops when the parent entity is already
enqueued).

We employ chain-locking in these cases to minimize lock contention.
For example, once enqueuing has moved past a hierarchy level of a
different leader, that leader can already make scheduling decisions
again. This also opens up the possibility of combining concurrent
enqueues/dequeues to some extent, so that only one of multiple CPUs
has to walk up the hierarchy.

Signed-off-by: Jan H. Schönherr <jschoenh@...zon.de>
---
 kernel/sched/fair.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6d64f4478fda..0dc4d289497c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4510,6 +4510,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	struct sched_entity *se;
 	long task_delta, dequeue = 1;
 	bool empty;
+	struct rq_chain rc;
 
 	/*
 	 * FIXME: We can only handle CPU runqueues at the moment.
@@ -4532,8 +4533,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	rcu_read_unlock();
 
 	task_delta = cfs_rq->h_nr_running;
+	rq_chain_init(&rc, rq);
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+
+		rq_chain_lock(&rc, se);
 		/* throttled entity or throttle-on-deactivate */
 		if (!se->on_rq)
 			break;
@@ -4549,6 +4553,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!se)
 		sub_nr_running(rq, task_delta);
 
+	rq_chain_unlock(&rc);
+
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
@@ -4577,6 +4583,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	struct sched_entity *se;
 	int enqueue = 1;
 	long task_delta;
+	struct rq_chain rc;
 
 	SCHED_WARN_ON(!is_cpu_rq(rq));
 
@@ -4598,7 +4605,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		return;
 
 	task_delta = cfs_rq->h_nr_running;
+	rq_chain_init(&rc, rq);
 	for_each_sched_entity(se) {
+		rq_chain_lock(&rc, se);
 		if (se->on_rq)
 			enqueue = 0;
 
@@ -4614,6 +4623,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!se)
 		add_nr_running(rq, task_delta);
 
+	rq_chain_unlock(&rc);
+
 	/* Determine whether we need to wake up potentially idle CPU: */
 	if (rq->curr == rq->idle && nr_cfs_tasks(rq))
 		resched_curr(rq);
@@ -5136,8 +5147,11 @@ bool enqueue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
 			 unsigned int task_delta)
 {
 	struct cfs_rq *cfs_rq;
+	struct rq_chain rc;
 
+	rq_chain_init(&rc, rq);
 	for_each_sched_entity(se) {
+		rq_chain_lock(&rc, se);
 		if (se->on_rq)
 			break;
 		cfs_rq = cfs_rq_of(se);
@@ -5157,6 +5171,8 @@ bool enqueue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
 	}
 
 	for_each_sched_entity(se) {
+		/* FIXME: taking locks up to the top is bad */
+		rq_chain_lock(&rc, se);
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running += task_delta;
 
@@ -5167,6 +5183,8 @@ bool enqueue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
 		update_cfs_group(se);
 	}
 
+	rq_chain_unlock(&rc);
+
 	return se != NULL;
 }
 
@@ -5211,9 +5229,12 @@ bool dequeue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
 			 unsigned int task_delta)
 {
 	struct cfs_rq *cfs_rq;
+	struct rq_chain rc;
 	int task_sleep = flags & DEQUEUE_SLEEP;
 
+	rq_chain_init(&rc, rq);
 	for_each_sched_entity(se) {
+		rq_chain_lock(&rc, se);
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 
@@ -5231,6 +5252,9 @@ bool dequeue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
 		if (cfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
 			se = parent_entity(se);
+			if (se)
+				rq_chain_lock(&rc, se);
+
 			/*
 			 * Bias pick_next to pick a task from this cfs_rq, as
 			 * p is sleeping when it is within its sched_slice.
@@ -5243,6 +5267,8 @@ bool dequeue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
 	}
 
 	for_each_sched_entity(se) {
+		/* FIXME: taking locks up to the top is bad */
+		rq_chain_lock(&rc, se);
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running -= task_delta;
 
@@ -5253,6 +5279,8 @@ bool dequeue_entity_fair(struct rq *rq, struct sched_entity *se, int flags,
 		update_cfs_group(se);
 	}
 
+	rq_chain_unlock(&rc);
+
 	return se != NULL;
 }
 
@@ -9860,11 +9888,15 @@ static inline bool vruntime_normalized(struct task_struct *p)
 static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq;
+	struct rq_chain rc;
+
+	rq_chain_init(&rc, hrq_of(cfs_rq_of(se)));
 
 	/* Start to propagate at parent */
 	se = parent_entity(se);
 
 	for_each_sched_entity(se) {
+		rq_chain_lock(&rc, se);
 		cfs_rq = cfs_rq_of(se);
 
 		if (cfs_rq_throttled(cfs_rq))
@@ -9872,6 +9904,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 
 		update_load_avg(cfs_rq, se, UPDATE_TG);
 	}
+	rq_chain_unlock(&rc);
 }
 #else
 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
-- 
2.9.3.1.gcba166c.dirty
