Date:	Wed, 09 Nov 2011 18:30:26 -0800
From:	Paul Turner <pjt@...gle.com>
To:	unlisted-recipients:; (no To-header on input)
CC:	linux-kernel@...r.kernel.org,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Ingo Molnar <mingo@...e.hu>, Ben Segall <bsegall@...gle.com>
Subject: Re: [patch 3/3] From: Ben Segall <bsegall@...gle.com>

I mangled the subject/from headers on this one; a fixed version is below.

Thanks,

- Paul
---
sched: update task accounting on throttle so that idle_balance() will trigger
From: Ben Segall <bsegall@...gle.com>

Since throttling occurs in the put_prev_task() path, we do not get to observe
the resulting nr_running delta when making the decision to idle_balance().

Fix this by first enumerating cfs_rq throttle states so that throttling cfs_rqs
can be distinguished.  Then remove tasks that will be throttled in put_prev_task()
from rq->nr_running/cfs_rq->h_nr_running as early as account_cfs_rq_runtime(),
rather than delaying until put_prev_task().

This allows schedule() to call idle_balance() when we go idle due to throttling.
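
To illustrate the intended flow, here is a rough user-space sketch of the state
transitions and early nr_running accounting described above.  This is not kernel
code and not part of the patch; the toy_rq, account_runtime and finish_throttle
names are made-up stand-ins for illustration only:

/* Standalone model of the throttle-state flow described above. */
#include <stdio.h>

enum runtime_state {
	RUNTIME_UNLIMITED,
	RUNTIME_AVAILABLE,
	RUNTIME_THROTTLING,	/* out of runtime, tasks already un-accounted */
	RUNTIME_THROTTLED	/* put_prev_task() has completed the throttle */
};

struct toy_rq {
	unsigned long nr_running;
	enum runtime_state state;
	long runtime_remaining;
};

/* Loosely mirrors the accounting step: on running out of runtime, immediately
 * remove the soon-to-be-throttled tasks from nr_running so that a later
 * idle_balance() decision sees the queue as (about to be) idle. */
static void account_runtime(struct toy_rq *rq, long delta_exec, long h_nr_running)
{
	rq->runtime_remaining -= delta_exec;
	if (rq->runtime_remaining > 0 || rq->state != RUNTIME_AVAILABLE)
		return;

	rq->nr_running -= h_nr_running;		/* the "early" accounting */
	rq->state = RUNTIME_THROTTLING;
}

/* Loosely mirrors the put_prev_task()/check_cfs_rq_runtime() step. */
static void finish_throttle(struct toy_rq *rq, long h_nr_running)
{
	if (rq->state != RUNTIME_THROTTLING)
		return;
	if (rq->runtime_remaining > 0) {
		/* bandwidth arrived in between: re-account and keep running */
		rq->nr_running += h_nr_running;
		rq->state = RUNTIME_AVAILABLE;
	} else {
		rq->state = RUNTIME_THROTTLED;
	}
}

int main(void)
{
	struct toy_rq rq = { .nr_running = 4, .state = RUNTIME_AVAILABLE,
			     .runtime_remaining = 1 };

	account_runtime(&rq, 5, 4);
	/* schedule() would now see nr_running == 0 and can idle_balance() */
	printf("after accounting: nr_running=%lu state=%d\n",
	       rq.nr_running, rq.state);

	finish_throttle(&rq, 4);
	printf("after put_prev:   nr_running=%lu state=%d\n",
	       rq.nr_running, rq.state);
	return 0;
}

With this early accounting, the schedule()-time check sees nr_running drop to
zero before put_prev_task() finishes the throttle, which is what allows
idle_balance() to run.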

Using Kamalesh's nested-cgroup test case[1], we see the following improvement on
a 16-core system:
baseline: Average CPU Idle percentage 13.9667%
   +patch: Average CPU Idle percentage 3.53333%
[1]: https://lkml.org/lkml/2011/9/15/261

Signed-off-by: Ben Segall <bsegall@...gle.com>
Signed-off-by: Paul Turner <pjt@...gle.com>
---
  kernel/sched.c      |   24 ++++++++----
  kernel/sched_fair.c |  101 ++++++++++++++++++++++++++++++++++++----------------
  2 files changed, 87 insertions(+), 38 deletions(-)

Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -269,6 +269,13 @@ struct cfs_bandwidth {
  #endif
  };

+enum runtime_state {
+	RUNTIME_UNLIMITED,
+	RUNTIME_AVAILABLE,
+	RUNTIME_THROTTLING,
+	RUNTIME_THROTTLED
+};
+
  /* task group related information */
  struct task_group {
  	struct cgroup_subsys_state css;
@@ -402,12 +409,12 @@ struct cfs_rq {
  	unsigned long load_contribution;
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
-	int runtime_enabled;
+	enum runtime_state runtime_state;
  	u64 runtime_expires;
  	s64 runtime_remaining;

  	u64 throttled_timestamp;
-	int throttled, throttle_count;
+	int throttle_count;
  	struct list_head throttled_list;
  #endif
  #endif
@@ -470,7 +477,7 @@ static void init_cfs_bandwidth(struct cf

  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
-	cfs_rq->runtime_enabled = 0;
+	cfs_rq->runtime_state = RUNTIME_UNLIMITED;
  	INIT_LIST_HEAD(&cfs_rq->throttled_list);
  }

@@ -6368,7 +6375,7 @@ static void unthrottle_offline_cfs_rqs(s
  	for_each_leaf_cfs_rq(rq, cfs_rq) {
  		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

-		if (!cfs_rq->runtime_enabled)
+		if (!cfs_rq_runtime_enabled(cfs_rq))
  			continue;

  		/*
@@ -9263,11 +9270,14 @@ static int tg_set_cfs_bandwidth(struct t
  		struct rq *rq = rq_of(cfs_rq);

  		raw_spin_lock_irq(&rq->lock);
-		cfs_rq->runtime_enabled = runtime_enabled;
-		cfs_rq->runtime_remaining = 0;
-
  		if (cfs_rq_throttled(cfs_rq))
  			unthrottle_cfs_rq(cfs_rq);
+
+		if (runtime_enabled)
+			cfs_rq->runtime_state = RUNTIME_AVAILABLE;
+		else
+			cfs_rq->runtime_state = RUNTIME_UNLIMITED;
+		cfs_rq->runtime_remaining = 0;
  		raw_spin_unlock_irq(&rq->lock);
  	}
  out_unlock:
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct
  	}
  }

+static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling)
+{
+	struct sched_entity *se;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	for_each_sched_entity(se) {
+		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		if (!se->on_rq)
+			break;
+
+		qcfs_rq->h_nr_running -= nr_throttling;
+
+		if (qcfs_rq->runtime_state == RUNTIME_THROTTLING)
+			break;
+	}
+
+	if (!se)
+		rq_of(cfs_rq)->nr_running -= nr_throttling;
+}
+
  static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
  				     unsigned long delta_exec)
  {
@@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str
  	 * if we're unable to extend our runtime we resched so that the active
  	 * hierarchy can be throttled
  	 */
-	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-		resched_task(rq_of(cfs_rq)->curr);
+	if (assign_cfs_rq_runtime(cfs_rq))
+		return;
+
+	if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) ||
+	   cfs_rq->runtime_state == RUNTIME_THROTTLING)
+		return;
+
+	resched_task(rq_of(cfs_rq)->curr);
+
+	/*
+	 * Remove us from nr_running/h_nr_running so
+	 * that idle_balance() gets called if necessary.
+	 */
+	account_nr_throttling(cfs_rq, cfs_rq->h_nr_running);
+	cfs_rq->runtime_state = RUNTIME_THROTTLING;
+}
+
+static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() &&
+		cfs_rq->runtime_state != RUNTIME_UNLIMITED;
  }

  static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
  						   unsigned long delta_exec)
  {
-	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
+	if (!cfs_rq_runtime_enabled(cfs_rq))
  		return;

  	__account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_

  static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
  {
-	return cfs_bandwidth_used() && cfs_rq->throttled;
+	return cfs_bandwidth_used() &&
+		(cfs_rq->runtime_state == RUNTIME_THROTTLED ||
+		 cfs_rq->runtime_state == RUNTIME_THROTTLING);
  }

  /* check whether cfs_rq, or any parent, is throttled */
@@ -1483,7 +1525,6 @@ static void throttle_cfs_rq(struct cfs_r
  	struct rq *rq = rq_of(cfs_rq);
  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  	struct sched_entity *se;
-	long task_delta, dequeue = 1;

  	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

@@ -1492,25 +1533,19 @@ static void throttle_cfs_rq(struct cfs_r
  	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
  	rcu_read_unlock();

-	task_delta = cfs_rq->h_nr_running;
  	for_each_sched_entity(se) {
  		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  		/* throttled entity or throttle-on-deactivate */
  		if (!se->on_rq)
  			break;

-		if (dequeue)
-			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
-		qcfs_rq->h_nr_running -= task_delta;
+		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);

  		if (qcfs_rq->load.weight)
-			dequeue = 0;
+			break;
  	}

-	if (!se)
-		rq->nr_running -= task_delta;
-
-	cfs_rq->throttled = 1;
+	cfs_rq->runtime_state = RUNTIME_THROTTLED;
  	cfs_rq->throttled_timestamp = rq->clock;
  	raw_spin_lock(&cfs_b->lock);
  	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
@@ -1525,9 +1560,15 @@ static void unthrottle_cfs_rq(struct cfs
  	int enqueue = 1;
  	long task_delta;

+	if (cfs_rq->runtime_state == RUNTIME_THROTTLING) {
+		/* do only the partial unthrottle */
+		account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running);
+		return;
+	}
+
  	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

-	cfs_rq->throttled = 0;
+	cfs_rq->runtime_state = RUNTIME_AVAILABLE;
  	raw_spin_lock(&cfs_b->lock);
  	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
  	list_del_rcu(&cfs_rq->throttled_list);
@@ -1743,10 +1784,7 @@ static void __return_cfs_rq_runtime(stru

  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
-	if (!cfs_bandwidth_used())
-		return;
-
-	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+	if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running)
  		return;

  	__return_cfs_rq_runtime(cfs_rq);
@@ -1791,15 +1829,12 @@ static void do_sched_cfs_slack_timer(str
   */
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
  {
-	if (!cfs_bandwidth_used())
-		return;
-
  	/* an active group must be handled by the update_curr()->put() path */
-	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+	if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr)
  		return;

  	/* ensure the group is not already throttled */
-	if (cfs_rq_throttled(cfs_rq))
+	if (cfs_rq->runtime_state == RUNTIME_THROTTLED)
  		return;

  	/* update runtime allocation */
@@ -1814,17 +1849,21 @@ static void check_cfs_rq_runtime(struct
  	if (!cfs_bandwidth_used())
  		return;

-	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+	if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING))
  		return;

  	/*
-	 * it's possible for a throttled entity to be forced into a running
-	 * state (e.g. set_curr_task), in this case we're finished.
+	 * It is possible for additional bandwidth to arrive between our
+	 * resched_task() call for running out of runtime and the later
+	 * put_prev_task() call; in that case, re-account the tasks that
+	 * are now able to keep running.
  	 */
-	if (cfs_rq_throttled(cfs_rq))
-		return;
-
-	throttle_cfs_rq(cfs_rq);
+	if (unlikely(cfs_rq->runtime_remaining > 0)) {
+		account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running);
+		cfs_rq->runtime_state = RUNTIME_AVAILABLE;
+	} else {
+		throttle_cfs_rq(cfs_rq);
+	}
  }
  #else
  static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
--
