Message-ID: <aaa9cfea-d5f9-06a3-242f-05a1f3f9e7c8@oracle.com>
Date: Fri, 2 Nov 2018 16:39:00 -0700
From: Subhra Mazumdar <subhra.mazumdar@...cle.com>
To: Steve Sistare <steven.sistare@...cle.com>, mingo@...hat.com,
peterz@...radead.org
Cc: dhaval.giani@...cle.com, rohit.k.jain@...cle.com,
daniel.m.jordan@...cle.com, pavel.tatashin@...rosoft.com,
matt@...eblueprint.co.uk, umgwanakikbuti@...il.com,
riel@...hat.com, jbacik@...com, juri.lelli@...hat.com,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 00/10] steal tasks to improve CPU utilization
On 10/22/18 7:59 AM, Steve Sistare wrote:
> When a CPU has no more CFS tasks to run, and idle_balance() fails to
> find a task, then attempt to steal a task from an overloaded CPU in the
> same LLC. Maintain and use a bitmap of overloaded CPUs to efficiently
> identify candidates. To minimize search time, steal the first migratable
> task that is found when the bitmap is traversed. For fairness, search
> for migratable tasks on an overloaded CPU in order of next to run.
>
> This simple stealing yields a higher CPU utilization than idle_balance()
> alone, because the search is cheap, so it may be called every time the CPU
> is about to go idle. idle_balance() does more work because it searches
> widely for the busiest queue, so to limit its CPU consumption, it declines
> to search if the system is too busy. Simple stealing does not offload the
> globally busiest queue, but it is much better than running nothing at all.
>
> The bitmap of overloaded CPUs is a new type of sparse bitmap, designed to
> reduce cache contention vs the usual bitmap when many threads concurrently
> set, clear, and visit elements.
>
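Just to be explicit about what I am comparing against, here is how I picture
the overload-mask scheme in a toy userspace form (a single 64-bit word for up
to 64 CPUs and fake per-CPU run counts, not your sparse bitmap, and with none
of the rq locking). It is only a sketch of the idea as I read the cover
letter, not code from your series:

/*
 * Toy sketch: CPUs publish whether they have more than one runnable task
 * in a shared mask, and an idle CPU scans that mask (starting just after
 * itself and wrapping around) to pick a victim.  Illustration only: no
 * locking, no sparse bitmap, no scheduler classes.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 8

static _Atomic unsigned long overload_mask;
static int nr_running[NCPUS];

/* Called whenever a CPU's runnable count changes. */
static void update_overload(int cpu)
{
        unsigned long bit = 1UL << cpu;

        if (nr_running[cpu] > 1)
                atomic_fetch_or(&overload_mask, bit);
        else
                atomic_fetch_and(&overload_mask, ~bit);
}

/* Idle CPU: return an overloaded CPU to steal from, or -1. */
static int pick_victim(int this_cpu)
{
        unsigned long mask = atomic_load(&overload_mask);
        int i;

        for (i = 1; i <= NCPUS; i++) {
                int cpu = (this_cpu + i) % NCPUS;       /* wrap around */

                if (mask & (1UL << cpu))
                        return cpu;
        }
        return -1;
}

int main(void)
{
        nr_running[3] = 2;      /* CPU 3 is overloaded */
        update_overload(3);
        printf("CPU 0 would steal from CPU %d\n", pick_victim(0));
        return 0;
}

Once the mask is cheap to maintain, the victim scan itself is trivial; my
question is whether maintaining it buys much over simply scanning the domain
span, which is what I tried below.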
Is the bitmask saving much? I tried a simple stealing scheme that just starts
searching the domain from the current cpu and steals a thread from the first
cpu that has more than one runnable thread. It seems to perform similarly to
your patch.
hackbench on X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs

groups (tasks)    baseline   %stdev   patch              %stdev
1  (40 tasks)     0.5524     2.36     0.5522 (0.045%)    3.82
2  (80 tasks)     0.6482     11.4     0.7241 (-11.7%)    20.34
4  (160 tasks)    0.9756     0.95     0.8276 (15.1%)     5.8
8  (320 tasks)    1.7699     1.62     1.6655 (5.9%)      1.57
16 (640 tasks)    3.1018     0.77     2.9858 (3.74%)     1.4
32 (1280 tasks)   5.565      0.62     5.3388 (4.1%)      0.72
X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
Oracle database OLTP, logging _enabled_
Users %speedup
20 1.2
40 -0.41
60 0.83
80 2.37
100 1.54
120 3.0
140 2.24
160 1.82
180 1.94
200 2.23
220 1.49
Below is the patch (not in its best shape):
----------->8--------------------
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f808ddf..1690451 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3540,6 +3540,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
}
static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
+static int my_idle_balance(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
@@ -6619,6 +6620,8 @@ done: __maybe_unused;
idle:
new_tasks = idle_balance(rq, rf);
+ if (new_tasks == 0)
+ new_tasks = my_idle_balance(rq, rf);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
@@ -8434,6 +8437,75 @@ static int should_we_balance(struct lb_env *env)
return balance_cpu == env->dst_cpu;
}
+int get_best_cpu(int this_cpu, struct sched_domain *sd)
+{
+ struct rq *this_rq, *rq;
+ int i;
+ int best_cpu = -1;
+
+ this_rq = cpu_rq(this_cpu);
+ for_each_cpu_wrap(i, sched_domain_span(sd), this_cpu) {
+ if (this_rq->nr_running > 0)
+ return (-1);
+ if (i == this_cpu)
+ continue;
+ rq = cpu_rq(i);
+ if (rq->nr_running <= 1)
+ continue;
+ best_cpu = i;
+ break;
+ }
+ return (best_cpu);
+}
+static int my_load_balance(int this_cpu, struct rq *this_rq,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ int ld_moved = 0;
+ struct rq *busiest;
+ unsigned long flags;
+ struct task_struct *p = NULL;
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+ int best_cpu;
+
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = this_cpu,
+ .dst_rq = this_rq,
+ .dst_grpmask = sched_group_span(sd->groups),
+ .idle = idle,
+ .cpus = cpus,
+ .tasks = LIST_HEAD_INIT(env.tasks),
+ };
+
+ if (idle == CPU_NEWLY_IDLE)
+ env.dst_grpmask = NULL;
+
+ cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
+
+ best_cpu = get_best_cpu(this_cpu, sd);
+
+ if (best_cpu >= 0)
+ busiest = cpu_rq(best_cpu);
+ else
+ goto out;
+
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+ raw_spin_lock_irqsave(&busiest->lock, flags);
+
+ p = detach_one_task(&env);
+ raw_spin_unlock(&busiest->lock);
+ if (p) {
+ attach_one_task(this_rq, p);
+ ld_moved++;
+ }
+ local_irq_restore(flags);
+
+out:
+ return ld_moved;
+}
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
@@ -9369,6 +9441,71 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle
static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
+
+static int my_idle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+ int this_cpu = this_rq->cpu;
+ struct sched_domain *sd;
+ int pulled_task = 0;
+ int level;
+
+ /*
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
+ * measure the duration of idle_balance() as idle time.
+ */
+ this_rq->idle_stamp = rq_clock(this_rq);
+
+ rq_unpin_lock(this_rq, rf);
+
+ /*
+ * Drop the rq->lock, but keep IRQ/preempt disabled.
+ */
+ raw_spin_unlock(&this_rq->lock);
+
+ rcu_read_lock();
+
+ level = 0;
+ for_each_domain(this_cpu, sd) {
+ if (level++ > 1)
+ break;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ if (sd->flags & SD_BALANCE_NEWIDLE)
+ pulled_task = my_load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE);
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || this_rq->nr_running > 0)
+ break;
+ }
+ rcu_read_unlock();
+
+ raw_spin_lock(&this_rq->lock);
+
+ /*
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
+ */
+ if (this_rq->cfs.h_nr_running && !pulled_task)
+ pulled_task = 1;
+out:
+ /* Is there a task of a high priority class? */
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ pulled_task = -1;
+
+ if (pulled_task)
+ this_rq->idle_stamp = 0;
+
+ rq_repin_lock(this_rq, rf);
+ return pulled_task;
+}
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
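In case the locking in my_load_balance() above is hard to follow out of
context, the one-task steal boils down to: grab the victim's lock, detach a
single task, drop the victim's lock, then attach the task to this rq. A toy
userspace version of that pattern (pthread mutexes instead of rq locks, a
trivial list instead of cfs_rq, and no can_migrate_task()-style checks) is
below; it is only an illustration, not kernel code:

/*
 * Userspace sketch of the one-task steal: take the victim queue's lock,
 * detach a single task, drop that lock, then attach the task to the idle
 * CPU's queue under its own lock.
 */
#include <pthread.h>
#include <stdio.h>

struct task {
        int id;
        struct task *next;
};

struct runqueue {
        pthread_mutex_t lock;
        struct task *head;
        int nr_running;
};

/* Remove the first task from a queue we already hold the lock on. */
static struct task *detach_one(struct runqueue *rq)
{
        struct task *p = rq->head;

        if (!p)
                return NULL;
        rq->head = p->next;
        rq->nr_running--;
        return p;
}

static void attach_one(struct runqueue *rq, struct task *p)
{
        pthread_mutex_lock(&rq->lock);
        p->next = rq->head;
        rq->head = p;
        rq->nr_running++;
        pthread_mutex_unlock(&rq->lock);
}

/* Steal one task from 'busiest' into 'idle'; returns 1 if a task moved. */
static int steal_one(struct runqueue *idle, struct runqueue *busiest)
{
        struct task *p;

        pthread_mutex_lock(&busiest->lock);
        p = busiest->nr_running > 1 ? detach_one(busiest) : NULL;
        pthread_mutex_unlock(&busiest->lock);

        if (!p)
                return 0;
        attach_one(idle, p);
        return 1;
}

int main(void)
{
        struct task t1 = { 1, NULL }, t2 = { 2, &t1 };
        struct runqueue busy = { PTHREAD_MUTEX_INITIALIZER, &t2, 2 };
        struct runqueue idle = { PTHREAD_MUTEX_INITIALIZER, NULL, 0 };

        printf("stole: %d, idle now runs task %d\n",
               steal_one(&idle, &busy), idle.head ? idle.head->id : -1);
        return 0;
}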