Message-ID: <20180907152923.oxsmcqciez4yhmkk@queper01-lin>
Date:   Fri, 7 Sep 2018 16:29:25 +0100
From:   Quentin Perret <quentin.perret@....com>
To:     "Rafael J. Wysocki" <rjw@...ysocki.net>
Cc:     "Rafael J. Wysocki" <rafael@...nel.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
        Linux PM <linux-pm@...r.kernel.org>,
        Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
        Ingo Molnar <mingo@...hat.com>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Morten Rasmussen <morten.rasmussen@....com>,
        Chris Redpath <chris.redpath@....com>,
        Patrick Bellasi <patrick.bellasi@....com>,
        Valentin Schneider <valentin.schneider@....com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Thara Gopinath <thara.gopinath@...aro.org>,
        Viresh Kumar <viresh.kumar@...aro.org>,
        Todd Kjos <tkjos@...gle.com>,
        Joel Fernandes <joel@...lfernandes.org>,
        Steve Muckle <smuckle@...gle.com>, adharmap@...eaurora.org,
        Saravana Kannan <skannan@...eaurora.org>,
        Pavan Kondeti <pkondeti@...eaurora.org>,
        Juri Lelli <juri.lelli@...hat.com>,
        Eduardo Valentin <edubezval@...il.com>,
        Srinivas Pandruvada <srinivas.pandruvada@...ux.intel.com>,
        currojerez@...eup.net, Javi Merino <javi.merino@...nel.org>
Subject: Re: [PATCH v6 13/14] sched/topology: Make Energy Aware Scheduling
 depend on schedutil

On Friday 07 Sep 2018 at 10:52:01 (+0200), Rafael J. Wysocki wrote:
> On Thursday, September 6, 2018 4:38:44 PM CEST Quentin Perret wrote:
> > Hi Rafael,
> > 
> > On Thursday 06 Sep 2018 at 11:18:55 (+0200), Rafael J. Wysocki wrote:
> > > I'm not a particular fan of notifiers, to be honest, and you don't
> > > need to add an extra chain just to be able to register a callback
> > > from a single user.
> > 
> > Right. I agree there are alternatives to using notifiers. I used them
> > because they're existing infrastructure, and because they let me do what
> > I want without too much trouble, both of which are important points.
> > 
> > > That can be achieved with a single callback
> > > pointer too, but you could also just call a function exported by the
> > > scheduler directly from where in the cpufreq code it needs to be
> > > called.
> > 
> > Are you thinking about something comparable to what is done in
> > cpufreq_add_update_util_hook() (kernel/sched/cpufreq.c), for example?
> > That would probably have the same drawback as my current implementation,
> > namely that the scheduler is notified of _all_ governor changes, not
> > only changes to/from sugov, although that is the only thing we care
> > about for EAS.
> 
> Well, why don't you implement it as something like "if the governor changes
> from sugov to something else (or the other way around), call this function
> from the scheduler"?

I just gave it a try and ended up with the diff below. It's basically
the exact same patch, with a direct function call instead of a notifier.
(I also tried the sugov_start/stop approach I keep mentioning, but it is
more complex, so let's see if the simplest solution works first.)

What do you think?
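
For reference, a "single callback pointer" registration comparable to
cpufreq_add_update_util_hook() would look roughly like the sketch below.
This is illustrative only (none of it is in the diff, and all the names
are made up); the point is that such a hook still fires on _every_
governor change, so the filtering on schedutil has to happen on the
callee side either way:

/*
 * Illustrative sketch only, not part of the patch. A "single callback
 * pointer" registration, comparable in spirit to
 * cpufreq_add_update_util_hook(). All names are made up; assumes
 * <linux/cpufreq.h> for the struct definitions.
 */
static void (*sched_gov_change_cb)(struct cpufreq_policy *policy,
				   struct cpufreq_governor *old_gov);

void cpufreq_register_governor_change_cb(void (*cb)(struct cpufreq_policy *,
						     struct cpufreq_governor *))
{
	WRITE_ONCE(sched_gov_change_cb, cb);
}

/* The cpufreq core would call this on every governor change ... */
static void notify_governor_change(struct cpufreq_policy *policy,
				   struct cpufreq_governor *old_gov)
{
	void (*cb)(struct cpufreq_policy *, struct cpufreq_governor *);

	cb = READ_ONCE(sched_gov_change_cb);
	if (cb)
		cb(policy, old_gov);
	/* ... so the scheduler callback still has to check for schedutil. */
}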

----------8<----------
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b0dfd3222013..6300668ac67a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -25,6 +25,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/sched/cpufreq.h>
 #include <linux/slab.h>
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
@@ -2271,6 +2272,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 		ret = cpufreq_start_governor(policy);
 		if (!ret) {
 			pr_debug("cpufreq: governor change\n");
+			sched_governor_change(policy, old_gov);
 			return 0;
 		}
 		cpufreq_exit_governor(policy);
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index afa940cd50dc..33b77eed8a41 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_SCHED_CPUFREQ_H
 #define _LINUX_SCHED_CPUFREQ_H
 
+#include <linux/cpufreq.h>
 #include <linux/types.h>
 
 /*
@@ -28,4 +29,12 @@ static inline unsigned long map_util_freq(unsigned long util,
 }
 #endif /* CONFIG_CPU_FREQ */
 
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+void sched_governor_change(struct cpufreq_policy *policy,
+			struct cpufreq_governor *old_gov);
+#else
+static inline void sched_governor_change(struct cpufreq_policy *policy,
+			struct cpufreq_governor *old_gov) { }
+#endif
+
 #endif /* _LINUX_SCHED_CPUFREQ_H */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 8356cb0072a6..2ff40b2a8ba0 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -632,7 +632,7 @@ static struct kobj_type sugov_tunables_ktype = {
 
 /********************** cpufreq governor interface *********************/
 
-static struct cpufreq_governor schedutil_gov;
+struct cpufreq_governor schedutil_gov;
 
 static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
 {
@@ -891,7 +891,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
 	sg_policy->need_freq_update = true;
 }
 
-static struct cpufreq_governor schedutil_gov = {
+struct cpufreq_governor schedutil_gov = {
 	.name			= "schedutil",
 	.owner			= THIS_MODULE,
 	.dynamic_switching	= true,
@@ -914,3 +914,31 @@ static int __init sugov_register(void)
 	return cpufreq_register_governor(&schedutil_gov);
 }
 fs_initcall(sugov_register);
+
+#ifdef CONFIG_ENERGY_MODEL
+extern bool sched_energy_update;
+static DEFINE_MUTEX(rebuild_sd_mutex);
+
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+	mutex_lock(&rebuild_sd_mutex);
+	sched_energy_update = true;
+	rebuild_sched_domains();
+	sched_energy_update = false;
+	mutex_unlock(&rebuild_sd_mutex);
+}
+static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+/*
+ * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
+ * on governor changes to make sure the scheduler knows about it.
+ */
+void sched_governor_change(struct cpufreq_policy *policy,
+				  struct cpufreq_governor *old_gov)
+{
+	if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
+		/* Sched domains cannot be rebuilt directly from this context */
+		schedule_work(&rebuild_sd_work);
+	}
+}
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e594a854977f..915766600568 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2265,10 +2265,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
 }
 #endif
 
-#ifdef CONFIG_SMP
-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 #define perf_domain_span(pd) (to_cpumask(((pd)->obj->cpus)))
 #else
 #define perf_domain_span(pd) NULL
 #endif
-#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ae329447a082..781f3eba840e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -209,7 +209,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
  */
 DEFINE_STATIC_KEY_FALSE(sched_energy_present);
 
-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+bool sched_energy_update;
+
 static void free_pd(struct perf_domain *pd)
 {
 	struct perf_domain *tmp;
@@ -297,12 +299,15 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
  */
 #define EM_MAX_COMPLEXITY 2048
 
+extern struct cpufreq_governor schedutil_gov;
 static void build_perf_domains(const struct cpumask *cpu_map)
 {
 	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
+	struct cpufreq_policy *policy;
+	struct cpufreq_governor *gov;
 
 	/* EAS is enabled for asymmetric CPU capacity topologies. */
 	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
@@ -318,6 +323,15 @@ static void build_perf_domains(const struct cpumask *cpu_map)
 		if (find_pd(pd, i))
 			continue;
 
+		/* Do not attempt EAS if schedutil is not being used. */
+		policy = cpufreq_cpu_get(i);
+		if (!policy)
+			goto free;
+		gov = policy->governor;
+		cpufreq_cpu_put(policy);
+		if (gov != &schedutil_gov)
+			goto free;
+
 		/* Create the new pd and add it to the local list. */
 		tmp = pd_init(i);
 		if (!tmp)
@@ -389,7 +403,7 @@ static void sched_energy_start(int ndoms_new, cpumask_var_t doms_new[])
 }
 #else
 static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL */
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
 
 static void free_rootdomain(struct rcu_head *rcu)
 {
@@ -2190,10 +2204,10 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 		;
 	}
 
-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	/* Build perf. domains: */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !sched_energy_update; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
 			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd)
 				goto match3;
