Message-ID: <CAC4B8726E86A142B27A9E9A2F2F12470191AAD4C3@rrsmsx505.amr.corp.intel.com>
Date: Tue, 26 Jul 2011 11:22:32 -0600
From: "Yung, Winson W" <winson.w.yung@...el.com>
To: "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
CC: "venki@...gle.com" <venki@...gle.com>,
"Van De Ven, Arjan" <arjan.van.de.ven@...el.com>
Subject: [PATCH] [RFC] ondemand governor: dynamic cpufreq scaling with
different CPUs
While looking at how the ondemand governor uses its delayed work
queue, I came to realize that there are certain scenarios in which
the governor may stop re-adjusting the CPU frequency for some
duration. Here is a brief description of the problem:
Currently, when the ondemand governor starts, it creates a kthread
for each CPU: kondemand/0, kondemand/1 and so on. I found from
ftrace that kondemand/0 is the only kthread that handles dynamic
CPU frequency scaling; the other kthreads, such as kondemand/1,
are never used. I also confirmed this by debugging through the
kernel code.
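
For reference, the sampling here is driven by deferrable delayed
work armed per CPU. The toy module sketch below (written against
the 2.6.39-era workqueue API; the names sample_info and sample_fn
are mine, not the governor's) shows the pattern: each CPU re-arms
its own deferrable work item, and while that CPU is idle the work
stays parked and does not fire.

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/cpu.h>
#include <linux/percpu.h>

struct sample_info {
	struct delayed_work work;
	int cpu;
};

static DEFINE_PER_CPU(struct sample_info, sample_info);

static void sample_fn(struct work_struct *work)
{
	struct sample_info *si =
		container_of(work, struct sample_info, work.work);

	/* Re-arm on the owning CPU. Being deferrable, this work
	 * will not wake the CPU from idle; it stays parked until
	 * the CPU runs again. */
	schedule_delayed_work_on(si->cpu, &si->work, HZ / 10);
}

static int __init sample_init(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		struct sample_info *si = &per_cpu(sample_info, cpu);

		si->cpu = cpu;
		INIT_DELAYED_WORK_DEFERRABLE(&si->work, sample_fn);
		schedule_delayed_work_on(cpu, &si->work, HZ / 10);
	}
	return 0;
}

static void __exit sample_exit(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		cancel_delayed_work_sync(&per_cpu(sample_info, cpu).work);
}

module_init(sample_init);
module_exit(sample_exit);
MODULE_LICENSE("GPL");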
So if no timer fires on CPU-0 (i.e. CPU-0 is idle in C6), no
deferrable timer will ever be fired by kondemand/0 to sample the
workload of all CPUs. Since kondemand/0 is the only kthread doing
CPU frequency scaling for all CPUs, it is possible to end up in a
situation where frequency scaling stops working for some duration,
until CPU-0 becomes non-idle again.
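
The patch below therefore arms the sampling work on every CPU in
the policy and lets whichever CPU's timer fires first claim each
sampling round, using an atomic cmpxchg on a shared timestamp so
rounds are not duplicated. As a plain user-space C analogue of
that election (the names last_sample, timer_cb and sample_all_cpus
are illustrative only):

#include <stdint.h>
#include <stdio.h>
#include <stdatomic.h>

static _Atomic uint64_t last_sample;	/* plays last_do_dbs_time */
static uint64_t now;			/* plays get_jiffies_64()  */
static const uint64_t delay = 10;	/* plays policy last_delay */

static void sample_all_cpus(int cpu)
{
	printf("CPU%d won this round and samples all CPUs\n", cpu);
}

/* Called from every CPU's timer; at most one caller per 'delay'
 * window actually does the sampling work. */
static void timer_cb(int cpu)
{
	uint64_t seen = atomic_load(&last_sample);

	/* A round completed recently; nothing to do. */
	if (now - seen < delay)
		return;

	/* Several CPUs may race to here; cmpxchg picks one winner
	 * and the losers back off. */
	if (!atomic_compare_exchange_strong(&last_sample, &seen, now))
		return;

	sample_all_cpus(cpu);
}

int main(void)
{
	now = 100;
	timer_cb(0);	/* wins the round */
	timer_cb(1);	/* backs off: delta < delay */
	now = 115;
	timer_cb(1);	/* wins the next round */
	return 0;
}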
Signed-off-by: Hari K Kanigeri <hari.kkanigeri@...el.com>
---
drivers/cpufreq/cpufreq_ondemand.c | 93 +++++++++++++++++++++++++++++++----
include/linux/cpufreq.h | 3 +
2 files changed, 85 insertions(+), 11 deletions(-)
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 891360e..b4f4f9d 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -541,6 +541,9 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
static void do_dbs_timer(struct work_struct *work)
{
+ u64 old_time, new_do_dbs_time;
+ u64 saved_last_time;
+
struct cpu_dbs_info_s *dbs_info =
container_of(work, struct cpu_dbs_info_s, work.work);
unsigned int cpu = dbs_info->cpu;
@@ -548,6 +551,41 @@ static void do_dbs_timer(struct work_struct *work)
int delay;
+ new_do_dbs_time = get_jiffies_64();
+
+	/* Save a copy of last_do_dbs_time, the time do_dbs_timer last ran */
+
+ saved_last_time = atomic64_read(
+ &dbs_info->cur_policy->last_do_dbs_time);
+
+	/* If the last time do_dbs_timer ran is less  */
+	/* than 'delay' delta from the current time,  */
+	/* one of the other CPUs is still handling    */
+	/* cpufreq governing for all CPUs.            */
+
+ if ((new_do_dbs_time - saved_last_time) <
+ dbs_info->cur_policy->last_delay) {
+
+	/* If do_dbs_timer ran not long ago */
+	/* (delta is less than last_delay), */
+	/* bail out; there is no need to    */
+	/* sample again so soon.            */
+ goto do_exit;
+ }
+
+ old_time = atomic64_cmpxchg(&dbs_info->cur_policy->last_do_dbs_time,
+ saved_last_time, new_do_dbs_time);
+
+	/* If old_time is not equal to saved_last_time, */
+	/* another CPU is already running do_dbs_timer. */
+
+ if (old_time != saved_last_time)
+ goto do_exit;
+
+	/* Record this CPU as the one now driving the policy */
+ if (dbs_info->cur_policy->cpu != cpu)
+ dbs_info->cur_policy->cpu = cpu;
+
mutex_lock(&dbs_info->timer_mutex);
/* Common NORMAL_SAMPLE setup */
@@ -559,6 +597,7 @@ static void do_dbs_timer(struct work_struct *work)
/* Setup timer for SUB_SAMPLE */
dbs_info->sample_type = DBS_SUB_SAMPLE;
delay = dbs_info->freq_hi_jiffies;
+ dbs_info->cur_policy->last_delay = delay;
} else {
/* We want all CPUs to do sampling nearly on
* same jiffy
@@ -566,14 +605,18 @@ static void do_dbs_timer(struct work_struct *work)
delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate
* dbs_info->rate_mult);
- if (num_online_cpus() > 1)
+ if (num_online_cpus() > 1) {
delay -= jiffies % delay;
+ dbs_info->cur_policy->last_delay = delay;
+ }
}
} else {
__cpufreq_driver_target(dbs_info->cur_policy,
dbs_info->freq_lo, CPUFREQ_RELATION_H);
delay = dbs_info->freq_lo_jiffies;
+ dbs_info->cur_policy->last_delay = delay;
}
	schedule_delayed_work_on(cpu, &dbs_info->work, delay);
	mutex_unlock(&dbs_info->timer_mutex);
+	return;
+
+do_exit:
+	/* Not holding timer_mutex on this path, and 'delay' was */
+	/* never computed; re-arm with the policy's last delay.   */
+	schedule_delayed_work_on(cpu, &dbs_info->work,
+			dbs_info->cur_policy->last_delay);
}
@@ -648,10 +691,12 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
j_dbs_info->prev_cpu_nice =
kstat_cpu(j).cpustat.nice;
}
+
+ j_dbs_info->cpu = j;
+ j_dbs_info->rate_mult = 1;
+ ondemand_powersave_bias_init_cpu(j);
}
- this_dbs_info->cpu = cpu;
- this_dbs_info->rate_mult = 1;
- ondemand_powersave_bias_init_cpu(cpu);
+
/*
* Start the timerschedule work, when this governor
* is used for first time
@@ -680,32 +725,58 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
}
mutex_unlock(&dbs_mutex);
- mutex_init(&this_dbs_info->timer_mutex);
- dbs_timer_init(this_dbs_info);
+ for_each_cpu(j, policy->cpus) {
+ struct cpu_dbs_info_s *j_dbs_info;
+ j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
+ mutex_init(&j_dbs_info->timer_mutex);
+ dbs_timer_init(j_dbs_info);
+ }
break;
case CPUFREQ_GOV_STOP:
- dbs_timer_exit(this_dbs_info);
+ for_each_cpu(j, policy->cpus) {
+ struct cpu_dbs_info_s *j_dbs_info;
+ j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
+ dbs_timer_exit(j_dbs_info);
+ }
mutex_lock(&dbs_mutex);
- mutex_destroy(&this_dbs_info->timer_mutex);
+
+ for_each_cpu(j, policy->cpus) {
+ struct cpu_dbs_info_s *j_dbs_info;
+ j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
+ mutex_destroy(&j_dbs_info->timer_mutex);
+ }
+
dbs_enable--;
mutex_unlock(&dbs_mutex);
if (!dbs_enable)
sysfs_remove_group(cpufreq_global_kobject,
&dbs_attr_group);
-
break;
case CPUFREQ_GOV_LIMITS:
- mutex_lock(&this_dbs_info->timer_mutex);
+	/* Since the ondemand governor no longer runs on one */
+	/* fixed CPU, take every CPU's timer_mutex before    */
+	/* calling __cpufreq_driver_target.                  */
+ for_each_cpu(j, policy->cpus) {
+ struct cpu_dbs_info_s *j_dbs_info;
+ j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
+ mutex_lock(&j_dbs_info->timer_mutex);
+ }
+
if (policy->max < this_dbs_info->cur_policy->cur)
__cpufreq_driver_target(this_dbs_info->cur_policy,
policy->max, CPUFREQ_RELATION_H);
else if (policy->min > this_dbs_info->cur_policy->cur)
__cpufreq_driver_target(this_dbs_info->cur_policy,
policy->min, CPUFREQ_RELATION_L);
- mutex_unlock(&this_dbs_info->timer_mutex);
+
+ for_each_cpu(j, policy->cpus) {
+ struct cpu_dbs_info_s *j_dbs_info;
+ j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
+ mutex_unlock(&j_dbs_info->timer_mutex);
+ }
break;
}
return 0;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 11be48e..2295a35 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -106,6 +106,9 @@ struct cpufreq_policy {
struct kobject kobj;
struct completion kobj_unregister;
+
+	atomic64_t last_do_dbs_time;
+ int last_delay;
};
#define CPUFREQ_ADJUST (0)
--
1.7.1