lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Fri, 11 Mar 2016 16:38:21 +0100
From:	Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To:	linux-kernel@...r.kernel.org
Cc:	Jacob Pan <jacob.jun.pan@...ux.intel.com>, rt@...utronix.de,
	Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
	Zhang Rui <rui.zhang@...el.com>,
	Eduardo Valentin <edubezval@...il.com>,
	linux-pm@...r.kernel.org
Subject: [PATCH] thermal/intel_powerclamp: convert to smpboot thread

Oh boy oh boy. This thing runs at SCHED_FIFO MAX_USER_RT_PRIO/2 and
stops at mwait_idle_with_hints(). Why bother with /2?
There are a few things I haven't fully decoded. For instance why is it
looking at local_softirq_pending()?
The timer is probably here if mwait would let it sleep too long.

I tried to convert it over to smpboot thread so we don't have that CPU
notifier stuff to fire the cpu threads during hotplug events.

the smp_mb() barriers are not documented - I just shifted the code to
the left.

The code / logic itself could be a little more inteligent and only wake
up the threads for the CPUs that are about to idle but it seems it is
done on all of the at once unless I missed something.

Cc: Zhang Rui <rui.zhang@...el.com>
Cc: Eduardo Valentin <edubezval@...il.com>
Cc: linux-pm@...r.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
---
 drivers/thermal/intel_powerclamp.c | 315 ++++++++++++++++---------------------
 1 file changed, 136 insertions(+), 179 deletions(-)

diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 6c79588251d5..e04f7631426a 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -51,6 +51,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/sched/rt.h>
+#include <linux/smpboot.h>
 
 #include <asm/nmi.h>
 #include <asm/msr.h>
@@ -85,9 +86,9 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update
 				  * can be offlined.
 				  */
 static bool clamping;
+static DEFINE_PER_CPU(struct task_struct *, clamp_kthreads);
+static DEFINE_PER_CPU(struct timer_list, clamp_timer);
 
-
-static struct task_struct * __percpu *powerclamp_thread;
 static struct thermal_cooling_device *cooling_dev;
 static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
 					   * clamping thread
@@ -368,100 +369,82 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
 	return set_target_ratio + guard <= current_ratio;
 }
 
-static int clamp_thread(void *arg)
+static void clamp_thread_fn(unsigned int cpu)
 {
-	int cpunr = (unsigned long)arg;
-	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
-	static const struct sched_param param = {
-		.sched_priority = MAX_USER_RT_PRIO/2,
-	};
 	unsigned int count = 0;
 	unsigned int target_ratio;
+	int sleeptime;
+	unsigned long target_jiffies;
+	unsigned int guard;
+	unsigned int compensation = 0;
+	int interval; /* jiffies to sleep for each attempt */
+	unsigned int duration_jiffies = msecs_to_jiffies(duration);
+	unsigned int window_size_now;
+	struct timer_list *wake_timer = per_cpu_ptr(&clamp_timer, cpu);
 
-	set_bit(cpunr, cpu_clamping_mask);
-	set_freezable();
-	init_timer_on_stack(&wakeup_timer);
-	sched_setscheduler(current, SCHED_FIFO, &param);
+	/*
+	 * make sure user selected ratio does not take effect until
+	 * the next round. adjust target_ratio if user has changed
+	 * target such that we can converge quickly.
+	 */
+	target_ratio = set_target_ratio;
+	guard = 1 + target_ratio/20;
+	window_size_now = window_size;
+	count++;
 
-	while (true == clamping && !kthread_should_stop() &&
-		cpu_online(cpunr)) {
-		int sleeptime;
-		unsigned long target_jiffies;
-		unsigned int guard;
-		unsigned int compensation = 0;
-		int interval; /* jiffies to sleep for each attempt */
-		unsigned int duration_jiffies = msecs_to_jiffies(duration);
-		unsigned int window_size_now;
+	/*
+	 * systems may have different ability to enter package level
+	 * c-states, thus we need to compensate the injected idle ratio
+	 * to achieve the actual target reported by the HW.
+	 */
+	compensation = get_compensation(target_ratio);
+	interval = duration_jiffies*100/(target_ratio+compensation);
 
-		try_to_freeze();
-		/*
-		 * make sure user selected ratio does not take effect until
-		 * the next round. adjust target_ratio if user has changed
-		 * target such that we can converge quickly.
-		 */
-		target_ratio = set_target_ratio;
-		guard = 1 + target_ratio/20;
-		window_size_now = window_size;
-		count++;
-
-		/*
-		 * systems may have different ability to enter package level
-		 * c-states, thus we need to compensate the injected idle ratio
-		 * to achieve the actual target reported by the HW.
-		 */
-		compensation = get_compensation(target_ratio);
-		interval = duration_jiffies*100/(target_ratio+compensation);
-
-		/* align idle time */
-		target_jiffies = roundup(jiffies, interval);
-		sleeptime = target_jiffies - jiffies;
-		if (sleeptime <= 0)
-			sleeptime = 1;
-		schedule_timeout_interruptible(sleeptime);
-		/*
-		 * only elected controlling cpu can collect stats and update
-		 * control parameters.
-		 */
-		if (cpunr == control_cpu && !(count%window_size_now)) {
-			should_skip =
-				powerclamp_adjust_controls(target_ratio,
-							guard, window_size_now);
-			smp_mb();
-		}
-
-		if (should_skip)
-			continue;
-
-		target_jiffies = jiffies + duration_jiffies;
-		mod_timer(&wakeup_timer, target_jiffies);
-		if (unlikely(local_softirq_pending()))
-			continue;
-		/*
-		 * stop tick sched during idle time, interrupts are still
-		 * allowed. thus jiffies are updated properly.
-		 */
-		preempt_disable();
-		/* mwait until target jiffies is reached */
-		while (time_before(jiffies, target_jiffies)) {
-			unsigned long ecx = 1;
-			unsigned long eax = target_mwait;
-
-			/*
-			 * REVISIT: may call enter_idle() to notify drivers who
-			 * can save power during cpu idle. same for exit_idle()
-			 */
-			local_touch_nmi();
-			stop_critical_timings();
-			mwait_idle_with_hints(eax, ecx);
-			start_critical_timings();
-			atomic_inc(&idle_wakeup_counter);
-		}
-		preempt_enable();
+	/* align idle time */
+	target_jiffies = roundup(jiffies, interval);
+	sleeptime = target_jiffies - jiffies;
+	if (sleeptime <= 0)
+		sleeptime = 1;
+	schedule_timeout_interruptible(sleeptime);
+	/*
+	 * only elected controlling cpu can collect stats and update
+	 * control parameters.
+	 */
+	if (cpu == control_cpu && !(count%window_size_now)) {
+		should_skip =
+			powerclamp_adjust_controls(target_ratio,
+						   guard, window_size_now);
+		smp_mb();
 	}
-	del_timer_sync(&wakeup_timer);
-	clear_bit(cpunr, cpu_clamping_mask);
 
-	return 0;
+	if (should_skip)
+		return;
+
+	target_jiffies = jiffies + duration_jiffies;
+	mod_timer(wake_timer, target_jiffies);
+	if (unlikely(local_softirq_pending()))
+		return;
+	/*
+	 * stop tick sched during idle time, interrupts are still
+	 * allowed. thus jiffies are updated properly.
+	 */
+	preempt_disable();
+	/* mwait until target jiffies is reached */
+	while (time_before(jiffies, target_jiffies)) {
+		unsigned long ecx = 1;
+		unsigned long eax = target_mwait;
+
+		/*
+		 * REVISIT: may call enter_idle() to notify drivers who
+		 * can save power during cpu idle. same for exit_idle()
+		 */
+		local_touch_nmi();
+		stop_critical_timings();
+		mwait_idle_with_hints(eax, ecx);
+		start_critical_timings();
+		atomic_inc(&idle_wakeup_counter);
+	}
+	preempt_enable();
 }
 
 /*
@@ -505,10 +488,64 @@ static void poll_pkg_cstate(struct work_struct *dummy)
 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
 }
 
+static void clamp_thread_setup(unsigned int cpu)
+{
+	struct timer_list *wake_timer;
+	static struct sched_param param = {
+		.sched_priority = MAX_USER_RT_PRIO/2,
+	};
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+	wake_timer = per_cpu_ptr(&clamp_timer, cpu);
+
+	setup_timer(wake_timer, noop_timer, 0);
+}
+
+static void clamp_thread_unpark(unsigned int cpu)
+{
+	set_bit(cpu, cpu_clamping_mask);
+	if (cpu == 0) {
+		control_cpu = 0;
+		smp_mb();
+	}
+}
+
+static void clamp_thread_park(unsigned int cpu)
+{
+	clear_bit(cpu, cpu_clamping_mask);
+	if (cpu == control_cpu) {
+		control_cpu = cpumask_any_but(cpu_online_mask, cpu);
+		smp_mb();
+	}
+	del_timer_sync(per_cpu_ptr(&clamp_timer, cpu));
+}
+
+static void clamp_thread_cleanup(unsigned int cpu, bool online)
+{
+	if (!online)
+		return;
+	clamp_thread_park(cpu);
+}
+
+static int clamp_thread_should_run(unsigned int cpu)
+{
+	return clamping == true;
+}
+
+static struct smp_hotplug_thread clamp_threads = {
+	.store			= &clamp_kthreads,
+	.setup			= clamp_thread_setup,
+	.cleanup		= clamp_thread_cleanup,
+	.thread_should_run	= clamp_thread_should_run,
+	.thread_fn		= clamp_thread_fn,
+	.park			= clamp_thread_park,
+	.unpark			= clamp_thread_unpark,
+	.thread_comm		= "kidle_inject/%u",
+};
+
 static int start_power_clamp(void)
 {
-	unsigned long cpu;
-	struct task_struct *thread;
+	unsigned int cpu;
 
 	/* check if pkg cstate counter is completely 0, abort in this case */
 	if (!has_pkg_state_counter()) {
@@ -528,23 +565,9 @@ static int start_power_clamp(void)
 	clamping = true;
 	schedule_delayed_work(&poll_pkg_cstate_work, 0);
 
-	/* start one thread per online cpu */
-	for_each_online_cpu(cpu) {
-		struct task_struct **p =
-			per_cpu_ptr(powerclamp_thread, cpu);
+	for_each_online_cpu(cpu)
+		wake_up_process(per_cpu_ptr(clamp_kthreads, cpu));
 
-		thread = kthread_create_on_node(clamp_thread,
-						(void *) cpu,
-						cpu_to_node(cpu),
-						"kidle_inject/%ld", cpu);
-		/* bind to cpu here */
-		if (likely(!IS_ERR(thread))) {
-			kthread_bind(thread, cpu);
-			wake_up_process(thread);
-			*p = thread;
-		}
-
-	}
 	put_online_cpus();
 
 	return 0;
@@ -552,9 +575,6 @@ static int start_power_clamp(void)
 
 static void end_power_clamp(void)
 {
-	int i;
-	struct task_struct *thread;
-
 	clamping = false;
 	/*
 	 * make clamping visible to other cpus and give per cpu clamping threads
@@ -562,63 +582,8 @@ static void end_power_clamp(void)
 	 */
 	smp_mb();
 	msleep(20);
-	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
-		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
-			pr_debug("clamping thread for cpu %d alive, kill\n", i);
-			thread = *per_cpu_ptr(powerclamp_thread, i);
-			kthread_stop(thread);
-		}
-	}
 }
 
-static int powerclamp_cpu_callback(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
-{
-	unsigned long cpu = (unsigned long)hcpu;
-	struct task_struct *thread;
-	struct task_struct **percpu_thread =
-		per_cpu_ptr(powerclamp_thread, cpu);
-
-	if (false == clamping)
-		goto exit_ok;
-
-	switch (action) {
-	case CPU_ONLINE:
-		thread = kthread_create_on_node(clamp_thread,
-						(void *) cpu,
-						cpu_to_node(cpu),
-						"kidle_inject/%lu", cpu);
-		if (likely(!IS_ERR(thread))) {
-			kthread_bind(thread, cpu);
-			wake_up_process(thread);
-			*percpu_thread = thread;
-		}
-		/* prefer BSP as controlling CPU */
-		if (cpu == 0) {
-			control_cpu = 0;
-			smp_mb();
-		}
-		break;
-	case CPU_DEAD:
-		if (test_bit(cpu, cpu_clamping_mask)) {
-			pr_err("cpu %lu dead but powerclamping thread is not\n",
-				cpu);
-			kthread_stop(*percpu_thread);
-		}
-		if (cpu == control_cpu) {
-			control_cpu = smp_processor_id();
-			smp_mb();
-		}
-	}
-
-exit_ok:
-	return NOTIFY_OK;
-}
-
-static struct notifier_block powerclamp_cpu_notifier = {
-	.notifier_call = powerclamp_cpu_callback,
-};
-
 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
 				 unsigned long *state)
 {
@@ -788,19 +753,15 @@ static int __init powerclamp_init(void)
 
 	/* set default limit, maybe adjusted during runtime based on feedback */
 	window_size = 2;
-	register_hotcpu_notifier(&powerclamp_cpu_notifier);
-
-	powerclamp_thread = alloc_percpu(struct task_struct *);
-	if (!powerclamp_thread) {
-		retval = -ENOMEM;
-		goto exit_unregister;
-	}
+	retval = smpboot_register_percpu_thread(&clamp_threads);
+	if (retval)
+		goto exit_free;
 
 	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
 						&powerclamp_cooling_ops);
 	if (IS_ERR(cooling_dev)) {
 		retval = -ENODEV;
-		goto exit_free_thread;
+		goto exit_free_smp_thread;
 	}
 
 	if (!duration)
@@ -809,11 +770,8 @@ static int __init powerclamp_init(void)
 	powerclamp_create_debug_files();
 
 	return 0;
-
-exit_free_thread:
-	free_percpu(powerclamp_thread);
-exit_unregister:
-	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
+exit_free_smp_thread:
+	smpboot_unregister_percpu_thread(&clamp_threads);
 exit_free:
 	kfree(cpu_clamping_mask);
 	return retval;
@@ -822,9 +780,8 @@ module_init(powerclamp_init);
 
 static void __exit powerclamp_exit(void)
 {
-	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
 	end_power_clamp();
-	free_percpu(powerclamp_thread);
+	smpboot_unregister_percpu_thread(&clamp_threads);
 	thermal_cooling_device_unregister(cooling_dev);
 	kfree(cpu_clamping_mask);
 
-- 
2.7.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ