linux-kernel - Re: [RFC PATCH v2 3/3] sched: introduce synchronized idle injection

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Tue, 10 Nov 2015 14:23:24 +0100
From:	Peter Zijlstra <peterz@...radead.org>
To:	Jacob Pan <jacob.jun.pan@...ux.intel.com>
Cc:	LKML <linux-kernel@...r.kernel.org>,
	Rafael Wysocki <rafael.j.wysocki@...el.com>,
	Len Brown <len.brown@...el.com>,
	Andi Kleen <andi.kleen@...el.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Paul Turner <pjt@...gle.com>,
	Tim Chen <tim.c.chen@...ux.intel.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Eduardo Valentin <edubezval@...il.com>,
	Punit Agrawal <punit.agrawal@....com>,
	Srinivas Pandruvada <srinivas.pandruvada@...ux.intel.com>
Subject: Re: [RFC PATCH v2 3/3] sched: introduce synchronized idle injection

On Mon, Nov 09, 2015 at 04:21:23PM -0800, Jacob Pan wrote:
> +++ b/include/trace/events/sched.h

> +/*
> + * Tracepoint for idle injection
> + */
> +TRACE_EVENT(sched_cfs_idle_inject,
> +
> +	TP_PROTO(char *msg, int throttled),
> +
> +	TP_ARGS(msg, throttled),
> +
> +	TP_STRUCT__entry(
> +		__string(msg, msg)
> +		__field(int, throttled)
> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(msg, msg);
> +		__entry->throttled = throttled;
> +	),
> +
> +	TP_printk("%s: throttled=%d", __get_str(msg), __entry->throttled)
> +);

So I hate tracepoints.. and I'd rather not see them. But at the very
least kill that @msg field and replace it with an enum or so.


> +/*
> + * Knobs for controlling percentage of time when idle is forced across all
> + * CPUs. This is a power management feature intended for achieving deepest
> + * and broadest idle without lower CPU frequencies to less optimal level.
> + * No action is taken if CPUs are natually idle.
> + */
> +#ifdef CONFIG_CFS_IDLE_INJECT
> +unsigned int sysctl_sched_cfs_idle_inject_pct;
> +unsigned int sysctl_sched_cfs_idle_inject_duration = 10UL;

since you're playing the ifdef game, you might as well also do:

/*
 * Idle-injection variant: bump cfs_rq->nr_running and, when the count was
 * zero before the increment and forced_idle is not set, flag the cfs_rq
 * as runnable.
 */
static inline void cfs_rq_nr_running_inc(struct cfs_rq *cfs_rq)
{
	/* Post-increment: the test sees the old count, i.e. the 0 -> 1 step. */
	if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
		cfs_rq->runnable = true;
}

/*
 * Idle-injection variant: report the cfs_rq's stored runnable flag rather
 * than deriving the answer from nr_running.
 */
static inline bool cfs_rq_runnable(struct cfs_rq *cfs_rq)
{
	return cfs_rq->runnable;
}

#else

/*
 * Variant for the #else branch (idle injection compiled out): only
 * increments cfs_rq->nr_running; no runnable flag is touched.
 */
static inline void cfs_rq_nr_running_inc(struct cfs_rq *cfs_rq)
{
	cfs_rq->nr_running++;
}

/*
 * Variant for the #else branch (idle injection compiled out): the cfs_rq
 * counts as runnable exactly when nr_running is non-zero.
 */
static inline bool cfs_rq_runnable(struct cfs_rq *cfs_rq)
{
	/* !! collapses the count to a strict 0/1 value. */
	return !!cfs_rq->nr_running;
}

> +#endif
> +
>  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
>  {
>  	lw->weight += inc;
> @@ -2334,7 +2346,9 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  		list_add(&se->group_node, &rq->cfs_tasks);
>  	}
>  #endif
> -	cfs_rq->nr_running++;
> +
> +	if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
> +		cfs_rq->runnable = true;

which makes that:
	cfs_rq_nr_running_inc();

>  }
>  
>  static void
> @@ -2347,7 +2361,9 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
>  		list_del_init(&se->group_node);
>  	}
> -	cfs_rq->nr_running--;
> +
> +	if (!--cfs_rq->nr_running && !cfs_rq->forced_idle)
> +		cfs_rq->runnable = false;

	cfs_rq_nr_running_dec();

>  }
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -5139,7 +5155,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
>  
>  again:
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> -	if (!cfs_rq->nr_running)
> +	if (!cfs_rq->runnable)

	if (!cfs_rq_runnable(cfs_rq))
>  		goto idle;
>  
>  	if (prev->sched_class != &fair_sched_class)

>  idle:
> +	if ((cfs_rq->forced_idle)) {
> +		if (unlikely(local_softirq_pending())) {
> +			trace_sched_cfs_idle_inject("softirq pending", 1);

> +			cfs_rq->forced_idle = false;
> +			cfs_rq->runnable = cfs_rq->nr_running;

maybe:
			__unthrottle_cfs_rq(cfs_rq); ?

> +			goto again;
> +		}
> +		trace_sched_cfs_idle_inject("forced idle", 1);
> +		return NULL;
> +	}
>  	/*
>  	 * This is OK, because current is on_cpu, which avoids it being picked
>  	 * for load-balance and preemption/IRQs are still disabled avoiding
> @@ -8318,3 +8344,350 @@ __init void init_sched_fair_class(void)
>  #endif /* SMP */
>  
>  }
> +
> +#ifdef CONFIG_CFS_IDLE_INJECT

> +static atomic_t idle_inject_active;

You only use atomic_{read,set} on this, therefore atomic_t is pointless.

> +static DEFINE_PER_CPU(struct hrtimer, idle_inject_timer);
> +static DEFINE_PER_CPU(bool, idle_injected);

I tend to prefer to not use bool as a storage class; it's ill-defined.

> +/* protect injection parameters from runtime changes */
> +static DEFINE_SPINLOCK(idle_inject_lock);

A global lock, yay :-), I think you want this to be a RAW_SPINLOCK
though. As on -RT this would want to actually run from IRQ context too.

> +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *hrtimer)
> +{
> +	struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
> +	int cpu = smp_processor_id();
> +	ktime_t now, delta, period;
> +	bool status;
> +
> +	now = hrtimer_cb_get_time(hrt);

You're not interested in the current time.

> +
> +	status = raw_cpu_read(idle_injected);
> +	if (status) {
> +		/*
> +		 * We were injecting idle in the last phase, let's forward the
> +		 * timer to the next period
> +		 *
> +		 * status: 1             0                1        0
> +		 * ____          ____________________           _______
> +		 *     |________|                    |_________|
> +		 *
> +		 *     |duration|      interval      |
> +		 *
> +		 *              ^ we are here
> +		 *                  forward to here: ^
> +		 */
> +		delta = ktime_sub(now, inject_start_time);
> +		period = ktime_add(ms_to_ktime(duration),
> +				ms_to_ktime(inject_interval));
> +		delta = ktime_roundup(delta, period);
> +		hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));

This doesn't make any sense. Who cares what the current time is.

> +	} else {
> +		/*
> +		 * We were not injecting idle in the last phase, let's forward
> +		 * timer after forced idle duration
> +		 * ____          ____________________           _______
> +		 *     |________|                    |_________|
> +		 *
> +		 *     |duration|      interval      |
> +		 *
> +		 *     ^ we are here
> +		 *              ^ forward timer to here
> +		 */
> +		hrtimer_set_expires(hrt, ktime_add(ms_to_ktime(duration), now));

Same here, we don't care about the current time. The timer was at the
previous start of injection, just forward it a whole period to find the
next injection slot.

> +	}

It looks like what you want is:

	hrtimer_forward(hrt, period);

unconditionally.

> +	raw_cpu_write(idle_injected, !status);
> +	trace_sched_cfs_idle_inject("idle sync timer", !status);
> +	if (status)
> +		unthrottle_rq(cpu);
> +	else
> +		throttle_rq(cpu);
> +
> +	return HRTIMER_RESTART;
> +}
> +
> +static void idle_inject_timer_start(void *info)
> +{
> +	int cpu = smp_processor_id();
> +	struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
> +
> +	this_cpu_write(idle_injected, true);
> +	set_bit(cpu, idle_inject_cpumask);
> +	hrtimer_start(hrt, ms_to_ktime(duration), HRTIMER_MODE_ABS_PINNED);
> +	hrtimer_set_expires(hrt, *(ktime_t *)info);

This is broken, _first_ set an expiration time, then start the timer.

Now you insert the timer into the RB tree on a previous expiration time,
then you modify the expiration time under it, effectively wrecking the
RB tree.

> +}

> +static void stop_idle_inject(void)
> +{
> +	int i;
> +	struct hrtimer *hrt;
> +
> +	if (bitmap_weight(idle_inject_cpumask, num_possible_cpus())) {

I don't get the point of this bitmap; with the cpu notifier you
basically ensure this is equal to online_mask.

Also, this weight test is pointless, if the bitmap is empty the
for_each_set_bit() should be of equal cost -- and afaict nothing calling
this is performance critical in the first place.

> +		for_each_set_bit(i, idle_inject_cpumask, num_possible_cpus()) {

> +			hrt = &per_cpu(idle_inject_timer, i);
> +			hrtimer_cancel(hrt);
> +			unthrottle_rq(i);
> +		}
> +	}
> +}
> +
> +static int idle_inject_cpu_callback(struct notifier_block *nfb,
> +				unsigned long action, void *hcpu)
> +{
> +	unsigned long cpu = (unsigned long)hcpu;
> +	struct hrtimer *hrt = &per_cpu(idle_inject_timer, cpu);
> +	ktime_t now, delta, period;
> +
> +	if (!atomic_read(&idle_inject_active))
> +		goto exit_ok;

We should never get here if that weren't set, right? I mean you
register/unregister these callbacks around setting that variable.

> +
> +	switch (action) {
> +	case CPU_STARTING:
> +		raw_cpu_write(idle_injected, true);
> +
> +		hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
> +		hrt->function = idle_inject_timer_fn;
> +		set_bit(cpu, idle_inject_cpumask);
> +
> +		now = hrtimer_cb_get_time(hrt);
> +		hrtimer_start(hrt, ms_to_ktime(duration),
> +			HRTIMER_MODE_ABS_PINNED);
> +		/*
> +		 * When a new CPU comes online, we need to make sure it aligns
> +		 * its phase with the rest of the CPUs. So we set the
> +		 * timer to the next period based on the common starting time,
> +		 * then start injecting idle time.
> +		 */
> +		spin_lock_irq(&idle_inject_lock);
> +		delta = ktime_sub(now, inject_start_time);
> +		period = ktime_add(ms_to_ktime(duration),
> +				ms_to_ktime(inject_interval));
> +		delta = ktime_roundup(delta, period);
> +		spin_unlock_irq(&idle_inject_lock);
> +		hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));

Same broken, you cannot call that on a timer you've already started.

> +		break;
> +	case CPU_DYING:
> +		clear_bit(cpu, idle_inject_cpumask);
> +		hrtimer_cancel(hrt);
> +		raw_cpu_write(idle_injected, false);
> +		unthrottle_rq(cpu);
> +		break;
> +	default:
> +		return NOTIFY_DONE;
> +	}
> +exit_ok:
> +	return NOTIFY_OK;
> +}
> +
> +static int idle_inject_pm_callback(struct notifier_block *self,
> +				unsigned long action, void *hcpu)
> +{
> +	switch (action) {
> +	case PM_HIBERNATION_PREPARE:
> +	case PM_SUSPEND_PREPARE:
> +		if (atomic_read(&idle_inject_active))
> +			stop_idle_inject();

As with the above, if that were false, this whole callback would not be
called, seeing how you unregister before actually clearing that
idle_inject_active thing.

> +		break;
> +	case PM_POST_HIBERNATION:
> +	case PM_POST_SUSPEND:
> +		printk("POST SUSPEND restart idle injection\n");

Seems a tad inconsistent, printing here but not when stopping it.

> +		start_idle_inject();
> +		break;
> +	default:
> +		break;
> +	}
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block idle_inject_pm_notifier = {
> +	.notifier_call = idle_inject_pm_callback,
> +};
> +
> +static struct notifier_block idle_inject_cpu_notifier = {
> +	.notifier_call = idle_inject_cpu_callback,
> +};
> +
> +static void end_idle_inject(void) {
> +	unregister_hotcpu_notifier(&idle_inject_cpu_notifier);
> +	unregister_pm_notifier(&idle_inject_pm_notifier);

As per the above, these callbacks will not happen hereafter, and will
thus never see:

> +	atomic_set(&idle_inject_active, 0);
> +	kfree(idle_inject_cpumask);
> +}
> +
> +static int prepare_idle_inject(void)
> +{
> +	int retval = 0;
> +	int bitmap_size;
> +	int cpu;
> +	struct hrtimer *hrt;
> +
> +	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);

This is incorrect, you want nr_cpu_ids. There is no guarantee the CPU
space does not contain holes. But seeing I still don't see the point of
the mask, this might all fix itself by killing it altogether.

> +	idle_inject_cpumask = kzalloc(bitmap_size, GFP_KERNEL);
> +	if (!idle_inject_cpumask)
> +		return -ENOMEM;
> +
> +	retval = register_pm_notifier(&idle_inject_pm_notifier);
> +	if (retval)
> +		goto exit_free;
> +	retval = register_hotcpu_notifier(&idle_inject_cpu_notifier);
> +	if (retval)
> +		goto exit_unregister_pm;
> +	get_online_cpus();
> +	for_each_online_cpu(cpu) {
> +		hrt = &per_cpu(idle_inject_timer, cpu);
> +		hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
> +		hrt->function = idle_inject_timer_fn;
> +	}
> +	put_online_cpus();
> +
> +	if (!duration)
> +		duration = DEFAULT_DURATION_MSECS;
> +
> +	return 0;
> +exit_unregister_pm:
> +	unregister_pm_notifier(&idle_inject_pm_notifier);
> +exit_free:
> +	kfree(idle_inject_cpumask);
> +	return retval;
> +}
> +
> +int proc_sched_cfs_idle_inject_pct_handler(struct ctl_table *table,
> +					int write,
> +					void __user *buffer,
> +					size_t *length,	loff_t *ppos)
> +{
> +	int ret;
> +
> +	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +	if (ret)
> +		goto out;
> +
> +	if (idle_pct != sysctl_sched_cfs_idle_inject_pct) {
> +		if (!idle_pct)
> +			start_idle_inject();
> +		else if (!sysctl_sched_cfs_idle_inject_pct) {
> +			stop_idle_inject();
> +			end_idle_inject();
> +		}
> +
> +		/* recompute injection parameters */
> +		spin_lock_irq(&idle_inject_lock);
> +		idle_pct = sysctl_sched_cfs_idle_inject_pct;
> +		/*
> +		 * duration is fixed for each injection period, we adjust
> +		 * non idle interval to satisfy the idle percentage set
> +		 * by the user. e.g. if duration is 10 and we want 33% idle
> +		 * then interval is 20.
> +		 * 33% idle
> +		 * ____          ___________________          _________
> +		 *     |________|                   |________| 33% idle
> +		 * ____          ________          _______
> +		 *     |________|        |________|  50% idle
> +		 *
> +		 *     |duration|interval|
> +		 */
> +		if (idle_pct)
> +			inject_interval = (duration * (100 - idle_pct))
> +				/ idle_pct;

This needs {} (or just exceed the 80 char thing).

> +		spin_unlock_irq(&idle_inject_lock);
> +
> +	}
> +out:
> +	return ret;
> +}
> +
> +int proc_sched_cfs_idle_inject_duration_handler(struct ctl_table *table,
> +						int write,
> +						void __user *buffer,
> +						size_t *length,	loff_t *ppos)
> +{
> +	int ret;
> +
> +	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +	if (ret)
> +		goto out;
> +
> +	if (duration == sysctl_sched_cfs_idle_inject_duration)
> +		goto out;
> +	/* recompute injection parameters */
> +	spin_lock_irq(&idle_inject_lock);
> +	duration = jiffies_to_msecs(sysctl_sched_cfs_idle_inject_duration);
> +	if (idle_pct)
> +		inject_interval = (duration * (100 - idle_pct)) / idle_pct;
> +
> +	spin_unlock_irq(&idle_inject_lock);
> +out:
> +	return ret;
> +}

And since you have proc handlers for both these, why not convert to
ktime here and avoid the endless ms_to_ktime() calls?

Also, maybe precompute the period, since that is what you really need.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/