Message-ID: <20170504153256.fbmglqe2zjo6ika2@hirez.programming.kicks-ass.net>
Date:   Thu, 4 May 2017 17:32:56 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Steven Rostedt <rostedt@...dmis.org>
Cc:     LKML <linux-kernel@...r.kernel.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...nel.org>,
        Clark Williams <williams@...hat.com>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        John Kacur <jkacur@...hat.com>, Scott Wood <swood@...hat.com>
Subject: Re: [PATCH tip/sched/core v2] sched/rt: Simplify the IPI rt
 balancing logic

On Mon, Apr 24, 2017 at 11:47:32AM -0400, Steven Rostedt wrote:
>  static int rto_next_cpu(struct rq *rq)
>  {
>  	int cpu;
>  
>  	/*
> +	 * When starting the IPI RT pushing, the rto_cpu is set to nr_cpu_ids
> +	 * or greater. rto_next_cpu() will simply return the first CPU found in
> +	 * the rto_mask.
> +	 *
> +	 * If rto_next_cpu() is called with rto_cpu less than nr_cpu_ids, it
> +	 * will return the next CPU found in the rto_mask.
> +	 *
> +	 * If there are no more CPUs left in the rto_mask, then a check is made
> +	 * against rto_loop and rto_loop_next. rto_loop is only updated with
> +	 * the rto_lock held, but any CPU may increment the rto_loop_next
> +	 * without any locking.
>  	 */
> +again:
> +	if (rq->rd->rto_cpu >= nr_cpu_ids) {
>  		cpu = cpumask_first(rq->rd->rto_mask);
> +		rq->rd->rto_cpu = cpu;
> +		/* If cpu is nr_cpu_ids, then there are no overloaded rqs */
> +		return cpu;
>  	}
>  
> +	cpu = cpumask_next(rq->rd->rto_cpu, rq->rd->rto_mask);
> +	rq->rd->rto_cpu = cpu;
>  
> +	if (cpu < nr_cpu_ids)
> +		return cpu;
>  
> +	if (rq->rd->rto_loop == atomic_read(&rq->rd->rto_loop_next))
> +		return cpu;
>  
> +	rq->rd->rto_loop = atomic_read(&rq->rd->rto_loop_next);
> +	goto again;
> +}

I think you want to write that as:

	struct root_domain *rd = rq->rd;
	int cpu, next;

	/* comment */
	for (;;) { 
		if (rd->rto_cpu >= nr_cpu_ids) {
			cpu = cpumask_first(rd->rto_mask);
			rd->rto_cpu = cpu;
			return cpu;
		}

		cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
		rd->rto_cpu = cpu;

		if (cpu < nr_cpu_ids)
			break;

//		rd->rto_cpu = -1;

		/*
		 * ACQUIRE ensures we see the @rto_mask changes
		 * made prior to the @next value observed.
		 * 
		 * Matches WMB in rt_set_overload().
		 */
		next = atomic_read_acquire(&rd->rto_loop_next);

		if (rd->rto_loop == next)
			break;

		rd->rto_loop = next;
	}

	return cpu;

And I don't fully understand the whole rto_cpu >= nr_cpu_ids thing;
can't you simply reset it to -1 and always use cpumask_next()?
As per the // comment above?
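
A minimal sketch of that -1 variant, assuming rto_cpu is initialised to
-1 when the root domain is set up (cpumask_next(-1, mask) is equivalent
to cpumask_first(mask)):

	static int rto_next_cpu(struct rq *rq)
	{
		struct root_domain *rd = rq->rd;
		int cpu, next;

		for (;;) {
			/* rto_cpu == -1 makes this start at the first set bit */
			cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
			rd->rto_cpu = cpu;

			if (cpu < nr_cpu_ids)
				break;

			rd->rto_cpu = -1;

			/*
			 * ACQUIRE ensures we see the @rto_mask changes
			 * made prior to the @next value observed.
			 */
			next = atomic_read_acquire(&rd->rto_loop_next);

			if (rd->rto_loop == next)
				break;

			rd->rto_loop = next;
		}

		return cpu;
	}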

> +static inline bool rto_start_trylock(atomic_t *v)
> +{
> +	return !atomic_cmpxchg(v, 0, 1);

Arguably this could be: !atomic_cmpxchg_acquire(v, 0, 1);

>  }
>  
> +static inline void rto_start_unlock(atomic_t *v)
> +{
> +	atomic_set_release(v, 0);
> +}
>  
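
With the acquire variant, the pair would read as follows (a sketch; the
ACQUIRE on a successful trylock then pairs with the RELEASE in
rto_start_unlock()):

	static inline bool rto_start_trylock(atomic_t *v)
	{
		/* ACQUIRE: the critical section cannot creep before the 0 -> 1 */
		return !atomic_cmpxchg_acquire(v, 0, 1);
	}

	static inline void rto_start_unlock(atomic_t *v)
	{
		/* RELEASE: the critical section completes before the store of 0 */
		atomic_set_release(v, 0);
	}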

>  static void tell_cpu_to_push(struct rq *rq)
>  {
> +	int cpu = nr_cpu_ids;
>  
> +	/* Keep the loop going if the IPI is currently active */
> +	atomic_inc_return(&rq->rd->rto_loop_next);

Since rt_set_overload() already provides a WMB, we don't need an
ordered primitive here and atomic_inc() is fine.
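
Spelled out (assuming rt_set_overload() sets the bit in @rto_mask and
issues its smp_wmb() before tell_cpu_to_push() runs, which is what the
"Matches WMB" comment above relies on):

	Writer (enqueue path):                 Reader (rto_next_cpu()):

	  cpumask_set_cpu(cpu, rd->rto_mask);
	  smp_wmb();  /* rt_set_overload() */
	  atomic_inc(&rd->rto_loop_next);      next = atomic_read_acquire(&rd->rto_loop_next);
	                                       cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);

A reader that observes the new rto_loop_next value is thus guaranteed
to also observe the rto_mask bit set before it.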

>  
> +	/* Only one CPU can initiate a loop at a time */
> +	if (!rto_start_trylock(&rq->rd->rto_loop_start))
>  		return;
>  
> +	raw_spin_lock(&rq->rd->rto_lock);
> +
> +	/*
> +	 * The rto_cpu is updated under the lock, if it has a valid cpu
> +	 * then the IPI is still running and will continue due to the
> +	 * update to loop_next, and nothing needs to be done here.
> +	 * Otherwise it is finishing up and an IPI needs to be sent.
> +	 */
> +	if (rq->rd->rto_cpu >= nr_cpu_ids)
//	if (rq->rd->rto_cpu < 0)

> +		cpu = rto_next_cpu(rq);
>  
> +	raw_spin_unlock(&rq->rd->rto_lock);
> +
> +	rto_start_unlock(&rq->rd->rto_loop_start);
> +
> +	if (cpu < nr_cpu_ids)
> +		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
>  }
>  
>  /* Called from hardirq context */
> +void rto_push_irq_work_func(struct irq_work *work)
>  {
> +	struct rq *rq;
>  	int this_cpu;
>  	int cpu;
>  
> +	this_cpu = smp_processor_id();
>  	rq = cpu_rq(this_cpu);

	rq = this_rq();

>  
> +	/*
> +	 * We do not need to grab the lock to check for has_pushable_tasks.
> +	 * When it gets updated, a check is made if a push is possible.
> +	 */
>  	if (has_pushable_tasks(rq)) {
>  		raw_spin_lock(&rq->lock);
> +		push_rt_tasks(rq);
>  		raw_spin_unlock(&rq->lock);
>  	}
>  
> +	raw_spin_lock(&rq->rd->rto_lock);
>  
> +	/* Pass the IPI to the next rt overloaded queue */
> +	cpu = rto_next_cpu(rq);
>  
> +	raw_spin_unlock(&rq->rd->rto_lock);
>  
>  	if (cpu >= nr_cpu_ids)
>  		return;
>  
>  	/* Try the next RT overloaded CPU */
> +	irq_work_queue_on(&rq->rd->rto_push_work, cpu);
>  }
