lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Date:	Tue, 12 Aug 2014 14:27:07 +0400
From:	Kirill Tkhai <ktkhai@...allels.com>
To:	Peter Zijlstra <peterz@...radead.org>
CC:	<linux-kernel@...r.kernel.org>, <pjt@...gle.com>,
	<oleg@...hat.com>, <rostedt@...dmis.org>,
	<umgwanakikbuti@...il.com>, <tkhai@...dex.ru>,
	<tim.c.chen@...ux.intel.com>, <mingo@...nel.org>,
	<nicolas.pitre@...aro.org>
Subject: Re: [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from
 load_balance()

В Вт, 12/08/2014 в 11:36 +0200, Peter Zijlstra пишет:
> 
> Changed quite a bit around, _should_ be more or less the same end result
> I suppose. Only compile tested so far.

No objections. I've tested it in a virtual machine; simple "make -jxx"
test passed.

> 
> ---
> Subject: sched/fair: Remove double_lock_balance() from load_balance()
> From: Kirill Tkhai <ktkhai@...allels.com>
> Date: Wed, 6 Aug 2014 12:07:04 +0400
> 
> Avoid double_rq_lock() and use ONRQ_MIGRATING for load_balance(). The
> advantage is (obviously) not holding two 'rq->lock's at the same time
> and thereby increasing parallelism.
> 
> Further note that if there was no task to migrate we will not have
> acquired the second rq->lock at all.
> 
> The important point to note is that because we acquire dst->lock
> immediately after releasing src->lock the potential wait time of
> task_rq_lock() callers on ONRQ_MIGRATING is not longer than it would
> have been in the double rq lock scenario.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@...allels.com>
> Signed-off-by: Peter Zijlstra <peterz@...radead.org>
> Link: http://lkml.kernel.org/r/1407312424.8424.48.camel@tkhai
> ---
>  kernel/sched/fair.c |  151 ++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 99 insertions(+), 52 deletions(-)
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4706,7 +4706,7 @@ static void check_preempt_wakeup(struct
>  		return;
>  
>  	/*
> -	 * This is possible from callers such as move_task(), in which we
> +	 * This is possible from callers such as attach_tasks(), in which we
>  	 * unconditionally check_prempt_curr() after an enqueue (which may have
>  	 * lead to a throttle).  This both saves work and prevents false
>  	 * next-buddy nomination below.
> @@ -5114,21 +5114,10 @@ struct lb_env {
>  	unsigned int		loop_max;
>  
>  	enum fbq_type		fbq_type;
> +	struct list_head	tasks;
>  };
>  
>  /*
> - * move_task - move a task from one runqueue to another runqueue.
> - * Both runqueues must be locked.
> - */
> -static void move_task(struct task_struct *p, struct lb_env *env)
> -{
> -	deactivate_task(env->src_rq, p, 0);
> -	set_task_cpu(p, env->dst_cpu);
> -	activate_task(env->dst_rq, p, 0);
> -	check_preempt_curr(env->dst_rq, p, 0);
> -}
> -
> -/*
>   * Is this task likely cache-hot:
>   */
>  static int task_hot(struct task_struct *p, struct lb_env *env)
> @@ -5343,6 +5332,18 @@ int can_migrate_task(struct task_struct
>  }
>  
>  /*
> + * detach_task() -- detach the task for the migration specified in env
> + */
> +static void detach_task(struct task_struct *p, struct lb_env *env)
> +{
> +	lockdep_assert_held(&env->src_rq->lock);
> +
> +	deactivate_task(env->src_rq, p, 0);
> +	p->on_rq = ONRQ_MIGRATING;
> +	set_task_cpu(p, env->dst_cpu);
> +}
> +
> +/*
>   * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
>   * part of active balancing operations within "domain".
>   *
> @@ -5358,15 +5359,13 @@ static struct task_struct *detach_one_ta
>  		if (!can_migrate_task(p, env))
>  			continue;
>  
> -		deactivate_task(env->src_rq, p, 0);
> -		p->on_rq = ONRQ_MIGRATING;
> -		set_task_cpu(p, env->dst_cpu);
> +		detach_task(p, env);
>  
>  		/*
>  		 * Right now, this is only the second place where
> -		 * lb_gained[env->idle] is updated (other is move_tasks)
> +		 * lb_gained[env->idle] is updated (other is detach_tasks)
>  		 * so we can safely collect stats here rather than
> -		 * inside move_tasks().
> +		 * inside detach_tasks().
>  		 */
>  		schedstat_inc(env->sd, lb_gained[env->idle]);
>  		return p;
> @@ -5374,35 +5373,22 @@ static struct task_struct *detach_one_ta
>  	return NULL;
>  }
>  
> -/*
> - * attach_one_task() -- attaches the task returned from detach_one_task() to
> - * its new rq.
> - */
> -static void attach_one_task(struct rq *rq, struct task_struct *p)
> -{
> -	raw_spin_lock(&rq->lock);
> -	BUG_ON(task_rq(p) != rq);
> -	p->on_rq = ONRQ_QUEUED;
> -	activate_task(rq, p, 0);
> -	check_preempt_curr(rq, p, 0);
> -	raw_spin_unlock(&rq->lock);
> -}
> -
>  static const unsigned int sched_nr_migrate_break = 32;
>  
>  /*
> - * move_tasks tries to move up to imbalance weighted load from busiest to
> - * this_rq, as part of a balancing operation within domain "sd".
> - * Returns 1 if successful and 0 otherwise.
> + * detach_tasks() -- tries to detach up to imbalance weighted load from
> + * busiest_rq, as part of a balancing operation within domain "sd".
>   *
> - * Called with both runqueues locked.
> + * Returns number of detached tasks if successful and 0 otherwise.
>   */
> -static int move_tasks(struct lb_env *env)
> +static int detach_tasks(struct lb_env *env)
>  {
>  	struct list_head *tasks = &env->src_rq->cfs_tasks;
>  	struct task_struct *p;
>  	unsigned long load;
> -	int pulled = 0;
> +	int detached = 0;
> +
> +	lockdep_assert_held(&env->src_rq->lock);
>  
>  	if (env->imbalance <= 0)
>  		return 0;
> @@ -5433,14 +5419,16 @@ static int move_tasks(struct lb_env *env
>  		if ((load / 2) > env->imbalance)
>  			goto next;
>  
> -		move_task(p, env);
> -		pulled++;
> +		detach_task(p, env);
> +		list_add(&p->se.group_node, &env->tasks);
> +
> +		detached++;
>  		env->imbalance -= load;
>  
>  #ifdef CONFIG_PREEMPT
>  		/*
>  		 * NEWIDLE balancing is a source of latency, so preemptible
> -		 * kernels will stop after the first task is pulled to minimize
> +		 * kernels will stop after the first task is detached to minimize
>  		 * the critical section.
>  		 */
>  		if (env->idle == CPU_NEWLY_IDLE)
> @@ -5460,13 +5448,58 @@ static int move_tasks(struct lb_env *env
>  	}
>  
>  	/*
> -	 * Right now, this is one of only two places move_task() is called,
> -	 * so we can safely collect move_task() stats here rather than
> -	 * inside move_task().
> +	 * Right now, this is one of only two places we collect this stat
> +	 * so we can safely collect detach_one_task() stats here rather
> +	 * than inside detach_one_task().
>  	 */
> -	schedstat_add(env->sd, lb_gained[env->idle], pulled);
> +	schedstat_add(env->sd, lb_gained[env->idle], detached);
>  
> -	return pulled;
> +	return detached;
> +}
> +
> +/*
> + * attach_task() -- attach the task detached by detach_task() to its new rq.
> + */
> +static void attach_task(struct rq *rq, struct task_struct *p)
> +{
> +	lockdep_assert_held(&rq->lock);
> +
> +	BUG_ON(task_rq(p) != rq);
> +	p->on_rq = ONRQ_QUEUED;
> +	activate_task(rq, p, 0);
> +	check_preempt_curr(rq, p, 0);
> +}
> +
> +/*
> + * attach_one_task() -- attaches the task returned from detach_one_task() to
> + * its new rq.
> + */
> +static void attach_one_task(struct rq *rq, struct task_struct *p)
> +{
> +	raw_spin_lock(&rq->lock);
> +	attach_task(rq, p);
> +	raw_spin_unlock(&rq->lock);
> +}
> +
> +/*
> + * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
> + * new rq.
> + */
> +static void attach_tasks(struct lb_env *env)
> +{
> +	struct list_head *tasks = &env->tasks;
> +	struct task_struct *p;
> +
> +	raw_spin_lock(&env->dst_rq->lock);
> +
> +	while (!list_empty(tasks)) {
> +		p = list_first_entry(tasks, struct task_struct, se.group_node);
> +		list_del_init(&p->se.group_node);
> +
> +		attach_task(env->dst_rq, p);
> +	}
> +
> +	raw_spin_unlock(&env->dst_rq->lock);
>  }
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -6600,6 +6633,7 @@ static int load_balance(int this_cpu, st
>  		.loop_break	= sched_nr_migrate_break,
>  		.cpus		= cpus,
>  		.fbq_type	= all,
> +		.tasks		= LIST_HEAD_INIT(env.tasks),
>  	};
>  
>  	/*
> @@ -6649,16 +6683,29 @@ static int load_balance(int this_cpu, st
>  		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
>  
>  more_balance:
> -		local_irq_save(flags);
> -		double_rq_lock(env.dst_rq, busiest);
> +		raw_spin_lock_irqsave(&busiest->lock, flags);
>  
>  		/*
>  		 * cur_ld_moved - load moved in current iteration
>  		 * ld_moved     - cumulative load moved across iterations
>  		 */
> -		cur_ld_moved = move_tasks(&env);
> -		ld_moved += cur_ld_moved;
> -		double_rq_unlock(env.dst_rq, busiest);
> +		cur_ld_moved = detach_tasks(&env);
> +
> +		/*
>  		 * We've detached some tasks from busiest_rq. Every
>  		 * task is marked "ONRQ_MIGRATING", so we can safely
>  		 * unlock busiest->lock, and we are able to be sure
> +		 * that nobody can manipulate the tasks in parallel.
> +		 * See task_rq_lock() family for the details.
> +		 */
> +
> +		raw_spin_unlock(&busiest->lock);
> +
> +		if (cur_ld_moved) {
> +			attach_tasks(&env);
> +			ld_moved += cur_ld_moved;
> +		}
> +
>  		local_irq_restore(flags);
>  
>  		/*
> @@ -6794,7 +6841,7 @@ static int load_balance(int this_cpu, st
>  		 * If we've begun active balancing, start to back off. This
>  		 * case may not be covered by the all_pinned logic if there
>  		 * is only 1 task on the busy runqueue (because we don't call
> -		 * move_tasks).
> +		 * detach_tasks).
>  		 */
>  		if (sd->balance_interval < sd->max_interval)
>  			sd->balance_interval *= 2;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ