Date:	Mon, 7 Jan 2013 16:14:04 +0800
From:	Shaohua Li <shli@...nel.org>
To:	linux-kernel@...r.kernel.org
Cc:	akpm@...ux-foundation.org, a.p.zijlstra@...llo.nl,
	rostedt@...dmis.org, axboe@...nel.dk
Subject: Re: [patch]smp: make smp_call_function_many use similar logic
 to smp_call_function_single

ping!

On Mon, Dec 24, 2012 at 03:03:30PM +0800, Shaohua Li wrote:
> 
> I'm testing a swapout workload on a two-socket Xeon machine. The workload has 10
> threads; each thread sequentially accesses a separate memory region. TLB flush
> overhead is very high in this workload: for each page, page reclaim needs to move it
> off the active lru list and then unmap it, and both steps require a TLB flush. Since
> this is a multithreaded workload, the TLB flushes happen on 10 CPUs. On x86, TLB
> flush uses the generic smp_call_function machinery, so this workload stresses
> smp_call_function_many heavily.
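> 
> For illustration only (not part of the patch, and simplified relative to the real
> arch/x86/mm/tlb.c code), the shootdown path boils down to something like the
> following; the demo_* names are made up for this sketch:
> 
> 	#include <linux/smp.h>
> 	#include <linux/cpumask.h>
> 
> 	/* Runs on each remote CPU out of the call-function IPI; the real
> 	 * handler (flush_tlb_func in the perf output) flushes the local TLB. */
> 	static void demo_flush_tlb_func(void *info)
> 	{
> 	}
> 
> 	/* One of these rounds per reclaimed page, issued concurrently from
> 	 * all 10 threads, is what hammers smp_call_function_many. */
> 	static void demo_flush_tlb_others(const struct cpumask *mask, void *info)
> 	{
> 		smp_call_function_many(mask, demo_flush_tlb_func, info, true);
> 	}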
> 
> Without patch, perf shows:
> +  24.49%  [k] generic_smp_call_function_interrupt
> -  21.72%  [k] _raw_spin_lock 
>    - _raw_spin_lock 
>       + 79.80% __page_check_address 
>       + 6.42% generic_smp_call_function_interrupt
>       + 3.31% get_swap_page                       
>       + 2.37% free_pcppages_bulk                  
>       + 1.75% handle_pte_fault                    
>       + 1.54% put_super                           
>       + 1.41% grab_super_passive                  
>       + 1.36% __swap_duplicate                    
>       + 0.68% blk_flush_plug_list                 
>       + 0.62% swap_info_get                       
> +   6.55%  [k] flush_tlb_func                     
> +   6.46%  [k] smp_call_function_many             
> +   5.09%  [k] call_function_interrupt            
> +   4.75%  [k] default_send_IPI_mask_sequence_phys
> +   2.18%  [k] find_next_bit
> 
> swapout throughput is around 1300M/s.
> 
> With the patch, perf shows:
> -  27.23%  [k] _raw_spin_lock                           
>    - _raw_spin_lock                                     
>       + 80.53% __page_check_address                     
>       + 8.39% generic_smp_call_function_single_interrupt
>       + 2.44% get_swap_page                             
>       + 1.76% free_pcppages_bulk                        
>       + 1.40% handle_pte_fault                          
>       + 1.15% __swap_duplicate                          
>       + 1.05% put_super                                 
>       + 0.98% grab_super_passive                        
>       + 0.86% blk_flush_plug_list                       
>       + 0.57% swap_info_get                             
> +   8.25%  [k] default_send_IPI_mask_sequence_phys      
> +   7.55%  [k] call_function_interrupt                  
> +   7.47%  [k] smp_call_function_many                   
> +   7.25%  [k] flush_tlb_func                           
> +   3.81%  [k] _raw_spin_lock_irqsave                   
> +   3.78%  [k] generic_smp_call_function_single_interrupt
> 
> swapout throughput is around 1400M/s, so there is roughly a 7% improvement, and
> total CPU utilization doesn't change.
> 
> Without the patch, cfd_data is shared by all CPUs:
> generic_smp_call_function_interrupt reads and writes cfd_data several times,
> which creates a lot of cache-line ping-pong. With the patch, the data becomes
> per-cpu, so the ping-pong is avoided. And judging from the perf data, this doesn't
> introduce contention on the call_single_queue lock.
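> 
> To make the layout difference concrete (the fields are the ones touched by the
> patch below; the _before/_after names exist only for this illustration):
> 
> 	/* Before: one descriptor shared by every CPU in the mask.  refs and
> 	 * cpumask bounce between caches as each target reads and updates
> 	 * them from the IPI handler. */
> 	struct call_function_data_before {
> 		struct call_single_data	csd;
> 		atomic_t		refs;		/* decremented by every target */
> 		cpumask_var_t		cpumask;	/* cleared bit by bit by targets */
> 	};
> 
> 	/* After: the sender owns one csd per target CPU, queued onto that
> 	 * CPU's existing call_single_queue, so each target only touches its
> 	 * own descriptor. */
> 	struct call_function_data_after {
> 		struct call_single_data	__percpu *csd;
> 		cpumask_var_t		cpumask;
> 	};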
> 
> The next step is to remove generic_smp_call_function_interrupt from arch code.
> 
> Signed-off-by: Shaohua Li <shli@...ionio.com>
> ---
>  include/linux/smp.h |    3 
>  kernel/smp.c        |  184 ++++++++--------------------------------------------
>  2 files changed, 32 insertions(+), 155 deletions(-)
> 
> Index: linux/kernel/smp.c
> ===================================================================
> --- linux.orig/kernel/smp.c	2012-12-24 11:37:28.150604463 +0800
> +++ linux/kernel/smp.c	2012-12-24 14:57:11.807949527 +0800
> @@ -16,22 +16,12 @@
>  #include "smpboot.h"
>  
>  #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
> -static struct {
> -	struct list_head	queue;
> -	raw_spinlock_t		lock;
> -} call_function __cacheline_aligned_in_smp =
> -	{
> -		.queue		= LIST_HEAD_INIT(call_function.queue),
> -		.lock		= __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
> -	};
> -
>  enum {
>  	CSD_FLAG_LOCK		= 0x01,
>  };
>  
>  struct call_function_data {
> -	struct call_single_data	csd;
> -	atomic_t		refs;
> +	struct call_single_data	__percpu *csd;
>  	cpumask_var_t		cpumask;
>  };
>  
> @@ -56,6 +46,11 @@ hotplug_cfd(struct notifier_block *nfb,
>  		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
>  				cpu_to_node(cpu)))
>  			return notifier_from_errno(-ENOMEM);
> +		cfd->csd = alloc_percpu(struct call_single_data);
> +		if (!cfd->csd) {
> +			free_cpumask_var(cfd->cpumask);
> +			return notifier_from_errno(-ENOMEM);
> +		}
>  		break;
>  
>  #ifdef CONFIG_HOTPLUG_CPU
> @@ -65,6 +60,7 @@ hotplug_cfd(struct notifier_block *nfb,
>  	case CPU_DEAD:
>  	case CPU_DEAD_FROZEN:
>  		free_cpumask_var(cfd->cpumask);
> +		free_percpu(cfd->csd);
>  		break;
>  #endif
>  	};
> @@ -166,85 +162,6 @@ void generic_exec_single(int cpu, struct
>  }
>  
>  /*
> - * Invoked by arch to handle an IPI for call function. Must be called with
> - * interrupts disabled.
> - */
> -void generic_smp_call_function_interrupt(void)
> -{
> -	struct call_function_data *data;
> -	int cpu = smp_processor_id();
> -
> -	/*
> -	 * Shouldn't receive this interrupt on a cpu that is not yet online.
> -	 */
> -	WARN_ON_ONCE(!cpu_online(cpu));
> -
> -	/*
> -	 * Ensure entry is visible on call_function_queue after we have
> -	 * entered the IPI. See comment in smp_call_function_many.
> -	 * If we don't have this, then we may miss an entry on the list
> -	 * and never get another IPI to process it.
> -	 */
> -	smp_mb();
> -
> -	/*
> -	 * It's ok to use list_for_each_rcu() here even though we may
> -	 * delete 'pos', since list_del_rcu() doesn't clear ->next
> -	 */
> -	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
> -		int refs;
> -		smp_call_func_t func;
> -
> -		/*
> -		 * Since we walk the list without any locks, we might
> -		 * see an entry that was completed, removed from the
> -		 * list and is in the process of being reused.
> -		 *
> -		 * We must check that the cpu is in the cpumask before
> -		 * checking the refs, and both must be set before
> -		 * executing the callback on this cpu.
> -		 */
> -
> -		if (!cpumask_test_cpu(cpu, data->cpumask))
> -			continue;
> -
> -		smp_rmb();
> -
> -		if (atomic_read(&data->refs) == 0)
> -			continue;
> -
> -		func = data->csd.func;		/* save for later warn */
> -		func(data->csd.info);
> -
> -		/*
> -		 * If the cpu mask is not still set then func enabled
> -		 * interrupts (BUG), and this cpu took another smp call
> -		 * function interrupt and executed func(info) twice
> -		 * on this cpu.  That nested execution decremented refs.
> -		 */
> -		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
> -			WARN(1, "%pf enabled interrupts and double executed\n", func);
> -			continue;
> -		}
> -
> -		refs = atomic_dec_return(&data->refs);
> -		WARN_ON(refs < 0);
> -
> -		if (refs)
> -			continue;
> -
> -		WARN_ON(!cpumask_empty(data->cpumask));
> -
> -		raw_spin_lock(&call_function.lock);
> -		list_del_rcu(&data->csd.list);
> -		raw_spin_unlock(&call_function.lock);
> -
> -		csd_unlock(&data->csd);
> -	}
> -
> -}
> -
> -/*
>   * Invoked by arch to handle an IPI for call function single. Must be
>   * called from the arch with interrupts disabled.
>   */
> @@ -448,8 +365,7 @@ void smp_call_function_many(const struct
>  			    smp_call_func_t func, void *info, bool wait)
>  {
>  	struct call_function_data *data;
> -	unsigned long flags;
> -	int refs, cpu, next_cpu, this_cpu = smp_processor_id();
> +	int cpu, next_cpu, this_cpu = smp_processor_id();
>  
>  	/*
>  	 * Can deadlock when called with interrupts disabled.
> @@ -481,79 +397,39 @@ void smp_call_function_many(const struct
>  	}
>  
>  	data = &__get_cpu_var(cfd_data);
> -	csd_lock(&data->csd);
> -
> -	/* This BUG_ON verifies our reuse assertions and can be removed */
> -	BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
> -
> -	/*
> -	 * The global call function queue list add and delete are protected
> -	 * by a lock, but the list is traversed without any lock, relying
> -	 * on the rcu list add and delete to allow safe concurrent traversal.
> -	 * We reuse the call function data without waiting for any grace
> -	 * period after some other cpu removes it from the global queue.
> -	 * This means a cpu might find our data block as it is being
> -	 * filled out.
> -	 *
> -	 * We hold off the interrupt handler on the other cpu by
> -	 * ordering our writes to the cpu mask vs our setting of the
> -	 * refs counter.  We assert only the cpu owning the data block
> -	 * will set a bit in cpumask, and each bit will only be cleared
> -	 * by the subject cpu.  Each cpu must first find its bit is
> -	 * set and then check that refs is set indicating the element is
> -	 * ready to be processed, otherwise it must skip the entry.
> -	 *
> -	 * On the previous iteration refs was set to 0 by another cpu.
> -	 * To avoid the use of transitivity, set the counter to 0 here
> -	 * so the wmb will pair with the rmb in the interrupt handler.
> -	 */
> -	atomic_set(&data->refs, 0);	/* convert 3rd to 1st party write */
>  
> -	data->csd.func = func;
> -	data->csd.info = info;
> -
> -	/* Ensure 0 refs is visible before mask.  Also orders func and info */
> -	smp_wmb();
> -
> -	/* We rely on the "and" being processed before the store */
>  	cpumask_and(data->cpumask, mask, cpu_online_mask);
>  	cpumask_clear_cpu(this_cpu, data->cpumask);
> -	refs = cpumask_weight(data->cpumask);
>  
>  	/* Some callers race with other cpus changing the passed mask */
> -	if (unlikely(!refs)) {
> -		csd_unlock(&data->csd);
> +	if (unlikely(!cpumask_weight(data->cpumask)))
>  		return;
> -	}
> -
> -	raw_spin_lock_irqsave(&call_function.lock, flags);
> -	/*
> -	 * Place entry at the _HEAD_ of the list, so that any cpu still
> -	 * observing the entry in generic_smp_call_function_interrupt()
> -	 * will not miss any other list entries:
> -	 */
> -	list_add_rcu(&data->csd.list, &call_function.queue);
> -	/*
> -	 * We rely on the wmb() in list_add_rcu to complete our writes
> -	 * to the cpumask before this write to refs, which indicates
> -	 * data is on the list and is ready to be processed.
> -	 */
> -	atomic_set(&data->refs, refs);
> -	raw_spin_unlock_irqrestore(&call_function.lock, flags);
>  
> -	/*
> -	 * Make the list addition visible before sending the ipi.
> -	 * (IPIs must obey or appear to obey normal Linux cache
> -	 * coherency rules -- see comment in generic_exec_single).
> -	 */
> -	smp_mb();
> +	for_each_cpu(cpu, data->cpumask) {
> +		struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
> +		struct call_single_queue *dst =
> +					&per_cpu(call_single_queue, cpu);
> +		unsigned long flags;
> +
> +		csd_lock(csd);
> +		csd->func = func;
> +		csd->info = info;
> +
> +		raw_spin_lock_irqsave(&dst->lock, flags);
> +		list_add_tail(&csd->list, &dst->list);
> +		raw_spin_unlock_irqrestore(&dst->lock, flags);
> +	}
>  
>  	/* Send a message to all CPUs in the map */
>  	arch_send_call_function_ipi_mask(data->cpumask);
>  
> -	/* Optionally wait for the CPUs to complete */
> -	if (wait)
> -		csd_lock_wait(&data->csd);
> +	if (wait) {
> +		for_each_cpu(cpu, data->cpumask) {
> +			struct call_single_data *csd =
> +					per_cpu_ptr(data->csd, cpu);
> +			csd_lock_wait(csd);
> +		}
> +	}
>  }
>  EXPORT_SYMBOL(smp_call_function_many);
>  
> Index: linux/include/linux/smp.h
> ===================================================================
> --- linux.orig/include/linux/smp.h	2012-12-24 11:38:28.901840885 +0800
> +++ linux/include/linux/smp.h	2012-12-24 11:39:16.557243069 +0800
> @@ -89,7 +89,8 @@ void kick_all_cpus_sync(void);
>  #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
>  void __init call_function_init(void);
>  void generic_smp_call_function_single_interrupt(void);
> -void generic_smp_call_function_interrupt(void);
> +#define generic_smp_call_function_interrupt \
> +	generic_smp_call_function_single_interrupt
>  #else
>  static inline void call_function_init(void) { }
>  #endif
