Message-ID: <20121224070330.GA9331@kernel.org>
Date:	Mon, 24 Dec 2012 15:03:30 +0800
From:	Shaohua Li <shli@...nel.org>
To:	linux-kernel@...r.kernel.org
Cc:	akpm@...ux-foundation.org, a.p.zijlstra@...llo.nl,
	rostedt@...dmis.org, axboe@...nel.dk
Subject: [patch]smp: make smp_call_function_many use logic similar to
 smp_call_function_single


I'm testing a swapout workload on a two-socket Xeon machine. The workload has
10 threads; each thread sequentially accesses a separate memory region. TLB
flush overhead is very large in this workload: for each page, page reclaim
needs to move it off the active lru list and then unmap it, and both steps
need a TLB flush. Since this is a multithreaded workload, TLB flushes happen
on 10 CPUs. On x86, TLB flush uses the generic smp_call_function, so this
workload stresses smp_call_function_many heavily.
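
For reference, here is a minimal userspace sketch of the kind of workload I
mean (the thread count matches the description above; the region size and the
rest of the harness are only illustrative, not the exact test program):

	/*
	 * Illustrative sketch only: 10 threads, each sequentially writing
	 * its own anonymous region that is larger than its share of RAM,
	 * so reclaim must keep unmapping pages and flushing TLBs.
	 */
	#include <pthread.h>
	#include <sys/mman.h>

	#define NR_THREADS	10
	#define REGION_SIZE	(4UL << 30)	/* per-thread region, illustrative */

	static void *worker(void *arg)
	{
		char *buf = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		unsigned long off;

		(void)arg;
		if (buf == MAP_FAILED)
			return NULL;
		for (;;)	/* keep cycling so pages are continually swapped out */
			for (off = 0; off < REGION_SIZE; off += 4096)
				buf[off] = 1;
		return NULL;
	}

	int main(void)
	{
		pthread_t tid[NR_THREADS];
		int i;

		for (i = 0; i < NR_THREADS; i++)
			pthread_create(&tid[i], NULL, worker, NULL);
		for (i = 0; i < NR_THREADS; i++)
			pthread_join(tid[i], NULL);
		return 0;
	}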

Without patch, perf shows:
+  24.49%  [k] generic_smp_call_function_interrupt
-  21.72%  [k] _raw_spin_lock 
   - _raw_spin_lock 
      + 79.80% __page_check_address 
      + 6.42% generic_smp_call_function_interrupt
      + 3.31% get_swap_page                       
      + 2.37% free_pcppages_bulk                  
      + 1.75% handle_pte_fault                    
      + 1.54% put_super                           
      + 1.41% grab_super_passive                  
      + 1.36% __swap_duplicate                    
      + 0.68% blk_flush_plug_list                 
      + 0.62% swap_info_get                       
+   6.55%  [k] flush_tlb_func                     
+   6.46%  [k] smp_call_function_many             
+   5.09%  [k] call_function_interrupt            
+   4.75%  [k] default_send_IPI_mask_sequence_phys
+   2.18%  [k] find_next_bit

Swapout throughput is around 1300M/s.

With the patch, perf shows:
-  27.23%  [k] _raw_spin_lock                           
   - _raw_spin_lock                                     
      + 80.53% __page_check_address                     
      + 8.39% generic_smp_call_function_single_interrupt
      + 2.44% get_swap_page                             
      + 1.76% free_pcppages_bulk                        
      + 1.40% handle_pte_fault                          
      + 1.15% __swap_duplicate                          
      + 1.05% put_super                                 
      + 0.98% grab_super_passive                        
      + 0.86% blk_flush_plug_list                       
      + 0.57% swap_info_get                             
+   8.25%  [k] default_send_IPI_mask_sequence_phys      
+   7.55%  [k] call_function_interrupt                  
+   7.47%  [k] smp_call_function_many                   
+   7.25%  [k] flush_tlb_func                           
+   3.81%  [k] _raw_spin_lock_irqsave                   
+   3.78%  [k] generic_smp_call_function_single_interrupt

Swapout throughput is around 1400M/s, so this is roughly a 7% improvement, and
total CPU utilization doesn't change.

Without the patch, cfd_data is shared by all CPUs:
generic_smp_call_function_interrupt reads and writes cfd_data several times,
which creates a lot of cache ping-pong. With the patch, the data becomes
per-cpu, so the ping-pong is avoided. And judging from the perf data, this
doesn't make the call_single_queue lock contended.
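
In other words, the key structural change (extracted from the diff below) is:

	/*
	 * Before: every target CPU walks the global call_function.queue and
	 * reads/writes the sender's cfd_data (cpumask test, refs decrement),
	 * so those cachelines bounce between all CPUs handling the IPI.
	 */
	struct call_function_data {
		struct call_single_data	csd;
		atomic_t		refs;
		cpumask_var_t		cpumask;
	};

	/*
	 * After: the sender owns one call_single_data per target CPU and
	 * queues each on that CPU's existing call_single_queue, so a target
	 * mostly touches its own per-cpu data.
	 */
	struct call_function_data {
		struct call_single_data	__percpu *csd;
		cpumask_var_t		cpumask;
	};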

The next step is to remove generic_smp_call_function_interrupt from arch code.

Signed-off-by: Shaohua Li <shli@...ionio.com>
---
 include/linux/smp.h |    3 
 kernel/smp.c        |  184 ++++++++--------------------------------------------
 2 files changed, 32 insertions(+), 155 deletions(-)

Index: linux/kernel/smp.c
===================================================================
--- linux.orig/kernel/smp.c	2012-12-24 11:37:28.150604463 +0800
+++ linux/kernel/smp.c	2012-12-24 14:57:11.807949527 +0800
@@ -16,22 +16,12 @@
 #include "smpboot.h"
 
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
-static struct {
-	struct list_head	queue;
-	raw_spinlock_t		lock;
-} call_function __cacheline_aligned_in_smp =
-	{
-		.queue		= LIST_HEAD_INIT(call_function.queue),
-		.lock		= __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
-	};
-
 enum {
 	CSD_FLAG_LOCK		= 0x01,
 };
 
 struct call_function_data {
-	struct call_single_data	csd;
-	atomic_t		refs;
+	struct call_single_data	__percpu *csd;
 	cpumask_var_t		cpumask;
 };
 
@@ -56,6 +46,11 @@ hotplug_cfd(struct notifier_block *nfb,
 		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
 				cpu_to_node(cpu)))
 			return notifier_from_errno(-ENOMEM);
+		cfd->csd = alloc_percpu(struct call_single_data);
+		if (!cfd->csd) {
+			free_cpumask_var(cfd->cpumask);
+			return notifier_from_errno(-ENOMEM);
+		}
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -65,6 +60,7 @@ hotplug_cfd(struct notifier_block *nfb,
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		free_cpumask_var(cfd->cpumask);
+		free_percpu(cfd->csd);
 		break;
 #endif
 	};
@@ -166,85 +162,6 @@ void generic_exec_single(int cpu, struct
 }
 
 /*
- * Invoked by arch to handle an IPI for call function. Must be called with
- * interrupts disabled.
- */
-void generic_smp_call_function_interrupt(void)
-{
-	struct call_function_data *data;
-	int cpu = smp_processor_id();
-
-	/*
-	 * Shouldn't receive this interrupt on a cpu that is not yet online.
-	 */
-	WARN_ON_ONCE(!cpu_online(cpu));
-
-	/*
-	 * Ensure entry is visible on call_function_queue after we have
-	 * entered the IPI. See comment in smp_call_function_many.
-	 * If we don't have this, then we may miss an entry on the list
-	 * and never get another IPI to process it.
-	 */
-	smp_mb();
-
-	/*
-	 * It's ok to use list_for_each_rcu() here even though we may
-	 * delete 'pos', since list_del_rcu() doesn't clear ->next
-	 */
-	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
-		int refs;
-		smp_call_func_t func;
-
-		/*
-		 * Since we walk the list without any locks, we might
-		 * see an entry that was completed, removed from the
-		 * list and is in the process of being reused.
-		 *
-		 * We must check that the cpu is in the cpumask before
-		 * checking the refs, and both must be set before
-		 * executing the callback on this cpu.
-		 */
-
-		if (!cpumask_test_cpu(cpu, data->cpumask))
-			continue;
-
-		smp_rmb();
-
-		if (atomic_read(&data->refs) == 0)
-			continue;
-
-		func = data->csd.func;		/* save for later warn */
-		func(data->csd.info);
-
-		/*
-		 * If the cpu mask is not still set then func enabled
-		 * interrupts (BUG), and this cpu took another smp call
-		 * function interrupt and executed func(info) twice
-		 * on this cpu.  That nested execution decremented refs.
-		 */
-		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
-			WARN(1, "%pf enabled interrupts and double executed\n", func);
-			continue;
-		}
-
-		refs = atomic_dec_return(&data->refs);
-		WARN_ON(refs < 0);
-
-		if (refs)
-			continue;
-
-		WARN_ON(!cpumask_empty(data->cpumask));
-
-		raw_spin_lock(&call_function.lock);
-		list_del_rcu(&data->csd.list);
-		raw_spin_unlock(&call_function.lock);
-
-		csd_unlock(&data->csd);
-	}
-
-}
-
-/*
  * Invoked by arch to handle an IPI for call function single. Must be
  * called from the arch with interrupts disabled.
  */
@@ -448,8 +365,7 @@ void smp_call_function_many(const struct
 			    smp_call_func_t func, void *info, bool wait)
 {
 	struct call_function_data *data;
-	unsigned long flags;
-	int refs, cpu, next_cpu, this_cpu = smp_processor_id();
+	int cpu, next_cpu, this_cpu = smp_processor_id();
 
 	/*
 	 * Can deadlock when called with interrupts disabled.
@@ -481,79 +397,39 @@ void smp_call_function_many(const struct
 	}
 
 	data = &__get_cpu_var(cfd_data);
-	csd_lock(&data->csd);
-
-	/* This BUG_ON verifies our reuse assertions and can be removed */
-	BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
-
-	/*
-	 * The global call function queue list add and delete are protected
-	 * by a lock, but the list is traversed without any lock, relying
-	 * on the rcu list add and delete to allow safe concurrent traversal.
-	 * We reuse the call function data without waiting for any grace
-	 * period after some other cpu removes it from the global queue.
-	 * This means a cpu might find our data block as it is being
-	 * filled out.
-	 *
-	 * We hold off the interrupt handler on the other cpu by
-	 * ordering our writes to the cpu mask vs our setting of the
-	 * refs counter.  We assert only the cpu owning the data block
-	 * will set a bit in cpumask, and each bit will only be cleared
-	 * by the subject cpu.  Each cpu must first find its bit is
-	 * set and then check that refs is set indicating the element is
-	 * ready to be processed, otherwise it must skip the entry.
-	 *
-	 * On the previous iteration refs was set to 0 by another cpu.
-	 * To avoid the use of transitivity, set the counter to 0 here
-	 * so the wmb will pair with the rmb in the interrupt handler.
-	 */
-	atomic_set(&data->refs, 0);	/* convert 3rd to 1st party write */
 
-	data->csd.func = func;
-	data->csd.info = info;
-
-	/* Ensure 0 refs is visible before mask.  Also orders func and info */
-	smp_wmb();
-
-	/* We rely on the "and" being processed before the store */
 	cpumask_and(data->cpumask, mask, cpu_online_mask);
 	cpumask_clear_cpu(this_cpu, data->cpumask);
-	refs = cpumask_weight(data->cpumask);
 
 	/* Some callers race with other cpus changing the passed mask */
-	if (unlikely(!refs)) {
-		csd_unlock(&data->csd);
+	if (unlikely(!cpumask_weight(data->cpumask)))
 		return;
-	}
-
-	raw_spin_lock_irqsave(&call_function.lock, flags);
-	/*
-	 * Place entry at the _HEAD_ of the list, so that any cpu still
-	 * observing the entry in generic_smp_call_function_interrupt()
-	 * will not miss any other list entries:
-	 */
-	list_add_rcu(&data->csd.list, &call_function.queue);
-	/*
-	 * We rely on the wmb() in list_add_rcu to complete our writes
-	 * to the cpumask before this write to refs, which indicates
-	 * data is on the list and is ready to be processed.
-	 */
-	atomic_set(&data->refs, refs);
-	raw_spin_unlock_irqrestore(&call_function.lock, flags);
 
-	/*
-	 * Make the list addition visible before sending the ipi.
-	 * (IPIs must obey or appear to obey normal Linux cache
-	 * coherency rules -- see comment in generic_exec_single).
-	 */
-	smp_mb();
+	for_each_cpu(cpu, data->cpumask) {
+		struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
+		struct call_single_queue *dst =
+					&per_cpu(call_single_queue, cpu);
+		unsigned long flags;
+
+		csd_lock(csd);
+		csd->func = func;
+		csd->info = info;
+
+		raw_spin_lock_irqsave(&dst->lock, flags);
+		list_add_tail(&csd->list, &dst->list);
+		raw_spin_unlock_irqrestore(&dst->lock, flags);
+	}
 
 	/* Send a message to all CPUs in the map */
 	arch_send_call_function_ipi_mask(data->cpumask);
 
-	/* Optionally wait for the CPUs to complete */
-	if (wait)
-		csd_lock_wait(&data->csd);
+	if (wait) {
+		for_each_cpu(cpu, data->cpumask) {
+			struct call_single_data *csd =
+					per_cpu_ptr(data->csd, cpu);
+			csd_lock_wait(csd);
+		}
+	}
 }
 EXPORT_SYMBOL(smp_call_function_many);
 
Index: linux/include/linux/smp.h
===================================================================
--- linux.orig/include/linux/smp.h	2012-12-24 11:38:28.901840885 +0800
+++ linux/include/linux/smp.h	2012-12-24 11:39:16.557243069 +0800
@@ -89,7 +89,8 @@ void kick_all_cpus_sync(void);
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 void __init call_function_init(void);
 void generic_smp_call_function_single_interrupt(void);
-void generic_smp_call_function_interrupt(void);
+#define generic_smp_call_function_interrupt \
+	generic_smp_call_function_single_interrupt
 #else
 static inline void call_function_init(void) { }
 #endif
--