linux-kernel - Re: [patch]smp: make smp_call_function_many use the similar logic like smp_call_function

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20130108143837.93b4cd29.akpm@linux-foundation.org>
Date:	Tue, 8 Jan 2013 14:38:37 -0800
From:	Andrew Morton <akpm@...ux-foundation.org>
To:	Shaohua Li <shli@...nel.org>
Cc:	linux-kernel@...r.kernel.org, a.p.zijlstra@...llo.nl,
	rostedt@...dmis.org, axboe@...nel.dk
Subject: Re: [patch]smp: make smp_call_function_many use the similar logic
 like smp_call_function_single

On Mon, 24 Dec 2012 15:03:30 +0800
Shaohua Li <shli@...nel.org> wrote:

> 
> I'm testing swapout workload in a two-socket Xeon machine. The workload has 10
> threads, each thread sequentially accesses separate memory region. TLB flush
> overhead is very big in the workload. For each page, page reclaim need move it
> from active lru list and then unmap it. Both need a TLB flush. And this is a
> multthread workload, TLB flush happens in 10 CPUs. In X86, TLB flush uses
> generic smp_call)function. So this workload stress smp_call_function_many
> heavily.
> 
> Without patch, perf shows:
> +  24.49%  [k] generic_smp_call_function_interrupt
> -  21.72%  [k] _raw_spin_lock 
>    - _raw_spin_lock 
>       + 79.80% __page_check_address 
>       + 6.42% generic_smp_call_function_interrupt
>       + 3.31% get_swap_page                       
>       + 2.37% free_pcppages_bulk                  
>       + 1.75% handle_pte_fault                    
>       + 1.54% put_super                           
>       + 1.41% grab_super_passive                  
>       + 1.36% __swap_duplicate                    
>       + 0.68% blk_flush_plug_list                 
>       + 0.62% swap_info_get                       
> +   6.55%  [k] flush_tlb_func                     
> +   6.46%  [k] smp_call_function_many             
> +   5.09%  [k] call_function_interrupt            
> +   4.75%  [k] default_send_IPI_mask_sequence_phys
> +   2.18%  [k] find_next_bit
> 
> swapout throughput is around 1300M/s.
> 
> With the patch, perf shows:
> -  27.23%  [k] _raw_spin_lock                           
>    - _raw_spin_lock                                     
>       + 80.53% __page_check_address                     
>       + 8.39% generic_smp_call_function_single_interrupt
>       + 2.44% get_swap_page                             
>       + 1.76% free_pcppages_bulk                        
>       + 1.40% handle_pte_fault                          
>       + 1.15% __swap_duplicate                          
>       + 1.05% put_super                                 
>       + 0.98% grab_super_passive                        
>       + 0.86% blk_flush_plug_list                       
>       + 0.57% swap_info_get                             
> +   8.25%  [k] default_send_IPI_mask_sequence_phys      
> +   7.55%  [k] call_function_interrupt                  
> +   7.47%  [k] smp_call_function_many                   
> +   7.25%  [k] flush_tlb_func                           
> +   3.81%  [k] _raw_spin_lock_irqsave                   
> +   3.78%  [k] generic_smp_call_function_single_interrupt
> 
> swapout throughput is around 1400M/s. So there is around a 7% improvement, and
> total cpu utilization doesn't change.

Boy, that was a biiiig cleanup.  I hope it works ;)

The naming of locals in that code makes my eyes hurt.  How's this?


From: Andrew Morton <akpm@...ux-foundation.org>
Subject: kernel/smp.c: cleanups

We sometimes use "struct call_single_data *data" and sometimes "struct
call_single_data *csd".  Use "cfd" consistently.

We sometimes use "struct call_function_data *data" and sometimes "struct
call_function_data *cfd".  Use "cfd" consistently.

Also, avoid some 80-col layout tricks.

Cc: Ingo Molnar <mingo@...e.hu>
Cc: Jens Axboe <axboe@...nel.dk>
Cc: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc: Shaohua Li <shli@...ionio.com>
Cc: Shaohua Li <shli@...nel.org>
Cc: Steven Rostedt <rostedt@...dmis.org>
Signed-off-by: Andrew Morton <akpm@...ux-foundation.org>
---

 kernel/smp.c |   87 ++++++++++++++++++++++++-------------------------
 1 file changed, 44 insertions(+), 43 deletions(-)

diff -puN include/linux/smp.h~kernel-smpc-cleanups include/linux/smp.h
diff -puN kernel/smp.c~kernel-smpc-cleanups kernel/smp.c
--- a/kernel/smp.c~kernel-smpc-cleanups
+++ a/kernel/smp.c
@@ -95,16 +95,16 @@ void __init call_function_init(void)
  * previous function call. For multi-cpu calls its even more interesting
  * as we'll have to ensure no other cpu is observing our csd.
  */
-static void csd_lock_wait(struct call_single_data *data)
+static void csd_lock_wait(struct call_single_data *csd)
 {
-	while (data->flags & CSD_FLAG_LOCK)
+	while (csd->flags & CSD_FLAG_LOCK)
 		cpu_relax();
 }
 
-static void csd_lock(struct call_single_data *data)
+static void csd_lock(struct call_single_data *csd)
 {
-	csd_lock_wait(data);
-	data->flags = CSD_FLAG_LOCK;
+	csd_lock_wait(csd);
+	csd->flags = CSD_FLAG_LOCK;
 
 	/*
 	 * prevent CPU from reordering the above assignment
@@ -114,16 +114,16 @@ static void csd_lock(struct call_single_
 	smp_mb();
 }
 
-static void csd_unlock(struct call_single_data *data)
+static void csd_unlock(struct call_single_data *csd)
 {
-	WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+	WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
 
 	/*
 	 * ensure we're all done before releasing data:
 	 */
 	smp_mb();
 
-	data->flags &= ~CSD_FLAG_LOCK;
+	csd->flags &= ~CSD_FLAG_LOCK;
 }
 
 /*
@@ -132,7 +132,7 @@ static void csd_unlock(struct call_singl
  * ->func, ->info, and ->flags set.
  */
 static
-void generic_exec_single(int cpu, struct call_single_data *data, int wait)
+void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
 {
 	struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
 	unsigned long flags;
@@ -140,7 +140,7 @@ void generic_exec_single(int cpu, struct
 
 	raw_spin_lock_irqsave(&dst->lock, flags);
 	ipi = list_empty(&dst->list);
-	list_add_tail(&data->list, &dst->list);
+	list_add_tail(&csd->list, &dst->list);
 	raw_spin_unlock_irqrestore(&dst->lock, flags);
 
 	/*
@@ -158,7 +158,7 @@ void generic_exec_single(int cpu, struct
 		arch_send_call_function_single_ipi(cpu);
 
 	if (wait)
-		csd_lock_wait(data);
+		csd_lock_wait(csd);
 }
 
 /*
@@ -168,7 +168,6 @@ void generic_exec_single(int cpu, struct
 void generic_smp_call_function_single_interrupt(void)
 {
 	struct call_single_queue *q = &__get_cpu_var(call_single_queue);
-	unsigned int data_flags;
 	LIST_HEAD(list);
 
 	/*
@@ -181,25 +180,26 @@ void generic_smp_call_function_single_in
 	raw_spin_unlock(&q->lock);
 
 	while (!list_empty(&list)) {
-		struct call_single_data *data;
+		struct call_single_data *csd;
+		unsigned int csd_flags;
 
-		data = list_entry(list.next, struct call_single_data, list);
-		list_del(&data->list);
+		csd = list_entry(list.next, struct call_single_data, list);
+		list_del(&csd->list);
 
 		/*
-		 * 'data' can be invalid after this call if flags == 0
+		 * 'csd' can be invalid after this call if flags == 0
 		 * (when called through generic_exec_single()),
 		 * so save them away before making the call:
 		 */
-		data_flags = data->flags;
+		csd_flags = csd->flags;
 
-		data->func(data->info);
+		csd->func(csd->info);
 
 		/*
 		 * Unlocked CSDs are valid through generic_exec_single():
 		 */
-		if (data_flags & CSD_FLAG_LOCK)
-			csd_unlock(data);
+		if (csd_flags & CSD_FLAG_LOCK)
+			csd_unlock(csd);
 	}
 }
 
@@ -244,16 +244,16 @@ int smp_call_function_single(int cpu, sm
 		local_irq_restore(flags);
 	} else {
 		if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
-			struct call_single_data *data = &d;
+			struct call_single_data *csd = &d;
 
 			if (!wait)
-				data = &__get_cpu_var(csd_data);
+				csd = &__get_cpu_var(csd_data);
 
-			csd_lock(data);
+			csd_lock(csd);
 
-			data->func = func;
-			data->info = info;
-			generic_exec_single(cpu, data, wait);
+			csd->func = func;
+			csd->info = info;
+			generic_exec_single(cpu, csd, wait);
 		} else {
 			err = -ENXIO;	/* CPU not online */
 		}
@@ -320,7 +320,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any)
  * pre-allocated data structure. Useful for embedding @data inside
  * other structures, for instance.
  */
-void __smp_call_function_single(int cpu, struct call_single_data *data,
+void __smp_call_function_single(int cpu, struct call_single_data *csd,
 				int wait)
 {
 	unsigned int this_cpu;
@@ -338,11 +338,11 @@ void __smp_call_function_single(int cpu,
 
 	if (cpu == this_cpu) {
 		local_irq_save(flags);
-		data->func(data->info);
+		csd->func(csd->info);
 		local_irq_restore(flags);
 	} else {
-		csd_lock(data);
-		generic_exec_single(cpu, data, wait);
+		csd_lock(csd);
+		generic_exec_single(cpu, csd, wait);
 	}
 	put_cpu();
 }
@@ -364,7 +364,7 @@ void __smp_call_function_single(int cpu,
 void smp_call_function_many(const struct cpumask *mask,
 			    smp_call_func_t func, void *info, bool wait)
 {
-	struct call_function_data *data;
+	struct call_function_data *cfd;
 	int cpu, next_cpu, this_cpu = smp_processor_id();
 
 	/*
@@ -396,21 +396,21 @@ void smp_call_function_many(const struct
 		return;
 	}
 
-	data = &__get_cpu_var(cfd_data);
+	cfd = &__get_cpu_var(cfd_data);
 
-	cpumask_and(data->cpumask, mask, cpu_online_mask);
-	cpumask_clear_cpu(this_cpu, data->cpumask);
+	cpumask_and(cfd->cpumask, mask, cpu_online_mask);
+	cpumask_clear_cpu(this_cpu, cfd->cpumask);
 
 	/* Some callers race with other cpus changing the passed mask */
-	if (unlikely(!cpumask_weight(data->cpumask)))
+	if (unlikely(!cpumask_weight(cfd->cpumask)))
 		return;
 
-	for_each_cpu(cpu, data->cpumask) {
-		struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
-		struct call_single_queue *dst =
-					&per_cpu(call_single_queue, cpu);
+	for_each_cpu(cpu, cfd->cpumask) {
+		struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
+		struct call_single_queue *dst;
 		unsigned long flags;
 
+		dst = &per_cpu(call_single_queue, cpu);
 		csd_lock(csd);
 		csd->func = func;
 		csd->info = info;
@@ -421,12 +421,13 @@ void smp_call_function_many(const struct
 	}
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi_mask(data->cpumask);
+	arch_send_call_function_ipi_mask(cfd->cpumask);
 
 	if (wait) {
-		for_each_cpu(cpu, data->cpumask) {
-			struct call_single_data *csd =
-					per_cpu_ptr(data->csd, cpu);
+		for_each_cpu(cpu, cfd->cpumask) {
+			struct call_single_data *csd;
+
+			csd = per_cpu_ptr(cfd->csd, cpu);
 			csd_lock_wait(csd);
 		}
 	}
_

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/