Message-Id: <200808221153.13889.nickpiggin@yahoo.com.au>
Date:	Fri, 22 Aug 2008 11:53:13 +1000
From:	Nick Piggin <nickpiggin@...oo.com.au>
To:	Jeremy Fitzhardinge <jeremy@...p.org>
Cc:	Ingo Molnar <mingo@...e.hu>, Andi Kleen <andi@...stfloor.org>,
	"Pallipadi, Venkatesh" <venkatesh.pallipadi@...el.com>,
	Suresh Siddha <suresh.b.siddha@...el.com>,
	Jens Axboe <jens.axboe@...cle.com>,
	Rusty Russell <rusty@...tcorp.com.au>,
	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

On Friday 22 August 2008 10:29, Jeremy Fitzhardinge wrote:
> RCU can only control the lifetime of allocated memory blocks, which
> forces all the call structures to be allocated.  This is expensive
> compared to allocating them on the stack, which is the common case for
> synchronous calls.
>
> This patch takes a different approach.  Rather than using RCU, the
> queues are managed under rwlocks.  Adding or removing from the queue
> requires holding the lock for writing, but multiple CPUs can walk the
> queues to process function calls under read locks.  In the common
> case, where the structures are stack allocated, the calling CPU need
> only wait for its call to be done, take the lock for writing and
> remove the call structure.
>
> Lock contention - particularly write vs read - is reduced by using
> multiple queues.

Could be reasonable. Still, it adds something like 4 or 5 more atomic
operations, which starts to approach the cost of a slab allocation.

Another approach to reducing the slab allocation cost would be to do the
call_rcu at the caller in the wait case, so that we hit the CPU-local
freeing path in slab.
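
Roughly something like this, completely untested and against the pre-rwlock
code (reusing its names; the "waited" local is only there to remember whether
anyone was blocked before the flag gets cleared), just to sketch where the
free would move:

	/* generic_smp_call_function_interrupt(), per queue entry: */
	int waited = data->csd.flags & CSD_FLAG_WAIT;

	if (waited) {
		/* serialize stores to data with the flag clear and wakeup */
		smp_wmb();
		data->csd.flags &= ~CSD_FLAG_WAIT;
	}
	/* only free from the IPI side when nobody is waiting on the entry */
	if (!waited && (data->csd.flags & CSD_FLAG_ALLOC))
		call_rcu(&data->rcu_head, rcu_free_call_data);

	/* smp_call_function_mask(), after arch_send_call_function_ipi(): */
	if (wait) {
		csd_flag_wait(&data->csd);
		/*
		 * call_rcu from the allocating CPU, so the eventual kfree
		 * runs here and takes slab's CPU-local free path.
		 */
		if (data->csd.flags & CSD_FLAG_ALLOC)
			call_rcu(&data->rcu_head, rcu_free_call_data);
	}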


>
> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@...rix.com>
> ---
>  kernel/smp.c |  140 +++++++++++++++++++++++-----------------------------------
>  1 file changed, 56 insertions(+), 84 deletions(-)
>
> ===================================================================
> --- a/kernel/smp.c
> +++ b/kernel/smp.c
> @@ -7,8 +7,6 @@
>  #include <linux/init.h>
>  #include <linux/module.h>
>  #include <linux/percpu.h>
> -#include <linux/rcupdate.h>
> -#include <linux/rculist.h>
>  #include <linux/smp.h>
>  #include <asm/atomic.h>
>
> @@ -26,7 +24,7 @@
>  static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
>  struct ____cacheline_aligned queue {
>  	struct list_head list;
> -	spinlock_t lock;
> +	rwlock_t rwlock;
>  };
>
>  static __cacheline_aligned struct queue call_function_queues[NQUEUES];
> @@ -40,7 +38,7 @@
>  	struct call_single_data csd;
>  	atomic_t refs;
>  	cpumask_t cpumask;
> -	struct rcu_head rcu_head;
> +	struct list_head cull_list;
>  };
>
>  struct call_single_queue {
> @@ -98,15 +96,6 @@
>  		csd_flag_wait(data);
>  }
>
> -static void rcu_free_call_data(struct rcu_head *head)
> -{
> -	struct call_function_data *data;
> -
> -	data = container_of(head, struct call_function_data, rcu_head);
> -
> -	kfree(data);
> -}
> -
>  /*
>   * Invoked by arch to handle an IPI for call function. Must be called with
>   * interrupts disabled.
> @@ -115,17 +104,14 @@
>  {
>  	struct call_function_data *data;
>  	struct queue *queue;
> +	LIST_HEAD(cull_list);
>  	int cpu = get_cpu();
>
>  	queue = &call_function_queues[queue_no];
> -
> -	/*
> -	 * It's ok to use list_for_each_rcu() here even though we may delete
> -	 * 'pos', since list_del_rcu() doesn't clear ->next
> -	 */
> -	rcu_read_lock();
> -	list_for_each_entry_rcu(data, &queue->list, csd.list) {
> -		if (!cpu_isset(cpu, data->cpumask))
> +
> +	read_lock(&queue->rwlock);
> +	list_for_each_entry(data, &queue->list, csd.list) {
> + 		if (!cpu_isset(cpu, data->cpumask))
>  			continue;
>
>  		data->csd.func(data->csd.info);
> @@ -134,22 +120,36 @@
>  		if (!atomic_dec_and_test(&data->refs))
>  			continue;
>
> -		spin_lock(&queue->lock);
> -		list_del_rcu(&data->csd.list);
> -		spin_unlock(&queue->lock);
> -
>  		if (data->csd.flags & CSD_FLAG_WAIT) {
>  			/*
> -			 * serialize stores to data with the flag clear
> -			 * and wakeup
> +			 * Serialize stores to data with the flag
> +			 * clear and wakeup.  Waiter will remove us
> +			 * from the list.
>  			 */
>  			smp_wmb();
>  			data->csd.flags &= ~CSD_FLAG_WAIT;
> +		} else {
> +			/*
> +			 * If there's no waiter, then the data must
> +			 * have been heap-allocated.
> +			 */
> +			BUG_ON(!(data->csd.flags & CSD_FLAG_ALLOC));
> +
> +			list_add_tail(&data->cull_list, &cull_list);
>  		}
> -		if (data->csd.flags & CSD_FLAG_ALLOC)
> -			call_rcu(&data->rcu_head, rcu_free_call_data);
>  	}
> -	rcu_read_unlock();
> +	read_unlock(&queue->rwlock);
> +
> +	if (!list_empty(&cull_list)) {
> +		struct call_function_data *next;
> +
> +		write_lock(&queue->rwlock);
> +		list_for_each_entry_safe(data, next, &cull_list, cull_list) {
> +			list_del(&data->csd.list);
> +			kfree(data);
> +		}
> +		write_unlock(&queue->rwlock);
> +	}
>
>  	put_cpu();
>  }
> @@ -271,42 +271,6 @@
>  	generic_exec_single(cpu, data);
>  }
>
> -/* Dummy function */
> -static void quiesce_dummy(void *unused)
> -{
> -}
> -
> -/*
> - * Ensure stack based data used in call function mask is safe to free.
> - *
> - * This is needed by smp_call_function_mask when using on-stack data, because
> - * a single call function queue is shared by all CPUs, and any CPU may pick up
> - * the data item on the queue at any time before it is deleted. So we need to
> - * ensure that all CPUs have transitioned through a quiescent state after
> - * this call.
> - *
> - * This is a very slow function, implemented by sending synchronous IPIs to
> - * all possible CPUs. For this reason, we have to alloc data rather than use
> - * stack based data even in the case of synchronous calls. The stack based
> - * data is then just used for deadlock/oom fallback which will be very rare.
> - *
> - * If a faster scheme can be made, we could go back to preferring stack based
> - * data -- the data allocation/free is non-zero cost.
> - */
> -static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
> -{
> -	struct call_single_data data;
> -	int cpu;
> -
> -	data.func = quiesce_dummy;
> -	data.info = NULL;
> -
> -	for_each_cpu_mask(cpu, mask) {
> -		data.flags = CSD_FLAG_WAIT;
> -		generic_exec_single(cpu, &data);
> -	}
> -}
> -
>  /**
>   * smp_call_function_mask(): Run a function on a set of other CPUs.
>   * @mask: The set of cpus to run on.
> @@ -332,7 +296,6 @@
>  	cpumask_t allbutself;
>  	unsigned long flags;
>  	int cpu, num_cpus;
> -	int slowpath = 0;
>  	unsigned queue_no;
>  	struct queue *queue;
>
> @@ -359,16 +322,20 @@
>  		return smp_call_function_single(cpu, func, info, wait);
>  	}
>
> -	data = kmalloc(sizeof(*data), GFP_ATOMIC);
> -	if (data) {
> -		data->csd.flags = CSD_FLAG_ALLOC;
> -		if (wait)
> -			data->csd.flags |= CSD_FLAG_WAIT;
> -	} else {
> +	/*
> +	 * Allocate data if it's an async call, otherwise use stack.
> +	 * If the allocation fails, then convert it to a sync call and
> +	 * use the stack anyway.
> +	 */
> +	if (!wait) {
> +		data = kmalloc(sizeof(*data), GFP_ATOMIC);
> +		if (data)
> +			data->csd.flags = CSD_FLAG_ALLOC;
> +	}
> +	if (!data) {
>  		data = &d;
>  		data->csd.flags = CSD_FLAG_WAIT;
>  		wait = 1;
> -		slowpath = 1;
>  	}
>
>  	data->csd.func = func;
> @@ -376,9 +343,9 @@
>  	atomic_set(&data->refs, num_cpus);
>  	data->cpumask = mask;
>
> -	spin_lock_irqsave(&queue->lock, flags);
> -	list_add_tail_rcu(&data->csd.list, &queue->list);
> -	spin_unlock_irqrestore(&queue->lock, flags);
> +	write_lock_irqsave(&queue->rwlock, flags);
> +	list_add_tail(&data->csd.list, &queue->list);
> +	write_unlock_irqrestore(&queue->rwlock, flags);
>
>  	/* Send a message to all CPUs in the map */
>  	arch_send_call_function_ipi(mask);
> @@ -386,8 +353,13 @@
>  	/* optionally wait for the CPUs to complete */
>  	if (wait) {
>  		csd_flag_wait(&data->csd);
> -		if (unlikely(slowpath))
> -			smp_call_function_mask_quiesce_stack(mask);
> +
> +		write_lock_irqsave(&queue->rwlock, flags);
> +		list_del(&data->csd.list);
> +		write_unlock_irqrestore(&queue->rwlock, flags);
> +
> +		/* We should never wait for allocated data. */
> +		BUG_ON(data->csd.flags & CSD_FLAG_ALLOC);
>  	}
>
>  	return 0;
> @@ -425,7 +397,7 @@
>  	int i;
>
>  	for(i = 0; i < NQUEUES; i++)
> -		spin_lock(&call_function_queues[i].lock);
> +		write_lock(&call_function_queues[i].rwlock);
>  }
>
>  void ipi_call_unlock(void)
> @@ -433,7 +405,7 @@
>  	int i;
>
>  	for(i = 0; i < NQUEUES; i++)
> -		spin_unlock(&call_function_queues[i].lock);
> +		write_unlock(&call_function_queues[i].rwlock);
>  }
>
>  void ipi_call_lock_irq(void)
> @@ -443,7 +415,7 @@
>  	spin_lock_irq(&queues_lock);
>
>  	for(i = 0; i < NQUEUES; i++)
> -		spin_lock_nest_lock(&call_function_queues[i].lock, &queues_lock);
> +		write_lock(&call_function_queues[i].rwlock);
>  }
>
>  void ipi_call_unlock_irq(void)
> @@ -451,7 +423,7 @@
>  	int i;
>
>  	for(i = 0; i < NQUEUES; i++)
> -		spin_unlock(&call_function_queues[i].lock);
> +		write_unlock(&call_function_queues[i].rwlock);
>
>  	spin_unlock_irq(&queues_lock);
>  }
> @@ -463,7 +435,7 @@
>
>  	for(i = 0; i < NQUEUES; i++) {
>  		INIT_LIST_HEAD(&call_function_queues[i].list);
> -		spin_lock_init(&call_function_queues[i].lock);
> +		rwlock_init(&call_function_queues[i].rwlock);
>  	}
>
>  	printk(KERN_INFO "smp function calls: using %d/%d queues\n",
