Message-ID: <20160420120805.GB3408@twins.programming.kicks-ass.net>
Date:	Wed, 20 Apr 2016 14:08:05 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	Waiman Long <Waiman.Long@....com>
Cc:	Ingo Molnar <mingo@...hat.com>, linux-kernel@...r.kernel.org,
	Pan Xinhui <xinhui@...ux.vnet.ibm.com>,
	Scott J Norton <scott.norton@....com>,
	Douglas Hatch <doug.hatch@....com>
Subject: Re: [PATCH v2] locking/pvqspinlock: Add lock holder CPU argument to
 pv_wait()

On Thu, Apr 14, 2016 at 02:41:58PM -0400, Waiman Long wrote:
> Pan Xinhui was asking for a lock holder cpu argument in pv_wait()
> to help the porting of pvqspinlock to PPC. The new argument can
> potentially help the hypervisor expedite the execution of the critical
> section so that the lock holder vCPU can release the lock sooner.
> 
> This patch does just that by storing the previous node vCPU number.
> In pv_wait_head_or_lock(), pv_wait() will be called with that vCPU
> number as it is likely to be the lock holder.
> 
> In pv_wait_node(), the newly added pv_lookup_hash() function will
> be called to look up the queue head and pass in the lock holder vCPU
> number stored there.
> 
> This patch introduces negligible overhead to the current pvqspinlock
> code. The extra lockcpu argument isn't currently used on the x86
> architecture.

This Changelog is completely useless; it does not explain how this
works at all.


> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index ce2f75e..99f31e4 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
>   */
>  
>  static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
> -static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
> +static __always_inline void __pv_wait_node(struct qspinlock *lock,
> +					   struct mcs_spinlock *node,
>  					   struct mcs_spinlock *prev) { }
>  static __always_inline void __pv_kick_node(struct qspinlock *lock,
>  					   struct mcs_spinlock *node) { }
> @@ -407,7 +408,7 @@ queue:
>  		prev = decode_tail(old);
>  		WRITE_ONCE(prev->next, node);
>  
> -		pv_wait_node(node, prev);
> +		pv_wait_node(lock, node, prev);
>  		arch_mcs_spin_lock_contended(&node->locked);
>  
>  		/*
> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
> index 21ede57..895224e 100644
> --- a/kernel/locking/qspinlock_paravirt.h
> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -51,6 +51,7 @@ struct pv_node {
>  	struct mcs_spinlock	__res[3];
>  
>  	int			cpu;
> +	int			prev_cpu;	/* Previous node cpu */

That is a horrible name; what is a 'node cpu'?

>  	u8			state;
>  };
>  
> @@ -156,8 +157,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
>   * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
>   *
>   * Since we should not be holding locks from NMI context (very rare indeed) the
> - * max load factor is 0.75, which is around the point where open addressing
> - * breaks down.
> + * max load factor is 0.75.

Why? Isn't that true anymore?

>   *
>   */
>  struct pv_hash_entry {
> @@ -251,6 +251,31 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
>  }
>  
>  /*
> + * Look up the given lock in the hash table
> + * Return the pv_node if found, NULL otherwise
> + */
> +static struct pv_node *pv_lookup_hash(struct qspinlock *lock)
> +{
> +	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
> +	struct pv_hash_entry *he;
> +
> +	for_each_hash_entry(he, offset, hash) {
> +		struct qspinlock *l = READ_ONCE(he->lock);
> +
> +		if (l == lock)

The other loop writes:

		if (READ_ONCE(he->lock) == lock)

> +			return READ_ONCE(he->node);
> +		/*
> +		 * Presence of an empty slot signals the end of the search. We
> +		 * may miss the entry, but that will limit the amount of
> +		 * time doing the search when the desired entry isn't there.
> +		 */
> +		else if (!l)
> +			break;

That 'else' is entirely pointless. Also, why isn't this: return NULL;

> +	}
> +	return NULL;

and this BUG() ?
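
That is, restructured along these lines (untested, just a sketch of the
shape I'd expect; all the names are taken from the patch above):

	static struct pv_node *pv_lookup_hash(struct qspinlock *lock)
	{
		unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
		struct pv_hash_entry *he;

		for_each_hash_entry(he, offset, hash) {
			struct qspinlock *l = READ_ONCE(he->lock);

			if (l == lock)
				return READ_ONCE(he->node);
			/*
			 * An empty slot means the lock is not (yet) hashed;
			 * stop searching.
			 */
			if (!l)
				return NULL;
		}
		/* The table is never full, so we must hit an empty slot. */
		BUG();
	}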

> +}
> +
> +/*
>   * Return true when it is time to check the previous node which is not
>   * in a running state.
>   */
> @@ -275,6 +300,7 @@ static void pv_init_node(struct mcs_spinlock *node)
>  
>  	pn->cpu = smp_processor_id();
>  	pn->state = vcpu_running;
> +	pn->prev_cpu = -1;

This does not match the struct element order.

>  }
>  
>  /*
> @@ -282,7 +308,8 @@ static void pv_init_node(struct mcs_spinlock *node)
>   * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
>   * behalf.
>   */
> -static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
> +static void pv_wait_node(struct qspinlock *lock, struct mcs_spinlock *node,
> +			 struct mcs_spinlock *prev)
>  {
>  	struct pv_node *pn = (struct pv_node *)node;
>  	struct pv_node *pp = (struct pv_node *)prev;
> @@ -290,6 +317,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>  	int loop;
>  	bool wait_early;
>  
> +	pn->prev_cpu = pp->cpu;	/* Save previous node vCPU */

again a useless comment.

> +
>  	/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
>  	for (;; waitcnt++) {
>  		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
> @@ -314,10 +343,21 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>  		smp_store_mb(pn->state, vcpu_halted);
>  
>  		if (!READ_ONCE(node->locked)) {
> +			struct pv_node *hn;
> +
>  			qstat_inc(qstat_pv_wait_node, true);
>  			qstat_inc(qstat_pv_wait_again, waitcnt);
>  			qstat_inc(qstat_pv_wait_early, wait_early);
> -			pv_wait(&pn->state, vcpu_halted);
> +
> +			/*
> +			 * We try to locate the queue head pv_node by looking
> +			 * up the hash table. If it is not found, use the
> +			 * CPU in the previous node instead.
> +			 */
> +			hn = pv_lookup_hash(lock);
> +			if (!hn)
> +				hn = pn;

This is potentially expensive... it does not explain why this lookup can
fail, etc., nor mention the lock stealing caveat.
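
At the very least that wants a comment here spelling it out; something
like this (assuming I'm reading the intent right):

	/*
	 * This lookup can fail because the lock only gets hashed once the
	 * queue head enters pv_wait_head_or_lock() (or pv_kick_node() does
	 * it on its behalf); before that there simply is no entry. And
	 * with lock stealing, the prev_cpu we find is only a guess at the
	 * lock holder, not a guarantee.
	 */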

> +			pv_wait(&pn->state, vcpu_halted, hn->prev_cpu);
>  		}
>  
>  		/*
> @@ -453,7 +493,15 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
>  		WRITE_ONCE(pn->state, vcpu_halted);
>  		qstat_inc(qstat_pv_wait_head, true);
>  		qstat_inc(qstat_pv_wait_again, waitcnt);
> -		pv_wait(&l->locked, _Q_SLOW_VAL);
> +
> +		/*
> +		 * Pass in the previous node vCPU number which is likely to be
> +		 * the lock holder vCPU. This additional information may help
> +		 * the hypervisor to give more resources to that vCPU so that
> +		 * it can release the lock faster. With lock stealing,
> +		 * however, that vCPU may not be the actual lock holder.
> +		 */
> +		pv_wait(&l->locked, _Q_SLOW_VAL, pn->prev_cpu);


urgh..


With all the holes in it, does it really still matter?

In any case, I would really only want to see this together with the
patches that make use of it, and even then it still needs numbers with
and without this thing.


