Message-ID: <51AD9563.5020404@linux.vnet.ibm.com>
Date: Tue, 04 Jun 2013 12:51:07 +0530
From: Raghavendra K T <raghavendra.kt@...ux.vnet.ibm.com>
To: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
CC: stefan.bader@...onical.com, gleb@...hat.com, mingo@...hat.com,
jeremy@...p.org, x86@...nel.org, hpa@...or.com,
pbonzini@...hat.com, linux-doc@...r.kernel.org,
habanero@...ux.vnet.ibm.com, xen-devel@...ts.xensource.com,
peterz@...radead.org, mtosatti@...hat.com,
stefano.stabellini@...citrix.com, andi@...stfloor.org,
attilio.rao@...rix.com, ouyang@...pitt.edu, gregkh@...e.de,
agraf@...e.de, chegu_vinod@...com, torvalds@...ux-foundation.org,
avi.kivity@...il.com, tglx@...utronix.de, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org, stephan.diestelhorst@....com,
riel@...hat.com, drjones@...hat.com,
virtualization@...ts.linux-foundation.org,
srivatsa.vaddagiri@...il.com
Subject: Re: [PATCH RFC V9 5/19] xen/pvticketlock: Xen implementation for
PV ticket locks
On 06/03/2013 09:33 PM, Konrad Rzeszutek Wilk wrote:
> On Sat, Jun 01, 2013 at 12:23:14PM -0700, Raghavendra K T wrote:
>> xen/pvticketlock: Xen implementation for PV ticket locks
>>
>> From: Jeremy Fitzhardinge <jeremy.fitzhardinge@...rix.com>
>>
>> Replace the old Xen implementation of PV spinlocks with an implementation
>> of xen_lock_spinning and xen_unlock_kick.
>>
>> xen_lock_spinning simply registers the cpu in its entry in lock_waiting,
>> adds itself to the waiting_cpus set, and blocks on an event channel
>> until the channel becomes pending.
>>
>> xen_unlock_kick searches the cpus in waiting_cpus looking for the one
>> which wants this lock with the next ticket, if any. If found,
>> it kicks it by making its event channel pending, which wakes it up.
>>
>> We need to make sure interrupts are disabled while we're relying on the
>> contents of the per-cpu lock_waiting values; otherwise an interrupt
>> handler could come in, try to take some other lock, block, and overwrite
>> our values.
>>
>> Raghu: use function + enum instead of macro, cmpxchg for zero status reset
>>
>> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@...rix.com>
>> Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
>> Signed-off-by: Raghavendra K T <raghavendra.kt@...ux.vnet.ibm.com>
>> ---
>> arch/x86/xen/spinlock.c | 347 +++++++++++------------------------------------
>> 1 file changed, 78 insertions(+), 269 deletions(-)
>>
>> diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
>> index d6481a9..860e190 100644
>> --- a/arch/x86/xen/spinlock.c
>> +++ b/arch/x86/xen/spinlock.c
>> @@ -16,45 +16,44 @@
>> #include "xen-ops.h"
>> #include "debugfs.h"
>>
>> -#ifdef CONFIG_XEN_DEBUG_FS
>> -static struct xen_spinlock_stats
>> -{
>> - u64 taken;
>> - u32 taken_slow;
>> - u32 taken_slow_nested;
>> - u32 taken_slow_pickup;
>> - u32 taken_slow_spurious;
>> - u32 taken_slow_irqenable;
>> +enum xen_contention_stat {
>> + TAKEN_SLOW,
>> + TAKEN_SLOW_PICKUP,
>> + TAKEN_SLOW_SPURIOUS,
>> + RELEASED_SLOW,
>> + RELEASED_SLOW_KICKED,
>> + NR_CONTENTION_STATS
>> +};
>>
>> - u64 released;
>> - u32 released_slow;
>> - u32 released_slow_kicked;
>>
>> +#ifdef CONFIG_XEN_DEBUG_FS
>> #define HISTO_BUCKETS 30
>> - u32 histo_spin_total[HISTO_BUCKETS+1];
>> - u32 histo_spin_spinning[HISTO_BUCKETS+1];
>> +static struct xen_spinlock_stats
>> +{
>> + u32 contention_stats[NR_CONTENTION_STATS];
>> u32 histo_spin_blocked[HISTO_BUCKETS+1];
>> -
>> - u64 time_total;
>> - u64 time_spinning;
>> u64 time_blocked;
>> } spinlock_stats;
>>
>> static u8 zero_stats;
>>
>> -static unsigned lock_timeout = 1 << 10;
>> -#define TIMEOUT lock_timeout
>> -
>> static inline void check_zero(void)
>> {
>> - if (unlikely(zero_stats)) {
>> - memset(&spinlock_stats, 0, sizeof(spinlock_stats));
>> - zero_stats = 0;
>> + u8 ret;
>> + u8 old = ACCESS_ONCE(zero_stats);
>> + if (unlikely(old)) {
>> + ret = cmpxchg(&zero_stats, old, 0);
>> + /* This ensures only one fellow resets the stat */
>> + if (ret == old)
>> + memset(&spinlock_stats, 0, sizeof(spinlock_stats));
>> }
>> }
>>
>> -#define ADD_STATS(elem, val) \
>> - do { check_zero(); spinlock_stats.elem += (val); } while(0)
>> +static inline void add_stats(enum xen_contention_stat var, u32 val)
>> +{
>> + check_zero();
>> + spinlock_stats.contention_stats[var] += val;
>> +}
>>
>> static inline u64 spin_time_start(void)
>> {
>> @@ -73,22 +72,6 @@ static void __spin_time_accum(u64 delta, u32 *array)
>> array[HISTO_BUCKETS]++;
>> }
>>
>> -static inline void spin_time_accum_spinning(u64 start)
>> -{
>> - u32 delta = xen_clocksource_read() - start;
>> -
>> - __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
>> - spinlock_stats.time_spinning += delta;
>> -}
>> -
>> -static inline void spin_time_accum_total(u64 start)
>> -{
>> - u32 delta = xen_clocksource_read() - start;
>> -
>> - __spin_time_accum(delta, spinlock_stats.histo_spin_total);
>> - spinlock_stats.time_total += delta;
>> -}
>> -
>> static inline void spin_time_accum_blocked(u64 start)
>> {
>> u32 delta = xen_clocksource_read() - start;
>> @@ -98,19 +81,15 @@ static inline void spin_time_accum_blocked(u64 start)
>> }
>> #else /* !CONFIG_XEN_DEBUG_FS */
>> #define TIMEOUT (1 << 10)
>> -#define ADD_STATS(elem, val) do { (void)(val); } while(0)
>> +static inline void add_stats(enum xen_contention_stat var, u32 val)
>> +{
>> +}
>>
>> static inline u64 spin_time_start(void)
>> {
>> return 0;
>> }
>>
>> -static inline void spin_time_accum_total(u64 start)
>> -{
>> -}
>> -static inline void spin_time_accum_spinning(u64 start)
>> -{
>> -}
>> static inline void spin_time_accum_blocked(u64 start)
>> {
>> }
>> @@ -133,229 +112,82 @@ typedef u16 xen_spinners_t;
>> asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
>> #endif
>>
>> -struct xen_spinlock {
>> - unsigned char lock; /* 0 -> free; 1 -> locked */
>> - xen_spinners_t spinners; /* count of waiting cpus */
>> +struct xen_lock_waiting {
>> + struct arch_spinlock *lock;
>> + __ticket_t want;
>> };
>>
>> static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
>> +static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
>> +static cpumask_t waiting_cpus;
>>
>> -#if 0
>> -static int xen_spin_is_locked(struct arch_spinlock *lock)
>> -{
>> - struct xen_spinlock *xl = (struct xen_spinlock *)lock;
>> -
>> - return xl->lock != 0;
>> -}
>> -
>> -static int xen_spin_is_contended(struct arch_spinlock *lock)
>> +static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
>> {
>> - struct xen_spinlock *xl = (struct xen_spinlock *)lock;
>> -
>> - /* Not strictly true; this is only the count of contended
>> - lock-takers entering the slow path. */
>> - return xl->spinners != 0;
>> -}
>> -
>> -static int xen_spin_trylock(struct arch_spinlock *lock)
>> -{
>> - struct xen_spinlock *xl = (struct xen_spinlock *)lock;
>> - u8 old = 1;
>> -
>> - asm("xchgb %b0,%1"
>> - : "+q" (old), "+m" (xl->lock) : : "memory");
>> -
>> - return old == 0;
>> -}
>> -
>> -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
>> -
>> -/*
>> - * Mark a cpu as interested in a lock. Returns the CPU's previous
>> - * lock of interest, in case we got preempted by an interrupt.
>> - */
>> -static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
>> -{
>> - struct xen_spinlock *prev;
>> -
>> - prev = __this_cpu_read(lock_spinners);
>> - __this_cpu_write(lock_spinners, xl);
>> -
>> - wmb(); /* set lock of interest before count */
>> -
>> - inc_spinners(xl);
>> -
>> - return prev;
>> -}
>> -
>> -/*
>> - * Mark a cpu as no longer interested in a lock. Restores previous
>> - * lock of interest (NULL for none).
>> - */
>> -static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
>> -{
>> - dec_spinners(xl);
>> - wmb(); /* decrement count before restoring lock */
>> - __this_cpu_write(lock_spinners, prev);
>> -}
>> -
>> -static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
>> -{
>> - struct xen_spinlock *xl = (struct xen_spinlock *)lock;
>> - struct xen_spinlock *prev;
>> int irq = __this_cpu_read(lock_kicker_irq);
>> - int ret;
>> + struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
>> + int cpu = smp_processor_id();
>> u64 start;
>> + unsigned long flags;
>>
>> /* If kicker interrupts not initialized yet, just spin */
>> if (irq == -1)
>> - return 0;
>> + return;
>>
>> start = spin_time_start();
>>
>> - /* announce we're spinning */
>> - prev = spinning_lock(xl);
>> -
>> - ADD_STATS(taken_slow, 1);
>> - ADD_STATS(taken_slow_nested, prev != NULL);
>> -
>> - do {
>> - unsigned long flags;
>> -
>> - /* clear pending */
>> - xen_clear_irq_pending(irq);
>> -
>> - /* check again make sure it didn't become free while
>> - we weren't looking */
>> - ret = xen_spin_trylock(lock);
>> - if (ret) {
>> - ADD_STATS(taken_slow_pickup, 1);
>> -
>> - /*
>> - * If we interrupted another spinlock while it
>> - * was blocking, make sure it doesn't block
>> - * without rechecking the lock.
>> - */
>> - if (prev != NULL)
>> - xen_set_irq_pending(irq);
>> - goto out;
>> - }
>> + /*
>> + * Make sure an interrupt handler can't upset things in a
>> + * partially setup state.
>> + */
>> + local_irq_save(flags);
>>
>> - flags = arch_local_save_flags();
>> - if (irq_enable) {
>> - ADD_STATS(taken_slow_irqenable, 1);
>> - raw_local_irq_enable();
>> - }
>> + w->want = want;
>> + smp_wmb();
>> + w->lock = lock;
>>
>> - /*
>> - * Block until irq becomes pending. If we're
>> - * interrupted at this point (after the trylock but
>> - * before entering the block), then the nested lock
>> - * handler guarantees that the irq will be left
>> - * pending if there's any chance the lock became free;
>> - * xen_poll_irq() returns immediately if the irq is
>> - * pending.
>> - */
>> - xen_poll_irq(irq);
>> + /* This uses set_bit, which atomic and therefore a barrier */
>> + cpumask_set_cpu(cpu, &waiting_cpus);
>> + add_stats(TAKEN_SLOW, 1);
>>
>> - raw_local_irq_restore(flags);
>> + /* clear pending */
>> + xen_clear_irq_pending(irq);
>>
>> - ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
>> - } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
>> + /* Only check lock once pending cleared */
>> + barrier();
>>
>> + /* check again make sure it didn't become free while
>> + we weren't looking */
>> + if (ACCESS_ONCE(lock->tickets.head) == want) {
>> + add_stats(TAKEN_SLOW_PICKUP, 1);
>> + goto out;
>> + }
>> + /* Block until irq becomes pending (or perhaps a spurious wakeup) */
>> + xen_poll_irq(irq);
>> + add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
>> kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
>> -
>> out:
>> - unspinning_lock(xl, prev);
>> + cpumask_clear_cpu(cpu, &waiting_cpus);
>> + w->lock = NULL;
>> + local_irq_restore(flags);
>> spin_time_accum_blocked(start);
>> -
>> - return ret;
>> }
>>
>> -static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
>> -{
>> - struct xen_spinlock *xl = (struct xen_spinlock *)lock;
>> - unsigned timeout;
>> - u8 oldval;
>> - u64 start_spin;
>> -
>> - ADD_STATS(taken, 1);
>> -
>> - start_spin = spin_time_start();
>> -
>> - do {
>> - u64 start_spin_fast = spin_time_start();
>> -
>> - timeout = TIMEOUT;
>> -
>> - asm("1: xchgb %1,%0\n"
>> - " testb %1,%1\n"
>> - " jz 3f\n"
>> - "2: rep;nop\n"
>> - " cmpb $0,%0\n"
>> - " je 1b\n"
>> - " dec %2\n"
>> - " jnz 2b\n"
>> - "3:\n"
>> - : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
>> - : "1" (1)
>> - : "memory");
>> -
>> - spin_time_accum_spinning(start_spin_fast);
>> -
>> - } while (unlikely(oldval != 0 &&
>> - (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
>> -
>> - spin_time_accum_total(start_spin);
>> -}
>> -
>> -static void xen_spin_lock(struct arch_spinlock *lock)
>> -{
>> - __xen_spin_lock(lock, false);
>> -}
>> -
>> -static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
>> -{
>> - __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
>> -}
>> -
>> -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
>> +static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
>> {
>> int cpu;
>>
>> - ADD_STATS(released_slow, 1);
>> + add_stats(RELEASED_SLOW, 1);
>> +
>> + for_each_cpu(cpu, &waiting_cpus) {
>> + const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
>>
>> - for_each_online_cpu(cpu) {
>> - /* XXX should mix up next cpu selection */
>> - if (per_cpu(lock_spinners, cpu) == xl) {
>> - ADD_STATS(released_slow_kicked, 1);
>> + if (w->lock == lock && w->want == next) {
>> + add_stats(RELEASED_SLOW_KICKED, 1);
>> xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
>
> When this was initially implemented, there was a "break" here. But
> 76eaca031f0af2bb303e405986f637811956a422 (xen: Send spinlock IPI to all waiters)
> fixed an issue and changed this to send an IPI to all of the CPUs. That
> means the 'break' was removed.
>
> With this implementation of the spinlock, you know exactly which vCPU is
> holding it, so this code should reintroduce the 'break'.
>
Thank you for spotting this. Agreed.
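
For reference, something like the below is what I have in mind for the next
version (untested sketch, same names as in the hunk above). Since each waiter
records the exact (lock, want) pair it is spinning on, at most one cpu can
match 'next', so we can stop scanning after the first kick:

static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
{
        int cpu;

        add_stats(RELEASED_SLOW, 1);

        for_each_cpu(cpu, &waiting_cpus) {
                const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);

                /*
                 * Only one waiter can want this exact ticket on this lock,
                 * so once it has been kicked there is nothing left to scan.
                 */
                if (w->lock == lock && w->want == next) {
                        add_stats(RELEASED_SLOW_KICKED, 1);
                        xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
                        break;
                }
        }
}

Will fold this into the next version.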