xen/5555a4f8-use-ticket-locks-for-spin-locks.patch

# Commit 45fcc4568c5162b00fb3907fb158af82dd484a3d
# Date 2015-05-15 09:49:12 +0200
# Author David Vrabel <david.vrabel@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
use ticket locks for spin locks

Replace the byte locks with ticket locks.  Ticket locks are: a) fair;
and b) peform better when contented since they spin without an atomic
operation.

The lock is split into two ticket values: head and tail.  A locker
acquires a ticket by (atomically) increasing tail and using the
previous tail value.  A CPU holds the lock if its ticket == head.  The
lock is released by increasing head.

spin_lock_irq() and spin_lock_irqsave() now spin with irqs disabled
(previously, they would spin with irqs enabled if possible).  This is
required to prevent deadlocks when the irq handler tries to take the
same lock with a higher ticket.

Architectures need only provide arch_fetch_and_add() and two barriers:
arch_lock_acquire_barrier() and arch_lock_release_barrier().

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/common/spinlock.c
+++ b/xen/common/spinlock.c
@@ -115,125 +115,134 @@ void spin_debug_disable(void)

 #endif

+static always_inline spinlock_tickets_t observe_lock(spinlock_tickets_t *t)
+{
+    spinlock_tickets_t v;
+
+    smp_rmb();
+    v.head_tail = read_atomic(&t->head_tail);
+    return v;
+}
+
+static always_inline u16 observe_head(spinlock_tickets_t *t)
+{
+    smp_rmb();
+    return read_atomic(&t->head);
+}
+
 void _spin_lock(spinlock_t *lock)
 {
+    spinlock_tickets_t tickets = SPINLOCK_TICKET_INC;
     LOCK_PROFILE_VAR;

     check_lock(&lock->debug);
-    while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
+    tickets.head_tail = arch_fetch_and_add(&lock->tickets.head_tail,
+                                           tickets.head_tail);
+    while ( tickets.tail != observe_head(&lock->tickets) )
     {
         LOCK_PROFILE_BLOCK;
-        while ( likely(_raw_spin_is_locked(&lock->raw)) )
-            cpu_relax();
+        cpu_relax();
     }
     LOCK_PROFILE_GOT;
     preempt_disable();
+    arch_lock_acquire_barrier();
 }

 void _spin_lock_irq(spinlock_t *lock)
 {
-    LOCK_PROFILE_VAR;
-
     ASSERT(local_irq_is_enabled());
     local_irq_disable();
-    check_lock(&lock->debug);
-    while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
-    {
-        LOCK_PROFILE_BLOCK;
-        local_irq_enable();
-        while ( likely(_raw_spin_is_locked(&lock->raw)) )
-            cpu_relax();
-        local_irq_disable();
-    }
-    LOCK_PROFILE_GOT;
-    preempt_disable();
+    _spin_lock(lock);
 }

 unsigned long _spin_lock_irqsave(spinlock_t *lock)
 {
     unsigned long flags;
-    LOCK_PROFILE_VAR;

     local_irq_save(flags);
-    check_lock(&lock->debug);
-    while ( unlikely(!_raw_spin_trylock(&lock->raw)) )
-    {
-        LOCK_PROFILE_BLOCK;
-        local_irq_restore(flags);
-        while ( likely(_raw_spin_is_locked(&lock->raw)) )
-            cpu_relax();
-        local_irq_save(flags);
-    }
-    LOCK_PROFILE_GOT;
-    preempt_disable();
+    _spin_lock(lock);
     return flags;
 }

 void _spin_unlock(spinlock_t *lock)
 {
+    arch_lock_release_barrier();
     preempt_enable();
     LOCK_PROFILE_REL;
-    _raw_spin_unlock(&lock->raw);
+    add_sized(&lock->tickets.head, 1);
 }

 void _spin_unlock_irq(spinlock_t *lock)
 {
-    preempt_enable();
-    LOCK_PROFILE_REL;
-    _raw_spin_unlock(&lock->raw);
+    _spin_unlock(lock);
     local_irq_enable();
 }

 void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
-    preempt_enable();
-    LOCK_PROFILE_REL;
-    _raw_spin_unlock(&lock->raw);
+    _spin_unlock(lock);
     local_irq_restore(flags);
 }

 int _spin_is_locked(spinlock_t *lock)
 {
     check_lock(&lock->debug);
-    return _raw_spin_is_locked(&lock->raw);
+    return lock->tickets.head != lock->tickets.tail;
 }

 int _spin_trylock(spinlock_t *lock)
 {
+    spinlock_tickets_t old, new;
+
     check_lock(&lock->debug);
-    if ( !_raw_spin_trylock(&lock->raw) )
+    old = observe_lock(&lock->tickets);
+    if ( old.head != old.tail )
+        return 0;
+    new = old;
+    new.tail++;
+    if ( cmpxchg(&lock->tickets.head_tail,
+                 old.head_tail, new.head_tail) != old.head_tail )
         return 0;
 #ifdef LOCK_PROFILE
     if (lock->profile)
         lock->profile->time_locked = NOW();
 #endif
     preempt_disable();
+    /*
+     * cmpxchg() is a full barrier so no need for an
+     * arch_lock_acquire_barrier().
+     */
     return 1;
 }

 void _spin_barrier(spinlock_t *lock)
 {
+    spinlock_tickets_t sample;
 #ifdef LOCK_PROFILE
     s_time_t block = NOW();
-    u64      loop = 0;
+#endif

     check_barrier(&lock->debug);
-    do { smp_mb(); loop++;} while ( _raw_spin_is_locked(&lock->raw) );
-    if ((loop > 1) && lock->profile)
+    smp_mb();
+    sample = observe_lock(&lock->tickets);
+    if ( sample.head != sample.tail )
     {
-        lock->profile->time_block += NOW() - block;
-        lock->profile->block_cnt++;
-    }
-#else
-    check_barrier(&lock->debug);
-    do { smp_mb(); } while ( _raw_spin_is_locked(&lock->raw) );
+        while ( observe_head(&lock->tickets) == sample.head )
+            cpu_relax();
+#ifdef LOCK_PROFILE
+        if ( lock->profile )
+        {
+            lock->profile->time_block += NOW() - block;
+            lock->profile->block_cnt++;
+        }
 #endif
+    }
     smp_mb();
 }

 int _spin_trylock_recursive(spinlock_t *lock)
 {
-    int cpu = smp_processor_id();
+    unsigned int cpu = smp_processor_id();

     /* Don't allow overflow of recurse_cpu field. */
     BUILD_BUG_ON(NR_CPUS > 0xfffu);
@@ -256,8 +265,17 @@ int _spin_trylock_recursive(spinlock_t *

 void _spin_lock_recursive(spinlock_t *lock)
 {
-    while ( !spin_trylock_recursive(lock) )
-        cpu_relax();
+    unsigned int cpu = smp_processor_id();
+
+    if ( likely(lock->recurse_cpu != cpu) )
+    {
+        _spin_lock(lock);
+        lock->recurse_cpu = cpu;
+    }
+
+    /* We support only fairly shallow recursion, else the counter overflows. */
+    ASSERT(lock->recurse_cnt < 0xfu);
+    lock->recurse_cnt++;
 }

 void _spin_unlock_recursive(spinlock_t *lock)
--- a/xen/include/asm-arm/system.h
+++ b/xen/include/asm-arm/system.h
@@ -53,6 +53,9 @@

 #define arch_fetch_and_add(x, v) __sync_fetch_and_add(x, v)

+#define arch_lock_acquire_barrier() smp_mb()
+#define arch_lock_release_barrier() smp_mb()
+
 extern struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next);

 #endif
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -185,6 +185,17 @@ static always_inline unsigned long __xad
 #define set_mb(var, value) do { xchg(&var, value); } while (0)
 #define set_wmb(var, value) do { var = value; wmb(); } while (0)

+/*
+ * On x86 the only reordering is of reads with older writes.  In the
+ * lock case, the read in observe_head() can only be reordered with
+ * writes that precede it, and moving a write _into_ a locked section
+ * is OK.  In the release case, the write in add_sized() can only be
+ * reordered with reads that follow it, and hoisting a read _into_ a
+ * locked region is OK.
+ */
+#define arch_lock_acquire_barrier() barrier()
+#define arch_lock_release_barrier() barrier()
+
 #define local_irq_disable()     asm volatile ( "cli" : : : "memory" )
 #define local_irq_enable()      asm volatile ( "sti" : : : "memory" )

--- a/xen/include/xen/spinlock.h
+++ b/xen/include/xen/spinlock.h
@@ -80,8 +80,7 @@ struct lock_profile_qhead {
     static struct lock_profile *__lock_profile_##name                         \
     __used_section(".lockprofile.data") =                                     \
     &__lock_profile_data_##name
-#define _SPIN_LOCK_UNLOCKED(x) { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0,          \
-                                 _LOCK_DEBUG, x }
+#define _SPIN_LOCK_UNLOCKED(x) { { 0 }, 0xfffu, 0, _LOCK_DEBUG, x }
 #define SPIN_LOCK_UNLOCKED _SPIN_LOCK_UNLOCKED(NULL)
 #define DEFINE_SPINLOCK(l)                                                    \
     spinlock_t l = _SPIN_LOCK_UNLOCKED(NULL);                                 \
@@ -117,8 +116,7 @@ extern void spinlock_profile_reset(unsig

 struct lock_profile_qhead { };

-#define SPIN_LOCK_UNLOCKED                                                    \
-    { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG }
+#define SPIN_LOCK_UNLOCKED { { 0 }, 0xfffu, 0, _LOCK_DEBUG }
 #define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED

 #define spin_lock_init_prof(s, l) spin_lock_init(&((s)->l))
@@ -127,8 +125,18 @@ struct lock_profile_qhead { };

 #endif

+typedef union {
+    u32 head_tail;
+    struct {
+        u16 head;
+        u16 tail;
+    };
+} spinlock_tickets_t;
+
+#define SPINLOCK_TICKET_INC { .head_tail = 0x10000, }
+
 typedef struct spinlock {
-    raw_spinlock_t raw;
+    spinlock_tickets_t tickets;
     u16 recurse_cpu:12;
     u16 recurse_cnt:4;
     struct lock_debug debug;