[patch] preempt-smp.patch, 2.6.9-rc1-mm5

From: Ingo Molnar
Date: Mon Sep 13 2004 - 05:25:19 EST



The attached patch is a merge of the preempt-smp patch to 2.6.9-rc1-mm5.

Changes since the -mm4 version: fixed it to work with CONFIG_LOCKMETER.

I compiled & booted all possible combinations of SMP/UP, PREEMPT and
LOCKMETER on x86 and they all work fine. I also test-built and booted the
patched kernel on x64 with SMP && PREEMPT. Details are in the patch
metadata below.

Ingo

SMP locking latencies are one of the last architectural problems that
cause millisecond-range scheduling delays. CONFIG_PREEMPT tries to solve
some of the SMP issues but there are still lots of problems remaining:
spinlocks nested at multiple levels, spinning with irqs disabled, and
non-nested spinning with preemption disabled for the whole duration of
the critical section.

The nesting problem goes like this: if a piece of kernel code (e.g. the
MM or ext3's journalling code) does the following:

spin_lock(&spinlock_1);
...
spin_lock(&spinlock_2);
...

then even with CONFIG_PREEMPT enabled, current kernels may spin on
spinlock_2 indefinitely. A number of critical sections break up their
long paths by using cond_resched_lock(), but this does not break the
path on SMP, because need_resched() *of the other CPU* (the one holding
spinlock_2) is not set, so its cond_resched_lock() doesn't notice that a
reschedule is due.

To solve this problem I've introduced a new spinlock field,
lock->break_lock, which signals to the holding CPU that a
spinlock-break is requested by another CPU. This field is only set if a
CPU is spinning in a spinlock function [at any locking depth], so the
default overhead is zero. I've extended cond_resched_lock() to check
this flag as well - in that case the lock can be broken without a
full reschedule. I've also added the lock_need_resched(lock) and
need_lockbreak(lock) methods to check for the need to break out of a
critical section.
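
For illustration, a long scan under a spinlock could use the new
primitives roughly like this (just a sketch - mylock, mylist and
process_entry() are made-up placeholders, not part of the patch):

	static spinlock_t mylock = SPIN_LOCK_UNLOCKED;
	static LIST_HEAD(mylist);

	static void scan_list(void)
	{
		struct list_head *pos, *n;

		spin_lock(&mylock);
		list_for_each_safe(pos, n, &mylist) {
			process_entry(pos);
			/*
			 * lock_need_resched() is true if this CPU needs to
			 * reschedule _or_ another CPU is spinning on mylock
			 * (i.e. mylock.break_lock is set):
			 */
			if (lock_need_resched(&mylock))
				cond_resched_lock(&mylock);
		}
		spin_unlock(&mylock);
	}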

Another latency problem was that the stock kernel, even with
CONFIG_PREEMPT enabled, didn't have any spin-nicely preemption logic for
the following, commonly used SMP locking primitives: read_lock(),
spin_lock_irqsave(), spin_lock_irq(), spin_lock_bh(),
read_lock_irqsave(), read_lock_irq(), read_lock_bh(),
write_lock_irqsave(), write_lock_irq(), write_lock_bh(). Only
spin_lock() and write_lock() [the two simplest cases] were covered.

In addition to the preemption latency problems, the _irq() variants in
the above list didn't re-enable IRQs while spinning - possibly
resulting in excessively long irqs-off sections of code!

preempt-smp.patch fixes all these latency problems by spinning
irq-nicely (if possible) and by requesting lock-breaks if needed. Two
architecture-level changes were necessary for this: the addition of the
break_lock field to spinlock_t and rwlock_t, and the addition of the
_raw_read_trylock() function.
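
The common slowpath implementing this is generated by the
BUILD_LOCK_OPS() macro in kernel/spinlock.c (see the patch below);
expanded by hand for the spin_lock_irqsave() case, it boils down to the
following loop (reproduced here only for readability):

	unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
	{
		unsigned long flags;

		preempt_disable();
		for (;;) {
			local_irq_save(flags);
			if (likely(_raw_spin_trylock(lock)))
				break;
			/* spin 'nicely': irqs and preemption enabled: */
			local_irq_restore(flags);
			preempt_enable();
			/* ask the lock holder to break its critical section: */
			if (!lock->break_lock)
				lock->break_lock = 1;
			cpu_relax();
			preempt_disable();
		}
		return flags;
	}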

Testing done by Mark H Johnson and myself indicates SMP latencies
comparable to those of the UP kernel - while without this patch they
were basically unbounded.

I successfully test-compiled and test-booted this patch on top of BK-curr
using the following .config combinations: SMP && PREEMPT, !SMP &&
PREEMPT, SMP && !PREEMPT and !SMP && !PREEMPT on x86, and !SMP && !PREEMPT
and SMP && PREEMPT on x64. I also test-booted x86 with the
generic_raw_read_trylock() function to check that it works fine.
Essentially the same patch has been in testing as part of the
voluntary-preempt patches for some time already.

NOTE to architecture maintainers: generic_raw_read_trylock() is a crude
version that should be replaced with the proper arch-optimized version
ASAP.
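
As a reference point, the x86 version added by this patch (further
below) is just the obvious atomic decrement-and-check; something along
these lines should be doable on most architectures whose rwlocks are
counter-based:

	static inline int _raw_read_trylock(rwlock_t *lock)
	{
		atomic_t *count = (atomic_t *)lock;

		atomic_dec(count);
		if (atomic_read(count) >= 0)
			return 1;	/* got the read lock */
		atomic_inc(count);	/* a writer holds the lock, back off */
		return 0;
	}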

Ingo

Signed-off-by: Ingo Molnar <mingo@xxxxxxx>

--- linux/include/asm-ppc64/spinlock.h.orig
+++ linux/include/asm-ppc64/spinlock.h
@@ -23,10 +23,16 @@

typedef struct {
volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} spinlock_t;

typedef struct {
volatile signed int lock;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} rwlock_t;

#ifdef __KERNEL__
@@ -216,6 +222,8 @@ static void __inline__ _raw_read_unlock(
: "cr0", "memory");
}

+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
/*
* This returns the old value in the lock,
* so we got the write lock if the return value is 0.
--- linux/include/asm-parisc/system.h.orig
+++ linux/include/asm-parisc/system.h
@@ -176,6 +176,9 @@ typedef struct {
void *previous;
struct task_struct * task;
#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} spinlock_t;

#define __lock_aligned __attribute__((__section__(".data.lock_aligned")))
--- linux/include/asm-parisc/spinlock.h.orig
+++ linux/include/asm-parisc/spinlock.h
@@ -128,6 +128,9 @@ do { \
typedef struct {
spinlock_t lock;
volatile int counter;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} rwlock_t;

#define RW_LOCK_UNLOCKED (rwlock_t) { __SPIN_LOCK_UNLOCKED, 0 }
@@ -136,6 +139,8 @@ typedef struct {

#define rwlock_is_locked(lp) ((lp)->counter != 0)

+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
/* read_lock, read_unlock are pretty straightforward. Of course it somehow
* sucks we end up saving/restoring flags twice for read_lock_irqsave aso. */

--- linux/include/asm-mips/spinlock.h.orig
+++ linux/include/asm-mips/spinlock.h
@@ -15,6 +15,9 @@

typedef struct {
volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} spinlock_t;

#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
@@ -97,6 +100,9 @@ typedef struct {
/* and we need this storage for CPU and lock INDEX */
unsigned lockmeter_magic;
#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} rwlock_t;

#ifdef CONFIG_LOCKMETER
@@ -177,6 +183,8 @@ static inline void _raw_write_unlock(rwl
: "memory");
}

+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
static inline int _raw_write_trylock(rwlock_t *rw)
{
unsigned int tmp;
--- linux/include/asm-ppc/spinlock.h.orig
+++ linux/include/asm-ppc/spinlock.h
@@ -13,6 +13,9 @@ typedef struct {
volatile unsigned long owner_pc;
volatile unsigned long owner_cpu;
#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} spinlock_t;

#ifdef __KERNEL__
@@ -83,6 +86,9 @@ typedef struct {
#ifdef CONFIG_DEBUG_SPINLOCK
volatile unsigned long owner_pc;
#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} rwlock_t;

#ifdef CONFIG_DEBUG_SPINLOCK
@@ -192,5 +198,7 @@ extern int _raw_write_trylock(rwlock_t *

#endif

+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
#endif /* __ASM_SPINLOCK_H */
#endif /* __KERNEL__ */
--- linux/include/asm-x86_64/spinlock.h.orig
+++ linux/include/asm-x86_64/spinlock.h
@@ -18,6 +18,9 @@ typedef struct {
#ifdef CONFIG_DEBUG_SPINLOCK
unsigned magic;
#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} spinlock_t;

#define SPINLOCK_MAGIC 0xdead4ead
@@ -174,6 +177,9 @@ typedef struct {
#ifdef CONFIG_DEBUG_SPINLOCK
unsigned magic;
#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} rwlock_t;

#define RWLOCK_MAGIC 0xdeaf1eed
@@ -228,6 +234,16 @@ static inline void _raw_write_lock(rwloc
#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
#define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")

+static inline int _raw_read_trylock(rwlock_t *lock)
+{
+ atomic_t *count = (atomic_t *)lock;
+ atomic_dec(count);
+ if (atomic_read(count) >= 0)
+ return 1;
+ atomic_inc(count);
+ return 0;
+}
+
static inline int _raw_write_trylock(rwlock_t *lock)
{
atomic_t *count = (atomic_t *)lock;
--- linux/include/asm-ia64/spinlock.h.orig
+++ linux/include/asm-ia64/spinlock.h
@@ -19,6 +19,9 @@

typedef struct {
volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} spinlock_t;

#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
@@ -121,6 +124,9 @@ typedef struct {
/* and we need this storage for CPU and lock INDEX */
unsigned lockmeter_magic;
#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} rwlock_t;

#ifdef CONFIG_LOCKMETER
@@ -242,6 +248,8 @@ do { \

#endif /* !ASM_SUPPORTED */

+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
#define _raw_write_unlock(x) \
({ \
smp_mb__before_clear_bit(); /* need barrier before releasing lock... */ \
--- linux/include/linux/spinlock.h.orig
+++ linux/include/linux/spinlock.h
@@ -47,6 +47,7 @@
#define __lockfunc fastcall __attribute__((section(".spinlock.text")))

int __lockfunc _spin_trylock(spinlock_t *lock);
+int __lockfunc _read_trylock(rwlock_t *lock);
int __lockfunc _write_trylock(rwlock_t *lock);
void __lockfunc _spin_lock(spinlock_t *lock);
void __lockfunc _write_lock(rwlock_t *lock);
@@ -74,7 +75,9 @@ void __lockfunc _write_unlock_irqrestore
void __lockfunc _write_unlock_irq(rwlock_t *lock);
void __lockfunc _write_unlock_bh(rwlock_t *lock);
int __lockfunc _spin_trylock_bh(spinlock_t *lock);
+int __lockfunc generic_raw_read_trylock(rwlock_t *lock);
int in_lock_functions(unsigned long addr);
+
#else

#define in_lock_functions(ADDR) 0
@@ -225,11 +228,15 @@ typedef struct {
#define _raw_read_unlock(lock) do { (void)(lock); } while(0)
#define _raw_write_lock(lock) do { (void)(lock); } while(0)
#define _raw_write_unlock(lock) do { (void)(lock); } while(0)
+#define _raw_read_trylock(lock) ({ (void)(lock); (1); })
#define _raw_write_trylock(lock) ({ (void)(lock); (1); })

#define _spin_trylock(lock) ({preempt_disable(); _raw_spin_trylock(lock) ? \
1 : ({preempt_enable(); 0;});})

+#define _read_trylock(lock) ({preempt_disable();_raw_read_trylock(lock) ? \
+ 1 : ({preempt_enable(); 0;});})
+
#define _write_trylock(lock) ({preempt_disable(); _raw_write_trylock(lock) ? \
1 : ({preempt_enable(); 0;});})

@@ -424,16 +431,12 @@ extern int _metered_write_trylock(rwloc
* methods are defined as nops in the case they are not required.
*/
#define spin_trylock(lock) _spin_trylock(lock)
+#define read_trylock(lock) _read_trylock(lock)
#define write_trylock(lock) _write_trylock(lock)

-/* Where's read_trylock? */
-
#define spin_lock(lock) _spin_lock(lock)
#define write_lock(lock) _write_lock(lock)
#define read_lock(lock) _read_lock(lock)
-#define spin_unlock(lock) _spin_unlock(lock)
-#define write_unlock(lock) _write_unlock(lock)
-#define read_unlock(lock) _read_unlock(lock)

#ifdef CONFIG_SMP
#define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock)
@@ -453,6 +456,11 @@ extern int _metered_write_trylock(rwloc

#define write_lock_irq(lock) _write_lock_irq(lock)
#define write_lock_bh(lock) _write_lock_bh(lock)
+
+#define spin_unlock(lock) _spin_unlock(lock)
+#define write_unlock(lock) _write_unlock(lock)
+#define read_unlock(lock) _read_unlock(lock)
+
#define spin_unlock_irqrestore(lock, flags) _spin_unlock_irqrestore(lock, flags)
#define spin_unlock_irq(lock) _spin_unlock_irq(lock)
#define spin_unlock_bh(lock) _spin_unlock_bh(lock)
@@ -475,6 +483,7 @@ extern void _metered_read_lock (rwloc
extern void _metered_read_unlock (rwlock_t *lock);
extern void _metered_write_lock (rwlock_t *lock);
extern void _metered_write_unlock (rwlock_t *lock);
+extern int _metered_read_trylock (rwlock_t *lock);
extern int _metered_write_trylock(rwlock_t *lock);
#endif

@@ -637,8 +646,11 @@ static inline void bit_spin_lock(int bit
preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
while (test_and_set_bit(bitnum, addr)) {
- while (test_bit(bitnum, addr))
+ while (test_bit(bitnum, addr)) {
+ preempt_enable();
cpu_relax();
+ preempt_disable();
+ }
}
#endif
}
--- linux/include/linux/sched.h.orig
+++ linux/include/linux/sched.h
@@ -1132,23 +1132,7 @@ static inline void cond_resched(void)
__cond_resched();
}

-/*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
- * call schedule, and on return reacquire the lock.
- *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
- * operations here to prevent schedule() from being called twice (once via
- * spin_unlock(), once by hand).
- */
-static inline void cond_resched_lock(spinlock_t * lock)
-{
- if (need_resched()) {
- _raw_spin_unlock(lock);
- preempt_enable_no_resched();
- __cond_resched();
- spin_lock(lock);
- }
-}
+extern int cond_resched_lock(spinlock_t * lock);

/* Reevaluate whether the task has signals pending delivery.
This is required every time the blocked sigset_t changes.
--- linux/include/asm-arm/spinlock.h.orig
+++ linux/include/asm-arm/spinlock.h
@@ -17,6 +17,9 @@
*/
typedef struct {
volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} spinlock_t;

#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
@@ -70,6 +73,9 @@ static inline void _raw_spin_unlock(spin
*/
typedef struct {
volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} rwlock_t;

#define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
@@ -143,6 +149,8 @@ static inline void _raw_read_unlock(rwlo
: "cc", "memory");
}

+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
static inline int _raw_write_trylock(rwlock_t *rw)
{
unsigned long tmp;
--- linux/include/asm-sparc/spinlock.h.orig
+++ linux/include/asm-sparc/spinlock.h
@@ -16,6 +16,9 @@
struct _spinlock_debug {
unsigned char lock;
unsigned long owner_pc;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
};
typedef struct _spinlock_debug spinlock_t;

@@ -36,6 +39,9 @@ struct _rwlock_debug {
volatile unsigned int lock;
unsigned long owner_pc;
unsigned long reader_pc[NR_CPUS];
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
};
typedef struct _rwlock_debug rwlock_t;

@@ -79,8 +85,14 @@ do { unsigned long flags; \

#else /* !CONFIG_DEBUG_SPINLOCK */

-typedef unsigned char spinlock_t;
-#define SPIN_LOCK_UNLOCKED 0
+typedef struct {
+ unsigned char lock;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
+} spinlock_t;
+
+#define SPIN_LOCK_UNLOCKED { 0, }

#define spin_lock_init(lock) (*((unsigned char *)(lock)) = 0)
#define spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0)
@@ -137,7 +149,12 @@ extern __inline__ void _raw_spin_unlock(
* XXX This might create some problems with my dual spinlock
* XXX scheme, deadlocks etc. -DaveM
*/
-typedef struct { volatile unsigned int lock; } rwlock_t;
+typedef struct {
+ volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
+} rwlock_t;

#define RW_LOCK_UNLOCKED (rwlock_t) { 0 }

--- linux/include/asm-alpha/spinlock.h.orig
+++ linux/include/asm-alpha/spinlock.h
@@ -23,6 +23,9 @@ typedef struct {
struct task_struct * task;
const char *base_file;
#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} spinlock_t;

#ifdef CONFIG_DEBUG_SPINLOCK
@@ -96,6 +99,9 @@ static inline int _raw_spin_trylock(spin

typedef struct {
volatile int write_lock:1, read_counter:31;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} /*__attribute__((aligned(32)))*/ rwlock_t;

#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 }
--- linux/include/asm-i386/spinlock.h.orig
+++ linux/include/asm-i386/spinlock.h
@@ -19,6 +19,9 @@ typedef struct {
#ifdef CONFIG_DEBUG_SPINLOCK
unsigned magic;
#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} spinlock_t;

#define SPINLOCK_MAGIC 0xdead4ead
@@ -171,6 +174,9 @@ typedef struct {
#ifdef CONFIG_DEBUG_SPINLOCK
unsigned magic;
#endif
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} rwlock_t;

#define RWLOCK_MAGIC 0xdeaf1eed
@@ -225,27 +231,24 @@ static inline void _raw_write_lock(rwloc
#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
#define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")

-static inline int _raw_write_trylock(rwlock_t *lock)
+static inline int _raw_read_trylock(rwlock_t *lock)
{
atomic_t *count = (atomic_t *)lock;
- if (atomic_sub_and_test(RW_LOCK_BIAS, count))
+ atomic_dec(count);
+ if (atomic_read(count) >= 0)
return 1;
- atomic_add(RW_LOCK_BIAS, count);
+ atomic_inc(count);
return 0;
}

-#ifdef CONFIG_LOCKMETER
-static inline int _raw_read_trylock(rwlock_t *lock)
+static inline int _raw_write_trylock(rwlock_t *lock)
{
-/* FIXME -- replace with assembler */
atomic_t *count = (atomic_t *)lock;
- atomic_dec(count);
- if (count->counter > 0)
+ if (atomic_sub_and_test(RW_LOCK_BIAS, count))
return 1;
- atomic_inc(count);
+ atomic_add(RW_LOCK_BIAS, count);
return 0;
}
-#endif

#if defined(CONFIG_LOCKMETER) && defined(CONFIG_HAVE_DEC_LOCK)
extern void _metered_spin_lock (spinlock_t *lock);
--- linux/include/asm-s390/spinlock.h.orig
+++ linux/include/asm-s390/spinlock.h
@@ -36,6 +36,9 @@

typedef struct {
volatile unsigned int lock;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} __attribute__ ((aligned (4))) spinlock_t;

#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
@@ -105,6 +108,9 @@ extern inline void _raw_spin_unlock(spin
typedef struct {
volatile unsigned long lock;
volatile unsigned long owner_pc;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} rwlock_t;

#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 }
@@ -211,6 +217,8 @@ typedef struct {
"m" ((rw)->lock) : "2", "3", "cc", "memory" )
#endif /* __s390x__ */

+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
extern inline int _raw_write_trylock(rwlock_t *rw)
{
unsigned long result, reg;
--- linux/include/asm-sparc64/spinlock.h.orig
+++ linux/include/asm-sparc64/spinlock.h
@@ -304,6 +304,8 @@ do { unsigned long flags; \

#endif /* CONFIG_DEBUG_SPINLOCK */

+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
#endif /* !(__ASSEMBLY__) */

#endif /* !(__SPARC64_SPINLOCK_H) */
--- linux/include/asm-sh/spinlock.h.orig
+++ linux/include/asm-sh/spinlock.h
@@ -17,6 +17,9 @@
*/
typedef struct {
volatile unsigned long lock;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} spinlock_t;

#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
@@ -68,6 +71,9 @@ static inline void _raw_spin_unlock(spin
typedef struct {
spinlock_t lock;
atomic_t counter;
+#ifdef CONFIG_PREEMPT
+ unsigned int break_lock;
+#endif
} rwlock_t;

#define RW_LOCK_BIAS 0x01000000
@@ -105,6 +111,8 @@ static inline void _raw_write_unlock(rwl
_raw_spin_unlock(&rw->lock);
}

+#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
+
static inline int _raw_write_trylock(rwlock_t *rw)
{
if (atomic_sub_and_test(RW_LOCK_BIAS, &rw->counter))
--- linux/kernel/spinlock.c.orig
+++ linux/kernel/spinlock.c
@@ -2,6 +2,8 @@
* Copyright (2004) Linus Torvalds
*
* Author: Zwane Mwaikambo <zwane@xxxxxxxxxxx>
+ *
+ * Copyright (2004) Ingo Molnar
*/

#include <linux/config.h>
@@ -11,6 +13,17 @@
#include <linux/interrupt.h>
#include <linux/module.h>

+/*
+ * Generic declaration of the raw read_trylock() function,
+ * architectures are supposed to optimize this:
+ */
+int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+{
+ _raw_read_lock(lock);
+ return 1;
+}
+EXPORT_SYMBOL(generic_raw_read_trylock);
+
int __lockfunc _spin_trylock(spinlock_t *lock)
{
preempt_disable();
@@ -22,86 +35,29 @@ int __lockfunc _spin_trylock(spinlock_t
}
EXPORT_SYMBOL(_spin_trylock);

-int __lockfunc _write_trylock(rwlock_t *lock)
+int __lockfunc _read_trylock(rwlock_t *lock)
{
preempt_disable();
- if (_raw_write_trylock(lock))
+ if (_raw_read_trylock(lock))
return 1;
-
+
preempt_enable();
return 0;
}
-EXPORT_SYMBOL(_write_trylock);
+EXPORT_SYMBOL(_read_trylock);

-#ifdef CONFIG_PREEMPT
-/*
- * This could be a long-held lock. If another CPU holds it for a long time,
- * and that CPU is not asked to reschedule then *this* CPU will spin on the
- * lock for a long time, even if *this* CPU is asked to reschedule.
- *
- * So what we do here, in the slow (contended) path is to spin on the lock by
- * hand while permitting preemption.
- *
- * Called inside preempt_disable().
- */
-static inline void __preempt_spin_lock(spinlock_t *lock)
-{
- if (preempt_count() > 1) {
- _raw_spin_lock(lock);
- return;
- }
-
- do {
- preempt_enable();
- while (spin_is_locked(lock))
- cpu_relax();
- preempt_disable();
- } while (!_raw_spin_trylock(lock));
-}
-
-void __lockfunc _spin_lock(spinlock_t *lock)
+int __lockfunc _write_trylock(rwlock_t *lock)
{
preempt_disable();
- if (unlikely(!_raw_spin_trylock(lock)))
- __preempt_spin_lock(lock);
-}
-
-static inline void __preempt_write_lock(rwlock_t *lock)
-{
- if (preempt_count() > 1) {
- _raw_write_lock(lock);
- return;
- }
-
- do {
- preempt_enable();
- while (rwlock_is_locked(lock))
- cpu_relax();
- preempt_disable();
- } while (!_raw_write_trylock(lock));
-}
+ if (_raw_write_trylock(lock))
+ return 1;

-void __lockfunc _write_lock(rwlock_t *lock)
-{
- preempt_disable();
- if (unlikely(!_raw_write_trylock(lock)))
- __preempt_write_lock(lock);
-}
-#else
-void __lockfunc _spin_lock(spinlock_t *lock)
-{
- preempt_disable();
- _raw_spin_lock(lock);
+ preempt_enable();
+ return 0;
}
+EXPORT_SYMBOL(_write_trylock);

-void __lockfunc _write_lock(rwlock_t *lock)
-{
- preempt_disable();
- _raw_write_lock(lock);
-}
-#endif
-EXPORT_SYMBOL(_spin_lock);
-EXPORT_SYMBOL(_write_lock);
+#ifndef CONFIG_PREEMPT

void __lockfunc _read_lock(rwlock_t *lock)
{
@@ -110,27 +66,6 @@ void __lockfunc _read_lock(rwlock_t *loc
}
EXPORT_SYMBOL(_read_lock);

-void __lockfunc _spin_unlock(spinlock_t *lock)
-{
- _raw_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(_spin_unlock);
-
-void __lockfunc _write_unlock(rwlock_t *lock)
-{
- _raw_write_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(_write_unlock);
-
-void __lockfunc _read_unlock(rwlock_t *lock)
-{
- _raw_read_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(_read_unlock);
-
unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
{
unsigned long flags;
@@ -212,6 +147,130 @@ void __lockfunc _write_lock_bh(rwlock_t
}
EXPORT_SYMBOL(_write_lock_bh);

+void __lockfunc _spin_lock(spinlock_t *lock)
+{
+ preempt_disable();
+ _raw_spin_lock(lock);
+}
+
+EXPORT_SYMBOL(_spin_lock);
+
+void __lockfunc _write_lock(rwlock_t *lock)
+{
+ preempt_disable();
+ _raw_write_lock(lock);
+}
+
+EXPORT_SYMBOL(_write_lock);
+
+#else /* CONFIG_PREEMPT: */
+
+/*
+ * This could be a long-held lock. We both prepare to spin for a long
+ * time (making _this_ CPU preemptable if possible), and we also signal
+ * towards that other CPU that it should break the lock ASAP.
+ *
+ * (We do this in a function because inlining it would be excessive.)
+ */
+
+#define BUILD_LOCK_OPS(op, locktype) \
+void __lockfunc _##op##_lock(locktype *lock) \
+{ \
+ preempt_disable(); \
+ for (;;) { \
+ if (likely(_raw_##op##_trylock(lock))) \
+ break; \
+ preempt_enable(); \
+ if (!(lock)->break_lock) \
+ (lock)->break_lock = 1; \
+ cpu_relax(); \
+ preempt_disable(); \
+ } \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock); \
+ \
+unsigned long __lockfunc _##op##_lock_irqsave(locktype *lock) \
+{ \
+ unsigned long flags; \
+ \
+ preempt_disable(); \
+ for (;;) { \
+ local_irq_save(flags); \
+ if (likely(_raw_##op##_trylock(lock))) \
+ break; \
+ local_irq_restore(flags); \
+ \
+ preempt_enable(); \
+ if (!(lock)->break_lock) \
+ (lock)->break_lock = 1; \
+ cpu_relax(); \
+ preempt_disable(); \
+ } \
+ return flags; \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_irqsave); \
+ \
+void __lockfunc _##op##_lock_irq(locktype *lock) \
+{ \
+ _##op##_lock_irqsave(lock); \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_irq); \
+ \
+void __lockfunc _##op##_lock_bh(locktype *lock) \
+{ \
+ unsigned long flags; \
+ \
+ /* */ \
+ /* Careful: we must exclude softirqs too, hence the */ \
+ /* irq-disabling. We use the generic preemption-aware */ \
+ /* function: */ \
+ /**/ \
+ flags = _##op##_lock_irqsave(lock); \
+ local_bh_disable(); \
+ local_irq_restore(flags); \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_bh)
+
+/*
+ * Build preemption-friendly versions of the following
+ * lock-spinning functions:
+ *
+ * _[spin|read|write]_lock()
+ * _[spin|read|write]_lock_irq()
+ * _[spin|read|write]_lock_irqsave()
+ * _[spin|read|write]_lock_bh()
+ */
+BUILD_LOCK_OPS(spin, spinlock_t);
+BUILD_LOCK_OPS(read, rwlock_t);
+BUILD_LOCK_OPS(write, rwlock_t);
+
+#endif /* CONFIG_PREEMPT */
+
+void __lockfunc _spin_unlock(spinlock_t *lock)
+{
+ _raw_spin_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock);
+
+void __lockfunc _write_unlock(rwlock_t *lock)
+{
+ _raw_write_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock);
+
+void __lockfunc _read_unlock(rwlock_t *lock)
+{
+ _raw_read_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock);
+
void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
_raw_spin_unlock(lock);
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -3215,6 +3215,37 @@ void __sched __cond_resched(void)

EXPORT_SYMBOL(__cond_resched);

+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int cond_resched_lock(spinlock_t * lock)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+ if (lock->break_lock) {
+ lock->break_lock = 0;
+ spin_unlock(lock);
+ cpu_relax();
+ spin_lock(lock);
+ }
+#endif
+ if (need_resched()) {
+ _raw_spin_unlock(lock);
+ preempt_enable_no_resched();
+ set_current_state(TASK_RUNNING);
+ schedule();
+ spin_lock(lock);
+ return 1;
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL(cond_resched_lock);
+
/**
* yield - yield the current processor to other threads.
*