x86 rwsem in 2.4.4pre[234] are still buggy [was Re: rwsem benchmarks [Re: generic rwsem [Re: Alpha "process table hang"]]]

From: Andrea Arcangeli (andrea@suse.de)
Date: Fri Apr 20 2001 - 12:17:10 EST


On Fri, Apr 20, 2001 at 03:42:15AM +0200, Andrea Arcangeli wrote:
> I'm uncertain if I should drop the list_empty() check from the fast path and if

While dropping the list_empty check to speed up the fast path I faced the same
complexity of the 2.4.4pre4 lib/rwsem.c and so before reinventing the wheel I
read how the problem was solved in 2.4.4pre4.

I couldn't get convinced that all the subtle possible cases was handled
correctly so I tried to stress them. To do that I changed the 2.4.4pre4 rwsem
(compiled for PII SMP) as below patch shows in order to reproduce more easily
those corner cases (note my changes cannot be the cause of the deadlock, as far
I can tell they can _only_ change the timings, and the timings have to be
flexible because they can be randomly influenced by interrupt handlers machine
checks and whatever else in whatever hardware):

--- rwsemref-1/lib/rwsem.c.~1~ Thu Apr 19 19:59:46 2001
+++ rwsemref-1/lib/rwsem.c Fri Apr 20 17:13:05 2001
@@ -6,6 +6,7 @@
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/delay.h>
 
 /*
  * wait for the read lock to be granted
@@ -18,6 +19,7 @@
         DECLARE_WAITQUEUE(wait,tsk);
         signed long count;
 
+ mdelay(1);
         rwsemtrace(sem,"Entering rwsem_down_read_failed");
 
         /* this waitqueue context flag will be cleared when we are granted the lock */
@@ -59,6 +61,7 @@
         DECLARE_WAITQUEUE(wait,tsk);
         signed long count;
 
+ mdelay(1);
         rwsemtrace(sem,"Entering rwsem_down_write_failed");
 
         /* this waitqueue context flag will be cleared when we are granted the lock */
@@ -115,6 +118,7 @@
                 goto out;
         }
 
+ mdelay(1);
         /* try to grant a single write lock if there's a writer at the front of the queue
          * - note we leave the 'active part' of the count incremented by 1 and the waiting part
          * incremented by 0x00010000
@@ -122,6 +126,7 @@
         if (wake_up_ctx(&sem->wait,1,-RWSEM_WAITING_FOR_WRITE)==1)
                 goto out;
 
+ mdelay(10);
         /* grant an infinite number of read locks to the readers at the front of the queue
          * - note we increment the 'active part' of the count by the number of readers just woken,
          * less one for the activity decrement we've already done
@@ -129,6 +134,7 @@
         woken = wake_up_ctx(&sem->wait,65535,-RWSEM_WAITING_FOR_READ);
         if (woken<=0)
                 goto counter_correction;
+ mdelay(1);
 
         woken *= RWSEM_ACTIVE_BIAS-RWSEM_WAITING_BIAS;
         woken -= RWSEM_ACTIVE_BIAS;

and I succeeded in deadlocking the 2.4.4pre[2345] rwsemaphores with kernel
looping forever in 2.4.4pre4/lib/rwsem.c:rwsem_wake() but at least the breakage
is 100% reproducible if you use the above patch.

All you have to do to reproduce is to apply the above patch and then to run the
rwsem proggy that I posted to the list two days ago (the one that shows my
latest -5/6 revisions to be just a bit faster in the slow path) that does the
mmap and page faults from different threads in loop, in another console you run
`while :; do ps xa; done`, then while `ps` is near the end of its run you kill
the parent thread of the rwsem testcase so that all thread-children gets killed
as well, and at that point `ps` deadlocks hard in R state unkillable (it keeps
looping forever in the try_again: counter_correction: paths).

[..]
  285 ? SW 0:00 [rpciod]
  461 ? S 0:00 /usr/sbin/inetd
  463 tty1 S 0:00 /sbin/mingetty --noclear tty1
  464 tty2 S 0:00 /sbin/mingetty tty2
  465 tty3 S 0:00 /sbin/mingetty tty3
  466 tty4 S 0:00 /sbin/mingetty tty4
  467 tty5 S 0:00 /sbin/mingetty tty5
  468 tty6 S 0:00 /sbin/mingetty tty6
  469 ? S 0:00 in.rlogind
  470 pts/0 S 0:00 login -- andrea
  471 pts/0 S 0:00 -bash
  530 ? S 0:00 in.rlogind
  531 pts/1 S 0:00 login -- andrea
  532 pts/1 S 0:00 -bash
  666 pts/1 R 0:00 ps xa
  667 pts/0 S 0:00 ./rwsem
  668 pts/0 R 0:00 ./rwsem
  669 pts/0 D 0:00 ./rwsem
  670 pts/0 D 0:00 ./rwsem
  671 pts/0 D 0:00 ./rwsem
  672 pts/0 D 0:00 ./rwsem
  673 pts/0 D 0:00 ./rwsem
  674 pts/0 D 0:00 ./rwsem
  675 pts/0 D 0:00 ./rwsem
  676 pts/0 D 0:00 ./rwsem
  677 pts/0 R 0:00 ./rwsem
  678 pts/0 D 0:00 ./rwsem
  679 pts/0 D 0:00 ./rwsem
  680 pts/0 D 0:00 ./rwsem
  681 pts/0 D 0:00 ./rwsem
  682 pts/0 D 0:00 ./rwsem
  683 pts/0 D 0:00 ./rwsem
  684 pts/0 D 0:00 ./rwsem
  685 pts/0 D 0:00 ./rwsem
  686 pts/0 D 0:00 ./rwsem
  687 pts/0 D 0:00 ./rwsem
  688 pts/0 R 0:00 ./rwsem
  689 pts/0 D 0:00 ./rwsem
  690 pts/0 D 0:00 ./rwsem
  691 pts/0 D 0:00 ./rwsem
  692 pts/0 D 0:00 ./rwsem
  693 pts/0 D 0:00 ./rwsem
  694 pts/0 D 0:00 ./rwsem
  695 pts/0 D 0:00 ./rwsem
  696 pts/0 D 0:00 ./rwsem
  697 pts/0 D 0:00 ./rwsem
  698 pts/0 D 0:00 ./rwsem
  699 pts/0 D 0:00 ./rwsem
  700 pts/0 D 0:00 ./rwsem
  701 pts/0 D 0:00 ./rwsem
  702 pts/0 D 0:00 ./rwsem
  703 pts/0 D 0:00 ./rwsem
[ hang here ]

from another terminal:

andrea@laser:~ > kill 666
andrea@laser:~ > kill 666
andrea@laser:~ > kill 666
andrea@laser:~ > kill 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > ps 666
  PID TTY STAT TIME COMMAND
  666 pts/1 R 0:31 ps xa
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > kill -9 666
andrea@laser:~ > ps 666
  PID TTY STAT TIME COMMAND
  666 pts/1 R 53:38 ps xa
andrea@laser:~ >

At first I had a short look to check if this bug could be unrelated to the rwsem and
to be instead a race between the /proc filesystem and the do_exit path, but
after a short review of the interesting places the task_lock seems to serialize
correctly the accesses to the tsk->mm so the semaphore shouldn't go away under
rwsem_wake and furthmore it would be quite strange if the mm goes away under
the /proc filesystem that it only reproducibly fails in the same way in
rwsem_wake and I never get other system malfunctions for example while reading
the contents of the mm.

As further confirm this is a bug in the rwsemaphores I changed your benchmark
to be a little more aggressive:

--- rwsem-rw.c.orig Thu Apr 19 22:37:27 2001
+++ rwsem-rw.c Fri Apr 20 18:56:06 2001
@@ -71,9 +71,8 @@
 
 static inline void sched(void)
 {
-#ifdef SCHED
- schedule();
-#endif
+ if (current->need_resched)
+ schedule();
 }
 
 int reader(void *arg)
@@ -86,8 +85,8 @@
 
         while (atomic_read(&do_stuff)) {
                 dr();
- ur();
                 sched();
+ ur();
         }
 
         atomic_dec(&runners);
@@ -105,8 +104,8 @@
 
         while (atomic_read(&do_stuff)) {
                 dw();
- uw();
                 sched();
+ uw();
         }
 
         atomic_dec(&runners);
@@ -125,22 +124,64 @@
 
         init_rwsem(&rwsem_sem);
         atomic_set(&do_stuff, 1);
+ wmb();
 
 #ifdef DEBUG_RWSEM
         rwsem_to_monitor = &rwsem_sem;
 #endif
         init_timer(&timer);
         timer.function = stop_test;
- timer.expires = jiffies + 5 * HZ;
+ timer.expires = jiffies + 50 * HZ;
         add_timer(&timer);
 
- kernel_thread(writer, "WriteA", 0);
- kernel_thread(writer, "WriteB", 0);
-
- kernel_thread(reader, "ReadA", 0);
- kernel_thread(reader, "ReadB", 0);
- kernel_thread(reader, "ReadC", 0);
- kernel_thread(reader, "ReadD", 0);
+ kernel_thread(writer, "Write0", 0);
+ kernel_thread(writer, "Write1", 0);
+ kernel_thread(writer, "Write2", 0);
+ kernel_thread(writer, "Write3", 0);
+ kernel_thread(writer, "Write4", 0);
+ kernel_thread(writer, "Write5", 0);
+ kernel_thread(writer, "Write6", 0);
+ kernel_thread(writer, "Write7", 0);
+ kernel_thread(writer, "Write8", 0);
+ kernel_thread(writer, "Write9", 0);
+ kernel_thread(writer, "Write10", 0);
+ kernel_thread(writer, "Write11", 0);
+ kernel_thread(writer, "Write12", 0);
+ kernel_thread(writer, "Write13", 0);
+ kernel_thread(writer, "Write14", 0);
+ kernel_thread(writer, "Write15", 0);
+
+ kernel_thread(reader, "Read0", 0);
+ kernel_thread(reader, "Read1", 0);
+ kernel_thread(reader, "Read2", 0);
+ kernel_thread(reader, "Read3", 0);
+ kernel_thread(reader, "Read4", 0);
+ kernel_thread(reader, "Read5", 0);
+ kernel_thread(reader, "Read6", 0);
+ kernel_thread(reader, "Read7", 0);
+ kernel_thread(reader, "Read8", 0);
+ kernel_thread(reader, "Read9", 0);
+ kernel_thread(reader, "Read10", 0);
+ kernel_thread(reader, "Read11", 0);
+ kernel_thread(reader, "Read12", 0);
+ kernel_thread(reader, "Read13", 0);
+ kernel_thread(reader, "Read14", 0);
+ kernel_thread(reader, "Read15", 0);
+ kernel_thread(reader, "Read16", 0);
+ kernel_thread(reader, "Read17", 0);
+ kernel_thread(reader, "Read18", 0);
+ kernel_thread(reader, "Read19", 0);
+ kernel_thread(reader, "Read20", 0);
+ kernel_thread(reader, "Read21", 0);
+ kernel_thread(reader, "Read22", 0);
+ kernel_thread(reader, "Read23", 0);
+ kernel_thread(reader, "Read24", 0);
+ kernel_thread(reader, "Read25", 0);
+ kernel_thread(reader, "Read26", 0);
+ kernel_thread(reader, "Read27", 0);
+ kernel_thread(reader, "Read28", 0);
+ kernel_thread(reader, "Read29", 0);
+ kernel_thread(reader, "Read30", 0);
 
         return 0;
 }
@@ -155,6 +196,8 @@
                 schedule();
         printk("reads taken: %d\n", atomic_read(&reads_taken));
         printk("writes taken: %d\n", atomic_read(&writes_taken));
+ printk("readers: %d\n", atomic_read(&readers));
+ printk("writers: %d\n", atomic_read(&writers));
 #ifdef DEBUG_RWSEM
         rwsem_to_monitor = 0;
 #endif

and now also the benchmark deadlocked:

andrea@laser:~ > ps 669
  PID TTY STAT TIME COMMAND
  669 pts/0 R 6:25 [Read19]
andrea@laser:~ > su
root@laser:/home/andrea > rmmod rwsem-rw
[ hang here ]

Until we fix that bug I will keep running my rwsem-generic-6 patch to be 100%
sure to sit on rock solid rwsems.

BTW, this is the fix against pre4 for the 386+SMP rwsemaphores bug that I found
yesterday:

--- rwsemref-1/include/asm-i386/rwsem-spin.h.~1~ Fri Apr 20 05:03:59 2001
+++ rwsemref-1/include/asm-i386/rwsem-spin.h Fri Apr 20 19:05:52 2001
@@ -93,9 +93,9 @@
         __asm__ __volatile__(
                 "# beginning down_read\n\t"
 #ifdef CONFIG_SMP
+ "1:\n\t"
 LOCK_PREFIX " decb "RWSEM_SPINLOCK_OFFSET_STR"(%%eax)\n\t" /* try to grab the spinlock */
                 " js 3f\n" /* jump if failed */
- "1:\n\t"
 #endif
                 " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */
 #ifdef CONFIG_SMP
@@ -132,9 +132,9 @@
         __asm__ __volatile__(
                 "# beginning down_write\n\t"
 #ifdef CONFIG_SMP
+ "1:\n\t"
 LOCK_PREFIX " decb "RWSEM_SPINLOCK_OFFSET_STR"(%%eax)\n\t" /* try to grab the spinlock */
                 " js 3f\n" /* jump if failed */
- "1:\n\t"
 #endif
                 " xchg %0,(%%eax)\n\t" /* retrieve the old value */
                 " add %0,(%%eax)\n\t" /* add 0xffff0001, result in memory */
@@ -173,9 +173,9 @@
         __asm__ __volatile__(
                 "# beginning __up_read\n\t"
 #ifdef CONFIG_SMP
+ "1:\n\t"
 LOCK_PREFIX " decb "RWSEM_SPINLOCK_OFFSET_STR"(%%eax)\n\t" /* try to grab the spinlock */
                 " js 3f\n" /* jump if failed */
- "1:\n\t"
 #endif
                 " xchg %0,(%%eax)\n\t" /* retrieve the old value */
                 " addl %0,(%%eax)\n\t" /* subtract 1, result in memory */
@@ -213,9 +213,9 @@
         __asm__ __volatile__(
                 "# beginning __up_write\n\t"
 #ifdef CONFIG_SMP
+ "1:\n\t"
 LOCK_PREFIX " decb "RWSEM_SPINLOCK_OFFSET_STR"(%%eax)\n\t" /* try to grab the spinlock */
                 " js 3f\n" /* jump if failed */
- "1:\n\t"
 #endif
                 " addl %3,(%%eax)\n\t" /* adds 0x00010001 */
 #ifdef CONFIG_SMP
@@ -251,9 +251,9 @@
         __asm__ __volatile__(
                 "# beginning rwsem_atomic_update\n\t"
 #ifdef CONFIG_SMP
+ "1:\n\t"
 LOCK_PREFIX " decb "RWSEM_SPINLOCK_OFFSET_STR"(%1)\n\t" /* try to grab the spinlock */
                 " js 3f\n" /* jump if failed */
- "1:\n\t"
 #endif
                 " xchgl %0,(%1)\n\t" /* retrieve the old value */
                 " addl %0,(%1)\n\t" /* add 0xffff0001, result in memory */
@@ -287,9 +287,9 @@
         __asm__ __volatile__(
                 "# beginning rwsem_cmpxchgw\n\t"
 #ifdef CONFIG_SMP
+ "1:\n\t"
 LOCK_PREFIX " decb "RWSEM_SPINLOCK_OFFSET_STR"(%3)\n\t" /* try to grab the spinlock */
                 " js 3f\n" /* jump if failed */
- "1:\n\t"
 #endif
                 " cmpw %w1,(%3)\n\t"
                 " jne 4f\n\t" /* jump if old doesn't match sem->count LSW */

Andrea
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Mon Apr 23 2001 - 21:00:37 EST