[3/3] wake-one PG_locked/BH_Lock semantics

From: William Lee Irwin III
Date: Wed May 05 2004 - 01:17:42 EST


On Tue, May 04, 2004 at 11:11:21PM -0700, William Lee Irwin III wrote:
> Now, make bh's use the new wakeup primitive also. This has the bugfix
> vs. the prior version that autoremoved waitqueue wakeup functions are
> made to match autoremove API usage in __wait_event_filtered().

This is still grossly inefficient in that it's only necessary to wake
one waiter when that waiter promises to eventually issue another wakeup,
e.g. when it releases the bit on the page. So here, wake-one semantics
are implemented for those cases, using the WQ_FLAG_EXCLUSIVE flag in
the waitqueue and the surrounding APIs, e.g. prepare_to_wait_exclusive().
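
The reason wake-one is safe here is that the bit behaves like a lock:
whoever is woken and takes the bit is itself obligated to issue the next
wakeup when it drops the bit, so no waiter is ever stranded. A minimal
userspace analogue of that hand-off (my own illustration using pthreads,
not part of the patch):

	/* Userspace sketch of a wake-one "lock bit" hand-off (illustration only). */
	#include <pthread.h>

	static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  c = PTHREAD_COND_INITIALIZER;
	static int bit_locked;			/* stands in for PG_locked/BH_Lock */

	static void lock_bit(void)
	{
		pthread_mutex_lock(&m);
		while (bit_locked)
			pthread_cond_wait(&c, &m);	/* sleep until a release wakes us */
		bit_locked = 1;
		pthread_mutex_unlock(&m);
	}

	static void unlock_bit(void)
	{
		pthread_mutex_lock(&m);
		bit_locked = 0;
		/*
		 * Wake-one: the single waiter we wake will eventually call
		 * unlock_bit() itself and issue the next wakeup, so signalling
		 * one waiter per release keeps the whole queue moving.
		 */
		pthread_cond_signal(&c);
		pthread_mutex_unlock(&m);
	}

	int main(void)
	{
		lock_bit();		/* trivial smoke test */
		unlock_bit();
		return 0;
	}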

I took the small liberty of adding list_for_each_entry_reverse_safe()
to list.h, as it is generally useful and gives the opportunity for
fair FIFO wakeups wrapped up in a neat API.
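
For the curious, here is a self-contained userspace harness (my own test
scaffolding, purely illustrative; only the iterator macro itself is the
one added below) showing that the new macro walks a list tail-first and
tolerates deletion of the current entry:

	/* Minimal stand-ins for the list primitives, enough to exercise the macro. */
	#include <stddef.h>
	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };

	#define LIST_HEAD_INIT(name)	{ &(name), &(name) }
	#define list_entry(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	static void list_add_tail(struct list_head *new, struct list_head *head)
	{
		new->prev = head->prev;
		new->next = head;
		head->prev->next = new;
		head->prev = new;
	}

	static void list_del(struct list_head *entry)
	{
		entry->prev->next = entry->next;
		entry->next->prev = entry->prev;
	}

	/* The iterator exactly as added to include/linux/list.h below. */
	#define list_for_each_entry_reverse_safe(pos, n, head, member)		\
		for (pos = list_entry((head)->prev, typeof(*pos), member),	\
			n = list_entry(pos->member.prev, typeof(*pos), member);	\
		     &pos->member != (head);					\
		     pos = n, n = list_entry(n->member.prev, typeof(*n), member))

	struct item {
		int val;
		struct list_head list;
	};

	int main(void)
	{
		struct list_head head = LIST_HEAD_INIT(head);
		struct item items[4], *pos, *n;
		int i;

		for (i = 0; i < 4; i++) {
			items[i].val = i;
			list_add_tail(&items[i].list, &head);
		}
		/* Prints 3 2 1 0; deleting pos is safe because n was saved first. */
		list_for_each_entry_reverse_safe(pos, n, &head, list) {
			printf("%d ", pos->val);
			list_del(&pos->list);
		}
		printf("\n");
		return 0;
	}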


-- wli
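
A side note on the key encoding in the filemap.c portion of the patch:
page_waitqueue() now hashes on the page pointer plus the bit number, and
page_key() builds the wakeup key by packing the page's pfn into the low
bits and the bit number into the top 5 (32-bit) or 6 (64-bit) bits of an
unsigned long, so waiters on different flag bits of the same page carry
distinct keys. A standalone illustration of the packing (mine, with
made-up bit numbers, not the patch itself):

	#include <stdio.h>

	#define BITS_PER_LONG	(8 * (int)sizeof(long))
	#define PAGE_KEY_SHIFT	(BITS_PER_LONG - (BITS_PER_LONG == 32 ? 5 : 6))

	/* Same packing as page_key() below, with a bare pfn instead of a struct page. */
	static unsigned long key_of(unsigned long pfn, unsigned long bit)
	{
		return pfn | bit << PAGE_KEY_SHIFT;
	}

	int main(void)
	{
		/* Two different bits on the same pfn produce two different keys. */
		printf("%#lx\n", key_of(0x1234, 0));
		printf("%#lx\n", key_of(0x1234, 12));
		return 0;
	}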

Index: wake-2.6.6-rc3-mm1/fs/buffer.c
===================================================================
--- wake-2.6.6-rc3-mm1.orig/fs/buffer.c 2004-05-04 13:16:16.000000000 -0700
+++ wake-2.6.6-rc3-mm1/fs/buffer.c 2004-05-04 15:19:49.000000000 -0700
@@ -78,6 +78,30 @@
 }
 EXPORT_SYMBOL(wake_up_buffer);
 
+static void sync_buffer(struct buffer_head *bh)
+{
+	struct block_device *bd;
+	smp_mb();
+	bd = bh->b_bdev;
+	if (bd)
+		blk_run_address_space(bd->bd_inode->i_mapping);
+}
+
+void fastcall __lock_buffer(struct buffer_head *bh)
+{
+	wait_queue_head_t *wqh = bh_waitq_head(bh);
+	DEFINE_FILTERED_WAIT(wait, bh);
+	do {
+		prepare_to_wait_exclusive(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
+		if (buffer_locked(bh)) {
+			sync_buffer(bh);
+			io_schedule();
+		}
+	} while (test_set_buffer_locked(bh));
+	finish_wait(wqh, &wait.wait);
+}
+EXPORT_SYMBOL(__lock_buffer);
+
 void fastcall unlock_buffer(struct buffer_head *bh)
 {
 	clear_buffer_locked(bh);
@@ -98,11 +122,7 @@
 	do {
 		prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
 		if (buffer_locked(bh)) {
-			struct block_device *bd;
-			smp_mb();
-			bd = bh->b_bdev;
-			if (bd)
-				blk_run_address_space(bd->bd_inode->i_mapping);
+			sync_buffer(bh);
 			io_schedule();
 		}
 	} while (buffer_locked(bh));
Index: wake-2.6.6-rc3-mm1/include/linux/list.h
===================================================================
--- wake-2.6.6-rc3-mm1.orig/include/linux/list.h 2004-04-30 15:06:48.000000000 -0700
+++ wake-2.6.6-rc3-mm1/include/linux/list.h 2004-05-04 13:16:53.000000000 -0700
@@ -413,6 +413,19 @@
 		pos = n, n = list_entry(n->member.next, typeof(*n), member))
 
 /**
+ * list_for_each_entry_reverse_safe - iterate backwards over list of given type safe against removal of list entry
+ * @pos:	the type * to use as a loop counter.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse_safe(pos, n, head, member)			\
+	for (pos = list_entry((head)->prev, typeof(*pos), member),		\
+		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
+/**
  * list_for_each_rcu - iterate over an rcu-protected list
  * @pos:	the &struct list_head to use as a loop counter.
  * @head:	the head for your list.
Index: wake-2.6.6-rc3-mm1/include/linux/buffer_head.h
===================================================================
--- wake-2.6.6-rc3-mm1.orig/include/linux/buffer_head.h 2004-04-30 15:05:52.000000000 -0700
+++ wake-2.6.6-rc3-mm1/include/linux/buffer_head.h 2004-05-04 15:13:37.000000000 -0700
@@ -170,6 +170,7 @@
 struct buffer_head *alloc_buffer_head(int gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void FASTCALL(unlock_buffer(struct buffer_head *bh));
+void FASTCALL(__lock_buffer(struct buffer_head *bh));
 void ll_rw_block(int, int, struct buffer_head * bh[]);
 void sync_dirty_buffer(struct buffer_head *bh);
 void submit_bh(int, struct buffer_head *);
@@ -279,8 +280,8 @@
 
 static inline void lock_buffer(struct buffer_head *bh)
 {
-	while (test_set_buffer_locked(bh))
-		__wait_on_buffer(bh);
+	if (test_set_buffer_locked(bh))
+		__lock_buffer(bh);
 }
 
 #endif /* _LINUX_BUFFER_HEAD_H */
Index: wake-2.6.6-rc3-mm1/kernel/sched.c
===================================================================
--- wake-2.6.6-rc3-mm1.orig/kernel/sched.c 2004-05-04 13:16:00.000000000 -0700
+++ wake-2.6.6-rc3-mm1/kernel/sched.c 2004-05-04 18:27:39.000000000 -0700
@@ -2524,9 +2524,14 @@
 	unsigned int mode = TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE;
 	struct filtered_wait_queue *wait, *save;
 	spin_lock_irqsave(&q->lock, flags);
-	list_for_each_entry_safe(wait, save, &q->task_list, wait.task_list) {
-		if (wait->key == key)
-			wait->wait.func(&wait->wait, mode, 0);
+	list_for_each_entry_reverse_safe(wait, save, &q->task_list, wait.task_list) {
+		int exclusive = wait->wait.flags & WQ_FLAG_EXCLUSIVE;
+		if (wait->key != key)
+			continue;
+		else if (!wait->wait.func(&wait->wait, mode, 0))
+			continue;
+		else if (exclusive)
+			break;
 	}
 	spin_unlock_irqrestore(&q->lock, flags);
 }
Index: wake-2.6.6-rc3-mm1/mm/filemap.c
===================================================================
--- wake-2.6.6-rc3-mm1.orig/mm/filemap.c 2004-05-04 13:16:00.000000000 -0700
+++ wake-2.6.6-rc3-mm1/mm/filemap.c 2004-05-04 15:24:01.000000000 -0700
@@ -297,17 +297,23 @@
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-static wait_queue_head_t *page_waitqueue(struct page *page)
+static wait_queue_head_t *page_waitqueue(struct page *page, int bit)
 {
 	const struct zone *zone = page_zone(page);
+	unsigned long key = (unsigned long)page + bit;
+	return &zone->wait_table[hash_long(key, zone->wait_table_bits)];
+}
 
-	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
+#define PAGE_KEY_SHIFT (BITS_PER_LONG - (BITS_PER_LONG == 32 ? 5 : 6))
+static void *page_key(struct page *page, unsigned long bit)
+{
+	return (void *)(page_to_pfn(page) | bit << PAGE_KEY_SHIFT);
 }
 
 void fastcall wait_on_page_bit(struct page *page, int bit_nr)
 {
-	wait_queue_head_t *waitqueue = page_waitqueue(page);
-	DEFINE_FILTERED_WAIT(wait, page);
+	wait_queue_head_t *waitqueue = page_waitqueue(page, bit_nr);
+	DEFINE_FILTERED_WAIT(wait, page_key(page, bit_nr));
 
 	do {
 		prepare_to_wait(waitqueue, &wait.wait, TASK_UNINTERRUPTIBLE);
@@ -338,13 +344,13 @@
  */
 void fastcall unlock_page(struct page *page)
 {
-	wait_queue_head_t *waitqueue = page_waitqueue(page);
+	wait_queue_head_t *waitqueue = page_waitqueue(page, PG_locked);
 	smp_mb__before_clear_bit();
 	if (!TestClearPageLocked(page))
 		BUG();
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(waitqueue))
-		wake_up_filtered(waitqueue, page);
+		wake_up_filtered(waitqueue, page_key(page, PG_locked));
 }
 
 EXPORT_SYMBOL(unlock_page);
@@ -355,7 +361,7 @@
  */
 void end_page_writeback(struct page *page)
 {
-	wait_queue_head_t *waitqueue = page_waitqueue(page);
+	wait_queue_head_t *waitqueue = page_waitqueue(page, PG_writeback);
 
 	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
 		if (!test_clear_page_writeback(page))
@@ -363,7 +369,7 @@
 		smp_mb__after_clear_bit();
 	}
 	if (waitqueue_active(waitqueue))
-		wake_up_filtered(waitqueue, page);
+		wake_up_filtered(waitqueue, page_key(page, PG_writeback));
 }
 
 EXPORT_SYMBOL(end_page_writeback);
@@ -378,11 +384,11 @@
  */
 void fastcall __lock_page(struct page *page)
 {
-	wait_queue_head_t *wqh = page_waitqueue(page);
-	DEFINE_FILTERED_WAIT(wait, page);
+	wait_queue_head_t *wqh = page_waitqueue(page, PG_locked);
+	DEFINE_FILTERED_WAIT(wait, page_key(page, PG_locked));
 
 	while (TestSetPageLocked(page)) {
-		prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait_exclusive(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
 		if (PageLocked(page)) {
 			sync_page(page);
 			io_schedule();